commit
This commit is contained in:
68
solr/example/films/vectors/create_dataset.py
Normal file
68
solr/example/films/vectors/create_dataset.py
Normal file
@@ -0,0 +1,68 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
# Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
# contributor license agreements. See the NOTICE file distributed with
|
||||
# this work for additional information regarding copyright ownership.
|
||||
# The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
# (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# This script will use the reduced model created by the `create_model`
|
||||
# script to add a new field in the films dataset, which will store the
|
||||
# film vector according to the embedding model.
|
||||
|
||||
import json
|
||||
|
||||
from sentence_transformers import SentenceTransformer, util
|
||||
import torch
|
||||
|
||||
import films
|
||||
|
||||
#### Load the 10-dimensions model
|
||||
model = SentenceTransformer(films.PATH_FILMS_MODEL)
|
||||
|
||||
#### Load the original films dataset
|
||||
films_dataset = films.load_films_dataset()
|
||||
|
||||
#### Use the embedding model to calculate vectors for all movies
|
||||
films_vectors = films.calculate_films_vectors(model, films_dataset)
|
||||
|
||||
#### Visual evaluation of some specific movies
|
||||
|
||||
def most_similar_movie(target_idx, top_k=5):
|
||||
film = films_dataset[target_idx]
|
||||
film_vector = films_vectors[target_idx]
|
||||
|
||||
cos_scores = util.cos_sim(film_vector, films_vectors)[0]
|
||||
top_results = torch.topk(cos_scores, k=top_k)
|
||||
|
||||
print("\n======================\n")
|
||||
print("Film:", films.get_film_sentence(film).replace("\n", " - "))
|
||||
print("\nTop 5 most similar films in corpus:")
|
||||
|
||||
for score, idx in zip(top_results[0], top_results[1]):
|
||||
movie_str = films.get_film_sentence(films_dataset[idx]).replace("\n", " - ")
|
||||
print(f" - [{idx}] {movie_str} (Score: {score:.4f})")
|
||||
|
||||
most_similar_movie(200)
|
||||
most_similar_movie(100)
|
||||
most_similar_movie(500)
|
||||
most_similar_movie(911)
|
||||
|
||||
|
||||
#### Create the new films dataset by creating a new field with the embedding vector
|
||||
for idx in range(len(films_dataset)):
|
||||
films_dataset[idx]["film_vector"] = list(films_vectors[idx].astype("float64"))
|
||||
|
||||
#### Export the new films dataset for all formats
|
||||
films.export_films_json(films_dataset)
|
||||
films.export_films_xml(films_dataset)
|
||||
films.export_films_csv(films_dataset)
|
||||
Reference in New Issue
Block a user