commit
This commit is contained in:
110
solr/example/films/vectors/create_model.py
Normal file
110
solr/example/films/vectors/create_model.py
Normal file
@@ -0,0 +1,110 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
# Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
# contributor license agreements. See the NOTICE file distributed with
|
||||
# this work for additional information regarding copyright ownership.
|
||||
# The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
# (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# In this example, we reduce the dimensionality of the embeddings of
|
||||
# the SBERT pre-trained model 'all-mpnet-base-v2' from 768 to 10 dimensions.
|
||||
#
|
||||
# The code is derived from the SBERT documentation and corresponding example code:
|
||||
# - https://www.sbert.net/examples/training/distillation/README.html
|
||||
# - https://github.com/UKPLab/sentence-transformers/tree/master/examples/training/distillation/dimensionality_reduction.py
|
||||
|
||||
from sklearn.decomposition import PCA
|
||||
from sentence_transformers import SentenceTransformer, LoggingHandler, util, evaluation, models, InputExample
|
||||
import logging
|
||||
import os
|
||||
import pathlib
|
||||
import gzip
|
||||
import csv
|
||||
import random
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
import films
|
||||
|
||||
#### Print debug information to stdout
# Records at INFO level and above are emitted with a timestamp prefix.
logging.basicConfig(
    format="%(asctime)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    level=logging.INFO,
    handlers=[LoggingHandler()],
)
logger = logging.getLogger(__name__)
|
||||
|
||||
#### Create the folder structure
# Both directories are created relative to the current working directory;
# already existing directories are left untouched.
for folder in ("./data/", "./models/"):
    pathlib.Path(folder).mkdir(parents=True, exist_ok=True)
|
||||
|
||||
|
||||
######## Load full model ########

# New size for the embeddings
new_dimension = 10

# Model to which we apply the dimensionality reduction
model = SentenceTransformer("all-mpnet-base-v2")
|
||||
|
||||
|
||||
######## Evaluate performance of full model ########

# We use the STS benchmark dataset to see how much performance we lose
# by using the dimensionality reduction.
sts_dataset_path = "./data/stsbenchmark.tsv.gz"

# Download the dataset only when it is not already present locally.
if not os.path.exists(sts_dataset_path):
    util.http_get("https://sbert.net/datasets/stsbenchmark.tsv.gz", sts_dataset_path)
|
||||
|
||||
# We measure the performance of the original model
# and later we will measure the performance with the reduced dimension size
logger.info("Read STSbenchmark test dataset")
with gzip.open(sts_dataset_path, "rt", encoding="utf8") as fIn:
    reader = csv.DictReader(fIn, delimiter="\t", quoting=csv.QUOTE_NONE)
    # Keep only the rows of the "test" split; the score is divided by 5.0
    # to normalize it to the range 0 ... 1.
    eval_examples = [
        InputExample(
            texts=[row["sentence1"], row["sentence2"]],
            label=float(row["score"]) / 5.0,
        )
        for row in reader
        if row["split"] == "test"
    ]
|
||||
|
||||
# Build the evaluator from the STS benchmark test examples; the same
# evaluator is called on the model here and can be called again later.
stsb_evaluator = evaluation.EmbeddingSimilarityEvaluator.from_input_examples(
    eval_examples,
    name="sts-benchmark-test",
)

# Evaluate the original model on the STS benchmark dataset
logger.info("Original model performance:")
stsb_evaluator(model)
|
||||
|
||||
|
||||
######## Reduce the embedding dimensions ########

# We load the films dataset and create a list of unique sentences
# utilizing the movie title and the genres.
films_dataset = films.load_films_dataset()
films_sentences = list(
    set(films.get_films_sentences(films_dataset))  # set() drops duplicate sentences
)
random.shuffle(films_sentences)
|
||||
|
||||
# To determine the PCA matrix, we need some example sentence embeddings.
# Here, we compute the embeddings for all the movies in the films dataset.
pca_train_sentences = films_sentences
train_embeddings = model.encode(pca_train_sentences, convert_to_numpy=True)

# Compute PCA on the train embeddings matrix and keep its components.
pca = PCA(n_components=new_dimension).fit(train_embeddings)
pca_comp = np.asarray(pca.components_)
|
||||
|
||||
# We add a dense layer to the model, so that it directly produces
# embeddings with the new size. The layer has no bias and an identity
# activation, and its weights are set to the PCA components computed above.
dense = models.Dense(
    in_features=model.get_sentence_embedding_dimension(),
    out_features=new_dimension,
    bias=False,
    activation_function=torch.nn.Identity(),
)
dense.linear.weight = torch.nn.Parameter(torch.tensor(pca_comp))
model.add_module("dense", dense)
|
||||
|
||||
|
||||
######## Evaluate the model with the reduced embedding size
# Same evaluator as for the original model, so both results are comparable.
logger.info("Model with {} dimensions:".format(new_dimension))
stsb_evaluator(model)


######## Store the reduced model on disk
model.save(films.PATH_FILMS_MODEL)
|
||||
Reference in New Issue
Block a user