bryce-solr/solr/example/films/vectors/create_model.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# In this example, we reduce the dimensionality of the embeddings of
# the SBERT pre-trained model 'all-mpnet-base-v2' from 768 to 10 dimensions.
#
# The code is derived from the SBERT documentation and corresponding example code:
#  - https://www.sbert.net/examples/training/distillation/README.html
#  - https://github.com/UKPLab/sentence-transformers/tree/master/examples/training/distillation/dimensionality_reduction.py

from sklearn.decomposition import PCA
from sentence_transformers import SentenceTransformer, LoggingHandler, util, evaluation, models, InputExample
import logging
import os
import pathlib
import gzip
import csv
import random
import numpy as np
import torch

import films

#### Configure logging so that debug information is printed to stdout
logging.basicConfig(
    format="%(asctime)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    level=logging.INFO,
    handlers=[LoggingHandler()],
)
logger = logging.getLogger(__name__)

#### Create the ./data/ and ./models/ folders (including parents) if they are missing
for folder in ("./data/", "./models/"):
    pathlib.Path(folder).mkdir(parents=True, exist_ok=True)


######## Load full model ########

# Target size of the embeddings after the dimensionality reduction
new_dimension = 10

# Pre-trained model whose embeddings are going to be reduced
model = SentenceTransformer("all-mpnet-base-v2")


######## Evaluate performance of full model ########

# The STS benchmark dataset shows how much performance we lose through the
# dimensionality reduction. It is downloaded only when no local copy exists.
sts_dataset_path = "./data/stsbenchmark.tsv.gz"
if not os.path.exists(sts_dataset_path):
    util.http_get("https://sbert.net/datasets/stsbenchmark.tsv.gz", sts_dataset_path)

# Collect the sentence pairs of the "test" split. The same examples are used to
# measure the original model now and the model with the reduced dimension size later.
logger.info("Read STSbenchmark test dataset")
with gzip.open(sts_dataset_path, "rt", encoding="utf8") as tsv_file:
    eval_examples = [
        InputExample(
            texts=[record["sentence1"], record["sentence2"]],
            label=float(record["score"]) / 5.0,  # Normalize score to range 0 ... 1
        )
        for record in csv.DictReader(tsv_file, delimiter="\t", quoting=csv.QUOTE_NONE)
        if record["split"] == "test"
    ]

# Build the evaluator from the test examples and score the original model with it
stsb_evaluator = evaluation.EmbeddingSimilarityEvaluator.from_input_examples(
    eval_examples,
    name="sts-benchmark-test",
)

logger.info("Original model performance:")
stsb_evaluator(model)


######## Reduce the embedding dimensions ########

# Load the films dataset and build a shuffled list of unique sentences
# (per the helper's name, derived from the movie titles and genres)
films_dataset = films.load_films_dataset()
unique_sentences = set(films.get_films_sentences(films_dataset))
films_sentences = list(unique_sentences)
random.shuffle(films_sentences)

# The PCA matrix is determined from example sentence embeddings.
# Here, every sentence of the films dataset is embedded and used for that purpose.
pca_train_sentences = films_sentences
train_embeddings = model.encode(pca_train_sentences, convert_to_numpy=True)

# Fit the PCA on the train embeddings matrix and keep its components
pca = PCA(n_components=new_dimension).fit(train_embeddings)
pca_comp = np.asarray(pca.components_)

# Append a bias-free dense layer whose weights are the PCA components,
# so that the model directly produces embeddings with the new size
embedding_size = model.get_sentence_embedding_dimension()
dense = models.Dense(
    in_features=embedding_size,
    out_features=new_dimension,
    bias=False,
    activation_function=torch.nn.Identity(),
)
dense.linear.weight = torch.nn.Parameter(torch.tensor(pca_comp))
model.add_module("dense", dense)


######## Evaluate the model with the reduced embedding size
reduced_model_message = "Model with {} dimensions:".format(new_dimension)
logger.info(reduced_model_message)
stsb_evaluator(model)


######## Store the reduced model on disk
model.save(films.PATH_FILMS_MODEL)