#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# In this example, we reduce the dimensionality of the embeddings of
# the SBERT pre-trained model 'all-mpnet-base-v2' from 768 to 10 dimensions.
#
# The code is derived from the SBERT documentation and corresponding example code:
# - https://www.sbert.net/examples/training/distillation/README.html
# - https://github.com/UKPLab/sentence-transformers/tree/master/examples/training/distillation/dimensionality_reduction.py

from sklearn.decomposition import PCA
from sentence_transformers import SentenceTransformer, LoggingHandler, util, evaluation, models, InputExample
import logging
import os
import pathlib
import gzip
import csv
import random
import numpy as np
import torch

import films

#### Route log records (INFO and above) through the sentence-transformers handler
logging.basicConfig(
    format="%(asctime)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    level=logging.INFO,
    handlers=[LoggingHandler()],
)
logger = logging.getLogger(__name__)

#### Make sure the working folders exist before anything is written to them
for folder in ("./data/", "./models/"):
    pathlib.Path(folder).mkdir(parents=True, exist_ok=True)

######## Load full model ########

# Model whose embeddings will be reduced
model = SentenceTransformer("all-mpnet-base-v2")

# Target size of the reduced embeddings
new_dimension = 10

######## Evaluate performance of full model ########

# The STS benchmark dataset is used to see how much performance is lost
# through the dimensionality reduction. It is downloaded only when no local
# copy exists yet.
sts_dataset_path = "./data/stsbenchmark.tsv.gz"

if not os.path.exists(sts_dataset_path):
    util.http_get("https://sbert.net/datasets/stsbenchmark.tsv.gz", sts_dataset_path)

# The original model is measured first; the same evaluator is reused further
# down on the model with the reduced embedding size.
logger.info("Read STSbenchmark test dataset")
with gzip.open(sts_dataset_path, "rt", encoding="utf8") as sts_file:
    sts_rows = csv.DictReader(sts_file, delimiter="\t", quoting=csv.QUOTE_NONE)
    # Only rows of the "test" split are kept; the score is divided by 5.0 to
    # normalize it to the range 0 ... 1.
    eval_examples = [
        InputExample(
            texts=[row["sentence1"], row["sentence2"]],
            label=float(row["score"]) / 5.0,
        )
        for row in sts_rows
        if row["split"] == "test"
    ]

# Evaluate the original model on the STS benchmark dataset
stsb_evaluator = evaluation.EmbeddingSimilarityEvaluator.from_input_examples(
    eval_examples,
    name="sts-benchmark-test",
)

logger.info("Original model performance:")
stsb_evaluator(model)

######## Reduce the embedding dimensions ########

# Load the films dataset and build a shuffled list of unique sentences from
# the movie titles and genres.
films_dataset = films.load_films_dataset()
unique_sentences = set(films.get_films_sentences(films_dataset))
films_sentences = list(unique_sentences)
random.shuffle(films_sentences)

# Determining the PCA matrix requires example sentence embeddings.
# Here the embeddings of all sentences from the films dataset are used.
pca_train_sentences = films_sentences
train_embeddings = model.encode(pca_train_sentences, convert_to_numpy=True)

# Fit the PCA on the train embeddings matrix and take its component matrix
pca = PCA(n_components=new_dimension).fit(train_embeddings)
pca_comp = np.asarray(pca.components_)

# Append a bias-free, identity-activated dense layer whose weights are the PCA
# components, so that the model directly produces embeddings of the new size.
full_dimension = model.get_sentence_embedding_dimension()
dense = models.Dense(
    in_features=full_dimension,
    out_features=new_dimension,
    bias=False,
    activation_function=torch.nn.Identity(),
)
dense.linear.weight = torch.nn.Parameter(torch.tensor(pca_comp))
model.add_module("dense", dense)

######## Evaluate the model with the reduced embedding size
logger.info("Model with {} dimensions:".format(new_dimension))
stsb_evaluator(model)

######## Store the reduced model on disk
model.save(films.PATH_FILMS_MODEL)