This commit is contained in:
2026-05-15 22:19:14 -07:00
commit f4f046263c
2058 changed files with 236159 additions and 0 deletions

View File

@@ -0,0 +1,110 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# In this example, we reduce the dimensionality of the embeddings of
# the SBERT pre-trained model 'all-mpnet-base-v2' from 768 to 10 dimensions.
#
# The code is derived from the SBERT documentation and corresponding example code:
# - https://www.sbert.net/examples/training/distillation/README.html
# - https://github.com/UKPLab/sentence-transformers/tree/master/examples/training/distillation/dimensionality_reduction.py
from sklearn.decomposition import PCA
from sentence_transformers import SentenceTransformer, LoggingHandler, util, evaluation, models, InputExample
import logging
import os
import pathlib
import gzip
import csv
import random
import numpy as np
import torch
import films
#### Just some code to print debug information to stdout
logging.basicConfig(format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO, handlers=[LoggingHandler()])
logger = logging.getLogger(__name__)
#### Create folders structure
pathlib.Path("./data/").mkdir(parents=True, exist_ok=True)
pathlib.Path("./models/").mkdir(parents=True, exist_ok=True)
######## Load full model ########
# Model for which we apply dimensionality reduction
model = SentenceTransformer("all-mpnet-base-v2")
# New size for the embeddings
new_dimension = 10
######## Evaluate performance of full model ########
# We use the STS benchmark dataset to see how much performance we loose by using the dimensionality reduction
sts_dataset_path = "./data/stsbenchmark.tsv.gz"
if not os.path.exists(sts_dataset_path):
util.http_get("https://sbert.net/datasets/stsbenchmark.tsv.gz", sts_dataset_path)
# We measure the performance of the original model
# and later we will measure the performance with the reduces dimension size
logger.info("Read STSbenchmark test dataset")
eval_examples = []
with gzip.open(sts_dataset_path, "rt", encoding="utf8") as fIn:
reader = csv.DictReader(fIn, delimiter="\t", quoting=csv.QUOTE_NONE)
for row in reader:
if row["split"] == "test":
score = float(row["score"]) / 5.0 #Normalize score to range 0 ... 1
eval_examples.append(InputExample(texts=[row["sentence1"], row["sentence2"]], label=score))
# Evaluate the original model on the STS benchmark dataset
stsb_evaluator = evaluation.EmbeddingSimilarityEvaluator.from_input_examples(eval_examples, name="sts-benchmark-test")
logger.info("Original model performance:")
stsb_evaluator(model)
######## Reduce the embedding dimensions ########
# We load the films dataset and creates a list of unique sentences utilizing the movie title and the genres
films_dataset = films.load_films_dataset()
films_sentences = list(set(films.get_films_sentences(films_dataset)))
random.shuffle(films_sentences)
# To determine the PCA matrix, we need some example sentence embeddings.
# Here, we compute the embeddings for all the movies in the films dataset.
pca_train_sentences = films_sentences
train_embeddings = model.encode(pca_train_sentences, convert_to_numpy=True)
# Compute PCA on the train embeddings matrix
pca = PCA(n_components=new_dimension)
pca.fit(train_embeddings)
pca_comp = np.asarray(pca.components_)
# We add a dense layer to the model, so that it will produce directly embeddings with the new size
dense = models.Dense(in_features=model.get_sentence_embedding_dimension(), out_features=new_dimension, bias=False, activation_function=torch.nn.Identity())
dense.linear.weight = torch.nn.Parameter(torch.tensor(pca_comp))
model.add_module("dense", dense)
######## Evaluate the model with the reduce embedding size
logger.info("Model with {} dimensions:".format(new_dimension))
stsb_evaluator(model)
######## Store the reduced model on disc
model.save(films.PATH_FILMS_MODEL)