
Commit

update sts code
wshuai190 committed Oct 17, 2024
1 parent 3455614 commit b1774c5
Showing 8 changed files with 1,049 additions and 1 deletion.
35 changes: 34 additions & 1 deletion sts/README.md
@@ -1 +1,34 @@
# Starbucks SRL fine-tuning for STS

This repository contains the code for fine-tuning any pre-trained model on the STS benchmark datasets.
The repo supports three types of fine-tuning:
- **baseline**: fine-tune the small-scale models separately
- **2d_matryoshka**: fine-tune the full-scale model with 2D Matryoshka representation learning
- **starbucks**: fine-tune the full-scale model with Starbucks representation learning

## To train models

You can train the models by running the following commands:
```bash
python3 train_baseline.py bert-base-uncased full # "full" means train with AllNLI; otherwise only STSb is used

python3 train_2d_matryoshka.py bert-base-uncased full # "full" means train with AllNLI; otherwise only STSb is used

python3 train_starbucks.py bert-base-uncased full 1 # same "full"/STSb choice; the last argument (1) is the KL-divergence weight
```
You can change the model name to any other pre-trained model on the Hugging Face model hub, or to a local path.
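
For reference, the `2d_matryoshka` objective can be sketched with `sentence-transformers`' `Matryoshka2dLoss`, which wraps a base STS loss so that truncated layer depths and embedding dimensions are trained jointly. This is only a minimal sketch under assumptions: the actual losses, data mix, and hyperparameters live in `train_2d_matryoshka.py` / `train_starbucks.py` (not shown above), and the dimension list is mirrored from `inference_2d_sts.py`.

```python
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, SentenceTransformerTrainer
from sentence_transformers.losses import CoSENTLoss, Matryoshka2dLoss

model = SentenceTransformer("bert-base-uncased")

# STSb train split: sentence pairs with a similarity score (assumed data; the repo's
# "full" setting also trains with AllNLI).
train_dataset = load_dataset("sentence-transformers/stsb", split="train")

# Wrap a base STS loss so every (layer, dimension) prefix contributes to the objective.
base_loss = CoSENTLoss(model)
loss = Matryoshka2dLoss(
    model,
    base_loss,
    matryoshka_dims=[768, 512, 256, 128, 64, 32],  # mirrored from inference_2d_sts.py
)

trainer = SentenceTransformerTrainer(model=model, train_dataset=train_dataset, loss=loss)
trainer.train()
```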

## To evaluate models
```bash
python3 inference_2d_sts.py [model_name] full diaganol # "full" evaluates on all STS test sets ("low" uses STSb only); "diaganol" evaluates only the matched Starbucks (layer, dim) sizes
```

Or, to evaluate all separately trained baseline models at once:
```bash
python3 inference_baselines_sts.py [model_name] full # "full" evaluates on all STS test sets ("low" uses STSb only)
```
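
`inference_baselines_sts.py` looks for each separately trained checkpoint under `[model_name]/layer_{layer}_dim_{dim}/final` and simply skips any (layer, dim) combination that is missing.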

We released our model checkpoints on Hugging Face Model Hub: [Starbucks_STS](https://huggingface.co/ielabgroup/Starbucks_STS).
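
A fine-tuned checkpoint can then be used directly; the snippet below mirrors the layer and dimension truncation from `inference_2d_sts.py` (the checkpoint path is a placeholder for your own output directory):

```python
from sentence_transformers import SentenceTransformer

# Placeholder path: point this at a fine-tuned checkpoint of your own.
model = SentenceTransformer("path/to/finetuned-model", truncate_dim=256)
# Keep only the first 8 transformer layers, as done in inference_2d_sts.py.
model[0].auto_model.encoder.layer = model[0].auto_model.encoder.layer[:8]

embeddings = model.encode(["A man is playing a guitar.", "Someone is playing a guitar."])
print(embeddings.shape)  # (2, 256)
```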



104 changes: 104 additions & 0 deletions sts/inference_2d_sts.py
@@ -0,0 +1,104 @@
import json
import os.path
import re
import sys

from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator, SequentialEvaluator, SimilarityFunction
from tqdm import tqdm

model_name = sys.argv[1] if len(sys.argv) > 1 else "bert-base-uncased"  # any Hugging Face model name or local path
evaluate_type = sys.argv[2] if len(sys.argv) > 2 else "full"  # "full": all STS benchmarks, "low": STSb only
layer_dim_type = sys.argv[3] if len(sys.argv) > 3 else "diaganol"  # "diaganol": matched Starbucks (layer, dim) pairs, "full": all combinations

if evaluate_type == "full":
    dataset_dict = {
        "stsb": "sentence-transformers/stsb",
        "sts12": "mteb/sts12-sts",
        "sts13": "mteb/sts13-sts",
        "sts14": "mteb/sts14-sts",
        "sts15": "mteb/sts15-sts",
        "sts16": "mteb/sts16-sts",
        "sickr": "mteb/sickr-sts",
    }
elif evaluate_type == "low":
    dataset_dict = {
        "stsb": "sentence-transformers/stsb",
    }
else:
    raise ValueError(f"Unknown evaluate_type: {evaluate_type}")

final_result_dict = {}

matryoshka_dims = [768, 512, 256, 128, 64, 32]
matryoshka_layers = [12, 10, 8, 6, 4, 2]
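# dims and layers are paired index-wise: (12, 768), (10, 512), ..., (2, 32) are the Starbucks
# sizes, and the "diaganol" setting evaluates only these matched (layer, dim) pairs.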

for dataset in tqdm(dataset_dict.keys()):
    dataset_loading_name = dataset_dict[dataset]
    test_dataset = load_dataset(dataset_loading_name, split="test")
    result_dict = {}
    for layer_i, layer in enumerate(matryoshka_layers):
        evaluators = []
        for dim in matryoshka_dims:
            if layer_dim_type == "diaganol":
                if matryoshka_dims.index(dim) != matryoshka_layers.index(layer):
                    continue
            evaluators.append(
                EmbeddingSimilarityEvaluator(
                    sentences1=test_dataset["sentence1"],
                    sentences2=test_dataset["sentence2"],
                    scores=test_dataset["score"],
                    main_similarity=SimilarityFunction.COSINE,
                    name=f"sts-test-{dim}",
                    truncate_dim=dim,
                )
            )
        # Reload the model and keep only the first `layer` encoder layers
        model = SentenceTransformer(model_name)
        model[0].auto_model.encoder.layer = model[0].auto_model.encoder.layer[:layer]
        test_evaluator = SequentialEvaluator(evaluators, main_score_function=lambda scores: scores[-1])
        results = test_evaluator(model)

        result_dict[layer] = {}
        for result_key in list(results.keys()):
            if "spearman_cosine" in result_key:
                # keep only the dimension number from keys such as "sts-test-768_spearman_cosine"
                result_key_save = re.findall(r'\d+', result_key)[0]
                result_dict[layer][result_key_save] = results[result_key]
    final_result_dict[dataset] = result_dict

final_result_dict["average"] = {}
for layer_i, layer in enumerate(matryoshka_layers):
final_result_dict["average"][layer] = {}
#dim = matryoshka_dims[layer_i]
for dim in matryoshka_dims:
if layer_dim_type == "diaganol":
if matryoshka_dims.index(dim) != matryoshka_layers.index(layer):
continue
final_result_dict["average"][layer][dim] = sum([final_result_dict[dataset][layer][str(dim)] for dataset in dataset_dict.keys()]) / len(dataset_dict.keys())

final_result_dict["average_dataset"] = {}
for dataset in dataset_dict.keys():
final_result_dict["average_dataset"][dataset] = []
for layer_i, layer in enumerate(matryoshka_layers):
for dim in matryoshka_dims:
if layer_dim_type == "diaganol":
if matryoshka_dims.index(dim) != matryoshka_layers.index(layer):
continue
final_result_dict["average_dataset"][dataset].append(final_result_dict[dataset][layer][str(dim)])
final_result_dict["average_dataset"][dataset] = sum(final_result_dict["average_dataset"][dataset]) / len(final_result_dict["average_dataset"][dataset])

model_output_folder = model_name.replace("/", "_")
if not os.path.exists(model_output_folder):
    os.makedirs(model_output_folder)

out_file = os.path.join(model_output_folder, "sts_results_" + evaluate_type + "_" + layer_dim_type + ".json")
with open(out_file, "w") as f:
    json.dump(final_result_dict, f, indent=2)




131 changes: 131 additions & 0 deletions sts/inference_baselines_sts.py
@@ -0,0 +1,131 @@
import json
import os.path
import re
import sys

from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator, SequentialEvaluator, SimilarityFunction
from tqdm import tqdm

model_folder = sys.argv[1]  # folder containing the separately trained baseline checkpoints
evaluate_type = sys.argv[2]  # "full": all STS benchmarks, "low": STSb only
assert os.path.exists(model_folder), f"Folder {model_folder} does not exist"

if evaluate_type == "full":
    dataset_dict = {
        "stsb": "sentence-transformers/stsb",
        "sts12": "mteb/sts12-sts",
        "sts13": "mteb/sts13-sts",
        "sts14": "mteb/sts14-sts",
        "sts15": "mteb/sts15-sts",
        "sts16": "mteb/sts16-sts",
        "sickr": "mteb/sickr-sts",
    }
elif evaluate_type == "low":
    dataset_dict = {
        "stsb": "sentence-transformers/stsb",
    }
else:
    raise ValueError(f"Unknown evaluate_type: {evaluate_type}")


matryoshka_dims = [32, 64, 128, 256, 512, 768]
matryoshka_layers = list(range(2, 13, 2))
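# Each separately trained baseline is expected at {model_folder}/layer_{layer}_dim_{dim}/final;
# missing (layer, dim) checkpoints are skipped.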

result_dict = {}
final_result_dict = {}
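# First pass: evaluate every available checkpoint and record its Spearman cosine score
# per (layer, dim, dataset); results are reshaped and averaged below.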
for layer in tqdm(matryoshka_layers):
    result_dict[layer] = {}
    for dim in tqdm(matryoshka_dims):
        model_name = os.path.join(model_folder, f"layer_{layer}_dim_{dim}/final")
        if not os.path.exists(model_name):
            continue

        print("Layer:", layer, "Dim:", dim)

        # Load the checkpoint, truncate embeddings to `dim`, and keep only the first `layer` encoder layers
        model = SentenceTransformer(model_name, truncate_dim=dim)
        model[0].auto_model.encoder.layer = model[0].auto_model.encoder.layer[:layer]
        for dataset in dataset_dict.keys():
            dataset_loading_name = dataset_dict[dataset]
            test_dataset = load_dataset(dataset_loading_name, split="test")

            evaluators = [
                EmbeddingSimilarityEvaluator(
                    sentences1=test_dataset["sentence1"],
                    sentences2=test_dataset["sentence2"],
                    scores=test_dataset["score"],
                    main_similarity=SimilarityFunction.COSINE,
                    name=f"sts-test-{dim}",
                    truncate_dim=dim,
                )
            ]
            test_evaluator = SequentialEvaluator(evaluators, main_score_function=lambda scores: scores[-1])
            results = test_evaluator(model)

            for result_key in list(results.keys()):
                if "spearman_cosine" in result_key:
                    # keep only the dimension number from keys such as "sts-test-768_spearman_cosine"
                    result_key_save = re.findall(r'\d+', result_key)[0]
                    if result_key_save not in result_dict[layer]:
                        result_dict[layer][result_key_save] = {dataset: results[result_key]}
                    else:
                        result_dict[layer][result_key_save][dataset] = results[result_key]

for dataset in dataset_dict.keys():
    final_result_dict[dataset] = {}

# Reshape results to dataset -> layer -> dim
for layer in result_dict:
    for dim in result_dict[layer]:
        for dataset in result_dict[layer][dim]:
            if dataset not in final_result_dict:
                final_result_dict[dataset] = {}
            if layer not in final_result_dict[dataset]:
                final_result_dict[dataset][layer] = {}
            final_result_dict[dataset][layer][dim] = result_dict[layer][dim][dataset]

print(final_result_dict)

final_result_dict["average"] = {}
for layer in final_result_dict["stsb"]:
    final_result_dict["average"][layer] = {}
    for dim in final_result_dict["stsb"][layer]:
        final_result_dict["average"][layer][dim] = []
        for dataset in final_result_dict:
            if dataset == "average":
                continue
            final_result_dict["average"][layer][dim].append(final_result_dict[dataset][layer][dim])
        final_result_dict["average"][layer][dim] = sum(final_result_dict["average"][layer][dim]) / len(final_result_dict["average"][layer][dim])
print(final_result_dict)
# Average over all evaluated (layer, dim) sizes for each dataset
final_result_dict["average_dataset"] = {}
for dataset in dataset_dict.keys():
    final_result_dict["average_dataset"][dataset] = []
    for layer in final_result_dict["average"]:
        for dim in final_result_dict["average"][layer]:
            final_result_dict["average_dataset"][dataset].append(final_result_dict[dataset][layer][dim])
    final_result_dict["average_dataset"][dataset] = sum(final_result_dict["average_dataset"][dataset]) / len(final_result_dict["average_dataset"][dataset])

out_file = os.path.join(model_folder, "sts_results_" + evaluate_type + ".json")
with open(out_file, "w") as f:
    json.dump(final_result_dict, f, indent=2)




