-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
8 changed files
with
1,049 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1,34 @@ | ||
# Starbucks | ||
# Starbucks SRL fine-tuning for STS | ||
|
||
This repository contains the code for fine-tuning from any pre-trained model on the STS benchmark dataset. | ||
This repo supports for three types of fine-tuning: | ||
- **baseline**: fine-tuning the small scale models separately
- **2d_matryoshka**: fine-tuning the full scale model with 2d matryoshka
- **starbucks**: fine-tuning the full scale model with starbucks representation learning | ||
|
||
## To train models | ||
|
||
You can train the models by running the following commands: | ||
```bash
python3 train_baseline.py bert-base-uncased full # full means use all_nli, otherwise only stsb to train

python3 train_2d_matryoshka.py bert-base-uncased full # full means use all_nli, otherwise only stsb to train

python3 train_starbucks.py bert-base-uncased full 1 # full means use all_nli, otherwise only stsb to train, 1 means kl_divergence weight
```
You can change the model name to any other pre-trained model name in the huggingface model hub, or local path to the model. | ||
|
||
## To evaluate models | ||
```bash | ||
python3 inference_2d_sts.py [model_name] full diaganol # full means use all_nli, otherwise only stsb to train, diaganol means only starbucks sizes
``` | ||
|
||
Or to evaluate all separately trained models at the same time:
```bash | ||
python3 inference_baselines_sts.py [model_name] full # full means use all_nli, otherwise only stsb to train
``` | ||
|
||
We released our model checkpoints on Hugging Face Model Hub: [Starbucks_STS](https://huggingface.co/ielabgroup/Starbucks_STS). | ||
|
||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,104 @@ | ||
import json
import os.path
import sys
from datasets import load_dataset
from tqdm import tqdm
from sentence_transformers import (
    SentenceTransformer,
)
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator, SequentialEvaluator, SimilarityFunction
import re

# Evaluate one 2D-Matryoshka / Starbucks model on the STS benchmarks at
# multiple (encoder-layer count, embedding dim) truncation sizes, and dump
# the Spearman-cosine scores plus averages as JSON.
#
# CLI: python3 inference_2d_sts.py [model_name] [full|low] [diaganol|full]
model_name = sys.argv[1] if len(sys.argv) > 1 else "bert-base-uncased"  # HF hub name or local path
evaluate_type = sys.argv[2] if len(sys.argv) > 2 else "full"  # "full" = all STS sets, "low" = stsb only
layer_dim_type = sys.argv[3] if len(sys.argv) > 3 else "diaganol"  # "diaganol" (sic) = paired layer/dim sizes only

if evaluate_type == "full":
    dataset_dict = {
        "stsb": "sentence-transformers/stsb",
        "sts12": "mteb/sts12-sts",
        "sts13": "mteb/sts13-sts",
        "sts14": "mteb/sts14-sts",
        "sts15": "mteb/sts15-sts",
        "sts16": "mteb/sts16-sts",
        "sickr": "mteb/sickr-sts"
    }
elif evaluate_type == "low":
    dataset_dict = {
        "stsb": "sentence-transformers/stsb",
    }
else:
    # Fail fast: previously an unknown value fell through and caused a
    # confusing NameError when dataset_dict was first used.
    raise ValueError(f"Unknown evaluate_type {evaluate_type!r}; expected 'full' or 'low'")

final_result_dict = {}

# Embedding dims and the encoder-layer counts they are paired with
# (index-wise) in "diaganol" mode.
matryoshka_dims = [768, 512, 256, 128, 64, 32]
matryoshka_layers = [12, 10, 8, 6, 4, 2]

for dataset in tqdm(dataset_dict.keys()):
    test_dataset = load_dataset(dataset_dict[dataset], split="test")
    result_dict = {}
    for layer_i, layer in enumerate(matryoshka_layers):
        evaluators = []
        for dim_i, dim in enumerate(matryoshka_dims):
            # "diaganol" mode: only evaluate the dim paired with this layer
            # count (enumerate indices replace the repeated .index() scans).
            if layer_dim_type == "diaganol" and dim_i != layer_i:
                continue
            evaluators.append(
                EmbeddingSimilarityEvaluator(
                    sentences1=test_dataset["sentence1"],
                    sentences2=test_dataset["sentence2"],
                    scores=test_dataset["score"],
                    main_similarity=SimilarityFunction.COSINE,
                    name=f"sts-test-{dim}",
                    truncate_dim=dim
                )
            )
        # Reload the model for every layer count: the truncation below
        # mutates the encoder module list in place.
        model = SentenceTransformer(model_name)
        model[0].auto_model.encoder.layer = model[0].auto_model.encoder.layer[:layer]
        test_evaluator = SequentialEvaluator(evaluators, main_score_function=lambda scores: scores[-1])
        results = test_evaluator(model)

        result_dict[layer] = {}
        for result_key in results:
            if "spearman_cosine" in result_key:
                # Keep only the dim number embedded in the evaluator name
                # (e.g. "sts-test-768_spearman_cosine" -> "768").
                dim_key = re.findall(r'\d+', result_key)[0]
                result_dict[layer][dim_key] = results[result_key]
    final_result_dict[dataset] = result_dict

# Average across datasets for each evaluated (layer, dim) pair.
final_result_dict["average"] = {}
for layer_i, layer in enumerate(matryoshka_layers):
    final_result_dict["average"][layer] = {}
    for dim_i, dim in enumerate(matryoshka_dims):
        if layer_dim_type == "diaganol" and dim_i != layer_i:
            continue
        final_result_dict["average"][layer][dim] = sum(
            final_result_dict[dataset][layer][str(dim)] for dataset in dataset_dict
        ) / len(dataset_dict)

# Average per dataset across all evaluated (layer, dim) pairs.
final_result_dict["average_dataset"] = {}
for dataset in dataset_dict:
    scores = []
    for layer_i, layer in enumerate(matryoshka_layers):
        for dim_i, dim in enumerate(matryoshka_dims):
            if layer_dim_type == "diaganol" and dim_i != layer_i:
                continue
            scores.append(final_result_dict[dataset][layer][str(dim)])
    final_result_dict["average_dataset"][dataset] = sum(scores) / len(scores)

model_output_folder = model_name.replace("/", "_")
# exist_ok avoids the check-then-create race of the previous exists() guard.
os.makedirs(model_output_folder, exist_ok=True)

out_file = os.path.join(model_output_folder, "sts_results_" + evaluate_type + "_" + layer_dim_type + ".json")
# Close the handle explicitly; json.dump(..., open(...)) leaked it.
with open(out_file, "w") as f:
    json.dump(final_result_dict, f, indent=2)
|
||
|
||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,131 @@ | ||
import json
import logging
import os.path
import sys
import traceback
from datetime import datetime

from datasets import load_dataset

from sentence_transformers import (
    SentenceTransformer,
)
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator, SequentialEvaluator, SimilarityFunction
import re
from tqdm import tqdm

# Evaluate a folder of separately trained baseline checkpoints
# (<model_folder>/layer_<L>_dim_<D>/final) on the STS benchmarks and dump
# per-dataset / averaged Spearman-cosine scores as JSON.
#
# CLI: python3 inference_baselines_sts.py <model_folder> [full|low]
model_folder = sys.argv[1]
evaluate_type = sys.argv[2]
assert os.path.exists(model_folder), f"Folder {model_folder} does not exist"

if evaluate_type == "full":
    dataset_dict = {
        "stsb": "sentence-transformers/stsb",
        "sts12": "mteb/sts12-sts",
        "sts13": "mteb/sts13-sts",
        "sts14": "mteb/sts14-sts",
        "sts15": "mteb/sts15-sts",
        "sts16": "mteb/sts16-sts",
        "sickr": "mteb/sickr-sts"
    }
elif evaluate_type == "low":
    dataset_dict = {
        "stsb": "sentence-transformers/stsb",
    }
else:
    # Fail fast: previously an unknown value fell through and caused a
    # confusing NameError when dataset_dict was first used.
    raise ValueError(f"Unknown evaluate_type {evaluate_type!r}; expected 'full' or 'low'")

matryoshka_dims = [32, 64, 128, 256, 512, 768]
matryoshka_layers = list(range(2, 13, 2))  # 2, 4, ..., 12

# Load every test split once up front; the original reloaded each split
# for every (layer, dim) checkpoint.
test_datasets = {
    dataset: load_dataset(path, split="test") for dataset, path in dataset_dict.items()
}

# result_dict: layer -> dim(str) -> dataset -> spearman_cosine score
result_dict = {}
final_result_dict = {}
for layer in tqdm(matryoshka_layers):
    result_dict[layer] = {}
    for dim in tqdm(matryoshka_dims):
        model_name = os.path.join(model_folder, f"layer_{layer}_dim_{dim}/final")
        # Skip missing checkpoints. (The original repeated this check after
        # loading the model; that second check was dead code.)
        if not os.path.exists(model_name):
            continue

        print("Layer:", layer, "Dim:", dim)

        model = SentenceTransformer(model_name, truncate_dim=dim)
        model[0].auto_model.encoder.layer = model[0].auto_model.encoder.layer[:layer]
        for dataset in dataset_dict.keys():
            test_dataset = test_datasets[dataset]
            evaluators = [
                EmbeddingSimilarityEvaluator(
                    sentences1=test_dataset["sentence1"],
                    sentences2=test_dataset["sentence2"],
                    scores=test_dataset["score"],
                    main_similarity=SimilarityFunction.COSINE,
                    name=f"sts-test-{dim}",
                    truncate_dim=dim,
                )
            ]
            test_evaluator = SequentialEvaluator(evaluators, main_score_function=lambda scores: scores[-1])
            results = test_evaluator(model)

            for result_key in results:
                if "spearman_cosine" in result_key:
                    # Keep only the dim number embedded in the evaluator name
                    # (e.g. "sts-test-768_spearman_cosine" -> "768").
                    dim_key = re.findall(r'\d+', result_key)[0]
                    result_dict[layer].setdefault(dim_key, {})[dataset] = results[result_key]

# Re-shape into dataset -> layer -> dim(str); pre-create the dataset dicts
# so every configured dataset has an entry even with no checkpoints found.
for dataset in dataset_dict.keys():
    final_result_dict[dataset] = {}

for layer in result_dict:
    for dim in result_dict[layer]:
        for dataset in result_dict[layer][dim]:
            final_result_dict[dataset].setdefault(layer, {})[dim] = result_dict[layer][dim][dataset]

print(final_result_dict)

# Average across datasets per (layer, dim); the evaluated sizes are taken
# from stsb, which both evaluate_type settings include.
final_result_dict["average"] = {}
for layer in final_result_dict["stsb"]:
    final_result_dict["average"][layer] = {}
    for dim in final_result_dict["stsb"][layer]:
        scores = [
            final_result_dict[dataset][layer][dim]
            for dataset in final_result_dict
            if dataset != "average"
        ]
        final_result_dict["average"][layer][dim] = sum(scores) / len(scores)
print(final_result_dict)

# Another average: per dataset across all evaluated sizes.
final_result_dict["average_dataset"] = {}
for dataset in dataset_dict.keys():
    scores = []
    for layer in final_result_dict["average"]:
        for dim in final_result_dict["average"][layer]:
            scores.append(final_result_dict[dataset][layer][dim])
    final_result_dict["average_dataset"][dataset] = sum(scores) / len(scores)

out_file = os.path.join(model_folder, "sts_results_" + evaluate_type + ".json")
# Close the handle explicitly; json.dump(..., open(...)) leaked it.
with open(out_file, "w") as f:
    json.dump(final_result_dict, f, indent=2)
|
||
|
||
|
||
|
Oops, something went wrong.