From 31706f7878d1bcf1fede3eee2236d0e4e7bc4781 Mon Sep 17 00:00:00 2001 From: "leon.kepler" Date: Sun, 21 Apr 2024 13:32:35 +0800 Subject: [PATCH 01/33] new --- FlagEmbedding/reranker/modeling.py | 1 + 1 file changed, 1 insertion(+) diff --git a/FlagEmbedding/reranker/modeling.py b/FlagEmbedding/reranker/modeling.py index 244c99fa..c02b2620 100644 --- a/FlagEmbedding/reranker/modeling.py +++ b/FlagEmbedding/reranker/modeling.py @@ -34,6 +34,7 @@ def forward(self, batch): ranker_out: SequenceClassifierOutput = self.hf_model(**batch, return_dict=True) logits = ranker_out.logits + #相当于是一个 group_size 个 cls 的多分类任务 if self.training: scores = logits.view( self.train_args.per_device_train_batch_size, From a80075f6a07ab3f0d720ca2371098fba3f56ab9b Mon Sep 17 00:00:00 2001 From: "leon.kepler" Date: Sun, 21 Apr 2024 15:12:00 +0800 Subject: [PATCH 02/33] new --- .../baai_general_embedding/finetune/hn_mine.py | 2 ++ examples/finetune/toy_finetune_data_minedHN.jsonl | 10 ++++++++++ 2 files changed, 12 insertions(+) create mode 100644 examples/finetune/toy_finetune_data_minedHN.jsonl diff --git a/FlagEmbedding/baai_general_embedding/finetune/hn_mine.py b/FlagEmbedding/baai_general_embedding/finetune/hn_mine.py index 02070090..3f3e672f 100644 --- a/FlagEmbedding/baai_general_embedding/finetune/hn_mine.py +++ b/FlagEmbedding/baai_general_embedding/finetune/hn_mine.py @@ -59,6 +59,7 @@ def find_knn_neg(model, input_file, candidate_pool, output_file, sample_range, n corpus = [] queries = [] train_data = [] + # input_file is jsonl, jsonl也是由 query,pos,neg三元组组成,并且 pos 和 neg 都全部放入 corpus 中, query放入 querys 中 for line in open(input_file): line = json.loads(line.strip()) train_data.append(line) @@ -67,6 +68,7 @@ def find_knn_neg(model, input_file, candidate_pool, output_file, sample_range, n corpus.extend(line['neg']) queries.append(line['query']) + # candidate pool和 corpus 库是二选一的 if candidate_pool is not None: if not isinstance(candidate_pool, list): candidate_pool = get_corpus(candidate_pool) diff --git a/examples/finetune/toy_finetune_data_minedHN.jsonl b/examples/finetune/toy_finetune_data_minedHN.jsonl new file mode 100644 index 00000000..a51c6a3c --- /dev/null +++ b/examples/finetune/toy_finetune_data_minedHN.jsonl @@ -0,0 +1,10 @@ +{"query": "Five women walk along a beach wearing flip-flops.", "pos": ["Some women with flip-flops on, are walking along the beach"], "neg": ["The seal of Missouri is perfect.", "George Bush told the Republicans there was no way he would let them even consider this foolish idea, against his top advisors advice.", "They sold their home because they were retiring and not because of the loan.", "A person gives a speech.", "The street was lined with white-painted houses.", "Critical factors for essential activities are set out.", "A man is a pilot of an airplane.", "Conrad was being plotted against, to be hit on the head.", "The fatal dose was not taken when the murderer thought it would be.", "Mother Teresa is an easy choice.", "It lays out critical activities but makes no provision for critical factors related to those activities.", "It is only staged on Winter afternoons in Palma's large bullring.", "A group of Indians are having a funeral", "No matter how old people get they never forget. ", "The morning sunlight was shining brightly and it was warm. 
"]} +{"query": "A woman standing on a high cliff on one leg looking over a river.", "pos": ["A woman is standing on a cliff."], "neg": ["This is definitely not an endorsement.", "Two men watching a magic show.", "There was a reform in 1996.", "George Bush told the Republicans there was no way he would let them even consider this foolish idea, against his top advisors advice.", "An athlete is competing in the 1500 meter swimming competition.", "The state would prefer for you to do that.", "Conrad was being plotted against, to be hit on the head.", "A man is in a city.", "a dog is running", "man at picnics cut steak", "A girl is wearing blue.", "Meanwhile, the mainland was empty of population.", "A woman sits on a chair.", "Several chefs are sitting down and talking about food.", "Neither the Globe or Mail had comments on the current state of Canada's road system. "]} +{"query": "Two woman are playing instruments; one a clarinet, the other a violin.", "pos": ["Some people are playing a tune."], "neg": ["man at picnics cut steak", "A man is a pilot of an airplane.", "A child is reading in her bedroom.", "This is definitely not an endorsement.", "Neither the Globe or Mail had comments on the current state of Canada's road system. ", "Person in black clothing, with white bandanna and sunglasses waits at a bus stop.", "I face a serious problem at eighteen years old. ", "A boy is sitting outside playing in the sand.", "A man is skiing down a mountain.", "A woman is jogging in the park.", "People watching a spaceship launch.", "Ended as soon as I received the wire.", "The Spring Creek facility is old and outdated.", "A person gives a speech.", "The girl is standing, leaning against the archway."]} +{"query": "A girl with a blue tank top sitting watching three dogs.", "pos": ["A girl is wearing blue."], "neg": ["A man is skiing down a mountain.", "They sold their home because they were retiring and not because of the loan.", "A group of Indians are having a funeral", "The man is talking about hawaii.", "People watching a spaceship launch.", "The street was lined with white-painted houses.", "A woman is riding her bike.", "Two people jumped off the dock.", "Ended as soon as I received the wire.", "The 4 women are sitting on the beach.", "The Spring Creek facility is old and outdated.", "A woman sits on a chair.", "Steele did not keep her original story.", "People are assembled in protest.", "Meanwhile, the mainland was empty of population."]} +{"query": "A yellow dog running along a forest path.", "pos": ["a dog is running"], "neg": ["The man sits at the table and eats food.", "People watching a spaceship launch.", "Nobody is jumping", "Conrad was being plotted against, to be hit on the head.", "The morning sunlight was shining brightly and it was warm. 
", "This is definitely not an endorsement.", "A man is in a city.", "A girl is wearing blue.", "A man is a pilot of an airplane.", "A woman sits on a chair.", "It's worth being able to go at a pace you prefer.", "A woman is riding her bike.", "Critical factors for essential activities are set out.", "The child is wearing black.", "Steele did not keep her original story."]} +{"query": "It sets out essential activities in each phase along with critical factors related to those activities.", "pos": ["Critical factors for essential activities are set out."], "neg": ["A girl is wearing blue.", "Some women with flip-flops on, are walking along the beach", "Person on bike", "Financing is an issue for us in public schools.", "man at picnics cut steak", "The people are watching a funeral procession.", "The child is wearing black.", "It is only staged on Winter afternoons in Palma's large bullring.", "The rule discourages people to pay their child support.", "A woman sits on a chair.", "Several chefs are sitting down and talking about food.", "a dog is running", "The street was lined with white-painted houses.", "Steele did not keep her original story.", "The family was falling apart."]} +{"query": "A man giving a speech in a restaurant.", "pos": ["A person gives a speech."], "neg": ["The man sits at the table and eats food.", "A man is skiing down a mountain.", "The Spring Creek facility is old and outdated.", "I face a serious problem at eighteen years old. ", "The child is wearing black.", "No matter how old people get they never forget. ", "Two children is sleeping.", "People watching a spaceship launch.", "The Commission notes that no significant alternatives were considered.", "A man in a vest sits in a car.", "A man is a pilot of an airplane.", "The people are watching a funeral procession.", "Steele did not keep her original story.", "A child is reading in her bedroom.", "Two people jumped off the dock."]} +{"query": "Indians having a gathering with coats and food and drinks.", "pos": ["A group of Indians are having a gathering with food and drinks"], "neg": ["The street was lined with white-painted houses.", "The 4 women are sitting on the beach.", "People watching a spaceship launch.", "The man sits at the table and eats food.", "This is definitely not an endorsement.", "An athlete is competing in the 1500 meter swimming competition.", "A man is in a city.", "It is calming to be assaulted.", "The girl is standing, leaning against the archway.", "The seal of Missouri is perfect.", "It is only staged on Winter afternoons in Palma's large bullring.", "She's not going to court to clear her record.", "Two men watching a magic show.", "A group of women watch soap operas.", "The family was falling apart."]} +{"query": "A woman with violet hair rides her bicycle outside.", "pos": ["A woman is riding her bike."], "neg": ["A group of women watch soap operas.", "a cat is running", "A child is reading in her bedroom.", "The Spring Creek facility is old and outdated.", "Two children is sleeping.", "The 4 women are sitting on the beach.", "It lays out critical activities but makes no provision for critical factors related to those activities.", "Several chefs are sitting down and talking about food.", "She's not going to court to clear her record.", "A group of people plays volleyball.", "People are assembled in protest.", "People watching a spaceship launch.", "The fatal dose was not taken when the murderer thought it would be.", "a fisherman is trying to catch a monkey", "The people are watching a 
funeral procession."]} +{"query": "A man pulls two women down a city street in a rickshaw.", "pos": ["A man is in a city."], "neg": ["We ran out of firewood and had to use pine needles for the fire.", "Nobody is jumping", "Steele did not keep her original story.", "The man is talking about hawaii.", "A group of Indians are having a gathering with food and drinks", "Person on bike", "A girl sits beside a boy.", "A girl is wearing blue.", "The morning sunlight was shining brightly and it was warm. ", "The Commission notes that no significant alternatives were considered.", "Mother Teresa is an easy choice.", "A group of Indians are having a funeral", "Right information can empower the legal service practices and the justice system. ", "A woman is jogging in the park.", "A person gives a speech."]} From aa9724e4d6a49b3c7c8f975478c5fbad70e61259 Mon Sep 17 00:00:00 2001 From: "leon.kepler" Date: Thu, 25 Apr 2024 20:11:04 +0800 Subject: [PATCH 03/33] new --- FlagEmbedding/reranker/data.py | 2 +- FlagEmbedding/reranker/modeling.py | 2 +- FlagEmbedding/reranker/run.py | 67 ++++++++++++++++++++++-------- FlagEmbedding/reranker/trainer.py | 2 +- debug_args.ipynb | 57 +++++++++++++++++++++++++ utils.py | 33 +++++++++++++++ 6 files changed, 142 insertions(+), 21 deletions(-) create mode 100644 debug_args.ipynb create mode 100644 utils.py diff --git a/FlagEmbedding/reranker/data.py b/FlagEmbedding/reranker/data.py index 5ddf0972..2dcc0620 100644 --- a/FlagEmbedding/reranker/data.py +++ b/FlagEmbedding/reranker/data.py @@ -10,7 +10,7 @@ from transformers import DataCollatorWithPadding from transformers import PreTrainedTokenizer, BatchEncoding -from .arguments import DataArguments +from arguments import DataArguments class TrainDatasetForCE(Dataset): diff --git a/FlagEmbedding/reranker/modeling.py b/FlagEmbedding/reranker/modeling.py index c02b2620..931523f2 100644 --- a/FlagEmbedding/reranker/modeling.py +++ b/FlagEmbedding/reranker/modeling.py @@ -5,7 +5,7 @@ from transformers import AutoModelForSequenceClassification, PreTrainedModel, TrainingArguments from transformers.modeling_outputs import SequenceClassifierOutput -from .arguments import ModelArguments, DataArguments +from arguments import ModelArguments, DataArguments logger = logging.getLogger(__name__) diff --git a/FlagEmbedding/reranker/run.py b/FlagEmbedding/reranker/run.py index 81a51850..a3c314fd 100644 --- a/FlagEmbedding/reranker/run.py +++ b/FlagEmbedding/reranker/run.py @@ -8,13 +8,25 @@ set_seed, ) -from .arguments import ModelArguments, DataArguments -from .data import TrainDatasetForCE, GroupCollator -from .modeling import CrossEncoder -from .trainer import CETrainer +from arguments import ModelArguments, DataArguments +from data import TrainDatasetForCE, GroupCollator +from modeling import CrossEncoder +from trainer import CETrainer logger = logging.getLogger(__name__) - +from pprint import pprint as pp +import sys +sys.path.append("/opt/tiger/FlagEmbedding") +from utils import get_complete_last_checkpoint +import transformers + +def safe_save_model_for_hf_trainer(trainer: transformers.Trainer, output_dir: str): + """Collects the state dict and dump to disk.""" + state_dict = trainer.model.state_dict() + if trainer.args.should_save: + cpu_state_dict = {key: value.cpu() for key, value in state_dict.items()} + del state_dict + trainer._save(output_dir, state_dict=cpu_state_dict) def main(): parser = HfArgumentParser((ModelArguments, DataArguments, TrainingArguments)) @@ -23,15 +35,22 @@ def main(): data_args: DataArguments 
training_args: TrainingArguments - if ( - os.path.exists(training_args.output_dir) - and os.listdir(training_args.output_dir) - and training_args.do_train - and not training_args.overwrite_output_dir - ): - raise ValueError( - f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome." - ) + # for args in (model_args, data_args, training_args): pp(args) + + # check and load checkpoint + last_checkpoint = None + if os.path.isdir(training_args.output_dir) and not training_args.overwrite_output_dir: + last_checkpoint = get_complete_last_checkpoint(training_args.output_dir) + if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: + logger.info( + f"Output directory ({training_args.output_dir}) already exists and is empty." + "Train from scratch" + ) + elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: + logger.info( + f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " + "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." + ) # Setup logging logging.basicConfig( @@ -64,6 +83,7 @@ def main(): model_args.config_name if model_args.config_name else model_args.model_name_or_path, num_labels=num_labels, cache_dir=model_args.cache_dir, + trust_remote_code=True ) _model_class = CrossEncoder @@ -73,8 +93,17 @@ def main(): from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, + trust_remote_code=True ) + checkpoint = None + if training_args.resume_from_checkpoint is not None: + logger.info(f"train start from {training_args.resume_from_checkpoint}") + checkpoint = training_args.resume_from_checkpoint + elif last_checkpoint is not None: + logger.info(f"train start from {last_checkpoint}") + checkpoint = last_checkpoint + train_dataset = TrainDatasetForCE(data_args, tokenizer=tokenizer) _trainer_class = CETrainer trainer = _trainer_class( @@ -84,11 +113,13 @@ def main(): data_collator=GroupCollator(tokenizer), tokenizer=tokenizer ) + trainer.train(resume_from_checkpoint=checkpoint) + trainer.save_state() + safe_save_model_for_hf_trainer(trainer=trainer, output_dir=training_args.output_dir) + # Path(training_args.output_dir).mkdir(parents=True, exist_ok=True) - Path(training_args.output_dir).mkdir(parents=True, exist_ok=True) - - trainer.train() - trainer.save_model() + # trainer.train() + # trainer.save_model() if __name__ == "__main__": diff --git a/FlagEmbedding/reranker/trainer.py b/FlagEmbedding/reranker/trainer.py index 7dbc5881..dbb98ef0 100644 --- a/FlagEmbedding/reranker/trainer.py +++ b/FlagEmbedding/reranker/trainer.py @@ -5,7 +5,7 @@ import torch from transformers.trainer import Trainer -from .modeling import CrossEncoder +from modeling import CrossEncoder logger = logging.getLogger(__name__) diff --git a/debug_args.ipynb b/debug_args.ipynb new file mode 100644 index 00000000..8653d06c --- /dev/null +++ b/debug_args.ipynb @@ -0,0 +1,57 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[\"--output_dir\", \"/mnt/bn/data-tns-live-llm/leon/experiments/llm/fcbank/toy\", \"--model_name_or_path\", \"/mnt/bn/data-tns-live-llm/leon/experiments/llm/fcbank/saved_model_v100\", \"--train_data\", \"/opt/tiger/toy_data.jsonl\", \"--learning_rate\", \"1e-5\", \"--fp16\", \"--num_train_epochs\", \"3\", \"--per_device_train_batch_size\", \"4\", 
\"--gradient_accumulation_steps\", \"4\", \"--dataloader_drop_last\", \"--train_group_size\", \"16\", \"--max_len\", \"512\", \"--weight_decay\", \"0.01\", \"--logging_steps\", \"10\", \"--save_strategy\", \"epoch\", \"--save_steps\", \"1\", \"--save_total_limit\", \"3\"]\n" + ] + } + ], + "source": [ + "import json\n", + "args = (str(\"--output_dir /mnt/bn/data-tns-live-llm/leon/experiments/llm/fcbank/toy --model_name_or_path /mnt/bn/data-tns-live-llm/leon/experiments/llm/fcbank/saved_model_v100 --train_data /opt/tiger/toy_data.jsonl --learning_rate 1e-5 --fp16 --num_train_epochs 3 --per_device_train_batch_size 4 --gradient_accumulation_steps 4 --dataloader_drop_last --train_group_size 16 --max_len 512 --weight_decay 0.01 --logging_steps 10 --save_strategy epoch --save_steps 1 --save_total_limit 3\".split(\" \")).replace(\"\\'\",\"\\\"\"))\n", + "print(args)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.9.2 64-bit", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.2" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/utils.py b/utils.py new file mode 100644 index 00000000..32a9cd2b --- /dev/null +++ b/utils.py @@ -0,0 +1,33 @@ +import os +import re + +PREFIX_CHECKPOINT_DIR = "checkpoint" +_re_checkpoint = re.compile(r"^" + PREFIX_CHECKPOINT_DIR + r"\-(\d+)$") + + +def get_complete_last_checkpoint(folder): + """ + because the checkpoint saving may be killed by the process kill, we need to get the real last checkpoint, + check if the last checkpoint is has same file number with the second last one + """ + content = os.listdir(folder) + checkpoints = [ + path + for path in content + if _re_checkpoint.search(path) is not None and os.path.isdir(os.path.join(folder, path)) + ] + if len(checkpoints) == 0: + return + sorted_checkpoints = sorted(checkpoints, key=lambda x: int(_re_checkpoint.search(x).group(1))) + last_checkpoint = os.path.join(folder, sorted_checkpoints[-1]) + if len(sorted_checkpoints) >= 2: + second_last_checkpoint = os.path.join(folder, sorted_checkpoints[-2]) + else: + second_last_checkpoint = last_checkpoint + # check if the two file have same file number + last_checkpoint_file = os.listdir(last_checkpoint) + second_last_checkpoint_file = os.listdir(second_last_checkpoint) + if len(last_checkpoint_file) == len(second_last_checkpoint_file): + return last_checkpoint + else: + return second_last_checkpoint From 0177bc7dff8c93c6ac94212b00f66d302eeb50d5 Mon Sep 17 00:00:00 2001 From: "leon.kepler" Date: Thu, 25 Apr 2024 20:44:53 +0800 Subject: [PATCH 04/33] new --- FlagEmbedding/reranker/run.py | 1 + FlagEmbedding/reranker/trainer.py | 4 ++++ reranker_pretrain.py | 39 +++++++++++++++++++++++++++++++ 3 files changed, 44 insertions(+) create mode 100755 reranker_pretrain.py diff --git a/FlagEmbedding/reranker/run.py b/FlagEmbedding/reranker/run.py index a3c314fd..e0146043 100644 --- a/FlagEmbedding/reranker/run.py +++ b/FlagEmbedding/reranker/run.py @@ -106,6 +106,7 @@ def main(): train_dataset = TrainDatasetForCE(data_args, tokenizer=tokenizer) 
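# A minimal sketch (not part of the patch) of how get_complete_last_checkpoint from the
# new utils.py above is meant to be used with the resume logic; the output_dir value is
# a placeholder and `trainer` refers to the trainer built below.
#
#   last_checkpoint = get_complete_last_checkpoint("/path/to/output_dir")
#   # The newest checkpoint-<step> directory is returned only if it holds as many files
#   # as the one before it (an interrupted save falls back to the older checkpoint);
#   # None is returned when no checkpoint-* directory exists yet.
#   if last_checkpoint is not None:
#       trainer.train(resume_from_checkpoint=last_checkpoint)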
_trainer_class = CETrainer + trainer = _trainer_class( model=model, args=training_args, diff --git a/FlagEmbedding/reranker/trainer.py b/FlagEmbedding/reranker/trainer.py index dbb98ef0..dbbcb81a 100644 --- a/FlagEmbedding/reranker/trainer.py +++ b/FlagEmbedding/reranker/trainer.py @@ -29,3 +29,7 @@ def _save(self, output_dir: Optional[str] = None, state_dict=None): def compute_loss(self, model: CrossEncoder, inputs): return model(inputs)['loss'] + + def _load_from_checkpoint(self, resume_from_checkpoint, model=None): + model = self.model.hf_model + super()._load_from_checkpoint(resume_from_checkpoint, model) diff --git a/reranker_pretrain.py b/reranker_pretrain.py new file mode 100755 index 00000000..08c27fb7 --- /dev/null +++ b/reranker_pretrain.py @@ -0,0 +1,39 @@ +import socket +import contextlib +import os +import argparse +import sys +import torch + +def find_unused_port(start_port=37624): + """ + 寻找一个从start_port开始的未使用的端口。 + """ + for port in range(start_port, start_port + 20000): # 检查1000个端口 + with contextlib.suppress(Exception): + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock: + sock.bind(('', port)) + return port + return None # 如果没有找到,返回None + +args = (" ").join(sys.argv[1:]) +print(args) + +# 使用示例 +port = find_unused_port() +if port: + print(f"Found an unused port: {port}") +else: + print("No unused port found.") + +num_gpus = torch.cuda.device_count() + +os.system("cd /opt/tiger/FlagEmbedding") + +# 构建训练命令 +command = f""" +torchrun --master-port={port} --nproc_per_node {num_gpus} /opt/tiger/FlagEmbedding/FlagEmbedding/reranker/run.py {args} +""" + +# 执行命令 +os.system(command) \ No newline at end of file From de4cfe441fe333f80bbf3f9bc936520058dab75f Mon Sep 17 00:00:00 2001 From: "leon.kepler" Date: Fri, 26 Apr 2024 11:53:23 +0800 Subject: [PATCH 05/33] new --- reranker_pretrain.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/reranker_pretrain.py b/reranker_pretrain.py index 08c27fb7..f44d5837 100755 --- a/reranker_pretrain.py +++ b/reranker_pretrain.py @@ -1,7 +1,6 @@ import socket import contextlib import os -import argparse import sys import torch @@ -30,6 +29,8 @@ def find_unused_port(start_port=37624): os.system("cd /opt/tiger/FlagEmbedding") +if not os.path.exists("/opt/tiger/train_15neg"): os.system("cp -r /mnt/bn/data-tns-live-llm/leon/experiments/llm/fcbank/train_15neg /opt/tiger/train_15neg") + # 构建训练命令 command = f""" torchrun --master-port={port} --nproc_per_node {num_gpus} /opt/tiger/FlagEmbedding/FlagEmbedding/reranker/run.py {args} From e2a921161fc80d7195c26e8eaf89f4047aceda12 Mon Sep 17 00:00:00 2001 From: "leon.kepler" Date: Fri, 26 Apr 2024 16:59:22 +0800 Subject: [PATCH 06/33] new --- reranker_pretrain.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/reranker_pretrain.py b/reranker_pretrain.py index f44d5837..b1e554da 100755 --- a/reranker_pretrain.py +++ b/reranker_pretrain.py @@ -4,14 +4,15 @@ import sys import torch -def find_unused_port(start_port=37624): +def find_unused_port(start_port=37625): """ 寻找一个从start_port开始的未使用的端口。 """ for port in range(start_port, start_port + 20000): # 检查1000个端口 with contextlib.suppress(Exception): - with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock: - sock.bind(('', port)) + for i in range(10): + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock: + sock.bind(('', port)) return port return None # 如果没有找到,返回None From a3ad2b88f80562f62d2beb0cb41e39af297b5684 Mon Sep 17 00:00:00 2001 From: "leon.kepler" Date: Tue, 7 May 2024 15:36:17 +0800 
Subject: [PATCH 07/33] new --- .../reranker/embedding_reranker.ipynb | 244 ++++++++++++++++++ FlagEmbedding/reranker/embedding_run.py | 127 +++++++++ 2 files changed, 371 insertions(+) create mode 100644 FlagEmbedding/reranker/embedding_reranker.ipynb create mode 100644 FlagEmbedding/reranker/embedding_run.py diff --git a/FlagEmbedding/reranker/embedding_reranker.ipynb b/FlagEmbedding/reranker/embedding_reranker.ipynb new file mode 100644 index 00000000..5a9f3c97 --- /dev/null +++ b/FlagEmbedding/reranker/embedding_reranker.ipynb @@ -0,0 +1,244 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "import logging\n", + "\n", + "import torch\n", + "from torch import nn\n", + "from transformers import AutoModelForSequenceClassification, PreTrainedModel, TrainingArguments, AutoTokenizer\n", + "from transformers.modeling_outputs import SequenceClassifierOutput" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "model = AutoModelForSequenceClassification.from_pretrained(\"/mnt/bn/data-tns-live-llm/leon/experiments/llm/fcbank/reranker_group30_batch2_v100\", torch_dtype=torch.float16, device_map=\"auto\")\n", + "tokenizer = AutoTokenizer.from_pretrained(\"/mnt/bn/data-tns-live-llm/leon/experiments/llm/fcbank/reranker_group30_batch2_v100\")" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "import torch\n", + "from transformers import AutoModelForSequenceClassification\n", + "\n", + "# 假设 model 是已经加载的模型\n", + "# model = AutoModelForSequenceClassification.from_pretrained(...)\n", + "\n", + "# 假设 group_size 是你的模型处理的样本数量\n", + "group_size = 15 # 根据实际情况设置\n", + "batch_size = 2\n", + "\n", + "# 假设你已经有一个文本序列和对应的标签\n", + "text = \"这是一个示例文本。\"\n", + "label = 1 # 假设标签是1\n", + "\n", + "# 使用模型的分词器对文本进行编码\n", + "encoded_input = tokenizer(text, padding='max_length', truncation=True, max_length=512, return_tensors='pt')\n", + "\n", + "# 调整编码后的输入以匹配 group_size\n", + "input_ids = encoded_input['input_ids'].repeat_interleave(batch_size*group_size, dim=0).cuda()\n", + "attention_mask = encoded_input['attention_mask'].repeat_interleave(batch_size*group_size, dim=0).cuda()\n", + "\n", + "print(type(input_ids))\n", + "\n", + "# 创建 batch 字典\n", + "batch = {\n", + " 'input_ids': input_ids,\n", + " 'attention_mask': attention_mask,\n", + "}\n", + "\n", + "labels = torch.tensor([label]*batch_size, dtype=torch.long).cuda()" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": {}, + "outputs": [], + "source": [ + "def get_embedding(input_ids, attention_mask, model=model, tokenizer=tokenizer):\n", + " hidden_state = model(input_ids=input_ids, attention_mask=attention_mask, output_hidden_states=True).hidden_states[-1].cpu()\n", + " attention_mask = attention_mask.cpu()\n", + " seq_lengths = attention_mask.sum(dim=1)\n", + " embeddings = []\n", + " for seq_len, seq_emb in zip(seq_lengths, hidden_state):\n", + " valid_emb = seq_emb[:seq_len]\n", + " embeddings.append(torch.mean(valid_emb, dim=0))\n", + "\n", + " embedding = torch.stack(embeddings)\n", + " return embedding" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [], + "source": [ + "def forward(batch):\n", + " cross_entropy = nn.CrossEntropyLoss(reduction='mean')\n", + " embeddings = get_embedding(**batch)\n", + " loss = 
batchloss(embeddings)\n", + " return loss" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor([[-0.2715, -0.0060, -0.6865, ..., -0.9995, -0.5493, 0.4402],\n", + " [-0.2715, -0.0060, -0.6865, ..., -0.9995, -0.5493, 0.4402],\n", + " [-0.2715, -0.0060, -0.6865, ..., -0.9995, -0.5493, 0.4402],\n", + " ...,\n", + " [-0.2715, -0.0060, -0.6865, ..., -0.9995, -0.5493, 0.4402],\n", + " [-0.2715, -0.0060, -0.6865, ..., -0.9995, -0.5493, 0.4402],\n", + " [-0.2715, -0.0060, -0.6865, ..., -0.9995, -0.5493, 0.4402]],\n", + " dtype=torch.float16, grad_fn=)\n", + "torch.Size([30, 768])\n" + ] + } + ], + "source": [ + "results = get_embedding(**batch)\n", + "print(results)\n", + "print(results.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "torch.Size([2, 15, 768])\n" + ] + } + ], + "source": [ + "pred = results.view(batch_size, group_size, -1)\n", + "print(pred.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "import torch.nn.functional as F\n", + "\n", + "def infoNCELoss(anchor, positive, negatives, temperature=1):\n", + " # 计算所有样本的相似度\n", + " pos_similarity = F.cosine_similarity(anchor, positive, dim=-1)\n", + " # 将anchor重复到与负样本相同数量的维度,以便计算\n", + " neg_similarity = F.cosine_similarity(anchor, negatives, dim=-1)\n", + " # 合并正样本和负样本的相似度\n", + " all_similarity = torch.cat([pos_similarity, neg_similarity])\n", + " # 应用温度缩放\n", + " all_similarity /= temperature\n", + " # 计算InfoNCE损失\n", + " loss = - torch.log(torch.exp(pos_similarity)/torch.sum(torch.exp(all_similarity)))\n", + " return loss.mean()\n", + "\n", + "def batchloss(embeddings):\n", + " # 遍历每个batch计算损失\n", + " losses = []\n", + " for i in range(embeddings.size(0)):\n", + " # anchor embeddings\n", + " anchor = embeddings[i, 0].unsqueeze(0) # [1, 768]\n", + " # positive embeddings\n", + " positive = embeddings[i, 1].unsqueeze(0) # [1, 768]\n", + " # 除了anchor和positive之外的所有embeddings作为负样本\n", + " negatives = embeddings[i, 2:] # [13, 768]\n", + " # 计算当前batch的InfoNCE损失\n", + " loss = infoNCELoss(anchor, positive, negatives)\n", + " losses.append(loss)\n", + " # 计算整个batch的平均损失\n", + " batch_loss = torch.mean(torch.stack(losses))\n", + " return batch_loss" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor(2.6431)\n" + ] + } + ], + "source": [ + "# 假设 embeddings 是一个形状为 [batch, group_size, embedding_len] 的张量\n", + "embeddings = torch.randn(2, 15, 768) # 示例数据\n", + "print(batchloss(embeddings))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.9.2 64-bit", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.2" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/FlagEmbedding/reranker/embedding_run.py 
b/FlagEmbedding/reranker/embedding_run.py new file mode 100644 index 00000000..e0146043 --- /dev/null +++ b/FlagEmbedding/reranker/embedding_run.py @@ -0,0 +1,127 @@ +import logging +import os +from pathlib import Path + +from transformers import AutoConfig, AutoTokenizer, TrainingArguments +from transformers import ( + HfArgumentParser, + set_seed, +) + +from arguments import ModelArguments, DataArguments +from data import TrainDatasetForCE, GroupCollator +from modeling import CrossEncoder +from trainer import CETrainer + +logger = logging.getLogger(__name__) +from pprint import pprint as pp +import sys +sys.path.append("/opt/tiger/FlagEmbedding") +from utils import get_complete_last_checkpoint +import transformers + +def safe_save_model_for_hf_trainer(trainer: transformers.Trainer, output_dir: str): + """Collects the state dict and dump to disk.""" + state_dict = trainer.model.state_dict() + if trainer.args.should_save: + cpu_state_dict = {key: value.cpu() for key, value in state_dict.items()} + del state_dict + trainer._save(output_dir, state_dict=cpu_state_dict) + +def main(): + parser = HfArgumentParser((ModelArguments, DataArguments, TrainingArguments)) + model_args, data_args, training_args = parser.parse_args_into_dataclasses() + model_args: ModelArguments + data_args: DataArguments + training_args: TrainingArguments + + # for args in (model_args, data_args, training_args): pp(args) + + # check and load checkpoint + last_checkpoint = None + if os.path.isdir(training_args.output_dir) and not training_args.overwrite_output_dir: + last_checkpoint = get_complete_last_checkpoint(training_args.output_dir) + if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: + logger.info( + f"Output directory ({training_args.output_dir}) already exists and is empty." + "Train from scratch" + ) + elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: + logger.info( + f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " + "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." 
+ ) + + # Setup logging + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN, + ) + logger.warning( + "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", + training_args.local_rank, + training_args.device, + training_args.n_gpu, + bool(training_args.local_rank != -1), + training_args.fp16, + ) + logger.info("Training/evaluation parameters %s", training_args) + logger.info("Model parameters %s", model_args) + logger.info("Data parameters %s", data_args) + + set_seed(training_args.seed) + + num_labels = 1 + + tokenizer = AutoTokenizer.from_pretrained( + model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, + cache_dir=model_args.cache_dir, + use_fast=False, + ) + config = AutoConfig.from_pretrained( + model_args.config_name if model_args.config_name else model_args.model_name_or_path, + num_labels=num_labels, + cache_dir=model_args.cache_dir, + trust_remote_code=True + ) + _model_class = CrossEncoder + + model = _model_class.from_pretrained( + model_args, data_args, training_args, + model_args.model_name_or_path, + from_tf=bool(".ckpt" in model_args.model_name_or_path), + config=config, + cache_dir=model_args.cache_dir, + trust_remote_code=True + ) + + checkpoint = None + if training_args.resume_from_checkpoint is not None: + logger.info(f"train start from {training_args.resume_from_checkpoint}") + checkpoint = training_args.resume_from_checkpoint + elif last_checkpoint is not None: + logger.info(f"train start from {last_checkpoint}") + checkpoint = last_checkpoint + + train_dataset = TrainDatasetForCE(data_args, tokenizer=tokenizer) + _trainer_class = CETrainer + + trainer = _trainer_class( + model=model, + args=training_args, + train_dataset=train_dataset, + data_collator=GroupCollator(tokenizer), + tokenizer=tokenizer + ) + trainer.train(resume_from_checkpoint=checkpoint) + trainer.save_state() + safe_save_model_for_hf_trainer(trainer=trainer, output_dir=training_args.output_dir) + # Path(training_args.output_dir).mkdir(parents=True, exist_ok=True) + + # trainer.train() + # trainer.save_model() + + +if __name__ == "__main__": + main() From f47647b40a4b702ce5af47b9a47120ef976d8ad1 Mon Sep 17 00:00:00 2001 From: "leon.kepler" Date: Tue, 7 May 2024 15:53:39 +0800 Subject: [PATCH 08/33] new --- FlagEmbedding/reranker/data.py | 12 ++++++ FlagEmbedding/reranker/embedding_run.py | 14 +++---- FlagEmbedding/reranker/modeling.py | 49 +++++++++++++++++++++++++ reranker_pretrain.py | 4 +- 4 files changed, 69 insertions(+), 10 deletions(-) diff --git a/FlagEmbedding/reranker/data.py b/FlagEmbedding/reranker/data.py index 2dcc0620..0af1645b 100644 --- a/FlagEmbedding/reranker/data.py +++ b/FlagEmbedding/reranker/data.py @@ -62,6 +62,18 @@ def __getitem__(self, item) -> List[BatchEncoding]: return batch_data +class TrainDatasetForCL(TrainDatasetForCE): + def __getitem__(self, item) -> List[BatchEncoding]: + query = self.dataset[item]['query'] + pos = random.choice(self.dataset[item]['pos']) + if len(self.dataset[item]['neg']) < self.args.train_group_size - 1: + num = math.ceil((self.args.train_group_size - 1) / len(self.dataset[item]['neg'])) + negs = random.sample(self.dataset[item]['neg'] * num, self.args.train_group_size - 1) + else: + negs = random.sample(self.dataset[item]['neg'], self.args.train_group_size - 1) + + batch_data = self.tokenizer([query]+[pos]+negs) + return batch_data 
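# A quick sketch of what the new dataset returns at this point (illustrative only;
# `tokenizer` and `data_args` are assumed to be set up as in run.py):
#
#   dataset = TrainDatasetForCL(data_args, tokenizer=tokenizer)
#   item = dataset[0]
#   # Here __getitem__ tokenizes [query] + [pos] + negs in a single call, so `item`
#   # is one BatchEncoding whose fields each hold train_group_size + 1 sequences,
#   # query first, positive second, negatives after that. A later patch switches this
#   # to a list of per-sequence encodings so that GroupCollator (below), which flattens
#   # nested lists via sum(features, []), can pad them into a batch of
#   # batch_size * (train_group_size + 1) rows.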
 @dataclass
diff --git a/FlagEmbedding/reranker/embedding_run.py b/FlagEmbedding/reranker/embedding_run.py
index e0146043..6b50f2e5 100644
--- a/FlagEmbedding/reranker/embedding_run.py
+++ b/FlagEmbedding/reranker/embedding_run.py
@@ -7,10 +7,11 @@
     HfArgumentParser,
     set_seed,
 )
+from FlagEmbedding.FlagEmbedding.reranker.data import TrainDatasetForCL
 
 from arguments import ModelArguments, DataArguments
 from data import TrainDatasetForCE, GroupCollator
-from modeling import CrossEncoder
+from modeling import CLEncoder, CrossEncoder
 from trainer import CETrainer
 
 logger = logging.getLogger(__name__)
@@ -85,7 +86,7 @@ def main():
         cache_dir=model_args.cache_dir,
         trust_remote_code=True
     )
-    _model_class = CrossEncoder
+    _model_class = CLEncoder
 
     model = _model_class.from_pretrained(
         model_args, data_args, training_args,
@@ -104,24 +105,19 @@ def main():
         logger.info(f"train start from {last_checkpoint}")
         checkpoint = last_checkpoint
 
-    train_dataset = TrainDatasetForCE(data_args, tokenizer=tokenizer)
+    train_dataset = TrainDatasetForCL(data_args, tokenizer=tokenizer)
     _trainer_class = CETrainer
 
     trainer = _trainer_class(
         model=model,
         args=training_args,
         train_dataset=train_dataset,
-        data_collator=GroupCollator(tokenizer),
+        data_collator=GroupCollator(tokenizer),  # the collator still flattens the grouped examples
         tokenizer=tokenizer
     )
     trainer.train(resume_from_checkpoint=checkpoint)
     trainer.save_state()
     safe_save_model_for_hf_trainer(trainer=trainer, output_dir=training_args.output_dir)
-    # Path(training_args.output_dir).mkdir(parents=True, exist_ok=True)
-
-    # trainer.train()
-    # trainer.save_model()
-
 
 if __name__ == "__main__":
     main()
diff --git a/FlagEmbedding/reranker/modeling.py b/FlagEmbedding/reranker/modeling.py
index 931523f2..c516cbb5 100644
--- a/FlagEmbedding/reranker/modeling.py
+++ b/FlagEmbedding/reranker/modeling.py
@@ -6,6 +6,8 @@
 from transformers.modeling_outputs import SequenceClassifierOutput
 
 from arguments import ModelArguments, DataArguments
+import torch
+import torch.nn.functional as F
 
 logger = logging.getLogger(__name__)
 
@@ -65,3 +67,50 @@ def save_pretrained(self, output_dir: str):
                                 for k, v in state_dict.items()})
         self.hf_model.save_pretrained(output_dir, state_dict=state_dict)
+
+class CLEncoder(CrossEncoder):
+    def get_embedding(self, input_ids, attention_mask):
+        hidden_state = self.hf_model(input_ids=input_ids, attention_mask=attention_mask, output_hidden_states=True).hidden_states[-1].cpu()
+        attention_mask = attention_mask.cpu()
+        seq_lengths = attention_mask.sum(dim=1)
+        embeddings = []
+        for seq_len, seq_emb in zip(seq_lengths, hidden_state):
+            valid_emb = seq_emb[:seq_len]
+            embeddings.append(torch.mean(valid_emb, dim=0))
+        embeddings = torch.stack(embeddings)
+        return embeddings
+
+    def infoNCELoss(self, anchor, positive, negatives, temperature=1):
+        # similarity between the anchor and the positive sample
+        pos_similarity = F.cosine_similarity(anchor, positive, dim=-1)
+        # broadcast the anchor against every negative sample
+        neg_similarity = F.cosine_similarity(anchor, negatives, dim=-1)
+        # concatenate the positive and negative similarities
+        all_similarity = torch.cat([pos_similarity, neg_similarity])
+        # apply temperature scaling
+        all_similarity /= temperature
+        # InfoNCE loss
+        loss = - torch.log(torch.exp(pos_similarity)/torch.sum(torch.exp(all_similarity)))
+        return loss.mean()
+
+    def batchloss(self, embeddings):
+        # compute the loss group by group
+        losses = []
+        for i in range(embeddings.size(0)):
+            # anchor embeddings
+            anchor = embeddings[i, 0].unsqueeze(0)  # [1, 768]
+            # positive embeddings
+            positive = embeddings[i, 1].unsqueeze(0)  # [1, 768]
+            # every embedding other than the anchor and the positive is a negative
+            negatives = embeddings[i, 2:]  # [len(negs), 768]
+ # 计算当前batch的InfoNCE损失 + loss = self.infoNCELoss(anchor, positive, negatives) + losses.append(loss) + # 计算整个batch的平均损失 + batch_loss = torch.mean(torch.stack(losses)) + return batch_loss + + def forward(self, batch): + embeddings = self.get_embedding(**batch) + loss = self.batchloss(embeddings) + return loss \ No newline at end of file diff --git a/reranker_pretrain.py b/reranker_pretrain.py index b1e554da..f2e92f1a 100755 --- a/reranker_pretrain.py +++ b/reranker_pretrain.py @@ -19,6 +19,8 @@ def find_unused_port(start_port=37625): args = (" ").join(sys.argv[1:]) print(args) +script_name = "embedding_run" + # 使用示例 port = find_unused_port() if port: @@ -34,7 +36,7 @@ def find_unused_port(start_port=37625): # 构建训练命令 command = f""" -torchrun --master-port={port} --nproc_per_node {num_gpus} /opt/tiger/FlagEmbedding/FlagEmbedding/reranker/run.py {args} +torchrun --master-port={port} --nproc_per_node {num_gpus} /opt/tiger/FlagEmbedding/FlagEmbedding/reranker/{script_name}.py {args} """ # 执行命令 From 903d6d73dd708f38fc1725ca610841b865e2c892 Mon Sep 17 00:00:00 2001 From: "leon.kepler" Date: Tue, 7 May 2024 16:55:07 +0800 Subject: [PATCH 09/33] new --- .../reranker/embedding_reranker.ipynb | 136 ++++++++++++++++++ FlagEmbedding/reranker/embedding_run.py | 3 +- FlagEmbedding/reranker/modeling.py | 13 +- 3 files changed, 148 insertions(+), 4 deletions(-) diff --git a/FlagEmbedding/reranker/embedding_reranker.ipynb b/FlagEmbedding/reranker/embedding_reranker.ipynb index 5a9f3c97..e26212b2 100644 --- a/FlagEmbedding/reranker/embedding_reranker.ipynb +++ b/FlagEmbedding/reranker/embedding_reranker.ipynb @@ -206,6 +206,142 @@ "print(batchloss(embeddings))" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 去掉模型的分类头" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "XLMRobertaModel(\n", + " (embeddings): XLMRobertaEmbeddings(\n", + " (word_embeddings): Embedding(250002, 768, padding_idx=1)\n", + " (position_embeddings): Embedding(514, 768, padding_idx=1)\n", + " (token_type_embeddings): Embedding(1, 768)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (encoder): XLMRobertaEncoder(\n", + " (layer): ModuleList(\n", + " (0-11): 12 x XLMRobertaLayer(\n", + " (attention): XLMRobertaAttention(\n", + " (self): XLMRobertaSelfAttention(\n", + " (query): Linear(in_features=768, out_features=768, bias=True)\n", + " (key): Linear(in_features=768, out_features=768, bias=True)\n", + " (value): Linear(in_features=768, out_features=768, bias=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (output): XLMRobertaSelfOutput(\n", + " (dense): Linear(in_features=768, out_features=768, bias=True)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " (intermediate): XLMRobertaIntermediate(\n", + " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", + " (intermediate_act_fn): GELUActivation()\n", + " )\n", + " (output): XLMRobertaOutput(\n", + " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " )\n", + " )\n", + " (pooler): XLMRobertaPooler(\n", + " (dense): Linear(in_features=768, out_features=768, 
bias=True)\n", + " (activation): Tanh()\n", + " )\n", + ")\n" + ] + } + ], + "source": [ + "from transformers import XLMRobertaForSequenceClassification, AutoModel, AutoTokenizer\n", + "import torch\n", + "model = AutoModel.from_pretrained('/mnt/bn/data-tns-live-llm/leon/experiments/llm/fcbank/xlmr/models--FacebookAI--xlm-roberta-base/snapshots/e73636d4f797dec63c3081bb6ed5c7b0bb3f2089/', torch_dtype=torch.float16)\n", + "tokenizer = AutoTokenizer.from_pretrained(\"/mnt/bn/data-tns-live-llm/leon/experiments/llm/fcbank/xlmr/models--FacebookAI--xlm-roberta-base/snapshots/e73636d4f797dec63c3081bb6ed5c7b0bb3f2089/\")\n", + "# print(type(model.modules()))\n", + "print(model)\n", + "# print(model.roberta)\n", + "# print(model.classifier)" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "XLMRobertaForSequenceClassification(\n", + " (roberta): XLMRobertaModel(\n", + " (embeddings): XLMRobertaEmbeddings(\n", + " (word_embeddings): Embedding(250002, 768, padding_idx=1)\n", + " (position_embeddings): Embedding(514, 768, padding_idx=1)\n", + " (token_type_embeddings): Embedding(1, 768)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (encoder): XLMRobertaEncoder(\n", + " (layer): ModuleList(\n", + " (0-11): 12 x XLMRobertaLayer(\n", + " (attention): XLMRobertaAttention(\n", + " (self): XLMRobertaSelfAttention(\n", + " (query): Linear(in_features=768, out_features=768, bias=True)\n", + " (key): Linear(in_features=768, out_features=768, bias=True)\n", + " (value): Linear(in_features=768, out_features=768, bias=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (output): XLMRobertaSelfOutput(\n", + " (dense): Linear(in_features=768, out_features=768, bias=True)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " (intermediate): XLMRobertaIntermediate(\n", + " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", + " (intermediate_act_fn): GELUActivation()\n", + " )\n", + " (output): XLMRobertaOutput(\n", + " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " )\n", + " )\n", + " )\n", + ")\n" + ] + } + ], + "source": [ + "del model.classifier\n", + "print(model)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sentence_transformers import SentenceTransformer, models" + ] + }, { "cell_type": "code", "execution_count": null, diff --git a/FlagEmbedding/reranker/embedding_run.py b/FlagEmbedding/reranker/embedding_run.py index 6b50f2e5..83ae86dd 100644 --- a/FlagEmbedding/reranker/embedding_run.py +++ b/FlagEmbedding/reranker/embedding_run.py @@ -7,8 +7,6 @@ HfArgumentParser, set_seed, ) -from FlagEmbedding.FlagEmbedding.reranker.data import TrainDatasetForCL - from arguments import ModelArguments, DataArguments from data import TrainDatasetForCE, GroupCollator from modeling import CLEncoder, CrossEncoder @@ -18,6 +16,7 @@ from pprint import pprint as pp import sys sys.path.append("/opt/tiger/FlagEmbedding") +from FlagEmbedding.reranker.data import TrainDatasetForCL from utils import get_complete_last_checkpoint import transformers diff --git 
a/FlagEmbedding/reranker/modeling.py b/FlagEmbedding/reranker/modeling.py index c516cbb5..d351dff0 100644 --- a/FlagEmbedding/reranker/modeling.py +++ b/FlagEmbedding/reranker/modeling.py @@ -2,7 +2,7 @@ import torch from torch import nn -from transformers import AutoModelForSequenceClassification, PreTrainedModel, TrainingArguments +from transformers import AutoModelForSequenceClassification, PreTrainedModel, TrainingArguments, AutoModel from transformers.modeling_outputs import SequenceClassifierOutput from arguments import ModelArguments, DataArguments @@ -56,7 +56,7 @@ def from_pretrained( cls, model_args: ModelArguments, data_args: DataArguments, train_args: TrainingArguments, *args, **kwargs ): - hf_model = AutoModelForSequenceClassification.from_pretrained(*args, **kwargs) + hf_model = AutoModelForSequenceClassification.from_pretrained(*args, **kwargs) #XLMR本身不带分类头 reranker = cls(hf_model, model_args, data_args, train_args) return reranker @@ -69,6 +69,15 @@ def save_pretrained(self, output_dir: str): self.hf_model.save_pretrained(output_dir, state_dict=state_dict) class CLEncoder(CrossEncoder): + @classmethod + def from_pretrained( + cls, model_args: ModelArguments, data_args: DataArguments, train_args: TrainingArguments, + *args, **kwargs + ): + hf_model = AutoModel.from_pretrained(*args, **kwargs) + reranker = cls(hf_model, model_args, data_args, train_args) + return reranker + def get_embedding(self, input_ids, attention_mask): hidden_state = self.hf_model(input_ids=input_ids, attention_mask=attention_mask, output_hidden_states=True).hidden_states[-1].cpu() attention_mask = attention_mask.cpu() From 135091511eb1132f4f42cf34d63153ff3bdee725 Mon Sep 17 00:00:00 2001 From: "leon.kepler" Date: Tue, 7 May 2024 18:09:01 +0800 Subject: [PATCH 10/33] new --- .gitignore | 1 + FlagEmbedding/reranker/data.py | 16 ++++++++++++++-- FlagEmbedding/reranker/embedding_reranker.ipynb | 17 +++++++++++++++-- FlagEmbedding/reranker/embedding_run.py | 8 +++++--- FlagEmbedding/reranker/modeling.py | 16 ++++++++++++++++ FlagEmbedding/reranker/trainer.py | 4 ++++ reranker_pretrain.py | 9 +++++++-- 7 files changed, 62 insertions(+), 9 deletions(-) diff --git a/.gitignore b/.gitignore index 9b4fde65..0ee3e91c 100644 --- a/.gitignore +++ b/.gitignore @@ -136,3 +136,4 @@ pic2.py # Pyre type checker .pyre/ +wandb/ \ No newline at end of file diff --git a/FlagEmbedding/reranker/data.py b/FlagEmbedding/reranker/data.py index 0af1645b..ada795e0 100644 --- a/FlagEmbedding/reranker/data.py +++ b/FlagEmbedding/reranker/data.py @@ -63,6 +63,15 @@ def __getitem__(self, item) -> List[BatchEncoding]: return batch_data class TrainDatasetForCL(TrainDatasetForCE): + def create_one_example(self, input): + item = self.tokenizer( + input, + truncation=True, + max_length=self.args.max_len, + padding=False, + ) + return item + def __getitem__(self, item) -> List[BatchEncoding]: query = self.dataset[item]['query'] pos = random.choice(self.dataset[item]['pos']) @@ -71,8 +80,10 @@ def __getitem__(self, item) -> List[BatchEncoding]: negs = random.sample(self.dataset[item]['neg'] * num, self.args.train_group_size - 1) else: negs = random.sample(self.dataset[item]['neg'], self.args.train_group_size - 1) - - batch_data = self.tokenizer([query]+[pos]+negs) + batch_data = [] + batch_data.append(self.create_one_example(query)) + batch_data.append(self.create_one_example(pos)) + for neg in negs: batch_data.append(self.create_one_example(neg)) return batch_data @@ -81,6 +92,7 @@ class GroupCollator(DataCollatorWithPadding): def __call__( 
self, features ) -> Tuple[Dict[str, torch.Tensor], Dict[str, torch.Tensor]]: + assert type(features[0]) == list if isinstance(features[0], list): features = sum(features, []) return super().__call__(features) diff --git a/FlagEmbedding/reranker/embedding_reranker.ipynb b/FlagEmbedding/reranker/embedding_reranker.ipynb index e26212b2..a3fad8f5 100644 --- a/FlagEmbedding/reranker/embedding_reranker.ipynb +++ b/FlagEmbedding/reranker/embedding_reranker.ipynb @@ -335,19 +335,32 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "from sentence_transformers import SentenceTransformer, models" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 看看 Data 构造" + ] + }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "query = \"hello\"\n", + "pos = \"hello\"\n", + "negs = [\"hello\",\"hello\"]\n", + "\n", + "batch_data = tokenizer([query]+[pos]+negs, padding=True)" + ] } ], "metadata": { diff --git a/FlagEmbedding/reranker/embedding_run.py b/FlagEmbedding/reranker/embedding_run.py index 83ae86dd..7c97211b 100644 --- a/FlagEmbedding/reranker/embedding_run.py +++ b/FlagEmbedding/reranker/embedding_run.py @@ -9,8 +9,8 @@ ) from arguments import ModelArguments, DataArguments from data import TrainDatasetForCE, GroupCollator -from modeling import CLEncoder, CrossEncoder -from trainer import CETrainer +from modeling import CLEncoder +from trainer import CLTrainer logger = logging.getLogger(__name__) from pprint import pprint as pp @@ -79,6 +79,7 @@ def main(): cache_dir=model_args.cache_dir, use_fast=False, ) + config = AutoConfig.from_pretrained( model_args.config_name if model_args.config_name else model_args.model_name_or_path, num_labels=num_labels, @@ -105,7 +106,7 @@ def main(): checkpoint = last_checkpoint train_dataset = TrainDatasetForCL(data_args, tokenizer=tokenizer) - _trainer_class = CETrainer + _trainer_class = CLTrainer trainer = _trainer_class( model=model, @@ -114,6 +115,7 @@ def main(): data_collator=GroupCollator(tokenizer), #这里依旧是拍平 tokenizer=tokenizer ) + checkpoint=None trainer.train(resume_from_checkpoint=checkpoint) trainer.save_state() safe_save_model_for_hf_trainer(trainer=trainer, output_dir=training_args.output_dir) diff --git a/FlagEmbedding/reranker/modeling.py b/FlagEmbedding/reranker/modeling.py index d351dff0..91a63c2e 100644 --- a/FlagEmbedding/reranker/modeling.py +++ b/FlagEmbedding/reranker/modeling.py @@ -69,6 +69,15 @@ def save_pretrained(self, output_dir: str): self.hf_model.save_pretrained(output_dir, state_dict=state_dict) class CLEncoder(CrossEncoder): + def __init__(self, hf_model: PreTrainedModel, model_args: ModelArguments, data_args: DataArguments, + train_args: TrainingArguments): + super(CrossEncoder, self).__init__() + self.hf_model = hf_model + self.model_args = model_args + self.train_args = train_args + self.data_args = data_args + self.config = self.hf_model.config + @classmethod def from_pretrained( cls, model_args: ModelArguments, data_args: DataArguments, train_args: TrainingArguments, @@ -95,6 +104,8 @@ def infoNCELoss(self, anchor, positive, negatives, temperature=1): # 将anchor重复到与负样本相同数量的维度,以便计算 neg_similarity = F.cosine_similarity(anchor, negatives, dim=-1) # 合并正样本和负样本的相似度 + print(pos_similarity.shape) + print(neg_similarity.shape) all_similarity = torch.cat([pos_similarity, neg_similarity]) # 应用温度缩放 all_similarity /= temperature @@ -113,6 +124,9 @@ def batchloss(self, embeddings): # 
every embedding other than the anchor and the positive is a negative
             negatives = embeddings[i, 2:]  # [len(negs), 768]
             # InfoNCE loss for the current group
+            print("anchor", anchor.shape)
+            print("positive", positive.shape)
+            print("negatives", negatives.shape)
             loss = self.infoNCELoss(anchor, positive, negatives)
             losses.append(loss)
         # average loss over the whole batch
@@ -121,5 +135,7 @@ def batchloss(self, embeddings):
 
     def forward(self, batch):
         embeddings = self.get_embedding(**batch)
+        embeddings = embeddings.reshape(self.train_args.per_device_train_batch_size, self.data_args.train_group_size+1, -1)
+        print("embeddings", embeddings.shape)
         loss = self.batchloss(embeddings)
         return loss
\ No newline at end of file
diff --git a/FlagEmbedding/reranker/trainer.py b/FlagEmbedding/reranker/trainer.py
index dbbcb81a..12ef6c5f 100644
--- a/FlagEmbedding/reranker/trainer.py
+++ b/FlagEmbedding/reranker/trainer.py
@@ -33,3 +33,7 @@ def compute_loss(self, model: CrossEncoder, inputs):
     def _load_from_checkpoint(self, resume_from_checkpoint, model=None):
         model = self.model.hf_model
         super()._load_from_checkpoint(resume_from_checkpoint, model)
+
+class CLTrainer(CETrainer):
+    def compute_loss(self, model: CrossEncoder, inputs):
+        return model(inputs)
\ No newline at end of file
diff --git a/reranker_pretrain.py b/reranker_pretrain.py
index f2e92f1a..29834346 100755
--- a/reranker_pretrain.py
+++ b/reranker_pretrain.py
@@ -16,8 +16,7 @@ def find_unused_port(start_port=37625):
         return port
     return None  # return None if no free port was found
 
-args = (" ").join(sys.argv[1:])
-print(args)
+# args = (" ").join(sys.argv[1:])
 
 script_name = "embedding_run"
 
@@ -34,6 +33,12 @@ def find_unused_port(start_port=37625):
 
 if not os.path.exists("/opt/tiger/train_15neg"): os.system("cp -r /mnt/bn/data-tns-live-llm/leon/experiments/llm/fcbank/train_15neg /opt/tiger/train_15neg")
 
+#——————————————————————————————————————————————————debug——————————————————————————————————————————————————————————#
+args = "--output_dir /mnt/bn/data-tns-live-llm/leon/experiments/llm/fcbank/toy --model_name_or_path /mnt/bn/data-tns-live-llm/leon/experiments/llm/fcbank/xlmr/models--FacebookAI--xlm-roberta-base/snapshots/e73636d4f797dec63c3081bb6ed5c7b0bb3f2089/ --train_data /opt/tiger/FlagEmbedding/examples/finetune/toy_finetune_data.jsonl --learning_rate 1e-5 --fp16 --num_train_epochs 5 --per_device_train_batch_size 2 --gradient_accumulation_steps 4 --dataloader_drop_last --train_group_size 10 --max_len 512 --weight_decay 0.01 --logging_steps 10 --save_strategy epoch --save_steps 1 --save_total_limit 3"
+print(args)
+num_gpus = 1
+#——————————————————————————————————————————————————debug——————————————————————————————————————————————————————————#
+
 # build the training command
 command = f"""
 torchrun --master-port={port} --nproc_per_node {num_gpus} /opt/tiger/FlagEmbedding/FlagEmbedding/reranker/{script_name}.py {args}
 """
 
 # run the command
 os.system(command)
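The CLEncoder built up over the last few patches scores each group with a cosine-similarity InfoNCE objective; below is a minimal, self-contained sketch of that shape flow with made-up sizes (batch_size 2, train_group_size 4, hidden 8), not values from the actual run.

import torch
import torch.nn.functional as F

embeddings = torch.randn(2, 5, 8)            # (batch, train_group_size + 1, hidden)
anchor = embeddings[:, 0:1, :]                # query embedding of each group
positive = embeddings[:, 1:2, :]              # its positive passage
negatives = embeddings[:, 2:, :]              # the remaining passages are negatives

pos_sim = F.cosine_similarity(anchor, positive, dim=-1)    # (2, 1)
neg_sim = F.cosine_similarity(anchor, negatives, dim=-1)   # (2, 3)
logits = torch.cat([pos_sim, neg_sim], dim=1)              # (2, 4), positive first
# With temperature 1 this matches the per-group loop above: cross-entropy with the
# positive as class 0, averaged over the groups in the batch.
loss = F.cross_entropy(logits, torch.zeros(2, dtype=torch.long))
print(loss)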
modeling import CLEncoder -from trainer import CLTrainer +from trainer import CETrainer logger = logging.getLogger(__name__) from pprint import pprint as pp @@ -19,6 +19,8 @@ from FlagEmbedding.reranker.data import TrainDatasetForCL from utils import get_complete_last_checkpoint import transformers +import os +os.environ["WANDB_DISABLED"]="true" def safe_save_model_for_hf_trainer(trainer: transformers.Trainer, output_dir: str): """Collects the state dict and dump to disk.""" @@ -106,7 +108,7 @@ def main(): checkpoint = last_checkpoint train_dataset = TrainDatasetForCL(data_args, tokenizer=tokenizer) - _trainer_class = CLTrainer + _trainer_class = CETrainer trainer = _trainer_class( model=model, diff --git a/FlagEmbedding/reranker/modeling.py b/FlagEmbedding/reranker/modeling.py index 91a63c2e..db4034ec 100644 --- a/FlagEmbedding/reranker/modeling.py +++ b/FlagEmbedding/reranker/modeling.py @@ -84,6 +84,11 @@ def from_pretrained( *args, **kwargs ): hf_model = AutoModel.from_pretrained(*args, **kwargs) + try: + del hf_model.classifier + except: + print("model has no classifier head") + reranker = cls(hf_model, model_args, data_args, train_args) return reranker @@ -104,8 +109,8 @@ def infoNCELoss(self, anchor, positive, negatives, temperature=1): # 将anchor重复到与负样本相同数量的维度,以便计算 neg_similarity = F.cosine_similarity(anchor, negatives, dim=-1) # 合并正样本和负样本的相似度 - print(pos_similarity.shape) - print(neg_similarity.shape) + # print(pos_similarity.shape) + # print(neg_similarity.shape) all_similarity = torch.cat([pos_similarity, neg_similarity]) # 应用温度缩放 all_similarity /= temperature @@ -124,9 +129,9 @@ def batchloss(self, embeddings): # 除了anchor和positive之外的所有embeddings作为负样本 negatives = embeddings[i, 2:] # [len(negs), 768] # 计算当前batch的InfoNCE损失 - print("anchor", anchor.shape) - print("positive", positive.shape) - print("negatives", negatives.shape) + # print("anchor", anchor.shape) + # print("positive", positive.shape) + # print("negatives", negatives.shape) loss = self.infoNCELoss(anchor, positive, negatives) losses.append(loss) # 计算整个batch的平均损失 @@ -136,6 +141,13 @@ def batchloss(self, embeddings): def forward(self, batch): embeddings = self.get_embedding(**batch) embeddings = embeddings.reshape(self.train_args.per_device_train_batch_size, self.data_args.train_group_size+1, -1) - print("embeddings", embeddings.shape) - loss = self.batchloss(embeddings) - return loss \ No newline at end of file + # print("embeddings", embeddings.shape) + loss = self.batchloss(embeddings).cuda() + #相当于是一个 group_size 个 cls 的多分类任务 + if self.training: + return SequenceClassifierOutput( + loss=loss, + hidden_states=embeddings, + ) + else: + return embeddings \ No newline at end of file diff --git a/FlagEmbedding/reranker/trainer.py b/FlagEmbedding/reranker/trainer.py index 12ef6c5f..60d8e215 100644 --- a/FlagEmbedding/reranker/trainer.py +++ b/FlagEmbedding/reranker/trainer.py @@ -32,8 +32,4 @@ def compute_loss(self, model: CrossEncoder, inputs): def _load_from_checkpoint(self, resume_from_checkpoint, model=None): model = self.model.hf_model - super()._load_from_checkpoint(resume_from_checkpoint, model) - -class CLTrainer(CETrainer): - def compute_loss(self, model: CrossEncoder, inputs): - return model(inputs) \ No newline at end of file + super()._load_from_checkpoint(resume_from_checkpoint, model) \ No newline at end of file diff --git a/reranker_pretrain.py b/reranker_pretrain.py index 29834346..8de922d5 100755 --- a/reranker_pretrain.py +++ b/reranker_pretrain.py @@ -16,7 +16,7 @@ def find_unused_port(start_port=37625): 
return port return None # 如果没有找到,返回None -# args = (" ").join(sys.argv[1:]) +args = (" ").join(sys.argv[1:]) script_name = "embedding_run" @@ -34,9 +34,9 @@ def find_unused_port(start_port=37625): if not os.path.exists("/opt/tiger/train_15neg"): os.system("cp -r /mnt/bn/data-tns-live-llm/leon/experiments/llm/fcbank/train_15neg /opt/tiger/train_15neg") #——————————————————————————————————————————————————debug——————————————————————————————————————————————————————————# -args = "--output_dir /mnt/bn/data-tns-live-llm/leon/experiments/llm/fcbank/toy --model_name_or_path /mnt/bn/data-tns-live-llm/leon/experiments/llm/fcbank/xlmr/models--FacebookAI--xlm-roberta-base/snapshots/e73636d4f797dec63c3081bb6ed5c7b0bb3f2089/ --train_data /opt/tiger/FlagEmbedding/examples/finetune/toy_finetune_data.jsonl --learning_rate 1e-5 --fp16 --num_train_epochs 5 --per_device_train_batch_size 2 --gradient_accumulation_steps 4 --dataloader_drop_last --train_group_size 10 --max_len 512 --weight_decay 0.01 --logging_steps 10 --save_strategy epoch --save_steps 1 --save_total_limit 3" -print(args) -num_gpus = 1 +# args = "--output_dir /mnt/bn/data-tns-live-llm/leon/experiments/llm/fcbank/toy --model_name_or_path /mnt/bn/data-tns-live-llm/leon/experiments/llm/fcbank/xlmr/models--FacebookAI--xlm-roberta-base/snapshots/e73636d4f797dec63c3081bb6ed5c7b0bb3f2089/ --train_data /opt/tiger/FlagEmbedding/examples/finetune/toy_finetune_data.jsonl --learning_rate 1e-5 --fp16 --num_train_epochs 5 --per_device_train_batch_size 2 --gradient_accumulation_steps 4 --dataloader_drop_last --train_group_size 10 --max_len 512 --weight_decay 0.01 --logging_steps 10 --save_strategy epoch --save_steps 1 --save_total_limit 3" +# print(args) +# num_gpus = 1 #——————————————————————————————————————————————————debug——————————————————————————————————————————————————————————# # 构建训练命令 From 4b30eb1bacd87e88ca79d77cab281ce3c5ed535b Mon Sep 17 00:00:00 2001 From: "leon.kepler" Date: Tue, 7 May 2024 20:02:29 +0800 Subject: [PATCH 12/33] new --- FlagEmbedding/reranker/embedding_run.py | 1 - 1 file changed, 1 deletion(-) diff --git a/FlagEmbedding/reranker/embedding_run.py b/FlagEmbedding/reranker/embedding_run.py index 5734c43d..97691803 100644 --- a/FlagEmbedding/reranker/embedding_run.py +++ b/FlagEmbedding/reranker/embedding_run.py @@ -117,7 +117,6 @@ def main(): data_collator=GroupCollator(tokenizer), #这里依旧是拍平 tokenizer=tokenizer ) - checkpoint=None trainer.train(resume_from_checkpoint=checkpoint) trainer.save_state() safe_save_model_for_hf_trainer(trainer=trainer, output_dir=training_args.output_dir) From 9582d941ae029d928b8003a21d710d16977463ef Mon Sep 17 00:00:00 2001 From: "leon.kepler" Date: Tue, 7 May 2024 22:15:50 +0800 Subject: [PATCH 13/33] new --- FlagEmbedding/reranker/data.py | 1 - 1 file changed, 1 deletion(-) diff --git a/FlagEmbedding/reranker/data.py b/FlagEmbedding/reranker/data.py index ada795e0..daf52b44 100644 --- a/FlagEmbedding/reranker/data.py +++ b/FlagEmbedding/reranker/data.py @@ -92,7 +92,6 @@ class GroupCollator(DataCollatorWithPadding): def __call__( self, features ) -> Tuple[Dict[str, torch.Tensor], Dict[str, torch.Tensor]]: - assert type(features[0]) == list if isinstance(features[0], list): features = sum(features, []) return super().__call__(features) From efc0e524735b782a6141ffb48e80afc48a769595 Mon Sep 17 00:00:00 2001 From: "leon.kepler" Date: Thu, 9 May 2024 12:01:57 +0800 Subject: [PATCH 14/33] new --- FlagEmbedding/reranker/modeling.py | 35 +++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 
deletion(-) diff --git a/FlagEmbedding/reranker/modeling.py b/FlagEmbedding/reranker/modeling.py index db4034ec..95f27ce0 100644 --- a/FlagEmbedding/reranker/modeling.py +++ b/FlagEmbedding/reranker/modeling.py @@ -150,4 +150,37 @@ def forward(self, batch): hidden_states=embeddings, ) else: - return embeddings \ No newline at end of file + return embeddings + +# 投影头 +class SimpleResBlock(nn.Module): + def __init__(self, channels=768): + super().__init__() + self.pre_norm = nn.LayerNorm(channels) + + self.proj = nn.Sequential( + nn.Linear(channels, channels), + nn.GELU(), + nn.Linear(channels, channels) + ) + def forward(self, x): + x = self.pre_norm(x) + return x + self.proj(x) + +class CLProjEncoder(CLEncoder): + def __init__(self, hf_model: PreTrainedModel, model_args: ModelArguments, data_args: DataArguments, train_args: TrainingArguments): + super().__init__(hf_model, model_args, data_args, train_args) + self.proj = SimpleResBlock() + + def get_embedding(self, input_ids, attention_mask): + hidden_state = self.hf_model(input_ids=input_ids, attention_mask=attention_mask, output_hidden_states=True).hidden_states[-1].cpu() + attention_mask = attention_mask.cpu() + seq_lengths = attention_mask.sum(dim=1) + embeddings = [] + for seq_len, seq_emb in zip(seq_lengths, hidden_state): + valid_emb = seq_emb[:seq_len] + embeddings.append(torch.mean(valid_emb, dim=0)) + # query的 embedding 还要做一个投影,和 doc 的空间进行对齐 + embeddings[0] = self.proj.forward(embeddings[0]) + embeddings = torch.stack(embeddings) + return embeddings From ea4873db1a235792d7a861c800da09107ac8043a Mon Sep 17 00:00:00 2001 From: "leon.kepler" Date: Thu, 9 May 2024 12:06:55 +0800 Subject: [PATCH 15/33] new --- reranker_pretrain.py | 21 +-------------------- 1 file changed, 1 insertion(+), 20 deletions(-) diff --git a/reranker_pretrain.py b/reranker_pretrain.py index 8de922d5..e9719e62 100755 --- a/reranker_pretrain.py +++ b/reranker_pretrain.py @@ -3,30 +3,11 @@ import os import sys import torch - -def find_unused_port(start_port=37625): - """ - 寻找一个从start_port开始的未使用的端口。 - """ - for port in range(start_port, start_port + 20000): # 检查1000个端口 - with contextlib.suppress(Exception): - for i in range(10): - with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock: - sock.bind(('', port)) - return port - return None # 如果没有找到,返回None - args = (" ").join(sys.argv[1:]) script_name = "embedding_run" # 使用示例 -port = find_unused_port() -if port: - print(f"Found an unused port: {port}") -else: - print("No unused port found.") - num_gpus = torch.cuda.device_count() os.system("cd /opt/tiger/FlagEmbedding") @@ -41,7 +22,7 @@ def find_unused_port(start_port=37625): # 构建训练命令 command = f""" -torchrun --master-port={port} --nproc_per_node {num_gpus} /opt/tiger/FlagEmbedding/FlagEmbedding/reranker/{script_name}.py {args} +torchrun --standalone --nnodes=1 --nproc_per_node {num_gpus} /opt/tiger/FlagEmbedding/FlagEmbedding/reranker/{script_name}.py {args} """ # 执行命令 From d48a58ccdb049a0ce52c7a7636788e37478b7e45 Mon Sep 17 00:00:00 2001 From: "leon.kepler" Date: Thu, 9 May 2024 12:08:33 +0800 Subject: [PATCH 16/33] new --- FlagEmbedding/reranker/embedding_proj_run .py | 125 ++++++++++++++++++ 1 file changed, 125 insertions(+) create mode 100644 FlagEmbedding/reranker/embedding_proj_run .py diff --git a/FlagEmbedding/reranker/embedding_proj_run .py b/FlagEmbedding/reranker/embedding_proj_run .py new file mode 100644 index 00000000..306ddf97 --- /dev/null +++ b/FlagEmbedding/reranker/embedding_proj_run .py @@ -0,0 +1,125 @@ +import logging +import os +from 
pathlib import Path + +from transformers import AutoConfig, AutoTokenizer, TrainingArguments +from transformers import ( + HfArgumentParser, + set_seed, +) +from arguments import ModelArguments, DataArguments +from data import TrainDatasetForCE, GroupCollator +from modeling import CLProjEncoder +from trainer import CETrainer + +logger = logging.getLogger(__name__) +from pprint import pprint as pp +import sys +sys.path.append("/opt/tiger/FlagEmbedding") +from FlagEmbedding.reranker.data import TrainDatasetForCL +from utils import get_complete_last_checkpoint +import transformers +import os +os.environ["WANDB_DISABLED"]="true" + +def safe_save_model_for_hf_trainer(trainer: transformers.Trainer, output_dir: str): + """Collects the state dict and dump to disk.""" + state_dict = trainer.model.state_dict() + if trainer.args.should_save: + cpu_state_dict = {key: value.cpu() for key, value in state_dict.items()} + del state_dict + trainer._save(output_dir, state_dict=cpu_state_dict) + +def main(): + parser = HfArgumentParser((ModelArguments, DataArguments, TrainingArguments)) + model_args, data_args, training_args = parser.parse_args_into_dataclasses() + model_args: ModelArguments + data_args: DataArguments + training_args: TrainingArguments + + # for args in (model_args, data_args, training_args): pp(args) + + # check and load checkpoint + last_checkpoint = None + if os.path.isdir(training_args.output_dir) and not training_args.overwrite_output_dir: + last_checkpoint = get_complete_last_checkpoint(training_args.output_dir) + if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: + logger.info( + f"Output directory ({training_args.output_dir}) already exists and is empty." + "Train from scratch" + ) + elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: + logger.info( + f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " + "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." 
+ ) + + # Setup logging + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN, + ) + logger.warning( + "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", + training_args.local_rank, + training_args.device, + training_args.n_gpu, + bool(training_args.local_rank != -1), + training_args.fp16, + ) + logger.info("Training/evaluation parameters %s", training_args) + logger.info("Model parameters %s", model_args) + logger.info("Data parameters %s", data_args) + + set_seed(training_args.seed) + + num_labels = 1 + + tokenizer = AutoTokenizer.from_pretrained( + model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, + cache_dir=model_args.cache_dir, + use_fast=False, + ) + + config = AutoConfig.from_pretrained( + model_args.config_name if model_args.config_name else model_args.model_name_or_path, + num_labels=num_labels, + cache_dir=model_args.cache_dir, + trust_remote_code=True + ) + _model_class = CLProjEncoder + + model = _model_class.from_pretrained( + model_args, data_args, training_args, + model_args.model_name_or_path, + from_tf=bool(".ckpt" in model_args.model_name_or_path), + config=config, + cache_dir=model_args.cache_dir, + trust_remote_code=True + ) + + checkpoint = None + if training_args.resume_from_checkpoint is not None: + logger.info(f"train start from {training_args.resume_from_checkpoint}") + checkpoint = training_args.resume_from_checkpoint + elif last_checkpoint is not None: + logger.info(f"train start from {last_checkpoint}") + checkpoint = last_checkpoint + + train_dataset = TrainDatasetForCL(data_args, tokenizer=tokenizer) + _trainer_class = CETrainer + + trainer = _trainer_class( + model=model, + args=training_args, + train_dataset=train_dataset, + data_collator=GroupCollator(tokenizer), #这里依旧是拍平 + tokenizer=tokenizer + ) + trainer.train(resume_from_checkpoint=checkpoint) + trainer.save_state() + safe_save_model_for_hf_trainer(trainer=trainer, output_dir=training_args.output_dir) + +if __name__ == "__main__": + main() From 6ac8671dbcf2833f9e36c519f2ffd6233e975d1a Mon Sep 17 00:00:00 2001 From: "leon.kepler" Date: Thu, 9 May 2024 13:22:20 +0800 Subject: [PATCH 17/33] new --- ...ing_proj_run .py => embedding_proj_run.py} | 0 FlagEmbedding/reranker/modeling.py | 23 +++++++++++++++++-- reranker_pretrain.py | 2 +- 3 files changed, 22 insertions(+), 3 deletions(-) rename FlagEmbedding/reranker/{embedding_proj_run .py => embedding_proj_run.py} (100%) diff --git a/FlagEmbedding/reranker/embedding_proj_run .py b/FlagEmbedding/reranker/embedding_proj_run.py similarity index 100% rename from FlagEmbedding/reranker/embedding_proj_run .py rename to FlagEmbedding/reranker/embedding_proj_run.py diff --git a/FlagEmbedding/reranker/modeling.py b/FlagEmbedding/reranker/modeling.py index 95f27ce0..049753a6 100644 --- a/FlagEmbedding/reranker/modeling.py +++ b/FlagEmbedding/reranker/modeling.py @@ -170,7 +170,13 @@ def forward(self, x): class CLProjEncoder(CLEncoder): def __init__(self, hf_model: PreTrainedModel, model_args: ModelArguments, data_args: DataArguments, train_args: TrainingArguments): super().__init__(hf_model, model_args, data_args, train_args) - self.proj = SimpleResBlock() + channels = 768 + self.pre_norm = nn.LayerNorm(channels) + self.proj = nn.Sequential( + nn.Linear(channels, channels), + nn.GELU(), + nn.Linear(channels, channels) + ) def get_embedding(self, 
input_ids, attention_mask): hidden_state = self.hf_model(input_ids=input_ids, attention_mask=attention_mask, output_hidden_states=True).hidden_states[-1].cpu() @@ -181,6 +187,19 @@ def get_embedding(self, input_ids, attention_mask): valid_emb = seq_emb[:seq_len] embeddings.append(torch.mean(valid_emb, dim=0)) # query的 embedding 还要做一个投影,和 doc 的空间进行对齐 - embeddings[0] = self.proj.forward(embeddings[0]) + # torch.autograd.set_detect_anomaly(True) + # embeddings = torch.stack(embeddings).cuda() + # embeddings[0] = self.pre_norm(embeddings[0]) + # embeddings[0] = embeddings[0] + self.proj(embeddings[0]) + + # embeddings = torch.stack(embeddings).cuda() + # 应用 pre_norm 转换,不原地修改 + query_embedding = torch.tensor(embeddings[0]).unsqueeze(0).cuda() + # print("query embeddings", query_embedding.shape) + # 应用 proj 转换,创建一个新的张量来存储更新后的值 + updated_embeddings = self.proj(query_embedding) + # 将 proj 的结果加到 embeddings[0] 上,赋值给 embeddings[0] + embeddings[0] = (query_embedding + updated_embeddings).squeeze(0).cpu() embeddings = torch.stack(embeddings) + # embeddings = embeddings.cpu() return embeddings diff --git a/reranker_pretrain.py b/reranker_pretrain.py index e9719e62..180abd65 100755 --- a/reranker_pretrain.py +++ b/reranker_pretrain.py @@ -5,7 +5,7 @@ import torch args = (" ").join(sys.argv[1:]) -script_name = "embedding_run" +script_name = "embedding_proj_run" # 使用示例 num_gpus = torch.cuda.device_count() From 14ae116c1ef09e55d89d9f4573912466590ada4d Mon Sep 17 00:00:00 2001 From: "leon.kepler" Date: Thu, 9 May 2024 14:32:24 +0800 Subject: [PATCH 18/33] new --- FlagEmbedding/reranker/arguments.py | 1 + FlagEmbedding/reranker/modeling.py | 43 ++++++++++++----------------- FlagEmbedding/reranker/run.py | 17 ++++++++---- reranker_pretrain.py | 4 +-- 4 files changed, 31 insertions(+), 34 deletions(-) diff --git a/FlagEmbedding/reranker/arguments.py b/FlagEmbedding/reranker/arguments.py index ae5d639a..f89554fe 100644 --- a/FlagEmbedding/reranker/arguments.py +++ b/FlagEmbedding/reranker/arguments.py @@ -21,6 +21,7 @@ class ModelArguments: cache_dir: Optional[str] = field( default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"} ) + model_type: str = field(default="CrossEncoder") @dataclass diff --git a/FlagEmbedding/reranker/modeling.py b/FlagEmbedding/reranker/modeling.py index 049753a6..df1b581a 100644 --- a/FlagEmbedding/reranker/modeling.py +++ b/FlagEmbedding/reranker/modeling.py @@ -171,35 +171,26 @@ class CLProjEncoder(CLEncoder): def __init__(self, hf_model: PreTrainedModel, model_args: ModelArguments, data_args: DataArguments, train_args: TrainingArguments): super().__init__(hf_model, model_args, data_args, train_args) channels = 768 - self.pre_norm = nn.LayerNorm(channels) + # self.pre_norm = nn.LayerNorm(channels) self.proj = nn.Sequential( nn.Linear(channels, channels), nn.GELU(), nn.Linear(channels, channels) ) - def get_embedding(self, input_ids, attention_mask): - hidden_state = self.hf_model(input_ids=input_ids, attention_mask=attention_mask, output_hidden_states=True).hidden_states[-1].cpu() - attention_mask = attention_mask.cpu() - seq_lengths = attention_mask.sum(dim=1) - embeddings = [] - for seq_len, seq_emb in zip(seq_lengths, hidden_state): - valid_emb = seq_emb[:seq_len] - embeddings.append(torch.mean(valid_emb, dim=0)) - # query的 embedding 还要做一个投影,和 doc 的空间进行对齐 - # torch.autograd.set_detect_anomaly(True) - # embeddings = torch.stack(embeddings).cuda() - # embeddings[0] = self.pre_norm(embeddings[0]) - # embeddings[0] = embeddings[0] + 
self.proj(embeddings[0]) - - # embeddings = torch.stack(embeddings).cuda() - # 应用 pre_norm 转换,不原地修改 - query_embedding = torch.tensor(embeddings[0]).unsqueeze(0).cuda() - # print("query embeddings", query_embedding.shape) - # 应用 proj 转换,创建一个新的张量来存储更新后的值 - updated_embeddings = self.proj(query_embedding) - # 将 proj 的结果加到 embeddings[0] 上,赋值给 embeddings[0] - embeddings[0] = (query_embedding + updated_embeddings).squeeze(0).cpu() - embeddings = torch.stack(embeddings) - # embeddings = embeddings.cpu() - return embeddings + def forward(self, batch): + embeddings = self.get_embedding(**batch) + embeddings = embeddings.reshape(self.train_args.per_device_train_batch_size, self.data_args.train_group_size+1, -1) + # 对 query 做一投影 + querys = embeddings[:,0,:] + embeddings[:,0,:] = self.proj(querys.cuda()).cpu() + # print("embeddings", embeddings.shape) + loss = self.batchloss(embeddings).cuda() + #相当于是一个 group_size 个 cls 的多分类任务 + if self.training: + return SequenceClassifierOutput( + loss=loss, + hidden_states=embeddings, + ) + else: + return embeddings \ No newline at end of file diff --git a/FlagEmbedding/reranker/run.py b/FlagEmbedding/reranker/run.py index e0146043..023ad96f 100644 --- a/FlagEmbedding/reranker/run.py +++ b/FlagEmbedding/reranker/run.py @@ -9,8 +9,8 @@ ) from arguments import ModelArguments, DataArguments -from data import TrainDatasetForCE, GroupCollator -from modeling import CrossEncoder +from data import TrainDatasetForCE, GroupCollator, TrainDatasetForCL +from modeling import CLEncoder, CLProjEncoder, CrossEncoder from trainer import CETrainer logger = logging.getLogger(__name__) @@ -85,7 +85,16 @@ def main(): cache_dir=model_args.cache_dir, trust_remote_code=True ) - _model_class = CrossEncoder + + if model_args.model_type=="CrossEncoder": + _model_class = CrossEncoder + train_dataset = TrainDatasetForCE(data_args, tokenizer=tokenizer) + elif model_args.model_type == "CLEncoder": + _model_class = CLEncoder + train_dataset = TrainDatasetForCL(data_args, tokenizer=tokenizer) + else: + _model_class = CLProjEncoder + train_dataset = TrainDatasetForCL(data_args, tokenizer=tokenizer) model = _model_class.from_pretrained( model_args, data_args, training_args, @@ -104,9 +113,7 @@ def main(): logger.info(f"train start from {last_checkpoint}") checkpoint = last_checkpoint - train_dataset = TrainDatasetForCE(data_args, tokenizer=tokenizer) _trainer_class = CETrainer - trainer = _trainer_class( model=model, args=training_args, diff --git a/reranker_pretrain.py b/reranker_pretrain.py index 180abd65..f87791b6 100755 --- a/reranker_pretrain.py +++ b/reranker_pretrain.py @@ -5,8 +5,6 @@ import torch args = (" ").join(sys.argv[1:]) -script_name = "embedding_proj_run" - # 使用示例 num_gpus = torch.cuda.device_count() @@ -22,7 +20,7 @@ # 构建训练命令 command = f""" -torchrun --standalone --nnodes=1 --nproc_per_node {num_gpus} /opt/tiger/FlagEmbedding/FlagEmbedding/reranker/{script_name}.py {args} +torchrun --standalone --nnodes=1 --nproc_per_node {num_gpus} /opt/tiger/FlagEmbedding/FlagEmbedding/reranker/run.py {args} """ # 执行命令 From 109ed3a55c2c8b5c5df78a795368bbad5358299e Mon Sep 17 00:00:00 2001 From: "leon.kepler" Date: Thu, 9 May 2024 14:33:37 +0800 Subject: [PATCH 19/33] new --- FlagEmbedding/reranker/run.py | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/FlagEmbedding/reranker/run.py b/FlagEmbedding/reranker/run.py index 023ad96f..91f89d24 100644 --- a/FlagEmbedding/reranker/run.py +++ b/FlagEmbedding/reranker/run.py @@ -85,16 +85,13 @@ def main(): 
cache_dir=model_args.cache_dir, trust_remote_code=True ) - - if model_args.model_type=="CrossEncoder": - _model_class = CrossEncoder - train_dataset = TrainDatasetForCE(data_args, tokenizer=tokenizer) - elif model_args.model_type == "CLEncoder": - _model_class = CLEncoder - train_dataset = TrainDatasetForCL(data_args, tokenizer=tokenizer) - else: - _model_class = CLProjEncoder - train_dataset = TrainDatasetForCL(data_args, tokenizer=tokenizer) + + if model_args.model_type=="CrossEncoder": _model_class = CrossEncoder + elif model_args.model_type == "CLEncoder": _model_class = CLEncoder + else: _model_class = CLProjEncoder + + if model_args.model_type=="CrossEncoder": train_dataset = TrainDatasetForCE(data_args, tokenizer=tokenizer) + else: train_dataset = TrainDatasetForCL(data_args, tokenizer=tokenizer) model = _model_class.from_pretrained( model_args, data_args, training_args, From 6ce21cf6d81e3bccb2867ec4655da41f5d520688 Mon Sep 17 00:00:00 2001 From: "leon.kepler" Date: Thu, 9 May 2024 15:15:48 +0800 Subject: [PATCH 20/33] new --- .gitignore | 3 ++- FlagEmbedding/reranker/modeling.py | 4 ++++ FlagEmbedding/reranker/run.py | 10 +++++++++- 3 files changed, 15 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index 0ee3e91c..17ea18e0 100644 --- a/.gitignore +++ b/.gitignore @@ -136,4 +136,5 @@ pic2.py # Pyre type checker .pyre/ -wandb/ \ No newline at end of file +wandb/ +*.txt \ No newline at end of file diff --git a/FlagEmbedding/reranker/modeling.py b/FlagEmbedding/reranker/modeling.py index df1b581a..3815ae88 100644 --- a/FlagEmbedding/reranker/modeling.py +++ b/FlagEmbedding/reranker/modeling.py @@ -108,6 +108,10 @@ def infoNCELoss(self, anchor, positive, negatives, temperature=1): pos_similarity = F.cosine_similarity(anchor, positive, dim=-1) # 将anchor重复到与负样本相同数量的维度,以便计算 neg_similarity = F.cosine_similarity(anchor, negatives, dim=-1) + import json + with open("/opt/tiger/FlagEmbedding/FlagEmbedding/reranker/sim.txt", 'a') as f: + f.write("pos: "+json.dumps(pos_similarity.item())+"\n") + f.write("neg: "+json.dumps(neg_similarity.tolist())+"\n") # 合并正样本和负样本的相似度 # print(pos_similarity.shape) # print(neg_similarity.shape) diff --git a/FlagEmbedding/reranker/run.py b/FlagEmbedding/reranker/run.py index 91f89d24..14cf97b5 100644 --- a/FlagEmbedding/reranker/run.py +++ b/FlagEmbedding/reranker/run.py @@ -2,7 +2,7 @@ import os from pathlib import Path -from transformers import AutoConfig, AutoTokenizer, TrainingArguments +from transformers import AutoConfig, AutoTokenizer, TrainingArguments, TrainerCallback, TrainerState, TrainerControl from transformers import ( HfArgumentParser, set_seed, @@ -19,6 +19,8 @@ sys.path.append("/opt/tiger/FlagEmbedding") from utils import get_complete_last_checkpoint import transformers +import os +os.environ["WANDB_DISABLED"]="true" def safe_save_model_for_hf_trainer(trainer: transformers.Trainer, output_dir: str): """Collects the state dict and dump to disk.""" @@ -28,6 +30,12 @@ def safe_save_model_for_hf_trainer(trainer: transformers.Trainer, output_dir: st del state_dict trainer._save(output_dir, state_dict=cpu_state_dict) +# 记录similarity pos and neg +# class Loggingback(TrainerCallback): +# def on_train_begin(self, args, state: TrainerState, control: TrainerControl, **kwargs): +# if state.global_step % 10 == 0: +# control. 
+ def main(): parser = HfArgumentParser((ModelArguments, DataArguments, TrainingArguments)) model_args, data_args, training_args = parser.parse_args_into_dataclasses() From 153f434a17e0fa3688622b62f4dcb663a1acf323 Mon Sep 17 00:00:00 2001 From: "leon.kepler" Date: Thu, 9 May 2024 20:19:03 +0800 Subject: [PATCH 21/33] new --- .../reranker/embedding_reranker.ipynb | 220 ++++++++++++++++++ FlagEmbedding/reranker/modeling.py | 21 +- 2 files changed, 234 insertions(+), 7 deletions(-) diff --git a/FlagEmbedding/reranker/embedding_reranker.ipynb b/FlagEmbedding/reranker/embedding_reranker.ipynb index a3fad8f5..b9c6f0d7 100644 --- a/FlagEmbedding/reranker/embedding_reranker.ipynb +++ b/FlagEmbedding/reranker/embedding_reranker.ipynb @@ -361,6 +361,226 @@ "\n", "batch_data = tokenizer([query]+[pos]+negs, padding=True)" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 看看开源的sentence encoder" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "cb32cfe5d0004229bc9805b89f8aada3", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "tokenizer_config.json: 0%| | 0.00/394 [00:00\n", + "n192-024-092:113099:113099 [0] NCCL INFO NET/Plugin : dlerror=libnccl-net.so: cannot open shared object file: No such file or directory No plugin found (libnccl-net.so), using internal implementation\n", + "NCCL version 2.20.5+cuda12.4\n", + "n192-024-092:113099:271766 [0] NCCL INFO NCCL_IB_DISABLE set by environment to 0.\n", + "n192-024-092:113099:271766 [0] NCCL INFO NCCL_SOCKET_FAMILY set by environment to AF_INET6\n", + "n192-024-092:113099:271766 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0\n", + "n192-024-092:113099:271766 [0] NCCL INFO NCCL_IB_HCA set to mlx5_2:1\n", + "n192-024-092:113099:271766 [0] NCCL INFO NET/IB : Using [0]mlx5_2:1/RoCE [RO]; OOB eth0:fdbd:dc61:7:34::92<0>\n", + "n192-024-092:113099:271766 [0] NCCL INFO Using non-device net plugin version 0\n", + "n192-024-092:113099:271766 [0] NCCL INFO Using network IB\n", + "n192-024-092:113099:271767 [1] NCCL INFO Using non-device net plugin version 0\n", + "n192-024-092:113099:271767 [1] NCCL INFO Using network IB\n", + "n192-024-092:113099:271766 [0] NCCL INFO comm 0xb205b610 rank 0 nranks 2 cudaDev 0 nvmlDev 0 busId 1b000 commId 0xa0b4958593d819c3 - Init START\n", + "n192-024-092:113099:271767 [1] NCCL INFO comm 0xb205e980 rank 1 nranks 2 cudaDev 1 nvmlDev 1 busId 1c000 commId 0xa0b4958593d819c3 - Init START\n", + "n192-024-092:113099:271767 [1] NCCL INFO Setting affinity for GPU 1 to ff,ffff0000,00ffffff\n", + "n192-024-092:113099:271766 [0] NCCL INFO Setting affinity for GPU 0 to ff,ffff0000,00ffffff\n", + "n192-024-092:113099:271766 [0] NCCL INFO comm 0xb205b610 rank 0 nRanks 2 nNodes 1 localRanks 2 localRank 0 MNNVL 0\n", + "n192-024-092:113099:271766 [0] NCCL INFO Channel 00/02 : 0 1\n", + "n192-024-092:113099:271767 [1] NCCL INFO comm 0xb205e980 rank 1 nRanks 2 nNodes 1 localRanks 2 localRank 1 MNNVL 0\n", + "n192-024-092:113099:271766 [0] NCCL INFO Channel 01/02 : 0 1\n", + "n192-024-092:113099:271766 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1\n", + "n192-024-092:113099:271766 [0] NCCL INFO P2P Chunksize set to 524288\n", + "n192-024-092:113099:271767 [1] NCCL INFO Trees [0] -1/-1/-1->1->0 [1] -1/-1/-1->1->0\n", + "n192-024-092:113099:271767 [1] NCCL INFO P2P Chunksize set to 524288\n", + "n192-024-092:113099:271766 [0] NCCL INFO Channel 00/0 : 0[0] -> 1[1] 
via P2P/direct pointer\n", + "n192-024-092:113099:271767 [1] NCCL INFO Channel 00/0 : 1[1] -> 0[0] via P2P/direct pointer\n", + "n192-024-092:113099:271766 [0] NCCL INFO Channel 01/0 : 0[0] -> 1[1] via P2P/direct pointer\n", + "n192-024-092:113099:271767 [1] NCCL INFO Channel 01/0 : 1[1] -> 0[0] via P2P/direct pointer\n", + "n192-024-092:113099:271766 [0] NCCL INFO Connected all rings\n", + "n192-024-092:113099:271766 [0] NCCL INFO Connected all trees\n", + "n192-024-092:113099:271767 [1] NCCL INFO Connected all rings\n", + "n192-024-092:113099:271767 [1] NCCL INFO Connected all trees\n", + "n192-024-092:113099:271767 [1] NCCL INFO threadThresholds 8/8/64 | 16/8/64 | 512 | 512\n", + "n192-024-092:113099:271767 [1] NCCL INFO 2 coll channels, 0 collnet channels, 0 nvls channels, 2 p2p channels, 2 p2p channels per peer\n", + "n192-024-092:113099:271766 [0] NCCL INFO threadThresholds 8/8/64 | 16/8/64 | 512 | 512\n", + "n192-024-092:113099:271766 [0] NCCL INFO 2 coll channels, 0 collnet channels, 0 nvls channels, 2 p2p channels, 2 p2p channels per peer\n", + "n192-024-092:113099:271767 [1] NCCL INFO comm 0xb205e980 rank 1 nranks 2 cudaDev 1 nvmlDev 1 busId 1c000 commId 0xa0b4958593d819c3 - Init COMPLETE\n", + "n192-024-092:113099:271766 [0] NCCL INFO comm 0xb205b610 rank 0 nranks 2 cudaDev 0 nvmlDev 0 busId 1b000 commId 0xa0b4958593d819c3 - Init COMPLETE\n", + "[[0.855 0.852 ]\n", + " [0.874 0.8555]]\n", + "n192-024-092:113099:271788 [0] NCCL INFO Using non-device net plugin version 0\n", + "n192-024-092:113099:271788 [0] NCCL INFO Using network IB\n", + "n192-024-092:113099:271788 [0] NCCL INFO comm 0xb80a5d70 rank 0 nranks 1 cudaDev 0 nvmlDev 0 busId 1b000 commId 0xdd4574414117f6ad - Init START\n", + "n192-024-092:113099:271788 [0] NCCL INFO Setting affinity for GPU 0 to ff,ffff0000,00ffffff\n", + "n192-024-092:113099:271788 [0] NCCL INFO comm 0xb80a5d70 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0\n", + "n192-024-092:113099:271788 [0] NCCL INFO Channel 00/32 : 0\n", + "n192-024-092:113099:271788 [0] NCCL INFO Channel 01/32 : 0\n", + "n192-024-092:113099:271788 [0] NCCL INFO Channel 02/32 : 0\n", + "n192-024-092:113099:271788 [0] NCCL INFO Channel 03/32 : 0\n", + "n192-024-092:113099:271788 [0] NCCL INFO Channel 04/32 : 0\n", + "n192-024-092:113099:271788 [0] NCCL INFO Channel 05/32 : 0\n", + "n192-024-092:113099:271788 [0] NCCL INFO Channel 06/32 : 0\n", + "n192-024-092:113099:271788 [0] NCCL INFO Channel 07/32 : 0\n", + "n192-024-092:113099:271788 [0] NCCL INFO Channel 08/32 : 0\n", + "n192-024-092:113099:271788 [0] NCCL INFO Channel 09/32 : 0\n", + "n192-024-092:113099:271788 [0] NCCL INFO Channel 10/32 : 0\n", + "n192-024-092:113099:271788 [0] NCCL INFO Channel 11/32 : 0\n", + "n192-024-092:113099:271788 [0] NCCL INFO Channel 12/32 : 0\n", + "n192-024-092:113099:271788 [0] NCCL INFO Channel 13/32 : 0\n", + "n192-024-092:113099:271788 [0] NCCL INFO Channel 14/32 : 0\n", + "n192-024-092:113099:271788 [0] NCCL INFO Channel 15/32 : 0\n", + "n192-024-092:113099:271788 [0] NCCL INFO Channel 16/32 : 0\n", + "n192-024-092:113099:271788 [0] NCCL INFO Channel 17/32 : 0\n", + "n192-024-092:113099:271788 [0] NCCL INFO Channel 18/32 : 0\n", + "n192-024-092:113099:271788 [0] NCCL INFO Channel 19/32 : 0\n", + "n192-024-092:113099:271788 [0] NCCL INFO Channel 20/32 : 0\n", + "n192-024-092:113099:271788 [0] NCCL INFO Channel 21/32 : 0\n", + "n192-024-092:113099:271788 [0] NCCL INFO Channel 22/32 : 0\n", + "n192-024-092:113099:271788 [0] NCCL INFO Channel 23/32 : 0\n", + 
"n192-024-092:113099:271788 [0] NCCL INFO Channel 24/32 : 0\n", + "n192-024-092:113099:271788 [0] NCCL INFO Channel 25/32 : 0\n", + "n192-024-092:113099:271788 [0] NCCL INFO Channel 26/32 : 0\n", + "n192-024-092:113099:271788 [0] NCCL INFO Channel 27/32 : 0\n", + "n192-024-092:113099:271788 [0] NCCL INFO Channel 28/32 : 0\n", + "n192-024-092:113099:271788 [0] NCCL INFO Channel 29/32 : 0\n", + "n192-024-092:113099:271788 [0] NCCL INFO Channel 30/32 : 0\n", + "n192-024-092:113099:271788 [0] NCCL INFO Channel 31/32 : 0\n", + "n192-024-092:113099:271788 [0] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1\n", + "n192-024-092:113099:271788 [0] NCCL INFO P2P Chunksize set to 131072\n", + "n192-024-092:113099:271788 [0] NCCL INFO Connected all rings\n", + "n192-024-092:113099:271788 [0] NCCL INFO Connected all trees\n", + "n192-024-092:113099:271788 [0] NCCL INFO 32 coll channels, 0 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer\n", + "n192-024-092:113099:271788 [0] NCCL INFO comm 0xb80a5d70 rank 0 nranks 1 cudaDev 0 nvmlDev 0 busId 1b000 commId 0xdd4574414117f6ad - Init COMPLETE\n", + "[[0.652 0.4424]]\n" + ] + } + ], + "source": [ + "from FlagEmbedding import FlagModel\n", + "sentences_1 = [\"样例数据-1\", \"样例数据-2\"]\n", + "sentences_2 = [\"样例数据-3\", \"样例数据-4\"]\n", + "model = FlagModel('BAAI/bge-large-zh-v1.5', \n", + " query_instruction_for_retrieval=\"为这个句子生成表示以用于检索相关文章:\",\n", + " use_fp16=True) # Setting use_fp16 to True speeds up computation with a slight performance degradation\n", + "embeddings_1 = model.encode(sentences_1)\n", + "embeddings_2 = model.encode(sentences_2)\n", + "similarity = embeddings_1 @ embeddings_2.T\n", + "print(similarity) \n", + "\n", + "# for s2p(short query to long passage) retrieval task, suggest to use encode_queries() which will automatically add the instruction to each query\n", + "# corpus in retrieval task can still use encode() or encode_corpus(), since they don't need instruction\n", + "queries = ['When was quantum field theory developed?']\n", + "passages = [\"Quantum field theory naturally began with the study of electromagnetic interactions, as the electromagnetic field was the only known classical field as of the 1920s.[8]:1\", \"Cumrun Vafa is a string theorist. His research is focused on the nature of quantum gravity and the relation between geometry and quantum field theories. He is known in the string theory community for his co-discovery, with Strominger, that the Bekenstein-Hawking entropy of a black hole can be accounted for by solitonic states of superstring theory, and for expounding the relation between geometry and field theories that arise through string dualities (culminating in the Gopakumar\\u2013Vafa conjecture). This topic has been known as \\\"geometric engineering of quantum field theories\\\". 
In 1997, he developed F-theory.\"]\n", + "q_embeddings = model.encode_queries(queries)\n", + "p_embeddings = model.encode(passages)\n", + "scores = q_embeddings @ p_embeddings.T\n", + "print(scores)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/FlagEmbedding/reranker/modeling.py b/FlagEmbedding/reranker/modeling.py index 3815ae88..9e32e9b6 100644 --- a/FlagEmbedding/reranker/modeling.py +++ b/FlagEmbedding/reranker/modeling.py @@ -70,13 +70,14 @@ def save_pretrained(self, output_dir: str): class CLEncoder(CrossEncoder): def __init__(self, hf_model: PreTrainedModel, model_args: ModelArguments, data_args: DataArguments, - train_args: TrainingArguments): + train_args: TrainingArguments, pooling_method = 'cls'): super(CrossEncoder, self).__init__() self.hf_model = hf_model self.model_args = model_args self.train_args = train_args self.data_args = data_args self.config = self.hf_model.config + self.pooling_method = pooling_method @classmethod def from_pretrained( @@ -91,16 +92,22 @@ def from_pretrained( reranker = cls(hf_model, model_args, data_args, train_args) return reranker + + def pooling(self, + last_hidden_state: torch.Tensor, + attention_mask: torch.Tensor = None): + if self.pooling_method == 'cls': + return last_hidden_state[:, 0] + elif self.pooling_method == 'mean': + s = torch.sum(last_hidden_state * attention_mask.unsqueeze(-1).float(), dim=1) + d = attention_mask.sum(dim=1, keepdim=True).float() + return s / d def get_embedding(self, input_ids, attention_mask): hidden_state = self.hf_model(input_ids=input_ids, attention_mask=attention_mask, output_hidden_states=True).hidden_states[-1].cpu() attention_mask = attention_mask.cpu() - seq_lengths = attention_mask.sum(dim=1) - embeddings = [] - for seq_len, seq_emb in zip(seq_lengths, hidden_state): - valid_emb = seq_emb[:seq_len] - embeddings.append(torch.mean(valid_emb, dim=0)) - embeddings = torch.stack(embeddings) + embeddings = self.pooling(hidden_state, attention_mask) + embeddings = torch.nn.functional.normalize(embeddings, dim=-1) return embeddings def infoNCELoss(self, anchor, positive, negatives, temperature=1): From d573b4c54d027436bdd940c1548fda00a00af8c9 Mon Sep 17 00:00:00 2001 From: "leon.kepler" Date: Fri, 10 May 2024 15:57:29 +0800 Subject: [PATCH 22/33] new --- .gitignore | 3 ++- FlagEmbedding/baai_general_embedding/finetune/data.py | 2 +- FlagEmbedding/baai_general_embedding/finetune/run.py | 8 ++++---- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/.gitignore b/.gitignore index 17ea18e0..2d0d03c5 100644 --- a/.gitignore +++ b/.gitignore @@ -137,4 +137,5 @@ pic2.py # Pyre type checker .pyre/ wandb/ -*.txt \ No newline at end of file +*.txt +result/ \ No newline at end of file diff --git a/FlagEmbedding/baai_general_embedding/finetune/data.py b/FlagEmbedding/baai_general_embedding/finetune/data.py index 387c1482..57381e7f 100644 --- a/FlagEmbedding/baai_general_embedding/finetune/data.py +++ b/FlagEmbedding/baai_general_embedding/finetune/data.py @@ -8,7 +8,7 @@ from torch.utils.data import Dataset from transformers import DataCollatorWithPadding, PreTrainedTokenizer -from .arguments import DataArguments +from arguments import DataArguments class TrainDatasetForEmbedding(Dataset): diff --git a/FlagEmbedding/baai_general_embedding/finetune/run.py b/FlagEmbedding/baai_general_embedding/finetune/run.py index ff48281d..792444f3 100644 --- a/FlagEmbedding/baai_general_embedding/finetune/run.py +++ 
b/FlagEmbedding/baai_general_embedding/finetune/run.py @@ -8,11 +8,11 @@ set_seed, ) -from .arguments import ModelArguments, DataArguments, \ +from arguments import ModelArguments, DataArguments, \ RetrieverTrainingArguments as TrainingArguments -from .data import TrainDatasetForEmbedding, EmbedCollator -from .modeling import BiEncoderModel -from .trainer import BiTrainer +from data import TrainDatasetForEmbedding, EmbedCollator +from modeling import BiEncoderModel +from trainer import BiTrainer logger = logging.getLogger(__name__) From 561916a491c5bd2e4a7515ea71021f1007181f51 Mon Sep 17 00:00:00 2001 From: "leon.kepler" Date: Fri, 10 May 2024 16:11:17 +0800 Subject: [PATCH 23/33] new --- reranker_pretrain.py => pretrain.py | 4 ++-- tmp.py | 5 +++++ 2 files changed, 7 insertions(+), 2 deletions(-) rename reranker_pretrain.py => pretrain.py (90%) create mode 100644 tmp.py diff --git a/reranker_pretrain.py b/pretrain.py similarity index 90% rename from reranker_pretrain.py rename to pretrain.py index f87791b6..4cfcb8a6 100755 --- a/reranker_pretrain.py +++ b/pretrain.py @@ -3,7 +3,7 @@ import os import sys import torch -args = (" ").join(sys.argv[1:]) +args = (" ").join(sys.argv) # 使用示例 num_gpus = torch.cuda.device_count() @@ -20,7 +20,7 @@ # 构建训练命令 command = f""" -torchrun --standalone --nnodes=1 --nproc_per_node {num_gpus} /opt/tiger/FlagEmbedding/FlagEmbedding/reranker/run.py {args} +torchrun --rdzv_backend c10d --rdzv_endpoint localhost:0 --nproc_per_node {num_gpus} /opt/tiger/FlagEmbedding/FlagEmbedding/reranker/run.py {args} """ # 执行命令 diff --git a/tmp.py b/tmp.py new file mode 100644 index 00000000..16d07c33 --- /dev/null +++ b/tmp.py @@ -0,0 +1,5 @@ +import os +import sys +import torch +args = (" ").join(sys.argv) +print(args) \ No newline at end of file From 1e951dcf375780e7d7144104050f2a7232fc7f45 Mon Sep 17 00:00:00 2001 From: "leon.kepler" Date: Fri, 10 May 2024 16:26:13 +0800 Subject: [PATCH 24/33] new --- pretrain.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/pretrain.py b/pretrain.py index 4cfcb8a6..1752f7e9 100755 --- a/pretrain.py +++ b/pretrain.py @@ -3,13 +3,11 @@ import os import sys import torch +os.environ["WANDB_DISABLED"]="true" args = (" ").join(sys.argv) - # 使用示例 num_gpus = torch.cuda.device_count() - os.system("cd /opt/tiger/FlagEmbedding") - if not os.path.exists("/opt/tiger/train_15neg"): os.system("cp -r /mnt/bn/data-tns-live-llm/leon/experiments/llm/fcbank/train_15neg /opt/tiger/train_15neg") #——————————————————————————————————————————————————debug——————————————————————————————————————————————————————————# @@ -19,9 +17,6 @@ #——————————————————————————————————————————————————debug——————————————————————————————————————————————————————————# # 构建训练命令 -command = f""" -torchrun --rdzv_backend c10d --rdzv_endpoint localhost:0 --nproc_per_node {num_gpus} /opt/tiger/FlagEmbedding/FlagEmbedding/reranker/run.py {args} -""" - +command = f"""torchrun --rdzv_backend c10d --rdzv_endpoint localhost:0 --nproc_per_node {num_gpus} {args}""" # 执行命令 os.system(command) \ No newline at end of file From 5006ef484eae48d07baf1afe59970900dcad5389 Mon Sep 17 00:00:00 2001 From: "leon.kepler" Date: Fri, 10 May 2024 18:00:03 +0800 Subject: [PATCH 25/33] new --- FlagEmbedding/baai_general_embedding/finetune/run.py | 1 + 1 file changed, 1 insertion(+) diff --git a/FlagEmbedding/baai_general_embedding/finetune/run.py b/FlagEmbedding/baai_general_embedding/finetune/run.py index 792444f3..4aa3b37f 100644 --- a/FlagEmbedding/baai_general_embedding/finetune/run.py 
+++ b/FlagEmbedding/baai_general_embedding/finetune/run.py @@ -1,6 +1,7 @@ import logging import os from pathlib import Path +os.environ["WANDB_DISABLED"]="true" from transformers import AutoConfig, AutoTokenizer from transformers import ( From 8a9f2a8d53a84fbec9ba770ae3910497aecace29 Mon Sep 17 00:00:00 2001 From: "leon.kepler" Date: Fri, 10 May 2024 18:08:55 +0800 Subject: [PATCH 26/33] new --- FlagEmbedding/baai_general_embedding/finetune/run.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/FlagEmbedding/baai_general_embedding/finetune/run.py b/FlagEmbedding/baai_general_embedding/finetune/run.py index 4aa3b37f..b499329b 100644 --- a/FlagEmbedding/baai_general_embedding/finetune/run.py +++ b/FlagEmbedding/baai_general_embedding/finetune/run.py @@ -100,7 +100,7 @@ def main(): Path(training_args.output_dir).mkdir(parents=True, exist_ok=True) # Training - trainer.train() + trainer.train(resume_from_checkpoint=True) trainer.save_model() # For convenience, we also re-save the tokenizer to the same directory, # so that you can share your model easily on huggingface.co/models =) From 5d3d7c9313f94a4024851bf74b9096e1bc9befeb Mon Sep 17 00:00:00 2001 From: "leon.kepler" Date: Fri, 10 May 2024 19:07:49 +0800 Subject: [PATCH 27/33] new --- FlagEmbedding/baai_general_embedding/finetune/run.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/FlagEmbedding/baai_general_embedding/finetune/run.py b/FlagEmbedding/baai_general_embedding/finetune/run.py index b499329b..70376e6d 100644 --- a/FlagEmbedding/baai_general_embedding/finetune/run.py +++ b/FlagEmbedding/baai_general_embedding/finetune/run.py @@ -100,7 +100,10 @@ def main(): Path(training_args.output_dir).mkdir(parents=True, exist_ok=True) # Training - trainer.train(resume_from_checkpoint=True) + try: + trainer.train(resume_from_checkpoint=True) + except: + trainer.train() trainer.save_model() # For convenience, we also re-save the tokenizer to the same directory, # so that you can share your model easily on huggingface.co/models =) From 2de4f5d0dcb8e9f73ee9ed5504c65d70a25da03a Mon Sep 17 00:00:00 2001 From: "leon.kepler" Date: Fri, 10 May 2024 19:29:43 +0800 Subject: [PATCH 28/33] new --- FlagEmbedding/reranker/modeling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/FlagEmbedding/reranker/modeling.py b/FlagEmbedding/reranker/modeling.py index 9e32e9b6..aa85cc00 100644 --- a/FlagEmbedding/reranker/modeling.py +++ b/FlagEmbedding/reranker/modeling.py @@ -150,7 +150,7 @@ def batchloss(self, embeddings): return batch_loss def forward(self, batch): - embeddings = self.get_embedding(**batch) + embeddings = self.get_embedding(batch["input_ids"], batch["attention_mask"]) embeddings = embeddings.reshape(self.train_args.per_device_train_batch_size, self.data_args.train_group_size+1, -1) # print("embeddings", embeddings.shape) loss = self.batchloss(embeddings).cuda() From 4128093d21420897a33d3690b5ebf6868bc798dc Mon Sep 17 00:00:00 2001 From: "leon.kepler" Date: Fri, 10 May 2024 22:15:43 +0800 Subject: [PATCH 29/33] new --- FlagEmbedding/reranker/modeling.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/FlagEmbedding/reranker/modeling.py b/FlagEmbedding/reranker/modeling.py index aa85cc00..c4224fc1 100644 --- a/FlagEmbedding/reranker/modeling.py +++ b/FlagEmbedding/reranker/modeling.py @@ -115,10 +115,6 @@ def infoNCELoss(self, anchor, positive, negatives, temperature=1): pos_similarity = F.cosine_similarity(anchor, positive, dim=-1) # 将anchor重复到与负样本相同数量的维度,以便计算 neg_similarity = 
F.cosine_similarity(anchor, negatives, dim=-1) - import json - with open("/opt/tiger/FlagEmbedding/FlagEmbedding/reranker/sim.txt", 'a') as f: - f.write("pos: "+json.dumps(pos_similarity.item())+"\n") - f.write("neg: "+json.dumps(neg_similarity.tolist())+"\n") # 合并正样本和负样本的相似度 # print(pos_similarity.shape) # print(neg_similarity.shape) From 8adea8ee4a6cea8407d5f425408fbf0f932e35b7 Mon Sep 17 00:00:00 2001 From: "leon.kepler" Date: Sat, 11 May 2024 15:05:39 +0800 Subject: [PATCH 30/33] new --- FlagEmbedding/baai_general_embedding/finetune/data.py | 5 ++--- FlagEmbedding/reranker/data.py | 5 ++--- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/FlagEmbedding/baai_general_embedding/finetune/data.py b/FlagEmbedding/baai_general_embedding/finetune/data.py index 57381e7f..4c0fc1ea 100644 --- a/FlagEmbedding/baai_general_embedding/finetune/data.py +++ b/FlagEmbedding/baai_general_embedding/finetune/data.py @@ -20,15 +20,14 @@ def __init__( if os.path.isdir(args.train_data): train_datasets = [] for file in os.listdir(args.train_data): - temp_dataset = datasets.load_dataset('json', data_files=os.path.join(args.train_data, file), - split='train') + temp_dataset = datasets.load_dataset('json', data_files=os.path.join(args.train_data, file)) if len(temp_dataset) > args.max_example_num_per_dataset: temp_dataset = temp_dataset.select( random.sample(list(range(len(temp_dataset))), args.max_example_num_per_dataset)) train_datasets.append(temp_dataset) self.dataset = datasets.concatenate_datasets(train_datasets) else: - self.dataset = datasets.load_dataset('json', data_files=args.train_data, split='train') + self.dataset = datasets.load_dataset('json') self.tokenizer = tokenizer self.args = args diff --git a/FlagEmbedding/reranker/data.py b/FlagEmbedding/reranker/data.py index daf52b44..af9b3c65 100644 --- a/FlagEmbedding/reranker/data.py +++ b/FlagEmbedding/reranker/data.py @@ -22,12 +22,11 @@ def __init__( if os.path.isdir(args.train_data): train_datasets = [] for file in os.listdir(args.train_data): - temp_dataset = datasets.load_dataset('json', data_files=os.path.join(args.train_data, file), - split='train') + temp_dataset = datasets.load_dataset('json', data_files=os.path.join(args.train_data, file)) train_datasets.append(temp_dataset) self.dataset = datasets.concatenate_datasets(train_datasets) else: - self.dataset = datasets.load_dataset('json', data_files=args.train_data, split='train') + self.dataset = datasets.load_dataset('json', data_files=args.train_data) self.tokenizer = tokenizer self.args = args From daf625b9e31b63484bfcd46c9d1c432122802f50 Mon Sep 17 00:00:00 2001 From: "leon.kepler" Date: Sat, 11 May 2024 15:21:16 +0800 Subject: [PATCH 31/33] new --- FlagEmbedding/baai_general_embedding/finetune/data.py | 5 +++-- FlagEmbedding/reranker/data.py | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/FlagEmbedding/baai_general_embedding/finetune/data.py b/FlagEmbedding/baai_general_embedding/finetune/data.py index 4c0fc1ea..57381e7f 100644 --- a/FlagEmbedding/baai_general_embedding/finetune/data.py +++ b/FlagEmbedding/baai_general_embedding/finetune/data.py @@ -20,14 +20,15 @@ def __init__( if os.path.isdir(args.train_data): train_datasets = [] for file in os.listdir(args.train_data): - temp_dataset = datasets.load_dataset('json', data_files=os.path.join(args.train_data, file)) + temp_dataset = datasets.load_dataset('json', data_files=os.path.join(args.train_data, file), + split='train') if len(temp_dataset) > args.max_example_num_per_dataset: temp_dataset = 
temp_dataset.select( random.sample(list(range(len(temp_dataset))), args.max_example_num_per_dataset)) train_datasets.append(temp_dataset) self.dataset = datasets.concatenate_datasets(train_datasets) else: - self.dataset = datasets.load_dataset('json') + self.dataset = datasets.load_dataset('json', data_files=args.train_data, split='train') self.tokenizer = tokenizer self.args = args diff --git a/FlagEmbedding/reranker/data.py b/FlagEmbedding/reranker/data.py index af9b3c65..daf52b44 100644 --- a/FlagEmbedding/reranker/data.py +++ b/FlagEmbedding/reranker/data.py @@ -22,11 +22,12 @@ def __init__( if os.path.isdir(args.train_data): train_datasets = [] for file in os.listdir(args.train_data): - temp_dataset = datasets.load_dataset('json', data_files=os.path.join(args.train_data, file)) + temp_dataset = datasets.load_dataset('json', data_files=os.path.join(args.train_data, file), + split='train') train_datasets.append(temp_dataset) self.dataset = datasets.concatenate_datasets(train_datasets) else: - self.dataset = datasets.load_dataset('json', data_files=args.train_data) + self.dataset = datasets.load_dataset('json', data_files=args.train_data, split='train') self.tokenizer = tokenizer self.args = args From f3358940dda59abf4377506d89992d22a3161787 Mon Sep 17 00:00:00 2001 From: "leon.kepler" Date: Mon, 13 May 2024 13:43:04 +0800 Subject: [PATCH 32/33] new --- .../finetune/modeling.py | 10 +- debug_args.ipynb | 201 ++++++++++++++++++ 2 files changed, 207 insertions(+), 4 deletions(-) diff --git a/FlagEmbedding/baai_general_embedding/finetune/modeling.py b/FlagEmbedding/baai_general_embedding/finetune/modeling.py index c5d2935f..70600e04 100644 --- a/FlagEmbedding/baai_general_embedding/finetune/modeling.py +++ b/FlagEmbedding/baai_general_embedding/finetune/modeling.py @@ -60,19 +60,21 @@ def __init__(self, def gradient_checkpointing_enable(self, **kwargs): self.model.gradient_checkpointing_enable(**kwargs) - def sentence_embedding(self, hidden_state, mask): + def sentence_embedding(self, output, mask): if self.sentence_pooling_method == 'mean': - s = torch.sum(hidden_state * mask.unsqueeze(-1).float(), dim=1) + s = torch.sum(output.last_hidden_state * mask.unsqueeze(-1).float(), dim=1) d = mask.sum(axis=1, keepdim=True).float() return s / d elif self.sentence_pooling_method == 'cls': - return hidden_state[:, 0] + return output.last_hidden_state[:, 0] + elif self.sentence_pooling_method == 'cls_after_pooler': + return output.pooler_output def encode(self, features): if features is None: return None psg_out = self.model(**features, return_dict=True) - p_reps = self.sentence_embedding(psg_out.last_hidden_state, features['attention_mask']) + p_reps = self.sentence_embedding(psg_out, features['attention_mask']) if self.normlized: p_reps = torch.nn.functional.normalize(p_reps, dim=-1) return p_reps.contiguous() diff --git a/debug_args.ipynb b/debug_args.ipynb index 8653d06c..32465205 100644 --- a/debug_args.ipynb +++ b/debug_args.ipynb @@ -19,6 +19,207 @@ "print(args)" ] }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "import transformers\n", + "from transformers import AutoTokenizer, AutoConfig, AutoModel\n", + "from transformers import TextGenerationPipeline, AutoModelForCausalLM, LlamaTokenizerFast\n", + "import os" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "###config: RobertaConfig {\n", + " 
\"_name_or_path\": \"/mnt/bn/data-tns-live-llm/leon/experiments/llm/fcbank/encoder_simcse_group15_batch1_a100_bge_manner/checkpoint-8000\",\n", + " \"architectures\": [\n", + " \"RobertaModel\"\n", + " ],\n", + " \"attention_probs_dropout_prob\": 0.1,\n", + " \"bos_token_id\": 0,\n", + " \"classifier_dropout\": null,\n", + " \"eos_token_id\": 2,\n", + " \"gradient_checkpointing\": false,\n", + " \"hidden_act\": \"gelu\",\n", + " \"hidden_dropout_prob\": 0.1,\n", + " \"hidden_size\": 1024,\n", + " \"initializer_range\": 0.02,\n", + " \"intermediate_size\": 4096,\n", + " \"layer_norm_eps\": 1e-05,\n", + " \"max_position_embeddings\": 514,\n", + " \"model_type\": \"roberta\",\n", + " \"num_attention_heads\": 16,\n", + " \"num_hidden_layers\": 24,\n", + " \"output_past\": true,\n", + " \"pad_token_id\": 1,\n", + " \"position_embedding_type\": \"absolute\",\n", + " \"torch_dtype\": \"float32\",\n", + " \"transformers_version\": \"4.40.2\",\n", + " \"type_vocab_size\": 1,\n", + " \"use_cache\": true,\n", + " \"use_pooler_layer\": true,\n", + " \"vocab_size\": 250002\n", + "}\n", + "\n" + ] + } + ], + "source": [ + "config = transformers.AutoConfig.from_pretrained(\"/mnt/bn/data-tns-live-llm/leon/experiments/llm/fcbank/encoder_simcse_group15_batch1_a100_bge_manner/checkpoint-8000\")\n", + "config.use_pooler_layer = True\n", + "# config.loss_type = model_args.loss_type\n", + "# config.num_labels = model_args.num_labels\n", + "# config.margin = model_args.margin\n", + "# logging(f'Loss Function: {config.loss_type}')\n", + "print('###config: ', config)\n", + "# model = AutoModelForCausalLM.from_pretrained(\"/mnt/bn/data-tns-live-llm/leon/experiments/llm/fcbank/encoder_simcse_group15_batch1_a100_bge_manner/checkpoint-8000\", device_map=\"auto\")\n", + "# tokenizer = AutoTokenizer.from_pretrained(\"/mnt/bn/data-tns-live-llm/leon/experiments/llm/fcbank/encoder_simcse_group15_batch1_a100_bge_manner/checkpoint-8000\")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "model = AutoModel.from_pretrained(\n", + " \"/mnt/bn/data-tns-live-llm/leon/experiments/llm/fcbank/encoder_simcse_group15_batch1_a100_bge_manner/checkpoint-8000\",\n", + " config=config\n", + ")\n", + "tokenizer = AutoTokenizer.from_pretrained(\"/mnt/bn/data-tns-live-llm/leon/experiments/llm/fcbank/encoder_simcse_group15_batch1_a100_bge_manner/checkpoint-8000\")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "RobertaModel(\n", + " (embeddings): RobertaEmbeddings(\n", + " (word_embeddings): Embedding(250002, 1024, padding_idx=1)\n", + " (position_embeddings): Embedding(514, 1024, padding_idx=1)\n", + " (token_type_embeddings): Embedding(1, 1024)\n", + " (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (encoder): RobertaEncoder(\n", + " (layer): ModuleList(\n", + " (0-23): 24 x RobertaLayer(\n", + " (attention): RobertaAttention(\n", + " (self): RobertaSelfAttention(\n", + " (query): Linear(in_features=1024, out_features=1024, bias=True)\n", + " (key): Linear(in_features=1024, out_features=1024, bias=True)\n", + " (value): Linear(in_features=1024, out_features=1024, bias=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (output): RobertaSelfOutput(\n", + " (dense): Linear(in_features=1024, out_features=1024, bias=True)\n", + " (LayerNorm): LayerNorm((1024,), 
+      "            (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n",
+      "            (dropout): Dropout(p=0.1, inplace=False)\n",
+      "          )\n",
+      "        )\n",
+      "        (intermediate): RobertaIntermediate(\n",
+      "          (dense): Linear(in_features=1024, out_features=4096, bias=True)\n",
+      "          (intermediate_act_fn): GELUActivation()\n",
+      "        )\n",
+      "        (output): RobertaOutput(\n",
+      "          (dense): Linear(in_features=4096, out_features=1024, bias=True)\n",
+      "          (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n",
+      "          (dropout): Dropout(p=0.1, inplace=False)\n",
+      "        )\n",
+      "      )\n",
+      "    )\n",
+      "  )\n",
+      "  (pooler): RobertaPooler(\n",
+      "    (dense): Linear(in_features=1024, out_features=1024, bias=True)\n",
+      "    (activation): Tanh()\n",
+      "  )\n",
+      ")\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(model)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{'input_ids': tensor([[ 0, 33600, 31, 8999, 2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1]])}\n"
+     ]
+    }
+   ],
+   "source": [
+    "query = tokenizer(\"hello world\", return_tensors=\"pt\")\n",
+    "print(query)\n",
+    "output = model(**query)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "odict_keys(['last_hidden_state', 'pooler_output'])\n",
+      "tensor([[-0.4685, -0.1635, -0.0385,  ..., -0.1976,  0.1712,  0.0859]],\n",
+      "       grad_fn=)\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(output.keys())\n",
+    "print(output.pooler_output)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "odict_keys(['last_hidden_state', 'pooler_output'])\n",
+      "tensor([[-0.4685, -0.1635, -0.0385,  ..., -0.1976,  0.1712,  0.0859]],\n",
+      "       grad_fn=)\n"
+     ]
+    }
+   ],
+   "source": [
+    "model1 = AutoModel.from_pretrained(\n",
+    "    \"/mnt/bn/data-tns-live-llm/leon/experiments/llm/fcbank/encoder_simcse_group15_batch1_a100_bge_manner/checkpoint-8000\"\n",
+    ")\n",
+    "output1 = model1(**query)\n",
+    "print(output1.keys())\n",
+    "print(output.pooler_output)"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,

From 3af708eb6a227b7d3bfaf758878e4fe5c1f733ec Mon Sep 17 00:00:00 2001
From: "leon.kepler"
Date: Mon, 13 May 2024 16:04:59 +0800
Subject: [PATCH 33/33] new

---
 .../baai_general_embedding/finetune/run.py | 61 +++++++++++++------
 1 file changed, 44 insertions(+), 17 deletions(-)

diff --git a/FlagEmbedding/baai_general_embedding/finetune/run.py b/FlagEmbedding/baai_general_embedding/finetune/run.py
index 70376e6d..311e0239 100644
--- a/FlagEmbedding/baai_general_embedding/finetune/run.py
+++ b/FlagEmbedding/baai_general_embedding/finetune/run.py
@@ -14,10 +14,21 @@
 from data import TrainDatasetForEmbedding, EmbedCollator
 from modeling import BiEncoderModel
 from trainer import BiTrainer
+import sys
+import transformers
+sys.path.append("/opt/tiger/FlagEmbedding")
+from utils import get_complete_last_checkpoint
 
 logger = logging.getLogger(__name__)
 
-
+def safe_save_model_for_hf_trainer(trainer: transformers.Trainer, output_dir: str):
+    """Collects the state dict and dumps it to disk."""
+    state_dict = trainer.model.state_dict()
+    if trainer.args.should_save:
+        cpu_state_dict = {key: value.cpu() for key, value in state_dict.items()}
+        del state_dict
+        trainer._save(output_dir, state_dict=cpu_state_dict)
+
 def main():
     parser = HfArgumentParser((ModelArguments, DataArguments, TrainingArguments))
     model_args, data_args, training_args = parser.parse_args_into_dataclasses()
@@ -25,15 +36,20 @@ def main():
     data_args: DataArguments
     training_args: TrainingArguments
 
-    if (
-            os.path.exists(training_args.output_dir)
-            and os.listdir(training_args.output_dir)
-            and training_args.do_train
-            and not training_args.overwrite_output_dir
-    ):
-        raise ValueError(
-            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
-        )
+    # check and load checkpoint
+    last_checkpoint = None
+    if os.path.isdir(training_args.output_dir) and not training_args.overwrite_output_dir:
+        last_checkpoint = get_complete_last_checkpoint(training_args.output_dir)
+        if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
+            logger.info(
+                f"Output directory ({training_args.output_dir}) already exists and is not empty, "
+                "but no complete checkpoint was found. Training from scratch."
+            )
+        elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
+            logger.info(
+                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
+                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
+            )
 
     # Setup logging
     logging.basicConfig(
@@ -76,6 +92,14 @@ def main():
         temperature=training_args.temperature,
         use_inbatch_neg=training_args.use_inbatch_neg,
     )
+
+    checkpoint = None
+    if training_args.resume_from_checkpoint is not None:
+        logger.info(f"train start from {training_args.resume_from_checkpoint}")
+        checkpoint = training_args.resume_from_checkpoint
+    elif last_checkpoint is not None:
+        logger.info(f"train start from {last_checkpoint}")
+        checkpoint = last_checkpoint
 
     if training_args.fix_position_embedding:
         for k, v in model.named_parameters():
@@ -100,15 +124,18 @@ def main():
     Path(training_args.output_dir).mkdir(parents=True, exist_ok=True)
 
     # Training
-    try:
-        trainer.train(resume_from_checkpoint=True)
-    except:
-        trainer.train()
-    trainer.save_model()
+    trainer.train(resume_from_checkpoint=checkpoint)
+    trainer.save_state()
+    safe_save_model_for_hf_trainer(trainer=trainer, output_dir=training_args.output_dir)
+    # try:
+    #     trainer.train(resume_from_checkpoint=checkpoint)
+    # except:
+    #     trainer.train()
+    # trainer.save_model()
     # For convenience, we also re-save the tokenizer to the same directory,
    # so that you can share your model easily on huggingface.co/models =)
-    if trainer.is_world_process_zero():
-        tokenizer.save_pretrained(training_args.output_dir)
+    # if trainer.is_world_process_zero():
+    #     tokenizer.save_pretrained(training_args.output_dir)
 
 
 if __name__ == "__main__":
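
Note: the run.py patch above imports get_complete_last_checkpoint from a local utils module (added to sys.path under /opt/tiger/FlagEmbedding) whose implementation is not included in this series. A minimal sketch of what such a helper might look like, assuming it behaves like transformers' get_last_checkpoint but only returns checkpoint folders whose save actually finished; the specific file names checked below are assumptions for illustration, not the module's confirmed behavior:

import os
import re

_CHECKPOINT_RE = re.compile(r"^checkpoint-(\d+)$")

def get_complete_last_checkpoint(output_dir: str):
    """Return the newest checkpoint-* folder that looks fully written, or None."""
    candidates = []
    for name in os.listdir(output_dir):
        path = os.path.join(output_dir, name)
        match = _CHECKPOINT_RE.match(name)
        if match is None or not os.path.isdir(path):
            continue
        # Assumed completeness check: trainer state plus model weights must both exist.
        has_state = os.path.isfile(os.path.join(path, "trainer_state.json"))
        has_weights = any(
            os.path.isfile(os.path.join(path, fname))
            for fname in ("pytorch_model.bin", "model.safetensors")
        )
        if has_state and has_weights:
            candidates.append((int(match.group(1)), path))
    # Pick the checkpoint with the highest global step.
    return max(candidates)[1] if candidates else None

With a helper of this shape, the resume logic in main() picks up the newest usable checkpoint automatically when the output directory already contains earlier runs, and falls back to training from scratch otherwise.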