From 899fb71173c122417bf995bfb6f8de55669c0cbc Mon Sep 17 00:00:00 2001 From: wangzaistone Date: Tue, 10 Oct 2023 17:35:37 +0800 Subject: [PATCH 1/4] delete duplicate and no used code any more --- dbgpt_hub/configs/__init__.py | 2 -- dbgpt_hub/configs/data_args.py | 10 +++------- dbgpt_hub/configs/model_args.py | 28 +--------------------------- 3 files changed, 4 insertions(+), 36 deletions(-) diff --git a/dbgpt_hub/configs/__init__.py b/dbgpt_hub/configs/__init__.py index 11832e3..b130860 100644 --- a/dbgpt_hub/configs/__init__.py +++ b/dbgpt_hub/configs/__init__.py @@ -1,7 +1,6 @@ from .model_args import ( ModelArguments, TrainingArguments, - ModelInferenceArguments, ) from .data_args import( DataArguments, @@ -13,5 +12,4 @@ "Llama2Template", "ModelArguments", "TrainingArguments", - "ModelInferenceArguments", ] diff --git a/dbgpt_hub/configs/data_args.py b/dbgpt_hub/configs/data_args.py index 9281e44..bb5ae40 100644 --- a/dbgpt_hub/configs/data_args.py +++ b/dbgpt_hub/configs/data_args.py @@ -1,16 +1,12 @@ import os import json -from typing import List, Literal, Optional -from dataclasses import dataclass, field import tiktoken -from dataclasses import dataclass -from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union - -# from dbgpt_hub.llm_base.loggings import get_logger - +from dataclasses import dataclass, field +from typing import TYPE_CHECKING, Dict, List, Literal, Optional, Tuple, Union if TYPE_CHECKING: from transformers import PreTrainedTokenizer + DEFAULT_PROMPT_DICT = { "prompt_input": ("{instruction}\n\n{input}\n\n"), "prompt_no_input": ("{instruction}\n\n"), diff --git a/dbgpt_hub/configs/model_args.py b/dbgpt_hub/configs/model_args.py index 758de07..0ebf354 100644 --- a/dbgpt_hub/configs/model_args.py +++ b/dbgpt_hub/configs/model_args.py @@ -1,14 +1,10 @@ import json -import os -import yaml import torch from dataclasses import dataclass, field, asdict -from typing import Optional, Any, Dict, List,Literal +from typing import Optional, Any, Dict,Literal from transformers import Seq2SeqTrainingArguments from dbgpt_hub.configs.config import ( MODEL_PATH, - DEFAULT_FT_MODEL_NAME, - DATA_PATH, ADAPTER_PATH, ) @@ -118,28 +114,6 @@ def __post_init__(self): HfFolder.save_token(self.hf_auth_token) -@dataclass -class ModelInferenceArguments: - cache_dir: Optional[str] = field(default=None) - model_name_or_path: Optional[str] = field( - default=MODEL_PATH, metadata={"help": "Path to pre-trained model"} - ) - model_max_length: int = field( - default=1024, - metadata={ - "help": "Maximum sequence length. Sequences will be right padded (and possibly truncated)." - }, - ) - prompt_template: str = field( - default="default", - metadata={ - "help": "Prompt template name. Such as vanilla, alpaca, llama2, vicuna..., etc." 
- }, - ) - source_prefix: Optional[str] = field( - default=None, metadata={"help": "Prefix to prepend to every source text."} - ) - @dataclass From 2f9bf9200437cf81225106f9334a1303584e7eb0 Mon Sep 17 00:00:00 2001 From: wangzaistone Date: Tue, 10 Oct 2023 17:47:56 +0800 Subject: [PATCH 2/4] del no used code in data_process --- dbgpt_hub/data_process/data_utils.py | 279 +------------------------- dbgpt_hub/data_process/sft_dataset.py | 139 ------------- 2 files changed, 4 insertions(+), 414 deletions(-) diff --git a/dbgpt_hub/data_process/data_utils.py b/dbgpt_hub/data_process/data_utils.py index c9c0d47..e4edcfd 100644 --- a/dbgpt_hub/data_process/data_utils.py +++ b/dbgpt_hub/data_process/data_utils.py @@ -1,25 +1,21 @@ import hashlib -from itertools import chain import os import numpy as np import pandas as pd import tiktoken - +from itertools import chain from typing import Any, Dict, List, Optional, Tuple, Union,TYPE_CHECKING,Generator from datasets import Dataset, DatasetDict, concatenate_datasets, load_dataset, interleave_datasets +from transformers.tokenization_utils import PreTrainedTokenizer from dbgpt_hub.configs.config import EXT2TYPE, IGNORE_INDEX from dbgpt_hub.configs.data_args import DEFAULT_PROMPT_DICT,ALPACA_PROMPT_DICT,SQL_PROMPT_DICT,Template,Llama2Template -from transformers.tokenization_utils import PreTrainedTokenizer - -from .sft_dataset import DataCollatorForSupervisedDataset, SFTInstructionDataset if TYPE_CHECKING: - from dbgpt_hub.configs.model_args import ModelArguments,FinetuningArguments,GeneratingArguments + from dbgpt_hub.configs.model_args import ModelArguments from dbgpt_hub.configs.data_args import DataArguments - from datasets import Dataset, IterableDataset + from datasets import IterableDataset from transformers import TrainingArguments,Seq2SeqTrainingArguments - from datasets import Dataset, IterableDataset from dbgpt_hub.llm_base.loggings import get_logger @@ -129,91 +125,6 @@ def load_data( raise ValueError(f"Error loading dataset from {dataset_path}") -def formate_instruction_dataset( - dataset: Dataset, - dataset_name: str, - dataset_format: str, - instruction_template: str = "default", -) -> Optional[Dict[str, Dataset]]: - """ - Formats a given dataset based on its name and format. - - - Removes unused columns, renames columns to 'input' and 'output', - and applies dataset-specific formatting based on the dataset_name. - - Returns formatted dataset dict if dataset can be formatted, else None. - - Args: - dataset: A dataset object to be formatted. - dataset_name: A string representing the name of the dataset to be formatted. - dataset_format: A string representing the name of the dataset format to be used. - instruction_template: A string representing the name of the prompt template to be used. - - Returns: - A dictionary containing the formatted dataset if the dataset exists in the - specified format. - None if the dataset does not exist or if the format is not recognized. - """ - - def _format_self_instruct(dataset: Dataset) -> Dataset: - """Format Self-Instruct dataset. 
- - hf_url: https://huggingface.co/datasets/yizhongw/self_instruct/viewer/self_instruct/train - """ - dataset = dataset.rename_column("prompt", "input") - dataset = dataset.rename_column("completion", "output") - return dataset - - def _remove_unused_columns(dataset): - """Remove columns not named 'input' or 'output'.""" - dataset = dataset.remove_columns( - [ - col - for col in dataset.column_names["train"] - if col not in ["input", "output"] - ] - ) - return dataset - - # Format dataset - print(f"The {dataset_name} using {dataset_format} dataset format.") - if dataset_format == "alpaca": - print("By default, We support the Alpaca dataset format.") - elif dataset_format == "spider": - print("By default, We support the spider dataset format.") - elif dataset_format == "self-instruct": - dataset = _format_self_instruct(dataset) - # elif dataset_format == "hh-rlhf": - # dataset = _format_hh_rlhf(dataset) - # elif dataset_format == "oasst1": - # dataset = _format_oasst1(dataset) - # elif dataset_format == "100PoisonMpts": - # dataset = _format_100Poison(dataset) - # elif dataset_format == "dolly": - # dataset = _format_dolly15k(dataset) - # elif dataset_format == "chip2": - # dataset = _format_chip2(dataset) - else: - raise NotImplementedError( - f"Unsupported dataset format: {dataset_format}, Please add the formate function in data_utils.py" - ) - # encode_instruction_example - print(f"Applying instruction template: {instruction_template}") - if instruction_template == "alpaca": - dataset = dataset.map(extract_alpaca_prompt_dataset) - # elif instruction_template == "spider": - # dataset = dataset.map(extract_sql_prompt_dataset) - # elif instruction_template == "random": - # dataset = dataset.map(extract_random_prompt_dataset) - else: - dataset = dataset.map(extract_default_prompt_dataset) - - # Remove unused columns. 
- print("Removing the unused columns, keep only 'input' and 'output'") - dataset = _remove_unused_columns(dataset) - - return dataset templates: Dict[str, Template] = {} @@ -711,28 +622,6 @@ def preprocess_unsupervised_dataset( return model_inputs - def preprocess_pairwise_dataset(examples): - # build input pairs with format ` X`, `Y1 ` and `Y2 ` - model_inputs = {"prompt_ids": [], "chosen_ids": [], "rejected_ids": []} - for query, response, history, system in construct_example(examples): - prompt_ids, chosen_ids = template.encode_oneturn( - tokenizer, query, response[0], history, system - ) - _, rejected_ids = template.encode_oneturn( - tokenizer, query, response[1], history, system - ) - - if len(prompt_ids) > data_args.max_source_length: - prompt_ids = prompt_ids[: data_args.max_source_length] - if len(chosen_ids) > data_args.max_target_length: - chosen_ids = chosen_ids[: data_args.max_target_length] - if len(rejected_ids) > data_args.max_target_length: - rejected_ids = rejected_ids[: data_args.max_target_length] - - model_inputs["prompt_ids"].append(prompt_ids) - model_inputs["chosen_ids"].append(chosen_ids) - model_inputs["rejected_ids"].append(rejected_ids) - return model_inputs def print_supervised_dataset_example(example): print("input_ids:\n{}".format(example["input_ids"])) @@ -754,34 +643,6 @@ def print_supervised_dataset_example(example): ) ) - def print_pairwise_dataset_example(example): - print("prompt_ids:\n{}".format(example["prompt_ids"])) - print( - "prompt:\n{}".format( - tokenizer.decode(example["prompt_ids"], skip_special_tokens=False) - ) - ) - print("chosen_ids:\n{}".format(example["chosen_ids"])) - print( - "chosen:\n{}".format( - tokenizer.decode(example["chosen_ids"], skip_special_tokens=False) - ) - ) - print("rejected_ids:\n{}".format(example["rejected_ids"])) - print( - "rejected:\n{}".format( - tokenizer.decode(example["rejected_ids"], skip_special_tokens=False) - ) - ) - - def print_unsupervised_dataset_example(example): - print("input_ids:\n{}".format(example["input_ids"])) - print( - "inputs:\n{}".format( - tokenizer.decode(example["input_ids"], skip_special_tokens=False) - ) - ) - dataset = dataset.filter(lambda example: example["prompt"] and example["response"]) preprocess_function = preprocess_supervised_dataset print_function = print_supervised_dataset_example @@ -1002,135 +863,3 @@ def split_train_eval( return train_dataset, eval_dataset -def make_data_module(args): - """ - Make dataset and collator for supervised fine-tuning. 
- Datasets are expected to have the following columns: { `input`, `output` } - - Available datasets to be selected with `dataset` argument: - - alpaca, 52002 examples - - alpaca cleaned, 51942 examples - - chip2 (OIG), 210289 examples - - self-instruct, 82612 examples - - hh-rlhf (Anthropic), 160800 examples - - longform, 23.7k examples - - oasst1 (OpenAssistant) primary message tree only, 9,846 examples - - Coming soon: - - unnatural instructions core, 66010 examples - - unnatural instructions full, 240670 examples - - alpaca-gpt4, 52002 examples - - unnatural-instructions-gpt4, 9000 examples - - supernatural-instructions, 69624 examples (same as paper with 100 ex/task more can be used) - - flan (FLAN v2), up to 20M examples available - - vicuna - - """ - train_datasets: List[Dataset] = [] - eval_datasets: List[Dataset] = [] - dataset_name_list = args.dataset_name.split(",") - print(f"Loading datasets: {dataset_name_list}") - mutliturn_lst = [dataset_attr.multi_turn for dataset_attr in args.datasets_list] - assert mutliturn_lst.count(mutliturn_lst[0]) == len( - mutliturn_lst - ), "All datasets should be multi-turn or single-turn. As follwing we will concat all datasets, so they should be in the same format." - - for dataset_attr in args.datasets_list: - print("=" * 80) - print("DatasetAttr: {}".format(dataset_attr)) - - if dataset_attr.load_from_local: - dataset_path = dataset_attr.local_path - elif dataset_attr.hf_hub_url: - dataset_path = dataset_attr.hf_hub_url - else: - raise ValueError("Please set the dataset path or hf_hub_url.") - - dataset = load_data(dataset_path, eval_dataset_size=args.eval_dataset_size) - - if not dataset_attr.multi_turn: - dataset = formate_instruction_dataset( - dataset, - dataset_name=dataset_attr.dataset_name, - dataset_format=dataset_attr.dataset_format, - instruction_template=args.instruction_template, - ) - - train_dataset, eval_dataset = split_train_eval( - dataset, - do_eval=args.do_eval, - eval_dataset_size=args.eval_dataset_size, - max_eval_samples=args.max_eval_samples, - do_train=args.do_train, - max_train_samples=args.max_train_samples, - ) - if train_dataset: - print( - "loaded dataset:", - dataset_attr.dataset_name, - " ", - "#train data size:", - len(train_dataset), - ) - train_datasets.append(train_dataset) - if eval_dataset: - print( - "loaded dataset:", - dataset_attr.dataset_name, - " " "#eval data size:", - len(eval_dataset), - ) - eval_datasets.append(eval_dataset) - - concate_train = concatenate_datasets(train_datasets) if train_datasets else None - print( - f"Concatenated dataset list: {dataset_name_list}, #train dataset size: {len(concate_train)}" - ) if concate_train else None - concate_eval = concatenate_datasets(eval_datasets) if eval_datasets else None - print( - f"Concatenated dataset list: {dataset_name_list}, #eval dataset size: {len(concate_eval)}" - ) if concate_eval else None - return concate_train, concate_eval, mutliturn_lst[0] - - -def make_supervised_data_module(tokenizer: PreTrainedTokenizer, args): - train_dataset, eval_dataset, multi_turn = make_data_module(args) - max_seq_length = tokenizer.model_max_length - - train_dataset = ( - SFTInstructionDataset( - train_dataset, - tokenizer=tokenizer, - max_seq_len=max_seq_length, - ) - if args.do_train - else None - ) - - eval_dataset = ( - SFTInstructionDataset( - eval_dataset, - tokenizer=tokenizer, - max_seq_len=max_seq_length, - ) - if args.do_eval - else None - ) - - print( - f"train_dataset: {type(train_dataset)}, mutlti-turn: {multi_turn}, #length: 
{len(train_dataset)}" - ) if args.do_train else None - print( - f"eval_dataset: {type(eval_dataset)}, mutlti-turn: {multi_turn}, #length: {len(eval_dataset)}" - ) if args.do_eval else None - - print("Adding data collator: ", DataCollatorForSupervisedDataset) - data_collator = DataCollatorForSupervisedDataset( - tokenizer=tokenizer, predict_with_generate=args.predict_with_generate - ) - - return { - "train_dataset": train_dataset, - "eval_dataset": eval_dataset, - "data_collator": data_collator, - } diff --git a/dbgpt_hub/data_process/sft_dataset.py b/dbgpt_hub/data_process/sft_dataset.py index 5150d88..8eea72a 100644 --- a/dbgpt_hub/data_process/sft_dataset.py +++ b/dbgpt_hub/data_process/sft_dataset.py @@ -14,102 +14,6 @@ logger = logging.getLogger(__name__) -class SFTInstructionDataset(Dataset): - """ - Dataset for supervised fine-tuning of instruction following models. - - Converts raw dataset containing source/target instructions - into tokenized input/target pairs with truncation and padding. - - Attributes: - dataset: The raw dataset containing source/target examples - tokenizer: Tokenizer to use for encoding text - max_seq_len: Maximum sequence length for truncation - - """ - - def __init__( - self, - raw_data: DatasetDict, - tokenizer: PreTrainedTokenizer, - max_seq_len: int = 1024, - ): - """ - Initialize the dataset with the raw data and tokenizer. - - Args: - raw_data: Raw dataset containing source/target examples - tokenizer: Tokenizer to encode text - max_seq_len: Max sequence length for truncation - """ - self.dataset = raw_data - self.tokenizer = tokenizer - self.max_seq_len = max_seq_len - - def __len__(self) -> int: - """Return number of examples in dataset""" - return len(self.dataset) - - def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]: - """ - Convert an raw example into tokenized input/target pair. - - Args: - idx: Index of the example in the dataset - - Returns: - input_ids: tokenized input sequence - labels: tokenized target sequence - """ - - example = self.dataset[idx] - - source_text = example["input"] - source_text = ( - f"{self.tokenizer.bos_token}{source_text}{self.tokenizer.eos_token}" - ) - - target_text = example["output"] - target_text = f"{target_text}{self.tokenizer.eos_token}" - - # Tokenize the source text - tokenized_source = self.tokenizer( - source_text, - max_length=self.max_seq_len, - truncation=True, - add_special_tokens=False, - ) - # Tokenize the example and source text - tokenized_target = self.tokenizer( - target_text, - max_length=self.max_seq_len, - truncation=True, - add_special_tokens=False, - ) - - source_ids = tokenized_source["input_ids"] - target_ids = tokenized_target["input_ids"] - - # Extract the input_ids tensor - if len(source_ids) > self.max_seq_len: - print( - f"Source length {len(source_ids)} exceeds max seq length of {self.max_seq_len}" - ) - # Create the labels tensor - if len(target_ids) > self.max_seq_len: - print( - f"Target length {len(target_ids)} exceeds max seq length of {self.max_seq_len}" - ) - - input_ids = torch.tensor(source_ids + target_ids) - labels = torch.tensor( - [IGNORE_INDEX for _ in range(len(source_ids))] + copy.deepcopy(target_ids) - ) - - # Construct data dictionary containing inputs and labels - data_dict = {"input_ids": input_ids, "labels": labels} - - return data_dict @dataclass @@ -192,48 +96,5 @@ def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]: return data_dict -@dataclass -class DataCollatorForSupervisedDataset: - """ - Collate and pad examples for supervised training. 
- """ - - tokenizer: PreTrainedTokenizer - predict_with_generate: bool = False - - def __call__( - self, examples: List[Dict[str, torch.Tensor]] - ) -> Dict[str, torch.Tensor]: - """ - Collate examples into dictionary for supervised training. - - Args: - examples: List of examples, each containing 'input_ids' and 'labels' - - Returns: - Dictionary with padded 'input_ids', 'attention_mask' and optionally 'labels' - """ - - # Extract input_ids and labels - input_ids = [example["input_ids"] for example in examples] - labels = [example["labels"] for example in examples] - - # Pad input sequences - input_ids = pad_sequence(input_ids, batch_first=True, padding_value=0) - - # Pad labels if needed - if not self.predict_with_generate: - labels = pad_sequence(labels, batch_first=True, padding_value=IGNORE_INDEX) - - # Create attention mask based on padded input - attention_mask = input_ids.ne(0) - - # Assemble final dict - data_dict = {"input_ids": input_ids, "attention_mask": attention_mask} - if labels is not None: - data_dict["labels"] = labels - - return data_dict - # ## TODO 增加 _pad_tensors_to_target_len 函数,并适配 \ No newline at end of file From 6d45416c0c2ec646cc69d05c7bafe9fc1c9c5e3f Mon Sep 17 00:00:00 2001 From: wangzaistone Date: Tue, 10 Oct 2023 18:28:01 +0800 Subject: [PATCH 3/4] test pass and del noted no used code --- dbgpt_hub/data_process/sql_data_process.py | 60 ---------------------- 1 file changed, 60 deletions(-) diff --git a/dbgpt_hub/data_process/sql_data_process.py b/dbgpt_hub/data_process/sql_data_process.py index 6df8b3d..f01ab25 100644 --- a/dbgpt_hub/data_process/sql_data_process.py +++ b/dbgpt_hub/data_process/sql_data_process.py @@ -13,66 +13,6 @@ class ProcessSqlData: def __init__(self) -> None: pass - # def decode_json_file(self, - # data_file: AnyStr, - # table_file: AnyStr, - # is_multiple_turn=False) -> None: - - # # load data form sql_data_info - - # if data_file.endswith(".jsonl"): - # datas, tables = jsonlines.open(data_file), jsonlines.open(table_file) - # elif data_file.endswith(".json"): - # datas, tables = json.load(open(data_file)), json.load(open(table_file)) - - # """ get table info of table name and columns names, examples: - # { - # 'perpetrator': { - # 'tables': ['perpetrator', 'people'], - # 'tables_and_columns': { - # 'perpetrator': ['Perpetrator_ID', 'People_ID', 'Date', 'Year', 'Location', 'Country', 'Killed', 'Injured'], - # 'people': ['People_ID', 'Name', 'Height', 'Weight', 'Home Town'] - # }, - # 'tables_and_primary_key': { - # 'perpetrator': 'Perpetrator_ID' , - # 'people': 'People_ID', - # }, - # } - # } - - # """ - - # db_dict = {} - # for item in tables[0:1]: - - # db_dict[item["db_id"]] = {} - # db_dict[item["db_id"]]["tables"] = item["table_names_original"] - # db_dict[item["db_id"]]["tables_and_columns"] = {} - # db_dict[item["db_id"]]["tables_and_primary_key"] = {} - # db_dict[item["db_id"]]["tables_and_foreign_key"] = {} - # print(db_dict) - - # coloumns = item["column_names_original"][1:] - # primary_key = item["primary_keys"] - # foreign_keys = item["foreign_keys"] - - # for i, table_name in enumerate(item["table_names_original"]): - # coloumns_name = [col[1] for col in coloumns if col[0] == i] - # db_dict[item["db_id"]]["tables_and_columns"][table_name] = coloumns_name - - # # get promary key info - # for j in range(len(primary_key)): - # if coloumns[primary_key[j]-1][0] == i: - # db_dict[item["db_id"]]["tables_and_primary_key"][table_name] = coloumns[primary_key[j]-1][1] - - # for key in foreign_keys: - # source += "The " + 
coloumns[key[0]-1][1] + " of " + tables[coloumns[key[0]-1][0]] + " is the foreign key of " + coloumns[key[1]-1][1] + " of " + tables[coloumns[key[1]-1][0]] + ".\n" - - # print(db_dict) - - # one-turn conversation - # if not is_multiple_turn: - def decode_json_file(self, data_file_list, table_file, out_file): """ TO DO: From a2128db6945b9f138777725ec94712d6ccb4c633 Mon Sep 17 00:00:00 2001 From: wangzaistone Date: Tue, 10 Oct 2023 18:59:12 +0800 Subject: [PATCH 4/4] del unnecessary code and test pass --- dbgpt_hub/llm_base/__init__.py | 3 - dbgpt_hub/llm_base/chat_model.py | 2 - dbgpt_hub/llm_base/config_parser.py | 9 +- dbgpt_hub/llm_base/load_tokenizer.py | 355 +++------------------------ dbgpt_hub/llm_base/loggings.py | 2 - dbgpt_hub/llm_base/model_trainer.py | 2 - dbgpt_hub/scripts/train_sft.sh | 6 +- 7 files changed, 37 insertions(+), 342 deletions(-) diff --git a/dbgpt_hub/llm_base/__init__.py b/dbgpt_hub/llm_base/__init__.py index e7dd7f8..8b13789 100644 --- a/dbgpt_hub/llm_base/__init__.py +++ b/dbgpt_hub/llm_base/__init__.py @@ -1,4 +1 @@ -# from .load_tokenizer import get_accelerate_model -# from .save_peft_model_callback import SavePeftModelCallback -# __all__ = ["get_accelerate_model", "SavePeftModelCallback"] diff --git a/dbgpt_hub/llm_base/chat_model.py b/dbgpt_hub/llm_base/chat_model.py index 1d14384..d2272ec 100644 --- a/dbgpt_hub/llm_base/chat_model.py +++ b/dbgpt_hub/llm_base/chat_model.py @@ -9,8 +9,6 @@ from dbgpt_hub.llm_base.load_tokenizer import dispatch_model, load_model_and_tokenizer from dbgpt_hub.llm_base.model_trainer import get_logits_processor from dbgpt_hub.data_process.data_utils import get_template_and_fix_tokenizer -from dbgpt_hub.data_process.data_utils import extract_sql_prompt_dataset -## TODO: 待参考 src/llmtuner/chat/stream_chat.py class ChatModel: diff --git a/dbgpt_hub/llm_base/config_parser.py b/dbgpt_hub/llm_base/config_parser.py index fc1b17e..044152a 100644 --- a/dbgpt_hub/llm_base/config_parser.py +++ b/dbgpt_hub/llm_base/config_parser.py @@ -1,17 +1,14 @@ import os import sys import torch -from dbgpt_hub.llm_base.loggings import get_logger +import transformers +import datasets from transformers.trainer import WEIGHTS_NAME from transformers.modeling_utils import load_sharded_checkpoint from transformers.trainer import WEIGHTS_NAME, WEIGHTS_INDEX_NAME -from typing import Dict -import datasets -import transformers -from typing import Any, Dict, Optional, Tuple from transformers import HfArgumentParser, Seq2SeqTrainingArguments from transformers.trainer_utils import get_last_checkpoint - +from typing import Any, Dict, Optional, Tuple from dbgpt_hub.llm_base.loggings import get_logger from dbgpt_hub.configs.model_args import ( ModelArguments, diff --git a/dbgpt_hub/llm_base/load_tokenizer.py b/dbgpt_hub/llm_base/load_tokenizer.py index 665565e..5f11699 100644 --- a/dbgpt_hub/llm_base/load_tokenizer.py +++ b/dbgpt_hub/llm_base/load_tokenizer.py @@ -1,29 +1,18 @@ -import argparse import os -import warnings -import importlib import torch -from packaging import version -from os.path import join -from typing import Optional, Tuple,Dict -import bitsandbytes as bnb -from peft import LoraConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training -from transformers import PreTrainedModel, PreTrainedTokenizer -from peft.tuners.lora import LoraLayer -from transformers import ( - AutoTokenizer, - AutoModelForCausalLM, - BitsAndBytesConfig, - LlamaTokenizer, -) - - -import os import math -import torch +from typing import Optional, Tuple,Dict,TYPE_CHECKING, 
Literal,List from types import MethodType -from typing import TYPE_CHECKING, Literal, Optional, Tuple,List +from trl import AutoModelForCausalLMWithValueHead +from dbgpt_hub.llm_base.loggings import reset_logging, get_logger +from dbgpt_hub.configs.model_args import FinetuningArguments +from dbgpt_hub.llm_base.adapter import init_adapter +from dbgpt_hub.configs.config import LAYERNORM_NAMES,VALUE_HEAD_FILE_NAME +from transformers import PreTrainedModel, PreTrainedTokenizer +from transformers.utils import check_min_version +from transformers.utils.versions import require_version +from transformers.deepspeed import is_deepspeed_zero3_enabled from transformers import ( AutoConfig, AutoModelForCausalLM, @@ -33,16 +22,6 @@ PreTrainedModel, PreTrainedTokenizerBase, ) -from transformers.utils import check_min_version -from transformers.utils.versions import require_version -from transformers.deepspeed import is_deepspeed_zero3_enabled -from trl import AutoModelForCausalLMWithValueHead - -from dbgpt_hub.llm_base.loggings import reset_logging, get_logger -# from llmtuner.extras.misc import count_parameters, prepare_model_for_training -from dbgpt_hub.configs.model_args import FinetuningArguments -from dbgpt_hub.llm_base.adapter import init_adapter -from dbgpt_hub.configs.config import LAYERNORM_NAMES,VALUE_HEAD_FILE_NAME if TYPE_CHECKING: from transformers import PreTrainedTokenizer @@ -59,280 +38,6 @@ require_version("trl>=0.5.0", "To fix: pip install trl>=0.5.0") -# from dbgpt_hub.utils.model_utils import ( -# smart_tokenizer_and_embedding_resize, -# find_all_linear_names, -# ) - - - -def is_ipex_available(): - def get_major_and_minor_from_version(full_version): - return ( - str(version.parse(full_version).major) - + "." - + str(version.parse(full_version).minor) - ) - - _torch_version = importlib.metadata.version("torch") - if importlib.util.find_spec("intel_extension_for_pytorch") is None: - return False - _ipex_version = "N/A" - try: - _ipex_version = importlib.metadata.version("intel_extension_for_pytorch") - except importlib.metadata.PackageNotFoundError: - return False - torch_major_and_minor = get_major_and_minor_from_version(_torch_version) - ipex_major_and_minor = get_major_and_minor_from_version(_ipex_version) - if torch_major_and_minor != ipex_major_and_minor: - warnings.warn( - f"Intel Extension for PyTorch {ipex_major_and_minor} needs to work with PyTorch {ipex_major_and_minor}.*," - f" but PyTorch {_torch_version} is found. Please switch to the matching version and run again." - ) - return False - return True - - -def smart_tokenizer_and_embedding_resize( - special_tokens_dict: Dict[str, str], - tokenizer: PreTrainedTokenizer, - model: PreTrainedModel, -) -> None: - """Resize tokenizer and embedding to accommodate new special tokens. - 改变tokenizer和embedding的尺寸。 - 一般需要将tokenizer和embedding的尺寸设置为64的倍数,方便GPU加速。 - - Args: - special_tokens_dict (Dict[str, str]): A dictionary of special tokens to be added to the tokenizer. - tokenizer (PreTrainedTokenizer): The tokenizer object to be resized. - model (PreTrainedModel): The model object whose token embeddings are to be resized. - - Returns: - None - - Note: This function resizes the tokenizer to accommodate additional special tokens and the - embedding matrix of the model to match the new size of the tokenizer. If any new special tokens - have been added, the function computes the average embedding values of the existing embeddings - and sets those values for the new special token embeddings. 
This is done separately for the input - embeddings and output embeddings of the model. - """ - - num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict) - model.resize_token_embeddings(len(tokenizer)) - - if num_new_tokens > 0: - input_embeddings_data = model.get_input_embeddings().weight.data - output_embeddings_data = model.get_output_embeddings().weight.data - - # Compute average embeddings of existing tokens - input_embeddings_avg = input_embeddings_data[:-num_new_tokens].mean( - dim=0, keepdim=True - ) - output_embeddings_avg = output_embeddings_data[:-num_new_tokens].mean( - dim=0, keepdim=True - ) - - input_embeddings_data[-num_new_tokens:] = input_embeddings_avg - output_embeddings_data[-num_new_tokens:] = output_embeddings_avg - - -def find_all_linear_names( - args: argparse.Namespace, model: torch.nn.Module -) -> List[str]: - """ - Returns a list of names of all linear layers present in the given model. - Args: - args (argparse.Namespace): A namespace containing arguments of the script. - model (torch.nn.Module): The PyTorch model to extract linear layer names from. - - Returns: - List[str]: A list of names of all linear layers present in the given model. - - Raises: - TypeError: If `args` is not an instance of `argparse.Namespace`, or if `model` is not an instance \ - of `torch.nn.Module`. - ValueError: If `args.bits` is not 4 or 8. - - Example Usage: - >>> import argparse - >>> parser = argparse.ArgumentParser() - >>> parser.add_argument('--bits', type=int) - >>> args = parser.parse_args(['--bits', '4']) - >>> model = torch.nn.Sequential(torch.nn.Linear(10, 5), torch.nn.Linear(5, 1)) - >>> find_all_linear_names(args, model) - ['0', '1'] - """ - # Determine the correct linear layer class based on the value of `args.bits` - if args.bits == 4: - cls = bnb.nn.Linear4bit - elif args.bits == 8: - cls = bnb.nn.Linear8bitLt - else: - torch.nn.Linear - - lora_module_names = set() - for name, module in model.named_modules(): - # Check if the current module is an instance of the linear layer class - if isinstance(module, cls): - # If yes, split the name of the module into its component parts and add the first or last part to the set - names = name.split(".") - lora_module_names.add(names[0] if len(names) == 1 else names[-1]) - - # Remove 'lm_head' from the set if present (needed for 16-bit) - if "lm_head" in lora_module_names: - lora_module_names.remove("lm_head") - - # Convert the set into a list and return it - return list(lora_module_names) - - - - -## TODO 待将此处的所有调用都替换掉,过去在train_qlora和predict_qlora中用了,待替换,然后删除此处历史代码。 -def get_accelerate_model( - args: argparse.Namespace = None, checkpoint_dir: Optional[str] = None -): - if torch.cuda.is_available(): - n_gpus = torch.cuda.device_count() - if is_ipex_available() and torch.xpu.is_available(): - n_gpus = torch.xpu.device_count() - - max_memory = f"{args.max_memory_MB}MB" - max_memory = {i: max_memory for i in range(n_gpus)} - device_map = "auto" - - # if we are in a distributed setting, we need to set the device map and max memory per device - if os.environ.get("LOCAL_RANK") is not None: - local_rank = int(os.environ.get("LOCAL_RANK", "0")) - device_map = {"": local_rank} - max_memory = {"": max_memory[local_rank]} - - if args.full_finetune: - assert args.bits in [16, 32] - - print(f"loading base model {args.model_name_or_path}...") - compute_dtype = ( - torch.float16 if args.fp16 else (torch.bfloat16 if args.bf16 else torch.float32) - ) - model = AutoModelForCausalLM.from_pretrained( - args.model_name_or_path, - 
cache_dir=args.cache_dir, - load_in_4bit=args.bits == 4, - load_in_8bit=args.bits == 8, - device_map=device_map, - max_memory=max_memory, - quantization_config=BitsAndBytesConfig( - load_in_4bit=args.bits == 4, - load_in_8bit=args.bits == 8, - llm_int8_threshold=6.0, - llm_int8_has_fp16_weight=False, - bnb_4bit_compute_dtype=compute_dtype, - bnb_4bit_use_double_quant=args.double_quant, - bnb_4bit_quant_type=args.quant_type, - ), - torch_dtype=( - torch.float32 - if args.fp16 - else (torch.bfloat16 if args.bf16 else torch.float32) - ), - trust_remote_code=args.trust_remote_code, - use_auth_token=args.use_auth_token, - ) - if compute_dtype == torch.float16 and args.bits == 4: - if torch.cuda.is_bf16_supported(): - print("=" * 80) - print( - "Your GPU supports bfloat16, you can accelerate training with the argument --bf16" - ) - print("=" * 80) - - if compute_dtype == torch.float16 and ( - is_ipex_available() and torch.xpu.is_available() - ): - compute_dtype = torch.bfloat16 - print("Intel XPU does not support float16 yet, so switching to bfloat16") - - setattr(model, "model_parallel", True) - setattr(model, "is_parallelizable", True) - - model.config.torch_dtype = ( - torch.float32 if args.fp16 else (torch.bfloat16 if args.bf16 else torch.float32) - ) - - # Tokenizer - tokenizer = AutoTokenizer.from_pretrained( - args.model_name_or_path, - cache_dir=args.cache_dir, - padding_side="right", - use_fast=False, # Fast tokenizer giving issues. - tokenizer_type="llama" - if ( - "llama" in args.model_name_or_path or "CodeLlama" in args.model_name_or_path - ) - else None, # Needed for HF name change - trust_remote_code=args.trust_remote_code, - use_auth_token=args.use_auth_token, - ) - if tokenizer._pad_token is None: - smart_tokenizer_and_embedding_resize( - special_tokens_dict=dict(pad_token="[PAD]"), - tokenizer=tokenizer, - model=model, - ) - if "llama" in args.model_name_or_path or isinstance(tokenizer, LlamaTokenizer): - # LLaMA tokenizer may not have correct special tokens set. - # Check and add them if missing to prevent them from being parsed into different tokens. - # Note that these are present in the vocabulary. - # Note also that `model.config.pad_token_id` is 0 which corresponds to `` token. 
- print("Adding special tokens.") - tokenizer.add_special_tokens( - { - "eos_token": tokenizer.convert_ids_to_tokens(model.config.eos_token_id), - "bos_token": tokenizer.convert_ids_to_tokens(model.config.bos_token_id), - "unk_token": tokenizer.convert_ids_to_tokens( - model.config.pad_token_id - if model.config.pad_token_id != -1 - else tokenizer.pad_token_id - ), - } - ) - - if not args.full_finetune: - model = prepare_model_for_kbit_training( - model, use_gradient_checkpointing=args.gradient_checkpointing - ) - - if not args.full_finetune: - if checkpoint_dir is not None: - print("Loading adapters from checkpoint.") - model = PeftModel.from_pretrained( - model, join(checkpoint_dir, "adapter_model"), is_trainable=True - ) - else: - print(f"adding LoRA modules...") - modules = find_all_linear_names(args, model) - config = LoraConfig( - r=args.lora_r, - lora_alpha=args.lora_alpha, - target_modules=modules, - lora_dropout=args.lora_dropout, - bias="none", - task_type="CAUSAL_LM", - ) - model = get_peft_model(model, config) - - for name, module in model.named_modules(): - if isinstance(module, LoraLayer): - if args.bf16: - module = module.to(torch.bfloat16) - if "norm" in name: - module = module.to(torch.float32) - if "lm_head" in name or "embed_tokens" in name: - if hasattr(module, "weight"): - if args.bf16 and module.weight.dtype == torch.float32: - module = module.to(torch.bfloat16) - return model, tokenizer - def count_parameters(model: torch.nn.Module) -> Tuple[int, int]: r""" @@ -401,6 +106,27 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: +def load_valuehead_params(model: torch.nn.Module, checkpoint_dir: os.PathLike) -> bool: + valuehead_file = os.path.join(checkpoint_dir, VALUE_HEAD_FILE_NAME) + if not os.path.exists(valuehead_file): + logger.warning( + "Provided path ({}) does not contain valuehead weights.".format( + checkpoint_dir + ) + ) + return False + valuehead_state_dict = torch.load(valuehead_file, map_location="cpu") + model.register_buffer("reward_head_weight", valuehead_state_dict["summary.weight"]) + model.register_buffer("reward_head_bias", valuehead_state_dict["summary.bias"]) + model.register_buffer( + "default_head_weight", torch.zeros_like(valuehead_state_dict["summary.weight"]) + ) + model.register_buffer( + "default_head_bias", torch.zeros_like(valuehead_state_dict["summary.bias"]) + ) + return True + + def load_model_and_tokenizer( model_args: "ModelArguments", finetuning_args: "FinetuningArguments", @@ -616,25 +342,6 @@ def load_model_and_tokenizer( return model, tokenizer -def load_valuehead_params(model: torch.nn.Module, checkpoint_dir: os.PathLike) -> bool: - valuehead_file = os.path.join(checkpoint_dir, VALUE_HEAD_FILE_NAME) - if not os.path.exists(valuehead_file): - logger.warning( - "Provided path ({}) does not contain valuehead weights.".format( - checkpoint_dir - ) - ) - return False - valuehead_state_dict = torch.load(valuehead_file, map_location="cpu") - model.register_buffer("reward_head_weight", valuehead_state_dict["summary.weight"]) - model.register_buffer("reward_head_bias", valuehead_state_dict["summary.bias"]) - model.register_buffer( - "default_head_weight", torch.zeros_like(valuehead_state_dict["summary.weight"]) - ) - model.register_buffer( - "default_head_bias", torch.zeros_like(valuehead_state_dict["summary.bias"]) - ) - return True def dispatch_model(model: "PreTrainedModel") -> "PreTrainedModel": r""" diff --git a/dbgpt_hub/llm_base/loggings.py b/dbgpt_hub/llm_base/loggings.py index a7e7c06..f253f2a 100644 --- 
a/dbgpt_hub/llm_base/loggings.py
+++ b/dbgpt_hub/llm_base/loggings.py
@@ -5,10 +5,8 @@
 import time
 from typing import TYPE_CHECKING
 from datetime import timedelta
-
 from transformers import TrainerCallback
 from transformers.trainer_utils import has_length
-
 from dbgpt_hub.configs.config import LOG_FILE_NAME
 
 if TYPE_CHECKING:
diff --git a/dbgpt_hub/llm_base/model_trainer.py b/dbgpt_hub/llm_base/model_trainer.py
index a7a8493..bef69c1 100644
--- a/dbgpt_hub/llm_base/model_trainer.py
+++ b/dbgpt_hub/llm_base/model_trainer.py
@@ -13,7 +13,6 @@
 from dbgpt_hub.llm_base.loggings import get_logger
 from dbgpt_hub.llm_base.config_parser import get_train_args, get_state_dict,load_trainable_params
 from dbgpt_hub.llm_base.load_tokenizer import load_model_and_tokenizer
-
 from dbgpt_hub.configs.config import VALUE_HEAD_FILE_NAME,FINETUNING_ARGS_NAME
 from transformers import Seq2SeqTrainer
 from transformers.trainer import TRAINING_ARGS_NAME, WEIGHTS_NAME
@@ -22,7 +21,6 @@
 
 from transformers.generation.logits_process import LogitsProcessor
 from transformers.generation.utils import LogitsProcessorList
-
 from peft import PeftModel
 from trl import PreTrainedModelWrapper
 from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union,Sequence
diff --git a/dbgpt_hub/scripts/train_sft.sh b/dbgpt_hub/scripts/train_sft.sh
index df07c22..cbe2187 100644
--- a/dbgpt_hub/scripts/train_sft.sh
+++ b/dbgpt_hub/scripts/train_sft.sh
@@ -18,10 +18,10 @@ CUDA_VISIBLE_DEVICES=0 python dbgpt_hub/train/sft_train.py \
     --per_device_train_batch_size 1 \
     --gradient_accumulation_steps 4 \
     --lr_scheduler_type cosine_with_restarts \
-    --logging_steps 250 \
-    --save_steps 500 \
+    --logging_steps 10 \
+    --save_steps 10 \
     --learning_rate 5e-5 \
-    --num_train_epochs 2 \
+    --num_train_epochs 0.2 \
     --plot_loss
 # --bf16 # V100 does not support bf16
 # test num_train_epochs set to 0.1
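
The last hunk above switches dbgpt_hub/scripts/train_sft.sh to quick smoke-test values (frequent logging and checkpointing, and a fractional epoch count) so the trimmed-down pipeline can be exercised end to end in a few minutes. As a rough sketch, the effective command after this patch looks like the following; only the flags visible in this hunk are listed, and it is assumed that the model, dataset, and output arguments defined earlier in train_sft.sh (not shown in this hunk) are still passed unchanged:

    CUDA_VISIBLE_DEVICES=0 python dbgpt_hub/train/sft_train.py \
        --per_device_train_batch_size 1 \
        --gradient_accumulation_steps 4 \
        --lr_scheduler_type cosine_with_restarts \
        --logging_steps 10 \
        --save_steps 10 \
        --learning_rate 5e-5 \
        --num_train_epochs 0.2 \
        --plot_loss
        # ...plus the model/dataset/output-dir flags from the top of train_sft.sh (assumed unchanged)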