fix: pylint format codestyle
csunny committed Dec 19, 2023
2 parents f31c00e + 4d48845 commit 1156cd6
Showing 8 changed files with 205 additions and 94 deletions.
24 changes: 11 additions & 13 deletions dbgpt_hub/baseline/show_result.py
@@ -1,6 +1,7 @@
import os
import sys
import json
import pkgutil
from typing import Optional, Dict, Any
from prettytable.colortable import ColorTable, Theme

@@ -27,7 +28,7 @@
"all",
]

baseline_file = "./dbgpt_hub/baseline/baseline.json"
baseline_file = "baseline/baseline.json"
ALPACA = 'I want you to act as a SQL terminal in front of an example database, you need only to return the sql command to me.Below is an instruction that describes a task, Write a response that appropriately completes the request.\\n\\"\\n##Instruction:\\ndepartment_management contains tables such as department, head, management. Table department has columns such as Department_ID, Name, Creation, Ranking, Budget_in_Billions, Num_Employees. Department_ID is the primary key.\\nTable head has columns such as head_ID, name, born_state, age. head_ID is the primary key.\\nTable management has columns such as department_ID, head_ID, temporary_acting. department_ID is the primary key.\\nThe head_ID of management is the foreign key of head_ID of head.\\nThe department_ID of management is the foreign key of Department_ID of department.\\n\\n'
OPENAI = "openai"

@@ -73,18 +74,11 @@ def init_baseline_json():
json.dump(json_data, file, indent=4)


with open(baseline_file, "r") as file:
baseline_json = json.load(file)


def print_color_table_score(acc_data, dataset, model, method, prompt):
model_data = [dataset, model, method, prompt]
print_table_scores = ColorTable(theme=MYTHEME)
print_table_scores.field_names = HEADER
model_ex = get_model_score(acc_data, "ex", model_data)
model_em = get_model_score(acc_data, "em", model_data)
print_table_scores.add_rows([model_em, model_ex])
print(print_table_scores, "\n")
data = pkgutil.get_data("dbgpt_hub", baseline_file)
if data is not None:
baseline_json = json.loads(data.decode("utf-8"))
else:
raise FileNotFoundError("The JSON file was not found in the package.")


def table_add_row(table_scores, acc_data, dataset, model, method, prompt):
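
The pkgutil-based lookup added above replaces an open() call on a working-directory-relative path, so the baseline scores are found wherever dbgpt_hub is installed, as long as baseline/baseline.json ships inside the package. A minimal standalone sketch of the same pattern:

    import json
    import pkgutil

    # Resolve the resource relative to the installed dbgpt_hub package,
    # not relative to the current working directory.
    raw = pkgutil.get_data("dbgpt_hub", "baseline/baseline.json")
    if raw is None:
        raise FileNotFoundError("baseline/baseline.json is not bundled with dbgpt_hub")
    baseline_json = json.loads(raw.decode("utf-8"))
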
@@ -150,6 +144,8 @@ def show_score(dataset=None, model=None, method=None, prompt=None):
table_scores = ColorTable(theme=MYTHEME)
table_scores.field_names = HEADER
add_scores_to_table(table_scores, json_data, dataset, model, method, prompt)
table_scores.sortby = "all"
table_scores.reversesort = True
print(table_scores)


@@ -193,6 +189,8 @@ def show_scores():
table_scores = table_add_row(
table_scores, acc_data, dataset, model, method, prompt
)
table_scores.sortby = "all"
table_scores.reversesort = True
print(table_scores, "\n")


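The sortby / reversesort assignments added in both hunks above order the printed rows by the "all" column, highest score first; prettytable applies the sort when the table is rendered, so the attributes can be set after the rows are added. A small illustration with made-up field names and scores:

    from prettytable import PrettyTable

    table = PrettyTable()
    table.field_names = ["model", "etype", "all"]
    table.add_rows([["model-a-lora", "ex", 61.2], ["model-b-lora", "ex", 70.4]])

    table.sortby = "all"       # sort rows on the "all" column at render time
    table.reversesort = True   # descending, so the best score prints first
    print(table)
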
4 changes: 2 additions & 2 deletions dbgpt_hub/configs/data_args.py
@@ -83,11 +83,11 @@ class DataArguments:
)
cutoff_len: Optional[int] = field(
default=1024,
metadata={"help": "The maximum length of the model inputs after tokenization."}
metadata={"help": "The maximum length of the model inputs after tokenization."},
)
reserved_label_len: Optional[int] = field(
default=1,
metadata={"help": "The maximum length reserved for label after tokenization."}
metadata={"help": "The maximum length reserved for label after tokenization."},
)
split: Optional[str] = field(
default="train",
10 changes: 5 additions & 5 deletions dbgpt_hub/configs/model_args.py
@@ -95,12 +95,13 @@ class ModelArguments:
},
)
hf_hub_token: Optional[str] = field(
default=None,
metadata={"help": "Auth token to log in with Hugging Face Hub."}
default=None, metadata={"help": "Auth token to log in with Hugging Face Hub."}
)
split_special_tokens: Optional[bool] = field(
default=False,
metadata={"help": "Whether or not the special tokens should be split during the tokenization process."}
metadata={
"help": "Whether or not the special tokens should be split during the tokenization process."
},
)

def __post_init__(self):
@@ -191,8 +192,7 @@ class FinetuningArguments:
Arguments pertaining to which techniques we are going to fine-tuning with.
"""
stage: Optional[Literal["sft", "rm"]] = field(
default="sft",
metadata={"help": "Which stage will be performed in training."}
default="sft", metadata={"help": "Which stage will be performed in training."}
)
finetuning_type: Optional[Literal["lora", "freeze", "full", "none"]] = field(
default="lora", metadata={"help": "Which fine-tuning method to use."}
73 changes: 57 additions & 16 deletions dbgpt_hub/data_process/data_utils.py
@@ -4,7 +4,17 @@
import pandas as pd
import tiktoken
from itertools import chain
from typing import Any, Dict, List, Optional, Tuple, Union, TYPE_CHECKING, Generator, Literal
from typing import (
Any,
Dict,
List,
Optional,
Tuple,
Union,
TYPE_CHECKING,
Generator,
Literal,
)
from datasets import (
Dataset,
DatasetDict,
@@ -63,8 +73,13 @@ def extract_sql_prompt_dataset(example: Dict[str, Any]) -> Dict[str, str]:
prompt_format = SQL_PROMPT_DICT["prompt_no_input"]
return {"input": prompt_format.format(**example)}

def infer_max_len(source_len: int, target_len: int, data_args: "DataArguments") -> Tuple[int, int]:
max_target_len = int(data_args.cutoff_len * (target_len / (source_len + target_len)))

def infer_max_len(
source_len: int, target_len: int, data_args: "DataArguments"
) -> Tuple[int, int]:
max_target_len = int(
data_args.cutoff_len * (target_len / (source_len + target_len))
)
max_target_len = max(max_target_len, data_args.reserved_label_len)
max_source_len = data_args.cutoff_len - max_target_len
return max_source_len, max_target_len
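
As reformatted above, infer_max_len splits the cutoff_len budget between prompt and response in proportion to their raw lengths, while reserving at least reserved_label_len tokens for the response. A quick worked example using the defaults from data_args.py (cutoff_len=1024, reserved_label_len=1) and made-up lengths:

    cutoff_len, reserved_label_len = 1024, 1   # defaults from DataArguments
    source_len, target_len = 900, 300          # hypothetical tokenized lengths

    max_target_len = int(cutoff_len * (target_len / (source_len + target_len)))  # int(1024 * 0.25) == 256
    max_target_len = max(max_target_len, reserved_label_len)                     # still 256
    max_source_len = cutoff_len - max_target_len                                 # 768
    print(max_source_len, max_target_len)                                        # 768 256
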
@@ -585,7 +600,7 @@ def preprocess_dataset(
tokenizer: "PreTrainedTokenizer",
data_args: "DataArguments",
training_args: "Seq2SeqTrainingArguments",
stage: Literal["pt", "sft", "rm", "ppo"]
stage: Literal["pt", "sft", "rm", "ppo"],
) -> Union["Dataset", "IterableDataset"]:
column_names = list(next(iter(dataset)).keys())
template = get_template_and_fix_tokenizer(data_args.template, tokenizer)
@@ -677,22 +692,37 @@ def preprocess_unsupervised_dataset(

return model_inputs

def preprocess_pairwise_dataset(examples: Dict[str, List[Any]]) -> Dict[str, List[List[int]]]:
def preprocess_pairwise_dataset(
examples: Dict[str, List[Any]]
) -> Dict[str, List[List[int]]]:
# build input pairs with format `<bos> X`, `Y1 <eos>` and `Y2 <eos>` for rm stage
model_inputs = {"prompt_ids": [], "chosen_ids": [], "rejected_ids": []}
for query, response, history, system in construct_example(examples):
if not (isinstance(query, str) and isinstance(response, list) and query != "" and len(response) > 1):
if not (
isinstance(query, str)
and isinstance(response, list)
and query != ""
and len(response) > 1
):
continue

prompt_ids, chosen_ids = template.encode_oneturn(tokenizer, query, response[0], history, system)
_, rejected_ids = template.encode_oneturn(tokenizer, query, response[1], history, system)
prompt_ids, chosen_ids = template.encode_oneturn(
tokenizer, query, response[0], history, system
)
_, rejected_ids = template.encode_oneturn(
tokenizer, query, response[1], history, system
)

# if template.efficient_eos:
chosen_ids += [tokenizer.eos_token_id]
rejected_ids += [tokenizer.eos_token_id]

source_len, target_len = len(prompt_ids), max(len(chosen_ids), len(rejected_ids))
max_source_len, max_target_len = infer_max_len(source_len, target_len, data_args)
source_len, target_len = len(prompt_ids), max(
len(chosen_ids), len(rejected_ids)
)
max_source_len, max_target_len = infer_max_len(
source_len, target_len, data_args
)
if source_len > max_source_len:
prompt_ids = prompt_ids[:max_source_len]
if target_len > max_target_len:
@@ -704,14 +734,26 @@ def preprocess_pairwise_dataset(examples: Dict[str, List[Any]]) -> Dict[str, List[List[int]]]:
model_inputs["rejected_ids"].append(rejected_ids)

return model_inputs

def print_pairwise_dataset_example(example: Dict[str, List[int]]) -> None:
print("prompt_ids:\n{}".format(example["prompt_ids"]))
print("prompt:\n{}".format(tokenizer.decode(example["prompt_ids"], skip_special_tokens=False)))
print(
"prompt:\n{}".format(
tokenizer.decode(example["prompt_ids"], skip_special_tokens=False)
)
)
print("chosen_ids:\n{}".format(example["chosen_ids"]))
print("chosen:\n{}".format(tokenizer.decode(example["chosen_ids"], skip_special_tokens=False)))
print(
"chosen:\n{}".format(
tokenizer.decode(example["chosen_ids"], skip_special_tokens=False)
)
)
print("rejected_ids:\n{}".format(example["rejected_ids"]))
print("rejected:\n{}".format(tokenizer.decode(example["rejected_ids"], skip_special_tokens=False)))
print(
"rejected:\n{}".format(
tokenizer.decode(example["rejected_ids"], skip_special_tokens=False)
)
)

def print_supervised_dataset_example(example):
print("input_ids:\n{}".format(example["input_ids"]))
@@ -733,7 +775,6 @@ def print_supervised_dataset_example(example):
)
)


if stage == "pt":
pass
elif stage == "sft" and not training_args.predict_with_generate:
@@ -744,7 +785,7 @@ def print_supervised_dataset_example(example):
preprocess_function = preprocess_pairwise_dataset
print_function = print_pairwise_dataset_example
else:
pass
pass

with training_args.main_process_first(desc="dataset map pre-processing"):
kwargs = {}
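
For orientation, the rm-stage preprocess_pairwise_dataset reformatted above produces one prompt_ids/chosen_ids/rejected_ids triple per example, budgets the shared prompt against the longer of the two responses via infer_max_len, and trims whatever exceeds the budget. A toy sketch of that truncation step, with invented token ids and an assumed 8-source / 4-target budget (the diff cuts off after the target-length check, so trimming both responses identically is an assumption here):

    prompt_ids = list(range(10))          # 10 invented prompt token ids
    chosen_ids = list(range(100, 107))    # 7 tokens for the preferred response
    rejected_ids = list(range(200, 204))  # 4 tokens for the rejected response

    source_len = len(prompt_ids)
    target_len = max(len(chosen_ids), len(rejected_ids))

    max_source_len, max_target_len = 8, 4  # assume infer_max_len returned this budget

    if source_len > max_source_len:
        prompt_ids = prompt_ids[:max_source_len]
    if target_len > max_target_len:
        chosen_ids = chosen_ids[:max_target_len]      # assumption: both responses are
        rejected_ids = rejected_ids[:max_target_len]  # trimmed to the same budget

    print(len(prompt_ids), len(chosen_ids), len(rejected_ids))  # 8 4 4
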
49 changes: 31 additions & 18 deletions dbgpt_hub/llm_base/load_tokenizer.py
@@ -106,22 +106,20 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:


def load_valuehead_params(
path_or_repo_id: str,
model_args: "ModelArguments"
path_or_repo_id: str, model_args: "ModelArguments"
) -> Dict[str, torch.Tensor]:
r"""
Loads value head parameters from Hugging Face Hub or local disk.
Returns: dict with keys `v_head.summary.weight` and `v_head.summary.bias`.
"""
kwargs = {
"path_or_repo_id": path_or_repo_id,
"cache_dir": model_args.cache_dir
}
kwargs = {"path_or_repo_id": path_or_repo_id, "cache_dir": model_args.cache_dir}

if "token" in inspect.signature(cached_file).parameters:
kwargs["token"] = model_args.hf_hub_token
elif "use_auth_token" in inspect.signature(cached_file).parameters: # for transformers==4.31.0
elif (
"use_auth_token" in inspect.signature(cached_file).parameters
): # for transformers==4.31.0
kwargs["use_auth_token"] = model_args.hf_hub_token
else:
logger.warning("Ignore `hf_hub_token` since matched parameter is not found.")
@@ -134,24 +132,27 @@ def load_valuehead_params(

try:
from safetensors import safe_open

vhead_file = cached_file(filename=SAFE_WEIGHTS_NAME, **kwargs)
with safe_open(vhead_file, framework="pt", device="cpu") as f:
return {
"v_head.summary.weight": f.get_tensor("v_head.summary.weight"),
"v_head.summary.bias": f.get_tensor("v_head.summary.bias")
"v_head.summary.bias": f.get_tensor("v_head.summary.bias"),
}
except Exception as err:
logger.info("Failed to load {}: {}".format(SAFE_WEIGHTS_NAME, str(err)))

logger.warning("Provided path ({}) does not contain valuehead weights.".format(path_or_repo_id))
logger.warning(
"Provided path ({}) does not contain valuehead weights.".format(path_or_repo_id)
)
return None


def load_model_and_tokenizer(
model_args: "ModelArguments",
finetuning_args: "FinetuningArguments",
is_trainable: Optional[bool] = False,
add_valuehead: Optional[bool] = False
add_valuehead: Optional[bool] = False,
) -> Tuple[PreTrainedModel, "PreTrainedTokenizer"]:
r"""
Loads pretrained model and tokenizer.
@@ -175,7 +176,7 @@ def load_model_and_tokenizer(
model_args.model_name_or_path,
use_fast=model_args.use_fast_tokenizer,
split_special_tokens=model_args.split_special_tokens,
padding_side="right", # training with left-padded tensors in fp16 precision may cause overflow
padding_side="right", # training with left-padded tensors in fp16 precision may cause overflow
**config_kwargs
)

@@ -195,11 +196,15 @@ def load_model_and_tokenizer(
else:
setattr(config, "fp16", True)

# Fix config (for Qwen)
# Fix config (for Qwen)
if getattr(config, "model_type", None) == "qwen":
for dtype_name, dtype in [("fp16", torch.float16), ("bf16", torch.bfloat16), ("fp32", torch.float32)]:
for dtype_name, dtype in [
("fp16", torch.float16),
("bf16", torch.bfloat16),
("fp32", torch.float32),
]:
setattr(config, dtype_name, getattr(config, "torch_dtype", None) == dtype)

# Set RoPE scaling
if model_args.rope_scaling is not None:
if hasattr(config, "use_dynamic_ntk"): # for Qwen models
@@ -324,12 +329,20 @@ def load_model_and_tokenizer(

# Prepare model with valuehead for RLHF
if add_valuehead:
model: "AutoModelForCausalLMWithValueHead" = AutoModelForCausalLMWithValueHead.from_pretrained(model)
ignore_modules = [name for name, _ in model.named_parameters() if "pretrained_model" in name]
model: "AutoModelForCausalLMWithValueHead" = (
AutoModelForCausalLMWithValueHead.from_pretrained(model)
)
ignore_modules = [
name for name, _ in model.named_parameters() if "pretrained_model" in name
]
setattr(model, "_keys_to_ignore_on_save", ignore_modules)
setattr(model, "tie_weights", MethodType(lambda _: None, model)) # use empty method
setattr(
model, "tie_weights", MethodType(lambda _: None, model)
) # use empty method
vhead_path = (
model_args.checkpoint_dir[-1] if model_args.checkpoint_dir is not None else model_args.model_name_or_path
model_args.checkpoint_dir[-1]
if model_args.checkpoint_dir is not None
else model_args.model_name_or_path
)
vhead_params = load_valuehead_params(vhead_path, model_args)
if vhead_params is not None:
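
The value-head loader in this file reads exactly two tensors out of a safetensors checkpoint rather than deserializing the whole file. A self-contained sketch of that access pattern (the value_head.safetensors file name and the zero tensors are only for illustration):

    import torch
    from safetensors import safe_open
    from safetensors.torch import save_file

    # Write a tiny checkpoint so the sketch runs on its own.
    save_file(
        {
            "v_head.summary.weight": torch.zeros(1, 4),
            "v_head.summary.bias": torch.zeros(1),
        },
        "value_head.safetensors",
    )

    # safe_open reads lazily; only the tensors asked for are materialized.
    with safe_open("value_head.safetensors", framework="pt", device="cpu") as f:
        vhead_params = {
            "v_head.summary.weight": f.get_tensor("v_head.summary.weight"),
            "v_head.summary.bias": f.get_tensor("v_head.summary.bias"),
        }
    print({k: tuple(v.shape) for k, v in vhead_params.items()})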