fix: pylint format codestyle
csunny committed Dec 19, 2023
2 parents f31c00e + 4d48845 commit 1156cd6
Showing 8 changed files with 205 additions and 94 deletions.
24 changes: 11 additions & 13 deletions dbgpt_hub/baseline/show_result.py
@@ -1,6 +1,7 @@
import os
import sys
import json
import pkgutil
from typing import Optional, Dict, Any
from prettytable.colortable import ColorTable, Theme

@@ -27,7 +28,7 @@
"all",
]

baseline_file = "./dbgpt_hub/baseline/baseline.json"
baseline_file = "baseline/baseline.json"
ALPACA = 'I want you to act as a SQL terminal in front of an example database, you need only to return the sql command to me.Below is an instruction that describes a task, Write a response that appropriately completes the request.\\n\\"\\n##Instruction:\\ndepartment_management contains tables such as department, head, management. Table department has columns such as Department_ID, Name, Creation, Ranking, Budget_in_Billions, Num_Employees. Department_ID is the primary key.\\nTable head has columns such as head_ID, name, born_state, age. head_ID is the primary key.\\nTable management has columns such as department_ID, head_ID, temporary_acting. department_ID is the primary key.\\nThe head_ID of management is the foreign key of head_ID of head.\\nThe department_ID of management is the foreign key of Department_ID of department.\\n\\n'
OPENAI = "openai"

@@ -73,18 +74,11 @@ def init_baseline_json():
json.dump(json_data, file, indent=4)


with open(baseline_file, "r") as file:
baseline_json = json.load(file)


def print_color_table_score(acc_data, dataset, model, method, prompt):
model_data = [dataset, model, method, prompt]
print_table_scores = ColorTable(theme=MYTHEME)
print_table_scores.field_names = HEADER
model_ex = get_model_score(acc_data, "ex", model_data)
model_em = get_model_score(acc_data, "em", model_data)
print_table_scores.add_rows([model_em, model_ex])
print(print_table_scores, "\n")
data = pkgutil.get_data("dbgpt_hub", baseline_file)
if data is not None:
baseline_json = json.loads(data.decode("utf-8"))
else:
raise FileNotFoundError("The JSON file was not found in the package.")


def table_add_row(table_scores, acc_data, dataset, model, method, prompt):
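
The pkgutil-based lookup added above replaces an open() call on a working-directory-relative path, so the baseline scores are found wherever dbgpt_hub is installed, as long as baseline/baseline.json ships inside the package. A minimal standalone sketch of the same pattern:

    import json
    import pkgutil

    # Resolve the resource relative to the installed dbgpt_hub package,
    # not relative to the current working directory.
    raw = pkgutil.get_data("dbgpt_hub", "baseline/baseline.json")
    if raw is None:
        raise FileNotFoundError("baseline/baseline.json is not bundled with dbgpt_hub")
    baseline_json = json.loads(raw.decode("utf-8"))
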
@@ -150,6 +144,8 @@ def show_score(dataset=None, model=None, method=None, prompt=None):
table_scores = ColorTable(theme=MYTHEME)
table_scores.field_names = HEADER
add_scores_to_table(table_scores, json_data, dataset, model, method, prompt)
table_scores.sortby = "all"
table_scores.reversesort = True
print(table_scores)


@@ -193,6 +189,8 @@ def show_scores():
table_scores = table_add_row(
table_scores, acc_data, dataset, model, method, prompt
)
table_scores.sortby = "all"
table_scores.reversesort = True
print(table_scores, "\n")


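The sortby / reversesort assignments added in both hunks above order the printed rows by the "all" column, highest score first; prettytable applies the sort when the table is rendered, so the attributes can be set after the rows are added. A small illustration with made-up field names and scores:

    from prettytable import PrettyTable

    table = PrettyTable()
    table.field_names = ["model", "etype", "all"]
    table.add_rows([["model-a-lora", "ex", 61.2], ["model-b-lora", "ex", 70.4]])

    table.sortby = "all"       # sort rows on the "all" column at render time
    table.reversesort = True   # descending, so the best score prints first
    print(table)
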
4 changes: 2 additions & 2 deletions dbgpt_hub/configs/data_args.py
@@ -83,11 +83,11 @@ class DataArguments:
)
cutoff_len: Optional[int] = field(
default=1024,
metadata={"help": "The maximum length of the model inputs after tokenization."}
metadata={"help": "The maximum length of the model inputs after tokenization."},
)
reserved_label_len: Optional[int] = field(
default=1,
metadata={"help": "The maximum length reserved for label after tokenization."}
metadata={"help": "The maximum length reserved for label after tokenization."},
)
split: Optional[str] = field(
default="train",
10 changes: 5 additions & 5 deletions dbgpt_hub/configs/model_args.py
@@ -95,12 +95,13 @@ class ModelArguments:
},
)
hf_hub_token: Optional[str] = field(
default=None,
metadata={"help": "Auth token to log in with Hugging Face Hub."}
default=None, metadata={"help": "Auth token to log in with Hugging Face Hub."}
)
split_special_tokens: Optional[bool] = field(
default=False,
metadata={"help": "Whether or not the special tokens should be split during the tokenization process."}
metadata={
"help": "Whether or not the special tokens should be split during the tokenization process."
},
)

def __post_init__(self):
@@ -191,8 +192,7 @@ class FinetuningArguments:
Arguments pertaining to which techniques we are going to fine-tuning with.
"""
stage: Optional[Literal["sft", "rm"]] = field(
default="sft",
metadata={"help": "Which stage will be performed in training."}
default="sft", metadata={"help": "Which stage will be performed in training."}
)
finetuning_type: Optional[Literal["lora", "freeze", "full", "none"]] = field(
default="lora", metadata={"help": "Which fine-tuning method to use."}
73 changes: 57 additions & 16 deletions dbgpt_hub/data_process/data_utils.py
@@ -4,7 +4,17 @@
import pandas as pd
import tiktoken
from itertools import chain
from typing import Any, Dict, List, Optional, Tuple, Union, TYPE_CHECKING, Generator, Literal
from typing import (
Any,
Dict,
List,
Optional,
Tuple,
Union,
TYPE_CHECKING,
Generator,
Literal,
)
from datasets import (
Dataset,
DatasetDict,
@@ -63,8 +73,13 @@ def extract_sql_prompt_dataset(example: Dict[str, Any]) -> Dict[str, str]:
prompt_format = SQL_PROMPT_DICT["prompt_no_input"]
return {"input": prompt_format.format(**example)}

def infer_max_len(source_len: int, target_len: int, data_args: "DataArguments") -> Tuple[int, int]:
max_target_len = int(data_args.cutoff_len * (target_len / (source_len + target_len)))

def infer_max_len(
source_len: int, target_len: int, data_args: "DataArguments"
) -> Tuple[int, int]:
max_target_len = int(
data_args.cutoff_len * (target_len / (source_len + target_len))
)
max_target_len = max(max_target_len, data_args.reserved_label_len)
max_source_len = data_args.cutoff_len - max_target_len
return max_source_len, max_target_len
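
As reformatted above, infer_max_len splits the cutoff_len budget between prompt and response in proportion to their raw lengths, while reserving at least reserved_label_len tokens for the response. A quick worked example using the defaults from data_args.py (cutoff_len=1024, reserved_label_len=1) and made-up lengths:

    cutoff_len, reserved_label_len = 1024, 1   # defaults from DataArguments
    source_len, target_len = 900, 300          # hypothetical tokenized lengths

    max_target_len = int(cutoff_len * (target_len / (source_len + target_len)))  # int(1024 * 0.25) == 256
    max_target_len = max(max_target_len, reserved_label_len)                     # still 256
    max_source_len = cutoff_len - max_target_len                                 # 768
    print(max_source_len, max_target_len)                                        # 768 256
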
@@ -585,7 +600,7 @@ def preprocess_dataset(
tokenizer: "PreTrainedTokenizer",
data_args: "DataArguments",
training_args: "Seq2SeqTrainingArguments",
stage: Literal["pt", "sft", "rm", "ppo"]
stage: Literal["pt", "sft", "rm", "ppo"],
) -> Union["Dataset", "IterableDataset"]:
column_names = list(next(iter(dataset)).keys())
template = get_template_and_fix_tokenizer(data_args.template, tokenizer)
@@ -677,22 +692,37 @@ def preprocess_unsupervised_dataset(

return model_inputs

def preprocess_pairwise_dataset(examples: Dict[str, List[Any]]) -> Dict[str, List[List[int]]]:
def preprocess_pairwise_dataset(
examples: Dict[str, List[Any]]
) -> Dict[str, List[List[int]]]:
# build input pairs with format `<bos> X`, `Y1 <eos>` and `Y2 <eos>` for rm stage
model_inputs = {"prompt_ids": [], "chosen_ids": [], "rejected_ids": []}
for query, response, history, system in construct_example(examples):
if not (isinstance(query, str) and isinstance(response, list) and query != "" and len(response) > 1):
if not (
isinstance(query, str)
and isinstance(response, list)
and query != ""
and len(response) > 1
):
continue

prompt_ids, chosen_ids = template.encode_oneturn(tokenizer, query, response[0], history, system)
_, rejected_ids = template.encode_oneturn(tokenizer, query, response[1], history, system)
prompt_ids, chosen_ids = template.encode_oneturn(
tokenizer, query, response[0], history, system
)
_, rejected_ids = template.encode_oneturn(
tokenizer, query, response[1], history, system
)

# if template.efficient_eos:
chosen_ids += [tokenizer.eos_token_id]
rejected_ids += [tokenizer.eos_token_id]

source_len, target_len = len(prompt_ids), max(len(chosen_ids), len(rejected_ids))
max_source_len, max_target_len = infer_max_len(source_len, target_len, data_args)
source_len, target_len = len(prompt_ids), max(
len(chosen_ids), len(rejected_ids)
)
max_source_len, max_target_len = infer_max_len(
source_len, target_len, data_args
)
if source_len > max_source_len:
prompt_ids = prompt_ids[:max_source_len]
if target_len > max_target_len:
@@ -704,14 +734,26 @@ def preprocess_pairwise_dataset(examples: Dict[str, List[Any]]) -> Dict[str, List[List[int]]]:
model_inputs["rejected_ids"].append(rejected_ids)

return model_inputs

def print_pairwise_dataset_example(example: Dict[str, List[int]]) -> None:
print("prompt_ids:\n{}".format(example["prompt_ids"]))
print("prompt:\n{}".format(tokenizer.decode(example["prompt_ids"], skip_special_tokens=False)))
print(
"prompt:\n{}".format(
tokenizer.decode(example["prompt_ids"], skip_special_tokens=False)
)
)
print("chosen_ids:\n{}".format(example["chosen_ids"]))
print("chosen:\n{}".format(tokenizer.decode(example["chosen_ids"], skip_special_tokens=False)))
print(
"chosen:\n{}".format(
tokenizer.decode(example["chosen_ids"], skip_special_tokens=False)
)
)
print("rejected_ids:\n{}".format(example["rejected_ids"]))
print("rejected:\n{}".format(tokenizer.decode(example["rejected_ids"], skip_special_tokens=False)))
print(
"rejected:\n{}".format(
tokenizer.decode(example["rejected_ids"], skip_special_tokens=False)
)
)

def print_supervised_dataset_example(example):
print("input_ids:\n{}".format(example["input_ids"]))
@@ -733,7 +775,6 @@ def print_supervised_dataset_example(example):
)
)


if stage == "pt":
pass
elif stage == "sft" and not training_args.predict_with_generate:
@@ -744,7 +785,7 @@ def print_supervised_dataset_example(example):
preprocess_function = preprocess_pairwise_dataset
print_function = print_pairwise_dataset_example
else:
pass
pass

with training_args.main_process_first(desc="dataset map pre-processing"):
kwargs = {}
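
For orientation, the rm-stage preprocess_pairwise_dataset reformatted above produces one prompt_ids/chosen_ids/rejected_ids triple per example, budgets the shared prompt against the longer of the two responses via infer_max_len, and trims whatever exceeds the budget. A toy sketch of that truncation step, with invented token ids and an assumed 8-source / 4-target budget (the diff cuts off after the target-length check, so trimming both responses identically is an assumption here):

    prompt_ids = list(range(10))          # 10 invented prompt token ids
    chosen_ids = list(range(100, 107))    # 7 tokens for the preferred response
    rejected_ids = list(range(200, 204))  # 4 tokens for the rejected response

    source_len = len(prompt_ids)
    target_len = max(len(chosen_ids), len(rejected_ids))

    max_source_len, max_target_len = 8, 4  # assume infer_max_len returned this budget

    if source_len > max_source_len:
        prompt_ids = prompt_ids[:max_source_len]
    if target_len > max_target_len:
        chosen_ids = chosen_ids[:max_target_len]      # assumption: both responses are
        rejected_ids = rejected_ids[:max_target_len]  # trimmed to the same budget

    print(len(prompt_ids), len(chosen_ids), len(rejected_ids))  # 8 4 4
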
49 changes: 31 additions & 18 deletions dbgpt_hub/llm_base/load_tokenizer.py
@@ -106,22 +106,20 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:


def load_valuehead_params(
path_or_repo_id: str,
model_args: "ModelArguments"
path_or_repo_id: str, model_args: "ModelArguments"
) -> Dict[str, torch.Tensor]:
r"""
Loads value head parameters from Hugging Face Hub or local disk.
Returns: dict with keys `v_head.summary.weight` and `v_head.summary.bias`.
"""
kwargs = {
"path_or_repo_id": path_or_repo_id,
"cache_dir": model_args.cache_dir
}
kwargs = {"path_or_repo_id": path_or_repo_id, "cache_dir": model_args.cache_dir}

if "token" in inspect.signature(cached_file).parameters:
kwargs["token"] = model_args.hf_hub_token
elif "use_auth_token" in inspect.signature(cached_file).parameters: # for transformers==4.31.0
elif (
"use_auth_token" in inspect.signature(cached_file).parameters
): # for transformers==4.31.0
kwargs["use_auth_token"] = model_args.hf_hub_token
else:
logger.warning("Ignore `hf_hub_token` since matched parameter is not found.")
@@ -134,24 +132,27 @@ def load_valuehead_params(

try:
from safetensors import safe_open

vhead_file = cached_file(filename=SAFE_WEIGHTS_NAME, **kwargs)
with safe_open(vhead_file, framework="pt", device="cpu") as f:
return {
"v_head.summary.weight": f.get_tensor("v_head.summary.weight"),
"v_head.summary.bias": f.get_tensor("v_head.summary.bias")
"v_head.summary.bias": f.get_tensor("v_head.summary.bias"),
}
except Exception as err:
logger.info("Failed to load {}: {}".format(SAFE_WEIGHTS_NAME, str(err)))

logger.warning("Provided path ({}) does not contain valuehead weights.".format(path_or_repo_id))
logger.warning(
"Provided path ({}) does not contain valuehead weights.".format(path_or_repo_id)
)
return None


def load_model_and_tokenizer(
model_args: "ModelArguments",
finetuning_args: "FinetuningArguments",
is_trainable: Optional[bool] = False,
add_valuehead: Optional[bool] = False
add_valuehead: Optional[bool] = False,
) -> Tuple[PreTrainedModel, "PreTrainedTokenizer"]:
r"""
Loads pretrained model and tokenizer.
@@ -175,7 +176,7 @@ def load_model_and_tokenizer(
model_args.model_name_or_path,
use_fast=model_args.use_fast_tokenizer,
split_special_tokens=model_args.split_special_tokens,
padding_side="right", # training with left-padded tensors in fp16 precision may cause overflow
padding_side="right", # training with left-padded tensors in fp16 precision may cause overflow
**config_kwargs
)

@@ -195,11 +196,15 @@ def load_model_and_tokenizer(
else:
setattr(config, "fp16", True)

# Fix config (for Qwen)
# Fix config (for Qwen)
if getattr(config, "model_type", None) == "qwen":
for dtype_name, dtype in [("fp16", torch.float16), ("bf16", torch.bfloat16), ("fp32", torch.float32)]:
for dtype_name, dtype in [
("fp16", torch.float16),
("bf16", torch.bfloat16),
("fp32", torch.float32),
]:
setattr(config, dtype_name, getattr(config, "torch_dtype", None) == dtype)

# Set RoPE scaling
if model_args.rope_scaling is not None:
if hasattr(config, "use_dynamic_ntk"): # for Qwen models
@@ -324,12 +329,20 @@ def load_model_and_tokenizer(

# Prepare model with valuehead for RLHF
if add_valuehead:
model: "AutoModelForCausalLMWithValueHead" = AutoModelForCausalLMWithValueHead.from_pretrained(model)
ignore_modules = [name for name, _ in model.named_parameters() if "pretrained_model" in name]
model: "AutoModelForCausalLMWithValueHead" = (
AutoModelForCausalLMWithValueHead.from_pretrained(model)
)
ignore_modules = [
name for name, _ in model.named_parameters() if "pretrained_model" in name
]
setattr(model, "_keys_to_ignore_on_save", ignore_modules)
setattr(model, "tie_weights", MethodType(lambda _: None, model)) # use empty method
setattr(
model, "tie_weights", MethodType(lambda _: None, model)
) # use empty method
vhead_path = (
model_args.checkpoint_dir[-1] if model_args.checkpoint_dir is not None else model_args.model_name_or_path
model_args.checkpoint_dir[-1]
if model_args.checkpoint_dir is not None
else model_args.model_name_or_path
)
vhead_params = load_valuehead_params(vhead_path, model_args)
if vhead_params is not None:
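
The value-head loader in this file reads exactly two tensors out of a safetensors checkpoint rather than deserializing the whole file. A self-contained sketch of that access pattern (the value_head.safetensors file name and the zero tensors are only for illustration):

    import torch
    from safetensors import safe_open
    from safetensors.torch import save_file

    # Write a tiny checkpoint so the sketch runs on its own.
    save_file(
        {
            "v_head.summary.weight": torch.zeros(1, 4),
            "v_head.summary.bias": torch.zeros(1),
        },
        "value_head.safetensors",
    )

    # safe_open reads lazily; only the tensors asked for are materialized.
    with safe_open("value_head.safetensors", framework="pt", device="cpu") as f:
        vhead_params = {
            "v_head.summary.weight": f.get_tensor("v_head.summary.weight"),
            "v_head.summary.bias": f.get_tensor("v_head.summary.bias"),
        }
    print({k: tuple(v.shape) for k, v in vhead_params.items()})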