Catch none-valued rope scaling configs
j-frei committed Nov 2, 2023
1 parent 39866af · commit ad9e683
Showing 3 changed files with 15 additions and 8 deletions.
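The change addresses a getattr pitfall: the default value only applies when the attribute is missing entirely, not when it is present but set to None, which is how Hugging Face Llama configs typically represent "no RoPE scaling". A minimal sketch of the failure mode and the fix, using a stand-in namespace object rather than the real transformers config class:

# Sketch of the pitfall this commit fixes (stand-in object, not transformers.LlamaConfig).
from types import SimpleNamespace

config = SimpleNamespace(rope_scaling=None)  # attribute exists, but its value is None

old_style = getattr(config, "rope_scaling", {"factor": 1})
print(old_style)  # None -> the later `"factor" in old_style.keys()` check would raise AttributeError

new_style = getattr(config, "rope_scaling", None)
if new_style is None:        # catches both a missing attribute and an explicit None
    new_style = {"factor": 1}
print(new_style["factor"])   # 1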
fine-tune.py — 9 changes: 6 additions & 3 deletions
@@ -107,7 +107,7 @@ def train():

    # NOTE: May expand supported model types in the future
    if model_args.model_type == "gpt-neox":
-       replace_gpt_neox_attn(training_args.use_flash_attn, training_args.use_full_attn)
+       replace_gpt_neox_attn(training_args.use_flash_attn, training_args.use_full_attn)
    else:
        assert model_args.model_type == "llama", "Only support llama and gpt-neox for now"
        replace_llama_attn(training_args.use_flash_attn, training_args.use_full_attn)
@@ -118,7 +118,10 @@ def train():
        cache_dir=training_args.cache_dir,
    )

-   orig_rope_scaling = getattr(config, "rope_scaling", {"factor": 1})
+   orig_rope_scaling = getattr(config, "rope_scaling", None)
+   if orig_rope_scaling is None:
+       orig_rope_scaling = {"factor": 1}
+
    orig_rope_scaling_factor = orig_rope_scaling["factor"] if "factor" in orig_rope_scaling.keys() else 1
    orig_ctx_len = getattr(config, "max_position_embeddings", None)
    if orig_ctx_len:
@@ -195,7 +198,7 @@ def train():
    model.enable_input_require_grads()     # required for gradient checkpointing
    model.gradient_checkpointing_enable()  # enable gradient checkpointing
    trainer = Trainer(
-       model=model, tokenizer=tokenizer, args=training_args,
+       model=model, tokenizer=tokenizer, args=training_args,
        train_dataset=dataset["train"],
        eval_dataset=None,
        data_collator=data_collator)
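For orientation, here is a hedged sketch of how a factor recovered this way typically feeds into linear RoPE scaling when the training length exceeds the pretrained context window. The hunk above is truncated at the line "if orig_ctx_len:", so the model_max_length argument, the ceil-based rounding, and the config.rope_scaling assignment below are assumptions about the surrounding code, not the repository's verbatim continuation:

import math

def maybe_extend_rope(config, model_max_length):
    # Illustrative only; mirrors the None-safe parsing from the diff, then
    # applies an assumed linear-scaling rule when more context is needed.
    orig_rope_scaling = getattr(config, "rope_scaling", None)
    if orig_rope_scaling is None:
        orig_rope_scaling = {"factor": 1}
    orig_rope_scaling_factor = orig_rope_scaling.get("factor", 1)

    orig_ctx_len = getattr(config, "max_position_embeddings", None)
    if orig_ctx_len:
        # Account for any scaling already recorded in the checkpoint's config.
        orig_ctx_len *= orig_rope_scaling_factor
        if model_max_length > orig_ctx_len:
            scaling_factor = float(math.ceil(model_max_length / orig_ctx_len))
            config.rope_scaling = {"type": "linear", "factor": scaling_factor}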
supervised-fine-tune-qlora.py — 6 changes: 4 additions & 2 deletions
@@ -237,7 +237,7 @@ def train():

    # NOTE: May expand supported model types in the future
    if model_args.model_type == "gpt-neox":
-       replace_gpt_neox_attn(training_args.use_flash_attn, training_args.use_full_attn)
+       replace_gpt_neox_attn(training_args.use_flash_attn, training_args.use_full_attn)
    else:
        replace_llama_attn(training_args.use_flash_attn, training_args.use_full_attn)

@@ -247,7 +247,9 @@ def train():
        cache_dir=training_args.cache_dir,
    )

-   orig_rope_scaling = getattr(config, "rope_scaling", {"factor": 1})
+   orig_rope_scaling = getattr(config, "rope_scaling", None)
+   if orig_rope_scaling is None:
+       orig_rope_scaling = {"factor": 1}
    orig_rope_scaling_factor = orig_rope_scaling["factor"] if "factor" in orig_rope_scaling.keys() else 1
    orig_ctx_len = getattr(config, "max_position_embeddings", None)
    if orig_ctx_len:
supervised-fine-tune.py — 8 changes: 5 additions & 3 deletions
@@ -188,7 +188,7 @@ def __init__(self, data_path: str, tokenizer: transformers.PreTrainedTokenizer):
            prompt_input.format_map(example) if example.get("input", "") != "" else prompt_no_input.format_map(example)
            for example in list_data_dict
        ]

        targets = [f"{example['output']}{tokenizer.eos_token}" for example in list_data_dict]

        logging.warning("Tokenizing inputs... This may take some time...")
@@ -236,7 +236,7 @@ def train():

    # NOTE: May expand supported model types in the future
    if model_args.model_type == "gpt-neox":
-       replace_gpt_neox_attn(training_args.use_flash_attn, training_args.use_full_attn)
+       replace_gpt_neox_attn(training_args.use_flash_attn, training_args.use_full_attn)
    else:
        replace_llama_attn(training_args.use_flash_attn, training_args.use_full_attn)

@@ -246,7 +246,9 @@ def train():
        cache_dir=training_args.cache_dir,
    )

-   orig_rope_scaling = getattr(config, "rope_scaling", {"factor": 1})
+   orig_rope_scaling = getattr(config, "rope_scaling", None)
+   if orig_rope_scaling is None:
+       orig_rope_scaling = {"factor": 1}
    orig_rope_scaling_factor = orig_rope_scaling["factor"] if "factor" in orig_rope_scaling.keys() else 1
    orig_ctx_len = getattr(config, "max_position_embeddings", None)
    if orig_ctx_len:
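The first hunk of supervised-fine-tune.py also shows, as unchanged context, how supervised targets are assembled: each target is the example's output text with the tokenizer's EOS token appended. A tiny illustration, assuming a Llama-style "</s>" EOS token (the real value comes from tokenizer.eos_token in the script):

# Illustration of the target construction visible in the context lines above.
list_data_dict = [
    {"instruction": "Add the numbers.", "input": "2 and 3", "output": "5"},
    {"instruction": "Name a color.", "input": "", "output": "Blue"},
]
eos_token = "</s>"  # assumption: Llama-style EOS token

targets = [f"{example['output']}{eos_token}" for example in list_data_dict]
print(targets)  # ['5</s>', 'Blue</s>']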
