evaluation final

PKU-Alignment · Dec 1, 2024 · f32da8f
2 parents 29cdd9a + cf09339
commit f32da8f
Show file tree

Hide file tree

Showing 43 changed files with 584 additions and 353 deletions.
diff --git a/__init__.py b/__init__.py
@@ -1,3 +1,6 @@
+import os, sys
+sys.path = [os.path.dirname(os.path.abspath(__file__))] + sys.path
+
 from benchmark.framework import JudgeBase, ExamineeBase
 from benchmark.dummies import DummyJudge
 from challenges.follow import FollowJudge

diff --git a/algorithms/extrapolative_dpo.py b/algorithms/extrapolative_dpo.py
@@ -6,10 +6,9 @@
 import pandas as pd
 import json
 import datasets
-from src.text_writer import write_log
+from src.utils.text_utils import write_log
 from benchmark import JudgeBase, ExamineeBase, PredictJudge
-from algorithms.utils.rw_utils import elicit_rw_preference, default_rw_data
-from algorithms.utils.extrapolation_utils import extrapolate
+from src.utils.data_utils import elicit_rw_preference, default_rw_data, extrapolate
 import warnings
 from tqdm import tqdm
 import numpy as np

diff --git a/algorithms/extrapolative_rlhf.py b/algorithms/extrapolative_rlhf.py
@@ -6,14 +6,14 @@
 import pandas as pd
 import json
 import datasets
-from src.text_writer import write_log
+from src.utils.text_utils import write_log
 from benchmark import JudgeBase, ExamineeBase, PredictJudge
-from algorithms.utils.rw_utils import (
+from src.utils.data_utils import (
     elicit_rw_preference,
     default_rw_data,
     default_ppo_data,
+    extrapolate,
 )
-from algorithms.utils.extrapolation_utils import extrapolate
 import warnings
 from tqdm import tqdm
 from sympy import binomial

diff --git a/algorithms/lifelong_dpo.py b/algorithms/lifelong_dpo.py
@@ -6,9 +6,9 @@
 import pandas as pd
 import json
 import datasets
-from src.text_writer import write_log
+from src.utils.text_utils import write_log
 from benchmark import JudgeBase, ExamineeBase, PredictJudge
-from algorithms.utils.rw_utils import elicit_rw_preference, default_rw_data
+from src.utils.data_utils import elicit_rw_preference, default_rw_data
 import warnings
 from tqdm import tqdm
 

diff --git a/algorithms/lifelong_rlhf.py b/algorithms/lifelong_rlhf.py
@@ -6,9 +6,9 @@
 import pandas as pd
 import json
 import datasets
-from src.text_writer import write_log
+from src.utils.text_utils import write_log
 from benchmark import JudgeBase, ExamineeBase, PredictJudge
-from algorithms.utils.rw_utils import (
+from src.utils.data_utils import (
     elicit_rw_preference,
     default_rw_data,
     default_ppo_data,

diff --git a/build_dataset.py b/build_dataset.py
@@ -1,4 +1,5 @@
-import src.text_writer as tw
+from src.path import root
+import src.utils.text_utils as tu
 import src.cleanser.rule_based_cleanser as rb
 import src.cleanser.localllm_cleanser as llm_cleanser
 import src.model_training.train_hislm as hislm
@@ -8,7 +9,6 @@
 import os
 import time
 
-
 import src.eebo.download_eebo as eebo_dl
 import src.eebo.process_eebo as eebo_pc
 
@@ -26,10 +26,10 @@ def build_EEBO():
 
 def build_gutenberg():
     print("======= START BUILDING GUTENBERG DATASET =======")
-    dir = "./dataset/raw_downloads/Gutenberg/"
+    dir = f"{root}/dataset/raw_downloads/Gutenberg/"
     gtb_gd.get_data_gutenberg(dir)
     gtb_gm.gather_meta(
-        os.path.join(dir, "data/raw"), "./dataset/raw_downloads/Gutenberg_records.txt"
+        os.path.join(dir, "data/raw"), f"{root}/dataset/raw_downloads/Gutenberg_records.txt"
     )
     print("======= FINISHED BUILDING GUTENBERG DATASET =======\n\n\n")
 
@@ -53,7 +53,7 @@ def build_pile_of_law():
 
 
 if __name__ == "__main__":
-    tw.write_log(f"\n\n\n\n\n\n=========== NEW RUN ============\n\n")
+    tu.write_log(f"\n\n\n\n\n\n=========== NEW RUN ============\n\n")
     print(
         "This script is NOT meant to be run as part of the benchmarking process. Unless you would like to replicate the dataset building & model training process, you could directly run `run_benchmark.py` instead, which will automatically download the pre-built dataset and/or models on demand."
     )
@@ -82,7 +82,7 @@ def build_pile_of_law():
                 max_hours=10
             )  # takes ~100h, but if max_hours is supplied then stops after this many hours (won't affect data integrity)
             # finishing up
-            tw.seal_all_files()
+            tu.seal_all_files()
             print("Finished building entire dataset. Proceed to data cleansing.")
 
         if (
@@ -91,8 +91,8 @@ def build_pile_of_law():
         ):
             proceed = True
             rb.cleanse(
-                "./dataset/dataset_text_sequence/",
-                "./dataset/dataset_text_sequence_rulebased_cleansed/",
+                f"{root}/dataset/dataset_text_sequence/",
+                f"{root}/dataset/dataset_text_sequence_rulebased_cleansed/",
             )
             print("Finished rule-based data cleansing. Now exiting.")
 
@@ -102,25 +102,25 @@ def build_pile_of_law():
         ):
             proceed = True
             llm_cleanser.run_cleanser(
-                in_path="./dataset/dataset_text_sequence_rulebased_cleansed/",
-                out_path="./dataset/dataset_text_sequence_llm_cleansed/",
+                in_path=f"{root}/dataset/dataset_text_sequence_rulebased_cleansed/",
+                out_path=f"{root}/dataset/dataset_text_sequence_llm_cleansed/",
             )
 
             # Make llm-cleansed version the official version ("dataset_text_sequence"), and move the other two versions into dataset/raw_downloads
             path = (
-                f"./dataset/raw_downloads/dataset_text_sequence_versions/{timestamp}/"
+                f"{root}/dataset/raw_downloads/dataset_text_sequence_versions/{timestamp}/"
             )
             os.makedirs(path)
 
             print(f"Moving pre-cleansing version to backup folder...")
             os.rename(
-                "./dataset/dataset_text_sequence/",
+                f"{root}/dataset/dataset_text_sequence/",
                 os.path.join(path, "dataset_text_sequence_original/"),
             )
 
             print(f"Moving rule-cleansed version to backup folder...")
             os.rename(
-                "./dataset/dataset_text_sequence_rulebased_cleansed/",
+                f"{root}/dataset/dataset_text_sequence_rulebased_cleansed/",
                 os.path.join(path, "dataset_text_sequence_rulebased_cleansed/"),
             )
 
@@ -131,7 +131,7 @@ def build_pile_of_law():
 
             print(f"Copying LLM-cleansed version to backup folder...")
             os.rename(
-                "./dataset/dataset_text_sequence_llm_cleansed/",
+                f"{root}/dataset/dataset_text_sequence_llm_cleansed/",
                 os.path.join(path, "dataset_text_sequence_llm_cleansed/"),
             )
 
@@ -148,19 +148,19 @@ def build_pile_of_law():
             proceed = True
 
             print(f"Removing overly small or messy subdatasets...")
-            path = f"./dataset/raw_downloads/dataset_text_sequence_versions/{timestamp}/removed/"
+            path = f"{root}/dataset/raw_downloads/dataset_text_sequence_versions/{timestamp}/removed/"
             os.makedirs(path)
 
             sub_datasets = [
                 f
-                for f in os.listdir("./dataset/dataset_text_sequence/")
-                if os.path.isdir(os.path.join("./dataset/dataset_text_sequence/", f))
+                for f in os.listdir(f"{root}/dataset/dataset_text_sequence/")
+                if os.path.isdir(os.path.join(f"{root}/dataset/dataset_text_sequence/", f))
             ]
             for sub in sub_datasets:
                 # Remove if size < 10MB AND century number < 13
                 if (
                     hislm.get_directory_size_bytes(
-                        os.path.join("./dataset/dataset_text_sequence/", sub)
+                        os.path.join(f"{root}/dataset/dataset_text_sequence/", sub)
                     )
                     < 10 * 1024 * 1024
                     and int(sub.strip("C")) < 13
@@ -169,7 +169,7 @@ def build_pile_of_law():
                     os.system(f"mv ./dataset/dataset_text_sequence/{sub} {path}")
 
             hislm.run_training(
-                "./dataset/dataset_text_sequence/", "./dataset/dataset_model_sequence/"
+                f"{root}/dataset/dataset_text_sequence/", f"{root}/dataset/dataset_model_sequence/"
             )
             print("Finished model training. Exiting.")
 

diff --git a/challenges/coevolve.py b/challenges/coevolve.py
@@ -1,11 +1,12 @@
+from src.path import root
+from src.utils.data_utils import elicit_rw_preference, default_rw_data
 from benchmark.framework import JudgeBase, ExamineeBase
 from typing import Iterable, Tuple, Dict, Union, List, Any
 from src.abstractions import Model, Data
 import numpy as np
 import scipy.spatial as sp
 import datasets
-import json, os
-from algorithms.utils.rw_utils import elicit_rw_preference, default_rw_data
+import json, os, sys
 
 
 class CoevolveJudge(JudgeBase):
@@ -27,10 +28,10 @@ def reset(self, **kwargs) -> None:
         assert self.simulated_model.model_name == self.model_list[0].model_name
 
         if os.path.exists(
-            f"./output/benchmark_results/initial_supplementary_data.json"
+            f"{root}/output/benchmark_results/initial_supplementary_data.json"
         ):
             with open(
-                f"./output/benchmark_results/initial_supplementary_data.json", "r"
+                f"{root}/output/benchmark_results/initial_supplementary_data.json", "r"
             ) as f:
                 self.supplementary_data = json.load(f)
         else:
@@ -47,7 +48,7 @@ def reset(self, **kwargs) -> None:
 
             # Backup supplementary data
             with open(
-                f"./output/benchmark_results/initial_supplementary_data.json", "w"
+                f"{root}/output/benchmark_results/initial_supplementary_data.json", "w"
             ) as f:
                 json.dump(self.supplementary_data, f)
 

diff --git a/doc_generation/source/conf.py b/doc_generation/source/conf.py
@@ -5,10 +5,11 @@
 
 # -- Project information -----------------------------------------------------
 # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
+# Put the repository root on sys.path. It is derived from this file's own location so that it depends neither on the current working directory nor on `src` already being importable.
 import os
 import sys
 
-sys.path.insert(0, os.path.abspath("../.."))
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
 
 project = "ProgressGym"
 copyright = "2024 PKU Alignment, Tianyi Qiu, Yang Zhang, Xuchuan Huang, Xinze Li"

diff --git a/examples/abstractions/finetuning_datamanip.py b/examples/abstractions/finetuning_datamanip.py
@@ -1,21 +1,30 @@
+from src.path import root
 from src.abstractions import Model, Data, DataFileCollection
 
-if __name__ == "__main__":
+gemma2b_base = Model(
+    model_name="gemma-2b",
+    model_path_or_repoid="google/gemma-2-2b",  # or specify a local path if you have downloaded the model
+    is_instruct_finetuned=False,
+)
 
-    gemma2b_base = Model(
-        model_name="gemma-2b",
-        model_path="google/gemma-2-2b",  # or specify a local path if you have downloaded the model
-        is_instruct_finetuned=False,
-    )
+llama8b_instruct = Model(
+    model_name="Llama-3.1-8B-Instruct",
+    model_path_or_repoid="meta-llama/Llama-3.1-8B-Instruct",
+    is_instruct_finetuned=True,
+)
 
+def continue_pretrain():
     # ============== Continue pretraining from Gemma 2B ==============
+    global gemma2b_c4
     c4_data = Data("c4_demo", data_type="pretrain")
     gemma2b_c4 = gemma2b_base.finetune(
         c4_data, stage="pretrain", algo="full_param", result_model_name="gemma-2b_c4"
     )
     print(gemma2b_c4.is_instruct_finetuned)  # False
 
+def supervised_finetune():
     # ============== Then do SFT using alpaca data ==============
+    global gemma2b_c4_alpaca
     alpaca_data = Data("alpaca_gpt4_en", data_type="sft")
     gemma2b_c4_alpaca = gemma2b_c4.finetune(
         alpaca_data,
@@ -25,17 +34,7 @@
     )
     print(gemma2b_c4_alpaca.is_instruct_finetuned)  # True
     gemma2b_c4_alpaca.save_permanent()  # saved to output/saved/saved_model/gemma-2b_c4_alpaca
-
-    # ============== Then do DPO using ORCA data ==============
-    hh_data = Data("orca_rlhf", data_type="preference")
-    gemma2b_c4_alpaca_orca = gemma2b_c4_alpaca.finetune(
-        hh_data,
-        stage="dpo",
-        algo="full_param",
-        result_model_name="gemma-2b_c4_alpaca_orca",
-    )
-    gemma2b_c4_alpaca_orca.save_permanent()  # saved to output/saved/saved_model/gemma-2b_c4_alpaca_orca
-
+
     # ============== Or maybe, we should censor curse words before SFT ==============
     def remove_curse_words(sample_dict: dict) -> dict:
         filter = lambda s: (
@@ -56,12 +55,12 @@ def remove_curse_words(sample_dict: dict) -> dict:
     )
     gemma2b_c4_alpaca_G.save_permanent()  # saved to output/saved/saved_model/gemma-2b_c4_alpaca_G
     alpaca_data_G.save_permanent_and_register()  # saved to output/saved/saved_model/alpaca_gpt4_en_G.json & added to llama-factory dataset registry
-
+    
     # ============== What about using our own data (scattered across multiple files in multiple directories) for finetuning? ==============
     histext_collection = DataFileCollection(  # build a collection holding json files of year 1826 to 2018
         collection_name="histext_1826_to_2018_collection",
         data_type="pretrain",
-        collection_path="./dataset/dataset_text_sequence/",
+        collection_path=f"{root}/dataset/dataset_text_sequence/",
         file_selection_func=(
             lambda path: "Y" in path and 1826 <= int(path.split("/")[-1][1:6]) <= 2018
         ),  # if this argument is omitted, all json files will be selected
@@ -93,3 +92,46 @@ def remove_nonstr_data(sample_dict: dict) -> dict:
         algo="full_param",
         result_model_name="gemma-2b_histext",
     )
+
+def direct_preference_optimization():
+    # ============== Then do DPO using ORCA data (reads the module-level gemma2b_c4_alpaca, which is bound by supervised_finetune) ==============
+    global gemma2b_c4_alpaca_orca  # the result is bound at module level rather than returned
+    hh_data = Data("orca_rlhf", data_type="preference")
+    gemma2b_c4_alpaca_orca = gemma2b_c4_alpaca.finetune(
+        hh_data,
+        stage="dpo",
+        algo="full_param",
+        result_model_name="gemma-2b_c4_alpaca_orca",
+    )
+    gemma2b_c4_alpaca_orca.save_permanent()  # saved to output/saved/saved_model/gemma-2b_c4_alpaca_orca
+
+def dialogue_manipulation():
+    # ============== Generating a dialogue, using a model to play the role of both user and assistant ==============
+    # llama8b_instruct is only read below (never rebound), so no `global` declaration is needed.
+    dialogue_data = Data(
+        "dialogue_data",
+        data_content=[
+            {
+                "input": "Is Eiffel Tower in Paris?",
+                "history": [
+                    ["What is the capital of France?", "Paris."],
+                ]
+            }
+        ]
+    )
+    dialogue_data = llama8b_instruct.inference(
+        dialogue_data, "dialogue_data", backend="sglang"
+    )
+    dialogue_data = dialogue_data.switch_role_to_user()  # presumably flips roles so the next inference writes the user turn; confirm against the Data API
+    dialogue_data = llama8b_instruct.inference(
+        dialogue_data, "dialogue_data", backend="sglang"
+    )
+    dialogue_data = dialogue_data.switch_role_to_assistant()  # presumably restores the original role assignment; confirm against the Data API
+    print(list(dialogue_data.all_passages()))
+
+
+if __name__ == "__main__":
+    # continue_pretrain()
+    # supervised_finetune()
+    # direct_preference_optimization()
+    dialogue_manipulation()
diff --git a/examples/abstractions/inference_evaluation.py b/examples/abstractions/inference_evaluation.py
@@ -55,7 +55,7 @@ def logprob_example(histllama: Model):
     # Custom models (local or on hub) can be similarly loaded, e.g.:
     # model = Model(
     #     "mixtral-8x7b-instruct-v0.1",
-    #     model_path="mistralai/Mixtral-8x7B-Instruct-v0.1",
+    #     model_path_or_repoid="mistralai/Mixtral-8x7B-Instruct-v0.1",
     #     template_type="mistral",
     # )
 

diff --git a/run_benchmark.py b/run_benchmark.py
@@ -28,6 +28,7 @@
 Note that all names are case-sensitive. Dummies are for debugging purposes only.
 """
 
+from src.path import root
 import pdb
 import traceback
 import argparse
@@ -97,7 +98,7 @@ def run_benchmark(
         parser.add_argument(
             "--output_dir",
             type=str,
-            default="./output/benchmark_results",
+            default=f"{root}/output/benchmark_results",
             required=False,
         )
         args, unknownargs = parser.parse_known_args()