Skip to content

Commit

Permalink
evaluation final
Browse files Browse the repository at this point in the history
  • Loading branch information
pacman000 committed Dec 1, 2024
2 parents 29cdd9a + cf09339 commit f32da8f
Show file tree
Hide file tree
Showing 43 changed files with 584 additions and 353 deletions.
3 changes: 3 additions & 0 deletions __init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
import os, sys
sys.path = [os.path.dirname(os.path.abspath(__file__))] + sys.path

from benchmark.framework import JudgeBase, ExamineeBase
from benchmark.dummies import DummyJudge
from challenges.follow import FollowJudge
Expand Down
5 changes: 2 additions & 3 deletions algorithms/extrapolative_dpo.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,9 @@
import pandas as pd
import json
import datasets
from src.text_writer import write_log
from src.utils.text_utils import write_log
from benchmark import JudgeBase, ExamineeBase, PredictJudge
from algorithms.utils.rw_utils import elicit_rw_preference, default_rw_data
from algorithms.utils.extrapolation_utils import extrapolate
from src.utils.data_utils import elicit_rw_preference, default_rw_data, extrapolate
import warnings
from tqdm import tqdm
import numpy as np
Expand Down
6 changes: 3 additions & 3 deletions algorithms/extrapolative_rlhf.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,14 @@
import pandas as pd
import json
import datasets
from src.text_writer import write_log
from src.utils.text_utils import write_log
from benchmark import JudgeBase, ExamineeBase, PredictJudge
from algorithms.utils.rw_utils import (
from src.utils.data_utils import (
elicit_rw_preference,
default_rw_data,
default_ppo_data,
extrapolate,
)
from algorithms.utils.extrapolation_utils import extrapolate
import warnings
from tqdm import tqdm
from sympy import binomial
Expand Down
4 changes: 2 additions & 2 deletions algorithms/lifelong_dpo.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,9 @@
import pandas as pd
import json
import datasets
from src.text_writer import write_log
from src.utils.text_utils import write_log
from benchmark import JudgeBase, ExamineeBase, PredictJudge
from algorithms.utils.rw_utils import elicit_rw_preference, default_rw_data
from src.utils.data_utils import elicit_rw_preference, default_rw_data
import warnings
from tqdm import tqdm

Expand Down
4 changes: 2 additions & 2 deletions algorithms/lifelong_rlhf.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,9 @@
import pandas as pd
import json
import datasets
from src.text_writer import write_log
from src.utils.text_utils import write_log
from benchmark import JudgeBase, ExamineeBase, PredictJudge
from algorithms.utils.rw_utils import (
from src.utils.data_utils import (
elicit_rw_preference,
default_rw_data,
default_ppo_data,
Expand Down
38 changes: 19 additions & 19 deletions build_dataset.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import src.text_writer as tw
from src.path import root
import src.utils.text_utils as tu
import src.cleanser.rule_based_cleanser as rb
import src.cleanser.localllm_cleanser as llm_cleanser
import src.model_training.train_hislm as hislm
Expand All @@ -8,7 +9,6 @@
import os
import time


import src.eebo.download_eebo as eebo_dl
import src.eebo.process_eebo as eebo_pc

Expand All @@ -26,10 +26,10 @@ def build_EEBO():

def build_gutenberg():
print("======= START BUILDING GUTENBERG DATASET =======")
dir = "./dataset/raw_downloads/Gutenberg/"
dir = f"{root}/dataset/raw_downloads/Gutenberg/"
gtb_gd.get_data_gutenberg(dir)
gtb_gm.gather_meta(
os.path.join(dir, "data/raw"), "./dataset/raw_downloads/Gutenberg_records.txt"
os.path.join(dir, "data/raw"), f"{root}/dataset/raw_downloads/Gutenberg_records.txt"
)
print("======= FINISHED BUILDING GUTENBERG DATASET =======\n\n\n")

Expand All @@ -53,7 +53,7 @@ def build_pile_of_law():


if __name__ == "__main__":
tw.write_log(f"\n\n\n\n\n\n=========== NEW RUN ============\n\n")
tu.write_log(f"\n\n\n\n\n\n=========== NEW RUN ============\n\n")
print(
"This script is NOT meant to be run as part of the benchmarking process. Unless you would like to replicate the dataset building & model training process, you could directly run `run_benchmark.py` instead, which will automatically download the pre-built dataset and/or models on demand."
)
Expand Down Expand Up @@ -82,7 +82,7 @@ def build_pile_of_law():
max_hours=10
) # takes ~100h, but if max_hours is supplied then stops after this many hours (won't affect data integrity)
# finishing up
tw.seal_all_files()
tu.seal_all_files()
print("Finished building entire dataset. Proceed to data cleansing.")

if (
Expand All @@ -91,8 +91,8 @@ def build_pile_of_law():
):
proceed = True
rb.cleanse(
"./dataset/dataset_text_sequence/",
"./dataset/dataset_text_sequence_rulebased_cleansed/",
f"{root}/dataset/dataset_text_sequence/",
f"{root}/dataset/dataset_text_sequence_rulebased_cleansed/",
)
print("Finished rule-based data cleansing. Now exiting.")

Expand All @@ -102,25 +102,25 @@ def build_pile_of_law():
):
proceed = True
llm_cleanser.run_cleanser(
in_path="./dataset/dataset_text_sequence_rulebased_cleansed/",
out_path="./dataset/dataset_text_sequence_llm_cleansed/",
in_path=f"{root}/dataset/dataset_text_sequence_rulebased_cleansed/",
out_path=f"{root}/dataset/dataset_text_sequence_llm_cleansed/",
)

# Make llm-cleansed version the official version ("dataset_text_sequence"), and move the other two versions into dataset/raw_downloads
path = (
f"./dataset/raw_downloads/dataset_text_sequence_versions/{timestamp}/"
f"{root}/dataset/raw_downloads/dataset_text_sequence_versions/{timestamp}/"
)
os.makedirs(path)

print(f"Moving pre-cleansing version to backup folder...")
os.rename(
"./dataset/dataset_text_sequence/",
f"{root}/dataset/dataset_text_sequence/",
os.path.join(path, "dataset_text_sequence_original/"),
)

print(f"Moving rule-cleansed version to backup folder...")
os.rename(
"./dataset/dataset_text_sequence_rulebased_cleansed/",
f"{root}/dataset/dataset_text_sequence_rulebased_cleansed/",
os.path.join(path, "dataset_text_sequence_rulebased_cleansed/"),
)

Expand All @@ -131,7 +131,7 @@ def build_pile_of_law():

print(f"Copying LLM-cleansed version to backup folder...")
os.rename(
"./dataset/dataset_text_sequence_llm_cleansed/",
f"{root}/dataset/dataset_text_sequence_llm_cleansed/",
os.path.join(path, "dataset_text_sequence_llm_cleansed/"),
)

Expand All @@ -148,19 +148,19 @@ def build_pile_of_law():
proceed = True

print(f"Removing overly small or messy subdatasets...")
path = f"./dataset/raw_downloads/dataset_text_sequence_versions/{timestamp}/removed/"
path = f"{root}/dataset/raw_downloads/dataset_text_sequence_versions/{timestamp}/removed/"
os.makedirs(path)

sub_datasets = [
f
for f in os.listdir("./dataset/dataset_text_sequence/")
if os.path.isdir(os.path.join("./dataset/dataset_text_sequence/", f))
for f in os.listdir(f"{root}/dataset/dataset_text_sequence/")
if os.path.isdir(os.path.join(f"{root}/dataset/dataset_text_sequence/", f))
]
for sub in sub_datasets:
# Remove if size < 10MB AND century number < 13
if (
hislm.get_directory_size_bytes(
os.path.join("./dataset/dataset_text_sequence/", sub)
os.path.join(f"{root}/dataset/dataset_text_sequence/", sub)
)
< 10 * 1024 * 1024
and int(sub.strip("C")) < 13
Expand All @@ -169,7 +169,7 @@ def build_pile_of_law():
os.system(f"mv ./dataset/dataset_text_sequence/{sub} {path}")

hislm.run_training(
"./dataset/dataset_text_sequence/", "./dataset/dataset_model_sequence/"
f"{root}/dataset/dataset_text_sequence/", f"{root}/dataset/dataset_model_sequence/"
)
print("Finished model training. Exiting.")

Expand Down
11 changes: 6 additions & 5 deletions challenges/coevolve.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
from src.path import root
from src.utils.data_utils import elicit_rw_preference, default_rw_data
from benchmark.framework import JudgeBase, ExamineeBase
from typing import Iterable, Tuple, Dict, Union, List, Any
from src.abstractions import Model, Data
import numpy as np
import scipy.spatial as sp
import datasets
import json, os
from algorithms.utils.rw_utils import elicit_rw_preference, default_rw_data
import json, os, sys


class CoevolveJudge(JudgeBase):
Expand All @@ -27,10 +28,10 @@ def reset(self, **kwargs) -> None:
assert self.simulated_model.model_name == self.model_list[0].model_name

if os.path.exists(
f"./output/benchmark_results/initial_supplementary_data.json"
f"{root}/output/benchmark_results/initial_supplementary_data.json"
):
with open(
f"./output/benchmark_results/initial_supplementary_data.json", "r"
f"{root}/output/benchmark_results/initial_supplementary_data.json", "r"
) as f:
self.supplementary_data = json.load(f)
else:
Expand All @@ -47,7 +48,7 @@ def reset(self, **kwargs) -> None:

# Backup supplementary data
with open(
f"./output/benchmark_results/initial_supplementary_data.json", "w"
f"{root}/output/benchmark_results/initial_supplementary_data.json", "w"
) as f:
json.dump(self.supplementary_data, f)

Expand Down
3 changes: 2 additions & 1 deletion doc_generation/source/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,11 @@

# -- Project information -----------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
from src.path import root
import os
import sys

sys.path.insert(0, os.path.abspath("../.."))
sys.path.insert(0, os.path.abspath(root))

project = "ProgressGym"
copyright = "2024 PKU Alignment, Tianyi Qiu, Yang Zhang, Xuchuan Huang, Xinze Li"
Expand Down
80 changes: 61 additions & 19 deletions examples/abstractions/finetuning_datamanip.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,30 @@
from src.path import root
from src.abstractions import Model, Data, DataFileCollection

if __name__ == "__main__":
gemma2b_base = Model(
model_name="gemma-2b",
model_path_or_repoid="google/gemma-2-2b", # or specify a local path if you have downloaded the model
is_instruct_finetuned=False,
)

gemma2b_base = Model(
model_name="gemma-2b",
model_path="google/gemma-2-2b", # or specify a local path if you have downloaded the model
is_instruct_finetuned=False,
)
llama8b_instruct = Model(
model_name="Llama-3.1-8B-Instruct",
model_path_or_repoid="meta-llama/Llama-3.1-8B-Instruct",
is_instruct_finetuned=True,
)

def continue_pretrain():
    """Continue pretraining the Gemma 2B base model on the ``c4_demo`` data.

    The finetuned model is published as the module-level global
    ``gemma2b_c4`` so that later example steps can pick it up.
    """
    global gemma2b_c4

    # ============== Continue pretraining from Gemma 2B ==============
    pretrain_corpus = Data("c4_demo", data_type="pretrain")
    finetune_options = dict(
        stage="pretrain",
        algo="full_param",
        result_model_name="gemma-2b_c4",
    )
    gemma2b_c4 = gemma2b_base.finetune(pretrain_corpus, **finetune_options)

    # Expected to print False according to the original example.
    print(gemma2b_c4.is_instruct_finetuned)

def supervised_finetune():
# ============== Then do SFT using alpaca data ==============
global gemma2b_c4_alpaca
alpaca_data = Data("alpaca_gpt4_en", data_type="sft")
gemma2b_c4_alpaca = gemma2b_c4.finetune(
alpaca_data,
Expand All @@ -25,17 +34,7 @@
)
print(gemma2b_c4_alpaca.is_instruct_finetuned) # True
gemma2b_c4_alpaca.save_permanent() # saved to output/saved/saved_model/gemma-2b_c4_alpaca

# ============== Then do DPO using ORCA data ==============
hh_data = Data("orca_rlhf", data_type="preference")
gemma2b_c4_alpaca_orca = gemma2b_c4_alpaca.finetune(
hh_data,
stage="dpo",
algo="full_param",
result_model_name="gemma-2b_c4_alpaca_orca",
)
gemma2b_c4_alpaca_orca.save_permanent() # saved to output/saved/saved_model/gemma-2b_c4_alpaca_orca


# ============== Or maybe, we should censor curse words before SFT ==============
def remove_curse_words(sample_dict: dict) -> dict:
filter = lambda s: (
Expand All @@ -56,12 +55,12 @@ def remove_curse_words(sample_dict: dict) -> dict:
)
gemma2b_c4_alpaca_G.save_permanent() # saved to output/saved/saved_model/gemma-2b_c4_alpaca_G
alpaca_data_G.save_permanent_and_register() # saved to output/saved/saved_model/alpaca_gpt4_en_G.json & added to llama-factory dataset registry

# ============== What about using our own data (scattered across multiple files in multiple directories) for finetuning? ==============
histext_collection = DataFileCollection( # build a collection holding json files of year 1826 to 2018
collection_name="histext_1826_to_2018_collection",
data_type="pretrain",
collection_path="./dataset/dataset_text_sequence/",
collection_path=f"{root}/dataset/dataset_text_sequence/",
file_selection_func=(
lambda path: "Y" in path and 1826 <= int(path.split("/")[-1][1:6]) <= 2018
), # if this argument is omitted, all json files will be selected
Expand Down Expand Up @@ -93,3 +92,46 @@ def remove_nonstr_data(sample_dict: dict) -> dict:
algo="full_param",
result_model_name="gemma-2b_histext",
)

def direct_preference_optimization():
    """Run DPO on the SFT model using the ``orca_rlhf`` preference data.

    Reads the module-level ``gemma2b_c4_alpaca`` (declared global in
    ``supervised_finetune``), publishes the result as the global
    ``gemma2b_c4_alpaca_orca``, and saves it permanently.
    """
    global gemma2b_c4_alpaca_orca

    # ============== Then do DPO using ORCA data ==============
    orca_preferences = Data("orca_rlhf", data_type="preference")
    dpo_options = dict(
        stage="dpo",
        algo="full_param",
        result_model_name="gemma-2b_c4_alpaca_orca",
    )
    gemma2b_c4_alpaca_orca = gemma2b_c4_alpaca.finetune(
        orca_preferences, **dpo_options
    )

    # saved to output/saved/saved_model/gemma-2b_c4_alpaca_orca
    gemma2b_c4_alpaca_orca.save_permanent()

def dialogue_manipulation():
    """Generate a dialogue with one model playing both user and assistant.

    Starts from a one-turn history plus a pending input, then twice runs
    ``llama8b_instruct.inference`` (sglang backend) followed by a role
    switch: first to the user role, then back to the assistant role.
    Finally prints every passage of the resulting data.
    """
    global llama8b_instruct

    # ============== Generating a dialogue, using a model to play the role of both user and assistant ==============
    seed_samples = [
        {
            "input": "Is Eiffel Tower in Paris?",
            "history": [
                ["What is the capital of France?", "Paris."],
            ],
        }
    ]
    conversation = Data("dialogue_data", data_content=seed_samples)

    # Each round: run inference, then hand the conversation to the other role.
    for switch_role in ("switch_role_to_user", "switch_role_to_assistant"):
        conversation = llama8b_instruct.inference(
            conversation, "dialogue_data", backend="sglang"
        )
        conversation = getattr(conversation, switch_role)()

    print(list(conversation.all_passages()))


if __name__ == "__main__":
    # Uncomment the steps you want to run. Going by the globals each step
    # reads, the first three are meant to run in this order:
    # continue_pretrain()
    # supervised_finetune()
    # direct_preference_optimization()
    dialogue_manipulation()
2 changes: 1 addition & 1 deletion examples/abstractions/inference_evaluation.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ def logprob_example(histllama: Model):
# Custom models (local or on hub) can be similarly loaded, e.g.:
# model = Model(
# "mixtral-8x7b-instruct-v0.1",
# model_path="mistralai/Mixtral-8x7B-Instruct-v0.1",
# model_path_or_repoid="mistralai/Mixtral-8x7B-Instruct-v0.1",
# template_type="mistral",
# )

Expand Down
3 changes: 2 additions & 1 deletion run_benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
Note that all names are case-sensitive. Dummies are for debugging purposes only.
"""

from src.path import root
import pdb
import traceback
import argparse
Expand Down Expand Up @@ -97,7 +98,7 @@ def run_benchmark(
parser.add_argument(
"--output_dir",
type=str,
default="./output/benchmark_results",
default=f"{root}/output/benchmark_results",
required=False,
)
args, unknownargs = parser.parse_known_args()
Expand Down
Loading

0 comments on commit f32da8f

Please sign in to comment.