From 59c59a1a9f3e47178571d656ee779bc1df47cd56 Mon Sep 17 00:00:00 2001 From: "Tianyi (Alex) Qiu" Date: Thu, 28 Nov 2024 00:01:31 -0800 Subject: [PATCH 1/9] feat(abstractions): add support for multi-turn dialogue manipulation --- examples/abstractions/finetuning_datamanip.py | 71 +++++++--- src/abstractions/data.py | 132 +++++++++++++++--- src/abstractions/model.py | 6 +- 3 files changed, 168 insertions(+), 41 deletions(-) diff --git a/examples/abstractions/finetuning_datamanip.py b/examples/abstractions/finetuning_datamanip.py index 2ec5f6a..669b30a 100644 --- a/examples/abstractions/finetuning_datamanip.py +++ b/examples/abstractions/finetuning_datamanip.py @@ -1,21 +1,23 @@ from src.abstractions import Model, Data, DataFileCollection -if __name__ == "__main__": - - gemma2b_base = Model( - model_name="gemma-2b", - model_path="google/gemma-2-2b", # or specify a local path if you have downloaded the model - is_instruct_finetuned=False, - ) +gemma2b_base = Model( + model_name="gemma-2b", + model_path="google/gemma-2-2b", # or specify a local path if you have downloaded the model + is_instruct_finetuned=False, +) +def continue_pretrain(): # ============== Continue pretraining from Gemma 2B ============== + global gemma2b_c4 c4_data = Data("c4_demo", data_type="pretrain") gemma2b_c4 = gemma2b_base.finetune( c4_data, stage="pretrain", algo="full_param", result_model_name="gemma-2b_c4" ) print(gemma2b_c4.is_instruct_finetuned) # False +def supervised_finetune(): # ============== Then do SFT using alpaca data ============== + global gemma2b_c4_alpaca alpaca_data = Data("alpaca_gpt4_en", data_type="sft") gemma2b_c4_alpaca = gemma2b_c4.finetune( alpaca_data, @@ -25,17 +27,7 @@ ) print(gemma2b_c4_alpaca.is_instruct_finetuned) # True gemma2b_c4_alpaca.save_permanent() # saved to output/saved/saved_model/gemma-2b_c4_alpaca - - # ============== Then do DPO using ORCA data ============== - hh_data = Data("orca_rlhf", data_type="preference") - gemma2b_c4_alpaca_orca = gemma2b_c4_alpaca.finetune( - hh_data, - stage="dpo", - algo="full_param", - result_model_name="gemma-2b_c4_alpaca_orca", - ) - gemma2b_c4_alpaca_orca.save_permanent() # saved to output/saved/saved_model/gemma-2b_c4_alpaca_orca - + # ============== Or maybe, we should censor curse words before SFT ============== def remove_curse_words(sample_dict: dict) -> dict: filter = lambda s: ( @@ -56,7 +48,7 @@ def remove_curse_words(sample_dict: dict) -> dict: ) gemma2b_c4_alpaca_G.save_permanent() # saved to output/saved/saved_model/gemma-2b_c4_alpaca_G alpaca_data_G.save_permanent_and_register() # saved to output/saved/saved_model/alpaca_gpt4_en_G.json & added to llama-factory dataset registry - + # ============== What about using our own data (scattered across multiple files in multiple directories) for finetuning? ============== histext_collection = DataFileCollection( # build a collection holding json files of year 1826 to 2018 collection_name="histext_1826_to_2018_collection", @@ -93,3 +85,44 @@ def remove_nonstr_data(sample_dict: dict) -> dict: algo="full_param", result_model_name="gemma-2b_histext", ) + +def direct_preference_optimization(): + # ============== Then do DPO using ORCA data ============== + global gemma2b_c4_alpaca_orca + hh_data = Data("orca_rlhf", data_type="preference") + gemma2b_c4_alpaca_orca = gemma2b_c4_alpaca.finetune( + hh_data, + stage="dpo", + algo="full_param", + result_model_name="gemma-2b_c4_alpaca_orca", + ) + gemma2b_c4_alpaca_orca.save_permanent() # saved to output/saved/saved_model/gemma-2b_c4_alpaca_orca + +def dialogue_manipulation(): + global gemma2b_c4_alpaca_orca_dialogue + dialogue_data = Data( + "dialogue_data", + data_content=[ + { + "input": "Is Eiffel Tower in Paris?", + "history": [ + ["What is the capital of France?", "Paris."], + ] + } + ] + ) + dialogue_data = gemma2b_c4_alpaca_orca.inference( + dialogue_data, "dialogue_data", backend="sglang" + ) + dialogue_data = dialogue_data.switch_role_to_user() + dialogue_data = gemma2b_c4_alpaca_orca.inference( + dialogue_data, "dialogue_data", backend="sglang" + ) + dialogue_data = dialogue_data.switch_role_to_assistant() + + +if __name__ == "__main__": + continue_pretrain() + supervised_finetune() + direct_preference_optimization() + dialogue_manipulation() \ No newline at end of file diff --git a/src/abstractions/data.py b/src/abstractions/data.py index 7dfbd6d..abfcab0 100644 --- a/src/abstractions/data.py +++ b/src/abstractions/data.py @@ -56,6 +56,13 @@ class Data: name2data: Dict[str, Any] = {} always_force_rewrite: bool = True data_type: Literal["pretrain", "sft", "preference"] + + default_key_fields = { + "prompt": "instruction", + "query": "input", + "response": "output", + "history": "history", + } # check with user before removing a file @classmethod @@ -196,7 +203,7 @@ def transform( :param result_data_name: The name of the resulting data. Do not include path in result_data_name. :type result_data_name: str - :param forced_rewrite: Whether to forcefully rewrite the existing data + :param forced_rewrite: Whether to forcefully rewrite the existing file, if there is one. :type forced_rewrite: bool = False :param max_batch_size: If max_batch_size is specified and is >1, the transformation function must take inputs of type List[Dict] and return a List[Dict]. @@ -220,29 +227,19 @@ def write_dict(sample_dict: Dict): def map_key_fields_fn(sample_dict: Dict) -> Dict: nonlocal self - if "prompt" in self.key_fields and self.key_fields["prompt"] != "instruction": - sample_dict["instruction"] = sample_dict[self.key_fields["prompt"]] - del sample_dict[self.key_fields["prompt"]] - if "query" in self.key_fields and self.key_fields["query"] != "input": - sample_dict["input"] = sample_dict[self.key_fields["query"]] - del sample_dict[self.key_fields["query"]] - if "response" in self.key_fields and self.key_fields["response"] != "output": - sample_dict["output"] = sample_dict[self.key_fields["response"]] - del sample_dict[self.key_fields["response"]] + for k, v in self.default_key_fields.items(): + if k in self.key_fields and self.key_fields[k] != v: + sample_dict[v] = sample_dict[self.key_fields[k]] + del sample_dict[self.key_fields[k]] return sample_dict def inv_map_key_fields_fn(sample_dict: Dict) -> Dict: nonlocal self - if "instruction" in sample_dict and self.key_fields["prompt"] != "instruction": - sample_dict[self.key_fields["prompt"]] = sample_dict["instruction"] - del sample_dict["instruction"] - if "input" in sample_dict and self.key_fields["query"] != "input": - sample_dict[self.key_fields["query"]] = sample_dict["input"] - del sample_dict["input"] - if "output" in sample_dict and self.key_fields["response"] != "output": - sample_dict[self.key_fields["response"]] = sample_dict["output"] - del sample_dict["output"] + for k, v in self.default_key_fields.items(): + if v in sample_dict and self.key_fields[k] != v: + sample_dict[self.key_fields[k]] = sample_dict[v] + del sample_dict[v] return sample_dict @@ -284,6 +281,94 @@ def inv_map_key_fields_fn(sample_dict: Dict) -> Dict: result.key_fields = self.key_fields.copy() return result + def move_current_to_history(self): + """ + Move the current dialogue turn in the prompt/question field and the response/predict field to the history field. + + :return: The data after the operation. + :rtype: Data. + """ + def move_to_history_fn(sample_dict: Dict) -> Dict: + if sample_dict.get("instruction", "") or sample_dict.get("input", "") or sample_dict.get("output", "") or sample_dict.get("predict", ""): + assert (sample_dict.get("instruction", "") or sample_dict.get("input", "")) and (sample_dict.get("output", "") or sample_dict.get("predict", "")) + sample_dict["history"] = sample_dict.get("history", []) + [ + [ + sample_dict.get("instruction", "") + + ("\n\n" if "instruction" in sample_dict and "input" in sample_dict else "") + + sample_dict.get("input", ""), + sample_dict.get("output", "") + sample_dict.get("predict", "") + ] + ] + sample_dict.pop("instruction", None) + sample_dict.pop("input", None) + sample_dict.pop("output", None) + sample_dict.pop("predict", None) + + return sample_dict + + return self.transform(move_to_history_fn, self.data_name, forced_rewrite=True, map_key_fields=True) + + def switch_role_to_user(self, user_system_prompt: str = None, dialogue_starter: str = None): + """ + Switch the prompt/question field and the response/predict field, thereby shifting the dialogue turn from the assistant to the user. + + :param user_system_prompt: The system prompt of the user role. + :type user_system_prompt: str = None + + :param dialogue_starter: Placeholder message for the "zeroth" dialogue turn by the assistant that prompts the user to start the conversation. + :type dialogue_starter: str = None + + :return: The data after the operation. + :rtype: Data. + """ + if user_system_prompt is None: + user_system_prompt = "You are an assistant tasked with questioning the user, aka your partner. Ask informed questions to guide the conversation, follow up on the user's responses, and generally follow a natural conversation flow. Don't be too courteous; be concise." + + if dialogue_starter is None: + dialogue_starter = "I am your partner. Please directly ask your first question." + + moved_to_history = self.move_current_to_history() + + def switch_role_to_user_fn(sample_dict: Dict) -> Dict: + assert not (sample_dict.get("instruction", "") or sample_dict.get("input", "") or sample_dict.get("output", "") or sample_dict.get("predict", "")) + + all_histories = [h[i] for h in sample_dict.get("history", []) for i in range(2)] + all_histories = [dialogue_starter] + all_histories + assert len(all_histories) % 2 == 1 + sample_dict["history"] = [[all_histories[i], all_histories[i + 1]] for i in range(len(all_histories)-1, 2)] + sample_dict["instruction"] = all_histories[-1] + sample_dict["system"] = user_system_prompt + return sample_dict + + return moved_to_history.transform(switch_role_to_user_fn, self.data_name, forced_rewrite=True, map_key_fields=True) + + def switch_role_to_assistant(self, assistant_system_prompt: str = None): + """ + Switch the prompt/question field and the response/predict field, thereby shifting the dialogue turn from the user to the assistant. + + :param assistant_system_prompt: The system prompt of the assistant role. + :type assistant_system_prompt: str = None + + :return: The data after the operation. + :rtype: Data. + """ + if assistant_system_prompt is None: + assistant_system_prompt = "Please answer the user's questions. Be concise and not overly courteous, but be informative and provide all necessary details." + + moved_to_history = self.move_current_to_history() + + def switch_role_to_assistant_fn(sample_dict: Dict) -> Dict: + assert not (sample_dict.get("instruction", "") or sample_dict.get("input", "") or sample_dict.get("output", "") or sample_dict.get("predict", "")) + + all_histories = [h[i] for h in sample_dict.get("history", []) for i in range(2)] + assert len(all_histories) % 2 == 0 + sample_dict["history"] = [[all_histories[i], all_histories[i + 1]] for i in range(1, len(all_histories)-1, 2)] + sample_dict["instruction"] = all_histories[-1] + sample_dict["system"] = assistant_system_prompt + return sample_dict + + return moved_to_history.transform(switch_role_to_assistant_fn, self.data_name, forced_rewrite=True, map_key_fields=True) + def manage_llama_factory_registration( self, operation: Literal["add", "remove", "query"], forced_update: bool = True ) -> bool: @@ -372,6 +457,7 @@ def set_key_fields( query_field_name: Optional[str] = None, response_field_name: Optional[str] = None, system_field_name: Optional[str] = None, + history_field_name: Optional[str] = None, suppress_registration_update: bool = False, **kwargs, ) -> None: @@ -393,6 +479,9 @@ def set_key_fields( :param system_field_name: The name of the system field :type system_field_name: Optional[str] = None + + :param history_field_name: The name of the history field + :type history_field_name: Optional[str] = None :param suppress_registration_update: Whether to suppress the update of the registration :type suppress_registration_update: bool = False @@ -428,6 +517,11 @@ def set_key_fields( del self.key_fields["system"] elif system_field_name: self.key_fields["system"] = system_field_name + + if history_field_name == "" and "history" in self.key_fields: + del self.key_fields["history"] + elif history_field_name: + self.key_fields["history"] = history_field_name if isinstance(kwargs, dict): for k, v in kwargs.items(): diff --git a/src/abstractions/model.py b/src/abstractions/model.py index 57158d9..642dcab 100644 --- a/src/abstractions/model.py +++ b/src/abstractions/model.py @@ -662,7 +662,7 @@ def inference( result_data_name: str, backend: Literal["sglang", "vllm", "deepspeed", "serial"] = "sglang", batch_size_multiplier_log2: int = 0, - temperature=0.0, + temperature=0.25, purpose: Literal["responses", "logprobs"] = "responses", ) -> Union[Data, List[Dict[str, str]]]: """Performance inference on a dataset (currently only instruction datasets are tested, with the same format as SFT datasets), @@ -681,7 +681,7 @@ def inference( :type batch_size_multiplier_log2: int = 0 :param temperature: The temperature parameter - :type temperature: float = 0.0 + :type temperature: float = 0.25 :param purpose: The purpose of the inference. It can be "responses" or "logprobs". If "logprobs", the log probability of the prompt itself (and the assistant response supplied in the `predict` field, if exists) is returned in the `logprob` field of the resulting dataset, without doing any completion. If "responses", the completion text is saved in the `predict` field of the resulting dataset. :type purpose: Literal["responses", "logprobs"] = "responses" @@ -829,7 +829,7 @@ def __inference_parallel_deepspeed( data: Data, result_data_name: str, batch_size_multiplier_log2: int = 0, - temperature: float = 0, + temperature: float = 0.25, ) -> Data: """Deepspeed implementation for `inference()`.""" From dc8bf89f5bb8418709223f8cc9b3d262e985ea40 Mon Sep 17 00:00:00 2001 From: "Tianyi (Alex) Qiu" Date: Thu, 28 Nov 2024 00:13:10 -0800 Subject: [PATCH 2/9] fix(abstractions): remove legacy __init__ argument --- examples/abstractions/finetuning_datamanip.py | 22 +++++++++++++------ examples/abstractions/inference_evaluation.py | 2 +- src/abstractions/model.py | 8 +++---- 3 files changed, 20 insertions(+), 12 deletions(-) diff --git a/examples/abstractions/finetuning_datamanip.py b/examples/abstractions/finetuning_datamanip.py index 669b30a..85def04 100644 --- a/examples/abstractions/finetuning_datamanip.py +++ b/examples/abstractions/finetuning_datamanip.py @@ -2,10 +2,16 @@ gemma2b_base = Model( model_name="gemma-2b", - model_path="google/gemma-2-2b", # or specify a local path if you have downloaded the model + model_path_or_repoid="google/gemma-2-2b", # or specify a local path if you have downloaded the model is_instruct_finetuned=False, ) +llama8b_instruct = Model( + model_name="Llama-3.1-8B-Instruct", + model_path_or_repoid="meta-llama/Llama-3.1-8B-Instruct", + is_instruct_finetuned=True, +) + def continue_pretrain(): # ============== Continue pretraining from Gemma 2B ============== global gemma2b_c4 @@ -99,7 +105,8 @@ def direct_preference_optimization(): gemma2b_c4_alpaca_orca.save_permanent() # saved to output/saved/saved_model/gemma-2b_c4_alpaca_orca def dialogue_manipulation(): - global gemma2b_c4_alpaca_orca_dialogue + # ============== Generating a dialogue, using a model to play the role of both user and assistant ============== + global llama8b_instruct dialogue_data = Data( "dialogue_data", data_content=[ @@ -111,18 +118,19 @@ def dialogue_manipulation(): } ] ) - dialogue_data = gemma2b_c4_alpaca_orca.inference( + dialogue_data = llama8b_instruct.inference( dialogue_data, "dialogue_data", backend="sglang" ) dialogue_data = dialogue_data.switch_role_to_user() - dialogue_data = gemma2b_c4_alpaca_orca.inference( + dialogue_data = llama8b_instruct.inference( dialogue_data, "dialogue_data", backend="sglang" ) dialogue_data = dialogue_data.switch_role_to_assistant() + print(list(dialogue_data.all_passages())) if __name__ == "__main__": - continue_pretrain() - supervised_finetune() - direct_preference_optimization() + # continue_pretrain() + # supervised_finetune() + # direct_preference_optimization() dialogue_manipulation() \ No newline at end of file diff --git a/examples/abstractions/inference_evaluation.py b/examples/abstractions/inference_evaluation.py index d4734ca..7d0b24a 100644 --- a/examples/abstractions/inference_evaluation.py +++ b/examples/abstractions/inference_evaluation.py @@ -55,7 +55,7 @@ def logprob_example(histllama: Model): # Custom models (local or on hub) can be similarly loaded, e.g.: # model = Model( # "mixtral-8x7b-instruct-v0.1", - # model_path="mistralai/Mixtral-8x7B-Instruct-v0.1", + # model_path_or_repoid="mistralai/Mixtral-8x7B-Instruct-v0.1", # template_type="mistral", # ) diff --git a/src/abstractions/model.py b/src/abstractions/model.py index 642dcab..4fef66e 100644 --- a/src/abstractions/model.py +++ b/src/abstractions/model.py @@ -216,7 +216,7 @@ def deep_copy( return Model( model_name=copied_name, is_instruct_finetuned=self.is_instruct_finetuned, - model_path=copied_path, + model_path_or_repoid=copied_path, num_gpus=self.num_gpus, template_type=self.template_type, ) @@ -490,7 +490,7 @@ def finetune( result = Model( model_name=result_model_name, is_instruct_finetuned=(self.is_instruct_finetuned or stage == "sft"), - model_path=f"./output/training_results/{escape(result_model_name)}/", + model_path_or_repoid=f"./output/training_results/{escape(result_model_name)}/", num_gpus=self.num_gpus, ) @@ -516,7 +516,7 @@ def finetune( result = Model( model_name=result_model_name, is_instruct_finetuned=result.is_instruct_finetuned, - model_path=merged_model_path, + model_path_or_repoid=merged_model_path, num_gpus=self.num_gpus, template_type=self.template_type, ) @@ -652,7 +652,7 @@ def __rlhf( return Model( model_name=the_name, is_instruct_finetuned=self.is_instruct_finetuned, - model_path=the_path, + model_path_or_repoid=the_path, num_gpus=self.num_gpus, ) From f98db155f3b80c65a27dd78fa2c2e12ff07eac46 Mon Sep 17 00:00:00 2001 From: "Tianyi (Alex) Qiu" Date: Thu, 28 Nov 2024 00:27:16 -0800 Subject: [PATCH 3/9] feat(abstractions): add support for sglang inference on dialogues --- src/abstractions/backends.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/src/abstractions/backends.py b/src/abstractions/backends.py index ac3ad04..5248e20 100644 --- a/src/abstractions/backends.py +++ b/src/abstractions/backends.py @@ -568,9 +568,20 @@ def dict_to_dialogue_list( :rtype: Union[List[Dict[str, str]], List[List[Dict[str, str]]] """ if isinstance(dic, dict): - res = [{"role": "user", "content": dic["input"]}] - if "instruction" in dic: - res = [{"role": "system", "content": dic["instruction"]}] + res + res = [] + + if "system" in dic: + res = [{"role": "system", "content": dic["system"]}] + + if "history" in dic: + for turn in dic["history"]: + res.append({"role": "user", "content": turn[0]}, {"role": "assistant", "content": turn[1]}) + + if "input" in dic or "instruction" in dic: + input = dic.get("input", "") + instruction = dic.get("instruction", "") + res.append({"role": "user", "content": input + ("\n\n" if input and instruction else "") + instruction}) + if purpose == "logprobs" and "predict" in dic and isinstance(dic["predict"], str): res.append({"role": "assistant", "content": dic["predict"]}) From d553febb8045405efbd61bb6a81e20ec0dbe5890 Mon Sep 17 00:00:00 2001 From: "Tianyi (Alex) Qiu" Date: Thu, 28 Nov 2024 00:40:48 -0800 Subject: [PATCH 4/9] chore: renaming modules --- algorithms/extrapolative_dpo.py | 2 +- algorithms/extrapolative_rlhf.py | 2 +- algorithms/lifelong_dpo.py | 2 +- algorithms/lifelong_rlhf.py | 2 +- algorithms/utils/extrapolation_utils.py | 2 +- algorithms/utils/rw_utils.py | 2 +- build_dataset.py | 2 +- src/abstractions/data.py | 2 +- src/abstractions/model.py | 2 +- src/cleanser/gpt_cleanser.py | 2 +- src/cleanser/localllm_cleanser.py | 2 +- src/cleanser/rule_based_cleanser.py | 2 +- src/data_analysis/metadata_analysis.py | 4 ++-- src/eebo/process_eebo.py | 2 +- src/gutenberg/get_meta.py | 4 ++-- src/internet_archive/get_sources.py | 2 +- src/model_training/curate_prompts.py | 2 +- src/model_training/train_hislm.py | 2 +- src/pile_of_law/get_data.py | 2 +- src/{text_writer.py => text_utils.py} | 0 20 files changed, 21 insertions(+), 21 deletions(-) rename src/{text_writer.py => text_utils.py} (100%) diff --git a/algorithms/extrapolative_dpo.py b/algorithms/extrapolative_dpo.py index 12c43a9..3a2420f 100644 --- a/algorithms/extrapolative_dpo.py +++ b/algorithms/extrapolative_dpo.py @@ -6,7 +6,7 @@ import pandas as pd import json import datasets -from src.text_writer import write_log +from src.text_utils import write_log from benchmark import JudgeBase, ExamineeBase, PredictJudge from algorithms.utils.rw_utils import elicit_rw_preference, default_rw_data from algorithms.utils.extrapolation_utils import extrapolate diff --git a/algorithms/extrapolative_rlhf.py b/algorithms/extrapolative_rlhf.py index 9a01a9c..b97266d 100644 --- a/algorithms/extrapolative_rlhf.py +++ b/algorithms/extrapolative_rlhf.py @@ -6,7 +6,7 @@ import pandas as pd import json import datasets -from src.text_writer import write_log +from src.text_utils import write_log from benchmark import JudgeBase, ExamineeBase, PredictJudge from algorithms.utils.rw_utils import ( elicit_rw_preference, diff --git a/algorithms/lifelong_dpo.py b/algorithms/lifelong_dpo.py index 53b3098..69a3de4 100644 --- a/algorithms/lifelong_dpo.py +++ b/algorithms/lifelong_dpo.py @@ -6,7 +6,7 @@ import pandas as pd import json import datasets -from src.text_writer import write_log +from src.text_utils import write_log from benchmark import JudgeBase, ExamineeBase, PredictJudge from algorithms.utils.rw_utils import elicit_rw_preference, default_rw_data import warnings diff --git a/algorithms/lifelong_rlhf.py b/algorithms/lifelong_rlhf.py index 1fe3ce2..21283a0 100644 --- a/algorithms/lifelong_rlhf.py +++ b/algorithms/lifelong_rlhf.py @@ -6,7 +6,7 @@ import pandas as pd import json import datasets -from src.text_writer import write_log +from src.text_utils import write_log from benchmark import JudgeBase, ExamineeBase, PredictJudge from algorithms.utils.rw_utils import ( elicit_rw_preference, diff --git a/algorithms/utils/extrapolation_utils.py b/algorithms/utils/extrapolation_utils.py index 091d59d..619d0c3 100644 --- a/algorithms/utils/extrapolation_utils.py +++ b/algorithms/utils/extrapolation_utils.py @@ -7,7 +7,7 @@ import pandas as pd import json import datasets -from src.text_writer import write_log +from src.text_utils import write_log import warnings from tqdm import tqdm from sympy import binomial diff --git a/algorithms/utils/rw_utils.py b/algorithms/utils/rw_utils.py index 6d3872c..b3e2d92 100644 --- a/algorithms/utils/rw_utils.py +++ b/algorithms/utils/rw_utils.py @@ -6,7 +6,7 @@ import pandas as pd import json from datasets import load_dataset -from src.text_writer import write_log +from src.text_utils import write_log import src.evaluation.utils as eval_utils from benchmark import JudgeBase, ExamineeBase, PredictJudge import warnings diff --git a/build_dataset.py b/build_dataset.py index d5ca6df..aad89e7 100644 --- a/build_dataset.py +++ b/build_dataset.py @@ -1,4 +1,4 @@ -import src.text_writer as tw +import src.text_utils as tw import src.cleanser.rule_based_cleanser as rb import src.cleanser.localllm_cleanser as llm_cleanser import src.model_training.train_hislm as hislm diff --git a/src/abstractions/data.py b/src/abstractions/data.py index abfcab0..f032c26 100644 --- a/src/abstractions/data.py +++ b/src/abstractions/data.py @@ -14,7 +14,7 @@ import os import json import warnings -import src.text_writer as tw +import src.text_utils as tw from tqdm import tqdm with open("./src/abstractions/configs/abstractions_config.json", "r") as config_file: diff --git a/src/abstractions/model.py b/src/abstractions/model.py index 4fef66e..320f869 100644 --- a/src/abstractions/model.py +++ b/src/abstractions/model.py @@ -7,7 +7,7 @@ import json import torch import warnings -import src.text_writer as tw +import src.text_utils as tw import random import numpy as np from transformers import ( diff --git a/src/cleanser/gpt_cleanser.py b/src/cleanser/gpt_cleanser.py index e126b9a..6dd9970 100644 --- a/src/cleanser/gpt_cleanser.py +++ b/src/cleanser/gpt_cleanser.py @@ -7,7 +7,7 @@ convo_clear_history, independent_get_response, ) -from src.text_writer import write_log +from src.text_utils import write_log from threading import Thread, Lock from concurrent.futures import ThreadPoolExecutor, as_completed import time diff --git a/src/cleanser/localllm_cleanser.py b/src/cleanser/localllm_cleanser.py index e66d348..f77c5df 100644 --- a/src/cleanser/localllm_cleanser.py +++ b/src/cleanser/localllm_cleanser.py @@ -3,7 +3,7 @@ from vllm import LLM, SamplingParams from src.abstractions import Model, Data, fill_in_QA_template, DataFileCollection from typing import List, Tuple, Dict, Optional, Set -from src.text_writer import write_log, JsonListWriter +from src.text_utils import write_log, JsonListWriter import time import re diff --git a/src/cleanser/rule_based_cleanser.py b/src/cleanser/rule_based_cleanser.py index 5617484..8360d00 100644 --- a/src/cleanser/rule_based_cleanser.py +++ b/src/cleanser/rule_based_cleanser.py @@ -1,6 +1,6 @@ import os, json import re, unicodedata -import src.text_writer as tw +import src.text_utils as tw from tqdm import tqdm last_cleaned = "" diff --git a/src/data_analysis/metadata_analysis.py b/src/data_analysis/metadata_analysis.py index 1550290..496d8bd 100644 --- a/src/data_analysis/metadata_analysis.py +++ b/src/data_analysis/metadata_analysis.py @@ -4,7 +4,7 @@ import matplotlib.pyplot as plt import scipy.stats as stats from tqdm import tqdm -from src import text_writer +from src import text_utils import logging mpl_logger = logging.getLogger("matplotlib") @@ -142,7 +142,7 @@ def main(): recursive=True, ) ): - dict_iterator = text_writer.read_json_memory_efficient(pathname) + dict_iterator = text_utils.read_json_memory_efficient(pathname) for doc in tqdm(dict_iterator): # extract metadata lst_content_length.append(len(doc.get("content", ""))) diff --git a/src/eebo/process_eebo.py b/src/eebo/process_eebo.py index c630818..4dee0df 100644 --- a/src/eebo/process_eebo.py +++ b/src/eebo/process_eebo.py @@ -2,7 +2,7 @@ import os from tqdm import tqdm import json -import src.text_writer as tw +import src.text_utils as tw # a utility function called by build_eebo_dataset, to read the contents in an eebo xml file in a suitable manner diff --git a/src/gutenberg/get_meta.py b/src/gutenberg/get_meta.py index 7f6d5c3..f69a079 100644 --- a/src/gutenberg/get_meta.py +++ b/src/gutenberg/get_meta.py @@ -1,10 +1,10 @@ -import src.text_writer as tw +import src.text_utils as tw import os, json import csv from tqdm import tqdm """ -This file is to be excecuted after get_data.py. Reads metadata from raw files and writes them via text_writer API +This file is to be excecuted after get_data.py. Reads metadata from raw files and writes them via text_utils API """ diff --git a/src/internet_archive/get_sources.py b/src/internet_archive/get_sources.py index 8598266..4161eeb 100644 --- a/src/internet_archive/get_sources.py +++ b/src/internet_archive/get_sources.py @@ -1,5 +1,5 @@ import os, json, requests -import src.text_writer as tw +import src.text_utils as tw from tqdm import tqdm import re import time diff --git a/src/model_training/curate_prompts.py b/src/model_training/curate_prompts.py index 3b448b3..0b72c21 100644 --- a/src/model_training/curate_prompts.py +++ b/src/model_training/curate_prompts.py @@ -95,7 +95,7 @@ def __init__(self, collection_name: str, data_type: Literal['pretrain', 'sft', ' """ from src.abstractions import Data, DataFileCollection, fill_in_QA_template -from src.text_writer import write_log, read_json_memory_efficient +from src.text_utils import write_log, read_json_memory_efficient # Multithreading for GPT interaction import threading diff --git a/src/model_training/train_hislm.py b/src/model_training/train_hislm.py index 6717556..dbd9d38 100644 --- a/src/model_training/train_hislm.py +++ b/src/model_training/train_hislm.py @@ -1,5 +1,5 @@ from src.abstractions import Model, Data, DataFileCollection -from src.text_writer import write_log +from src.text_utils import write_log from collections import defaultdict from typing import List import os diff --git a/src/pile_of_law/get_data.py b/src/pile_of_law/get_data.py index ba7dd36..0b29a70 100644 --- a/src/pile_of_law/get_data.py +++ b/src/pile_of_law/get_data.py @@ -2,7 +2,7 @@ from bs4 import BeautifulSoup import os, json import lzma -import src.text_writer as tw +import src.text_utils as tw from tqdm import tqdm from copy import copy diff --git a/src/text_writer.py b/src/text_utils.py similarity index 100% rename from src/text_writer.py rename to src/text_utils.py From fe6980f7306de5e660e198fa3ccd1d6826d78cec Mon Sep 17 00:00:00 2001 From: "Tianyi (Alex) Qiu" Date: Thu, 28 Nov 2024 10:41:55 -0800 Subject: [PATCH 5/9] chore: support importing from outside --- __init__.py | 3 + algorithms/extrapolative_dpo.py | 5 +- algorithms/extrapolative_rlhf.py | 6 +- algorithms/lifelong_dpo.py | 4 +- algorithms/lifelong_rlhf.py | 4 +- build_dataset.py | 34 +++++------ challenges/coevolve.py | 11 ++-- doc_generation/source/conf.py | 3 +- examples/abstractions/finetuning_datamanip.py | 3 +- run_benchmark.py | 3 +- src/abstractions/backends.py | 21 +++---- src/abstractions/configs/templates_configs.py | 27 ++++++++- src/abstractions/data.py | 37 ++++++------ src/abstractions/model.py | 58 ++++++++++--------- src/cleanser/gpt_cleanser.py | 5 +- src/cleanser/localllm_cleanser.py | 3 +- src/cleanser/rule_based_cleanser.py | 7 ++- src/data_analysis/metadata_analysis.py | 8 ++- src/data_analysis/word2vec_gensim.py | 1 + src/download_models.py | 5 +- src/eebo/download_eebo.py | 3 +- src/eebo/process_eebo.py | 5 +- src/evaluation/figure.py | 5 +- src/evaluation/metric.py | 13 +++-- src/evaluation/quantify.py | 43 +++++++------- src/evaluation/test_eval_01.py | 18 +++--- src/evaluation/test_eval_02.py | 15 ++--- src/evaluation/utils.py | 39 +++++++------ src/gutenberg/get_meta.py | 7 ++- src/gutenberg/src/bookshelves.py | 1 + src/gutenberg/src/metadataparser.py | 1 + src/gutenberg/src/metaquery.py | 1 + src/internet_archive/get_sources.py | 7 ++- src/model_training/curate_prompts.py | 5 +- src/model_training/train_hislm.py | 7 ++- src/path.py | 3 + src/pile_of_law/get_data.py | 9 +-- src/utils/data_utils/__init__.py | 2 + .../utils/data_utils}/extrapolation_utils.py | 13 +---- .../utils/data_utils}/rw_utils.py | 11 ++-- src/{ => utils}/gpt_utils.py | 17 +++--- src/{ => utils}/text_utils.py | 9 +-- 42 files changed, 266 insertions(+), 216 deletions(-) create mode 100644 src/path.py create mode 100644 src/utils/data_utils/__init__.py rename {algorithms/utils => src/utils/data_utils}/extrapolation_utils.py (94%) rename {algorithms/utils => src/utils/data_utils}/rw_utils.py (97%) rename src/{ => utils}/gpt_utils.py (92%) rename src/{ => utils}/text_utils.py (97%) diff --git a/__init__.py b/__init__.py index 065ab69..8a293b5 100644 --- a/__init__.py +++ b/__init__.py @@ -1,3 +1,6 @@ +import os, sys +sys.path = [os.path.dirname(os.path.abspath(__file__))] + sys.path + from benchmark.framework import JudgeBase, ExamineeBase from benchmark.dummies import DummyJudge from challenges.follow import FollowJudge diff --git a/algorithms/extrapolative_dpo.py b/algorithms/extrapolative_dpo.py index 3a2420f..6ea5561 100644 --- a/algorithms/extrapolative_dpo.py +++ b/algorithms/extrapolative_dpo.py @@ -6,10 +6,9 @@ import pandas as pd import json import datasets -from src.text_utils import write_log +from src.utils.text_utils import write_log from benchmark import JudgeBase, ExamineeBase, PredictJudge -from algorithms.utils.rw_utils import elicit_rw_preference, default_rw_data -from algorithms.utils.extrapolation_utils import extrapolate +from src.utils.data_utils import elicit_rw_preference, default_rw_data, extrapolate import warnings from tqdm import tqdm import numpy as np diff --git a/algorithms/extrapolative_rlhf.py b/algorithms/extrapolative_rlhf.py index b97266d..2052de0 100644 --- a/algorithms/extrapolative_rlhf.py +++ b/algorithms/extrapolative_rlhf.py @@ -6,14 +6,14 @@ import pandas as pd import json import datasets -from src.text_utils import write_log +from src.utils.text_utils import write_log from benchmark import JudgeBase, ExamineeBase, PredictJudge -from algorithms.utils.rw_utils import ( +from src.utils.data_utils import ( elicit_rw_preference, default_rw_data, default_ppo_data, + extrapolate, ) -from algorithms.utils.extrapolation_utils import extrapolate import warnings from tqdm import tqdm from sympy import binomial diff --git a/algorithms/lifelong_dpo.py b/algorithms/lifelong_dpo.py index 69a3de4..acce30e 100644 --- a/algorithms/lifelong_dpo.py +++ b/algorithms/lifelong_dpo.py @@ -6,9 +6,9 @@ import pandas as pd import json import datasets -from src.text_utils import write_log +from src.utils.text_utils import write_log from benchmark import JudgeBase, ExamineeBase, PredictJudge -from algorithms.utils.rw_utils import elicit_rw_preference, default_rw_data +from src.utils.data_utils import elicit_rw_preference, default_rw_data import warnings from tqdm import tqdm diff --git a/algorithms/lifelong_rlhf.py b/algorithms/lifelong_rlhf.py index 21283a0..20f51c8 100644 --- a/algorithms/lifelong_rlhf.py +++ b/algorithms/lifelong_rlhf.py @@ -6,9 +6,9 @@ import pandas as pd import json import datasets -from src.text_utils import write_log +from src.utils.text_utils import write_log from benchmark import JudgeBase, ExamineeBase, PredictJudge -from algorithms.utils.rw_utils import ( +from src.utils.data_utils import ( elicit_rw_preference, default_rw_data, default_ppo_data, diff --git a/build_dataset.py b/build_dataset.py index aad89e7..f62175f 100644 --- a/build_dataset.py +++ b/build_dataset.py @@ -1,4 +1,5 @@ -import src.text_utils as tw +from src.path import root +import src.utils.text_utils as tw import src.cleanser.rule_based_cleanser as rb import src.cleanser.localllm_cleanser as llm_cleanser import src.model_training.train_hislm as hislm @@ -8,7 +9,6 @@ import os import time - import src.eebo.download_eebo as eebo_dl import src.eebo.process_eebo as eebo_pc @@ -26,10 +26,10 @@ def build_EEBO(): def build_gutenberg(): print("======= START BUILDING GUTENBERG DATASET =======") - dir = "./dataset/raw_downloads/Gutenberg/" + dir = f"{root}/dataset/raw_downloads/Gutenberg/" gtb_gd.get_data_gutenberg(dir) gtb_gm.gather_meta( - os.path.join(dir, "data/raw"), "./dataset/raw_downloads/Gutenberg_records.txt" + os.path.join(dir, "data/raw"), f"{root}/dataset/raw_downloads/Gutenberg_records.txt" ) print("======= FINISHED BUILDING GUTENBERG DATASET =======\n\n\n") @@ -91,8 +91,8 @@ def build_pile_of_law(): ): proceed = True rb.cleanse( - "./dataset/dataset_text_sequence/", - "./dataset/dataset_text_sequence_rulebased_cleansed/", + f"{root}/dataset/dataset_text_sequence/", + f"{root}/dataset/dataset_text_sequence_rulebased_cleansed/", ) print("Finished rule-based data cleansing. Now exiting.") @@ -102,25 +102,25 @@ def build_pile_of_law(): ): proceed = True llm_cleanser.run_cleanser( - in_path="./dataset/dataset_text_sequence_rulebased_cleansed/", - out_path="./dataset/dataset_text_sequence_llm_cleansed/", + in_path=f"{root}/dataset/dataset_text_sequence_rulebased_cleansed/", + out_path=f"{root}/dataset/dataset_text_sequence_llm_cleansed/", ) # Make llm-cleansed version the official version ("dataset_text_sequence"), and move the other two versions into dataset/raw_downloads path = ( - f"./dataset/raw_downloads/dataset_text_sequence_versions/{timestamp}/" + f"{root}/dataset/raw_downloads/dataset_text_sequence_versions/{timestamp}/" ) os.makedirs(path) print(f"Moving pre-cleansing version to backup folder...") os.rename( - "./dataset/dataset_text_sequence/", + f"{root}/dataset/dataset_text_sequence/", os.path.join(path, "dataset_text_sequence_original/"), ) print(f"Moving rule-cleansed version to backup folder...") os.rename( - "./dataset/dataset_text_sequence_rulebased_cleansed/", + f"{root}/dataset/dataset_text_sequence_rulebased_cleansed/", os.path.join(path, "dataset_text_sequence_rulebased_cleansed/"), ) @@ -131,7 +131,7 @@ def build_pile_of_law(): print(f"Copying LLM-cleansed version to backup folder...") os.rename( - "./dataset/dataset_text_sequence_llm_cleansed/", + f"{root}/dataset/dataset_text_sequence_llm_cleansed/", os.path.join(path, "dataset_text_sequence_llm_cleansed/"), ) @@ -148,19 +148,19 @@ def build_pile_of_law(): proceed = True print(f"Removing overly small or messy subdatasets...") - path = f"./dataset/raw_downloads/dataset_text_sequence_versions/{timestamp}/removed/" + path = f"{root}/dataset/raw_downloads/dataset_text_sequence_versions/{timestamp}/removed/" os.makedirs(path) sub_datasets = [ f - for f in os.listdir("./dataset/dataset_text_sequence/") - if os.path.isdir(os.path.join("./dataset/dataset_text_sequence/", f)) + for f in os.listdir(f"{root}/dataset/dataset_text_sequence/") + if os.path.isdir(os.path.join(f"{root}/dataset/dataset_text_sequence/", f)) ] for sub in sub_datasets: # Remove if size < 10MB AND century number < 13 if ( hislm.get_directory_size_bytes( - os.path.join("./dataset/dataset_text_sequence/", sub) + os.path.join(f"{root}/dataset/dataset_text_sequence/", sub) ) < 10 * 1024 * 1024 and int(sub.strip("C")) < 13 @@ -169,7 +169,7 @@ def build_pile_of_law(): os.system(f"mv ./dataset/dataset_text_sequence/{sub} {path}") hislm.run_training( - "./dataset/dataset_text_sequence/", "./dataset/dataset_model_sequence/" + f"{root}/dataset/dataset_text_sequence/", f"{root}/dataset/dataset_model_sequence/" ) print("Finished model training. Exiting.") diff --git a/challenges/coevolve.py b/challenges/coevolve.py index 3f28144..1d1d4b1 100644 --- a/challenges/coevolve.py +++ b/challenges/coevolve.py @@ -1,11 +1,12 @@ +from src.path import root +from src.utils.data_utils import elicit_rw_preference, default_rw_data from benchmark.framework import JudgeBase, ExamineeBase from typing import Iterable, Tuple, Dict, Union, List, Any from src.abstractions import Model, Data import numpy as np import scipy.spatial as sp import datasets -import json, os -from algorithms.utils.rw_utils import elicit_rw_preference, default_rw_data +import json, os, sys class CoevolveJudge(JudgeBase): @@ -27,10 +28,10 @@ def reset(self, **kwargs) -> None: assert self.simulated_model.model_name == self.model_list[0].model_name if os.path.exists( - f"./output/benchmark_results/initial_supplementary_data.json" + f"{root}/output/benchmark_results/initial_supplementary_data.json" ): with open( - f"./output/benchmark_results/initial_supplementary_data.json", "r" + f"{root}/output/benchmark_results/initial_supplementary_data.json", "r" ) as f: self.supplementary_data = json.load(f) else: @@ -47,7 +48,7 @@ def reset(self, **kwargs) -> None: # Backup supplementary data with open( - f"./output/benchmark_results/initial_supplementary_data.json", "w" + f"{root}/output/benchmark_results/initial_supplementary_data.json", "w" ) as f: json.dump(self.supplementary_data, f) diff --git a/doc_generation/source/conf.py b/doc_generation/source/conf.py index a467b62..c5f6018 100644 --- a/doc_generation/source/conf.py +++ b/doc_generation/source/conf.py @@ -5,10 +5,11 @@ # -- Project information ----------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information +from src.path import root import os import sys -sys.path.insert(0, os.path.abspath("../..")) +sys.path.insert(0, os.path.abspath(root)) project = "ProgressGym" copyright = "2024 PKU Alignment, Tianyi Qiu, Yang Zhang, Xuchuan Huang, Xinze Li" diff --git a/examples/abstractions/finetuning_datamanip.py b/examples/abstractions/finetuning_datamanip.py index 85def04..2d0463b 100644 --- a/examples/abstractions/finetuning_datamanip.py +++ b/examples/abstractions/finetuning_datamanip.py @@ -1,3 +1,4 @@ +from src.path import root from src.abstractions import Model, Data, DataFileCollection gemma2b_base = Model( @@ -59,7 +60,7 @@ def remove_curse_words(sample_dict: dict) -> dict: histext_collection = DataFileCollection( # build a collection holding json files of year 1826 to 2018 collection_name="histext_1826_to_2018_collection", data_type="pretrain", - collection_path="./dataset/dataset_text_sequence/", + collection_path=f"{root}/dataset/dataset_text_sequence/", file_selection_func=( lambda path: "Y" in path and 1826 <= int(path.split("/")[-1][1:6]) <= 2018 ), # if this argument is omitted, all json files will be selected diff --git a/run_benchmark.py b/run_benchmark.py index 6e7df2d..7340a67 100644 --- a/run_benchmark.py +++ b/run_benchmark.py @@ -28,6 +28,7 @@ Note that all names are case-sensitive. Dummies are for debugging purposes only. """ +from src.path import root import pdb import traceback import argparse @@ -97,7 +98,7 @@ def run_benchmark( parser.add_argument( "--output_dir", type=str, - default="./output/benchmark_results", + default=f"{root}/output/benchmark_results", required=False, ) args, unknownargs = parser.parse_known_args() diff --git a/src/abstractions/backends.py b/src/abstractions/backends.py index 5248e20..7a0e328 100644 --- a/src/abstractions/backends.py +++ b/src/abstractions/backends.py @@ -1,4 +1,5 @@ # Edit flashinfer cascade.py to make it compatible with Python 3.8 +from src.path import root import os path = os.path.join( @@ -31,16 +32,16 @@ import random # create output directories -os.makedirs("./output/benchmark_results", exist_ok=True) -os.makedirs("./output/datasets", exist_ok=True) -os.makedirs("./output/evaluation_results", exist_ok=True) -os.makedirs("./output/inference_results", exist_ok=True) -os.makedirs("./output/training_results", exist_ok=True) -os.makedirs("./output/rlhf_results", exist_ok=True) -os.makedirs("./output/merged_lora_results", exist_ok=True) -os.makedirs("./output/saved/saved_model/", exist_ok=True) -os.makedirs("./output/saved/saved_data/", exist_ok=True) -os.makedirs("./output/downloaded", exist_ok=True) +os.makedirs(f"{root}/output/benchmark_results", exist_ok=True) +os.makedirs(f"{root}/output/datasets", exist_ok=True) +os.makedirs(f"{root}/output/evaluation_results", exist_ok=True) +os.makedirs(f"{root}/output/inference_results", exist_ok=True) +os.makedirs(f"{root}/output/training_results", exist_ok=True) +os.makedirs(f"{root}/output/rlhf_results", exist_ok=True) +os.makedirs(f"{root}/output/merged_lora_results", exist_ok=True) +os.makedirs(f"{root}/output/saved/saved_model/", exist_ok=True) +os.makedirs(f"{root}/output/saved/saved_data/", exist_ok=True) +os.makedirs(f"{root}/output/downloaded", exist_ok=True) random.seed(time.time()) MY_USERNAME = pwd.getpwuid(os.getuid()).pw_name diff --git a/src/abstractions/configs/templates_configs.py b/src/abstractions/configs/templates_configs.py index 73f25ed..2a807ee 100644 --- a/src/abstractions/configs/templates_configs.py +++ b/src/abstractions/configs/templates_configs.py @@ -1,5 +1,7 @@ +from src.path import root from string import Template import json +import os from typing import Dict, Any, Literal, Optional, List, Union bash_command_template = """PYTHONNOUSERSITE=1 MASTER_PORT=9902 conda run --no-capture-output -n %s deepspeed %s --master_port=9902 ./libs/llama_factory/src/train_bash.py \\ @@ -121,8 +123,31 @@ """ ) -with open("./src/abstractions/configs/abstractions_config.json", "r") as config_file: +with open(f"{root}/src/abstractions/configs/abstractions_config.json", "r") as config_file: abstractions_config = json.load(config_file) + + data_search_paths: List[str] = abstractions_config["data_search_paths"] + data_save_path: str = abstractions_config["data_save_path"] + + if not os.path.exists(data_save_path): + data_save_path = f"{root}/" + data_save_path + if not os.path.exists(data_save_path): + raise FileNotFoundError(f"Data save path {data_save_path} doesn't exist.") + + for i, path in enumerate(data_search_paths): + if not os.path.exists(path): + data_search_paths[i] = f"{root}/" + path + model_search_paths: List[str] = abstractions_config["model_search_paths"] model_save_path: str = abstractions_config["model_save_path"] + + if not os.path.exists(model_save_path): + model_save_path = f"{root}/" + model_save_path + if not os.path.exists(model_save_path): + raise FileNotFoundError(f"Model save path {model_save_path} doesn't exist.") + + for i, path in enumerate(model_search_paths): + if not os.path.exists(path): + model_search_paths[i] = f"{root}/" + path + multinode_master_addr: str = abstractions_config["multinode_master_addr"] diff --git a/src/abstractions/data.py b/src/abstractions/data.py index f032c26..d17c225 100644 --- a/src/abstractions/data.py +++ b/src/abstractions/data.py @@ -1,4 +1,5 @@ # from src.abstractions.model import Model # Uncommenting will lead to circular import +from src.path import root from typing import ( Dict, Any, @@ -14,13 +15,9 @@ import os import json import warnings -import src.text_utils as tw +import src.utils.text_utils as tw from tqdm import tqdm - -with open("./src/abstractions/configs/abstractions_config.json", "r") as config_file: - abstractions_config = json.load(config_file) - data_search_paths: List[str] = abstractions_config["data_search_paths"] - data_save_path: str = abstractions_config["data_save_path"] +from src.abstractions.configs.templates_configs import * # helper function, escape spaces in paths @@ -126,7 +123,7 @@ def __init__( if data_content is not None: if data_path is None: - data_path = f"./output/datasets/{data_name}.json" + data_path = f"{root}/output/datasets/{data_name}.json" Data.ask_and_remove_if_exists(data_path, forced_rewrite=True) with tw.JsonListWriter(data_path) as json_writer: @@ -153,11 +150,11 @@ def __init__( print( f"Data {data_name} not found locally. Searching among Llama-Factory datasets." ) - with open("./libs/llama_factory/data/dataset_info.json", "r") as in_file: + with open(f"{root}/libs/llama_factory/data/dataset_info.json", "r") as in_file: registrations = json.load(in_file) if self.data_name in registrations: - self.data_path = f'./libs/llama_factory/data/{registrations[self.data_name]["file_name"]}' + self.data_path = f'{root}/libs/llama_factory/data/{registrations[self.data_name]["file_name"]}' print(f'Found {registrations[self.data_name]["file_name"]}.') else: raise FileNotFoundError( @@ -215,7 +212,7 @@ def transform( :return: The data after transformation. :rtype: Data. """ - out_path = f"./output/datasets/{result_data_name}.json" + out_path = f"{root}/output/datasets/{result_data_name}.json" Data.ask_and_remove_if_exists(out_path, forced_rewrite) def write_dict(sample_dict: Dict): @@ -385,19 +382,19 @@ def manage_llama_factory_registration( :return: A boolean meaning the registration status before this operation. :rtype: bool. """ - with open("./libs/llama_factory/data/dataset_info.json", "r") as in_file: + with open(f"{root}/libs/llama_factory/data/dataset_info.json", "r") as in_file: registrations = json.load(in_file) return_val = self.data_name in registrations if operation == "add": - path = f"./libs/llama_factory/data/{self.data_name}.json" + path = f"{root}/libs/llama_factory/data/{self.data_name}.json" if "llama_factory/data" not in self.data_path: Data.ask_and_remove_if_exists(path, forced_rewrite=True) os.system(f"cp {escape(self.data_path)} {escape(path)}") if operation == "add" and (forced_update or not return_val): - path = f"./libs/llama_factory/data/{self.data_name}.json" + path = f"{root}/libs/llama_factory/data/{self.data_name}.json" if "llama_factory" not in self.data_path: # if is not built-in dataset if ("prompt" not in self.key_fields) or ( @@ -423,22 +420,22 @@ def manage_llama_factory_registration( f"Adding registration of data {self.data_name}: {registrations[self.data_name]}." ) - with open("./libs/llama_factory/data/dataset_info.json", "w") as out_file: + with open(f"{root}/libs/llama_factory/data/dataset_info.json", "w") as out_file: json.dump(registrations, out_file) print(f"Successfully completed registration of data {self.data_name}.") elif operation == "remove" and return_val: with open( - "./libs/llama_factory/data/dataset_info_original.json", "r" + f"{root}/libs/llama_factory/data/dataset_info_original.json", "r" ) as in_file: registrations_original = json.load(in_file) assert self.data_name not in registrations_original - path = f'./libs/llama_factory/data/{registrations[self.data_name]["file_name"]}' + path = f'{root}/libs/llama_factory/data/{registrations[self.data_name]["file_name"]}' del registrations[self.data_name] - with open("./libs/llama_factory/data/dataset_info.json", "w") as out_file: + with open(f"{root}/libs/llama_factory/data/dataset_info.json", "w") as out_file: json.dump(registrations, out_file) if os.path.exists(path): @@ -636,7 +633,7 @@ def __init__( DataFileCollection(collection_name='histtext_1826_to_2018', data_type='pretrain', - collection_path = './dataset/dataset_text_sequence/', + collection_path = f'{root}/dataset/dataset_text_sequence/', file_selection_func = (lambda path: 1826 <= int(path.split('/')[-1][1:6]) <= 2018)) """ @@ -742,7 +739,7 @@ def transform( :param suppress_tqdm: Whether to suppress the tqdm progress bar :type suppress_tqdm: bool = False """ - result_dir = f"./output/datasets/{result_collection_name}/" + result_dir = f"{root}/output/datasets/{result_collection_name}/" DataFileCollection.ask_and_remove_if_exists(result_dir, forced_rewrite) os.makedirs(result_dir, exist_ok=True) @@ -836,7 +833,7 @@ def convert_to_Data( :param filter_fields: Fields to filter the data (default is None) :type filter_fields: Optional = None """ - path = f"./output/datasets/{result_data_name}.json" + path = f"{root}/output/datasets/{result_data_name}.json" Data.ask_and_remove_if_exists(path, forced_rewrite) with open(path, "w") as out_file: diff --git a/src/abstractions/model.py b/src/abstractions/model.py index 320f869..8a1f21b 100644 --- a/src/abstractions/model.py +++ b/src/abstractions/model.py @@ -1,3 +1,4 @@ +from src.path import root import math from src.abstractions.data import Data from src.evaluation.quantify import calculate_model @@ -7,7 +8,7 @@ import json import torch import warnings -import src.text_utils as tw +import src.utils.text_utils as tw import random import numpy as np from transformers import ( @@ -126,7 +127,7 @@ def __init__( Examples: .. code-block:: python - Model(model_name = 'Gemma-2B_sft', is_instruct_finetuned = True, model_path = './output/training_results/Gemma-2B_sft/') + Model(model_name = 'Gemma-2B_sft', is_instruct_finetuned = True, model_path = f'{root}/output/training_results/Gemma-2B_sft/') Model(model_name = 'Gemma-2B_sft', is_instruct_finetuned = True) """ @@ -158,7 +159,7 @@ def __init__( raise FileNotFoundError(f"{self.model_path} is not a repo ID.") new_path = os.path.join( - "./output/downloaded", self.model_path.split("/")[-1] + f"{root}/output/downloaded", self.model_path.split("/")[-1] ) download_model(self.model_path, new_path) self.model_path = new_path @@ -208,7 +209,7 @@ def deep_copy( if dest_suffix else dest_full_name ) - copied_path = os.path.join(os.path.join("output", dest_subdir), copied_name) + copied_path = os.path.join(os.path.join(root, "output", dest_subdir), copied_name) Model.ask_and_remove_if_exists(copied_path, forced_rewrite=False) if not os.path.exists(copied_path): shutil.copytree(path, copied_path) @@ -425,7 +426,7 @@ def finetune( cmd = bash_command_template % ( "pa38-lf" if num_nodes == 1 else "multinode-s", # conda environment deepspeed_args, # deepspeed settings - "./src/abstractions/configs/LF_examples/full_multi_gpu/ds_z3_config.json", # deepspeed config; this file usable for both full_param and lora + f"{root}/src/abstractions/configs/LF_examples/full_multi_gpu/ds_z3_config.json", # deepspeed config; this file usable for both full_param and lora ("pt" if stage == "pretrain" else stage), # stage - pt, sft, dpo "train", # current operation - train or predict "", # do sample; ignored here @@ -433,7 +434,7 @@ def finetune( data.data_name, # dataset (automatically registered in llama-factory) self.template_type, # template type ("lora" if algo == "lora" else "full"), # type - full_param or lora - f"./output/training_results/{escape(result_model_name)}/", # where to save the training results (and checkpoints etc.) + f"{root}/output/training_results/{escape(result_model_name)}/", # where to save the training results (and checkpoints etc.) 2 ** max( 0, 3 + batch_size_multiplier_log2 @@ -485,12 +486,12 @@ def finetune( print( stage + " model saved at ", - f"./output/training_results/{escape(result_model_name)}/", + f"{root}/output/training_results/{escape(result_model_name)}/", ) result = Model( model_name=result_model_name, is_instruct_finetuned=(self.is_instruct_finetuned or stage == "sft"), - model_path_or_repoid=f"./output/training_results/{escape(result_model_name)}/", + model_path_or_repoid=f"{root}/output/training_results/{escape(result_model_name)}/", num_gpus=self.num_gpus, ) @@ -501,7 +502,7 @@ def finetune( # for LORA models, merge the adapter with the model if algo == "lora": print("Merging LORA model...") - merged_model_path = f"./output/merged_lora_results/{result_model_name}" + merged_model_path = f"{root}/output/merged_lora_results/{result_model_name}" cmd = bash_command_for_lora_merging % ( "pa38-lf", self.model_path, @@ -544,7 +545,8 @@ def __rlhf( "RLHF is not supported for models with auto template type." ) - rw_results = "./" + os.path.join( + rw_results = os.path.join( + root, "output", "rlhf_results", os.path.basename(self.model_path) + "_reward_" + code + "_results", @@ -567,8 +569,8 @@ def __rlhf( cmd = bash_command_for_rw % ( "pa38-lf" if num_nodes == 1 else "multinode", # conda environment deepspeed_args, # deepspeed settings - # './src/abstractions/configs/LF_examples/deepspeed/ds_z2_config.json', - "./src/abstractions/configs/LF_examples/full_multi_gpu/ds_z3_config.json", + # f'{root}/src/abstractions/configs/LF_examples/deepspeed/ds_z2_config.json', + f"{root}/src/abstractions/configs/LF_examples/full_multi_gpu/ds_z3_config.json", rw_path, rw_data.data_name, self.template_type, @@ -614,8 +616,8 @@ def __rlhf( cmd = bash_command_for_ppo % ( "pa38-lf" if num_nodes == 1 else "multinode", # conda environment deepspeed_args, # deepspeed settings - # './src/abstractions/configs/LF_examples/full_multi_gpu/ds_z3_config.json', - "./src/abstractions/configs/LF_examples/deepspeed/ds_z2_config.json", + # f'{root}/src/abstractions/configs/LF_examples/full_multi_gpu/ds_z3_config.json', + f"{root}/src/abstractions/configs/LF_examples/deepspeed/ds_z2_config.json", self.model_path, rw_results, "lora" if use_lora else "full", @@ -838,7 +840,7 @@ def __inference_parallel_deepspeed( operation="add" ) - result_data_path = f"./output/inference_results/{escape(result_data_name)}/" + result_data_path = f"{root}/output/inference_results/{escape(result_data_name)}/" # run prediction deepspeed_args = ( @@ -849,7 +851,7 @@ def __inference_parallel_deepspeed( cmd = bash_command_template % ( "pa38-lf", # conda environment deepspeed_args, # num_gpus; only set if CUDA_VISIBLE_DEVICES is not set - "./src/abstractions/configs/LF_examples/full_multi_gpu/ds_z3_config.json", # deepspeed config; this file usable for both full_param and lora + f"{root}/src/abstractions/configs/LF_examples/full_multi_gpu/ds_z3_config.json", # deepspeed config; this file usable for both full_param and lora ("sft" if data.data_type != "pretrain" else "pt"), # stage - sft or pt "predict", # current operation - train or predict "\n--do_sample \\", # do sample @@ -953,9 +955,9 @@ def __inference_serial( """Serial implementation for `inference()`.""" data_name = result_data_name # self.model_name + "_inference_output" - os.makedirs(os.path.join("output", "inference_results", "inf"), exist_ok=True) + os.makedirs(os.path.join(root, "output", "inference_results", "inf"), exist_ok=True) data_path = os.path.join( - "output", "inference_results", "inf", data_name + ".json" + root, "output", "inference_results", "inf", data_name + ".json" ) with tw.JsonListWriter( @@ -1018,13 +1020,13 @@ def __evaluate_fast(self, logprobs = False) -> np.ndarray: "Fast evaluation is only supported for models using alpaca template." ) - if not os.path.exists("output"): - os.mkdir("output") - if not os.path.exists(os.path.join("output", "evaluation_results")): - os.mkdir(os.path.join("output", "evaluation_results")) + if not os.path.exists(f"{root}/output"): + os.mkdir(f"{root}/output") + if not os.path.exists(os.path.join(root, "output", "evaluation_results")): + os.mkdir(os.path.join(root, "output", "evaluation_results")) # output csv goes here experiment_directory = os.path.join( - "output", "evaluation_results", self.model_name + "_single" + root, "output", "evaluation_results", self.model_name + "_single" ) if not os.path.exists(experiment_directory): os.mkdir(experiment_directory) @@ -1064,11 +1066,11 @@ def __evaluate_slow_moralchoice(self) -> np.ndarray: This method returns an incomplete result and is slow; therefore it's abandoned. """ - if not os.path.exists(os.path.join("output", "evaluation_results")): - os.mkdir(os.path.join("output", "evaluation_results")) + if not os.path.exists(os.path.join(root, "output", "evaluation_results")): + os.mkdir(os.path.join(root, "output", "evaluation_results")) directory = os.path.join( - "libs", "moralchoice", "data", "responses", self.model_name + "_single" + root, "libs", "moralchoice", "data", "responses", self.model_name + "_single" ) if os.path.exists(directory) and ( @@ -1089,13 +1091,13 @@ def __evaluate_slow_moralchoice(self) -> np.ndarray: low_vec = calculate_model( self.model_name + "_single", "low", - os.path.join("output", "evaluation_results"), + os.path.join(root, "output", "evaluation_results"), self.model_name, ) high_vec = calculate_model( self.model_name + "_single", "high", - os.path.join("output", "evaluation_results"), + os.path.join(root, "output", "evaluation_results"), self.model_name, ) return 1 / 3 * low_vec + 2 / 3 * high_vec diff --git a/src/cleanser/gpt_cleanser.py b/src/cleanser/gpt_cleanser.py index 6dd9970..c0c6cc0 100644 --- a/src/cleanser/gpt_cleanser.py +++ b/src/cleanser/gpt_cleanser.py @@ -1,13 +1,14 @@ +from src.path import root from typing import List, Tuple, Dict, Optional, Set from src.abstractions import DataFileCollection -from src.gpt_utils import ( +from src.utils.gpt_utils import ( context_len, gpt, convo_get_response, convo_clear_history, independent_get_response, ) -from src.text_utils import write_log +from src.utils.text_utils import write_log from threading import Thread, Lock from concurrent.futures import ThreadPoolExecutor, as_completed import time diff --git a/src/cleanser/localllm_cleanser.py b/src/cleanser/localllm_cleanser.py index f77c5df..6580381 100644 --- a/src/cleanser/localllm_cleanser.py +++ b/src/cleanser/localllm_cleanser.py @@ -1,9 +1,10 @@ +from src.path import root from collections import Counter import os from vllm import LLM, SamplingParams from src.abstractions import Model, Data, fill_in_QA_template, DataFileCollection from typing import List, Tuple, Dict, Optional, Set -from src.text_utils import write_log, JsonListWriter +from src.utils.text_utils import write_log, JsonListWriter import time import re diff --git a/src/cleanser/rule_based_cleanser.py b/src/cleanser/rule_based_cleanser.py index 8360d00..4ebf18a 100644 --- a/src/cleanser/rule_based_cleanser.py +++ b/src/cleanser/rule_based_cleanser.py @@ -1,6 +1,7 @@ +from src.path import root import os, json import re, unicodedata -import src.text_utils as tw +import src.utils.text_utils as tw from tqdm import tqdm last_cleaned = "" @@ -97,6 +98,6 @@ def cleanse(dataset_path, to_path): if __name__ == "__main__": cleanse( - "../../shared_storage/our_datasets/HisText_Mar8_Guten_EEBO_PoL_IA10_unrefined/", - "../../shared_storage/our_datasets/HisText_Mar8_Guten_EEBO_PoL_IA10_rulebased_refined/", + f"{root}/../../shared_storage/our_datasets/HisText_Mar8_Guten_EEBO_PoL_IA10_unrefined/", + f"{root}/../../shared_storage/our_datasets/HisText_Mar8_Guten_EEBO_PoL_IA10_rulebased_refined/", ) diff --git a/src/data_analysis/metadata_analysis.py b/src/data_analysis/metadata_analysis.py index 496d8bd..3a05e64 100644 --- a/src/data_analysis/metadata_analysis.py +++ b/src/data_analysis/metadata_analysis.py @@ -4,7 +4,7 @@ import matplotlib.pyplot as plt import scipy.stats as stats from tqdm import tqdm -from src import text_utils +from src import utils import logging mpl_logger = logging.getLogger("matplotlib") @@ -14,6 +14,8 @@ import glob import pandas as pd +from src.path import root + # %% def metadata_desc(lst_content_length, lst_source_dataset, lst_culture, lst_language): @@ -138,11 +140,11 @@ def main(): # iteratively read in all json files in directory for pathname in tqdm( glob.iglob( - "dataset/dataset_text_sequence/histext_1826_to_2018_collection_G/**/**/*.json", + f"{root}/dataset/dataset_text_sequence/histext_1826_to_2018_collection_G/**/**/*.json", recursive=True, ) ): - dict_iterator = text_utils.read_json_memory_efficient(pathname) + dict_iterator = utils.read_json_memory_efficient(pathname) for doc in tqdm(dict_iterator): # extract metadata lst_content_length.append(len(doc.get("content", ""))) diff --git a/src/data_analysis/word2vec_gensim.py b/src/data_analysis/word2vec_gensim.py index aba3155..5a2d463 100644 --- a/src/data_analysis/word2vec_gensim.py +++ b/src/data_analysis/word2vec_gensim.py @@ -1,3 +1,4 @@ +from src.path import root import numpy as np import scipy import json diff --git a/src/download_models.py b/src/download_models.py index a09166e..7e790cd 100644 --- a/src/download_models.py +++ b/src/download_models.py @@ -1,3 +1,4 @@ +from src.path import root import subprocess @@ -16,11 +17,11 @@ def download_all_models(download_8B=True, download_70B=False): for i in range(13, 22): download_model( f"PKU-Alignment/ProgressGym-HistLlama3-8B-C0{i}-pretrain", - f"./dataset/dataset_model_sequence/8B-C0{i}-pretrain", + f"{root}/dataset/dataset_model_sequence/8B-C0{i}-pretrain", ) if download_70B: for i in range(13, 22): download_model( f"PKU-Alignment/ProgressGym-HistLlama3-70B-C0{i}-instruct", - f"./dataset/dataset_model_sequence/70B-C0{i}-instruct", + f"{root}/dataset/dataset_model_sequence/70B-C0{i}-instruct", ) diff --git a/src/eebo/download_eebo.py b/src/eebo/download_eebo.py index c788aa0..4326e47 100644 --- a/src/eebo/download_eebo.py +++ b/src/eebo/download_eebo.py @@ -1,8 +1,9 @@ +from src.path import root import os from time import sleep -def download_eebo(path: str = "./dataset/raw_downloads/EEBO.zip"): +def download_eebo(path: str = f"{root}/dataset/raw_downloads/EEBO.zip"): if os.path.exists(path.replace(".zip", "")): print(f"EEBO already exists; continue") return diff --git a/src/eebo/process_eebo.py b/src/eebo/process_eebo.py index 4dee0df..aa1e02b 100644 --- a/src/eebo/process_eebo.py +++ b/src/eebo/process_eebo.py @@ -1,8 +1,9 @@ +from src.path import root import xml.etree.ElementTree as ET import os from tqdm import tqdm import json -import src.text_utils as tw +import src.utils.text_utils as tw # a utility function called by build_eebo_dataset, to read the contents in an eebo xml file in a suitable manner @@ -25,7 +26,7 @@ def remove_tag(root: ET.Element, tagname: str): # if download_eebo is already called, build_eebo_dataset is the only thing you need to call to build the EEBO dataset -def build_eebo_dataset(eebo_path: str = "./dataset/raw_downloads/EEBO/"): +def build_eebo_dataset(eebo_path: str = f"{root}/dataset/raw_downloads/EEBO/"): for phase_num in [1, 2]: print(f"start building dataset from EEBO Phase {phase_num} (2 in total)") root_dir = os.path.join(eebo_path, f"eebo_phase{phase_num}") diff --git a/src/evaluation/figure.py b/src/evaluation/figure.py index 500b25a..1924251 100644 --- a/src/evaluation/figure.py +++ b/src/evaluation/figure.py @@ -2,6 +2,7 @@ import quantify as qt import json import numpy as np +from src.path import root def get_dim(template_raw, template_out, idx): @@ -21,10 +22,10 @@ def get_dim(template_raw, template_out, idx): template1 = ( - "output/evaluation_results/8b-C0{n}-instruct_single/8b-C0{n}-instructraw.json" + f"{root}/" + "output/evaluation_results/8b-C0{n}-instruct_single/8b-C0{n}-instructraw.json" ) template2 = ( - "output/evaluation_results/8b-C0{n}-instruct_single/8b-C0{n}-instructdim.json" + f"{root}/" + "output/evaluation_results/8b-C0{n}-instruct_single/8b-C0{n}-instructdim.json" ) idx = [13, 14, 15, 16, 17, 18, 19, 20, 21] vecs = get_dim(template1, template2, idx) diff --git a/src/evaluation/metric.py b/src/evaluation/metric.py index 7b7942e..d2f3dfb 100644 --- a/src/evaluation/metric.py +++ b/src/evaluation/metric.py @@ -1,5 +1,6 @@ import os, json import numpy as np +from src.path import root def cosine_similarity(v1, v2): @@ -113,7 +114,7 @@ def predict(path): continue score = get_score_predict(os.path.join(path, p)) result.update(score) - with open("output/benchmark_results/predict.json", "w") as f: + with open(f"{root}/output/benchmark_results/predict.json", "w") as f: json.dump(result, f) @@ -128,7 +129,7 @@ def coevolve(path): continue score = get_score_coevolve(os.path.join(path, p)) result.update(score) - with open("output/benchmark_results/coevolve.json", "w") as f: + with open(f"{root}/output/benchmark_results/coevolve.json", "w") as f: json.dump(result, f) @@ -143,13 +144,13 @@ def follow(path): continue score = get_score_follow(os.path.join(path, p)) result.update(score) - with open("output/benchmark_results/follow.json", "w") as f: + with open(f"{root}/output/benchmark_results/follow.json", "w") as f: json.dump(result, f) -# predict('output/benchmark_results/complete_results') -# coevolve('output/benchmark_results/complete_results') -# follow('output/benchmark_results/complete_results') +# predict(f'{root}/output/benchmark_results/complete_results') +# coevolve(f'{root}/output/benchmark_results/complete_results') +# follow(f'{root}/output/benchmark_results/complete_results') def calculate_score(key, dict): diff --git a/src/evaluation/quantify.py b/src/evaluation/quantify.py index d4020bd..468ebce 100644 --- a/src/evaluation/quantify.py +++ b/src/evaluation/quantify.py @@ -12,6 +12,7 @@ from scipy.stats import linregress from numpy.polynomial.polynomial import Polynomial from scipy.stats import f +from src.path import root # [cnt] key - scenario code : val - dict (of ab_cnt, ab_1, cp_cnt, cp_1, rp_cnt, rp_1) + moral_vec # [moral_vec] key - scenario code : val - its moral vec. kth = 0 indicates not related dimenson, = 1 indicates preferred dimension, = 1 indicates rejected dimension @@ -31,11 +32,11 @@ "violate duties", ] -if not os.path.exists("output/evaluation_results/figs"): - os.makedirs("output/evaluation_results/figs") +if not os.path.exists(f"{root}/output/evaluation_results/figs"): + os.makedirs(f"{root}/output/evaluation_results/figs") -if not os.path.exists("logs/eval"): - os.makedirs("logs/eval") +if not os.path.exists(f"{root}logs/eval"): + os.makedirs(f"{root}/logs/eval") def _calculate_model(test_name, high_or_low, model_name): @@ -49,10 +50,10 @@ def _calculate_model(test_name, high_or_low, model_name): ) )(high_or_low) raw_dir = os.path.join( - "output", "evaluation_results", test_name, model_name + "_raw.json" + root, "output", "evaluation_results", test_name, model_name + "_raw.json" ) scenario_dir = os.path.join( - "src", "moralchoice", "assets", "data", "scenarios", scenario + ".csv" + root, "src", "moralchoice", "assets", "data", "scenarios", scenario + ".csv" ) mrl_vec = {} result = {} @@ -108,7 +109,7 @@ def _calculate_model(test_name, high_or_low, model_name): avg_vec = sum([np.array(x[1]) for x in mrl_vec.items()]) / len(mrl_vec.items()) with open( os.path.join( - "output", "evaluation_results", test_name, model_name + "_collected.json" + root, "output", "evaluation_results", test_name, model_name + "_collected.json" ), "w", ) as f: @@ -156,9 +157,9 @@ def calculate_model(test_dir, model_name): raw_dict = json.load(f) ref_dir = [ - "src/evaluation/raw_dataset/moralchoice/final.csv", - "src/evaluation/raw_dataset/foundation/final.csv", - "src/evaluation/raw_dataset/views/final.csv", + f"{root}/src/evaluation/raw_dataset/moralchoice/final.csv", + f"{root}/src/evaluation/raw_dataset/foundation/final.csv", + f"{root}/src/evaluation/raw_dataset/views/final.csv", ] # ref_dict = [csv_to_dict_list(t) for t in ref_dir] @@ -224,8 +225,8 @@ def calculate_model(test_dir, model_name): """ logging invalid """ - mode = "a+" if os.path.exists("logs/eval/log.txt") else "w" - with open("logs/eval/log.txt", mode) as f: + mode = "a+" if os.path.exists(f"{root}/logs/eval/log.txt") else "w" + with open(f"{root}/logs/eval/log.txt", mode) as f: f.write("invalid, " + key + ", count, " + str(valid_cnt) + "\n") invalid[2] += 1 continue @@ -262,8 +263,8 @@ def calculate_model(test_dir, model_name): """ logging invalid """ - mode = "a+" if os.path.exists("logs/eval/log.txt") else "w" - with open("logs/eval/log.txt", mode) as f: + mode = "a+" if os.path.exists(f"{root}/logs/eval/log.txt") else "w" + with open(f"{root}/logs/eval/log.txt", mode) as f: f.write("invalid, " + key + ", count, " + str(valid_cnt) + "\n") invalid[1] += 1 continue @@ -356,7 +357,7 @@ def plot_parallel_coordinates(data, title, tuples): plt.ylabel("Value") plt.title("Parallel coordinate plot of variations in high dimensional data") plt.show() - plt.savefig("output/evaluation_results/figs/" + title + "_parr.png") + plt.savefig(f"{root}/output/evaluation_results/figs/" + title + "_parr.png") def plot_heatmap(data, title, label_set, tuples = None, norm = "column"): """ @@ -414,7 +415,7 @@ def plot_heatmap(data, title, label_set, tuples = None, norm = "column"): plt.xlabel("Dimensions") plt.ylabel("HistLlama") plt.show() - plt.savefig("output/evaluation_results/figs/" + title + "_heat.png") + plt.savefig(f"{root}/output/evaluation_results/figs/" + title + "_heat.png") def plot_vectors(vectors, dim_start, name): print(vectors) @@ -444,7 +445,7 @@ def plot_vectors(vectors, dim_start, name): plt.show() plt.savefig( - "output/evaluation_results/figs/" + name_mapping[name] + "_line_real.png" + f"{root}/output/evaluation_results/figs/" + name_mapping[name] + "_line_real.png" ) @@ -464,7 +465,7 @@ def analyze_vectors_quadratic(vectors): p_values = [] positive_coefficients = [] negative_coefficients = [] - np.savetxt("output/evaluation_results/figs/quad.txt", vectors, fmt='%f') + np.savetxt(f"{root}/output/evaluation_results/figs/quad.txt", vectors, fmt='%f') for dim in range(num_dimensions): x = np.arange(len(vectors)) y = vectors[:, dim] @@ -490,7 +491,7 @@ def analyze_vectors_quadratic(vectors): # Print coefficients and p-values print("Quadratic coefficients and p-values for each dimension:") - with open("output/evaluation_results/figs/quad.txt", 'a') as f: + with open(f"{root}/output/evaluation_results/figs/quad.txt", 'a') as f: for i, (coeffs, p_value) in enumerate(zip(coefficients, p_values)): print(f"Dimension {i + 1}: coefficients = {coeffs}, p-value = {p_value}") f.write(f"Dimension {i + 1}: coefficients = {coeffs}, p-value = {p_value}\n\n") @@ -514,7 +515,7 @@ def analyze_vectors_quadratic(vectors): plt.legend() plt.show() - plt.savefig("output/evaluation_results/figs/quad.png") + plt.savefig(f"{root}/output/evaluation_results/figs/quad.png") def analyze_vectors(vectors): @@ -560,4 +561,4 @@ def analyze_vectors(vectors): plt.legend() plt.show() - plt.savefig("output/evaluation_results/lin.png") + plt.savefig(f"{root}/output/evaluation_results/lin.png") diff --git a/src/evaluation/test_eval_01.py b/src/evaluation/test_eval_01.py index 849cf95..5725def 100644 --- a/src/evaluation/test_eval_01.py +++ b/src/evaluation/test_eval_01.py @@ -1,15 +1,15 @@ import os, json # os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3" - +from src.path import root from ..abstractions import Model from .utils import generate_alpaca, _collect from multiprocessing import freeze_support from . import quantify as qt import numpy as np """ -generate_alpaca('mc', os.path.join('src', 'evaluation', 'raw_dataset', 'moralchoice')) -generate_alpaca('views', os.path.join('src', 'evaluation', 'raw_dataset', 'views')) -generate_alpaca('foundation', os.path.join('src', 'evaluation', 'raw_dataset', 'foundation')) +generate_alpaca('mc', os.path.join(root, 'src', 'evaluation', 'raw_dataset', 'moralchoice')) +generate_alpaca('views', os.path.join(root, 'src', 'evaluation', 'raw_dataset', 'views')) +generate_alpaca('foundation', os.path.join(root, 'src', 'evaluation', 'raw_dataset', 'foundation')) """ if __name__ == "__main__": freeze_support() @@ -30,21 +30,21 @@ boi = Model(m) v = boi.evaluate(method="fast", logprobs = True) ''' - with open("output/datasets/evaluation_output_mc_" + m + ".json", 'r') as f: + with open(f"{root}/output/datasets/evaluation_output_mc_" + m + ".json", 'r') as f: d = json.load(f) raw = _collect(d) - with open('output/evaluation_results/' + m + '_single/' + m + '_raw.json', 'w') as f: + with open(f'{root}/output/evaluation_results/' + m + '_single/' + m + '_raw.json', 'w') as f: json.dump(raw, f) - v = qt.calculate_model('output/evaluation_results/' + m + '_single/', m) + v = qt.calculate_model(f'{root}/output/evaluation_results/' + m + '_single/', m) ''' vec.append(v) test_name = "logprob_test" - with open("output/evaluation_results/" + test_name + ".json", "w") as f: + with open(f"{root}/output/evaluation_results/" + test_name + ".json", "w") as f: lst = [list(boi) for boi in vec] json.dump(lst, f) vec = np.array(vec) # qt.analyze_vectors_quadratic(vec) - # vec = json.load(open("output/evaluation_results/" + test_name + ".json", "r")) + # vec = json.load(open(f"{root}/output/evaluation_results/" + test_name + ".json", "r")) # qt.plot_parallel_coordinates(vec) qt.plot_heatmap(vec[:, 10:15], test_name + '_foundation', label_set = 2, norm = "group") qt.plot_heatmap(vec[:, 15:19], test_name + '_view',label_set = 3, norm = "group") diff --git a/src/evaluation/test_eval_02.py b/src/evaluation/test_eval_02.py index 3a04a65..508c957 100644 --- a/src/evaluation/test_eval_02.py +++ b/src/evaluation/test_eval_02.py @@ -4,10 +4,11 @@ from multiprocessing import freeze_support import src.evaluation.quantify as qt import numpy as np +from src.path import root """ -generate_alpaca('mc', os.path.join('src', 'evaluation', 'raw_dataset', 'moralchoice')) -generate_alpaca('views', os.path.join('src', 'evaluation', 'raw_dataset', 'views')) -generate_alpaca('foundation', os.path.join('src', 'evaluation', 'raw_dataset', 'foundation')) +generate_alpaca('mc', os.path.join(root, 'src', 'evaluation', 'raw_dataset', 'moralchoice')) +generate_alpaca('views', os.path.join(root, 'src', 'evaluation', 'raw_dataset', 'views')) +generate_alpaca('foundation', os.path.join(root, 'src', 'evaluation', 'raw_dataset', 'foundation')) """ if __name__ == "__main__": os.environ["CUDA_VISIBLE_DEVICE"] = "0" @@ -19,16 +20,16 @@ ] vec = [] for m in set_model: - with open("output/datasets/evaluation_output_mc_" + m + ".json", 'r') as f: + with open(f"{root}/output/datasets/evaluation_output_mc_" + m + ".json", 'r') as f: d = json.load(f) raw = collect(d, logprobs = True) - with open('output/evaluation_results/' + m + '_single/' + m + '_raw.json', 'w') as f: + with open(f'{root}/output/evaluation_results/' + m + '_single/' + m + '_raw.json', 'w') as f: json.dump(raw, f) - v = qt.calculate_model('output/evaluation_results/' + m + '_single/', m) + v = qt.calculate_model(f'{root}/output/evaluation_results/' + m + '_single/', m) vec.append(v) test_name = "8b_all_fixed" vec = np.array(vec) - with open("output/evaluation_results/" + test_name + ".json", "w") as f: + with open(f"{root}/output/evaluation_results/" + test_name + ".json", "w") as f: lst = [list(boi) for boi in vec] json.dump(lst, f) qt.plot_heatmap(vec[:, 10:15], test_name + '_foundation', label_set = 2, norm = "group") diff --git a/src/evaluation/utils.py b/src/evaluation/utils.py index 2149abf..dc0904a 100644 --- a/src/evaluation/utils.py +++ b/src/evaluation/utils.py @@ -1,3 +1,4 @@ +from src.path import root import json, os, argparse import statistics, itertools import csv, Levenshtein @@ -6,30 +7,30 @@ import copy repeat = 1 -assets = os.path.join("src", "evaluation", "assets") +assets = os.path.join(root, "src", "evaluation", "assets") -if not os.path.exists("logs/eval"): - os.makedirs("logs/eval") +if not os.path.exists(f"{root}/logs/eval"): + os.makedirs(f"{root}/logs/eval") def regenerate_inputs(logprobs=False) -> Data: - input_file = os.path.join("src", "evaluation", "assets", "input_alpaca.json") + input_file = os.path.join(root, "src", "evaluation", "assets", "input_alpaca.json") if os.path.exists(input_file): os.remove(input_file) generate_alpaca( - "mc", os.path.join("src", "evaluation", "raw_dataset", "moralchoice"), rearrange = True, logprobs=logprobs + "mc", os.path.join(root, "src", "evaluation", "raw_dataset", "moralchoice"), rearrange = True, logprobs=logprobs ) - generate_alpaca("views", os.path.join("src", "evaluation", "raw_dataset", "views"), rearrange = True, logprobs=logprobs) + generate_alpaca("views", os.path.join(root, "src", "evaluation", "raw_dataset", "views"), rearrange = True, logprobs=logprobs) generate_alpaca( - "foundation", os.path.join("src", "evaluation", "raw_dataset", "foundation"), rearrange = True, logprobs=logprobs + "foundation", os.path.join(root, "src", "evaluation", "raw_dataset", "foundation"), rearrange = True, logprobs=logprobs ) result = Data( "evaluation", data_type="sft", - data_path="./src/evaluation/assets/input_alpaca.json", + data_path=f"{root}/src/evaluation/assets/input_alpaca.json", ) result.set_key_fields(prompt_field_name="instruction", query_field_name="input") return result @@ -116,7 +117,7 @@ def semantic_matching(item, mapping, four=False, verbal=False): optionC, optionD = item["action" + str(1+ mapping.index(3))], item["action" + str(1+ mapping.index(4))] response_template = os.path.join( - "src", "evaluation", "assets", "data", "response_templates" + root, "src", "evaluation", "assets", "data", "response_templates" ) with open(os.path.join(response_template, "refusals.txt"), "r") as f: refusals = f.readlines() @@ -244,8 +245,8 @@ def semantic_matching(item, mapping, four=False, verbal=False): """ logging invalid """ - mode = "a+" if os.path.exists("logs/eval/log_sem.txt") else "w+" - with open("logs/eval/log_sem.txt", mode) as f: + mode = "a+" if os.path.exists(f"{root}/logs/eval/log_sem.txt") else "w+" + with open(f"{root}/logs/eval/log_sem.txt", mode) as f: f.write( "answer [" + answer + "];\n templates [" + ";".join(answers_action1) + "]\n" ) @@ -338,7 +339,7 @@ def __collect(output_data): if prob > old_prob: middle[s_id][q_type][mapping_id][0] = predict_id middle[s_id][q_type][mapping_id][1] = prob - with open('output/evaluation_results/middle.json', 'a+') as f: + with open(f'{root}/output/evaluation_results/middle.json', 'a+') as f: record = copy.deepcopy(middle) for x in record.keys(): for y in record[x].keys(): @@ -532,7 +533,7 @@ def generate_alpaca(source: str, dir: str, rearrange = True, logprobs = False): output_list_dic.extend([boi_ab, boi_compare, boi_repeat]) try: with open( - os.path.join("src", "evaluation", "assets", "input_alpaca.json"), "r" + os.path.join(root, "src", "evaluation", "assets", "input_alpaca.json"), "r" ) as f: temp = json.load(f) except: @@ -541,7 +542,7 @@ def generate_alpaca(source: str, dir: str, rearrange = True, logprobs = False): temp.extend(output_list_dic) with open( - os.path.join("src", "evaluation", "assets", "input_alpaca.json"), "w" + os.path.join(root, "src", "evaluation", "assets", "input_alpaca.json"), "w" ) as f: json.dump(temp, f) print("done", source) @@ -636,7 +637,7 @@ def generate_alpaca(source: str, dir: str, rearrange = True, logprobs = False): with open( - os.path.join("src", "evaluation", "assets", "input_alpaca.json"), "r" + os.path.join(root, "src", "evaluation", "assets", "input_alpaca.json"), "r" ) as f: temp = json.load(f) @@ -644,7 +645,7 @@ def generate_alpaca(source: str, dir: str, rearrange = True, logprobs = False): temp.extend(output_list_dic) with open( - os.path.join("src", "evaluation", "assets", "input_alpaca.json"), "w" + os.path.join(root, "src", "evaluation", "assets", "input_alpaca.json"), "w" ) as f: json.dump(temp, f) print("done", source, cut, "made the cut") @@ -688,19 +689,19 @@ def collect_dim(output_from_collect): look_up_dict = [] look_up_dict.append( csv_to_dict( - "src/evaluation/raw_dataset/moralchoice/final.csv", + f"{root}/src/evaluation/raw_dataset/moralchoice/final.csv", ["scenario_id", "generation_rule"], ) ) look_up_dict.append( csv_to_dict( - "src/evaluation/raw_dataset/foundation/final.csv", + f"{root}/src/evaluation/raw_dataset/foundation/final.csv", ["scenario_id", "generation_theme"], ) ) look_up_dict.append( csv_to_dict( - "src/evaluation/raw_dataset/views/final.csv", + f"{root}/src/evaluation/raw_dataset/views/final.csv", ["scenario_id", "generation_theme"], ) ) diff --git a/src/gutenberg/get_meta.py b/src/gutenberg/get_meta.py index f69a079..4c52ea6 100644 --- a/src/gutenberg/get_meta.py +++ b/src/gutenberg/get_meta.py @@ -1,10 +1,11 @@ -import src.text_utils as tw +from src.path import root +import src.utils.text_utils as tw import os, json import csv from tqdm import tqdm """ -This file is to be excecuted after get_data.py. Reads metadata from raw files and writes them via text_utils API +This file is to be excecuted after get_data.py. Reads metadata from raw files and writes them via utils API """ @@ -61,7 +62,7 @@ def gather_meta(raw_dir, record): """ with open( os.path.join( - "dataset", "raw_downloads", "Gutenberg", "metadata", "metadata.csv" + root, "dataset", "raw_downloads", "Gutenberg", "metadata", "metadata.csv" ) ) as file: reader = csv.reader(file) diff --git a/src/gutenberg/src/bookshelves.py b/src/gutenberg/src/bookshelves.py index c74205b..f0d007b 100644 --- a/src/gutenberg/src/bookshelves.py +++ b/src/gutenberg/src/bookshelves.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- """Functions to download, parse and filter Gutenberg's bookshelves.""" +from src.path import root import os import glob import numpy as np diff --git a/src/gutenberg/src/metadataparser.py b/src/gutenberg/src/metadataparser.py index f668435..3858620 100644 --- a/src/gutenberg/src/metadataparser.py +++ b/src/gutenberg/src/metadataparser.py @@ -5,6 +5,7 @@ Based on https://bitbucket.org/c-w/gutenberg/ """ +from src.path import root import os import re import tarfile diff --git a/src/gutenberg/src/metaquery.py b/src/gutenberg/src/metaquery.py index 0c2938b..699acff 100644 --- a/src/gutenberg/src/metaquery.py +++ b/src/gutenberg/src/metaquery.py @@ -9,6 +9,7 @@ """ +from src.path import root import os import pandas as pd import numpy as np diff --git a/src/internet_archive/get_sources.py b/src/internet_archive/get_sources.py index 4161eeb..11edd69 100644 --- a/src/internet_archive/get_sources.py +++ b/src/internet_archive/get_sources.py @@ -1,14 +1,15 @@ +from src.path import root import os, json, requests -import src.text_utils as tw +import src.utils.text_utils as tw from tqdm import tqdm import re import time def build_internet_archive_LibOfCong(max_hours: int = None): - ia_path = "./src/internet_archive/ia" + ia_path = f"{root}/src/internet_archive/ia" identifier_list_path = ( - "./dataset/raw_downloads/internet_archive_identifier_list_libofcong.txt" + f"{root}/dataset/raw_downloads/internet_archive_identifier_list_libofcong.txt" ) os.system(f"chmod +x {ia_path}") diff --git a/src/model_training/curate_prompts.py b/src/model_training/curate_prompts.py index 0b72c21..adcbb02 100644 --- a/src/model_training/curate_prompts.py +++ b/src/model_training/curate_prompts.py @@ -94,14 +94,15 @@ def __init__(self, collection_name: str, data_type: Literal['pretrain', 'sft', ' ``` """ +from src.path import root from src.abstractions import Data, DataFileCollection, fill_in_QA_template -from src.text_utils import write_log, read_json_memory_efficient +from src.utils.text_utils import write_log, read_json_memory_efficient # Multithreading for GPT interaction import threading from threading import Thread, Lock from concurrent.futures import ThreadPoolExecutor, as_completed -from src.gpt_utils import context_len, independent_get_response +from src.utils.gpt_utils import context_len, independent_get_response # For converting jsonl to json import json diff --git a/src/model_training/train_hislm.py b/src/model_training/train_hislm.py index dbd9d38..f09a8e8 100644 --- a/src/model_training/train_hislm.py +++ b/src/model_training/train_hislm.py @@ -1,5 +1,6 @@ +from src.path import root from src.abstractions import Model, Data, DataFileCollection -from src.text_utils import write_log +from src.utils.text_utils import write_log from collections import defaultdict from typing import List import os @@ -260,8 +261,8 @@ def run_training(dataset_dir: str, models_save_dir: str, num_gpus: int = None): "lang_stat": lang_stat, } - os.makedirs("./logs", exist_ok=True) - with open("./logs/century_stats.json", "w") as f: + os.makedirs(f"{root}/logs", exist_ok=True) + with open(f"{root}/logs/century_stats.json", "w") as f: json.dump(century_stats, f) # Start training diff --git a/src/path.py b/src/path.py new file mode 100644 index 0000000..c8bf47c --- /dev/null +++ b/src/path.py @@ -0,0 +1,3 @@ +import os, sys +root = "src".join(os.path.dirname(os.path.abspath(__file__)).split("src")[:-1]) +print(f"Library root directory: {root}") \ No newline at end of file diff --git a/src/pile_of_law/get_data.py b/src/pile_of_law/get_data.py index 0b29a70..770a76c 100644 --- a/src/pile_of_law/get_data.py +++ b/src/pile_of_law/get_data.py @@ -1,8 +1,9 @@ +from src.path import root import requests from bs4 import BeautifulSoup import os, json import lzma -import src.text_utils as tw +import src.utils.text_utils as tw from tqdm import tqdm from copy import copy @@ -18,8 +19,8 @@ def get_pile_of_law(): """ DOWNLOADING COMPRESSED FILES FROM HUGGINGFACE """ - compressed_dir = "./dataset/raw_downloads/pile_of_law_compressed" - decompressed_dir = "./dataset/raw_downloads/pile_of_law_decompressed" + compressed_dir = f"{root}/dataset/raw_downloads/pile_of_law_compressed" + decompressed_dir = f"{root}/dataset/raw_downloads/pile_of_law_decompressed" if not os.path.isdir(compressed_dir): os.mkdir(compressed_dir) if not os.path.isdir(decompressed_dir): @@ -106,7 +107,7 @@ def read_jsonl_and_concat(jsonl_dir): dd["culture"] = "English" dd["source_dataset"] = "Pile_of_Law" - with open("./src/pile_of_law/source.json", "r") as f: + with open(f"{root}/src/pile_of_law/source.json", "r") as f: source_dict = json.load(f) dd["source_dataset_detailed"] = "Pile_of_Law_" + name dd["source_dataset_detailed_explanation"] = ( diff --git a/src/utils/data_utils/__init__.py b/src/utils/data_utils/__init__.py new file mode 100644 index 0000000..df8e89b --- /dev/null +++ b/src/utils/data_utils/__init__.py @@ -0,0 +1,2 @@ +from .extrapolation_utils import * +from .rw_utils import * \ No newline at end of file diff --git a/algorithms/utils/extrapolation_utils.py b/src/utils/data_utils/extrapolation_utils.py similarity index 94% rename from algorithms/utils/extrapolation_utils.py rename to src/utils/data_utils/extrapolation_utils.py index 619d0c3..bb7e8c8 100644 --- a/algorithms/utils/extrapolation_utils.py +++ b/src/utils/data_utils/extrapolation_utils.py @@ -1,15 +1,8 @@ from copy import deepcopy from typing import Iterable, Tuple, Dict, List, Literal, Union -from src.abstractions import Model, Data -from time import strftime, localtime -import os, sys -import random -import pandas as pd +# from src.abstractions import Data import json -import datasets -from src.text_utils import write_log import warnings -from tqdm import tqdm from sympy import binomial import numpy as np @@ -19,8 +12,8 @@ def extrapolate( current_timestep: int, timesteps_ahead: int, extrapolation_order: int, - preference_history: List[Data], -) -> Data: + preference_history: List["Data"], +) -> "Data": """Extrapolate the preference data using the preference history, looking timesteps_ahead timesteps ahead.""" exp_order = min(extrapolation_order, len(preference_history) - 1) diff --git a/algorithms/utils/rw_utils.py b/src/utils/data_utils/rw_utils.py similarity index 97% rename from algorithms/utils/rw_utils.py rename to src/utils/data_utils/rw_utils.py index b3e2d92..3e45a7d 100644 --- a/algorithms/utils/rw_utils.py +++ b/src/utils/data_utils/rw_utils.py @@ -1,12 +1,12 @@ +from src.path import root from typing import Iterable, Tuple, Dict, List, Literal -from src.abstractions import Model, Data, fill_in_QA_template +from src.abstractions import Data from time import strftime, localtime import os, sys import random import pandas as pd import json from datasets import load_dataset -from src.text_utils import write_log import src.evaluation.utils as eval_utils from benchmark import JudgeBase, ExamineeBase, PredictJudge import warnings @@ -213,6 +213,7 @@ def elicit_rw_preference( print("initiate rw dataset construction") save_path = os.path.join( + root, "output", "rlhf_results", f"preference_{examinee.instance_id}_{judge.instance_id}_{examinee.current_timestep}.json", @@ -296,14 +297,10 @@ def filter(s: str) -> str: assert status is None if aligned: rw_data.append([]) - write_log( - "invalid response from judge, " + str(dic) + "|| response over", - log_name="rlhf", - ) debug_data.append([dic, temp]) - with open("./logs/debug_data.json", "w") as f: + with open(f"{root}/logs/debug_data.json", "w") as f: json.dump(debug_data, f) with open(save_path, "w") as f: diff --git a/src/gpt_utils.py b/src/utils/gpt_utils.py similarity index 92% rename from src/gpt_utils.py rename to src/utils/gpt_utils.py index 1132678..ac24df7 100644 --- a/src/gpt_utils.py +++ b/src/utils/gpt_utils.py @@ -1,3 +1,4 @@ +from src.path import root from threading import Thread from typing import List, Tuple import os @@ -8,10 +9,10 @@ import re import csv -if not os.path.exists("./logs/eval"): - os.makedirs("./logs/eval") +if not os.path.exists(f"{root}/logs/eval"): + os.makedirs(f"{root}/logs/eval") -with open("./src/abstractions/configs/abstractions_config.json", "r") as config_file: +with open(f"{root}/src/abstractions/configs/abstractions_config.json", "r") as config_file: abstractions_config = json.load(config_file) if "openai_mirror" in abstractions_config: mirror_url = abstractions_config["openai_mirror"] @@ -45,7 +46,7 @@ def get_system_prompt(src="default"): - prompt_dir = os.path.join("src", "evaluation", "assets", "expand_prompt.json") + prompt_dir = os.path.join(root, "src", "evaluation", "assets", "expand_prompt.json") if src == "default": return f"You are a research-oriented large language model trained by OpenAI, based on the GPT-4 architecture." with open(prompt_dir, "r") as f: @@ -169,9 +170,9 @@ def independent_get_response_parallel(prompts: List[str]) -> List[str]: def construct_input_for_expand(source): proto_dir = os.path.join( - "src", "evaluation", "raw_dataset", source, "prototype.csv" + root, "src", "evaluation", "raw_dataset", source, "prototype.csv" ) - template_dir = "src/evaluation/assets/expand_prompt.json" + template_dir = f"{root}/src/evaluation/assets/expand_prompt.json" out_dir = os.path.join(os.path.basename(proto_dir), "generated.csv") with open(template_dir, "r") as f: boi = json.load(f) @@ -190,8 +191,8 @@ def construct_input_for_expand(source): def _debug(output): - mode = "a+" if os.path.exists("logs/eval/log.txt") else "w" - with open("logs/eval/log.txt", mode) as f: + mode = "a+" if os.path.exists(f"{root}/logs/eval/log.txt") else "w" + with open(f"{root}/logs/eval/log.txt", mode) as f: f.write("\n".join(output) + "\n") diff --git a/src/text_utils.py b/src/utils/text_utils.py similarity index 97% rename from src/text_utils.py rename to src/utils/text_utils.py index 04f366b..145e27d 100644 --- a/src/text_utils.py +++ b/src/utils/text_utils.py @@ -1,3 +1,4 @@ +from src.path import root import json from typing import Tuple, Iterable, Dict, Hashable, Any import os @@ -20,12 +21,12 @@ not in year_lengths: uninitialized. """ year_lengths = {} -os.makedirs("./logs", exist_ok=True) +os.makedirs(f"{root}/logs", exist_ok=True) def write_log(s: str, log_name: str = "build_dataset"): time_str = strftime("%d %b %H:%M:%S", localtime()) - with open(f"./logs/{log_name}.log", "a") as log_file: + with open(f"{root}/logs/{log_name}.log", "a") as log_file: log_file.write(time_str + " --- " + s + "\n") @@ -34,7 +35,7 @@ def year2path(yr_num: int) -> Tuple[str, str]: returns (filefolder, filefullpath) for any given year number """ century = "C%03d" % (yr_num // 100 + 1,) - century_path = f"./dataset/dataset_text_sequence/{century}/" + century_path = f"{root}/dataset/dataset_text_sequence/{century}/" if not os.path.exists(century_path): os.mkdir(century_path) @@ -107,7 +108,7 @@ def write_single_entry( out_file.write(json.dumps(json_dict)) -undated_path = "./dataset/dataset_text_sequence/undated.json" +undated_path = f"{root}/dataset/dataset_text_sequence/undated.json" undated_count = 0 From a58dcd62a300c83d49cf7a793fd9c9c61eebe90d Mon Sep 17 00:00:00 2001 From: "Tianyi (Alex) Qiu" Date: Thu, 28 Nov 2024 10:45:58 -0800 Subject: [PATCH 6/9] fix: correct formatting of src.path.root --- src/path.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/path.py b/src/path.py index c8bf47c..9eed252 100644 --- a/src/path.py +++ b/src/path.py @@ -1,3 +1,3 @@ import os, sys -root = "src".join(os.path.dirname(os.path.abspath(__file__)).split("src")[:-1]) +root = "src".join(os.path.dirname(os.path.abspath(__file__)).split("src")[:-1]).strip("/").strip("\\") print(f"Library root directory: {root}") \ No newline at end of file From 273e501364b45dd3fcc41c5611fb952bab7dcb82 Mon Sep 17 00:00:00 2001 From: "Tianyi (Alex) Qiu" Date: Thu, 28 Nov 2024 10:53:04 -0800 Subject: [PATCH 7/9] fix: robust path finding --- src/abstractions/configs/templates_configs.py | 6 ++++-- src/evaluation/quantify.py | 2 +- src/path.py | 2 +- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/abstractions/configs/templates_configs.py b/src/abstractions/configs/templates_configs.py index 2a807ee..9d3ea99 100644 --- a/src/abstractions/configs/templates_configs.py +++ b/src/abstractions/configs/templates_configs.py @@ -132,7 +132,8 @@ if not os.path.exists(data_save_path): data_save_path = f"{root}/" + data_save_path if not os.path.exists(data_save_path): - raise FileNotFoundError(f"Data save path {data_save_path} doesn't exist.") + print(f"Data save path {data_save_path} doesn't exist. Creating it.") + os.makedirs(data_save_path) for i, path in enumerate(data_search_paths): if not os.path.exists(path): @@ -144,7 +145,8 @@ if not os.path.exists(model_save_path): model_save_path = f"{root}/" + model_save_path if not os.path.exists(model_save_path): - raise FileNotFoundError(f"Model save path {model_save_path} doesn't exist.") + print(f"Model save path {model_save_path} doesn't exist. Creating it.") + os.makedirs(model_save_path) for i, path in enumerate(model_search_paths): if not os.path.exists(path): diff --git a/src/evaluation/quantify.py b/src/evaluation/quantify.py index 468ebce..ecd9038 100644 --- a/src/evaluation/quantify.py +++ b/src/evaluation/quantify.py @@ -35,7 +35,7 @@ if not os.path.exists(f"{root}/output/evaluation_results/figs"): os.makedirs(f"{root}/output/evaluation_results/figs") -if not os.path.exists(f"{root}logs/eval"): +if not os.path.exists(f"{root}/logs/eval"): os.makedirs(f"{root}/logs/eval") diff --git a/src/path.py b/src/path.py index 9eed252..aaea076 100644 --- a/src/path.py +++ b/src/path.py @@ -1,3 +1,3 @@ import os, sys -root = "src".join(os.path.dirname(os.path.abspath(__file__)).split("src")[:-1]).strip("/").strip("\\") +root = "src".join(os.path.dirname(os.path.abspath(__file__)).split("src")[:-1]).rstrip("/").rstrip("\\") print(f"Library root directory: {root}") \ No newline at end of file From 418255f2f827f4e392205f38bd749d9f2de0e770 Mon Sep 17 00:00:00 2001 From: "Tianyi (Alex) Qiu" Date: Thu, 28 Nov 2024 10:59:50 -0800 Subject: [PATCH 8/9] chore: reduce redunant output --- src/evaluation/utils.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/src/evaluation/utils.py b/src/evaluation/utils.py index dc0904a..67506dd 100644 --- a/src/evaluation/utils.py +++ b/src/evaluation/utils.py @@ -442,7 +442,6 @@ def generate_alpaca(source: str, dir: str, rearrange = True, logprobs = False): If both (1) and (2) are mostly true, output 'YES' verbatim, otherwise 'NO' verbatim. If you are unsure, output 'SKIP' verbatim. No extra output is allowed. """ cut = 0 - print("logprobs", logprobs) if source == "mc" or source == "foundation": """ ab, repeat, compare, each 'repeat' times. @@ -537,7 +536,6 @@ def generate_alpaca(source: str, dir: str, rearrange = True, logprobs = False): ) as f: temp = json.load(f) except: - print("writing in new input.json") temp = [] temp.extend(output_list_dic) @@ -545,7 +543,7 @@ def generate_alpaca(source: str, dir: str, rearrange = True, logprobs = False): os.path.join(root, "src", "evaluation", "assets", "input_alpaca.json"), "w" ) as f: json.dump(temp, f) - print("done", source) + elif source == "views": """ abcd (one fav. and one worst), repeat, each 'repeat' times. @@ -641,14 +639,12 @@ def generate_alpaca(source: str, dir: str, rearrange = True, logprobs = False): ) as f: temp = json.load(f) - print("appending to input.json") - temp.extend(output_list_dic) with open( os.path.join(root, "src", "evaluation", "assets", "input_alpaca.json"), "w" ) as f: json.dump(temp, f) - print("done", source, cut, "made the cut") + def get_dim(key, dict_list): From cf09339a993356bf55e52f8fa8d9ca56bb5c9730 Mon Sep 17 00:00:00 2001 From: Tianyi Qiu Date: Sat, 30 Nov 2024 03:01:20 +0000 Subject: [PATCH 9/9] fix(abstractions): data transformation with identical source and destination --- build_dataset.py | 6 +- src/abstractions/backends.py | 1 + src/abstractions/data.py | 116 +++++++++++++++++---------- src/abstractions/model.py | 14 ++-- src/cleanser/rule_based_cleanser.py | 8 +- src/eebo/process_eebo.py | 8 +- src/gutenberg/get_meta.py | 12 +-- src/internet_archive/get_sources.py | 28 +++---- src/model_training/curate_prompts.py | 2 +- src/pile_of_law/get_data.py | 18 ++--- src/utils/text_utils.py | 1 + 11 files changed, 123 insertions(+), 91 deletions(-) diff --git a/build_dataset.py b/build_dataset.py index f62175f..1a1b049 100644 --- a/build_dataset.py +++ b/build_dataset.py @@ -1,5 +1,5 @@ from src.path import root -import src.utils.text_utils as tw +import src.utils.text_utils as tu import src.cleanser.rule_based_cleanser as rb import src.cleanser.localllm_cleanser as llm_cleanser import src.model_training.train_hislm as hislm @@ -53,7 +53,7 @@ def build_pile_of_law(): if __name__ == "__main__": - tw.write_log(f"\n\n\n\n\n\n=========== NEW RUN ============\n\n") + tu.write_log(f"\n\n\n\n\n\n=========== NEW RUN ============\n\n") print( "This script is NOT meant to be run as part of the benchmarking process. Unless you would like to replicate the dataset building & model training process, you could directly run `run_benchmark.py` instead, which will automatically download the pre-built dataset and/or models on demand." ) @@ -82,7 +82,7 @@ def build_pile_of_law(): max_hours=10 ) # takes ~100h, but if max_hours is supplied then stops after this many hours (won't affect data integrity) # finishing up - tw.seal_all_files() + tu.seal_all_files() print("Finished building entire dataset. Proceed to data cleansing.") if ( diff --git a/src/abstractions/backends.py b/src/abstractions/backends.py index 7a0e328..442d4b1 100644 --- a/src/abstractions/backends.py +++ b/src/abstractions/backends.py @@ -481,6 +481,7 @@ def sglang_process_batch( ) assert len(output) == len(sample_dicts) + count = 0 for _ in range(20): bad_indices = [ k diff --git a/src/abstractions/data.py b/src/abstractions/data.py index d17c225..aab6098 100644 --- a/src/abstractions/data.py +++ b/src/abstractions/data.py @@ -15,7 +15,7 @@ import os import json import warnings -import src.utils.text_utils as tw +import src.utils.text_utils as tu from tqdm import tqdm from src.abstractions.configs.templates_configs import * @@ -126,7 +126,7 @@ def __init__( data_path = f"{root}/output/datasets/{data_name}.json" Data.ask_and_remove_if_exists(data_path, forced_rewrite=True) - with tw.JsonListWriter(data_path) as json_writer: + with tu.JsonListWriter(data_path) as json_writer: for element in data_content: json_writer.append(element) @@ -169,9 +169,19 @@ def __init__( else: Data.name2data[data_name] = [self] - def copy(self) -> "Data": - """Returns a shallow copy of the current Data instance.""" - cp = Data(self.data_name, self.data_type, self.data_path) + def copy(self, data_name: str = None) -> "Data": + """ + Returns a copy of the current Data instance. + Shallow copy if data_name is not provided or identical to the current data_name; deep copy otherwise. + """ + if data_name and data_name != self.data_name: + new_data_path = f"{root}/output/datasets/{data_name}.json" + Data.ask_and_remove_if_exists(new_data_path, forced_rewrite=True) + execute(f"cp {escape(self.data_path)} {escape(new_data_path)}") + cp = Data(data_name, self.data_type, new_data_path) + else: + cp = Data(self.data_name, self.data_type, self.data_path) + cp.key_fields = self.key_fields.copy() return cp @@ -213,6 +223,19 @@ def transform( :rtype: Data. """ out_path = f"{root}/output/datasets/{result_data_name}.json" + if self.data_name == result_data_name or self.data_path == out_path: + warnings.warn( + f"Data name {result_data_name} is the same as the current data name. The old instance will be invalidated." + ) + return self.copy("temp_transform_artifact").transform( + transformation, + result_data_name, + forced_rewrite, + max_batch_size, + keep_key_fields, + map_key_fields, + ) + Data.ask_and_remove_if_exists(out_path, forced_rewrite) def write_dict(sample_dict: Dict): @@ -225,7 +248,7 @@ def write_dict(sample_dict: Dict): def map_key_fields_fn(sample_dict: Dict) -> Dict: nonlocal self for k, v in self.default_key_fields.items(): - if k in self.key_fields and self.key_fields[k] != v: + if k in self.key_fields and self.key_fields.get(k, v) != v and self.key_fields[k] in sample_dict: sample_dict[v] = sample_dict[self.key_fields[k]] del sample_dict[self.key_fields[k]] @@ -234,7 +257,7 @@ def map_key_fields_fn(sample_dict: Dict) -> Dict: def inv_map_key_fields_fn(sample_dict: Dict) -> Dict: nonlocal self for k, v in self.default_key_fields.items(): - if v in sample_dict and self.key_fields[k] != v: + if v in sample_dict and self.key_fields.get(k, v) != v: sample_dict[self.key_fields[k]] = sample_dict[v] del sample_dict[v] @@ -245,27 +268,29 @@ def inv_map_key_fields_fn(sample_dict: Dict) -> Dict: is_first = True if max_batch_size == 1: - for element in tw.read_json_memory_efficient(self.data_path): - if map_key_fields: - element = map_key_fields_fn(element) - - transformed = transformation(element) - if transformed is not None: - write_dict(transformed if not map_key_fields else inv_map_key_fields_fn(transformed)) + with tu.JsonListReader(self.data_path) as reader: + for element in reader: + if map_key_fields: + element = map_key_fields_fn(element) + + transformed = transformation(element) + if transformed is not None: + write_dict(transformed if not map_key_fields else inv_map_key_fields_fn(transformed)) else: buffer = [] - for element in tw.read_json_memory_efficient(self.data_path): - if map_key_fields: - element = map_key_fields_fn(element) - - buffer.append(element) - if len(buffer) == max_batch_size: - for e in transformation(buffer): - write_dict(e if not map_key_fields else inv_map_key_fields_fn(e)) - buffer = [] - out_file.flush() + with tu.JsonListReader(self.data_path) as reader: + for element in reader: + if map_key_fields: + element = map_key_fields_fn(element) + + buffer.append(element) + if len(buffer) == max_batch_size: + for e in transformation(buffer): + write_dict(e if not map_key_fields else inv_map_key_fields_fn(e)) + buffer = [] + out_file.flush() if buffer: for e in transformation(buffer): @@ -566,8 +591,9 @@ def all_passages(self) -> Iterable[Dict[Hashable, Any]]: """ Returns an iterator of all passages (json dicts) in this dataset. """ - for element in tw.read_json_memory_efficient(self.data_path): - yield element + with tu.JsonListReader(self.data_path) as reader: + for element in reader: + yield element class DataFileCollection: @@ -704,8 +730,9 @@ def all_passages(self) -> Iterable[Dict[Hashable, Any]]: list(self.all_files()) ): # remove list() if it turns out that the file count is super huge assert in_path[: len(self.collection_path)] == self.collection_path - for element in tw.read_json_memory_efficient(in_path): - yield element + with tu.JsonListReader(in_path) as reader: + for element in reader: + yield element def transform( self, @@ -766,21 +793,23 @@ def write_dict(sample_dict: Dict): is_first = True if max_batch_size == 1: - for element in tw.read_json_memory_efficient(in_path): - transformed = transformation(element) - if transformed is not None: - write_dict(transformed) + with tu.JsonListReader(in_path) as reader: + for element in reader: + transformed = transformation(element) + if transformed is not None: + write_dict(transformed) else: buffer = [] - for element in tw.read_json_memory_efficient(in_path): - buffer.append(element) - if len(buffer) == max_batch_size: - for e in transformation(buffer): - write_dict(e) - buffer = [] - out_file.flush() + with tu.JsonListReader(in_path) as reader: + for element in reader: + buffer.append(element) + if len(buffer) == max_batch_size: + for e in transformation(buffer): + write_dict(e) + buffer = [] + out_file.flush() if buffer: for e in transformation(buffer): @@ -843,10 +872,11 @@ def convert_to_Data( for in_path in tqdm( list(self.all_files()) ): # remove list() if it turns out that the file count is super huge - for element in tw.read_json_memory_efficient(in_path): - out_file.write("\n" if is_first else ",\n") - is_first = False - out_file.write(json.dumps(clean_dict(element, filter_fields))) + with tu.JsonListReader(in_path) as reader: + for element in reader: + out_file.write("\n" if is_first else ",\n") + is_first = False + out_file.write(json.dumps(clean_dict(element, filter_fields))) out_file.write("\n]") diff --git a/src/abstractions/model.py b/src/abstractions/model.py index 8a1f21b..e5d0369 100644 --- a/src/abstractions/model.py +++ b/src/abstractions/model.py @@ -8,7 +8,7 @@ import json import torch import warnings -import src.utils.text_utils as tw +import src.utils.text_utils as tu import random import numpy as np from transformers import ( @@ -702,7 +702,7 @@ def inference( :code:`serial` - Serial inference. """ - tw.write_log( + tu.write_log( f"Inference start, with result_data_name = {result_data_name} and backend = {backend}." ) input_is_data = isinstance(data, Data) @@ -773,7 +773,7 @@ def inference( f'Backend {backend} not recognized. Options are "sglang", "vllm", "deepspeed", and "serial".' ) - tw.write_log( + tu.write_log( f"Inference finished, with result_data_name = {result_data_name} and backend = {backend}." ) @@ -908,7 +908,7 @@ def __inference_parallel_deepspeed( result_data_path, f"{escape(result_data_name)}.json" ) - with tw.JsonListWriter(final_file_path) as writer: + with tu.JsonListWriter(final_file_path) as writer: with open(initial_file_path, "r") as results_file: for i, (input_dict, result) in enumerate( zip(data.all_passages(), results_file) @@ -960,11 +960,11 @@ def __inference_serial( root, "output", "inference_results", "inf", data_name + ".json" ) - with tw.JsonListWriter( + with tu.JsonListWriter( data_path ) as writer: # memory-efficient: no need to place all answers in memory if isinstance(input_data, Data): - eles = tw.read_json_memory_efficient(input_data.data_path) + eles = tu.read_json_memory_efficient(input_data.data_path) else: eles = input_data @@ -988,7 +988,7 @@ def __inference_serial( # display the first element to showcase results if writer.is_first: - tw.write_log( + tu.write_log( f"Inference sample: {ele}. Raw response: {repr(response)}." ) diff --git a/src/cleanser/rule_based_cleanser.py b/src/cleanser/rule_based_cleanser.py index 4ebf18a..59ee177 100644 --- a/src/cleanser/rule_based_cleanser.py +++ b/src/cleanser/rule_based_cleanser.py @@ -1,7 +1,7 @@ from src.path import root import os, json import re, unicodedata -import src.utils.text_utils as tw +import src.utils.text_utils as tu from tqdm import tqdm last_cleaned = "" @@ -63,13 +63,13 @@ def cleanse_text(text): def cleanse_dir(dirct, to_dir): os.makedirs(to_dir) for year in tqdm(os.listdir(dirct), desc=dirct.split("/")[-1]): - generator = tw.read_json_memory_efficient(os.path.join(dirct, year)) + generator = tu.read_json_memory_efficient(os.path.join(dirct, year)) out = [] for boi in generator: orig_len = len(boi["content"]) boi["content"] = cleanse_text(boi["content"]) - tw.write_log( + tu.write_log( "cleansed an object in " + year + ", length reduced from " @@ -83,7 +83,7 @@ def cleanse_dir(dirct, to_dir): if len(boi["content"]) > 200: out.append(boi) else: - tw.write_log(f"Ignoring {repr(boi['content'])}.") + tu.write_log(f"Ignoring {repr(boi['content'])}.") with open(os.path.join(to_dir, year), "w") as file: json.dump(out, file) diff --git a/src/eebo/process_eebo.py b/src/eebo/process_eebo.py index aa1e02b..7942c15 100644 --- a/src/eebo/process_eebo.py +++ b/src/eebo/process_eebo.py @@ -3,7 +3,7 @@ import os from tqdm import tqdm import json -import src.utils.text_utils as tw +import src.utils.text_utils as tu # a utility function called by build_eebo_dataset, to read the contents in an eebo xml file in a suitable manner @@ -106,10 +106,10 @@ def build_eebo_dataset(eebo_path: str = f"{root}/dataset/raw_downloads/EEBO/"): 1000 < year_earliest <= year <= year_latest < 2025 and year_latest - year_earliest < 50 ): - tw.write_single_entry(json_dict=json_element) + tu.write_single_entry(json_dict=json_element) else: del json_element["creation_year"] - tw.write_log( + tu.write_log( f"EEBO: Uncertainty too large, saving to undated.json: {line.strip()}" ) - tw.report_undated_entry(json_dict=json_element) + tu.report_undated_entry(json_dict=json_element) diff --git a/src/gutenberg/get_meta.py b/src/gutenberg/get_meta.py index 4c52ea6..714eec8 100644 --- a/src/gutenberg/get_meta.py +++ b/src/gutenberg/get_meta.py @@ -1,5 +1,5 @@ from src.path import root -import src.utils.text_utils as tw +import src.utils.text_utils as tu import os, json import csv from tqdm import tqdm @@ -56,7 +56,7 @@ def gather_meta(raw_dir, record): break assert add["content"] - # add['creation_year'] = tw.decode_year_num(add["created_timestamp"], 1100, 2024) + # add['creation_year'] = tu.decode_year_num(add["created_timestamp"], 1100, 2024) """ Taking average from the author's y.o.b & y.o.d """ @@ -81,20 +81,20 @@ def gather_meta(raw_dir, record): add["creation_year"] = None break if add["creation_year"] is not None: - tw.write_single_entry(json_dict=add) + tu.write_single_entry(json_dict=add) else: - tw.report_undated_entry(add) + tu.report_undated_entry(add) gutenberg_failure_counter += 1 if ( gutenberg_failure_counter <= 100 or gutenberg_failure_counter % 100 == 0 ): - tw.write_log( + tu.write_log( f'Gutenberg: {gutenberg_failure_counter}-th time, saving to undated.json: created_timestamp={add["created_timestamp"]},{full_timestamp}' ) except Exception as e: gutenberg_failure_counter += 1 - tw.write_log( + tu.write_log( f"Gutenberg: {gutenberg_failure_counter}-th time, exception {type(e)} {e}" ) diff --git a/src/internet_archive/get_sources.py b/src/internet_archive/get_sources.py index 11edd69..612e884 100644 --- a/src/internet_archive/get_sources.py +++ b/src/internet_archive/get_sources.py @@ -1,6 +1,6 @@ from src.path import root import os, json, requests -import src.utils.text_utils as tw +import src.utils.text_utils as tu from tqdm import tqdm import re import time @@ -44,11 +44,11 @@ def build_internet_archive_LibOfCong(max_hours: int = None): if max_hours is not None: if time.monotonic() - start_time > 3600 * max_hours: print("time is up! ending loop.") - tw.write_log("IA-LOC: time is up! ending loop.") + tu.write_log("IA-LOC: time is up! ending loop.") break if example_counter % 100 == 2: - tw.write_log( + tu.write_log( "IA-LOC: time %.0f/%.0f = %.2f%%, progress %d/%d = %.2f%%" % ( time.monotonic() - start_time, @@ -62,11 +62,11 @@ def build_internet_archive_LibOfCong(max_hours: int = None): try: if example_counter <= 200: - tw.write_log(f"IA-LOC: {identifier} request #1 starts") + tu.write_log(f"IA-LOC: {identifier} request #1 starts") response = requests.get(url) response.raise_for_status() if example_counter <= 200: - tw.write_log(f"IA-LOC: {identifier} request #1 ends") + tu.write_log(f"IA-LOC: {identifier} request #1 ends") content = response.json().get("files", []) content_strings = [] @@ -74,7 +74,7 @@ def build_internet_archive_LibOfCong(max_hours: int = None): for file_info in content: if file_info["name"].endswith(".txt"): if example_counter <= 200: - tw.write_log( + tu.write_log( f'IA-LOC: {identifier}-{file_info["name"]} request #2 starts' ) file_url = ( @@ -83,7 +83,7 @@ def build_internet_archive_LibOfCong(max_hours: int = None): file_response = requests.get(file_url) file_response.raise_for_status() if example_counter <= 200: - tw.write_log( + tu.write_log( f'IA-LOC: {identifier}-{file_info["name"]} request #2 ends' ) file_content = file_response.text @@ -93,18 +93,18 @@ def build_internet_archive_LibOfCong(max_hours: int = None): if "date" not in metadata: nodate_counter += 1 - tw.write_log( + tu.write_log( f'IA-LOC: {nodate_counter}-th time, metadata contains no "date" field: {metadata}' ) continue date_str: str = metadata["date"] - creation_year = tw.decode_year_num(date_str, 510, 2024) + creation_year = tu.decode_year_num(date_str, 510, 2024) if example_counter <= 200 or ( type(creation_year) == int and creation_year < 500 ): - tw.write_log( + tu.write_log( f"IA-LOC: date {date_str} interpreted as year {creation_year}" ) @@ -115,18 +115,18 @@ def build_internet_archive_LibOfCong(max_hours: int = None): if creation_year is None: nodate_counter += 1 - tw.write_log( + tu.write_log( f"IA-LOC: {nodate_counter}-th time, date {date_str} uninterpretable; saving to undated.json" ) - tw.report_undated_entry(json_dict=metadata) + tu.report_undated_entry(json_dict=metadata) else: - tw.write_single_entry(json_dict=metadata) + tu.write_single_entry(json_dict=metadata) # raise an exception if request fails # except requests.exceptions.RequestException as e: except Exception as e: download_fail_counter += 1 - tw.write_log( + tu.write_log( f"IA-LOC: {download_fail_counter}-th time, error fetching metadata for {identifier}: {type(e)} {e}" ) diff --git a/src/model_training/curate_prompts.py b/src/model_training/curate_prompts.py index adcbb02..fc44aa9 100644 --- a/src/model_training/curate_prompts.py +++ b/src/model_training/curate_prompts.py @@ -96,7 +96,7 @@ def __init__(self, collection_name: str, data_type: Literal['pretrain', 'sft', ' from src.path import root from src.abstractions import Data, DataFileCollection, fill_in_QA_template -from src.utils.text_utils import write_log, read_json_memory_efficient +from src.utils.text_utils import write_log # Multithreading for GPT interaction import threading diff --git a/src/pile_of_law/get_data.py b/src/pile_of_law/get_data.py index 770a76c..be2946d 100644 --- a/src/pile_of_law/get_data.py +++ b/src/pile_of_law/get_data.py @@ -3,7 +3,7 @@ from bs4 import BeautifulSoup import os, json import lzma -import src.utils.text_utils as tw +import src.utils.text_utils as tu from tqdm import tqdm from copy import copy @@ -32,14 +32,14 @@ def download_file(url, folder_path): file_path = os.path.join(folder_path, file_name) # 发起HTTP请求并下载文件 - tw.write_log(f"PileOfLaw: start downloading {url}") + tu.write_log(f"PileOfLaw: start downloading {url}") with requests.get(url, stream=True) as r: r.raise_for_status() with open(file_path, "wb") as f: for chunk in r.iter_content(chunk_size=8192): if chunk: f.write(chunk) - tw.write_log(f"PileOfLaw: File '{file_name}' downloaded to '{folder_path}'") + tu.write_log(f"PileOfLaw: File '{file_name}' downloaded to '{folder_path}'") # 示例网页链接 url = "https://huggingface.co/datasets/pile-of-law/pile-of-law/tree/main/data" @@ -66,7 +66,7 @@ def download_file(url, folder_path): this_url = "https://huggingface.co" + file_link file_name = this_url.split("/")[-1].split("?")[0] if os.path.isfile(os.path.join(download_folder, file_name)): - tw.write_log( + tu.write_log( f"PileOfLaw: file {file_name} already downloaded. skipping to the next file in line." ) else: @@ -124,23 +124,23 @@ def read_jsonl_and_concat(jsonl_dir): creation_year = dd["created_timestamp"] if type(dd["created_timestamp"]) == str: - creation_year = tw.decode_year_num( + creation_year = tu.decode_year_num( dd["created_timestamp"], 1100, 2024 ) if creation_year is None: - tw.report_undated_entry(dd) + tu.report_undated_entry(dd) PoL_failure_counter += 1 - tw.write_log( + tu.write_log( f'PileOfLaw: {PoL_failure_counter}-th time, saving to undated.json: created_timestamp={dd["created_timestamp"] if "created_timestamp" in dd else None}' ) else: dd["creation_year"] = creation_year - tw.write_single_entry(json_dict=dd) + tu.write_single_entry(json_dict=dd) except Exception as e: PoL_failure_counter += 1 - tw.write_log( + tu.write_log( f"PileOfLaw: {PoL_failure_counter}-th time, error processing metadata for {i}-th entry of {name}: {type(e)} {e}" ) diff --git a/src/utils/text_utils.py b/src/utils/text_utils.py index 145e27d..5a8de1e 100644 --- a/src/utils/text_utils.py +++ b/src/utils/text_utils.py @@ -308,4 +308,5 @@ def append(self, element: Any): def __exit__(self, type, value, traceback): self.file_obj.write("\n]") + self.file_obj.flush() self.file_obj.close()