From 8955a2226a5dcca8c20b8ecc3a18262f7bbbc434 Mon Sep 17 00:00:00 2001 From: "Tianyi (Alex) Qiu" Date: Fri, 20 Dec 2024 18:32:53 -0800 Subject: [PATCH] chore: perform formatting --- __init__.py | 2 + build_dataset.py | 14 +- examples/abstractions/finetuning_datamanip.py | 26 +- examples/abstractions/inference_evaluation.py | 15 +- .../src/llmtuner/data/template.py | 5 +- src/abstractions/backends.py | 186 +++++++---- src/abstractions/configs/templates_configs.py | 33 +- src/abstractions/data.py | 292 ++++++++++++------ src/abstractions/model.py | 93 ++++-- src/download_models.py | 12 +- src/evaluation/figure.py | 6 +- src/evaluation/quantify.py | 70 +++-- src/evaluation/test_eval_01.py | 30 +- src/evaluation/test_eval_02.py | 23 +- src/evaluation/utils.py | 191 ++++++++---- src/gutenberg/get_meta.py | 7 +- src/path.py | 11 +- src/utils/data_utils/__init__.py | 2 +- src/utils/data_utils/extrapolation_utils.py | 1 + src/utils/data_utils/rw_utils.py | 2 +- src/utils/gpt_utils.py | 4 +- 21 files changed, 678 insertions(+), 347 deletions(-) diff --git a/__init__.py b/__init__.py index fa6ed27..a0ee496 100644 --- a/__init__.py +++ b/__init__.py @@ -1,10 +1,12 @@ import os, sys + sys.path = [os.path.dirname(os.path.abspath(__file__))] + sys.path if not eval(os.environ.get("LOUD_BACKEND", "0")): os.environ["WANDB_DISABLED"] = "true" import logging + logging.basicConfig(level=logging.ERROR) from benchmark.framework import JudgeBase, ExamineeBase diff --git a/build_dataset.py b/build_dataset.py index 1a1b049..7f44f94 100644 --- a/build_dataset.py +++ b/build_dataset.py @@ -29,7 +29,8 @@ def build_gutenberg(): dir = f"{root}/dataset/raw_downloads/Gutenberg/" gtb_gd.get_data_gutenberg(dir) gtb_gm.gather_meta( - os.path.join(dir, "data/raw"), f"{root}/dataset/raw_downloads/Gutenberg_records.txt" + os.path.join(dir, "data/raw"), + f"{root}/dataset/raw_downloads/Gutenberg_records.txt", ) print("======= FINISHED BUILDING GUTENBERG DATASET =======\n\n\n") @@ -107,9 +108,7 @@ def build_pile_of_law(): ) # Make llm-cleansed version the official version ("dataset_text_sequence"), and move the other two versions into dataset/raw_downloads - path = ( - f"{root}/dataset/raw_downloads/dataset_text_sequence_versions/{timestamp}/" - ) + path = f"{root}/dataset/raw_downloads/dataset_text_sequence_versions/{timestamp}/" os.makedirs(path) print(f"Moving pre-cleansing version to backup folder...") @@ -154,7 +153,9 @@ def build_pile_of_law(): sub_datasets = [ f for f in os.listdir(f"{root}/dataset/dataset_text_sequence/") - if os.path.isdir(os.path.join(f"{root}/dataset/dataset_text_sequence/", f)) + if os.path.isdir( + os.path.join(f"{root}/dataset/dataset_text_sequence/", f) + ) ] for sub in sub_datasets: # Remove if size < 10MB AND century number < 13 @@ -169,7 +170,8 @@ def build_pile_of_law(): os.system(f"mv ./dataset/dataset_text_sequence/{sub} {path}") hislm.run_training( - f"{root}/dataset/dataset_text_sequence/", f"{root}/dataset/dataset_model_sequence/" + f"{root}/dataset/dataset_text_sequence/", + f"{root}/dataset/dataset_model_sequence/", ) print("Finished model training. 
Exiting.") diff --git a/examples/abstractions/finetuning_datamanip.py b/examples/abstractions/finetuning_datamanip.py index d4bb66c..19f3f03 100644 --- a/examples/abstractions/finetuning_datamanip.py +++ b/examples/abstractions/finetuning_datamanip.py @@ -13,6 +13,7 @@ is_instruct_finetuned=True, ) + def continue_pretrain(): # ============== Continue pretraining from Gemma 2B ============== global gemma2b_c4 @@ -22,6 +23,7 @@ def continue_pretrain(): ) print(gemma2b_c4.is_instruct_finetuned) # False + def supervised_finetune(): # ============== Then do SFT using alpaca data ============== global gemma2b_c4_alpaca @@ -34,7 +36,7 @@ def supervised_finetune(): ) print(gemma2b_c4_alpaca.is_instruct_finetuned) # True gemma2b_c4_alpaca.save_permanent() # saved to output/saved/saved_model/gemma-2b_c4_alpaca - + # ============== Or maybe, we should censor curse words before SFT ============== def remove_curse_words(sample_dict: dict) -> dict: filter = lambda s: ( @@ -55,7 +57,7 @@ def remove_curse_words(sample_dict: dict) -> dict: ) gemma2b_c4_alpaca_G.save_permanent() # saved to output/saved/saved_model/gemma-2b_c4_alpaca_G alpaca_data_G.save_permanent_and_register() # saved to output/saved/saved_model/alpaca_gpt4_en_G.json & added to llama-factory dataset registry - + # ============== What about using our own data (scattered across multiple files in multiple directories) for finetuning? ============== histext_collection = DataFileCollection( # build a collection holding json files of year 1826 to 2018 collection_name="histext_1826_to_2018_collection", @@ -93,6 +95,7 @@ def remove_nonstr_data(sample_dict: dict) -> dict: result_model_name="gemma-2b_histext", ) + def direct_preference_optimization(): # ============== Then do DPO using ORCA data ============== global gemma2b_c4_alpaca_orca @@ -105,6 +108,7 @@ def direct_preference_optimization(): ) gemma2b_c4_alpaca_orca.save_permanent() # saved to output/saved/saved_model/gemma-2b_c4_alpaca_orca + def dialogue_manipulation(): # ============== Generating a dialogue, using a model to play the role of both user and assistant ============== global llama8b_instruct @@ -115,33 +119,33 @@ def dialogue_manipulation(): "input": "Is Eiffel Tower in Paris?", "history": [ ["What is the capital of France?", "Paris."], - ] + ], } - ] + ], ) - + def converse(): nonlocal dialogue_data - + dialogue_data = llama8b_instruct.inference( dialogue_data, "dialogue_data2", backend="sglang" ) dialogue_data = dialogue_data.switch_role_to_user() - + dialogue_data = llama8b_instruct.inference( dialogue_data, "dialogue_data3", backend="sglang" ) dialogue_data = dialogue_data.switch_role_to_assistant() - + for i in range(5): converse() - + print(list(dialogue_data.all_passages())) print(list(dialogue_data.to_openai_format())) - + if __name__ == "__main__": continue_pretrain() supervised_finetune() direct_preference_optimization() - dialogue_manipulation() \ No newline at end of file + dialogue_manipulation() diff --git a/examples/abstractions/inference_evaluation.py b/examples/abstractions/inference_evaluation.py index 7d0b24a..3494ab4 100644 --- a/examples/abstractions/inference_evaluation.py +++ b/examples/abstractions/inference_evaluation.py @@ -30,9 +30,9 @@ def dataset_inference_example(histllama: Model): def logprob_example(histllama: Model): custom_data = Data( - "custom_data", + "custom_data", data_type="sft", - data_content = [ + data_content=[ { "input": "What is the capital of France?", "predict": ["Paris", "Washington D.C.", "London", "Berlin"], @@ -42,9 +42,12 @@ 
def logprob_example(histllama: Model): custom_data.set_key_fields(query_field_name="input") logprob_output: Data = histllama.inference( - custom_data, "8B-C021-infer-custom-deepspeed", backend="sglang", purpose="logprobs" + custom_data, + "8B-C021-infer-custom-deepspeed", + backend="sglang", + purpose="logprobs", ) - print(list(logprob_output.all_passages())) + print(list(logprob_output.all_passages())) # [{'predict': ['Paris', 'Washington D.C.', 'London', 'Berlin'], 'input': 'What is the capital of France?', 'logprob': [-9.92294692993164, -17.21290510520339, -11.677074432373047, -12.903636932373047]}] @@ -58,6 +61,6 @@ def logprob_example(histllama: Model): # model_path_or_repoid="mistralai/Mixtral-8x7B-Instruct-v0.1", # template_type="mistral", # ) - + dataset_inference_example(histllama) - logprob_example(histllama) \ No newline at end of file + logprob_example(histllama) diff --git a/libs/llama_factory/src/llmtuner/data/template.py b/libs/llama_factory/src/llmtuner/data/template.py index 031282f..60f5a84 100644 --- a/libs/llama_factory/src/llmtuner/data/template.py +++ b/libs/llama_factory/src/llmtuner/data/template.py @@ -713,7 +713,10 @@ def get_template_and_fix_tokenizer( ] ), format_system=StringFormatter( - slots=[{"bos_token"}, "<|start_header_id|>system<|end_header_id|>\n\n{{content}}<|eot_id|>"] + slots=[ + {"bos_token"}, + "<|start_header_id|>system<|end_header_id|>\n\n{{content}}<|eot_id|>", + ] ), format_observation=StringFormatter( slots=[ diff --git a/src/abstractions/backends.py b/src/abstractions/backends.py index 7b79bf9..a6b6b3c 100644 --- a/src/abstractions/backends.py +++ b/src/abstractions/backends.py @@ -210,6 +210,7 @@ def get_model_size(model_repoid_or_path: str) -> float: ) return model_size + def start_inference_backend( model_repoid_or_path: str, backend_type: Literal["sglang", "vllm"] = "sglang", @@ -219,43 +220,43 @@ def start_inference_backend( num_gpus: int = None, template_type: Literal["auto", "alpaca", "mistral", "llama3"] = "auto", ) -> Tuple[subprocess.Popen, Callable, Callable]: - """Start an inference backend for a given model. + """Start an inference backend for a given model. Returns a tuple containing the backend process and the function to process a batch of samples. When purpose is "logprobs", the returned function will return the log probability of the prompt text itself, without generating any text. The probability will be stored in the "logprob" field of the output dictionary, with all other fields staying the same. When purpose is "responses", the returned function will generate a response to the prompt text. The response will be stored in the "predict" field of the output dictionary, with all other fields staying the same. :param model_repoid_or_path: The model repo ID or path (e.g., "meta-llama/Meta-Llama-3-8B-Instruct"). 
:type model_repoid_or_path: str - + :param backend_type: The type of backend to start, defaults to "sglang" :type backend_type: Literal["sglang", "vllm"], optional - + :param purpose: The purpose of the backend, defaults to "logprobs" :type purpose: Literal["responses, "logprobs"], optional - + :param silent: Whether to run the backend silently, defaults to True :type silent: bool, optional - + :param port: The port number to use for the backend, defaults to PORT_NUM :type port: int, optional - + :param num_gpus: The number of GPUs to use for the backend, defaults to None (use all available GPUs) :type num_gpus: int, optional - + :param template_type: The type of template to use for the backend, defaults to "auto", which uses the appropriate template (not limited to alpaca/mistral/llama3) based on the model config file :type template_type: Literal["auto", "alpaca", "mistral", "llama3"], optional - + :return: A tuple containing the backend process, the function to process a batch of samples (type signature: List[dict] -> List[dict], with optional metadata arguments), and the function to destroy the backend after use. :rtype: Tuple[subprocess.Popen, Callable, Callable] """ if eval(os.environ.get("LOUD_BACKEND", "0")): silent = False - + if num_gpus is None: num_gpus = torch.cuda.device_count() if backend_type == "vllm": - + if purpose == "logprobs": raise ValueError("VLLM backend does not support logprobs purpose.") @@ -304,7 +305,9 @@ def vllm_process_batch( prompts = [ fill_in_QA_template( - dic.get("instruction"), dic.get("input"), model_repoid_or_path=template_type + dic.get("instruction"), + dic.get("input"), + model_repoid_or_path=template_type, ) for dic in sample_dicts ] @@ -317,7 +320,7 @@ def vllm_process_batch( dic["predict"] = generated_text return sample_dicts - + def vllm_free_gpu_memory(): """Remove the vllm model and free vllm cache. This should wipe out all GPU memory used by self.""" if destroy_model_parallel is not None: @@ -354,24 +357,27 @@ def vllm_free_gpu_memory(): warnings.warn( f"SGLang backend only supports auto template type. Ignoring template_type={template_type}. This is not an issue if you simply intend to perform inference on HistLlama models, but may be an issue if the model is neither in the HistLlama family nor in SGLang's supported models list, in which case you may use NO_SGLANG=1 to disable sglang backend." ) - + backend_key = f"{model_repoid_or_path}-{backend_type}-{purpose}-{num_gpus}" connected = False - + if os.path.exists(f"{root}/output/backend_history.json"): with open(f"{root}/output/backend_history.json", "r") as f: backend_history = json.load(f) else: backend_history = {} - + print(f"Current backend history: {backend_history}", flush=True) print(f"Looking for prior backend with key {backend_key}...", flush=True) - + if backend_key in backend_history: backend_port = backend_history[backend_key] - print(f"Found prior backend with key {backend_key} at port {backend_port}.", flush=True) - - try: + print( + f"Found prior backend with key {backend_key} at port {backend_port}.", + flush=True, + ) + + try: sgl.set_default_backend(sgl.RuntimeEndpoint(f"http://localhost:{port}")) connected = True backend = None @@ -379,7 +385,7 @@ def vllm_free_gpu_memory(): except: del backend_history[backend_key] print("Failed to connect to backend. 
Will start a new one.", flush=True) - + if not connected: with open(os.devnull, "w") as devnull: frac_static = 0.8 if purpose == "responses" else 0.7 @@ -410,10 +416,10 @@ def vllm_free_gpu_memory(): min_gpus_per_instance = ( 2 if model_size <= 30 else 4 if model_size <= 80 else 8 ) - + if os.environ.get("FORCE_TP"): min_gpus_per_instance = int(os.environ.get("FORCE_TP")) - + assert num_gpus % min_gpus_per_instance == 0 args = [ "python", @@ -443,7 +449,9 @@ def vllm_free_gpu_memory(): if "smol" in model_repoid_or_path.lower(): args += ["--chat-template=chatml"] - print(f"Starting backend for {model_repoid_or_path} - {args}", flush=True) + print( + f"Starting backend for {model_repoid_or_path} - {args}", flush=True + ) if silent: new_env = os.environ.copy() @@ -453,8 +461,11 @@ def vllm_free_gpu_memory(): ) else: backend = subprocess.Popen(args) - - print(f"Registered backend with key {backend_key} at port {port}.", flush=True) + + print( + f"Registered backend with key {backend_key} at port {port}.", + flush=True, + ) backend_history[backend_key] = port with open(f"{root}/output/backend_history.json", "w") as f: json.dump(backend_history, f) @@ -463,8 +474,12 @@ def vllm_free_gpu_memory(): for _ in range(40): time.sleep(30) try: - print(f"Trying to connect to backend (at port {port})...", flush=True) - sgl.set_default_backend(sgl.RuntimeEndpoint(f"http://localhost:{port}")) + print( + f"Trying to connect to backend (at port {port})...", flush=True + ) + sgl.set_default_backend( + sgl.RuntimeEndpoint(f"http://localhost:{port}") + ) print("Connected to backend.", flush=True) break except: @@ -478,7 +493,11 @@ def vllm_free_gpu_memory(): @sgl.function def get_response( - s, conversation: List, temperature: float = 0.2, max_tokens: int = None, options: list = [] + s, + conversation: List, + temperature: float = 0.2, + max_tokens: int = None, + options: list = [], ) -> str: nonlocal purpose last_role = None @@ -487,25 +506,25 @@ def get_response( if turn["role"] == "assistant": s += sgl.assistant(turn["content"]) last_role = "assistant" - + elif turn["role"] == "user": s += sgl.user(turn["content"]) last_role = "user" - + elif turn["role"] == "system": s += sgl.system(turn["content"]) - + else: raise ValueError(f"Unknown role: {turn['role']}") if purpose == "responses" or options: assert last_role == "user" s += sgl.assistant_begin() - + if options: s += sgl.gen( "NA", - max_tokens=max(len(x) for x in options)+10, + max_tokens=max(len(x) for x in options) + 10, choices=options, ) @@ -526,7 +545,7 @@ def sglang_process_batch( When purpose is "responses", it will generate a response to the prompt text. The response will be stored in the "predict" field of the output dictionary, with all other fields staying the same. """ nonlocal purpose - + if not os.environ.get("ALLOW_EMPTY_INSTRUCTION") or not eval( os.environ.get("ALLOW_EMPTY_INSTRUCTION") ): @@ -542,7 +561,14 @@ def sglang_process_batch( del dic["input"] dialogues = dict_to_dialogue_list(sample_dicts, purpose) - options_lists = [(dic["predict"] if "predict" in dic and isinstance(dic["predict"], list) else []) for dic in sample_dicts] + options_lists = [ + ( + dic["predict"] + if "predict" in dic and isinstance(dic["predict"], list) + else [] + ) + for dic in sample_dicts + ] output = get_response.run_batch( [ { @@ -599,7 +625,7 @@ def sglang_process_batch( warnings.warn( f"{count} cases still not completed after 10 retries. Use NO_SGLANG=1 to disable sglang backend." 
) - + if count > 100 or count / len(output) > 0.01: raise Exception(f"Too many cases ({count}) still not completed.") @@ -608,49 +634,57 @@ def sglang_process_batch( if out.get_meta_info("NA") is None: failure_count += 1 continue - + if purpose == "logprobs": if "predict" in dic and isinstance(dic["predict"], list): dic["logprob"] = [ sum(x[0] for x in y if x[0] is not None) - for y in list(out.get_meta_info("NA")['input_token_logprobs']) + for y in list( + out.get_meta_info("NA")["input_token_logprobs"] + ) ] assert len(dic["logprob"]) == len(dic["predict"]) else: dic["logprob"] = sum( - x[0] for x in list(out.get_meta_info("NA")['input_token_logprobs']) if x[0] is not None + x[0] + for x in list( + out.get_meta_info("NA")["input_token_logprobs"] + ) + if x[0] is not None ) else: dic["predict"] = ( out["NA"] if out.get_meta_info("NA") is not None else None ) - + if failure_count > count: - raise Exception(f"More actual failures ({failure_count}) than cases not completed ({count}), which is unexpected.") - + raise Exception( + f"More actual failures ({failure_count}) than cases not completed ({count}), which is unexpected." + ) + return sample_dicts - + def sglang_free_gpu_memory(): """Wipe out all GPU memory used by the user.""" nonlocal backend_key - + # Remove the backend from the history with open(f"{root}/output/backend_history.json", "r") as f: backend_history = json.load(f) - + backend_history.pop(backend_key) with open(f"{root}/output/backend_history.json", "w") as f: json.dump(backend_history, f) - + # Kill the backend process try: backend.kill() except: print("backend.kill() failed.") - + MY_USERNAME = pwd.getpwuid(os.getuid()).pw_name print(f"Killing all processes on GPU for user {MY_USERNAME}.") - + devices = Device.cuda.all() signal.signal(signal.SIGCHLD, signal.SIG_IGN) for device in devices: @@ -658,7 +692,7 @@ def sglang_free_gpu_memory(): processes = GpuProcess.take_snapshots(processes.values(), failsafe=True) for process in processes: if process.username.lower() == MY_USERNAME.lower(): - print(f'Killing process {process.pid}: {process.cmdline}') + print(f"Killing process {process.pid}: {process.cmdline}") os.kill(process.pid, signal.SIGTERM) os.kill(process.pid, signal.SIGINT) os.kill(process.pid, signal.SIGKILL) @@ -669,7 +703,8 @@ def sglang_free_gpu_memory(): def dict_to_dialogue_list( - dic: Union[dict, List[dict]], purpose: Literal["responses", "logprobs"] = "responses" + dic: Union[dict, List[dict]], + purpose: Literal["responses", "logprobs"] = "responses", ) -> Union[List[Dict[str, str]], List[List[Dict[str, str]]]]: """Transform a dictionary into a list of dialogue turns in OpenAI format. 
@@ -680,25 +715,36 @@ def dict_to_dialogue_list( """ if isinstance(dic, dict): res = [] - + if "system" in dic: res = [{"role": "system", "content": dic["system"]}] - + if "history" in dic: for turn in dic["history"]: res.append({"role": "user", "content": turn[0]}) res.append({"role": "assistant", "content": turn[1]}) - + if "input" in dic or "instruction" in dic: input = dic.get("input", "") instruction = dic.get("instruction", "") - res.append({"role": "user", "content": input + ("\n\n" if input and instruction else "") + instruction}) - - if purpose == "logprobs" and "predict" in dic and isinstance(dic["predict"], str): + res.append( + { + "role": "user", + "content": input + + ("\n\n" if input and instruction else "") + + instruction, + } + ) + + if ( + purpose == "logprobs" + and "predict" in dic + and isinstance(dic["predict"], str) + ): res.append({"role": "assistant", "content": dic["predict"]}) elif "output" in dic: res.append({"role": "assistant", "content": dic["output"]}) - + return res return [dict_to_dialogue_list(d) for d in dic] @@ -715,34 +761,38 @@ def fill_in_QA_template( :param instruction: The task instruction, defaults to "". Either this or full_dict must be provided. :type instruction: str, optional - + :param input: Supplementary input to the task, defaults to "". :type input: str, optional - + :param suffix: Suffix to add to the prompt, defaults to "". :type suffix: str, optional - + :param full_dict: The full dictionary containing the instruction and input, defaults to None. Either this or instruction must be provided. If this is provided, instruction, input, and suffix will be ignored. :type full_dict: dict, optional - + :param model_repoid_or_path: The model repo ID or path (e.g., "meta-llama/Meta-Llama-3-8B-Instruct"), or one of the special values "alpaca" or "mistral" or "llama3", defaults to "alpaca". :type model_repoid_or_path: Union[Literal["alpaca", "mistral", "llama3"], str], optional - + :return: The prompt with the instruction and input filled in. :rtype: str """ instruction = instruction.strip() input = input.strip() - + # Convert full_dict to instruction and input if full_dict and model_repoid_or_path in ["alpaca", "mistral"]: - assert "history" not in full_dict, "History field not supported with alpaca/mistral template." - assert "system" not in full_dict, "System field not supported with alpaca/mistral template." - + assert ( + "history" not in full_dict + ), "History field not supported with alpaca/mistral template." + assert ( + "system" not in full_dict + ), "System field not supported with alpaca/mistral template." + instruction = full_dict.get("instruction", "") input = full_dict.get("input", "") - + if input and not instruction: warnings.warn("Swapping instruction and input fields.") instruction, input = input, instruction @@ -775,7 +825,7 @@ def fill_in_QA_template( else: if model_repoid_or_path == "llama3": model_repoid_or_path = "meta-llama/Meta-Llama-3-8B-Instruct" - + if suffix: warnings.warn( f"Suffix not supported except with mistral template. Ignoring suffix." 
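As a reading aid for the `src/abstractions/backends.py` hunks above: the `dict_to_dialogue_list` helper being reformatted converts a single alpaca-style sample dict into an OpenAI-format message list. The sketch below illustrates the mapping it implements, based only on the branches visible in this diff; the import path and the sample contents are assumptions made for illustration, not part of the patch.

```python
# Illustrative sketch only -- the import path and the sample values are assumed,
# not taken from this patch.
from src.abstractions.backends import dict_to_dialogue_list

sample = {
    "system": "You are a helpful assistant.",
    "history": [["What is the capital of France?", "Paris."]],
    "instruction": "Is Eiffel Tower in Paris?",
    "input": "",
    "predict": "Yes.",
}

messages = dict_to_dialogue_list(sample, purpose="logprobs")
# Expected result, following the branches shown in the hunk:
# [
#   {"role": "system", "content": "You are a helpful assistant."},
#   {"role": "user", "content": "What is the capital of France?"},
#   {"role": "assistant", "content": "Paris."},
#   {"role": "user", "content": "Is Eiffel Tower in Paris?"},
#   {"role": "assistant", "content": "Yes."},  # appended only because purpose="logprobs" and "predict" is a str
# ]
```

This is the same message shape that `sglang_process_batch` feeds to `get_response.run_batch` in the hunks above, which is why these formatting-only changes leave runtime behavior unchanged.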
diff --git a/src/abstractions/configs/templates_configs.py b/src/abstractions/configs/templates_configs.py index b5b530e..821bc96 100644 --- a/src/abstractions/configs/templates_configs.py +++ b/src/abstractions/configs/templates_configs.py @@ -6,17 +6,17 @@ class GlobalState: - + # Public variables continuous_backend: bool = False - + # Private variables __active_backend_destroyers: List[Callable[[], None]] = [] def __init__(self, **kwargs: Dict[str, Any]): """ Temporarily set global state variables of ProgressGym for the duration of a context manager block. - + Example: ``` with GlobalState(continuous_backend=True): @@ -33,15 +33,16 @@ def __enter__(self): def __exit__(self, type, value, traceback): for k, v in self.prior_state.items(): setattr(GlobalState, k, v) - + for destroy in GlobalState.__active_backend_destroyers: destroy() - + GlobalState.__active_backend_destroyers = [] - + def register_destroyer(destroyer: Callable[[], None]): GlobalState.__active_backend_destroyers.append(destroyer) + bash_command_template = f"""PYTHONNOUSERSITE=1 MASTER_PORT=9902 conda run --no-capture-output -n %s deepspeed %s --master_port=9902 {root}/libs/llama_factory/src/train_bash.py \\ --deepspeed %s \\ --ddp_timeout 180000000 \\ @@ -157,33 +158,35 @@ def register_destroyer(destroyer: Callable[[], None]): """ ) -with open(f"{root}/src/abstractions/configs/abstractions_config.json", "r") as config_file: +with open( + f"{root}/src/abstractions/configs/abstractions_config.json", "r" +) as config_file: abstractions_config = json.load(config_file) - + data_search_paths: List[str] = abstractions_config["data_search_paths"] data_save_path: str = abstractions_config["data_save_path"] - + if not os.path.exists(data_save_path): data_save_path = f"{root}/" + data_save_path if not os.path.exists(data_save_path): print(f"Data save path {data_save_path} doesn't exist. Creating it.") os.makedirs(data_save_path) - + for i, path in enumerate(data_search_paths): if not os.path.exists(path): - data_search_paths[i] = f"{root}/" + path - + data_search_paths[i] = f"{root}/" + path + model_search_paths: List[str] = abstractions_config["model_search_paths"] model_save_path: str = abstractions_config["model_save_path"] - + if not os.path.exists(model_save_path): model_save_path = f"{root}/" + model_save_path if not os.path.exists(model_save_path): print(f"Model save path {model_save_path} doesn't exist. Creating it.") os.makedirs(model_save_path) - + for i, path in enumerate(model_search_paths): if not os.path.exists(path): model_search_paths[i] = f"{root}/" + path - + multinode_master_addr: str = abstractions_config["multinode_master_addr"] diff --git a/src/abstractions/data.py b/src/abstractions/data.py index 0b0f5e2..c753e44 100644 --- a/src/abstractions/data.py +++ b/src/abstractions/data.py @@ -56,7 +56,7 @@ class Data: name2data: Dict[str, Any] = {} always_force_rewrite: bool = True data_type: Literal["pretrain", "sft", "preference"] - + default_key_fields = { "prompt": "instruction", "query": "input", @@ -153,7 +153,9 @@ def __init__( print( f"Data {data_name} not found locally. Searching among Llama-Factory datasets." ) - with open(f"{root}/libs/llama_factory/data/dataset_info.json", "r") as in_file: + with open( + f"{root}/libs/llama_factory/data/dataset_info.json", "r" + ) as in_file: registrations = json.load(in_file) if self.data_name in registrations: @@ -174,7 +176,7 @@ def __init__( def copy(self, data_name: str = None) -> "Data": """ - Returns a copy of the current Data instance. 
+ Returns a copy of the current Data instance. Shallow copy if data_name is not provided or identical to the current data_name; deep copy otherwise. """ if data_name and data_name != self.data_name: @@ -184,16 +186,18 @@ def copy(self, data_name: str = None) -> "Data": cp = Data(data_name, self.data_type, new_data_path) else: cp = Data(self.data_name, self.data_type, self.data_path) - + cp.key_fields = self.key_fields.copy() return cp - + def to_openai_format(self) -> Iterable[List[Dict[str, str]]]: """ Convert the data to OpenAI format, where each dialogue is a list of dictionaries with string keys and string values. Each dictionary represents a dialogue turn. """ - convert_fn: Callable[[Dict], List[Dict]] = partial(dict_to_dialogue_list, purpose="logprobs") + convert_fn: Callable[[Dict], List[Dict]] = partial( + dict_to_dialogue_list, purpose="logprobs" + ) for element in self.all_passages(): yield convert_fn(element) @@ -236,11 +240,11 @@ def transform( """ out_path = f"{root}/output/datasets/{result_data_name}.json" if self.data_name == result_data_name or self.data_path == out_path: - if eval(os.environ.get('LOUD_BACKEND', 'False')): + if eval(os.environ.get("LOUD_BACKEND", "False")): warnings.warn( f"Data name {result_data_name} is the same as the current data name. The old instance will be invalidated." ) - + return self.copy("temp_transform_artifact").transform( transformation, result_data_name, @@ -249,7 +253,7 @@ def transform( keep_key_fields, map_key_fields, ) - + Data.ask_and_remove_if_exists(out_path, forced_rewrite) def write_dict(sample_dict: Dict): @@ -258,23 +262,27 @@ def write_dict(sample_dict: Dict): is_first = False out_file.write(json.dumps(sample_dict)) # out_file.flush() - + def map_key_fields_fn(sample_dict: Dict) -> Dict: nonlocal self for k, v in self.default_key_fields.items(): - if k in self.key_fields and self.key_fields.get(k, v) != v and self.key_fields[k] in sample_dict: + if ( + k in self.key_fields + and self.key_fields.get(k, v) != v + and self.key_fields[k] in sample_dict + ): sample_dict[v] = sample_dict[self.key_fields[k]] del sample_dict[self.key_fields[k]] - + return sample_dict - + def inv_map_key_fields_fn(sample_dict: Dict) -> Dict: nonlocal self for k, v in self.default_key_fields.items(): if v in sample_dict and self.key_fields.get(k, v) != v: sample_dict[self.key_fields[k]] = sample_dict[v] del sample_dict[v] - + return sample_dict with open(out_path, "w") as out_file: @@ -287,10 +295,14 @@ def inv_map_key_fields_fn(sample_dict: Dict) -> Dict: element = copy.deepcopy(element) if map_key_fields: element = map_key_fields_fn(element) - + transformed = transformation(element) if transformed is not None: - write_dict(transformed if not map_key_fields else inv_map_key_fields_fn(transformed)) + write_dict( + transformed + if not map_key_fields + else inv_map_key_fields_fn(transformed) + ) else: buffer = [] @@ -300,17 +312,23 @@ def inv_map_key_fields_fn(sample_dict: Dict) -> Dict: element = copy.deepcopy(element) if map_key_fields: element = map_key_fields_fn(element) - + buffer.append(element) if len(buffer) == max_batch_size: for e in transformation(buffer): - write_dict(e if not map_key_fields else inv_map_key_fields_fn(e)) + write_dict( + e + if not map_key_fields + else inv_map_key_fields_fn(e) + ) buffer = [] out_file.flush() if buffer: for e in transformation(buffer): - write_dict(e if not map_key_fields else inv_map_key_fields_fn(e)) + write_dict( + e if not map_key_fields else inv_map_key_fields_fn(e) + ) out_file.write("\n]") @@ -322,47 
+340,66 @@ def inv_map_key_fields_fn(sample_dict: Dict) -> Dict: def move_current_to_history(self, out_of_place: bool = False) -> "Data": """ Move the current dialogue turn in the prompt/question field and the response/predict field to the history field. - + :param out_of_place: Whether to perform the operation out-of-place. If out_of_place is True, the original data will not be modified, and a new Data instance with an annotated name will be returned. Otherwise, the original data will be modified in-place, and the same Data instance will be returned. :type out_of_place: bool = False - + :return: The data after the operation. :rtype: Data. """ + def move_to_history_fn(sample_dict: Dict) -> Dict: - if sample_dict.get("instruction", "") or sample_dict.get("input", "") or sample_dict.get("output", "") or sample_dict.get("predict", ""): - assert (sample_dict.get("instruction", "") or sample_dict.get("input", "")) and (sample_dict.get("output", "") or sample_dict.get("predict", "")) + if ( + sample_dict.get("instruction", "") + or sample_dict.get("input", "") + or sample_dict.get("output", "") + or sample_dict.get("predict", "") + ): + assert ( + sample_dict.get("instruction", "") or sample_dict.get("input", "") + ) and (sample_dict.get("output", "") or sample_dict.get("predict", "")) sample_dict["history"] = sample_dict.get("history", []) + [ [ - sample_dict.get("instruction", "") + - ("\n\n" if "instruction" in sample_dict and "input" in sample_dict else "") + - sample_dict.get("input", ""), - sample_dict.get("output", "") + sample_dict.get("predict", "") + sample_dict.get("instruction", "") + + ( + "\n\n" + if "instruction" in sample_dict and "input" in sample_dict + else "" + ) + + sample_dict.get("input", ""), + sample_dict.get("output", "") + sample_dict.get("predict", ""), ] ] sample_dict.pop("instruction", None) sample_dict.pop("input", None) sample_dict.pop("output", None) sample_dict.pop("predict", None) - + return sample_dict - + new_data_name = (self.data_name + "_moved") if out_of_place else self.data_name - return self.transform(move_to_history_fn, new_data_name, forced_rewrite=True, map_key_fields=True) - - def switch_role_to_user(self, user_system_prompt: Union[str, Iterable[str]] = None, dialogue_starter: Union[str, Iterable[str]] = None, out_of_place: bool = False) -> "Data": + return self.transform( + move_to_history_fn, new_data_name, forced_rewrite=True, map_key_fields=True + ) + + def switch_role_to_user( + self, + user_system_prompt: Union[str, Iterable[str]] = None, + dialogue_starter: Union[str, Iterable[str]] = None, + out_of_place: bool = False, + ) -> "Data": """ Switch the prompt/question field and the response/predict field, thereby shifting the dialogue turn from the assistant to the user. - + :param user_system_prompt: The system prompt of the user role. Can be a single string or an iterable of strings, where each string corresponds to the prompt for a different sample in the dataset. If None, a default prompt will be used. :type user_system_prompt: Union[str, Iterable[str]] = None - + :param dialogue_starter: Placeholder message for the "zeroth" dialogue turn by the assistant that prompts the user to start the conversation. :type dialogue_starter: str = None - + :param out_of_place: Whether to perform the operation out-of-place. If out_of_place is True, the original data will not be modified, and a new Data instance with an annotated name will be returned. Otherwise, the original data will be modified in-place, and the same Data instance will be returned. 
:type out_of_place: bool = False - + :return: The data after the operation. :rtype: Data. """ @@ -370,41 +407,68 @@ def switch_role_to_user(self, user_system_prompt: Union[str, Iterable[str]] = No user_system_prompt = "You are an assistant tasked with questioning the user, aka your partner. Ask informed questions to guide the conversation, follow up on the user's responses, and generally follow a natural conversation flow. Don't be too courteous; be concise." elif isinstance(user_system_prompt, list): user_system_prompt = iter(user_system_prompt) - + if dialogue_starter is None: dialogue_starter = "I am your partner. Please start the conversation." elif isinstance(dialogue_starter, list): dialogue_starter = iter(dialogue_starter) - + moved_to_history = self.move_current_to_history(out_of_place) - + def switch_role_to_user_fn(sample_dict: Dict) -> Dict: - assert not (sample_dict.get("instruction", "") or sample_dict.get("input", "") or sample_dict.get("output", "") or sample_dict.get("predict", "")) - - current_user_system_prompt = user_system_prompt if isinstance(user_system_prompt, str) else next(user_system_prompt) - current_dialogue_starter = dialogue_starter if isinstance(dialogue_starter, str) else next(dialogue_starter) - - all_histories = [h[i] for h in sample_dict.get("history", []) for i in range(2)] + assert not ( + sample_dict.get("instruction", "") + or sample_dict.get("input", "") + or sample_dict.get("output", "") + or sample_dict.get("predict", "") + ) + + current_user_system_prompt = ( + user_system_prompt + if isinstance(user_system_prompt, str) + else next(user_system_prompt) + ) + current_dialogue_starter = ( + dialogue_starter + if isinstance(dialogue_starter, str) + else next(dialogue_starter) + ) + + all_histories = [ + h[i] for h in sample_dict.get("history", []) for i in range(2) + ] all_histories = [current_dialogue_starter] + all_histories assert len(all_histories) % 2 == 1 - sample_dict["history"] = [[all_histories[i], all_histories[i + 1]] for i in range(0, len(all_histories)-1, 2)] + sample_dict["history"] = [ + [all_histories[i], all_histories[i + 1]] + for i in range(0, len(all_histories) - 1, 2) + ] sample_dict["instruction"] = all_histories[-1] sample_dict["system"] = current_user_system_prompt return sample_dict - + new_data_name = (self.data_name + "_user") if out_of_place else self.data_name - return moved_to_history.transform(switch_role_to_user_fn, new_data_name, forced_rewrite=True, map_key_fields=True) - - def switch_role_to_assistant(self, assistant_system_prompt: Union[str, Iterable[str]] = None, out_of_place: bool = False) -> "Data": + return moved_to_history.transform( + switch_role_to_user_fn, + new_data_name, + forced_rewrite=True, + map_key_fields=True, + ) + + def switch_role_to_assistant( + self, + assistant_system_prompt: Union[str, Iterable[str]] = None, + out_of_place: bool = False, + ) -> "Data": """ Switch the prompt/question field and the response/predict field, thereby shifting the dialogue turn from the user to the assistant. - + :param assistant_system_prompt: The system prompt of the assistant role. Can be a single string or an iterable of strings, where each string corresponds to the prompt for a different sample in the dataset. If None, a default prompt will be used. :type assistant_system_prompt: Union[str, Iterable[str]] = None - + :param out_of_place: Whether to perform the operation out-of-place. If out_of_place is True, the original data will not be modified, and a new Data instance with an annotated name will be returned. 
Otherwise, the original data will be modified in-place, and the same Data instance will be returned. :type out_of_place: bool = False - + :return: The data after the operation. :rtype: Data. """ @@ -412,66 +476,101 @@ def switch_role_to_assistant(self, assistant_system_prompt: Union[str, Iterable[ assistant_system_prompt = "Please answer the user's questions. Be concise and not overly courteous, but be informative and provide all necessary details." elif isinstance(assistant_system_prompt, list): assistant_system_prompt = iter(assistant_system_prompt) - + moved_to_history = self.move_current_to_history(out_of_place) - + def switch_role_to_assistant_fn(sample_dict: Dict) -> Dict: - assert not (sample_dict.get("instruction", "") or sample_dict.get("input", "") or sample_dict.get("output", "") or sample_dict.get("predict", "")) - - current_assistant_system_prompt = assistant_system_prompt if isinstance(assistant_system_prompt, str) else next(assistant_system_prompt) - - all_histories = [h[i] for h in sample_dict.get("history", []) for i in range(2)] + assert not ( + sample_dict.get("instruction", "") + or sample_dict.get("input", "") + or sample_dict.get("output", "") + or sample_dict.get("predict", "") + ) + + current_assistant_system_prompt = ( + assistant_system_prompt + if isinstance(assistant_system_prompt, str) + else next(assistant_system_prompt) + ) + + all_histories = [ + h[i] for h in sample_dict.get("history", []) for i in range(2) + ] assert len(all_histories) % 2 == 0 - sample_dict["history"] = [[all_histories[i], all_histories[i + 1]] for i in range(1, len(all_histories)-1, 2)] + sample_dict["history"] = [ + [all_histories[i], all_histories[i + 1]] + for i in range(1, len(all_histories) - 1, 2) + ] sample_dict["instruction"] = all_histories[-1] sample_dict["system"] = current_assistant_system_prompt return sample_dict - new_data_name = (self.data_name + "_assistant") if out_of_place else self.data_name - return moved_to_history.transform(switch_role_to_assistant_fn, new_data_name, forced_rewrite=True, map_key_fields=True) - - def append_content(self, field_key: str, content: Union[str, Iterable[str]], out_of_place: bool = False, map_key_fields: bool = False) -> "Data": + new_data_name = ( + (self.data_name + "_assistant") if out_of_place else self.data_name + ) + return moved_to_history.transform( + switch_role_to_assistant_fn, + new_data_name, + forced_rewrite=True, + map_key_fields=True, + ) + + def append_content( + self, + field_key: str, + content: Union[str, Iterable[str]], + out_of_place: bool = False, + map_key_fields: bool = False, + ) -> "Data": """ Append content to a specified field in the dataset. - + :param field_key: The key of the field to append content to. :type field_key: str - + :param content: The content to append. Can be a single string or an iterable of strings, where each string corresponds to the content to append for a different sample in the dataset. :type content: Union[str, Iterable[str]] - + :param out_of_place: Whether to perform the operation out-of-place. If out_of_place is True, the original data will not be modified, and a new Data instance with an annotated name will be returned. Otherwise, the original data will be modified in-place, and the same Data instance will be returned. :type out_of_place: bool = False - + :param map_key_fields: Whether to map the key fields to the default key fields before appending content. :type map_key_fields: bool = False - + :return: The data after the operation. :rtype: Data. 
""" if isinstance(content, list): content = iter(content) - + def append_content_fn(sample_dict: Dict) -> Dict: current_content = content if isinstance(content, str) else next(content) sample_dict[field_key] = sample_dict.get(field_key, "") + current_content return sample_dict - - new_data_name = (self.data_name + "_appended") if out_of_place else self.data_name - return self.transform(append_content_fn, new_data_name, forced_rewrite=True, map_key_fields=map_key_fields) - + + new_data_name = ( + (self.data_name + "_appended") if out_of_place else self.data_name + ) + return self.transform( + append_content_fn, + new_data_name, + forced_rewrite=True, + map_key_fields=map_key_fields, + ) + def filter_incomplete_samples(self, out_of_place: bool = False) -> "Data": """ Remove the samples that has at least one of the key fields missing. - + :param out_of_place: Whether to perform the operation out-of-place. If out_of_place is True, the original data will not be modified, and a new Data instance with an annotated name will be returned. Otherwise, the original data will be modified in-place, and the same Data instance will be returned. :type out_of_place: bool = False - + :return: The data after the operation. :rtype: Data. """ total_count = 0 failure_count = 0 + def filter_incomplete_samples_fn(sample_dict: Dict) -> Dict: nonlocal self, total_count, failure_count total_count += 1 @@ -479,16 +578,27 @@ def filter_incomplete_samples_fn(sample_dict: Dict) -> Dict: if k not in sample_dict: failure_count += 1 return None - + return sample_dict - - new_data_name = (self.data_name + "_filtered") if out_of_place else self.data_name - result = self.transform(filter_incomplete_samples_fn, new_data_name, forced_rewrite=True, map_key_fields=False) - if failure_count and (eval(os.environ.get("LOUD_BACKEND", "0")) or failure_count * 8 > total_count): - warnings.warn(f"Removed {failure_count} out of {total_count} samples due to missing key fields.") - + + new_data_name = ( + (self.data_name + "_filtered") if out_of_place else self.data_name + ) + result = self.transform( + filter_incomplete_samples_fn, + new_data_name, + forced_rewrite=True, + map_key_fields=False, + ) + if failure_count and ( + eval(os.environ.get("LOUD_BACKEND", "0")) or failure_count * 8 > total_count + ): + warnings.warn( + f"Removed {failure_count} out of {total_count} samples due to missing key fields." + ) + return result - + def manage_llama_factory_registration( self, operation: Literal["add", "remove", "query"], forced_update: bool = True ) -> bool: @@ -543,7 +653,9 @@ def manage_llama_factory_registration( f"Adding registration of data {self.data_name}: {registrations[self.data_name]}." 
) - with open(f"{root}/libs/llama_factory/data/dataset_info.json", "w") as out_file: + with open( + f"{root}/libs/llama_factory/data/dataset_info.json", "w" + ) as out_file: json.dump(registrations, out_file) print(f"Successfully completed registration of data {self.data_name}.") @@ -558,7 +670,9 @@ def manage_llama_factory_registration( path = f'{root}/libs/llama_factory/data/{registrations[self.data_name]["file_name"]}' del registrations[self.data_name] - with open(f"{root}/libs/llama_factory/data/dataset_info.json", "w") as out_file: + with open( + f"{root}/libs/llama_factory/data/dataset_info.json", "w" + ) as out_file: json.dump(registrations, out_file) if os.path.exists(path): @@ -596,7 +710,7 @@ def set_key_fields( :param system_field_name: The name of the system field :type system_field_name: Optional[str] = None - + :param history_field_name: The name of the history field :type history_field_name: Optional[str] = None @@ -634,7 +748,7 @@ def set_key_fields( del self.key_fields["system"] elif system_field_name: self.key_fields["system"] = system_field_name - + if history_field_name == "" and "history" in self.key_fields: del self.key_fields["history"] elif history_field_name: diff --git a/src/abstractions/model.py b/src/abstractions/model.py index 8036d36..861a67c 100644 --- a/src/abstractions/model.py +++ b/src/abstractions/model.py @@ -51,7 +51,7 @@ def inference_standalone( if not eval(os.environ.get("LOUD_BACKEND", "False")): sys.stdout = devnull sys.stderr = devnull - + backend, process_batch, mopup_memory = start_inference_backend( model_path, backend_type, @@ -59,7 +59,7 @@ def inference_standalone( template_type=template_type, purpose=purpose, ) - + if GlobalState.continuous_backend: print("Continuous backend is enabled.") GlobalState.register_destroyer(mopup_memory) @@ -72,7 +72,9 @@ def inference_standalone( prompt_field_name=prompt_field_name, query_field_name=query_field_name ) result_data = data.transform( - transformation=partial(process_batch, temperature=temperature, max_tokens=max_tokens), + transformation=partial( + process_batch, temperature=temperature, max_tokens=max_tokens + ), result_data_name=result_data_name, forced_rewrite=( Model.always_force_rewrite @@ -87,7 +89,7 @@ def inference_standalone( print("Memory mopup done.") if conn is not None: conn.send(result_data.data_path) - + sys.stdout, sys.stderr = old_stdout, old_stderr return result_data.data_path @@ -154,7 +156,7 @@ def __init__( if os.environ.get("DEFAULT_TEMPLATE") and not template_type: template_type = os.environ["DEFAULT_TEMPLATE"].lower() assert template_type in ["auto", "alpaca", "mistral", "llama3"] - + if not num_gpus: num_gpus = torch.cuda.device_count() @@ -233,7 +235,9 @@ def deep_copy( if dest_suffix else dest_full_name ) - copied_path = os.path.join(os.path.join(root, "output", dest_subdir), copied_name) + copied_path = os.path.join( + os.path.join(root, "output", dest_subdir), copied_name + ) Model.ask_and_remove_if_exists(copied_path, forced_rewrite=False) if not os.path.exists(copied_path): shutil.copytree(path, copied_path) @@ -380,7 +384,7 @@ def finetune( ), "For RLHF, ppo_data must be an instance of Data or not provided at all." 
else: raise ValueError(f"Unsupported stage {stage}.") - + data = data.filter_incomplete_samples(out_of_place=True) if lr is None: @@ -453,7 +457,11 @@ def finetune( "", # do sample; ignored here self.model_path, # where to find the original model data.data_name, # dataset (automatically registered in llama-factory) - (f"\n --template {self.template_type} \\" if self.template_type != "auto" else ""), # template type + ( + f"\n --template {self.template_type} \\" + if self.template_type != "auto" + else "" + ), # template type ("lora" if algo == "lora" else "full"), # type - full_param or lora f"{root}/output/training_results/{escape(result_model_name)}/", # where to save the training results (and checkpoints etc.) 2 @@ -528,7 +536,11 @@ def finetune( "pa38-lf", self.model_path, result.model_path, - (f"\n --template {self.template_type} \\" if self.template_type != "auto" else ""), # template type + ( + f"\n --template {self.template_type} \\" + if self.template_type != "auto" + else "" + ), # template type merged_model_path, ) print(cmd) @@ -594,7 +606,11 @@ def __rlhf( f"{root}/src/abstractions/configs/LF_examples/full_multi_gpu/ds_z3_config.json", rw_path, rw_data.data_name, - (f"\n --template {self.template_type} \\" if self.template_type != "auto" else ""), # template type + ( + f"\n --template {self.template_type} \\" + if self.template_type != "auto" + else "" + ), # template type rw_results, 2 ** max(0, 3 + batch_size_multiplier_log2), # per_device_train_batch_size 2 ** max(0, 4 + batch_size_multiplier_log2), # per_device_eval_batch_size @@ -643,7 +659,11 @@ def __rlhf( rw_results, "lora" if use_lora else "full", ppo_data.data_name, - (f"\n --template {self.template_type} \\" if self.template_type != "auto" else ""), # template type + ( + f"\n --template {self.template_type} \\" + if self.template_type != "auto" + else "" + ), # template type the_path, 2 ** max(0, 1 + batch_size_multiplier_log2), # per_device_train_batch_size 2 ** max(0, 2 + batch_size_multiplier_log2), # per_device_eval_batch_size @@ -706,10 +726,10 @@ def inference( :param temperature: The temperature parameter. :type temperature: float = 0.25 - + :param max_tokens: The maximum number of tokens to generate. Ignored if purpose is "logprobs". :type max_tokens: int = 8192 - + :param purpose: The purpose of the inference. It can be "responses" or "logprobs". If "logprobs", the log probability of the prompt itself (and the assistant response supplied in the `predict` field, if exists) is returned in the `logprob` field of the resulting dataset, without doing any completion. If "responses", the completion text is saved in the `predict` field of the resulting dataset. :type purpose: Literal["responses", "logprobs"] = "responses" @@ -754,7 +774,7 @@ def inference( "Logprobs are only supported with backend=sglang. Switching to sglang backend." ) backend = "sglang" - + if input_is_data: assert ( data.data_type != "pretrain" or backend == "deepspeed" @@ -809,7 +829,13 @@ def inference( return result def __inference_parallel_segregated( - self, data: Data, result_data_name: str, temperature: float, max_tokens: int, backend_type: str, purpose: str + self, + data: Data, + result_data_name: str, + temperature: float, + max_tokens: int, + backend_type: str, + purpose: str, ) -> Data: """sglang/vllm implementation for `inference()`, but performed in a separate process to free up GPU memory. 
This is the recommended implementation, due to its superior speed and robustness.""" data_path = data.data_path @@ -825,7 +851,7 @@ def __inference_parallel_segregated( if eval(os.environ.get("LOUD_BACKEND", "False")): print(f"GlobalState.continuous_backend = {GlobalState.continuous_backend}") - + if not GlobalState.continuous_backend: # run inference_standalone in a separate process multiprocessing.set_start_method("spawn", force=True) @@ -866,7 +892,7 @@ def __inference_parallel_segregated( purpose, None, ) - + print("Inference results saved at ", result_data_path) return Data( @@ -887,7 +913,9 @@ def __inference_parallel_deepspeed( operation="add" ) - result_data_path = f"{root}/output/inference_results/{escape(result_data_name)}/" + result_data_path = ( + f"{root}/output/inference_results/{escape(result_data_name)}/" + ) # run prediction deepspeed_args = ( @@ -904,7 +932,11 @@ def __inference_parallel_deepspeed( "\n--do_sample \\", # do sample self.model_path, # where to save the resulting model data.data_name, # dataset (automatically registered in llama-factory) - (f"\n --template {self.template_type} \\" if self.template_type != "auto" else ""), # template type + ( + f"\n --template {self.template_type} \\" + if self.template_type != "auto" + else "" + ), # template type "full", # type - full_param or lora; useless here result_data_path, # where to save the inference results 2 ** max(0, 3 + batch_size_multiplier_log2), # per_device_train_batch_size @@ -1002,7 +1034,9 @@ def __inference_serial( """Serial implementation for `inference()`.""" data_name = result_data_name # self.model_name + "_inference_output" - os.makedirs(os.path.join(root, "output", "inference_results", "inf"), exist_ok=True) + os.makedirs( + os.path.join(root, "output", "inference_results", "inf"), exist_ok=True + ) data_path = os.path.join( root, "output", "inference_results", "inf", data_name + ".json" ) @@ -1048,7 +1082,9 @@ def __inference_serial( else list(result_data.all_passages()) ) - def evaluate(self, method: Literal["fast", "dummy"] = "fast", logprobs=True) -> np.ndarray: + def evaluate( + self, method: Literal["fast", "dummy"] = "fast", logprobs=True + ) -> np.ndarray: """ Returns a high-dimensional vector representing morality preference of the model. Choose "dummy" for fast debugging runs. """ @@ -1061,7 +1097,7 @@ def evaluate(self, method: Literal["fast", "dummy"] = "fast", logprobs=True) -> f'Method {method} not recognized. Options are "fast" and "dummy".' ) - def __evaluate_fast(self, logprobs = True) -> np.ndarray: + def __evaluate_fast(self, logprobs=True) -> np.ndarray: if self.template_type != "alpaca": raise NotImplementedError( "Fast evaluation is only supported for models using alpaca template." 
@@ -1084,18 +1120,18 @@ def __evaluate_fast(self, logprobs = True) -> np.ndarray: else: evaluation_input = eval_utils.regenerate_inputs() p = "responses" - + print("evaluation query begins") evaluation_output = self.inference( evaluation_input, "evaluation_output_mc_" + self.model_name, backend="sglang", - purpose=p + purpose=p, ) print("answers at", evaluation_output.data_path) with open(evaluation_output.data_path, "r") as f: evaluation_output_data = json.load(f) - raw_stats = eval_utils.collect(evaluation_output_data, logprobs = logprobs) + raw_stats = eval_utils.collect(evaluation_output_data, logprobs=logprobs) with open( os.path.join(experiment_directory, self.model_name + "_raw.json"), "w" ) as f: @@ -1117,7 +1153,12 @@ def __evaluate_slow_moralchoice(self) -> np.ndarray: os.mkdir(os.path.join(root, "output", "evaluation_results")) directory = os.path.join( - root, "libs", "moralchoice", "data", "responses", self.model_name + "_single" + root, + "libs", + "moralchoice", + "data", + "responses", + self.model_name + "_single", ) if os.path.exists(directory) and ( diff --git a/src/download_models.py b/src/download_models.py index e0621fd..1214cf6 100644 --- a/src/download_models.py +++ b/src/download_models.py @@ -8,8 +8,16 @@ def download_model(model_name: str, save_path: str): with open(os.devnull, "w") as devnull: process = subprocess.Popen( ["huggingface-cli", "download", model_name, "--local-dir", save_path], - stdout=(devnull if not eval(os.environ.get("LOUD_BACKEND", "False")) else None), - stderr=(devnull if not eval(os.environ.get("LOUD_BACKEND", "False")) else None), + stdout=( + devnull + if not eval(os.environ.get("LOUD_BACKEND", "False")) + else None + ), + stderr=( + devnull + if not eval(os.environ.get("LOUD_BACKEND", "False")) + else None + ), ) process.wait() diff --git a/src/evaluation/figure.py b/src/evaluation/figure.py index 1924251..03fea2e 100644 --- a/src/evaluation/figure.py +++ b/src/evaluation/figure.py @@ -22,10 +22,12 @@ def get_dim(template_raw, template_out, idx): template1 = ( - f"{root}/" + "output/evaluation_results/8b-C0{n}-instruct_single/8b-C0{n}-instructraw.json" + f"{root}/" + + "output/evaluation_results/8b-C0{n}-instruct_single/8b-C0{n}-instructraw.json" ) template2 = ( - f"{root}/" + "output/evaluation_results/8b-C0{n}-instruct_single/8b-C0{n}-instructdim.json" + f"{root}/" + + "output/evaluation_results/8b-C0{n}-instruct_single/8b-C0{n}-instructdim.json" ) idx = [13, 14, 15, 16, 17, 18, 19, 20, 21] vecs = get_dim(template1, template2, idx) diff --git a/src/evaluation/quantify.py b/src/evaluation/quantify.py index 926cc83..287f94a 100644 --- a/src/evaluation/quantify.py +++ b/src/evaluation/quantify.py @@ -109,7 +109,11 @@ def __calculate_model(test_name, high_or_low, model_name): avg_vec = sum([np.array(x[1]) for x in mrl_vec.items()]) / len(mrl_vec.items()) with open( os.path.join( - root, "output", "evaluation_results", test_name, model_name + "_collected.json" + root, + "output", + "evaluation_results", + test_name, + model_name + "_collected.json", ), "w", ) as f: @@ -209,7 +213,7 @@ def calculate_model(test_dir, model_name, logprob=False): template[i] = -1 else: continue - #print("non-conflicting morality", row[0]) + # print("non-conflicting morality", row[0]) mrl_vec[0][name] = list(template * (mal)) for key in raw_dict.keys(): @@ -255,15 +259,17 @@ def calculate_model(test_dir, model_name, logprob=False): mrl_vec[2][key] += np.array(entry["repeat2_fav"][:4]) / ( entry["repeat2_fav"][-1] ) - mrl_vec[2][key] /= int(entry["4c_fav"][-1] != 
0) + int(entry["repeat2_fav"][-1] != 0) - + mrl_vec[2][key] /= int(entry["4c_fav"][-1] != 0) + int( + entry["repeat2_fav"][-1] != 0 + ) + if num == 1: # ref_dict = csv_to_dict_list(ref_dir[1], ['scenario_id', 'generation_theme']) ref_dict = csv_to_dict(ref_dir[1], ["generation_theme"]) if key not in ref_dict.keys(): print("key", key, "not found") continue - + """ registering invalid """ @@ -331,6 +337,7 @@ def calculate_model(test_dir, model_name, logprob=False): return res + def normalize_matrix(matrix, ranges): """ For each row, divide the elements from column ai to column bi by the sum of all elements in that row between columns ai and bi. @@ -338,23 +345,24 @@ def normalize_matrix(matrix, ranges): Arguments: matrix: An input matrix with m rows and n columns (numpy array). ranges: A list of tuples (ai, bi), where each tuple represents the column range (0-based index) for which the operation will be performed on each row. - + Returns: The processed matrix (numpy array). """ - matrix = np.array(matrix) + matrix = np.array(matrix) m, n = matrix.shape if ranges == None: - ranges = [(0, n-1)] + ranges = [(0, n - 1)] for row_idx in range(m): - for (ai, bi) in ranges: + for ai, bi in ranges: # calculate the sum of columns ai-bi for each row - sum_elements = np.sum(matrix[row_idx, ai:bi+1]) + sum_elements = np.sum(matrix[row_idx, ai : bi + 1]) if sum_elements != 0: # avoid division by zero - matrix[row_idx, ai:bi+1] /= sum_elements - + matrix[row_idx, ai : bi + 1] /= sum_elements + return matrix + def plot_parallel_coordinates(data, title, tuples): """ Parallel coordinate plot to visualize variations in high dimensional data @@ -381,7 +389,8 @@ def plot_parallel_coordinates(data, title, tuples): plt.show() plt.savefig(f"{root}/output/evaluation_results/figs/" + title + "_parr.png") -def plot_heatmap(data, title, label_set, tuples = None, norm = "column"): + +def plot_heatmap(data, title, label_set, tuples=None, norm="column"): """ Heatmap for list of 19-dimensional vector. 
@@ -390,18 +399,19 @@ def plot_heatmap(data, title, label_set, tuples = None, norm = "column"): """ if norm == "group": data = normalize_matrix(data, tuples) - #data -= np.mean(data, axis=0, keepdims=True) - #data /= np.std(data, axis=0, keepdims=True) + # data -= np.mean(data, axis=0, keepdims=True) + # data /= np.std(data, axis=0, keepdims=True) if norm == "column": data -= np.mean(data, axis=0, keepdims=True) data /= np.std(data, axis=0, keepdims=True) - #data /= np.sum(data, axis=0, keepdims=True) + # data /= np.sum(data, axis=0, keepdims=True) # Heatmap with appropriate labels plt.figure(figsize=(12, 8)) xlabels = None if label_set == 1: - xlabels = [ "Do not cause death", + xlabels = [ + "Do not cause death", "Do no inflict pain", "Do not disable", "Do not restrict freedom", @@ -410,21 +420,18 @@ def plot_heatmap(data, title, label_set, tuples = None, norm = "column"): "Do not cheat", "Do not break promise", "Do not violate law", - "Do not violate duties"] + "Do not violate duties", + ] elif label_set == 2: xlabels = [ "Harm/Care", "In Group & Loyalty", "Fairness & Reciprocity", "Authority & Respect", - "Purity & Sancity"] - elif label_set == 3: - xlabels = [ - "Traditional", - "Modern", - "Post-modern", - "Integrated" + "Purity & Sancity", ] + elif label_set == 3: + xlabels = ["Traditional", "Modern", "Post-modern", "Integrated"] ax = sns.heatmap( data, annot=True, @@ -439,6 +446,7 @@ def plot_heatmap(data, title, label_set, tuples = None, norm = "column"): plt.show() plt.savefig(f"{root}/output/evaluation_results/figs/" + title + "_heat.png") + def plot_vectors(vectors, dim_start, name): print(vectors) num_vectors = len(vectors) @@ -467,7 +475,9 @@ def plot_vectors(vectors, dim_start, name): plt.show() plt.savefig( - f"{root}/output/evaluation_results/figs/" + name_mapping[name] + "_line_real.png" + f"{root}/output/evaluation_results/figs/" + + name_mapping[name] + + "_line_real.png" ) @@ -487,7 +497,7 @@ def analyze_vectors_quadratic(vectors): p_values = [] positive_coefficients = [] negative_coefficients = [] - np.savetxt(f"{root}/output/evaluation_results/figs/quad.txt", vectors, fmt='%f') + np.savetxt(f"{root}/output/evaluation_results/figs/quad.txt", vectors, fmt="%f") for dim in range(num_dimensions): x = np.arange(len(vectors)) y = vectors[:, dim] @@ -513,10 +523,12 @@ def analyze_vectors_quadratic(vectors): # Print coefficients and p-values print("Quadratic coefficients and p-values for each dimension:") - with open(f"{root}/output/evaluation_results/figs/quad.txt", 'a') as f: + with open(f"{root}/output/evaluation_results/figs/quad.txt", "a") as f: for i, (coeffs, p_value) in enumerate(zip(coefficients, p_values)): print(f"Dimension {i + 1}: coefficients = {coeffs}, p-value = {p_value}") - f.write(f"Dimension {i + 1}: coefficients = {coeffs}, p-value = {p_value}\n\n") + f.write( + f"Dimension {i + 1}: coefficients = {coeffs}, p-value = {p_value}\n\n" + ) # Plot positive coefficients plt.figure(figsize=(14, 7)) diff --git a/src/evaluation/test_eval_01.py b/src/evaluation/test_eval_01.py index e4689dd..1e8e0ae 100644 --- a/src/evaluation/test_eval_01.py +++ b/src/evaluation/test_eval_01.py @@ -1,4 +1,5 @@ import os, json + # os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3" from src.path import root from ..abstractions import Model @@ -6,6 +7,7 @@ from multiprocessing import freeze_support from . 
import quantify as qt import numpy as np + """ generate_alpaca('mc', os.path.join(root, 'src', 'evaluation', 'raw_dataset', 'moralchoice')) generate_alpaca('views', os.path.join(root, 'src', 'evaluation', 'raw_dataset', 'views')) @@ -13,7 +15,7 @@ """ if __name__ == "__main__": freeze_support() - + set_model = [ "8B-C013-instruct", "8B-C014-instruct", @@ -23,21 +25,23 @@ "8B-C018-instruct", "8B-C019-instruct", "8B-C020-instruct", - "8B-C021-instruct" + "8B-C021-instruct", ] - #set_model = ["8B-C018-instruct"] + # set_model = ["8B-C018-instruct"] vec = [] for m in set_model: - #boi = Model(m) - #v = boi.evaluate(method="fast", logprobs = True) - - with open("output/datasets/evaluation_output_mc_" + m + ".json", 'r') as f: + # boi = Model(m) + # v = boi.evaluate(method="fast", logprobs = True) + + with open("output/datasets/evaluation_output_mc_" + m + ".json", "r") as f: d = json.load(f) raw = _collect(d) - with open(f'{root}/output/evaluation_results/' + m + '_single/' + m + '_raw.json', 'w') as f: + with open( + f"{root}/output/evaluation_results/" + m + "_single/" + m + "_raw.json", "w" + ) as f: json.dump(raw, f) - v = qt.calculate_model('output/evaluation_results/' + m + '_single/', m) - + v = qt.calculate_model("output/evaluation_results/" + m + "_single/", m) + vec.append(v) test_name = "8b_13to21" with open("output/evaluation_results/" + test_name + ".json", "w") as f: @@ -47,6 +51,6 @@ qt.analyze_vectors_quadratic(vec) # vec = json.load(open("output/evaluation_results/" + test_name + ".json", "r")) # qt.plot_parallel_coordinates(vec) - qt.plot_heatmap(vec[:, 10:15], test_name + '_foundation', label_set = 2, norm = "group") - qt.plot_heatmap(vec[:, 15:19], test_name + '_view',label_set = 3, norm = "group") - qt.plot_heatmap(vec[:, :10], test_name + '_morality', label_set = 1, norm = "column") + qt.plot_heatmap(vec[:, 10:15], test_name + "_foundation", label_set=2, norm="group") + qt.plot_heatmap(vec[:, 15:19], test_name + "_view", label_set=3, norm="group") + qt.plot_heatmap(vec[:, :10], test_name + "_morality", label_set=1, norm="column") diff --git a/src/evaluation/test_eval_02.py b/src/evaluation/test_eval_02.py index 2d138f1..bc04732 100644 --- a/src/evaluation/test_eval_02.py +++ b/src/evaluation/test_eval_02.py @@ -5,6 +5,7 @@ import src.evaluation.quantify as qt import numpy as np from src.path import root + """ generate_alpaca('mc', os.path.join(root, 'src', 'evaluation', 'raw_dataset', 'moralchoice')) generate_alpaca('views', os.path.join(root, 'src', 'evaluation', 'raw_dataset', 'views')) @@ -21,24 +22,28 @@ "8B-C018-instruct", "8B-C019-instruct", "8B-C020-instruct", - "8B-C021-instruct" + "8B-C021-instruct", ] - with open('src/evaluation/assets/input_alpaca.json', 'r') as f: + with open("src/evaluation/assets/input_alpaca.json", "r") as f: ref = json.load(f) display = [] for m in set_model: - with open(f"{root}/output/datasets/evaluation_output_mc_" + m + ".json", 'r') as f: + with open( + f"{root}/output/datasets/evaluation_output_mc_" + m + ".json", "r" + ) as f: d = json.load(f) - raw = collect(d, logprobs = True) - with open(f'{root}/output/evaluation_results/' + m + '_single/' + m + '_raw.json', 'w') as f: + raw = collect(d, logprobs=True) + with open( + f"{root}/output/evaluation_results/" + m + "_single/" + m + "_raw.json", "w" + ) as f: json.dump(raw, f) - v = qt.calculate_model(f'{root}/output/evaluation_results/' + m + '_single/', m) + v = qt.calculate_model(f"{root}/output/evaluation_results/" + m + "_single/", m) vec.append(v) test_name = "8b_all_fixed" vec = 
np.array(vec) with open(f"{root}/output/evaluation_results/" + test_name + ".json", "w") as f: lst = [list(boi) for boi in vec] json.dump(lst, f) - qt.plot_heatmap(vec[:, 10:15], test_name + '_foundation', label_set = 2, norm = "group") - qt.plot_heatmap(vec[:, 15:19], test_name + '_view',label_set = 3, norm = "group") - qt.plot_heatmap(vec[:, :10], test_name + '_morality', label_set = 1, norm = "group") + qt.plot_heatmap(vec[:, 10:15], test_name + "_foundation", label_set=2, norm="group") + qt.plot_heatmap(vec[:, 15:19], test_name + "_view", label_set=3, norm="group") + qt.plot_heatmap(vec[:, :10], test_name + "_morality", label_set=1, norm="group") diff --git a/src/evaluation/utils.py b/src/evaluation/utils.py index a9ced9f..030c887 100644 --- a/src/evaluation/utils.py +++ b/src/evaluation/utils.py @@ -20,11 +20,22 @@ def regenerate_inputs(logprobs=False) -> Data: os.remove(input_file) generate_alpaca( - "mc", os.path.join(root, "src", "evaluation", "raw_dataset", "moralchoice"), rearrange = True, logprobs=logprobs + "mc", + os.path.join(root, "src", "evaluation", "raw_dataset", "moralchoice"), + rearrange=True, + logprobs=logprobs, ) - generate_alpaca("views", os.path.join(root, "src", "evaluation", "raw_dataset", "views"), rearrange = True, logprobs=logprobs) generate_alpaca( - "foundation", os.path.join(root, "src", "evaluation", "raw_dataset", "foundation"), rearrange = True, logprobs=logprobs + "views", + os.path.join(root, "src", "evaluation", "raw_dataset", "views"), + rearrange=True, + logprobs=logprobs, + ) + generate_alpaca( + "foundation", + os.path.join(root, "src", "evaluation", "raw_dataset", "foundation"), + rearrange=True, + logprobs=logprobs, ) result = Data( @@ -107,14 +118,21 @@ def inside(a, string_list, verbal=False): return True return False + def semantic_matching(item, mapping, four=False, verbal=False): answer = item["predict"] q_type = item["question_type"] s_id = item["scenario_id"] - - optionA, optionB = item["action" + str(1+ mapping.index(1))], item["action" + str(1+ mapping.index(2))] + + optionA, optionB = ( + item["action" + str(1 + mapping.index(1))], + item["action" + str(1 + mapping.index(2))], + ) if four: - optionC, optionD = item["action" + str(1+ mapping.index(3))], item["action" + str(1+ mapping.index(4))] + optionC, optionD = ( + item["action" + str(1 + mapping.index(3))], + item["action" + str(1 + mapping.index(4))], + ) response_template = os.path.join( root, "src", "evaluation", "assets", "data", "response_templates" @@ -216,8 +234,6 @@ def semantic_matching(item, mapping, four=False, verbal=False): return "action3" if answer.startswith("d"): return "action4" - - if inside(answer, answers_action1, verbal): return "action1" @@ -241,7 +257,7 @@ def semantic_matching(item, mapping, four=False, verbal=False): if answer in answers_action4: return "action4" """ - + """ logging invalid """ @@ -252,6 +268,7 @@ def semantic_matching(item, mapping, four=False, verbal=False): ) return "invalid" + def _collect(output_data): output = {} invalid = 0 @@ -266,7 +283,7 @@ def _collect(output_data): if q_type.startswith("4c") or q_type.startswith("repeat2"): output[s_id] = { "4c_fav": [0, 0, 0, 0, 0], - "repeat2_fav": [0, 0, 0, 0, 0] + "repeat2_fav": [0, 0, 0, 0, 0], } else: output[s_id] = { @@ -280,16 +297,15 @@ def _collect(output_data): output[s_id][q_type][-1] += np.exp(x) else: invalid += 1 - + print(f"{invalid} out of {len(output_data)} entries are invalid") return output - def __collect(output_data): """ Sub-function called by 'collect' to perform 
logprob-based answer collection. - + need middle: {'V_XXX':{'ab':np.array(n,2), 'compare':...}, 'V_XXX':...} output: {'ab':[a, b, inv., total], 'compare':...} """ @@ -306,7 +322,7 @@ def __collect(output_data): if q_type.startswith("4c") or q_type.startswith("repeat2"): base_logprobs[s_id] = { "4c_fav": np.zeros(24), - "repeat2_fav": np.zeros(24) + "repeat2_fav": np.zeros(24), } else: base_logprobs[s_id] = { @@ -324,13 +340,19 @@ def __collect(output_data): if predict_id == -1: continue prob = np.exp(entry["logprob"] - base_logprobs[s_id][q_type][mapping_id]) - print("prob", prob, s_id, entry["logprob"], base_logprobs[s_id][q_type][mapping_id]) + print( + "prob", + prob, + s_id, + entry["logprob"], + base_logprobs[s_id][q_type][mapping_id], + ) assert prob < 1 if not s_id in middle.keys(): if q_type.startswith("4c") or q_type.startswith("repeat2"): middle[s_id] = { "4c_fav": np.zeros((24, 2)), - "repeat2_fav": np.zeros((24, 2)) + "repeat2_fav": np.zeros((24, 2)), } middle[s_id]["4c_fav"][:, -1] = -1000 middle[s_id]["repeat2_fav"][:, -1] = -1000 @@ -347,51 +369,76 @@ def __collect(output_data): if prob > old_prob: middle[s_id][q_type][mapping_id][0] = predict_id middle[s_id][q_type][mapping_id][1] = prob - with open(f'{root}/output/evaluation_results/middle.json', 'a+') as f: + with open(f"{root}/output/evaluation_results/middle.json", "a+") as f: record = copy.deepcopy(middle) for x in record.keys(): for y in record[x].keys(): record[x][y] = record[x][y].tolist() - f.write('\n********\n') + f.write("\n********\n") json.dump(record, f) output = {} for k in middle.keys(): assert k not in output.keys() if "4c_fav" in middle[k].keys(): - #print(middle[k]["4c_fav"]) + # print(middle[k]["4c_fav"]) output[k] = { - "4c_fav": [0, 0, 0, 0, 0, 0], - "repeat2_fav": [0, 0, 0, 0, 0, 0], + "4c_fav": [0, 0, 0, 0, 0, 0], + "repeat2_fav": [0, 0, 0, 0, 0, 0], } - output[k]["4c_fav"] = [np.sum(middle[k]["4c_fav"][:, 0] == 1), np.sum(middle[k]["4c_fav"][:, 0] == 2), - np.sum(middle[k]["4c_fav"][:, 0] == 3), np.sum(middle[k]["4c_fav"][:, 0] == 4), 0, 24] - output[k]["repeat2_fav"] = [np.sum(middle[k]["repeat2_fav"][:, 0] == 1), np.sum(middle[k]["repeat2_fav"][:, 0] == 2), - np.sum(middle[k]["repeat2_fav"][:, 0] == 3), np.sum(middle[k]["repeat2_fav"][:, 0] == 4), 0, 24] + output[k]["4c_fav"] = [ + np.sum(middle[k]["4c_fav"][:, 0] == 1), + np.sum(middle[k]["4c_fav"][:, 0] == 2), + np.sum(middle[k]["4c_fav"][:, 0] == 3), + np.sum(middle[k]["4c_fav"][:, 0] == 4), + 0, + 24, + ] + output[k]["repeat2_fav"] = [ + np.sum(middle[k]["repeat2_fav"][:, 0] == 1), + np.sum(middle[k]["repeat2_fav"][:, 0] == 2), + np.sum(middle[k]["repeat2_fav"][:, 0] == 3), + np.sum(middle[k]["repeat2_fav"][:, 0] == 4), + 0, + 24, + ] for x in ["4c_fav", "repeat2_fav"]: output[k][x][-1] = sum(output[k][x][:4]) output[k]["4c_fav"] = [int(x) for x in output[k]["4c_fav"]] - output[k]["repeat2_fav"] = [int(x) for x in output[k]["repeat2_fav"]] + output[k]["repeat2_fav"] = [int(x) for x in output[k]["repeat2_fav"]] else: - #print(middle[k]["ab"]) + # print(middle[k]["ab"]) output[k] = { - "ab": [0, 0, 0, 0], - "compare": [0, 0, 0, 0], - "repeat": [0, 0, 0, 0], + "ab": [0, 0, 0, 0], + "compare": [0, 0, 0, 0], + "repeat": [0, 0, 0, 0], } - output[k]["ab"] = [int(np.sum(middle[k]["ab"][:, 0] == 1)), int(np.sum(middle[k]["ab"][:, 0] == 2)), - int(np.sum(middle[k]["ab"][:, 0] == 0)), 2] - output[k]["compare"] = [int(np.sum(middle[k]["compare"][:, 0] == 1)), int(np.sum(middle[k]["compare"][:, 0] == 2)), - int(np.sum(middle[k]["compare"][:, 0] == 0)), 2] 
- output[k]["repeat"] = [int(np.sum(middle[k]["repeat"][:, 0] == 1)), int(np.sum(middle[k]["repeat"][:, 0] == 2)), - int(np.sum(middle[k]["repeat"][:, 0] == 0)), 2] - ''' + output[k]["ab"] = [ + int(np.sum(middle[k]["ab"][:, 0] == 1)), + int(np.sum(middle[k]["ab"][:, 0] == 2)), + int(np.sum(middle[k]["ab"][:, 0] == 0)), + 2, + ] + output[k]["compare"] = [ + int(np.sum(middle[k]["compare"][:, 0] == 1)), + int(np.sum(middle[k]["compare"][:, 0] == 2)), + int(np.sum(middle[k]["compare"][:, 0] == 0)), + 2, + ] + output[k]["repeat"] = [ + int(np.sum(middle[k]["repeat"][:, 0] == 1)), + int(np.sum(middle[k]["repeat"][:, 0] == 2)), + int(np.sum(middle[k]["repeat"][:, 0] == 0)), + 2, + ] + """ if sum(output[k]["ab"][:3]) != output[k]["ab"][-1]: print(middle[k]["ab"]) - ''' + """ for x in ["ab", "compare", "repeat"]: output[k][x][-1] = int(output[k][x][0] + output[k][x][1]) return output + def collect(output_data, logprobs): """ input: alpaca styled raw query output @@ -440,7 +487,8 @@ def collect(output_data, logprobs): return output -def generate_alpaca(source: str, dir: str, rearrange = True, logprobs = False): + +def generate_alpaca(source: str, dir: str, rearrange=True, logprobs=False): GPTtemplate = """ Given as input a question and a set of answers, determine whether ... (1). The question and the context proposed in the question is TIMELESS, @@ -479,13 +527,15 @@ def generate_alpaca(source: str, dir: str, rearrange = True, logprobs = False): output_list_dic = [] if not rearrange: - mappings = [(1,2)] + mappings = [(1, 2)] else: - mappings = [(1,2), (2,1)] - + mappings = [(1, 2), (2, 1)] + for key, boi in context_and_action.items(): for mapping in mappings: - rearranged_actions = [boi["action" + str(mapping[x])] for x in range(len(mapping))] + rearranged_actions = [ + boi["action" + str(mapping[x])] for x in range(len(mapping)) + ] boi_ab = { "scenario_id": boi["scenario_id"], "question_type": "ab", @@ -493,7 +543,7 @@ def generate_alpaca(source: str, dir: str, rearrange = True, logprobs = False): "action1": rearranged_actions[0], "action2": rearranged_actions[1], "mapping": mapping, - "mapping_id": mappings.index(mapping) + "mapping_id": mappings.index(mapping), } boi_compare = { "scenario_id": boi["scenario_id"], @@ -502,7 +552,7 @@ def generate_alpaca(source: str, dir: str, rearrange = True, logprobs = False): "action1": rearranged_actions[0], "action2": rearranged_actions[1], "mapping": mapping, - "mapping_id": mappings.index(mapping) + "mapping_id": mappings.index(mapping), } boi_repeat = { "scenario_id": boi["scenario_id"], @@ -511,36 +561,46 @@ def generate_alpaca(source: str, dir: str, rearrange = True, logprobs = False): "action1": rearranged_actions[0], "action2": rearranged_actions[1], "mapping": mapping, - "mapping_id": mappings.index(mapping) + "mapping_id": mappings.index(mapping), } boi_ab["instruction"] = instruction[0].strip() boi_ab["input"] = ( question[0] - .format(boi["context"], rearranged_actions[0], rearranged_actions[1]) + .format( + boi["context"], rearranged_actions[0], rearranged_actions[1] + ) .strip() ) boi_compare["instruction"] = instruction[1].strip() boi_compare["input"] = ( question[1] - .format(boi["context"], rearranged_actions[0], rearranged_actions[1]) + .format( + boi["context"], rearranged_actions[0], rearranged_actions[1] + ) .strip() ) boi_repeat["instruction"] = instruction[2].strip() boi_repeat["input"] = ( question[2] - .format(boi["context"], rearranged_actions[0], rearranged_actions[1]) + .format( + boi["context"], rearranged_actions[0], 
rearranged_actions[1] + ) .strip() ) - mapping_action = [('[A]', rearranged_actions[0], 'yes'), ('[B]', rearranged_actions[1], 'no')] + mapping_action = [ + ("[A]", rearranged_actions[0], "yes"), + ("[B]", rearranged_actions[1], "no"), + ] if logprobs: - boi_ab["predict"] = [mapping_action[i-1][0] for i in mapping] - boi_repeat["predict"] = [mapping_action[i-1][1] for i in mapping] - boi_compare["predict"] = [mapping_action[i-1][2] for i in mapping] + boi_ab["predict"] = [mapping_action[i - 1][0] for i in mapping] + boi_repeat["predict"] = [mapping_action[i - 1][1] for i in mapping] + boi_compare["predict"] = [mapping_action[i - 1][2] for i in mapping] cut += 1 output_list_dic.extend([boi_ab, boi_compare, boi_repeat]) try: with open( - os.path.join(root, "src", "evaluation", "assets", "input_alpaca.json"), "r" + os.path.join(root, "src", "evaluation", "assets", "input_alpaca.json"), + "r", ) as f: temp = json.load(f) except: @@ -551,7 +611,7 @@ def generate_alpaca(source: str, dir: str, rearrange = True, logprobs = False): os.path.join(root, "src", "evaluation", "assets", "input_alpaca.json"), "w" ) as f: json.dump(temp, f) - + elif source == "views": """ abcd (one fav. and one worst), repeat, each 'repeat' times. @@ -587,7 +647,9 @@ def generate_alpaca(source: str, dir: str, rearrange = True, logprobs = False): for key, boi in context_and_action.items(): for mapping in mappings: - rearranged_actions = [boi["action" + str(mapping[x])] for x in range(len(mapping))] + rearranged_actions = [ + boi["action" + str(mapping[x])] for x in range(len(mapping)) + ] boi_ab_f = { "scenario_id": boi["scenario_id"], "question_type": "4c_fav", @@ -597,7 +659,7 @@ def generate_alpaca(source: str, dir: str, rearrange = True, logprobs = False): "action3": rearranged_actions[2], "action4": rearranged_actions[3], "mapping": mapping, - "mapping_id": mappings.index(mapping) + "mapping_id": mappings.index(mapping), } boi_rp_f = { "scenario_id": boi["scenario_id"], @@ -608,7 +670,7 @@ def generate_alpaca(source: str, dir: str, rearrange = True, logprobs = False): "action3": rearranged_actions[2], "action4": rearranged_actions[3], "mapping": mapping, - "mapping_id": mappings.index(mapping) + "mapping_id": mappings.index(mapping), } boi_ab_f["instruction"] = instruction[0].strip() boi_ab_f["input"] = ( @@ -635,12 +697,16 @@ def generate_alpaca(source: str, dir: str, rearrange = True, logprobs = False): .strip() ) cut += 1 - mapping_action = [('[A]', rearranged_actions[0]), ('[B]', rearranged_actions[1]), ('[C]', rearranged_actions[2]), ('[D]', rearranged_actions[3])] + mapping_action = [ + ("[A]", rearranged_actions[0]), + ("[B]", rearranged_actions[1]), + ("[C]", rearranged_actions[2]), + ("[D]", rearranged_actions[3]), + ] if logprobs: - boi_ab_f["predict"] = [mapping_action[i-1][0] for i in mapping] - boi_rp_f["predict"] = [mapping_action[i-1][1] for i in mapping] + boi_ab_f["predict"] = [mapping_action[i - 1][0] for i in mapping] + boi_rp_f["predict"] = [mapping_action[i - 1][1] for i in mapping] output_list_dic.extend([boi_ab_f, boi_rp_f]) - with open( os.path.join(root, "src", "evaluation", "assets", "input_alpaca.json"), "r" @@ -652,7 +718,6 @@ def generate_alpaca(source: str, dir: str, rearrange = True, logprobs = False): os.path.join(root, "src", "evaluation", "assets", "input_alpaca.json"), "w" ) as f: json.dump(temp, f) - def get_dim(key, dict_list): diff --git a/src/gutenberg/get_meta.py b/src/gutenberg/get_meta.py index 714eec8..923629e 100644 --- a/src/gutenberg/get_meta.py +++ 
b/src/gutenberg/get_meta.py @@ -62,7 +62,12 @@ def gather_meta(raw_dir, record): """ with open( os.path.join( - root, "dataset", "raw_downloads", "Gutenberg", "metadata", "metadata.csv" + root, + "dataset", + "raw_downloads", + "Gutenberg", + "metadata", + "metadata.csv", ) ) as file: reader = csv.reader(file) diff --git a/src/path.py b/src/path.py index 617cf06..9bd794f 100644 --- a/src/path.py +++ b/src/path.py @@ -1,4 +1,9 @@ import os, sys -root = "src".join(os.path.dirname(os.path.abspath(__file__)).split("src")[:-1]).rstrip("/").rstrip("\\") -if eval(os.environ.get('LOUD_BACKEND', 'False')): - print(f"Library root directory: {root}") \ No newline at end of file + +root = ( + "src".join(os.path.dirname(os.path.abspath(__file__)).split("src")[:-1]) + .rstrip("/") + .rstrip("\\") +) +if eval(os.environ.get("LOUD_BACKEND", "False")): + print(f"Library root directory: {root}") diff --git a/src/utils/data_utils/__init__.py b/src/utils/data_utils/__init__.py index df8e89b..1fbab14 100644 --- a/src/utils/data_utils/__init__.py +++ b/src/utils/data_utils/__init__.py @@ -1,2 +1,2 @@ from .extrapolation_utils import * -from .rw_utils import * \ No newline at end of file +from .rw_utils import * diff --git a/src/utils/data_utils/extrapolation_utils.py b/src/utils/data_utils/extrapolation_utils.py index bb7e8c8..a18deda 100644 --- a/src/utils/data_utils/extrapolation_utils.py +++ b/src/utils/data_utils/extrapolation_utils.py @@ -1,5 +1,6 @@ from copy import deepcopy from typing import Iterable, Tuple, Dict, List, Literal, Union + # from src.abstractions import Data import json import warnings diff --git a/src/utils/data_utils/rw_utils.py b/src/utils/data_utils/rw_utils.py index 425d2b1..1e3375e 100644 --- a/src/utils/data_utils/rw_utils.py +++ b/src/utils/data_utils/rw_utils.py @@ -40,7 +40,7 @@ def transformation(dic: Dict) -> Dict: print( f"Truncated dataset {data.data_name} from size {original_size} to {len(list(truncated_data.all_passages()))}." ) - + return truncated_data diff --git a/src/utils/gpt_utils.py b/src/utils/gpt_utils.py index ac24df7..9388ea9 100644 --- a/src/utils/gpt_utils.py +++ b/src/utils/gpt_utils.py @@ -12,7 +12,9 @@ if not os.path.exists(f"{root}/logs/eval"): os.makedirs(f"{root}/logs/eval") -with open(f"{root}/src/abstractions/configs/abstractions_config.json", "r") as config_file: +with open( + f"{root}/src/abstractions/configs/abstractions_config.json", "r" +) as config_file: abstractions_config = json.load(config_file) if "openai_mirror" in abstractions_config: mirror_url = abstractions_config["openai_mirror"]
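[Editor's illustration, not part of the patch] The reformatted expression in `src/path.py` above derives the library root by taking the directory containing `path.py`, dropping everything after the last `src` segment, and stripping trailing separators. A minimal sketch of the same computation, run on a hypothetical absolute path instead of `__file__` (the helper name and example path are illustrative only):

    import os

    def root_before_last_src(module_path: str) -> str:
        # Same steps as src/path.py: take the module's directory, drop the part
        # after the last "src" occurrence, rejoin, and strip trailing separators.
        here = os.path.dirname(os.path.abspath(module_path))
        return "src".join(here.split("src")[:-1]).rstrip("/").rstrip("\\")

    # Hypothetical layout: /home/user/project/src/path.py -> /home/user/project
    print(root_before_last_src("/home/user/project/src/path.py"))

Because the join uses `"src"` itself as the separator, earlier `src` occurrences in the path are preserved; only the final `src/...` tail is removed.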