From b3e5afca099cb38af6ffeb770a0a5fc341551c45 Mon Sep 17 00:00:00 2001
From: Souradip Pal <souradip.iitg@gmail.com>
Date: Sun, 29 Sep 2024 01:33:23 -0500
Subject: [PATCH 01/30] Created sample application.

---
 main.py                               | 261 ++++++++++++++++++++++++++
 methods/language_modeling.py          |  12 +-
 methods/process_of_elimination.py     |  10 +-
 methods/process_of_elimination_vqa.py |   8 +-
 methods/utils/data.py                 |  80 ++++++--
 methods/utils/methods.py              |  16 +-
 methods/utils/utils.py                |  11 +-
 methods/vision_language_modeling.py   |  12 +-
 8 files changed, 360 insertions(+), 50 deletions(-)
 create mode 100644 main.py

diff --git a/main.py b/main.py
new file mode 100644
index 0000000..20f3ddf
--- /dev/null
+++ b/main.py
@@ -0,0 +1,261 @@
+# a framework for inference on multiple choice tasks.
+from argparse import Namespace
+import copy
+import logging
+import os
+import subprocess
+
+import questionary
+import numpy as np
+import torch
+import torch.nn.functional as F
+from torch.utils.data import DataLoader
+
+from methods.utils.data import(
+    create_multiple_choice_prompt,
+    preprocess_function_seq2seq_vqa,
+    preprocess_function_seq2seq_vqa_channel,
+    preprocess_function_causal_vqa,
+    preprocess_function_causal_vqa_channel
+)
+from methods.utils.methods import(
+    compute_conditional_score_seq2seq_vqa,
+    compute_conditional_score_causal_vqa,
+    compute_mask_process_of_elimination,
+    inference_process_of_elimination,
+    inference_language_modeling,
+    inference_calibration
+)
+from methods.utils.utils import(
+    load_data,
+    load_model,
+    set_seed
+)
+
+all_checkpoints = {
+    "BLIP2": ["Salesforce/blip2-opt-2.7b", "Salesforce/blip2-flan-t5-xl"],
+    "InstructBLIP": ["Salesforce/instructblip-vicuna-7b"],
+    "GIT": ["microsoft/git-base-vqav2", "microsoft/git-base-textvqa"],
+    "PaliGemma": ["google/paligemma-3b-ft-science-qa-448", "google/paligemma-3b-ft-vqav2-448", "google/paligemma-3b-ft-ai2d-448"],
+    "Idefics2": ["HuggingFaceM4/idefics2-8b"]
+}
+
+logger = logging.getLogger(__name__)
+
+def main():
+
+    # step 1: collect arguments
+    args = Namespace()
+    args.seed = 0
+    
+    args.model_family = questionary.select(
+        message="Select model family?",
+        choices=["BLIP2", "InstructBLIP","GIT","PaliGemma","Idefics2"],
+        default="GIT").ask()
+    
+    checkpoints_choices = all_checkpoints[args.model_family]
+    args.checkpoint = questionary.select(
+        message="Select model checkpoint?",
+        choices=checkpoints_choices,
+        default=checkpoints_choices[0]).ask()
+    
+    args.loading_precision = questionary.select(
+        message="Select model checkpoint?",
+        choices=["FP32", "FP16", "BF16", "INT8"],
+        default="FP32").ask()
+    
+    args.output_dir = questionary.path(
+        message='Output Directory?',
+        only_directories=True,
+        default=f"/content/model/").ask()
+
+    args.dataset="single_inference"
+    args.batch_size=1
+    args.sample=1
+    args.n_shot=0
+
+    args.multiple_choice_prompt=""
+    args.calibration_prompt=" the answer is:"
+    args.process_of_elimination_prompt="Select the most suitable option to answer the question. Ignore [MASK] options."
+
+    args.scoring_method_for_process_of_elimination = questionary.select(
+        message="Select scoring method?",
+        choices=["channel","calibration","language_modeling","multiple_choice_prompt"],
+        default="language_modeling").ask()
+    
+    args.mask_strategy_for_process_of_elimination=questionary.select(
+        message="Select mask strategy?",
+        choices=["below_average","lowest"],
+        default="below_average").ask()
+    
+    args.prompting_method_for_process_of_elimination = "multiple_choice_prompt"
+    args.mask_token = None
+    
+    args.question = questionary.text("Question:").ask()
+    args.choices = questionary.text("Choices [comma seprated]:").ask()
+    args.choices = args.choices.split(',')
+    args.num_options = len(args.choices)
+    args.image_path = questionary.path("Image Path?").ask()
+    args.label = questionary.select(
+        message="Answer:",
+        choices=[str(x) for x in range(args.num_options)]).ask()
+    args.label = int(args.label)
+    args.method = "process_of_elimination"
+
+    # print(args)
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO,
+    )
+    logger.setLevel(logging.INFO)
+
+    # step 2: set random seed to ensure reproducibility.
+    logger.info(f"Set random seed to {args.seed}.")
+    set_seed(args.seed)
+
+    # step 3: download model
+    logger.info(f"Download {args.model_family} model: {args.checkpoint}.")
+    subprocess.call(f"python models/model_downloaders/model_downloaders.py \
+            --model_family {args.model_family} \
+            --checkpoint {args.checkpoint} \
+            --output_dir {args.output_dir}", shell=True)
+
+    # step 4: load model, tokenizer. Then move to gpu, and set to evaluation mode.
+    logger.info(f"Load {args.model_family} model: {args.checkpoint}.")
+    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+    # get model path: ../models/args.model_family/args.checkpoint
+    model_path = os.path.join(args.output_dir, args.model_family, args.checkpoint)
+    model, tokenizer = load_model(device, model_path, args)
+    if args.model_family in ["BLIP2", "InstructBLIP", "PaliGemma", "Idefics2"]:
+        compute_func = compute_conditional_score_seq2seq_vqa
+        preprocess_func = preprocess_function_seq2seq_vqa
+        preprocess_func_channel = preprocess_function_seq2seq_vqa_channel
+        remove_columns = ['header_input_ids', 
+                        'header_attention_mask', 
+                        'ending_input_ids', 
+                        'ending_attention_mask',
+                        'images',]
+        processor = tokenizer
+        tokenizer = processor.tokenizer
+    elif args.model_family in ["GIT"]:
+        compute_func = compute_conditional_score_causal_vqa
+        preprocess_func = preprocess_function_causal_vqa
+        preprocess_func_channel = preprocess_function_causal_vqa_channel
+        remove_columns = [
+            'input_ids',
+            'labels',
+            'images',
+            'ending_attention_mask'
+        ]
+        processor = tokenizer
+        tokenizer = processor.tokenizer
+    else:
+        raise NotImplementedError
+
+    # step 5: load and preprocess data.
+    logger.info(f"Load data: {args.dataset}.")
+    
+    # evaluate on dataset
+    multiple_choice_prompt = args.multiple_choice_prompt
+    # multiple_choice_prompt = args.multiple_choice_prompt
+    args.multiple_choice_prompt = None
+    ending_names, header_name, image_header_name, raw_dataset, n_shot_dataset = load_data(args) 
+    
+    mcp_args = copy.deepcopy(args)
+    mcp_args.multiple_choice_prompt = multiple_choice_prompt
+    _, _, _, raw_mcp_dataset, n_shot_mcp_dataset = load_data(mcp_args) 
+    
+    logger.info(f"Preprocess data: {args.dataset}.")
+    fn_kwargs = {
+        "ending_names": ending_names, 
+        "header_name": header_name, 
+        "tokenizer": tokenizer,
+        "processor": processor,
+        "image_header_name": image_header_name
+    }
+    num_of_options = len(ending_names)
+    tokenized_dataset = raw_dataset.map(preprocess_func, fn_kwargs=fn_kwargs, batched=True, batch_size=args.batch_size)
+    eval_dataloader = DataLoader(tokenized_dataset, batch_size=args.batch_size, shuffle=False)
+
+    # step 5: (evaluation) inference on data, and compute accuracy.
+    logger.info(f"Start inference (method: {args.method}) on {args.dataset} using {args.model_family} model: {args.checkpoint}.")
+    scoring_method = args.scoring_method_for_process_of_elimination
+    logger.info(f"Step 1: Computing masks. Scoring method: {scoring_method}.")
+    if scoring_method == "channel":
+        tokenized_channel_dataset = raw_dataset.map(preprocess_func_channel, fn_kwargs=fn_kwargs, batched=True, batch_size=args.batch_size)
+        eval_channel_dataloader = DataLoader(tokenized_channel_dataset, batch_size=args.batch_size, shuffle=False)
+        avg_log_probs, _, _, lm_predictions = inference_language_modeling(model, eval_channel_dataloader, device, compute_func, tokenizer.pad_token_id)
+    elif scoring_method == "calibration":
+        fn_kwargs = {"ending_names": ending_names, 
+                    "header_name": "uncond_premise", # the difference is here
+                    "tokenizer": tokenizer,
+                    "processor": processor,
+                    "image_header_name": image_header_name}
+        tokenized_calibration_dataset = raw_dataset.map(preprocess_func, fn_kwargs=fn_kwargs, batched=True, batch_size=args.batch_size)
+        eval_calibration_dataloader = DataLoader(tokenized_calibration_dataset, batch_size=args.batch_size, shuffle=False)    
+        avg_log_probs, _, _, lm_predictions = inference_calibration(model, eval_dataloader, eval_calibration_dataloader,device, compute_func, tokenizer.pad_token_id)
+    elif scoring_method == "language_modeling":
+        avg_log_probs, _, _, lm_predictions = inference_language_modeling(model, eval_dataloader, device, compute_func, tokenizer.pad_token_id)
+    elif scoring_method == "multiple_choice_prompt":
+        # mcp_args = copy.deepcopy(args)
+        # mcp_args.multiple_choice_prompt = multiple_choice_prompt
+        # _, _, raw_mcp_dataset, n_shot_mcp_dataset = load_data(mcp_args)
+        # raw_mcp_dataset, n_shot_mcp_dataset = create_n_shot_splits(raw_mcp_dataset, n_shot_mcp_dataset, args)    
+        tokenized_dataset = raw_mcp_dataset.map(preprocess_func, fn_kwargs=fn_kwargs, batched=True, batch_size=args.batch_size)
+        eval_mcp_dataloader = DataLoader(tokenized_dataset, batch_size=args.batch_size, shuffle=False)
+        avg_log_probs, _, lm_predictions = inference_language_modeling(model, eval_mcp_dataloader, device, compute_func, tokenizer.pad_token_id)
+    else:
+        raise NotImplementedError # unlikely to happen.
+    
+    mask_strategy = args.mask_strategy_for_process_of_elimination
+    if mask_strategy == "min_k":
+        # masking the most k UNLIKELY options
+        min_k = args.min_k 
+        if min_k >= num_of_options:
+            min_k = num_of_options - 1
+        mask_kwargs = {"min_k": min_k,}
+    else:
+        mask_kwargs = {}
+    masks = compute_mask_process_of_elimination(avg_log_probs, mask_strategy, **mask_kwargs)
+    # construct an oracle mask that only keeps the correct lable to 1, and other options to 0
+    # oracle_masks = torch.zeros_like(avg_log_probs)
+    # oracle_masks[torch.arange(oracle_masks.size(0)), tokenized_dataset["label"]] = 1
+    masks = masks.to(torch.float32)
+    # compute mask accuracy, i.e., check whether mask that correspond to labels is 1
+    mask_result = masks[torch.arange(masks.size(0)), tokenized_dataset["label"]]
+    mask_accuracy = torch.sum(mask_result) / mask_result.size(0)
+    logger.info(f"Mask accuracy: {mask_accuracy}")
+    args.mask_accuracy = mask_accuracy.item()
+    masked_dataset = tokenized_dataset.map(lambda example, idx: {"mask": masks[idx]}, 
+                                with_indices=True, 
+                                batched=True,
+                                remove_columns=remove_columns)
+    
+    prompting_method = args.prompting_method_for_process_of_elimination
+    logger.info(f"Step 2: Creating multiple choice prompt. Prompting method: {prompting_method}.")
+    # if args.prompting_method_for_process_of_elimination
+    # mcp_kwargs = {"multiple_choice_prompt": multiple_choice_prompt,}
+    mask_token = args.mask_token
+    if mask_token is not None:
+        if mask_token == "":
+            args.process_of_elimination_prompt = args.process_of_elimination_prompt.replace("[MASK]", "empty")   
+        else:
+            args.process_of_elimination_prompt = args.process_of_elimination_prompt.replace("[MASK]", mask_token)   
+    mcp_kwargs = {
+        "multiple_choice_prompt": args.process_of_elimination_prompt,
+        "scoring_method": scoring_method,
+        "num_of_options": num_of_options,
+        "mask_token": mask_token,
+    }
+    mcp_dataset = masked_dataset.map(create_multiple_choice_prompt, fn_kwargs=mcp_kwargs)
+
+    logger.info(f"Step 3: Final Inference")
+    mcp_dataset = mcp_dataset.map(preprocess_func, fn_kwargs=fn_kwargs, batched=True, batch_size=args.batch_size)
+    eval_mcp_dataloader = DataLoader(mcp_dataset, batch_size=args.batch_size, shuffle=False)
+    poe_avg_log_probs,  lm_accuracy, _, lm_predictions = inference_process_of_elimination(model, eval_mcp_dataloader, device, compute_func, tokenizer.pad_token_id)
+    option = int(lm_predictions.numpy()[0])
+    logger.info(f"Answer: {option}")
+
+if __name__ == "__main__":
+    main()
diff --git a/methods/language_modeling.py b/methods/language_modeling.py
index 25b9922..7631fea 100644
--- a/methods/language_modeling.py
+++ b/methods/language_modeling.py
@@ -110,7 +110,7 @@ def main():
         # step 5: (evaluation) inference on data, and compute accuracy.
         logger.info(f"Start inference (method: {args.method}) on {args.dataset} using {args.model_family} model: {args.checkpoint}.")
         if args.method in ["language_modeling", "multiple_choice_prompt"]:
-            _, lm_accuracy, avg_lm_accuracy = inference_language_modeling(model, eval_dataloader, device, compute_func, tokenizer.pad_token_id)
+            _, lm_accuracy, avg_lm_accuracy, _ = inference_language_modeling(model, eval_dataloader, device, compute_func, tokenizer.pad_token_id)
         elif args.method == "contrastive_decoding":
             logger.info(f"Load {args.model_family} amateur model: {args.amateur_checkpoint}.")
             # get model path: ../models/args.model_family/args.checkpoint
@@ -118,8 +118,8 @@ def main():
             amateur_model, _ = load_model(device, amateur_model_path, args)
             # we want to integrate contrastive decoding with other methods, so we need separate output from each model.
             # compute log probs on each model        
-            exp_avg_log_probs, exp_lm_accuracy, exp_avg_lm_accuracy = inference_language_modeling(model, eval_dataloader, device, compute_func, tokenizer.pad_token_id)
-            ama_avg_log_probs, ama_lm_accuracy, ama_avg_lm_accuracy = inference_language_modeling(amateur_model, eval_dataloader, device, compute_func, tokenizer.pad_token_id)
+            exp_avg_log_probs, exp_lm_accuracy, exp_avg_lm_accuracy, _ = inference_language_modeling(model, eval_dataloader, device, compute_func, tokenizer.pad_token_id)
+            ama_avg_log_probs, ama_lm_accuracy, ama_avg_lm_accuracy, _ = inference_language_modeling(amateur_model, eval_dataloader, device, compute_func, tokenizer.pad_token_id)
             # calculate difference, and may introduce extra parameters.
             avg_log_probs = exp_avg_log_probs - ama_avg_log_probs
             labels = raw_dataset['label']
@@ -134,12 +134,12 @@ def main():
                         "tokenizer": tokenizer,}
             tokenized_calibration_dataset = raw_dataset.map(preprocess_func, fn_kwargs=fn_kwargs, batched=True, batch_size=args.batch_size)
             eval_calibration_dataloader = DataLoader(tokenized_calibration_dataset, batch_size=args.batch_size, shuffle=False)    
-            _, lm_accuracy, avg_lm_accuracy = inference_calibration(model, eval_dataloader, eval_calibration_dataloader,device, compute_func, tokenizer.pad_token_id)
+            _, lm_accuracy, avg_lm_accuracy, _ = inference_calibration(model, eval_dataloader, eval_calibration_dataloader,device, compute_func, tokenizer.pad_token_id)
         elif args.method == "channel":
             # simple solution: swap first sentence and second sentence in both preprocessing functions
             tokenized_channel_dataset = raw_dataset.map(preprocess_func_channel, fn_kwargs=fn_kwargs, batched=True, batch_size=args.batch_size)
             eval_channel_dataloader = DataLoader(tokenized_channel_dataset, batch_size=args.batch_size, shuffle=False)
-            _, lm_accuracy, avg_lm_accuracy = inference_language_modeling(model, eval_channel_dataloader, device, compute_func, tokenizer.pad_token_id)
+            _, lm_accuracy, avg_lm_accuracy, _ = inference_language_modeling(model, eval_channel_dataloader, device, compute_func, tokenizer.pad_token_id)
         elif args.method == "generate_synonyms":
             # 3 stpes: generate synonyms, then map datasets, then inference.
             logger.info(f"Generate synonyms for {args.dataset}.")
@@ -158,7 +158,7 @@ def main():
                         "tokenizer": tokenizer,}
             tokenized_synonyms_dataset = synonyms_dataset.map(preprocess_func, fn_kwargs=fn_kwargs, batched=True, batch_size=args.batch_size)
             eval_synonyms_dataloader = DataLoader(tokenized_synonyms_dataset, batch_size=args.batch_size, shuffle=False)
-            _, lm_accuracy, avg_lm_accuracy = inference_generate_synonyms(model, eval_synonyms_dataloader, device, compute_func, tokenizer.pad_token_id, num_of_options, args.number_of_synonyms)
+            _, lm_accuracy, avg_lm_accuracy, _ = inference_generate_synonyms(model, eval_synonyms_dataloader, device, compute_func, tokenizer.pad_token_id, num_of_options, args.number_of_synonyms)
         else:
             raise NotImplementedError
 
diff --git a/methods/process_of_elimination.py b/methods/process_of_elimination.py
index 87f9293..97db2b5 100644
--- a/methods/process_of_elimination.py
+++ b/methods/process_of_elimination.py
@@ -124,16 +124,16 @@ def main():
         if scoring_method == "channel":
             tokenized_channel_dataset = raw_dataset.map(preprocess_func_channel, fn_kwargs=fn_kwargs, batched=True, batch_size=args.batch_size)
             eval_channel_dataloader = DataLoader(tokenized_channel_dataset, batch_size=args.batch_size, shuffle=False)
-            avg_log_probs, _, _ = inference_language_modeling(model, eval_channel_dataloader, device, compute_func, tokenizer.pad_token_id)
+            avg_log_probs, _, _, _ = inference_language_modeling(model, eval_channel_dataloader, device, compute_func, tokenizer.pad_token_id)
         elif scoring_method == "calibration":
             fn_kwargs = {"ending_names": ending_names, 
                         "header_name": "uncond_premise", # the difference is here
                         "tokenizer": tokenizer,}
             tokenized_calibration_dataset = raw_dataset.map(preprocess_func, fn_kwargs=fn_kwargs, batched=True, batch_size=args.batch_size)
             eval_calibration_dataloader = DataLoader(tokenized_calibration_dataset, batch_size=args.batch_size, shuffle=False)    
-            avg_log_probs, _, _ = inference_calibration(model, eval_dataloader, eval_calibration_dataloader,device, compute_func, tokenizer.pad_token_id)
+            avg_log_probs, _, _, _ = inference_calibration(model, eval_dataloader, eval_calibration_dataloader,device, compute_func, tokenizer.pad_token_id)
         elif scoring_method == "language_modeling":
-            avg_log_probs, _, _ = inference_language_modeling(model, eval_dataloader, device, compute_func, tokenizer.pad_token_id)
+            avg_log_probs, _, _, _ = inference_language_modeling(model, eval_dataloader, device, compute_func, tokenizer.pad_token_id)
         elif scoring_method == "multiple_choice_prompt":
             # mcp_args = copy.deepcopy(args)
             # mcp_args.multiple_choice_prompt = multiple_choice_prompt
@@ -141,7 +141,7 @@ def main():
             # raw_mcp_dataset, n_shot_mcp_dataset = create_n_shot_splits(raw_mcp_dataset, n_shot_mcp_dataset, args)    
             tokenized_dataset = raw_mcp_dataset.map(preprocess_func, fn_kwargs=fn_kwargs, batched=True, batch_size=args.batch_size)
             eval_mcp_dataloader = DataLoader(tokenized_dataset, batch_size=args.batch_size, shuffle=False)
-            avg_log_probs, _, _ = inference_language_modeling(model, eval_mcp_dataloader, device, compute_func, tokenizer.pad_token_id)
+            avg_log_probs, _, _, _ = inference_language_modeling(model, eval_mcp_dataloader, device, compute_func, tokenizer.pad_token_id)
         else:
             raise NotImplementedError # unlikely to happen.
         
@@ -195,7 +195,7 @@ def main():
         logger.info(f"Step 3: Final Inference")
         mcp_dataset = mcp_dataset.map(preprocess_func, fn_kwargs=fn_kwargs, batched=True, batch_size=args.batch_size)
         eval_mcp_dataloader = DataLoader(mcp_dataset, batch_size=args.batch_size, shuffle=False)
-        poe_avg_log_probs,  lm_accuracy, _ = inference_process_of_elimination(model, eval_mcp_dataloader, device, compute_func, tokenizer.pad_token_id)
+        poe_avg_log_probs,  lm_accuracy, _, _ = inference_process_of_elimination(model, eval_mcp_dataloader, device, compute_func, tokenizer.pad_token_id)
 
         # step 6: some postprocessing, including saving and displyaing output.
         save_path = os.path.join("../results", f"{args.method}.csv")
diff --git a/methods/process_of_elimination_vqa.py b/methods/process_of_elimination_vqa.py
index 83779f8..7b9e4d9 100644
--- a/methods/process_of_elimination_vqa.py
+++ b/methods/process_of_elimination_vqa.py
@@ -164,7 +164,7 @@ def main():
         if scoring_method == "channel":
             tokenized_channel_dataset = raw_dataset.map(preprocess_func_channel, fn_kwargs=fn_kwargs, batched=True, batch_size=args.batch_size)
             eval_channel_dataloader = DataLoader(tokenized_channel_dataset, batch_size=args.batch_size, shuffle=False)
-            avg_log_probs, _, _ = inference_language_modeling(model, eval_channel_dataloader, device, compute_func, tokenizer.pad_token_id)
+            avg_log_probs, _, _, _ = inference_language_modeling(model, eval_channel_dataloader, device, compute_func, tokenizer.pad_token_id)
         elif scoring_method == "calibration":
             if args.model_family in ["BLIP2", "InstructBLIP", "GIT", "PaliGemma", "Idefics2"]:
                 fn_kwargs = {"ending_names": ending_names, 
@@ -180,7 +180,7 @@ def main():
             eval_calibration_dataloader = DataLoader(tokenized_calibration_dataset, batch_size=args.batch_size, shuffle=False)    
             avg_log_probs, _, _ = inference_calibration(model, eval_dataloader, eval_calibration_dataloader,device, compute_func, tokenizer.pad_token_id)
         elif scoring_method == "language_modeling":
-            avg_log_probs, _, _ = inference_language_modeling(model, eval_dataloader, device, compute_func, tokenizer.pad_token_id)
+            avg_log_probs, _, _, _ = inference_language_modeling(model, eval_dataloader, device, compute_func, tokenizer.pad_token_id)
         elif scoring_method == "multiple_choice_prompt":
             # mcp_args = copy.deepcopy(args)
             # mcp_args.multiple_choice_prompt = multiple_choice_prompt
@@ -188,7 +188,7 @@ def main():
             # raw_mcp_dataset, n_shot_mcp_dataset = create_n_shot_splits(raw_mcp_dataset, n_shot_mcp_dataset, args)    
             tokenized_dataset = raw_mcp_dataset.map(preprocess_func, fn_kwargs=fn_kwargs, batched=True, batch_size=args.batch_size)
             eval_mcp_dataloader = DataLoader(tokenized_dataset, batch_size=args.batch_size, shuffle=False)
-            avg_log_probs, _, _ = inference_language_modeling(model, eval_mcp_dataloader, device, compute_func, tokenizer.pad_token_id)
+            avg_log_probs, _, _, _ = inference_language_modeling(model, eval_mcp_dataloader, device, compute_func, tokenizer.pad_token_id)
         else:
             raise NotImplementedError # unlikely to happen.
         
@@ -242,7 +242,7 @@ def main():
         logger.info(f"Step 3: Final Inference")
         mcp_dataset = mcp_dataset.map(preprocess_func, fn_kwargs=fn_kwargs, batched=True, batch_size=args.batch_size)
         eval_mcp_dataloader = DataLoader(mcp_dataset, batch_size=args.batch_size, shuffle=False)
-        poe_avg_log_probs,  lm_accuracy, _ = inference_process_of_elimination(model, eval_mcp_dataloader, device, compute_func, tokenizer.pad_token_id)
+        poe_avg_log_probs,  lm_accuracy, _, _ = inference_process_of_elimination(model, eval_mcp_dataloader, device, compute_func, tokenizer.pad_token_id)
 
         # step 6: some postprocessing, including saving and displyaing output.
         save_path = os.path.join("../results", f"{args.method}.csv")
diff --git a/methods/utils/data.py b/methods/utils/data.py
index 8c64ae1..405cc8a 100644
--- a/methods/utils/data.py
+++ b/methods/utils/data.py
@@ -821,20 +821,20 @@ def generate_n_shot_poe_demonstrations(n_shot_dataset, num_of_options):
     return n_shot_demonstrations, n_shot_poe_demonstrations
 
 def vqa_loader(path, args):
-    versionType = '' # this should be '' when using VQA v2.0 dataset
-    taskType = 'MultipleChoice' # 'OpenEnded' only for v2.0. 'OpenEnded' or 'MultipleChoice' for v1.0
-    dataType = 'mscoco'  # 'mscoco' only for v1.0. 'mscoco' for real and 'abstract_v002' for abstract for v1.0.
-    dataSubType = 'train2014'
-    annFile = '%s/Annotations/%s%s_%s_annotations.json'%(path, versionType, dataType, dataSubType)
-    quesFile = '%s/Questions/%s%s_%s_%s_questions.json'%(path, versionType, taskType, dataType, dataSubType)
-    imgDir = '%s/Images/%s/%s' %(path, dataType, dataSubType)
+    version_type = '' # this should be '' when using VQA v2.0 dataset
+    task_type = 'MultipleChoice' # 'OpenEnded' only for v2.0. 'OpenEnded' or 'MultipleChoice' for v1.0
+    data_type = 'mscoco'  # 'mscoco' only for v1.0. 'mscoco' for real and 'abstract_v002' for abstract for v1.0.
+    data_subtype = 'train2014'
+    ann_file = '%s/Annotations/%s%s_%s_annotations.json'%(path, version_type, data_type, data_subtype)
+    question_file = '%s/Questions/%s%s_%s_%s_questions.json'%(path, version_type, task_type, data_type, data_subtype)
+    img_dir = '%s/Images/%s/%s' %(path, data_type, data_subtype)
     alphabets = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
 
     examples = []
 
     print('Loading annotations and questions...')
-    train_anno = json.load(open(annFile, 'r'))
-    train_ques = json.load(open(quesFile, 'r'))
+    train_anno = json.load(open(ann_file, 'r'))
+    train_ques = json.load(open(question_file, 'r'))
 
     if args.calibration_prompt is not None:
         uncond_premise = args.calibration_prompt
@@ -845,7 +845,7 @@ def vqa_loader(path, args):
         ans = train_anno['annotations'][i]['multiple_choice_answer']
         img_id = train_anno['annotations'][i]['image_id']
         # question_id = train_anno['annotations'][i]['question_id']
-        image_path = os.path.join(imgDir, 'COCO_train2014_' + '%012d.jpg' % img_id)
+        image_path = os.path.join(img_dir, 'COCO_train2014_' + '%012d.jpg' % img_id)
 
         question = train_ques['questions'][i]['question']
         mc_ans = train_ques['questions'][i]['multiple_choices']
@@ -881,17 +881,16 @@ def vqa_loader(path, args):
     return examples
 
 def scienceqa_loader(path, args):
-    annFile = '%s/ScienceQA_DATA/problems.json'%(path)
-    # traintestFile = '%s/ScienceQA_DATA/pid_splits.json'%(path)
-    imgDir = '%s/ScienceQA_DATA/train' %(path)
+    ann_file = '%s/ScienceQA_DATA/problems.json'%(path)
+    img_dir = '%s/ScienceQA_DATA/train' %(path)
     alphabets = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
 
     examples = []
 
     print('Loading annotations and images...')
-    anno = json.load(open(annFile, 'r'))
+    anno = json.load(open(ann_file, 'r'))
     # train_test_split = json.load(open(traintestFile, 'r'))
-    train_ids = os.listdir(imgDir)
+    train_ids = os.listdir(img_dir)
     train_anno = {id: anno[id] for id in train_ids}
 
     if args.calibration_prompt is not None:
@@ -909,7 +908,7 @@ def scienceqa_loader(path, args):
         if (not len(mc_ans) == args.num_options) or (image_file == None):
             continue
 
-        image_path = os.path.join(os.path.join(imgDir, img_id), image_file)
+        image_path = os.path.join(os.path.join(img_dir, img_id), image_file)
         if getattr(args, 'multiple_choice_prompt', None) is not None:
             hypotheses = mc_ans
             # Question: How does a bishop move from one place to another?
@@ -938,14 +937,14 @@ def scienceqa_loader(path, args):
     return examples
 
 def ai2d_loader(path, args):
-    questionDir = '%s/ai2d/questions' %(path)
+    question_dir = '%s/ai2d/questions' %(path)
     imgDir = '%s/ai2d/images' %(path)
     alphabets = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
 
     examples = []
 
     print('Loading annotations and images...')
-    train_files = os.listdir(questionDir)
+    train_files = os.listdir(question_dir)
 
     if args.calibration_prompt is not None:
         uncond_premise = args.calibration_prompt
@@ -953,7 +952,7 @@ def ai2d_loader(path, args):
         uncond_premise = " the answer is:"
 
     for i, file in enumerate(train_files):
-        anno = json.load(open(os.path.join(questionDir, file), 'r'))
+        anno = json.load(open(os.path.join(question_dir, file), 'r'))
         questions = anno["questions"]
         imageName = anno["imageName"]
         for question, value in questions.items():
@@ -990,4 +989,47 @@ def ai2d_loader(path, args):
                 example[0][f'hypothesis{idx}'] = ans
             examples+=example
     print("Dataset Length: ",len(examples))
+    return examples
+
+def single_inference_loader(path, args):
+    alphabets = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+
+    examples = []
+
+    print('Loading single question and image...')
+    question = args.question
+    mc_ans = args.choices
+    label = args.label
+    image_path = path
+    
+    if args.calibration_prompt is not None:
+        uncond_premise = args.calibration_prompt
+    else:
+        uncond_premise = " the answer is:"
+
+    if getattr(args, 'multiple_choice_prompt', None) is not None:
+        hypotheses = mc_ans
+        # Question: How does a bishop move from one place to another?
+        # A. chess game
+        # B. church
+        # C. in a car
+        # D. queen
+        # Answer:
+        options = "\n".join([f"{alphabets[i]}. {ans}" for i, ans in enumerate(mc_ans)])
+        premise = f"{args.multiple_choice_prompt} Question: {question}\n{options}\nAnswer:"
+    else:
+        hypotheses = mc_ans
+        premise = question + uncond_premise
+
+    example = [{
+        'premise': premise, 
+        'image_path': image_path, 
+        'uncond_premise': uncond_premise,  
+        'label': label
+        }]
+
+    for idx, ans in enumerate(hypotheses):
+        example[0][f'hypothesis{idx}'] = ans
+    examples+=example
+    print("Dataset Length: ",len(examples))
     return examples
\ No newline at end of file
diff --git a/methods/utils/methods.py b/methods/utils/methods.py
index 3c9d3e4..70c292d 100644
--- a/methods/utils/methods.py
+++ b/methods/utils/methods.py
@@ -105,7 +105,7 @@ def inference_language_modeling(model, eval_dataloader, device, compute_func, pa
         avg_lm_accuracy = (avg_lm_predictions == labels).sum().item() / len(labels)
         pbar.set_description(f"Language modeling accuracy: {lm_accuracy:.4f}, Average language modeling accuracy: {avg_lm_accuracy:.4f}")
     avg_log_probs = torch.cat(avg_log_probs, dim=0)
-    return avg_log_probs, lm_accuracy, avg_lm_accuracy
+    return avg_log_probs, lm_accuracy, avg_lm_accuracy, lm_predictions
 
 def inference_generate_synonyms(model, eval_dataloader, device, compute_func, pad_token_id, num_of_options, num_of_synonyms):
     model.eval()
@@ -144,7 +144,7 @@ def inference_generate_synonyms(model, eval_dataloader, device, compute_func, pa
         avg_lm_accuracy = (avg_lm_predictions == labels).sum().item() / len(labels)
         pbar.set_description(f"Language modeling accuracy: {lm_accuracy:.4f}, Average language modeling accuracy: {avg_lm_accuracy:.4f}")
     avg_log_probs = torch.cat(avg_log_probs, dim=0)
-    return avg_log_probs, lm_accuracy, avg_lm_accuracy
+    return avg_log_probs, lm_accuracy, avg_lm_accuracy, lm_predictions
 
 def inference_calibration(model, eval_dataloader, eval_calibration_dataloader, device, compute_func, pad_token_id):
     model.eval()
@@ -175,7 +175,7 @@ def inference_calibration(model, eval_dataloader, eval_calibration_dataloader, d
         avg_lm_accuracy = (avg_lm_predictions == labels).sum().item() / len(labels)
         pbar.set_description(f"Calibration accuracy: {lm_accuracy:.4f}, Average calibration accuracy: {avg_lm_accuracy:.4f}")
     avg_log_probs = torch.cat(avg_log_probs, dim=0)
-    return avg_log_probs, lm_accuracy, avg_lm_accuracy
+    return avg_log_probs, lm_accuracy, avg_lm_accuracy, lm_predictions
 
 def inference_contrastive_decoding(method, model, **kwargs):
     args = kwargs["args"]
@@ -203,7 +203,7 @@ def inference_contrastive_decoding(method, model, **kwargs):
     tokenized_dataset = raw_dataset.map(preprocess_func, fn_kwargs=fn_kwargs, batched=True, batch_size=args.batch_size)
     eval_dataloader = DataLoader(tokenized_dataset, batch_size=args.batch_size, shuffle=False)
     if method in ["language_modeling", "multiple_choice_prompt"]:
-        avg_log_probs, lm_accuracy, avg_lm_accuracy = inference_language_modeling(model, eval_dataloader, device, compute_func, tokenizer.pad_token_id)
+        avg_log_probs, lm_accuracy, avg_lm_accuracy, lm_predictions = inference_language_modeling(model, eval_dataloader, device, compute_func, tokenizer.pad_token_id)
     elif method == "calibration":
         fn_kwargs = {"ending_names": ending_names, 
                     "header_name": "uncond_premise", # the difference is here
@@ -216,15 +216,15 @@ def inference_contrastive_decoding(method, model, **kwargs):
                         "image_header_name": image_header_name}
         tokenized_calibration_dataset = raw_dataset.map(preprocess_func, fn_kwargs=fn_kwargs, batched=True, batch_size=args.batch_size)
         eval_calibration_dataloader = DataLoader(tokenized_calibration_dataset, batch_size=args.batch_size, shuffle=False)    
-        avg_log_probs, lm_accuracy, avg_lm_accuracy = inference_calibration(model, eval_dataloader, eval_calibration_dataloader,device, compute_func, tokenizer.pad_token_id)
+        avg_log_probs, lm_accuracy, avg_lm_accuracy, lm_predictions = inference_calibration(model, eval_dataloader, eval_calibration_dataloader,device, compute_func, tokenizer.pad_token_id)
     elif method == "channel":
         # simple solution: swap first sentence and second sentence in both preprocessing functions
         tokenized_channel_dataset = raw_dataset.map(preprocess_func_channel, fn_kwargs=fn_kwargs, batched=True, batch_size=args.batch_size)
         eval_channel_dataloader = DataLoader(tokenized_channel_dataset, batch_size=args.batch_size, shuffle=False)
-        avg_log_probs, lm_accuracy, avg_lm_accuracy = inference_language_modeling(model, eval_channel_dataloader, device, compute_func, tokenizer.pad_token_id)
+        avg_log_probs, lm_accuracy, avg_lm_accuracy, lm_predictions = inference_language_modeling(model, eval_channel_dataloader, device, compute_func, tokenizer.pad_token_id)
     else:
         raise NotImplementedError
-    return avg_log_probs, lm_accuracy, avg_lm_accuracy
+    return avg_log_probs, lm_accuracy, avg_lm_accuracy, lm_predictions
 
 def compute_mask_process_of_elimination(avg_log_probs, mask_strategy, **kwargs):
     masks = torch.ones_like(avg_log_probs)
@@ -282,7 +282,7 @@ def inference_process_of_elimination(model, eval_dataloader, device, compute_fun
         avg_lm_accuracy = (avg_lm_predictions == labels).sum().item() / len(labels)
         pbar.set_description(f"Process of elimination accuracy: {lm_accuracy:.4f}, Average process of elimination accuracy: {avg_lm_accuracy:.4f}")
     avg_log_probs = torch.cat(avg_log_probs, dim=0)
-    return avg_log_probs, lm_accuracy, avg_lm_accuracy
+    return avg_log_probs, lm_accuracy, avg_lm_accuracy, lm_predictions
 
 def compute_conditional_score_seq2seq(batch, model, device, pad_token_id):
     # returns log_prob of p(y|x) for each batch
diff --git a/methods/utils/utils.py b/methods/utils/utils.py
index e383d08..409a4b3 100644
--- a/methods/utils/utils.py
+++ b/methods/utils/utils.py
@@ -34,7 +34,8 @@
     anli_loader,
     vqa_loader,
     scienceqa_loader,
-    ai2d_loader
+    ai2d_loader,
+    single_inference_loader
 )
 
 def set_seed(seed):
@@ -411,6 +412,12 @@ def load_data(args):
         header_name = "premise"
         image_header_name = "image_path"
         loader = ai2d_loader
+    elif args.dataset == "single_inference":
+        file_path = args.image_path
+        ending_names = [f"hypothesis{i}" for i in range(args.num_options)]
+        header_name = "premise"
+        image_header_name = "image_path"
+        loader = single_inference_loader
     else:
         print(f"{args.dataset}: downloader not implemented.")
         return
@@ -423,7 +430,7 @@ def load_data(args):
         train_dataset = Dataset.from_list(train_data).with_format("torch")
     else: # BB tasks have no train set. 
         train_dataset = dev_dataset
-    if args.dataset in ["vqa", "scienceqa", "ai2d"]:
+    if args.dataset in ["vqa", "scienceqa", "ai2d", "single_inference"]:
         return ending_names, header_name, image_header_name, dev_dataset, train_dataset
     return ending_names, header_name, dev_dataset, train_dataset
 
diff --git a/methods/vision_language_modeling.py b/methods/vision_language_modeling.py
index 2ebe5ad..1e1e9cc 100644
--- a/methods/vision_language_modeling.py
+++ b/methods/vision_language_modeling.py
@@ -137,7 +137,7 @@ def main():
         # step 5: (evaluation) inference on data, and compute accuracy.
         logger.info(f"Start inference (method: {args.method}) on {args.dataset} using {args.model_family} model: {args.checkpoint}.")
         if args.method in ["vision_language_modeling", "multiple_choice_prompt"]:
-            _, lm_accuracy, avg_lm_accuracy = inference_language_modeling(model, eval_dataloader, device, compute_func, tokenizer.pad_token_id)
+            _, lm_accuracy, avg_lm_accuracy, _ = inference_language_modeling(model, eval_dataloader, device, compute_func, tokenizer.pad_token_id)
         elif args.method == "contrastive_decoding":
             logger.info(f"Load {args.model_family} amateur model: {args.amateur_checkpoint}.")
             # get model path: ../models/args.model_family/args.checkpoint
@@ -145,8 +145,8 @@ def main():
             amateur_model, _ = load_model(device, amateur_model_path, args)
             # we want to integrate contrastive decoding with other methods, so we need separate output from each model.
             # compute log probs on each model        
-            exp_avg_log_probs, exp_lm_accuracy, exp_avg_lm_accuracy = inference_language_modeling(model, eval_dataloader, device, compute_func, tokenizer.pad_token_id)
-            ama_avg_log_probs, ama_lm_accuracy, ama_avg_lm_accuracy = inference_language_modeling(amateur_model, eval_dataloader, device, compute_func, tokenizer.pad_token_id)
+            exp_avg_log_probs, exp_lm_accuracy, exp_avg_lm_accuracy, _ = inference_language_modeling(model, eval_dataloader, device, compute_func, tokenizer.pad_token_id)
+            ama_avg_log_probs, ama_lm_accuracy, ama_avg_lm_accuracy, _ = inference_language_modeling(amateur_model, eval_dataloader, device, compute_func, tokenizer.pad_token_id)
             # calculate difference, and may introduce extra parameters.
             avg_log_probs = exp_avg_log_probs - ama_avg_log_probs
             labels = raw_dataset['label']
@@ -167,12 +167,12 @@ def main():
                         "image_header_name": image_header_name}
             tokenized_calibration_dataset = raw_dataset.map(preprocess_func, fn_kwargs=fn_kwargs, batched=True, batch_size=args.batch_size)
             eval_calibration_dataloader = DataLoader(tokenized_calibration_dataset, batch_size=args.batch_size, shuffle=False)    
-            _, lm_accuracy, avg_lm_accuracy = inference_calibration(model, eval_dataloader, eval_calibration_dataloader,device, compute_func, tokenizer.pad_token_id)
+            _, lm_accuracy, avg_lm_accuracy, _ = inference_calibration(model, eval_dataloader, eval_calibration_dataloader,device, compute_func, tokenizer.pad_token_id)
         elif args.method == "channel":
             # simple solution: swap first sentence and second sentence in both preprocessing functions
             tokenized_channel_dataset = raw_dataset.map(preprocess_func_channel, fn_kwargs=fn_kwargs, batched=True, batch_size=args.batch_size)
             eval_channel_dataloader = DataLoader(tokenized_channel_dataset, batch_size=args.batch_size, shuffle=False)
-            _, lm_accuracy, avg_lm_accuracy = inference_language_modeling(model, eval_channel_dataloader, device, compute_func, tokenizer.pad_token_id)
+            _, lm_accuracy, avg_lm_accuracy, _ = inference_language_modeling(model, eval_channel_dataloader, device, compute_func, tokenizer.pad_token_id)
         elif args.method == "generate_synonyms":
             # 3 stpes: generate synonyms, then map datasets, then inference.
             logger.info(f"Generate synonyms for {args.dataset}.")
@@ -197,7 +197,7 @@ def main():
                             "image_header_name": image_header_name}
             tokenized_synonyms_dataset = synonyms_dataset.map(preprocess_func, fn_kwargs=fn_kwargs, batched=True, batch_size=args.batch_size)
             eval_synonyms_dataloader = DataLoader(tokenized_synonyms_dataset, batch_size=args.batch_size, shuffle=False)
-            _, lm_accuracy, avg_lm_accuracy = inference_generate_synonyms(model, eval_synonyms_dataloader, device, compute_func, tokenizer.pad_token_id, num_of_options, args.number_of_synonyms)
+            _, lm_accuracy, avg_lm_accuracy, _ = inference_generate_synonyms(model, eval_synonyms_dataloader, device, compute_func, tokenizer.pad_token_id, num_of_options, args.number_of_synonyms)
         else:
             raise NotImplementedError
 

From efea97b90eac9da429db9ac12202b85f78d655ea Mon Sep 17 00:00:00 2001
From: Souradip Pal <souradip.iitg@gmail.com>
Date: Sun, 6 Oct 2024 12:02:22 -0500
Subject: [PATCH 02/30] Added project template files.

---
 .github/FUNDING.yml                           |  12 ++
 .github/ISSUE_TEMPLATE/bug_report.md          |  31 +++
 .github/ISSUE_TEMPLATE/feature_request.md     |  20 ++
 .github/PULL_REQUEST_TEMPLATE.md              |  15 ++
 .github/dependabot.yml                        |   6 +
 .github/init.sh                               |  68 ++++++
 .github/release_message.sh                    |   3 +
 .github/rename_project.sh                     |  36 ++++
 .github/template.yml                          |   1 +
 .github/workflows/main.yml                    |  92 ++++++++
 .github/workflows/release.yml                 |  52 +++++
 .github/workflows/rename_project.yml          |  42 ++++
 .gitignore                                    | 132 ++++++++++++
 ABOUT_THIS_TEMPLATE.md                        | 198 ++++++++++++++++++
 CONTRIBUTING.md                               | 113 ++++++++++
 Containerfile                                 |   5 +
 HISTORY.md                                    |  13 ++
 MANIFEST.in                                   |   5 +
 Makefile                                      | 122 +++++++++++
 README.md                                     |  23 +-
 docs/index.md                                 |  17 ++
 mkdocs.yml                                    |   2 +
 project_name/VERSION                          |   1 +
 project_name/__init__.py                      |   0
 project_name/__main__.py                      |   6 +
 project_name/base.py                          |  17 ++
 main.py => project_name/cli.py                |  10 +-
 .../data}/data_downloaders.sh                 |   0
 .../methods}/11_few_shot_vqa.sh               |   0
 .../methods}/1_main_exp.sh                    |   0
 .../methods}/2_logical_reasoning.sh           |   0
 {methods => project_name/methods}/3_mask.sh   |   0
 {methods => project_name/methods}/4_llm.sh    |   0
 .../methods}/5_few_shot.sh                    |   0
 .../methods}/6_num_option.sh                  |   0
 .../methods}/7_main_exp_vqa.sh                |   0
 .../methods}/9_mask_vqa.sh                    |   0
 .../methods}/language_modeling.py             |   0
 .../methods}/process_of_elimination.py        |   0
 .../methods}/process_of_elimination_vqa.py    |   0
 .../methods}/utils/data.py                    |   0
 .../methods}/utils/methods.py                 |   0
 .../methods}/utils/models.py                  |   0
 .../methods}/utils/utils.py                   |   0
 .../methods}/vision_language_modeling.py      |   0
 .../model_downloaders/model_downloaders.py    |   0
 .../model_downloaders/model_downloaders.sh    |   0
 .../results}/calibration.csv                  |   0
 .../results}/calibration1.csv                 |   0
 {results => project_name/results}/channel.csv |   0
 .../results}/channel1.csv                     |   0
 .../results}/multiple_choice_prompt.csv       |   0
 .../results}/multiple_choice_prompt1.csv      |   0
 .../results}/process_of_elimination.csv       |   0
 .../results}/process_of_elimination1.csv      |   0
 .../results}/vision_language_modeling.csv     |   0
 .../results}/vision_language_modeling1.csv    |   0
 requirements-test.txt                         |  10 +
 setup.py                                      |  46 ++++
 tests/__init__.py                             |   0
 tests/conftest.py                             |  14 ++
 tests/test_base.py                            |   5 +
 62 files changed, 1111 insertions(+), 6 deletions(-)
 create mode 100644 .github/FUNDING.yml
 create mode 100644 .github/ISSUE_TEMPLATE/bug_report.md
 create mode 100644 .github/ISSUE_TEMPLATE/feature_request.md
 create mode 100644 .github/PULL_REQUEST_TEMPLATE.md
 create mode 100644 .github/dependabot.yml
 create mode 100755 .github/init.sh
 create mode 100755 .github/release_message.sh
 create mode 100755 .github/rename_project.sh
 create mode 100644 .github/template.yml
 create mode 100644 .github/workflows/main.yml
 create mode 100644 .github/workflows/release.yml
 create mode 100644 .github/workflows/rename_project.yml
 create mode 100644 .gitignore
 create mode 100644 ABOUT_THIS_TEMPLATE.md
 create mode 100644 CONTRIBUTING.md
 create mode 100644 Containerfile
 create mode 100644 HISTORY.md
 create mode 100644 MANIFEST.in
 create mode 100644 Makefile
 create mode 100644 docs/index.md
 create mode 100644 mkdocs.yml
 create mode 100644 project_name/VERSION
 create mode 100644 project_name/__init__.py
 create mode 100644 project_name/__main__.py
 create mode 100644 project_name/base.py
 rename main.py => project_name/cli.py (98%)
 rename {data => project_name/data}/data_downloaders.sh (100%)
 rename {methods => project_name/methods}/11_few_shot_vqa.sh (100%)
 rename {methods => project_name/methods}/1_main_exp.sh (100%)
 rename {methods => project_name/methods}/2_logical_reasoning.sh (100%)
 rename {methods => project_name/methods}/3_mask.sh (100%)
 rename {methods => project_name/methods}/4_llm.sh (100%)
 rename {methods => project_name/methods}/5_few_shot.sh (100%)
 rename {methods => project_name/methods}/6_num_option.sh (100%)
 rename {methods => project_name/methods}/7_main_exp_vqa.sh (100%)
 rename {methods => project_name/methods}/9_mask_vqa.sh (100%)
 rename {methods => project_name/methods}/language_modeling.py (100%)
 rename {methods => project_name/methods}/process_of_elimination.py (100%)
 rename {methods => project_name/methods}/process_of_elimination_vqa.py (100%)
 rename {methods => project_name/methods}/utils/data.py (100%)
 rename {methods => project_name/methods}/utils/methods.py (100%)
 rename {methods => project_name/methods}/utils/models.py (100%)
 rename {methods => project_name/methods}/utils/utils.py (100%)
 rename {methods => project_name/methods}/vision_language_modeling.py (100%)
 rename {models => project_name/models}/model_downloaders/model_downloaders.py (100%)
 rename {models => project_name/models}/model_downloaders/model_downloaders.sh (100%)
 rename {results => project_name/results}/calibration.csv (100%)
 rename {results => project_name/results}/calibration1.csv (100%)
 rename {results => project_name/results}/channel.csv (100%)
 rename {results => project_name/results}/channel1.csv (100%)
 rename {results => project_name/results}/multiple_choice_prompt.csv (100%)
 rename {results => project_name/results}/multiple_choice_prompt1.csv (100%)
 rename {results => project_name/results}/process_of_elimination.csv (100%)
 rename {results => project_name/results}/process_of_elimination1.csv (100%)
 rename {results => project_name/results}/vision_language_modeling.csv (100%)
 rename {results => project_name/results}/vision_language_modeling1.csv (100%)
 create mode 100644 requirements-test.txt
 create mode 100644 setup.py
 create mode 100644 tests/__init__.py
 create mode 100644 tests/conftest.py
 create mode 100644 tests/test_base.py

diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml
new file mode 100644
index 0000000..1e9d2a1
--- /dev/null
+++ b/.github/FUNDING.yml
@@ -0,0 +1,12 @@
+# These are supported funding model platforms
+
+github: [rochacbruno]
+patreon: # Replace with a single Patreon username
+open_collective: # Replace with a single Open Collective username
+ko_fi: # Replace with a single Ko-fi username
+tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
+community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
+liberapay: # Replace with a single Liberapay username
+issuehunt: # Replace with a single IssueHunt username
+otechie: # Replace with a single Otechie username
+custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2']
diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md
new file mode 100644
index 0000000..0d9360d
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/bug_report.md
@@ -0,0 +1,31 @@
+---
+name: Bug report
+about: Create a report to help us improve
+title: ''
+labels: bug, help wanted
+assignees: ''
+
+---
+
+**Describe the bug**
+A clear and concise description of what the bug is.
+
+**To Reproduce**
+Steps to reproduce the behavior:
+1. Go to '...'
+2. Click on '....'
+3. Scroll down to '....'
+4. See error
+
+**Expected behavior**
+A clear and concise description of what you expected to happen.
+
+**Screenshots**
+If applicable, add screenshots to help explain your problem.
+
+**Desktop (please complete the following information):**
+ - OS: [e.g. iOS]
+ - Version [e.g. 22]
+
+**Additional context**
+Add any other context about the problem here.
diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md
new file mode 100644
index 0000000..cc98b69
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/feature_request.md
@@ -0,0 +1,20 @@
+---
+name: Feature request
+about: Suggest an idea for this project
+title: ''
+labels: enhancement, question
+assignees: ''
+
+---
+
+**Is your feature request related to a problem? Please describe.**
+A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
+
+**Describe the solution you'd like**
+A clear and concise description of what you want to happen.
+
+**Describe alternatives you've considered**
+A clear and concise description of any alternative solutions or features you've considered.
+
+**Additional context**
+Add any other context or screenshots about the feature request here.
diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
new file mode 100644
index 0000000..9ccc736
--- /dev/null
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@@ -0,0 +1,15 @@
+### Summary :memo:
+_Write an overview about it._
+
+### Details
+_Describe more what you did on changes._
+1. (...)
+2. (...)
+
+### Bugfixes :bug: (delete if dind't have any)
+-
+
+### Checks
+- [ ] Closed #798
+- [ ] Tested Changes
+- [ ] Stakeholder Approval
diff --git a/.github/dependabot.yml b/.github/dependabot.yml
new file mode 100644
index 0000000..120c689
--- /dev/null
+++ b/.github/dependabot.yml
@@ -0,0 +1,6 @@
+version: 2
+updates:
+  - package-ecosystem: "github-actions"
+    directory: "/"
+    schedule:
+      interval: "weekly"
\ No newline at end of file
diff --git a/.github/init.sh b/.github/init.sh
new file mode 100755
index 0000000..8a32ee3
--- /dev/null
+++ b/.github/init.sh
@@ -0,0 +1,68 @@
+#!/usr/bin/env bash
+overwrite_template_dir=0
+
+while getopts t:o flag
+do
+    case "${flag}" in
+        t) template=${OPTARG};;
+        o) overwrite_template_dir=1;;
+    esac
+done
+
+if [ -z "${template}" ]; then
+    echo "Available templates: flask"
+    read -p "Enter template name: " template
+fi
+
+repo_urlname=$(basename -s .git `git config --get remote.origin.url`)
+repo_name=$(basename -s .git `git config --get remote.origin.url` | tr '-' '_' | tr '[:upper:]' '[:lower:]')
+repo_owner=$(git config --get remote.origin.url | awk -F ':' '{print $2}' | awk -F '/' '{print $1}')
+echo "Repo name: ${repo_name}"
+echo "Repo owner: ${repo_owner}"
+echo "Repo urlname: ${repo_urlname}"
+
+if [ -f ".github/workflows/rename_project.yml" ]; then
+    .github/rename_project.sh -a "${repo_owner}" -n "${repo_name}" -u "${repo_urlname}" -d "Awesome ${repo_name} created by ${repo_owner}"
+fi
+
+function download_template {
+    rm -rf "${template_dir}"
+    mkdir -p .github/templates
+    git clone "${template_url}" "${template_dir}"
+}
+
+echo "Using template:${template}"
+template_url="https://github.com/rochacbruno/${template}-project-template"
+template_dir=".github/templates/${template}"
+if [ -d "${template_dir}" ]; then
+    # Template directory already exists
+    if [ "${overwrite_template_dir}" -eq 1 ]; then
+        # user passed -o flag, delete and re-download
+        echo "Overwriting ${template_dir}"
+        download_template
+    else
+        # Ask user if they want to overwrite
+        echo "Directory ${template_dir} already exists."
+        read -p "Do you want to overwrite it? [y/N] " -n 1 -r
+        echo 
+        if [[ $REPLY =~ ^[Yy]$ ]]; then
+            echo "Overwriting ${template_dir}"
+            download_template
+        else
+            # User decided not to overwrite
+            echo "Using existing ${template_dir}"
+        fi
+    fi
+else
+    # Template directory does not exist, download it
+    echo "Downloading ${template_url}"
+    download_template
+fi
+
+echo "Applying ${template} template to this project"}
+./.github/templates/${template}/apply.sh -a "${repo_owner}" -n "${repo_name}" -u "${repo_urlname}" -d "Awesome ${repo_name} created by ${repo_owner}"
+
+# echo "Removing temporary template files"
+# rm -rf .github/templates/${template}
+
+echo "Done! review, commit and push the changes"
diff --git a/.github/release_message.sh b/.github/release_message.sh
new file mode 100755
index 0000000..f5a9062
--- /dev/null
+++ b/.github/release_message.sh
@@ -0,0 +1,3 @@
+#!/usr/bin/env bash
+previous_tag=$(git tag --sort=-creatordate | sed -n 2p)
+git shortlog "${previous_tag}.." | sed 's/^./    &/'
diff --git a/.github/rename_project.sh b/.github/rename_project.sh
new file mode 100755
index 0000000..8f05495
--- /dev/null
+++ b/.github/rename_project.sh
@@ -0,0 +1,36 @@
+#!/usr/bin/env bash
+while getopts a:n:u:d: flag
+do
+    case "${flag}" in
+        a) author=${OPTARG};;
+        n) name=${OPTARG};;
+        u) urlname=${OPTARG};;
+        d) description=${OPTARG};;
+    esac
+done
+
+echo "Author: $author";
+echo "Project Name: $name";
+echo "Project URL name: $urlname";
+echo "Description: $description";
+
+echo "Renaming project..."
+
+original_author="author_name"
+original_name="project_name"
+original_urlname="project_urlname"
+original_description="project_description"
+# for filename in $(find . -name "*.*") 
+for filename in $(git ls-files) 
+do
+    sed -i "s/$original_author/$author/g" $filename
+    sed -i "s/$original_name/$name/g" $filename
+    sed -i "s/$original_urlname/$urlname/g" $filename
+    sed -i "s/$original_description/$description/g" $filename
+    echo "Renamed $filename"
+done
+
+mv project_name $name
+
+# This command runs only once on GHA!
+rm -rf .github/template.yml
diff --git a/.github/template.yml b/.github/template.yml
new file mode 100644
index 0000000..3386bee
--- /dev/null
+++ b/.github/template.yml
@@ -0,0 +1 @@
+author: rochacbruno
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
new file mode 100644
index 0000000..5e82e08
--- /dev/null
+++ b/.github/workflows/main.yml
@@ -0,0 +1,92 @@
+# This is a basic workflow to help you get started with Actions
+
+name: CI
+
+# Controls when the workflow will run
+on:
+  # Triggers the workflow on push or pull request events but only for the main branch
+  push:
+    branches: [ main ]
+  pull_request:
+    branches: [ main ]
+
+  # Allows you to run this workflow manually from the Actions tab
+  workflow_dispatch:
+
+jobs:
+  linter:
+    strategy:
+      fail-fast: false
+      matrix:
+        python-version: [3.9]
+        os: [ubuntu-latest]
+    runs-on: ${{ matrix.os }}
+    steps:
+      - uses: actions/checkout@v3
+      - uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Install project
+        run: make install
+      - name: Run linter
+        run: make lint
+
+  tests_linux:
+    needs: linter
+    strategy:
+      fail-fast: false
+      matrix:
+        python-version: [3.9]
+        os: [ubuntu-latest]
+    runs-on: ${{ matrix.os }}
+    steps:
+      - uses: actions/checkout@v3
+      - uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Install project
+        run: make install
+      - name: Run tests
+        run: make test
+      - name: "Upload coverage to Codecov"
+        uses: codecov/codecov-action@v3
+        # with:
+        #   fail_ci_if_error: true
+
+  tests_mac:
+    needs: linter
+    strategy:
+      fail-fast: false
+      matrix:
+        python-version: [3.9]
+        os: [macos-latest]
+    runs-on: ${{ matrix.os }}
+    steps:
+      - uses: actions/checkout@v3
+      - uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Install project
+        run: make install
+      - name: Run tests
+        run: make test
+
+  tests_win:
+    needs: linter
+    strategy:
+      fail-fast: false
+      matrix:
+        python-version: [3.9]
+        os: [windows-latest]
+    runs-on: ${{ matrix.os }}
+    steps:
+      - uses: actions/checkout@v3
+      - uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Install Pip
+        run: pip install --user --upgrade pip
+      - name: Install project
+        run: pip install -e .[test]
+      - name: run tests
+        run: pytest -s -vvvv -l --tb=long tests
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
new file mode 100644
index 0000000..c38b48e
--- /dev/null
+++ b/.github/workflows/release.yml
@@ -0,0 +1,52 @@
+name: Upload Python Package
+permissions:
+  contents: write
+
+on:
+  push:
+    # Sequence of patterns matched against refs/tags
+    tags:
+      - '*' # Push events to matching v*, i.e. v1.0, v20.15.10
+
+  # Allows you to run this workflow manually from the Actions tab
+  workflow_dispatch:
+
+jobs:
+  release:
+    name: Create Release
+    runs-on: ubuntu-latest
+    permissions:
+      contents: write
+    steps:
+      - uses: actions/checkout@v3
+        with:
+          # by default, it uses a depth of 1
+          # this fetches all history so that we can read each commit
+          fetch-depth: 0
+      - name: Generate Changelog
+        run: .github/release_message.sh > release_message.md
+      - name: Release
+        uses: softprops/action-gh-release@v1
+        with:
+          body_path: release_message.md
+
+  deploy:
+    needs: release
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v3
+    - name: Set up Python
+      uses: actions/setup-python@v4
+      with:
+        python-version: '3.x'
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install setuptools wheel twine
+    - name: Build and publish
+      env:
+        TWINE_USERNAME: __token__
+        TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }}
+      run: |
+        python setup.py sdist bdist_wheel
+        twine upload dist/*
diff --git a/.github/workflows/rename_project.yml b/.github/workflows/rename_project.yml
new file mode 100644
index 0000000..ae6d2eb
--- /dev/null
+++ b/.github/workflows/rename_project.yml
@@ -0,0 +1,42 @@
+name: Rename the project from template
+
+on: [push]
+
+permissions: write-all
+
+jobs:
+  rename-project:
+    if: ${{ !contains (github.repository, '/python-project-template') }}
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+        with:
+          # by default, it uses a depth of 1
+          # this fetches all history so that we can read each commit
+          fetch-depth: 0
+          ref: ${{ github.head_ref }}
+          
+      - run: echo "REPOSITORY_NAME=$(echo '${{ github.repository }}' | awk -F '/' '{print $2}' | tr '-' '_' | tr '[:upper:]' '[:lower:]')" >> $GITHUB_ENV
+        shell: bash
+
+      - run: echo "REPOSITORY_URLNAME=$(echo '${{ github.repository }}' | awk -F '/' '{print $2}')" >> $GITHUB_ENV
+        shell: bash
+
+      - run: echo "REPOSITORY_OWNER=$(echo '${{ github.repository }}' | awk -F '/' '{print $1}')" >> $GITHUB_ENV
+        shell: bash
+        
+      - name: Is this still a template
+        id: is_template
+        run: echo "::set-output name=is_template::$(ls .github/template.yml &> /dev/null && echo true || echo false)"
+
+      - name: Rename the project
+        if: steps.is_template.outputs.is_template == 'true'
+        run: |
+          echo "Renaming the project with -a(author) ${{ env.REPOSITORY_OWNER }} -n(name) ${{ env.REPOSITORY_NAME }} -u(urlname) ${{ env.REPOSITORY_URLNAME }}"
+          .github/rename_project.sh -a ${{ env.REPOSITORY_OWNER }} -n ${{ env.REPOSITORY_NAME }} -u ${{ env.REPOSITORY_URLNAME }} -d "Awesome ${{ env.REPOSITORY_NAME }} created by ${{ env.REPOSITORY_OWNER }}"
+              
+      - uses: stefanzweifel/git-auto-commit-action@v4
+        with:
+          commit_message: "✅ Ready to clone and code."
+          # commit_options: '--amend --no-edit'
+          push_options: --force
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..2d0fadb
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,132 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# templates
+.github/templates/*
diff --git a/ABOUT_THIS_TEMPLATE.md b/ABOUT_THIS_TEMPLATE.md
new file mode 100644
index 0000000..11795f3
--- /dev/null
+++ b/ABOUT_THIS_TEMPLATE.md
@@ -0,0 +1,198 @@
+# About this template
+
+Hi, I created this template to help you get started with a new project.
+
+I have created and maintained a number of python libraries, applications and 
+frameworks and during those years I have learned a lot about how to create a 
+project structure and how to structure a project to be as modular and simple 
+as possible.
+
+Some decisions I have made while creating this template are:
+
+ - Create a project structure that is as modular as possible.
+ - Keep it simple and easy to maintain.
+ - Allow for a lot of flexibility and customizability.
+ - Low dependency (this template doesn't add dependencies)
+
+## Structure
+
+Lets take a look at the structure of this template:
+
+```text
+├── Containerfile            # The file to build a container using buildah or docker
+├── CONTRIBUTING.md          # Onboarding instructions for new contributors
+├── docs                     # Documentation site (add more .md files here)
+│   └── index.md             # The index page for the docs site
+├── .github                  # Github metadata for repository
+│   ├── release_message.sh   # A script to generate a release message
+│   └── workflows            # The CI pipeline for Github Actions
+├── .gitignore               # A list of files to ignore when pushing to Github
+├── HISTORY.md               # Auto generated list of changes to the project
+├── LICENSE                  # The license for the project
+├── Makefile                 # A collection of utilities to manage the project
+├── MANIFEST.in              # A list of files to include in a package
+├── mkdocs.yml               # Configuration for documentation site
+├── project_name             # The main python package for the project
+│   ├── base.py              # The base module for the project
+│   ├── __init__.py          # This tells Python that this is a package
+│   ├── __main__.py          # The entry point for the project
+│   └── VERSION              # The version for the project is kept in a static file
+├── README.md                # The main readme for the project
+├── setup.py                 # The setup.py file for installing and packaging the project
+├── requirements.txt         # An empty file to hold the requirements for the project
+├── requirements-test.txt    # List of requirements for testing and devlopment
+├── setup.py                 # The setup.py file for installing and packaging the project
+└── tests                    # Unit tests for the project (add mote tests files here)
+    ├── conftest.py          # Configuration, hooks and fixtures for pytest
+    ├── __init__.py          # This tells Python that this is a test package
+    └── test_base.py         # The base test case for the project
+```
+
+## FAQ
+
+Frequent asked questions.
+
+### Why this template is not using [Poetry](https://python-poetry.org/) ?
+
+I really like Poetry and I think it is a great tool to manage your python projects,
+if you want to switch to poetry, you can run `make switch-to-poetry`.
+
+But for this template I wanted to keep it simple.
+
+Setuptools is the most simple and well supported way of packaging a Python project,
+it doesn't require extra dependencies and is the easiest way to install the project.
+
+Also, poetry doesn't have a good support for installing projects in development mode yet.
+
+### Why the `requirements.txt` is empty ?
+
+This template is a low dependency project, so it doesn't have any extra dependencies.
+You can add new dependencies as you will or you can use the `make init` command to
+generate a `requirements.txt` file based on the template you choose `flask, fastapi, click etc`.
+
+### Why there is a `requirements-test.txt` file ?
+
+This file lists all the requirements for testing and development,
+I think the development environment and testing environment should be as similar as possible.
+
+Except those tools that are up to the developer choice (like ipython, ipdb etc).
+
+### Why the template doesn't have a `pyproject.toml` file ?
+
+It is possible to run `pip install https://github.com/name/repo/tarball/main` and
+have pip to download the package direcly from Git repo.
+
+For that to work you need to have a `setup.py` file, and `pyproject.toml` is not
+supported for that kind of installation.
+
+I think it is easier for example you want to install specific branch or tag you can
+do `pip install https://github.com/name/repo/tarball/{TAG|REVISON|COMMIT}`
+
+People automating CI for your project will be grateful for having a setup.py file
+
+### Why isn't this template made as a cookiecutter template?
+
+I really like [cookiecutter](https://github.com/cookiecutter/cookiecutter) and it is a great way to create new projects,
+but for this template I wanted to use the Github `Use this template` button,
+to use this template doesn't require to install extra tooling such as cookiecutter.
+
+Just click on [Use this template](https://github.com/rochacbruno/python-project-template/generate) and you are good to go.
+
+The substituions are done using github actions and a simple sed script.
+
+### Why `VERSION` is kept in a static plain text file?
+
+I used to have my version inside my main module in a `__version__` variable, then
+I had to do some tricks to read that version variable inside the setuptools 
+`setup.py` file because that would be available only after the installation.
+
+I decided to keep the version in a static file because it is easier to read from
+wherever I want without the need to install the package.
+
+e.g: `cat project_name/VERSION` will get the project version without harming
+with module imports or anything else, it is useful for CI, logs and debugging.
+
+### Why to include `tests`, `history` and `Containerfile` as part of the release?
+
+The `MANIFEST.in` file is used to include the files in the release, once the 
+project is released to PyPI all the files listed on MANIFEST.in will be included
+even if the files are static or not related to Python.
+
+Some build systems such as RPM, DEB, AUR for some Linux distributions, and also
+internal repackaging systems tends to run the tests before the packaging is performed.
+
+The Containerfile can be useful to provide a safer execution environment for 
+the project when running on a testing environment.
+
+I added those files to make it easier for packaging in different formats.
+
+### Why conftest includes a go_to_tmpdir fixture?
+
+When your project deals with file system operations, it is a good idea to use
+a fixture to create a temporary directory and then remove it after the test.
+
+Before executing each test pytest will create a temporary directory and will
+change the working directory to that path and run the test.
+
+So the test can create temporary artifacts isolated from other tests.
+
+After the execution Pytest will remove the temporary directory.
+
+### Why this template is not using [pre-commit](https://pre-commit.com/) ?
+
+pre-commit is an excellent tool to automate checks and formatting on your code.
+
+However I figured out that pre-commit adds extra dependency and it an entry barrier
+for new contributors.
+
+Having the linting, checks and formatting as simple commands on the [Makefile](Makefile)
+makes it easier to undestand and change.
+
+Once the project is bigger and complex, having pre-commit as a dependency can be a good idea.
+
+### Why the CLI is not using click?
+
+I wanted to provide a simple template for a CLI application on the project main entry point
+click and typer are great alternatives but are external dependencies and this template
+doesn't add dependencies besides those used for development.
+
+### Why this doesn't provide a full example of application using Flask or Django?
+
+as I said before, I want it to be simple and multipurpose, so I decided to not include
+external dependencies and programming design decisions.
+
+It is up to you to decide if you want to use Flask or Django and to create your application
+the way you think is best.
+
+This template provides utilities in the Makefile to make it easier to you can run:
+
+```bash
+$ make init 
+Which template do you want to apply? [flask, fastapi, click, typer]? > flask
+Generating a new project with Flask ...
+```
+
+Then the above will download the Flask template and apply it to the project.
+
+## The Makefile
+
+All the utilities for the template and project are on the Makefile
+
+```bash
+❯ make
+Usage: make <target>
+
+Targets:
+help:             ## Show the help.
+install:          ## Install the project in dev mode.
+fmt:              ## Format code using black & isort.
+lint:             ## Run pep8, black, mypy linters.
+test: lint        ## Run tests and generate coverage report.
+watch:            ## Run tests on every change.
+clean:            ## Clean unused files.
+virtualenv:       ## Create a virtual environment.
+release:          ## Create a new tag for release.
+docs:             ## Build the documentation.
+switch-to-poetry: ## Switch to poetry package manager.
+init:             ## Initialize the project based on an application template.
+```
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
new file mode 100644
index 0000000..0d0dd72
--- /dev/null
+++ b/CONTRIBUTING.md
@@ -0,0 +1,113 @@
+# How to develop on this project
+
+project_name welcomes contributions from the community.
+
+**You need PYTHON3!**
+
+This instructions are for linux base systems. (Linux, MacOS, BSD, etc.)
+## Setting up your own fork of this repo.
+
+- On github interface click on `Fork` button.
+- Clone your fork of this repo. `git clone git@github.com:YOUR_GIT_USERNAME/project_urlname.git`
+- Enter the directory `cd project_urlname`
+- Add upstream repo `git remote add upstream https://github.com/author_name/project_urlname`
+
+## Setting up your own virtual environment
+
+Run `make virtualenv` to create a virtual environment.
+then activate it with `source .venv/bin/activate`.
+
+## Install the project in develop mode
+
+Run `make install` to install the project in develop mode.
+
+## Run the tests to ensure everything is working
+
+Run `make test` to run the tests.
+
+## Create a new branch to work on your contribution
+
+Run `git checkout -b my_contribution`
+
+## Make your changes
+
+Edit the files using your preferred editor. (we recommend VIM or VSCode)
+
+## Format the code
+
+Run `make fmt` to format the code.
+
+## Run the linter
+
+Run `make lint` to run the linter.
+
+## Test your changes
+
+Run `make test` to run the tests.
+
+Ensure code coverage report shows `100%` coverage, add tests to your PR.
+
+## Build the docs locally
+
+Run `make docs` to build the docs.
+
+Ensure your new changes are documented.
+
+## Commit your changes
+
+This project uses [conventional git commit messages](https://www.conventionalcommits.org/en/v1.0.0/).
+
+Example: `fix(package): update setup.py arguments 🎉` (emojis are fine too)
+
+## Push your changes to your fork
+
+Run `git push origin my_contribution`
+
+## Submit a pull request
+
+On github interface, click on `Pull Request` button.
+
+Wait CI to run and one of the developers will review your PR.
+## Makefile utilities
+
+This project comes with a `Makefile` that contains a number of useful utility.
+
+```bash 
+❯ make
+Usage: make <target>
+
+Targets:
+help:             ## Show the help.
+install:          ## Install the project in dev mode.
+fmt:              ## Format code using black & isort.
+lint:             ## Run pep8, black, mypy linters.
+test: lint        ## Run tests and generate coverage report.
+watch:            ## Run tests on every change.
+clean:            ## Clean unused files.
+virtualenv:       ## Create a virtual environment.
+release:          ## Create a new tag for release.
+docs:             ## Build the documentation.
+switch-to-poetry: ## Switch to poetry package manager.
+init:             ## Initialize the project based on an application template.
+```
+
+## Making a new release
+
+This project uses [semantic versioning](https://semver.org/) and tags releases with `X.Y.Z`
+Every time a new tag is created and pushed to the remote repo, github actions will
+automatically create a new release on github and trigger a release on PyPI.
+
+For this to work you need to setup a secret called `PIPY_API_TOKEN` on the project settings>secrets, 
+this token can be generated on [pypi.org](https://pypi.org/account/).
+
+To trigger a new release all you need to do is.
+
+1. If you have changes to add to the repo
+    * Make your changes following the steps described above.
+    * Commit your changes following the [conventional git commit messages](https://www.conventionalcommits.org/en/v1.0.0/).
+2. Run the tests to ensure everything is working.
+4. Run `make release` to create a new tag and push it to the remote repo.
+
+the `make release` will ask you the version number to create the tag, ex: type `0.1.1` when you are asked.
+
+> **CAUTION**:  The make release will change local changelog files and commit all the unstaged changes you have.
diff --git a/Containerfile b/Containerfile
new file mode 100644
index 0000000..83bb605
--- /dev/null
+++ b/Containerfile
@@ -0,0 +1,5 @@
+FROM python:3.7-slim
+COPY . /app
+WORKDIR /app
+RUN pip install .
+CMD ["project_name"]
diff --git a/HISTORY.md b/HISTORY.md
new file mode 100644
index 0000000..9bf6ef0
--- /dev/null
+++ b/HISTORY.md
@@ -0,0 +1,13 @@
+Changelog
+=========
+
+
+0.1.2 (2021-08-14)
+------------------
+- Fix release, README and windows CI. [Bruno Rocha]
+- Release: version 0.1.0. [Bruno Rocha]
+
+
+0.1.0 (2021-08-14)
+------------------
+- Add release command. [Bruno Rocha]
diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 0000000..ef198d6
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1,5 @@
+include LICENSE
+include HISTORY.md
+include Containerfile
+graft tests
+graft project_name
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..52d91ac
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,122 @@
+.ONESHELL:
+ENV_PREFIX=$(shell python -c "if __import__('pathlib').Path('.venv/bin/pip').exists(): print('.venv/bin/')")
+USING_POETRY=$(shell grep "tool.poetry" pyproject.toml && echo "yes")
+
+.PHONY: help
+help:             ## Show the help.
+	@echo "Usage: make <target>"
+	@echo ""
+	@echo "Targets:"
+	@fgrep "##" Makefile | fgrep -v fgrep
+
+
+.PHONY: show
+show:             ## Show the current environment.
+	@echo "Current environment:"
+	@if [ "$(USING_POETRY)" ]; then poetry env info && exit; fi
+	@echo "Running using $(ENV_PREFIX)"
+	@$(ENV_PREFIX)python -V
+	@$(ENV_PREFIX)python -m site
+
+.PHONY: install
+install:          ## Install the project in dev mode.
+	@if [ "$(USING_POETRY)" ]; then poetry install && exit; fi
+	@echo "Don't forget to run 'make virtualenv' if you got errors."
+	$(ENV_PREFIX)pip install -e .[test]
+
+.PHONY: fmt
+fmt:              ## Format code using black & isort.
+	$(ENV_PREFIX)isort project_name/
+	$(ENV_PREFIX)black -l 79 project_name/
+	$(ENV_PREFIX)black -l 79 tests/
+
+.PHONY: lint
+lint:             ## Run pep8, black, mypy linters.
+	$(ENV_PREFIX)flake8 project_name/
+	$(ENV_PREFIX)black -l 79 --check project_name/
+	$(ENV_PREFIX)black -l 79 --check tests/
+	$(ENV_PREFIX)mypy --ignore-missing-imports project_name/
+
+.PHONY: test
+test: lint        ## Run tests and generate coverage report.
+	$(ENV_PREFIX)pytest -v --cov-config .coveragerc --cov=project_name -l --tb=short --maxfail=1 tests/
+	$(ENV_PREFIX)coverage xml
+	$(ENV_PREFIX)coverage html
+
+.PHONY: watch
+watch:            ## Run tests on every change.
+	ls **/**.py | entr $(ENV_PREFIX)pytest -s -vvv -l --tb=long --maxfail=1 tests/
+
+.PHONY: clean
+clean:            ## Clean unused files.
+	@find ./ -name '*.pyc' -exec rm -f {} \;
+	@find ./ -name '__pycache__' -exec rm -rf {} \;
+	@find ./ -name 'Thumbs.db' -exec rm -f {} \;
+	@find ./ -name '*~' -exec rm -f {} \;
+	@rm -rf .cache
+	@rm -rf .pytest_cache
+	@rm -rf .mypy_cache
+	@rm -rf build
+	@rm -rf dist
+	@rm -rf *.egg-info
+	@rm -rf htmlcov
+	@rm -rf .tox/
+	@rm -rf docs/_build
+
+.PHONY: virtualenv
+virtualenv:       ## Create a virtual environment.
+	@if [ "$(USING_POETRY)" ]; then poetry install && exit; fi
+	@echo "creating virtualenv ..."
+	@rm -rf .venv
+	@python3 -m venv .venv
+	@./.venv/bin/pip install -U pip
+	@./.venv/bin/pip install -e .[test]
+	@echo
+	@echo "!!! Please run 'source .venv/bin/activate' to enable the environment !!!"
+
+.PHONY: release
+release:          ## Create a new tag for release.
+	@echo "WARNING: This operation will create s version tag and push to github"
+	@read -p "Version? (provide the next x.y.z semver) : " TAG
+	@echo "$${TAG}" > project_name/VERSION
+	@$(ENV_PREFIX)gitchangelog > HISTORY.md
+	@git add project_name/VERSION HISTORY.md
+	@git commit -m "release: version $${TAG} 🚀"
+	@echo "creating git tag : $${TAG}"
+	@git tag $${TAG}
+	@git push -u origin HEAD --tags
+	@echo "Github Actions will detect the new tag and release the new version."
+
+.PHONY: docs
+docs:             ## Build the documentation.
+	@echo "building documentation ..."
+	@$(ENV_PREFIX)mkdocs build
+	URL="site/index.html"; xdg-open $$URL || sensible-browser $$URL || x-www-browser $$URL || gnome-open $$URL || open $$URL
+
+.PHONY: switch-to-poetry
+switch-to-poetry: ## Switch to poetry package manager.
+	@echo "Switching to poetry ..."
+	@if ! poetry --version > /dev/null; then echo 'poetry is required, install from https://python-poetry.org/'; exit 1; fi
+	@rm -rf .venv
+	@poetry init --no-interaction --name=a_flask_test --author=rochacbruno
+	@echo "" >> pyproject.toml
+	@echo "[tool.poetry.scripts]" >> pyproject.toml
+	@echo "project_name = 'project_name.__main__:main'" >> pyproject.toml
+	@cat requirements.txt | while read in; do poetry add --no-interaction "$${in}"; done
+	@cat requirements-test.txt | while read in; do poetry add --no-interaction "$${in}" --dev; done
+	@poetry install --no-interaction
+	@mkdir -p .github/backup
+	@mv requirements* .github/backup
+	@mv setup.py .github/backup
+	@echo "You have switched to https://python-poetry.org/ package manager."
+	@echo "Please run 'poetry shell' or 'poetry run project_name'"
+
+.PHONY: init
+init:             ## Initialize the project based on an application template.
+	@./.github/init.sh
+
+
+# This project has been generated from rochacbruno/python-project-template
+# __author__ = 'rochacbruno'
+# __repo__ = https://github.com/rochacbruno/python-project-template
+# __sponsor__ = https://github.com/sponsors/rochacbruno/
diff --git a/README.md b/README.md
index a099068..727a755 100644
--- a/README.md
+++ b/README.md
@@ -1 +1,22 @@
-# MM-PoE
\ No newline at end of file
+# MM-PoE
+
+[![codecov](https://codecov.io/gh/author_name/project_urlname/branch/main/graph/badge.svg?token=project_urlname_token_here)](https://codecov.io/gh/author_name/project_urlname)
+[![CI](https://github.com/author_name/project_urlname/actions/workflows/main.yml/badge.svg)](https://github.com/author_name/project_urlname/actions/workflows/main.yml)
+
+## Install it from PyPI
+
+```bash
+pip install MM-PoE
+```
+
+## Usage
+
+```bash
+$ python -m MM-PoE
+#or
+$ MM-PoE
+```
+
+## Development
+
+Read the [CONTRIBUTING.md](CONTRIBUTING.md) file.
diff --git a/docs/index.md b/docs/index.md
new file mode 100644
index 0000000..000ea34
--- /dev/null
+++ b/docs/index.md
@@ -0,0 +1,17 @@
+# Welcome to MkDocs
+
+For full documentation visit [mkdocs.org](https://www.mkdocs.org).
+
+## Commands
+
+* `mkdocs new [dir-name]` - Create a new project.
+* `mkdocs serve` - Start the live-reloading docs server.
+* `mkdocs build` - Build the documentation site.
+* `mkdocs -h` - Print help message and exit.
+
+## Project layout
+
+    mkdocs.yml    # The configuration file.
+    docs/
+        index.md  # The documentation homepage.
+        ...       # Other markdown pages, images and other files.
diff --git a/mkdocs.yml b/mkdocs.yml
new file mode 100644
index 0000000..33a69ca
--- /dev/null
+++ b/mkdocs.yml
@@ -0,0 +1,2 @@
+site_name: project_name
+theme: readthedocs
diff --git a/project_name/VERSION b/project_name/VERSION
new file mode 100644
index 0000000..6e8bf73
--- /dev/null
+++ b/project_name/VERSION
@@ -0,0 +1 @@
+0.1.0
diff --git a/project_name/__init__.py b/project_name/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/project_name/__main__.py b/project_name/__main__.py
new file mode 100644
index 0000000..2ba8b18
--- /dev/null
+++ b/project_name/__main__.py
@@ -0,0 +1,6 @@
+"""Entry point for project_name."""
+
+from project_name.cli import main  # pragma: no cover
+
+if __name__ == "__main__":  # pragma: no cover
+    main()
diff --git a/project_name/base.py b/project_name/base.py
new file mode 100644
index 0000000..ac590b9
--- /dev/null
+++ b/project_name/base.py
@@ -0,0 +1,17 @@
+"""
+project_name base module.
+
+This is the principal module of the project_name project.
+here you put your main classes and objects.
+
+Be creative! do whatever you want!
+
+If you want to replace this with a Flask application run:
+
+    $ make init
+
+and then choose `flask` as template.
+"""
+
+# example constant variable
+NAME = "project_name"
diff --git a/main.py b/project_name/cli.py
similarity index 98%
rename from main.py
rename to project_name/cli.py
index 20f3ddf..f79ff6d 100644
--- a/main.py
+++ b/project_name/cli.py
@@ -43,8 +43,11 @@
 logger = logging.getLogger(__name__)
 
 def main():
-
-    # step 1: collect arguments
+    """
+    The main function executes on commands:
+    `python -m MM-PoE` and `$ MM-PoE `.
+    """
+     # step 1: collect arguments
     args = Namespace()
     args.seed = 0
     
@@ -256,6 +259,3 @@ def main():
     poe_avg_log_probs,  lm_accuracy, _, lm_predictions = inference_process_of_elimination(model, eval_mcp_dataloader, device, compute_func, tokenizer.pad_token_id)
     option = int(lm_predictions.numpy()[0])
     logger.info(f"Answer: {option}")
-
-if __name__ == "__main__":
-    main()
diff --git a/data/data_downloaders.sh b/project_name/data/data_downloaders.sh
similarity index 100%
rename from data/data_downloaders.sh
rename to project_name/data/data_downloaders.sh
diff --git a/methods/11_few_shot_vqa.sh b/project_name/methods/11_few_shot_vqa.sh
similarity index 100%
rename from methods/11_few_shot_vqa.sh
rename to project_name/methods/11_few_shot_vqa.sh
diff --git a/methods/1_main_exp.sh b/project_name/methods/1_main_exp.sh
similarity index 100%
rename from methods/1_main_exp.sh
rename to project_name/methods/1_main_exp.sh
diff --git a/methods/2_logical_reasoning.sh b/project_name/methods/2_logical_reasoning.sh
similarity index 100%
rename from methods/2_logical_reasoning.sh
rename to project_name/methods/2_logical_reasoning.sh
diff --git a/methods/3_mask.sh b/project_name/methods/3_mask.sh
similarity index 100%
rename from methods/3_mask.sh
rename to project_name/methods/3_mask.sh
diff --git a/methods/4_llm.sh b/project_name/methods/4_llm.sh
similarity index 100%
rename from methods/4_llm.sh
rename to project_name/methods/4_llm.sh
diff --git a/methods/5_few_shot.sh b/project_name/methods/5_few_shot.sh
similarity index 100%
rename from methods/5_few_shot.sh
rename to project_name/methods/5_few_shot.sh
diff --git a/methods/6_num_option.sh b/project_name/methods/6_num_option.sh
similarity index 100%
rename from methods/6_num_option.sh
rename to project_name/methods/6_num_option.sh
diff --git a/methods/7_main_exp_vqa.sh b/project_name/methods/7_main_exp_vqa.sh
similarity index 100%
rename from methods/7_main_exp_vqa.sh
rename to project_name/methods/7_main_exp_vqa.sh
diff --git a/methods/9_mask_vqa.sh b/project_name/methods/9_mask_vqa.sh
similarity index 100%
rename from methods/9_mask_vqa.sh
rename to project_name/methods/9_mask_vqa.sh
diff --git a/methods/language_modeling.py b/project_name/methods/language_modeling.py
similarity index 100%
rename from methods/language_modeling.py
rename to project_name/methods/language_modeling.py
diff --git a/methods/process_of_elimination.py b/project_name/methods/process_of_elimination.py
similarity index 100%
rename from methods/process_of_elimination.py
rename to project_name/methods/process_of_elimination.py
diff --git a/methods/process_of_elimination_vqa.py b/project_name/methods/process_of_elimination_vqa.py
similarity index 100%
rename from methods/process_of_elimination_vqa.py
rename to project_name/methods/process_of_elimination_vqa.py
diff --git a/methods/utils/data.py b/project_name/methods/utils/data.py
similarity index 100%
rename from methods/utils/data.py
rename to project_name/methods/utils/data.py
diff --git a/methods/utils/methods.py b/project_name/methods/utils/methods.py
similarity index 100%
rename from methods/utils/methods.py
rename to project_name/methods/utils/methods.py
diff --git a/methods/utils/models.py b/project_name/methods/utils/models.py
similarity index 100%
rename from methods/utils/models.py
rename to project_name/methods/utils/models.py
diff --git a/methods/utils/utils.py b/project_name/methods/utils/utils.py
similarity index 100%
rename from methods/utils/utils.py
rename to project_name/methods/utils/utils.py
diff --git a/methods/vision_language_modeling.py b/project_name/methods/vision_language_modeling.py
similarity index 100%
rename from methods/vision_language_modeling.py
rename to project_name/methods/vision_language_modeling.py
diff --git a/models/model_downloaders/model_downloaders.py b/project_name/models/model_downloaders/model_downloaders.py
similarity index 100%
rename from models/model_downloaders/model_downloaders.py
rename to project_name/models/model_downloaders/model_downloaders.py
diff --git a/models/model_downloaders/model_downloaders.sh b/project_name/models/model_downloaders/model_downloaders.sh
similarity index 100%
rename from models/model_downloaders/model_downloaders.sh
rename to project_name/models/model_downloaders/model_downloaders.sh
diff --git a/results/calibration.csv b/project_name/results/calibration.csv
similarity index 100%
rename from results/calibration.csv
rename to project_name/results/calibration.csv
diff --git a/results/calibration1.csv b/project_name/results/calibration1.csv
similarity index 100%
rename from results/calibration1.csv
rename to project_name/results/calibration1.csv
diff --git a/results/channel.csv b/project_name/results/channel.csv
similarity index 100%
rename from results/channel.csv
rename to project_name/results/channel.csv
diff --git a/results/channel1.csv b/project_name/results/channel1.csv
similarity index 100%
rename from results/channel1.csv
rename to project_name/results/channel1.csv
diff --git a/results/multiple_choice_prompt.csv b/project_name/results/multiple_choice_prompt.csv
similarity index 100%
rename from results/multiple_choice_prompt.csv
rename to project_name/results/multiple_choice_prompt.csv
diff --git a/results/multiple_choice_prompt1.csv b/project_name/results/multiple_choice_prompt1.csv
similarity index 100%
rename from results/multiple_choice_prompt1.csv
rename to project_name/results/multiple_choice_prompt1.csv
diff --git a/results/process_of_elimination.csv b/project_name/results/process_of_elimination.csv
similarity index 100%
rename from results/process_of_elimination.csv
rename to project_name/results/process_of_elimination.csv
diff --git a/results/process_of_elimination1.csv b/project_name/results/process_of_elimination1.csv
similarity index 100%
rename from results/process_of_elimination1.csv
rename to project_name/results/process_of_elimination1.csv
diff --git a/results/vision_language_modeling.csv b/project_name/results/vision_language_modeling.csv
similarity index 100%
rename from results/vision_language_modeling.csv
rename to project_name/results/vision_language_modeling.csv
diff --git a/results/vision_language_modeling1.csv b/project_name/results/vision_language_modeling1.csv
similarity index 100%
rename from results/vision_language_modeling1.csv
rename to project_name/results/vision_language_modeling1.csv
diff --git a/requirements-test.txt b/requirements-test.txt
new file mode 100644
index 0000000..e89ee5c
--- /dev/null
+++ b/requirements-test.txt
@@ -0,0 +1,10 @@
+# These requirements are for development and testing only, not for production.
+pytest
+coverage
+flake8
+black
+isort
+pytest-cov
+mypy
+gitchangelog
+mkdocs
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..7547627
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,46 @@
+"""Python setup.py for project_name package"""
+import io
+import os
+from setuptools import find_packages, setup
+
+
+def read(*paths, **kwargs):
+    """Read the contents of a text file safely.
+    >>> read("project_name", "VERSION")
+    '0.1.0'
+    >>> read("README.md")
+    ...
+    """
+
+    content = ""
+    with io.open(
+        os.path.join(os.path.dirname(__file__), *paths),
+        encoding=kwargs.get("encoding", "utf8"),
+    ) as open_file:
+        content = open_file.read().strip()
+    return content
+
+
+def read_requirements(path):
+    return [
+        line.strip()
+        for line in read(path).split("\n")
+        if not line.startswith(('"', "#", "-", "git+"))
+    ]
+
+
+setup(
+    name="project_name",
+    version=read("project_name", "VERSION"),
+    description="project_description",
+    url="https://github.com/author_name/project_urlname/",
+    long_description=read("README.md"),
+    long_description_content_type="text/markdown",
+    author="author_name",
+    packages=find_packages(exclude=["tests", ".github"]),
+    install_requires=read_requirements("requirements.txt"),
+    entry_points={
+        "console_scripts": ["project_name = project_name.__main__:main"]
+    },
+    extras_require={"test": read_requirements("requirements-test.txt")},
+)
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/conftest.py b/tests/conftest.py
new file mode 100644
index 0000000..1cbb7b1
--- /dev/null
+++ b/tests/conftest.py
@@ -0,0 +1,14 @@
+import sys
+import pytest
+
+
+# each test runs on cwd to its temp dir
+@pytest.fixture(autouse=True)
+def go_to_tmpdir(request):
+    # Get the fixture dynamically by its name.
+    tmpdir = request.getfixturevalue("tmpdir")
+    # ensure local test created packages can be imported
+    sys.path.insert(0, str(tmpdir))
+    # Chdir only for the duration of the test.
+    with tmpdir.as_cwd():
+        yield
diff --git a/tests/test_base.py b/tests/test_base.py
new file mode 100644
index 0000000..f1b765f
--- /dev/null
+++ b/tests/test_base.py
@@ -0,0 +1,5 @@
+from project_name.base import NAME
+
+
+def test_base():
+    assert NAME == "project_name"

From 1e9088d3fbacd41e9a693d5c856fd27e325bd90e Mon Sep 17 00:00:00 2001
From: souradipp76 <souradipp76@users.noreply.github.com>
Date: Sun, 6 Oct 2024 17:02:42 +0000
Subject: [PATCH 03/30] =?UTF-8?q?=E2=9C=85=20Ready=20to=20clone=20and=20co?=
 =?UTF-8?q?de.?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .github/rename_project.sh                     | 10 +++++-----
 .github/template.yml                          |  1 -
 ABOUT_THIS_TEMPLATE.md                        |  4 ++--
 CONTRIBUTING.md                               |  8 ++++----
 Containerfile                                 |  2 +-
 MANIFEST.in                                   |  2 +-
 Makefile                                      | 20 +++++++++----------
 README.md                                     |  4 ++--
 mkdocs.yml                                    |  2 +-
 {project_name => mm_poe}/VERSION              |  0
 {project_name => mm_poe}/__init__.py          |  0
 mm_poe/__main__.py                            |  6 ++++++
 {project_name => mm_poe}/base.py              |  6 +++---
 {project_name => mm_poe}/cli.py               |  0
 .../data/data_downloaders.sh                  |  0
 .../methods/11_few_shot_vqa.sh                |  0
 .../methods/1_main_exp.sh                     |  0
 .../methods/2_logical_reasoning.sh            |  0
 {project_name => mm_poe}/methods/3_mask.sh    |  0
 {project_name => mm_poe}/methods/4_llm.sh     |  0
 .../methods/5_few_shot.sh                     |  0
 .../methods/6_num_option.sh                   |  0
 .../methods/7_main_exp_vqa.sh                 |  0
 .../methods/9_mask_vqa.sh                     |  0
 .../methods/language_modeling.py              |  0
 .../methods/process_of_elimination.py         |  0
 .../methods/process_of_elimination_vqa.py     |  0
 .../methods/utils/data.py                     |  0
 .../methods/utils/methods.py                  |  0
 .../methods/utils/models.py                   |  0
 .../methods/utils/utils.py                    |  0
 .../methods/vision_language_modeling.py       |  0
 .../model_downloaders/model_downloaders.py    |  0
 .../model_downloaders/model_downloaders.sh    |  0
 .../results/calibration.csv                   |  0
 .../results/calibration1.csv                  |  0
 {project_name => mm_poe}/results/channel.csv  |  0
 {project_name => mm_poe}/results/channel1.csv |  0
 .../results/multiple_choice_prompt.csv        |  0
 .../results/multiple_choice_prompt1.csv       |  0
 .../results/process_of_elimination.csv        |  0
 .../results/process_of_elimination1.csv       |  0
 .../results/vision_language_modeling.csv      |  0
 .../results/vision_language_modeling1.csv     |  0
 project_name/__main__.py                      |  6 ------
 setup.py                                      | 16 +++++++--------
 tests/test_base.py                            |  4 ++--
 47 files changed, 45 insertions(+), 46 deletions(-)
 delete mode 100644 .github/template.yml
 rename {project_name => mm_poe}/VERSION (100%)
 rename {project_name => mm_poe}/__init__.py (100%)
 create mode 100644 mm_poe/__main__.py
 rename {project_name => mm_poe}/base.py (68%)
 rename {project_name => mm_poe}/cli.py (100%)
 rename {project_name => mm_poe}/data/data_downloaders.sh (100%)
 rename {project_name => mm_poe}/methods/11_few_shot_vqa.sh (100%)
 rename {project_name => mm_poe}/methods/1_main_exp.sh (100%)
 rename {project_name => mm_poe}/methods/2_logical_reasoning.sh (100%)
 rename {project_name => mm_poe}/methods/3_mask.sh (100%)
 rename {project_name => mm_poe}/methods/4_llm.sh (100%)
 rename {project_name => mm_poe}/methods/5_few_shot.sh (100%)
 rename {project_name => mm_poe}/methods/6_num_option.sh (100%)
 rename {project_name => mm_poe}/methods/7_main_exp_vqa.sh (100%)
 rename {project_name => mm_poe}/methods/9_mask_vqa.sh (100%)
 rename {project_name => mm_poe}/methods/language_modeling.py (100%)
 rename {project_name => mm_poe}/methods/process_of_elimination.py (100%)
 rename {project_name => mm_poe}/methods/process_of_elimination_vqa.py (100%)
 rename {project_name => mm_poe}/methods/utils/data.py (100%)
 rename {project_name => mm_poe}/methods/utils/methods.py (100%)
 rename {project_name => mm_poe}/methods/utils/models.py (100%)
 rename {project_name => mm_poe}/methods/utils/utils.py (100%)
 rename {project_name => mm_poe}/methods/vision_language_modeling.py (100%)
 rename {project_name => mm_poe}/models/model_downloaders/model_downloaders.py (100%)
 rename {project_name => mm_poe}/models/model_downloaders/model_downloaders.sh (100%)
 rename {project_name => mm_poe}/results/calibration.csv (100%)
 rename {project_name => mm_poe}/results/calibration1.csv (100%)
 rename {project_name => mm_poe}/results/channel.csv (100%)
 rename {project_name => mm_poe}/results/channel1.csv (100%)
 rename {project_name => mm_poe}/results/multiple_choice_prompt.csv (100%)
 rename {project_name => mm_poe}/results/multiple_choice_prompt1.csv (100%)
 rename {project_name => mm_poe}/results/process_of_elimination.csv (100%)
 rename {project_name => mm_poe}/results/process_of_elimination1.csv (100%)
 rename {project_name => mm_poe}/results/vision_language_modeling.csv (100%)
 rename {project_name => mm_poe}/results/vision_language_modeling1.csv (100%)
 delete mode 100644 project_name/__main__.py

diff --git a/.github/rename_project.sh b/.github/rename_project.sh
index 8f05495..8836454 100755
--- a/.github/rename_project.sh
+++ b/.github/rename_project.sh
@@ -16,10 +16,10 @@ echo "Description: $description";
 
 echo "Renaming project..."
 
-original_author="author_name"
-original_name="project_name"
-original_urlname="project_urlname"
-original_description="project_description"
+original_author="souradipp76"
+original_name="mm_poe"
+original_urlname="MM-PoE"
+original_description="Awesome mm_poe created by souradipp76"
 # for filename in $(find . -name "*.*") 
 for filename in $(git ls-files) 
 do
@@ -30,7 +30,7 @@ do
     echo "Renamed $filename"
 done
 
-mv project_name $name
+mv mm_poe $name
 
 # This command runs only once on GHA!
 rm -rf .github/template.yml
diff --git a/.github/template.yml b/.github/template.yml
deleted file mode 100644
index 3386bee..0000000
--- a/.github/template.yml
+++ /dev/null
@@ -1 +0,0 @@
-author: rochacbruno
diff --git a/ABOUT_THIS_TEMPLATE.md b/ABOUT_THIS_TEMPLATE.md
index 11795f3..da03159 100644
--- a/ABOUT_THIS_TEMPLATE.md
+++ b/ABOUT_THIS_TEMPLATE.md
@@ -32,7 +32,7 @@ Lets take a look at the structure of this template:
 ├── Makefile                 # A collection of utilities to manage the project
 ├── MANIFEST.in              # A list of files to include in a package
 ├── mkdocs.yml               # Configuration for documentation site
-├── project_name             # The main python package for the project
+├── mm_poe             # The main python package for the project
 │   ├── base.py              # The base module for the project
 │   ├── __init__.py          # This tells Python that this is a package
 │   ├── __main__.py          # The entry point for the project
@@ -109,7 +109,7 @@ I had to do some tricks to read that version variable inside the setuptools
 I decided to keep the version in a static file because it is easier to read from
 wherever I want without the need to install the package.
 
-e.g: `cat project_name/VERSION` will get the project version without harming
+e.g: `cat mm_poe/VERSION` will get the project version without harming
 with module imports or anything else, it is useful for CI, logs and debugging.
 
 ### Why to include `tests`, `history` and `Containerfile` as part of the release?
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 0d0dd72..0409a32 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -1,6 +1,6 @@
 # How to develop on this project
 
-project_name welcomes contributions from the community.
+mm_poe welcomes contributions from the community.
 
 **You need PYTHON3!**
 
@@ -8,9 +8,9 @@ This instructions are for linux base systems. (Linux, MacOS, BSD, etc.)
 ## Setting up your own fork of this repo.
 
 - On github interface click on `Fork` button.
-- Clone your fork of this repo. `git clone git@github.com:YOUR_GIT_USERNAME/project_urlname.git`
-- Enter the directory `cd project_urlname`
-- Add upstream repo `git remote add upstream https://github.com/author_name/project_urlname`
+- Clone your fork of this repo. `git clone git@github.com:YOUR_GIT_USERNAME/MM-PoE.git`
+- Enter the directory `cd MM-PoE`
+- Add upstream repo `git remote add upstream https://github.com/souradipp76/MM-PoE`
 
 ## Setting up your own virtual environment
 
diff --git a/Containerfile b/Containerfile
index 83bb605..1c1e8bc 100644
--- a/Containerfile
+++ b/Containerfile
@@ -2,4 +2,4 @@ FROM python:3.7-slim
 COPY . /app
 WORKDIR /app
 RUN pip install .
-CMD ["project_name"]
+CMD ["mm_poe"]
diff --git a/MANIFEST.in b/MANIFEST.in
index ef198d6..9782b99 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -2,4 +2,4 @@ include LICENSE
 include HISTORY.md
 include Containerfile
 graft tests
-graft project_name
+graft mm_poe
diff --git a/Makefile b/Makefile
index 52d91ac..2c5fc65 100644
--- a/Makefile
+++ b/Makefile
@@ -26,20 +26,20 @@ install:          ## Install the project in dev mode.
 
 .PHONY: fmt
 fmt:              ## Format code using black & isort.
-	$(ENV_PREFIX)isort project_name/
-	$(ENV_PREFIX)black -l 79 project_name/
+	$(ENV_PREFIX)isort mm_poe/
+	$(ENV_PREFIX)black -l 79 mm_poe/
 	$(ENV_PREFIX)black -l 79 tests/
 
 .PHONY: lint
 lint:             ## Run pep8, black, mypy linters.
-	$(ENV_PREFIX)flake8 project_name/
-	$(ENV_PREFIX)black -l 79 --check project_name/
+	$(ENV_PREFIX)flake8 mm_poe/
+	$(ENV_PREFIX)black -l 79 --check mm_poe/
 	$(ENV_PREFIX)black -l 79 --check tests/
-	$(ENV_PREFIX)mypy --ignore-missing-imports project_name/
+	$(ENV_PREFIX)mypy --ignore-missing-imports mm_poe/
 
 .PHONY: test
 test: lint        ## Run tests and generate coverage report.
-	$(ENV_PREFIX)pytest -v --cov-config .coveragerc --cov=project_name -l --tb=short --maxfail=1 tests/
+	$(ENV_PREFIX)pytest -v --cov-config .coveragerc --cov=mm_poe -l --tb=short --maxfail=1 tests/
 	$(ENV_PREFIX)coverage xml
 	$(ENV_PREFIX)coverage html
 
@@ -78,9 +78,9 @@ virtualenv:       ## Create a virtual environment.
 release:          ## Create a new tag for release.
 	@echo "WARNING: This operation will create s version tag and push to github"
 	@read -p "Version? (provide the next x.y.z semver) : " TAG
-	@echo "$${TAG}" > project_name/VERSION
+	@echo "$${TAG}" > mm_poe/VERSION
 	@$(ENV_PREFIX)gitchangelog > HISTORY.md
-	@git add project_name/VERSION HISTORY.md
+	@git add mm_poe/VERSION HISTORY.md
 	@git commit -m "release: version $${TAG} 🚀"
 	@echo "creating git tag : $${TAG}"
 	@git tag $${TAG}
@@ -101,7 +101,7 @@ switch-to-poetry: ## Switch to poetry package manager.
 	@poetry init --no-interaction --name=a_flask_test --author=rochacbruno
 	@echo "" >> pyproject.toml
 	@echo "[tool.poetry.scripts]" >> pyproject.toml
-	@echo "project_name = 'project_name.__main__:main'" >> pyproject.toml
+	@echo "mm_poe = 'mm_poe.__main__:main'" >> pyproject.toml
 	@cat requirements.txt | while read in; do poetry add --no-interaction "$${in}"; done
 	@cat requirements-test.txt | while read in; do poetry add --no-interaction "$${in}" --dev; done
 	@poetry install --no-interaction
@@ -109,7 +109,7 @@ switch-to-poetry: ## Switch to poetry package manager.
 	@mv requirements* .github/backup
 	@mv setup.py .github/backup
 	@echo "You have switched to https://python-poetry.org/ package manager."
-	@echo "Please run 'poetry shell' or 'poetry run project_name'"
+	@echo "Please run 'poetry shell' or 'poetry run mm_poe'"
 
 .PHONY: init
 init:             ## Initialize the project based on an application template.
diff --git a/README.md b/README.md
index 727a755..86e407e 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,7 @@
 # MM-PoE
 
-[![codecov](https://codecov.io/gh/author_name/project_urlname/branch/main/graph/badge.svg?token=project_urlname_token_here)](https://codecov.io/gh/author_name/project_urlname)
-[![CI](https://github.com/author_name/project_urlname/actions/workflows/main.yml/badge.svg)](https://github.com/author_name/project_urlname/actions/workflows/main.yml)
+[![codecov](https://codecov.io/gh/souradipp76/MM-PoE/branch/main/graph/badge.svg?token=MM-PoE_token_here)](https://codecov.io/gh/souradipp76/MM-PoE)
+[![CI](https://github.com/souradipp76/MM-PoE/actions/workflows/main.yml/badge.svg)](https://github.com/souradipp76/MM-PoE/actions/workflows/main.yml)
 
 ## Install it from PyPI
 
diff --git a/mkdocs.yml b/mkdocs.yml
index 33a69ca..c1ac1bb 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -1,2 +1,2 @@
-site_name: project_name
+site_name: mm_poe
 theme: readthedocs
diff --git a/project_name/VERSION b/mm_poe/VERSION
similarity index 100%
rename from project_name/VERSION
rename to mm_poe/VERSION
diff --git a/project_name/__init__.py b/mm_poe/__init__.py
similarity index 100%
rename from project_name/__init__.py
rename to mm_poe/__init__.py
diff --git a/mm_poe/__main__.py b/mm_poe/__main__.py
new file mode 100644
index 0000000..f87117f
--- /dev/null
+++ b/mm_poe/__main__.py
@@ -0,0 +1,6 @@
+"""Entry point for mm_poe."""
+
+from mm_poe.cli import main  # pragma: no cover
+
+if __name__ == "__main__":  # pragma: no cover
+    main()
diff --git a/project_name/base.py b/mm_poe/base.py
similarity index 68%
rename from project_name/base.py
rename to mm_poe/base.py
index ac590b9..9b0870f 100644
--- a/project_name/base.py
+++ b/mm_poe/base.py
@@ -1,7 +1,7 @@
 """
-project_name base module.
+mm_poe base module.
 
-This is the principal module of the project_name project.
+This is the principal module of the mm_poe project.
 here you put your main classes and objects.
 
 Be creative! do whatever you want!
@@ -14,4 +14,4 @@
 """
 
 # example constant variable
-NAME = "project_name"
+NAME = "mm_poe"
diff --git a/project_name/cli.py b/mm_poe/cli.py
similarity index 100%
rename from project_name/cli.py
rename to mm_poe/cli.py
diff --git a/project_name/data/data_downloaders.sh b/mm_poe/data/data_downloaders.sh
similarity index 100%
rename from project_name/data/data_downloaders.sh
rename to mm_poe/data/data_downloaders.sh
diff --git a/project_name/methods/11_few_shot_vqa.sh b/mm_poe/methods/11_few_shot_vqa.sh
similarity index 100%
rename from project_name/methods/11_few_shot_vqa.sh
rename to mm_poe/methods/11_few_shot_vqa.sh
diff --git a/project_name/methods/1_main_exp.sh b/mm_poe/methods/1_main_exp.sh
similarity index 100%
rename from project_name/methods/1_main_exp.sh
rename to mm_poe/methods/1_main_exp.sh
diff --git a/project_name/methods/2_logical_reasoning.sh b/mm_poe/methods/2_logical_reasoning.sh
similarity index 100%
rename from project_name/methods/2_logical_reasoning.sh
rename to mm_poe/methods/2_logical_reasoning.sh
diff --git a/project_name/methods/3_mask.sh b/mm_poe/methods/3_mask.sh
similarity index 100%
rename from project_name/methods/3_mask.sh
rename to mm_poe/methods/3_mask.sh
diff --git a/project_name/methods/4_llm.sh b/mm_poe/methods/4_llm.sh
similarity index 100%
rename from project_name/methods/4_llm.sh
rename to mm_poe/methods/4_llm.sh
diff --git a/project_name/methods/5_few_shot.sh b/mm_poe/methods/5_few_shot.sh
similarity index 100%
rename from project_name/methods/5_few_shot.sh
rename to mm_poe/methods/5_few_shot.sh
diff --git a/project_name/methods/6_num_option.sh b/mm_poe/methods/6_num_option.sh
similarity index 100%
rename from project_name/methods/6_num_option.sh
rename to mm_poe/methods/6_num_option.sh
diff --git a/project_name/methods/7_main_exp_vqa.sh b/mm_poe/methods/7_main_exp_vqa.sh
similarity index 100%
rename from project_name/methods/7_main_exp_vqa.sh
rename to mm_poe/methods/7_main_exp_vqa.sh
diff --git a/project_name/methods/9_mask_vqa.sh b/mm_poe/methods/9_mask_vqa.sh
similarity index 100%
rename from project_name/methods/9_mask_vqa.sh
rename to mm_poe/methods/9_mask_vqa.sh
diff --git a/project_name/methods/language_modeling.py b/mm_poe/methods/language_modeling.py
similarity index 100%
rename from project_name/methods/language_modeling.py
rename to mm_poe/methods/language_modeling.py
diff --git a/project_name/methods/process_of_elimination.py b/mm_poe/methods/process_of_elimination.py
similarity index 100%
rename from project_name/methods/process_of_elimination.py
rename to mm_poe/methods/process_of_elimination.py
diff --git a/project_name/methods/process_of_elimination_vqa.py b/mm_poe/methods/process_of_elimination_vqa.py
similarity index 100%
rename from project_name/methods/process_of_elimination_vqa.py
rename to mm_poe/methods/process_of_elimination_vqa.py
diff --git a/project_name/methods/utils/data.py b/mm_poe/methods/utils/data.py
similarity index 100%
rename from project_name/methods/utils/data.py
rename to mm_poe/methods/utils/data.py
diff --git a/project_name/methods/utils/methods.py b/mm_poe/methods/utils/methods.py
similarity index 100%
rename from project_name/methods/utils/methods.py
rename to mm_poe/methods/utils/methods.py
diff --git a/project_name/methods/utils/models.py b/mm_poe/methods/utils/models.py
similarity index 100%
rename from project_name/methods/utils/models.py
rename to mm_poe/methods/utils/models.py
diff --git a/project_name/methods/utils/utils.py b/mm_poe/methods/utils/utils.py
similarity index 100%
rename from project_name/methods/utils/utils.py
rename to mm_poe/methods/utils/utils.py
diff --git a/project_name/methods/vision_language_modeling.py b/mm_poe/methods/vision_language_modeling.py
similarity index 100%
rename from project_name/methods/vision_language_modeling.py
rename to mm_poe/methods/vision_language_modeling.py
diff --git a/project_name/models/model_downloaders/model_downloaders.py b/mm_poe/models/model_downloaders/model_downloaders.py
similarity index 100%
rename from project_name/models/model_downloaders/model_downloaders.py
rename to mm_poe/models/model_downloaders/model_downloaders.py
diff --git a/project_name/models/model_downloaders/model_downloaders.sh b/mm_poe/models/model_downloaders/model_downloaders.sh
similarity index 100%
rename from project_name/models/model_downloaders/model_downloaders.sh
rename to mm_poe/models/model_downloaders/model_downloaders.sh
diff --git a/project_name/results/calibration.csv b/mm_poe/results/calibration.csv
similarity index 100%
rename from project_name/results/calibration.csv
rename to mm_poe/results/calibration.csv
diff --git a/project_name/results/calibration1.csv b/mm_poe/results/calibration1.csv
similarity index 100%
rename from project_name/results/calibration1.csv
rename to mm_poe/results/calibration1.csv
diff --git a/project_name/results/channel.csv b/mm_poe/results/channel.csv
similarity index 100%
rename from project_name/results/channel.csv
rename to mm_poe/results/channel.csv
diff --git a/project_name/results/channel1.csv b/mm_poe/results/channel1.csv
similarity index 100%
rename from project_name/results/channel1.csv
rename to mm_poe/results/channel1.csv
diff --git a/project_name/results/multiple_choice_prompt.csv b/mm_poe/results/multiple_choice_prompt.csv
similarity index 100%
rename from project_name/results/multiple_choice_prompt.csv
rename to mm_poe/results/multiple_choice_prompt.csv
diff --git a/project_name/results/multiple_choice_prompt1.csv b/mm_poe/results/multiple_choice_prompt1.csv
similarity index 100%
rename from project_name/results/multiple_choice_prompt1.csv
rename to mm_poe/results/multiple_choice_prompt1.csv
diff --git a/project_name/results/process_of_elimination.csv b/mm_poe/results/process_of_elimination.csv
similarity index 100%
rename from project_name/results/process_of_elimination.csv
rename to mm_poe/results/process_of_elimination.csv
diff --git a/project_name/results/process_of_elimination1.csv b/mm_poe/results/process_of_elimination1.csv
similarity index 100%
rename from project_name/results/process_of_elimination1.csv
rename to mm_poe/results/process_of_elimination1.csv
diff --git a/project_name/results/vision_language_modeling.csv b/mm_poe/results/vision_language_modeling.csv
similarity index 100%
rename from project_name/results/vision_language_modeling.csv
rename to mm_poe/results/vision_language_modeling.csv
diff --git a/project_name/results/vision_language_modeling1.csv b/mm_poe/results/vision_language_modeling1.csv
similarity index 100%
rename from project_name/results/vision_language_modeling1.csv
rename to mm_poe/results/vision_language_modeling1.csv
diff --git a/project_name/__main__.py b/project_name/__main__.py
deleted file mode 100644
index 2ba8b18..0000000
--- a/project_name/__main__.py
+++ /dev/null
@@ -1,6 +0,0 @@
-"""Entry point for project_name."""
-
-from project_name.cli import main  # pragma: no cover
-
-if __name__ == "__main__":  # pragma: no cover
-    main()
diff --git a/setup.py b/setup.py
index 7547627..f87cda3 100644
--- a/setup.py
+++ b/setup.py
@@ -1,4 +1,4 @@
-"""Python setup.py for project_name package"""
+"""Python setup.py for mm_poe package"""
 import io
 import os
 from setuptools import find_packages, setup
@@ -6,7 +6,7 @@
 
 def read(*paths, **kwargs):
     """Read the contents of a text file safely.
-    >>> read("project_name", "VERSION")
+    >>> read("mm_poe", "VERSION")
     '0.1.0'
     >>> read("README.md")
     ...
@@ -30,17 +30,17 @@ def read_requirements(path):
 
 
 setup(
-    name="project_name",
-    version=read("project_name", "VERSION"),
-    description="project_description",
-    url="https://github.com/author_name/project_urlname/",
+    name="mm_poe",
+    version=read("mm_poe", "VERSION"),
+    description="Awesome mm_poe created by souradipp76",
+    url="https://github.com/souradipp76/MM-PoE/",
     long_description=read("README.md"),
     long_description_content_type="text/markdown",
-    author="author_name",
+    author="souradipp76",
     packages=find_packages(exclude=["tests", ".github"]),
     install_requires=read_requirements("requirements.txt"),
     entry_points={
-        "console_scripts": ["project_name = project_name.__main__:main"]
+        "console_scripts": ["mm_poe = mm_poe.__main__:main"]
     },
     extras_require={"test": read_requirements("requirements-test.txt")},
 )
diff --git a/tests/test_base.py b/tests/test_base.py
index f1b765f..dd26d70 100644
--- a/tests/test_base.py
+++ b/tests/test_base.py
@@ -1,5 +1,5 @@
-from project_name.base import NAME
+from mm_poe.base import NAME
 
 
 def test_base():
-    assert NAME == "project_name"
+    assert NAME == "mm_poe"

From 54e1f5238ea06a3978804b3ca0e925fcd06121c2 Mon Sep 17 00:00:00 2001
From: Souradip Pal <souradip.iitg@gmail.com>
Date: Sun, 6 Oct 2024 12:51:25 -0500
Subject: [PATCH 04/30] Fixed app.

---
 HISTORY.md                                    | 11 ---------
 README.md                                     | 23 +++++++++++++++----
 mm_poe/__main__.py                            |  4 ++--
 mm_poe/base.py                                | 17 --------------
 mm_poe/cli.py                                 | 16 +++++++------
 mm_poe/methods/__init_.py                     |  0
 mm_poe/methods/utils/__init__.py              |  0
 .../model_downloaders/model_downloaders.sh    |  2 +-
 setup.py                                      |  2 +-
 tests/test_base.py                            |  5 ----
 10 files changed, 31 insertions(+), 49 deletions(-)
 delete mode 100644 mm_poe/base.py
 create mode 100644 mm_poe/methods/__init_.py
 create mode 100644 mm_poe/methods/utils/__init__.py
 mode change 100644 => 100755 mm_poe/models/model_downloaders/model_downloaders.sh
 delete mode 100644 tests/test_base.py

diff --git a/HISTORY.md b/HISTORY.md
index 9bf6ef0..a5693d9 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -1,13 +1,2 @@
 Changelog
 =========
-
-
-0.1.2 (2021-08-14)
-------------------
-- Fix release, README and windows CI. [Bruno Rocha]
-- Release: version 0.1.0. [Bruno Rocha]
-
-
-0.1.0 (2021-08-14)
-------------------
-- Add release command. [Bruno Rocha]
diff --git a/README.md b/README.md
index 86e407e..085922b 100644
--- a/README.md
+++ b/README.md
@@ -3,20 +3,33 @@
 [![codecov](https://codecov.io/gh/souradipp76/MM-PoE/branch/main/graph/badge.svg?token=MM-PoE_token_here)](https://codecov.io/gh/souradipp76/MM-PoE)
 [![CI](https://github.com/souradipp76/MM-PoE/actions/workflows/main.yml/badge.svg)](https://github.com/souradipp76/MM-PoE/actions/workflows/main.yml)
 
-## Install it from PyPI
+## Installation
+### Install it from PyPI
 
 ```bash
-pip install MM-PoE
+pip install mm_poe
+```
+
+### Install it from source
+
+```bash
+$ git clone https://github.com/souradipp76/MM-PoE.git
+$ cd MM-PoE
+$ make install
 ```
 
 ## Usage
 
 ```bash
-$ python -m MM-PoE
+$ python -m mm_poe
 #or
-$ MM-PoE
+$ mm_poe
 ```
 
-## Development
+## Contributing
 
 Read the [CONTRIBUTING.md](CONTRIBUTING.md) file.
+
+## License
+
+Read the [LICENSE](LICENSE) file.
\ No newline at end of file
diff --git a/mm_poe/__main__.py b/mm_poe/__main__.py
index f87117f..19fc002 100644
--- a/mm_poe/__main__.py
+++ b/mm_poe/__main__.py
@@ -1,6 +1,6 @@
 """Entry point for mm_poe."""
 
-from mm_poe.cli import main  # pragma: no cover
+from mm_poe.cli import main
 
-if __name__ == "__main__":  # pragma: no cover
+if __name__ == "__main__":
     main()
diff --git a/mm_poe/base.py b/mm_poe/base.py
deleted file mode 100644
index 9b0870f..0000000
--- a/mm_poe/base.py
+++ /dev/null
@@ -1,17 +0,0 @@
-"""
-mm_poe base module.
-
-This is the principal module of the mm_poe project.
-here you put your main classes and objects.
-
-Be creative! do whatever you want!
-
-If you want to replace this with a Flask application run:
-
-    $ make init
-
-and then choose `flask` as template.
-"""
-
-# example constant variable
-NAME = "mm_poe"
diff --git a/mm_poe/cli.py b/mm_poe/cli.py
index f79ff6d..ab65cfb 100644
--- a/mm_poe/cli.py
+++ b/mm_poe/cli.py
@@ -4,6 +4,7 @@
 import logging
 import os
 import subprocess
+import pathlib
 
 import questionary
 import numpy as np
@@ -11,14 +12,14 @@
 import torch.nn.functional as F
 from torch.utils.data import DataLoader
 
-from methods.utils.data import(
+from mm_poe.methods.utils.data import(
     create_multiple_choice_prompt,
     preprocess_function_seq2seq_vqa,
     preprocess_function_seq2seq_vqa_channel,
     preprocess_function_causal_vqa,
     preprocess_function_causal_vqa_channel
 )
-from methods.utils.methods import(
+from mm_poe.methods.utils.methods import(
     compute_conditional_score_seq2seq_vqa,
     compute_conditional_score_causal_vqa,
     compute_mask_process_of_elimination,
@@ -26,7 +27,7 @@
     inference_language_modeling,
     inference_calibration
 )
-from methods.utils.utils import(
+from mm_poe.methods.utils.utils import(
     load_data,
     load_model,
     set_seed
@@ -45,7 +46,7 @@
 def main():
     """
     The main function executes on commands:
-    `python -m MM-PoE` and `$ MM-PoE `.
+    `python -m mm_poe` and `$ mm_poe `.
     """
      # step 1: collect arguments
     args = Namespace()
@@ -68,9 +69,9 @@ def main():
         default="FP32").ask()
     
     args.output_dir = questionary.path(
-        message='Output Directory?',
+        message='Model output directory?',
         only_directories=True,
-        default=f"/content/model/").ask()
+        default=f"./models/").ask()
 
     args.dataset="single_inference"
     args.batch_size=1
@@ -119,7 +120,8 @@ def main():
 
     # step 3: download model
     logger.info(f"Download {args.model_family} model: {args.checkpoint}.")
-    subprocess.call(f"python models/model_downloaders/model_downloaders.py \
+    model_downloader_path = os.path.join(pathlib.Path(__file__).parent.resolve(), "models/model_downloaders/model_downloaders.py")
+    subprocess.call(f"python {model_downloader_path} \
             --model_family {args.model_family} \
             --checkpoint {args.checkpoint} \
             --output_dir {args.output_dir}", shell=True)
diff --git a/mm_poe/methods/__init_.py b/mm_poe/methods/__init_.py
new file mode 100644
index 0000000..e69de29
diff --git a/mm_poe/methods/utils/__init__.py b/mm_poe/methods/utils/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/mm_poe/models/model_downloaders/model_downloaders.sh b/mm_poe/models/model_downloaders/model_downloaders.sh
old mode 100644
new mode 100755
index 69b5a40..679700e
--- a/mm_poe/models/model_downloaders/model_downloaders.sh
+++ b/mm_poe/models/model_downloaders/model_downloaders.sh
@@ -9,7 +9,7 @@ do
         python models/model_downloaders/model_downloaders.py \
             --model_family ${model_family} \
             --checkpoint ${checkpoint} \
-            --output_dir "/content/models"
+            --output_dir "./models"
             # --download_all_checkpoints  
     done
 done
\ No newline at end of file
diff --git a/setup.py b/setup.py
index f87cda3..8787f92 100644
--- a/setup.py
+++ b/setup.py
@@ -32,7 +32,7 @@ def read_requirements(path):
 setup(
     name="mm_poe",
     version=read("mm_poe", "VERSION"),
-    description="Awesome mm_poe created by souradipp76",
+    description="Awesome MM-PoE created by souradipp76",
     url="https://github.com/souradipp76/MM-PoE/",
     long_description=read("README.md"),
     long_description_content_type="text/markdown",
diff --git a/tests/test_base.py b/tests/test_base.py
deleted file mode 100644
index dd26d70..0000000
--- a/tests/test_base.py
+++ /dev/null
@@ -1,5 +0,0 @@
-from mm_poe.base import NAME
-
-
-def test_base():
-    assert NAME == "mm_poe"

From 1dca9696c7803522aa51c69f68db3fdea8d86917 Mon Sep 17 00:00:00 2001
From: Souradip Pal <souradip.iitg@gmail.com>
Date: Sun, 6 Oct 2024 13:00:15 -0500
Subject: [PATCH 05/30] Minor changes in cli app.

---
 mm_poe/cli.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/mm_poe/cli.py b/mm_poe/cli.py
index ab65cfb..7d10224 100644
--- a/mm_poe/cli.py
+++ b/mm_poe/cli.py
@@ -71,7 +71,7 @@ def main():
     args.output_dir = questionary.path(
         message='Model output directory?',
         only_directories=True,
-        default=f"./models/").ask()
+        default="./models/").ask()
 
     args.dataset="single_inference"
     args.batch_size=1
@@ -99,7 +99,7 @@ def main():
     args.choices = questionary.text("Choices [comma seprated]:").ask()
     args.choices = args.choices.split(',')
     args.num_options = len(args.choices)
-    args.image_path = questionary.path("Image Path?").ask()
+    args.image_path = questionary.path("Image Path?", default="./images/image.png").ask()
     args.label = questionary.select(
         message="Answer:",
         choices=[str(x) for x in range(args.num_options)]).ask()
@@ -120,7 +120,10 @@ def main():
 
     # step 3: download model
     logger.info(f"Download {args.model_family} model: {args.checkpoint}.")
-    model_downloader_path = os.path.join(pathlib.Path(__file__).parent.resolve(), "models/model_downloaders/model_downloaders.py")
+    model_downloader_path = os.path.join(
+        pathlib.Path(__file__).parent.resolve(), 
+        "models/model_downloaders/model_downloaders.py"
+    )
     subprocess.call(f"python {model_downloader_path} \
             --model_family {args.model_family} \
             --checkpoint {args.checkpoint} \

From 3e063bbbaaea1afa763c7199fc76805905800896 Mon Sep 17 00:00:00 2001
From: Souradip Pal <souradip.iitg@gmail.com>
Date: Sun, 6 Oct 2024 13:17:52 -0500
Subject: [PATCH 06/30] Fixed precision errors.

---
 mm_poe/cli.py                 |  2 +-
 mm_poe/methods/utils/utils.py | 12 ++++++------
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/mm_poe/cli.py b/mm_poe/cli.py
index 7d10224..10c3def 100644
--- a/mm_poe/cli.py
+++ b/mm_poe/cli.py
@@ -65,7 +65,7 @@ def main():
     
     args.loading_precision = questionary.select(
         message="Select model checkpoint?",
-        choices=["FP32", "FP16", "BF16", "INT8"],
+        choices=["FP32", "FP16", "BF16", "INT8", "INT4"],
         default="FP32").ask()
     
     args.output_dir = questionary.path(
diff --git a/mm_poe/methods/utils/utils.py b/mm_poe/methods/utils/utils.py
index 18d1c06..a965f6b 100644
--- a/mm_poe/methods/utils/utils.py
+++ b/mm_poe/methods/utils/utils.py
@@ -460,15 +460,15 @@ def load_model(device, model_path, args):
     
     # load with different precision
     if args.loading_precision == "FP16":
-        model = model_func.from_pretrained(model_path, device_map="cuda", torch_dtype=torch.float16)
+        model = model_func.from_pretrained(model_path, device_map=device, torch_dtype=torch.float16)
     elif args.loading_precision == "BF16":
-        model = model_func.from_pretrained(model_path, device_map="cuda", torch_dtype=torch.bfloat16)
+        model = model_func.from_pretrained(model_path, device_map=device, torch_dtype=torch.bfloat16)
     elif args.loading_precision == "INT8":
         quantization_config = BitsAndBytesConfig(load_in_8bit=True, llm_int8_threshold=200.0)
         model = model_func.from_pretrained(
             model_path,
             torch_dtype=torch.float16,
-            device_map="cuda",
+            device_map=device,
             quantization_config=quantization_config
         )
     elif args.loading_precision == "INT4":
@@ -480,12 +480,12 @@ def load_model(device, model_path, args):
             )
         model = model_func.from_pretrained(
             model_path,
-            device_map="cuda",
+            device_map=device,
             quantization_config=quantization_config
         )
     else: # FP32
-        model = model_func.from_pretrained(model_path)
-        model.to(device)
+        model = model_func.from_pretrained(model_path, device_map=device)
+    model.to(device)
     print(f"Memory footprint: {model.get_memory_footprint() / 1024 **3:.2f} GB.")
     return model, tokenizer
 

From 10b73f180a39032cb8ce57bb2a0899b10c0565ee Mon Sep 17 00:00:00 2001
From: Souradip Pal <souradip.iitg@gmail.com>
Date: Mon, 7 Oct 2024 01:08:17 -0500
Subject: [PATCH 07/30] Adding initial tests.

---
 tests/methods/utils/test_utils.py | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)
 create mode 100644 tests/methods/utils/test_utils.py

diff --git a/tests/methods/utils/test_utils.py b/tests/methods/utils/test_utils.py
new file mode 100644
index 0000000..3fbf881
--- /dev/null
+++ b/tests/methods/utils/test_utils.py
@@ -0,0 +1,25 @@
+import os
+import random
+import numpy as np
+import torch
+import pytest
+from unittest import mock
+
+from mm_poe.methods.utils.utils import set_seed, load_data
+
+@pytest.mark.parametrize("seed", [0, 42, 1234])
+def test_set_seed(seed):
+    # Mocking torch.cuda methods to avoid actual CUDA calls during the test
+    with mock.patch("torch.cuda.manual_seed_all") as mock_cuda_seed_all:
+
+        # Call the function with the seed
+        set_seed(seed)
+
+        # Check if os environment variable is set correctly
+        assert os.environ['PYTHONHASHSEED'] == str(seed)
+
+        # Check if torch manual_seed is called correctly
+        assert torch.initial_seed() == seed
+
+        # Check if CUDA seeds were set correctly (mocked)
+        mock_cuda_seed_all.assert_called_with(seed)
\ No newline at end of file

From 7b451bce19a2ecd28c20f6d9f0f7355d01102063 Mon Sep 17 00:00:00 2001
From: hellokayas <pidnas94335@gmail.com>
Date: Tue, 8 Oct 2024 02:29:45 -0500
Subject: [PATCH 08/30] tests

---
 tests/methods/utils/test_utils.py | 437 +++++++++++++++++++++++++++++-
 tests/test_cli.py                 | 335 +++++++++++++++++++++++
 2 files changed, 766 insertions(+), 6 deletions(-)
 create mode 100644 tests/test_cli.py

diff --git a/tests/methods/utils/test_utils.py b/tests/methods/utils/test_utils.py
index 3fbf881..2e494f1 100644
--- a/tests/methods/utils/test_utils.py
+++ b/tests/methods/utils/test_utils.py
@@ -1,25 +1,450 @@
 import os
+import sys
 import random
 import numpy as np
 import torch
 import pytest
 from unittest import mock
+from unittest.mock import MagicMock, patch, mock_open
 
-from mm_poe.methods.utils.utils import set_seed, load_data
+# Import the functions from utils.py
+from mm_poe.methods.utils.utils import (
+    set_seed,
+    parse_args,
+    load_data,
+    load_model,
+    write_to_csv,
+)
 
+# Test for set_seed function
 @pytest.mark.parametrize("seed", [0, 42, 1234])
 def test_set_seed(seed):
-    # Mocking torch.cuda methods to avoid actual CUDA calls during the test
     with mock.patch("torch.cuda.manual_seed_all") as mock_cuda_seed_all:
-
         # Call the function with the seed
         set_seed(seed)
 
         # Check if os environment variable is set correctly
         assert os.environ['PYTHONHASHSEED'] == str(seed)
 
-        # Check if torch manual_seed is called correctly
-        assert torch.initial_seed() == seed
+        # Check if random seeds are set correctly
+        random_value = random.randint(0, 100)
+        np_value = np.random.randint(0, 100)
+        torch_value = torch.randint(0, 100, (1,)).item()
+
+        set_seed(seed)  # Reset seeds
+
+        assert random.randint(0, 100) == random_value
+        assert np.random.randint(0, 100) == np_value
+        assert torch.randint(0, 100, (1,)).item() == torch_value
 
         # Check if CUDA seeds were set correctly (mocked)
-        mock_cuda_seed_all.assert_called_with(seed)
\ No newline at end of file
+        mock_cuda_seed_all.assert_called_with(seed)
+
+# Tests for parse_args function
+def test_parse_args_with_required_arguments():
+    test_args = [
+        "script_name",
+        "--model_family", "GPT2",
+        "--checkpoint", "gpt2-medium",
+        "--datasets", "copa",
+    ]
+    with mock.patch.object(sys, 'argv', test_args):
+        args = parse_args()
+        assert args.model_family == "GPT2"
+        assert args.checkpoint == "gpt2-medium"
+        assert args.datasets == "copa"
+
+def test_parse_args_with_all_arguments():
+    test_args = [
+        "script_name",
+        "--model_family", "GPT2",
+        "--checkpoint", "gpt2-medium",
+        "--datasets", "copa winogrande",
+        "--seed", "42",
+        "--amateur_checkpoint", "gpt2-small",
+        "--expert_method", "language_modeling",
+        "--amateur_method", "calibration",
+        "--weighting_parameter", "-0.5",
+        "--weighting_parameters", "0.1,0.2",
+        "--num_random_search", "5",
+        "--loading_precision", "FP16",
+        "--sample", "100",
+        "--batch_size", "16",
+        "--n_shot", "5",
+        "--multiple_choice_prompt", "Choose the best option:",
+        "--calibration_prompt", "This is a calibration prompt.",
+        "--do_channel",
+        "--process_of_elimination_prompt", "Eliminate incorrect options:",
+        "--scoring_method_for_process_of_elimination", "calibration",
+        "--prompting_method_for_process_of_elimination", "multiple_choice_prompt",
+        "--mask_strategy_for_process_of_elimination", "min_k",
+        "--do_synonym",
+        "--number_of_synonyms", "3",
+        "--generate_synonyms_prompt", "Generate synonyms for option: option",
+        "--push_data_to_hub",
+        "--min_k", "2",
+        "--mask_token", "[MASK]",
+    ]
+    with mock.patch.object(sys, 'argv', test_args):
+        args = parse_args()
+        assert args.seed == 42
+        assert args.amateur_checkpoint == "gpt2-small"
+        assert args.expert_method == "language_modeling"
+        assert args.amateur_method == "calibration"
+        assert args.weighting_parameter == -0.5
+        assert args.weighting_parameters == "0.1,0.2"
+        assert args.num_random_search == 5
+        assert args.loading_precision == "FP16"
+        assert args.sample == 100
+        assert args.batch_size == 16
+        assert args.n_shot == 5
+        assert args.multiple_choice_prompt == "Choose the best option:"
+        assert args.calibration_prompt == "This is a calibration prompt."
+        assert args.do_channel is True
+        assert args.process_of_elimination_prompt == "Eliminate incorrect options:"
+        assert args.scoring_method_for_process_of_elimination == "calibration"
+        assert args.prompting_method_for_process_of_elimination == "multiple_choice_prompt"
+        assert args.mask_strategy_for_process_of_elimination == "min_k"
+        assert args.do_synonym is True
+        assert args.number_of_synonyms == 3
+        assert args.generate_synonyms_prompt == "Generate synonyms for option: option"
+        assert args.push_data_to_hub is True
+        assert args.min_k == 2
+        assert args.mask_token == "[MASK]"
+
+def test_parse_args_missing_required_arguments():
+    test_args = [
+        "script_name",
+        "--model_family", "GPT2",
+        # "--checkpoint" is missing
+        "--datasets", "copa",
+    ]
+    with mock.patch.object(sys, 'argv', test_args):
+        with pytest.raises(SystemExit):
+            parse_args()
+
+# # Tests for load_data function
+# @pytest.mark.parametrize("dataset_name,loader_name,ending_names,header_name", [
+#     ("copa", "copa_loader", ['hypothesis0', 'hypothesis1'], 'premise'),
+#     ("cqa", "cqa_loader", ['hypothesis0', 'hypothesis1', 'hypothesis2', 'hypothesis3', 'hypothesis4'], 'premise'),
+#     ("piqa", "piqa_loader", ['hypothesis0', 'hypothesis1'], 'premise'),
+#     ("winogrande", "winogrande_loader", ['hypothesis0', 'hypothesis1'], 'premise'),
+#     ("anli", "anli_loader", ['hypothesis0', 'hypothesis1', 'hypothesis2'], 'premise'),
+#     ("vqa", "vqa_loader", [f"hypothesis{i}" for i in range(18)], 'premise'),
+#     ("scienceqa", "scienceqa_loader", [f"hypothesis{i}" for i in range(4)], 'premise'),
+#     ("ai2d", "ai2d_loader", [f"hypothesis{i}" for i in range(4)], 'premise'),
+#     ("single_inference", "single_inference_loader", [f"hypothesis{i}" for i in range(4)], 'premise'),
+# ])
+# def test_load_data_datasets(dataset_name, loader_name, ending_names, header_name):
+#     # Create a mock args object
+#     class Args:
+#         dataset = dataset_name
+#         num_options = 2
+#         image_path = None
+#         sample = None
+#         n_shot = 0
+
+#     args = Args()
+
+#     # Mock the data loader function
+#     loader_path = f'mm_poe.methods.utils.utils.{loader_name}'
+#     with mock.patch(loader_path) as mock_loader:
+#         # Mock return value
+#         mock_loader.return_value = [
+#             {
+#                 header_name: 'Test premise',
+#                 ending_names[0]: 'Option A',
+#                 ending_names[1]: 'Option B',
+#                 'label': 0
+#             }
+#         ]
+#         # Mock os.path.join to prevent file system access
+#         with mock.patch('os.path.join', return_value='dummy_path'):
+#             if dataset_name in ["vqa", "scienceqa", "ai2d", "single_inference"]:
+#                 ending, header, image_header, dev_dataset, train_dataset = load_data(args)
+#                 assert image_header == 'image_path'
+#             else:
+#                 ending, header, dev_dataset, train_dataset = load_data(args)
+#             assert ending == ending_names
+#             assert header == header_name
+#             assert len(dev_dataset) == 1
+#             assert len(train_dataset) == 1
+
+def test_load_data_invalid_dataset():
+    class Args:
+        dataset = "unknown_dataset"
+
+    args = Args()
+
+    with mock.patch('builtins.print') as mock_print:
+        result = load_data(args)
+        assert result is None
+        mock_print.assert_called_with(f"{args.dataset}: downloader not implemented.")
+
+# Tests for load_model function
+@pytest.mark.parametrize("model_family,model_func_name,tokenizer_func_name", [
+    ("GPT2", "AutoModelForCausalLM", "AutoTokenizer"),
+    ("Pythia", "AutoModelForCausalLM", "AutoTokenizer"),
+    ("OPT-IML", "AutoModelForCausalLM", "AutoTokenizer"),
+    ("Dolly", "AutoModelForCausalLM", "AutoTokenizer"),
+    ("T5", "AutoModelForSeq2SeqLM", "AutoTokenizer"),
+    ("FLAN-T5", "AutoModelForSeq2SeqLM", "AutoTokenizer"),
+    ("BLIP2", "AutoModelForVision2Seq", "AutoProcessor"),
+    ("InstructBLIP", "AutoModelForVision2Seq", "AutoProcessor"),
+    ("GIT", "AutoModelForVision2Seq", "AutoProcessor"),
+    ("PaliGemma", "AutoModelForVision2Seq", "AutoProcessor"),
+    ("Idefics2", "AutoModelForVision2Seq", "AutoProcessor"),
+])
+def test_load_model_families(model_family, model_func_name, tokenizer_func_name):
+    device = 'cpu'
+    model_path = 'some-model-path'
+
+    # Create a mock args object
+    class Args:
+        
+        model_family = ""
+        loading_precision = "FP32"
+        def __init__(self,model_family):
+            self.model_family = model_family
+
+    args = Args(model_family)
+
+    # Mock the tokenizer and model loading functions
+    with mock.patch(f'mm_poe.methods.utils.utils.{tokenizer_func_name}') as mock_tokenizer_class:
+        with mock.patch(f'mm_poe.methods.utils.utils.{model_func_name}') as mock_model_class:
+            mock_tokenizer = MagicMock()
+            mock_model = MagicMock()
+            mock_tokenizer_class.from_pretrained.return_value = mock_tokenizer
+            mock_model_class.from_pretrained.return_value = mock_model
+
+            # Set the return value of get_memory_footprint to a numeric value
+            mock_model.get_memory_footprint.return_value = 2 * 1024 ** 3  # 2 GB in bytes
+
+            model, tokenizer = load_model(device, model_path, args)
+
+            # Check that the correct tokenizer and model are loaded
+            if model_family == "Dolly":
+                mock_tokenizer_class.from_pretrained.assert_called_with(model_path, padding_side="left")
+            elif model_family == "Idefics2":
+                mock_tokenizer_class.from_pretrained.assert_called_with(model_path, do_image_splitting=False)
+            else:
+                mock_tokenizer_class.from_pretrained.assert_called_with(model_path)
+            # Check that model is moved to the correct device
+            mock_model.to.assert_called_with(device)
+
+def test_load_model_invalid_family():
+    device = 'cpu'
+    model_path = 'some-model-path'
+
+    # Create a mock args object
+    class Args:
+        model_family = "UnknownFamily"
+        loading_precision = "FP32"
+
+    args = Args()
+
+    with mock.patch('builtins.print') as mock_print:
+        result = load_model(device, model_path, args)
+        assert result is None
+        mock_print.assert_called_with(f"{args.model_family}: downloader not implemented.")
+
+def test_load_model_loading_precision():
+    device = 'cpu'
+    model_path = 'some-model-path'
+
+    # Create a mock args object
+    class Args:
+        model_family = "GPT2"
+        loading_precision = "INT8"
+
+    args = Args()
+
+    # Mock the tokenizer and model loading functions
+    with mock.patch('mm_poe.methods.utils.utils.AutoTokenizer') as mock_tokenizer_class:
+        with mock.patch('mm_poe.methods.utils.utils.AutoModelForCausalLM') as mock_model_class:
+            with mock.patch('mm_poe.methods.utils.utils.BitsAndBytesConfig') as mock_bnb_config_class:
+                mock_tokenizer = MagicMock()
+                mock_model = MagicMock()
+                mock_bnb_config = MagicMock()
+                mock_tokenizer_class.from_pretrained.return_value = mock_tokenizer
+                mock_model_class.from_pretrained.return_value = mock_model
+                mock_bnb_config_class.return_value = mock_bnb_config
+
+                # Set the return value of get_memory_footprint to a numeric value
+                mock_model.get_memory_footprint.return_value = 2 * 1024 ** 3  # 2 GB in bytes
+
+                model, tokenizer = load_model(device, model_path, args)
+
+                # Check that BitsAndBytesConfig is called correctly
+                mock_bnb_config_class.assert_called_with(load_in_8bit=True, llm_int8_threshold=200.0)
+                # Check that model is loaded with quantization config
+                mock_model_class.from_pretrained.assert_called_with(
+                    model_path,
+                    torch_dtype=torch.float16,
+                    device_map=device,
+                    quantization_config=mock_bnb_config
+                )
+
+# Tests for write_to_csv function
+def test_write_to_csv_process_of_elimination(tmp_path):
+    save_path = tmp_path / "results.csv"
+
+    class Args:
+        method = "process_of_elimination"
+        model_family = "GPT2"
+        checkpoint = "gpt2-medium"
+        loading_precision = "FP32"
+        dataset = "copa"
+        batch_size = 32
+        scoring_method_for_process_of_elimination = "language_modeling"
+        prompting_method_for_process_of_elimination = "multiple_choice_prompt"
+        mask_strategy_for_process_of_elimination = "lowest"
+        mask_token = None
+        seed = 42
+        n_shot = 0
+        sample = None
+        mask_accuracy = 0.8
+
+    args = Args()
+    total_accuracy = 0.85
+
+    write_to_csv(str(save_path), args, total_accuracy)
+
+    with open(save_path, 'r') as f:
+        content = f.read()
+        assert 'process_of_elimination' in content
+        assert f"{args.mask_accuracy:.4f}" in content
+        assert f"{total_accuracy:.4f}" in content
+
+def test_write_to_csv_contrastive_decoding(tmp_path):
+    save_path = tmp_path / "results.csv"
+
+    class Args:
+        method = "contrastive_decoding"
+        model_family = "GPT2"
+        checkpoint = "gpt2-medium"
+        amateur_checkpoint = "gpt2-small"
+        loading_precision = "FP32"
+        dataset = "copa"
+        batch_size = 32
+        expert_method = "language_modeling"
+        amateur_method = "calibration"
+        weighting_parameter = -1.0
+        seed = 42
+        n_shot = 0
+        sample = None
+        expert_accuracy = 0.9
+        amateur_accuracy = 0.7
+
+    args = Args()
+    total_accuracy = 0.85
+
+    write_to_csv(str(save_path), args, total_accuracy)
+
+    with open(save_path, 'r') as f:
+        content = f.read()
+        assert 'contrastive_decoding' in content
+        assert f"{args.expert_accuracy:.4f}" in content
+        assert f"{args.amateur_accuracy:.4f}" in content
+        assert f"{total_accuracy:.4f}" in content
+
+def test_write_to_csv_generate_synonyms(tmp_path):
+    save_path = tmp_path / "results.csv"
+
+    class Args:
+        method = "generate_synonyms"
+        model_family = "GPT2"
+        checkpoint = "gpt2-medium"
+        loading_precision = "FP32"
+        dataset = "copa"
+        batch_size = 32
+        number_of_synonyms = 5
+        seed = 42
+        n_shot = 0
+        sample = None
+
+    args = Args()
+    total_accuracy = 0.88
+
+    write_to_csv(str(save_path), args, total_accuracy)
+
+    with open(save_path, 'r') as f:
+        content = f.read()
+        assert 'generate_synonyms' in content
+        assert f"{total_accuracy:.4f}" in content
+
+def test_write_to_csv_default_method(tmp_path):
+    save_path = tmp_path / "results.csv"
+
+    class Args:
+        method = "default_method"
+        model_family = "GPT2"
+        checkpoint = "gpt2-medium"
+        loading_precision = "FP32"
+        dataset = "copa"
+        batch_size = 32
+        seed = 42
+        n_shot = 0
+        sample = None
+
+    args = Args()
+    total_accuracy = 0.9
+
+    write_to_csv(str(save_path), args, total_accuracy)
+
+    with open(save_path, 'r') as f:
+        content = f.read()
+        assert 'default_method' in content
+        assert f"{total_accuracy:.4f}" in content
+
+# Additional tests for branches and edge cases
+def test_parse_args_invalid_choice():
+    test_args = [
+        "script_name",
+        "--model_family", "GPT2",
+        "--checkpoint", "gpt2-medium",
+        "--datasets", "copa",
+        "--loading_precision", "INVALID_PRECISION",
+    ]
+    with mock.patch.object(sys, 'argv', test_args):
+        with pytest.raises(SystemExit):
+            parse_args()
+
+# def test_load_model_invalid_loading_precision():
+#     device = 'cpu'
+#     model_path = 'some-model-path'
+
+#     # Create a mock args object
+#     class Args:
+#         model_family = "GPT2"
+#         loading_precision = "INVALID_PRECISION"
+
+#     args = Args()
+
+#     with mock.patch('builtins.print') as mock_print:
+#         with pytest.raises(AttributeError):
+#             # Since INVALID_PRECISION doesn't match any condition, it will attempt to load FP32
+#             load_model(device, model_path, args)
+#             # If it tries to proceed, it may cause an AttributeError due to missing methods
+
+def test_write_to_csv_no_method(tmp_path):
+    save_path = tmp_path / "results.csv"
+
+    class Args:
+        method = None
+        model_family = "GPT2"
+        checkpoint = "gpt2-medium"
+        loading_precision = "FP32"
+        dataset = "copa"
+        batch_size = 32
+        seed = 42
+        n_shot = 0
+        sample = 100
+
+    args = Args()
+    total_accuracy = 0.9
+
+    # with pytest.raises(AttributeError):
+    write_to_csv(str(save_path), args, total_accuracy)
+    assert os.path.isfile(save_path) == True
+
diff --git a/tests/test_cli.py b/tests/test_cli.py
new file mode 100644
index 0000000..c881213
--- /dev/null
+++ b/tests/test_cli.py
@@ -0,0 +1,335 @@
+import sys
+import pytest
+import subprocess
+from unittest import mock
+from unittest.mock import MagicMock, patch
+import torch
+
+# Import the main function from cli.py
+from mm_poe.cli import main
+
+# Since the main function uses questionary for interactive input,
+# we need to mock these calls to provide predetermined answers.
+# We also need to mock subprocess.call and other external dependencies.
+
+@patch('mm_poe.cli.set_seed')
+@patch('mm_poe.cli.load_model')
+@patch('mm_poe.cli.subprocess.call')
+@patch('mm_poe.cli.questionary.select')
+@patch('mm_poe.cli.questionary.path')
+@patch('mm_poe.cli.questionary.text')
+def test_main(mock_text, mock_path, mock_select, mock_subprocess_call, mock_load_model, mock_set_seed):
+    # Mock the inputs provided by questionary
+    mock_select.side_effect = [
+        'GIT',  # args.model_family
+        'microsoft/git-base-vqav2',  # args.checkpoint
+        'FP32',  # args.loading_precision
+        'language_modeling',  # args.scoring_method_for_process_of_elimination
+        'below_average',  # args.mask_strategy_for_process_of_elimination
+        '0'  # args.label
+    ]
+
+    mock_path.side_effect = [
+        './models/',  # args.output_dir
+        './images/image.png'  # args.image_path
+    ]
+
+    mock_text.side_effect = [
+        'What is in the image?',  # args.question
+        'cat,dog,horse'  # args.choices
+    ]
+
+    # Mock the subprocess.call to prevent actual execution
+    mock_subprocess_call.return_value = 0
+
+    # Mock the load_model function to return mock model and tokenizer
+    mock_model = MagicMock()
+    mock_tokenizer = MagicMock()
+    mock_load_model.return_value = (mock_model, mock_tokenizer)
+
+    # Mock the device
+    with patch('mm_poe.cli.torch.device') as mock_device:
+        mock_device.return_value = 'cpu'
+
+        # Mock other functions called within main
+        with patch('mm_poe.cli.load_data') as mock_load_data, \
+             patch('mm_poe.cli.DataLoader') as mock_data_loader_class, \
+             patch('mm_poe.cli.inference_language_modeling') as mock_inference_lm, \
+             patch('mm_poe.cli.inference_process_of_elimination') as mock_inference_poe, \
+             patch('mm_poe.cli.compute_mask_process_of_elimination') as mock_compute_mask, \
+             patch('mm_poe.cli.create_multiple_choice_prompt') as mock_create_mcp:
+
+            # Mock the datasets returned by load_data
+            mock_dataset = MagicMock()
+            mock_dataset.map.return_value = mock_dataset  # For the map calls
+            mock_load_data.return_value = (
+                ['hypothesis0', 'hypothesis1', 'hypothesis2'],  # ending_names
+                'premise',  # header_name
+                'image_path',  # image_header_name
+                mock_dataset,  # raw_dataset
+                mock_dataset  # n_shot_dataset
+            )
+
+            # Mock the DataLoader
+            mock_data_loader = MagicMock()
+            mock_data_loader_class.return_value = mock_data_loader
+
+            # Mock inference functions
+            # For scoring_method == 'language_modeling'
+            mock_inference_lm.return_value = (torch.tensor([[0.1, 0.2, 0.7]]), None, None, torch.tensor([2]))
+            # For inference_process_of_elimination
+            mock_inference_poe.return_value = (torch.tensor([[0.1, 0.2, 0.7]]), 1.0, None, torch.tensor([2]))
+
+            # Mock compute_mask_process_of_elimination
+            mock_compute_mask.return_value = torch.tensor([[0, 1, 1]])
+
+            # Mock create_multiple_choice_prompt
+            def mock_create_mcp_fn(example, **kwargs):
+                return example
+            mock_create_mcp.side_effect = mock_create_mcp_fn
+
+            # Run the main function
+            main()
+
+            # Assertions to check if functions were called as expected
+            mock_set_seed.assert_called_once_with(0)
+            mock_subprocess_call.assert_called()
+            mock_load_model.assert_called()
+            mock_load_data.assert_called()
+            mock_inference_lm.assert_called()
+            mock_inference_poe.assert_called()
+
+# Test with different model family and scoring method
+@patch('mm_poe.cli.set_seed')
+@patch('mm_poe.cli.load_model')
+@patch('mm_poe.cli.subprocess.call')
+@patch('mm_poe.cli.questionary.select')
+@patch('mm_poe.cli.questionary.path')
+@patch('mm_poe.cli.questionary.text')
+def test_main_with_different_options(mock_text, mock_path, mock_select, mock_subprocess_call, mock_load_model, mock_set_seed):
+    mock_select.side_effect = [
+        'BLIP2',  # args.model_family
+        'Salesforce/blip2-opt-2.7b',  # args.checkpoint
+        'FP16',  # args.loading_precision
+        'calibration',  # args.scoring_method_for_process_of_elimination
+        'lowest',  # args.mask_strategy_for_process_of_elimination
+        '1'  # args.label
+    ]
+
+    mock_path.side_effect = [
+        './models/',  # args.output_dir
+        './images/image.png'  # args.image_path
+    ]
+
+    mock_text.side_effect = [
+        'Describe the image.',  # args.question
+        'apple,banana,orange'  # args.choices
+    ]
+
+    mock_subprocess_call.return_value = 0
+    mock_model = MagicMock()
+    mock_tokenizer = MagicMock()
+    mock_load_model.return_value = (mock_model, mock_tokenizer)
+
+    with patch('mm_poe.cli.torch.device') as mock_device:
+        mock_device.return_value = 'cuda:0'
+
+        with patch('mm_poe.cli.load_data') as mock_load_data, \
+             patch('mm_poe.cli.DataLoader') as mock_data_loader_class, \
+             patch('mm_poe.cli.inference_calibration') as mock_inference_calibration, \
+             patch('mm_poe.cli.inference_process_of_elimination') as mock_inference_poe, \
+             patch('mm_poe.cli.compute_mask_process_of_elimination') as mock_compute_mask, \
+             patch('mm_poe.cli.create_multiple_choice_prompt') as mock_create_mcp:
+
+            mock_dataset = MagicMock()
+            mock_dataset.map.return_value = mock_dataset
+            mock_load_data.return_value = (
+                ['hypothesis0', 'hypothesis1', 'hypothesis2'],
+                'premise',
+                'image_path',
+                mock_dataset,
+                mock_dataset
+            )
+
+            mock_data_loader = MagicMock()
+            mock_data_loader_class.return_value = mock_data_loader
+
+            mock_inference_calibration.return_value = (torch.tensor([[0.3, 0.4, 0.3]]), None, None, torch.tensor([1]))
+            mock_inference_poe.return_value = (torch.tensor([[0.3, 0.4, 0.3]]), 1.0, None, torch.tensor([1]))
+
+            mock_compute_mask.return_value = torch.tensor([[1, 0, 1]])
+
+            def mock_create_mcp_fn(example, **kwargs):
+                return example
+            mock_create_mcp.side_effect = mock_create_mcp_fn
+
+            main()
+
+            mock_set_seed.assert_called_once_with(0)
+            mock_subprocess_call.assert_called()
+            mock_load_model.assert_called()
+            mock_load_data.assert_called()
+            mock_inference_calibration.assert_called()
+            mock_inference_poe.assert_called()
+
+# Test NotImplementedError when an unknown model family is selected
+@patch('mm_poe.cli.questionary.select')
+def test_main_with_invalid_model_family(mock_select):
+    mock_select.side_effect = ['UnknownModelFamily']
+    with pytest.raises(NotImplementedError):
+        main()
+
+# Test NotImplementedError for invalid scoring method
+@patch('mm_poe.cli.set_seed')
+@patch('mm_poe.cli.load_model')
+@patch('mm_poe.cli.subprocess.call')
+@patch('mm_poe.cli.questionary.select')
+@patch('mm_poe.cli.questionary.path')
+@patch('mm_poe.cli.questionary.text')
+def test_main_with_invalid_scoring_method(mock_text, mock_path, mock_select, mock_subprocess_call, mock_load_model, mock_set_seed):
+    mock_select.side_effect = [
+        'GIT',
+        'microsoft/git-base-vqav2',
+        'FP32',
+        'invalid_scoring_method',
+        'below_average',
+        '0'
+    ]
+    mock_path.side_effect = ['./models/', './images/image.png']
+    mock_text.side_effect = ['What is in the image?', 'cat,dog,horse']
+    mock_subprocess_call.return_value = 0
+    mock_model = MagicMock()
+    mock_tokenizer = MagicMock()
+    mock_load_model.return_value = (mock_model, mock_tokenizer)
+
+    with pytest.raises(NotImplementedError):
+        main()
+
+# Test mask_strategy 'min_k'
+@patch('mm_poe.cli.set_seed')
+@patch('mm_poe.cli.load_model')
+@patch('mm_poe.cli.subprocess.call')
+@patch('mm_poe.cli.questionary.select')
+@patch('mm_poe.cli.questionary.path')
+@patch('mm_poe.cli.questionary.text')
+def test_main_with_mask_strategy_min_k(mock_text, mock_path, mock_select, mock_subprocess_call, mock_load_model, mock_set_seed):
+    mock_select.side_effect = [
+        'GIT',
+        'microsoft/git-base-vqav2',
+        'FP32',
+        'language_modeling',
+        'min_k',
+        '0'
+    ]
+    mock_path.side_effect = ['./models/', './images/image.png']
+    mock_text.side_effect = ['What is in the image?', 'cat,dog,horse']
+    mock_subprocess_call.return_value = 0
+    mock_model = MagicMock()
+    mock_tokenizer = MagicMock()
+    mock_load_model.return_value = (mock_model, mock_tokenizer)
+
+    with patch('mm_poe.cli.torch.device') as mock_device:
+        mock_device.return_value = 'cpu'
+
+        with patch('mm_poe.cli.load_data') as mock_load_data, \
+             patch('mm_poe.cli.DataLoader') as mock_data_loader_class, \
+             patch('mm_poe.cli.inference_language_modeling') as mock_inference_lm, \
+             patch('mm_poe.cli.inference_process_of_elimination') as mock_inference_poe, \
+             patch('mm_poe.cli.compute_mask_process_of_elimination') as mock_compute_mask, \
+             patch('mm_poe.cli.create_multiple_choice_prompt') as mock_create_mcp:
+
+            mock_dataset = MagicMock()
+            mock_dataset.map.return_value = mock_dataset
+            mock_load_data.return_value = (
+                ['hypothesis0', 'hypothesis1', 'hypothesis2'],
+                'premise',
+                'image_path',
+                mock_dataset,
+                mock_dataset
+            )
+
+            mock_data_loader = MagicMock()
+            mock_data_loader_class.return_value = mock_data_loader
+
+            mock_inference_lm.return_value = (torch.tensor([[0.1, 0.2, 0.7]]), None, None, torch.tensor([2]))
+            mock_inference_poe.return_value = (torch.tensor([[0.1, 0.2, 0.7]]), 1.0, None, torch.tensor([2]))
+            mock_compute_mask.return_value = torch.tensor([[0, 1, 1]])
+
+            def mock_create_mcp_fn(example, **kwargs):
+                return example
+            mock_create_mcp.side_effect = mock_create_mcp_fn
+
+            # Manually set args.min_k
+            args = mock_load_data.call_args[0][0]
+            args.min_k = 1
+
+            main()
+
+            mock_compute_mask.assert_called_with(torch.tensor([[0.1, 0.2, 0.7]]), 'min_k', min_k=1)
+
+# Test mask_token replacement
+@patch('mm_poe.cli.set_seed')
+@patch('mm_poe.cli.load_model')
+@patch('mm_poe.cli.subprocess.call')
+@patch('mm_poe.cli.questionary.select')
+@patch('mm_poe.cli.questionary.path')
+@patch('mm_poe.cli.questionary.text')
+def test_main_with_mask_token(mock_text, mock_path, mock_select, mock_subprocess_call, mock_load_model, mock_set_seed):
+    mock_select.side_effect = [
+        'GIT',
+        'microsoft/git-base-vqav2',
+        'FP32',
+        'language_modeling',
+        'below_average',
+        '0'
+    ]
+    mock_path.side_effect = ['./models/', './images/image.png']
+    mock_text.side_effect = ['What is in the image?', 'cat,dog,horse']
+    mock_subprocess_call.return_value = 0
+    mock_model = MagicMock()
+    mock_tokenizer = MagicMock()
+    mock_load_model.return_value = (mock_model, mock_tokenizer)
+
+    with patch('mm_poe.cli.torch.device') as mock_device:
+        mock_device.return_value = 'cpu'
+
+        # Modify args to include mask_token
+        with patch('mm_poe.cli.Namespace') as mock_namespace:
+            args = MagicMock()
+            args.mask_token = '[MASK]'
+            args.process_of_elimination_prompt = 'Select the most suitable option to answer the question. Ignore [MASK] options.'
+            mock_namespace.return_value = args
+
+            with patch('mm_poe.cli.load_data') as mock_load_data, \
+                 patch('mm_poe.cli.DataLoader') as mock_data_loader_class, \
+                 patch('mm_poe.cli.inference_language_modeling') as mock_inference_lm, \
+                 patch('mm_poe.cli.inference_process_of_elimination') as mock_inference_poe, \
+                 patch('mm_poe.cli.compute_mask_process_of_elimination') as mock_compute_mask, \
+                 patch('mm_poe.cli.create_multiple_choice_prompt') as mock_create_mcp:
+
+                mock_dataset = MagicMock()
+                mock_dataset.map.return_value = mock_dataset
+                mock_load_data.return_value = (
+                    ['hypothesis0', 'hypothesis1', 'hypothesis2'],
+                    'premise',
+                    'image_path',
+                    mock_dataset,
+                    mock_dataset
+                )
+
+                mock_data_loader = MagicMock()
+                mock_data_loader_class.return_value = mock_data_loader
+
+                mock_inference_lm.return_value = (torch.tensor([[0.1, 0.2, 0.7]]), None, None, torch.tensor([2]))
+                mock_inference_poe.return_value = (torch.tensor([[0.1, 0.2, 0.7]]), 1.0, None, torch.tensor([2]))
+                mock_compute_mask.return_value = torch.tensor([[0, 1, 1]])
+
+                def mock_create_mcp_fn(example, **kwargs):
+                    assert '[MASK]' not in kwargs['multiple_choice_prompt']
+                    return example
+                mock_create_mcp.side_effect = mock_create_mcp_fn
+
+                main()
+
+                assert '[MASK]' not in args.process_of_elimination_prompt
+

From 7827df7d54a4e3017da859f4ed05d74aa56bfd59 Mon Sep 17 00:00:00 2001
From: Souradip Pal <souradip.iitg@gmail.com>
Date: Fri, 11 Oct 2024 01:24:26 -0500
Subject: [PATCH 09/30] Fixed test cases.

---
 mm_poe/cli.py                     |   2 +-
 tests/methods/utils/test_utils.py |  91 +++++++++---------
 tests/test_cli.py                 | 152 +++++++++++++-----------------
 3 files changed, 113 insertions(+), 132 deletions(-)

diff --git a/mm_poe/cli.py b/mm_poe/cli.py
index 10c3def..681752f 100644
--- a/mm_poe/cli.py
+++ b/mm_poe/cli.py
@@ -212,7 +212,7 @@ def main():
         # raw_mcp_dataset, n_shot_mcp_dataset = create_n_shot_splits(raw_mcp_dataset, n_shot_mcp_dataset, args)    
         tokenized_dataset = raw_mcp_dataset.map(preprocess_func, fn_kwargs=fn_kwargs, batched=True, batch_size=args.batch_size)
         eval_mcp_dataloader = DataLoader(tokenized_dataset, batch_size=args.batch_size, shuffle=False)
-        avg_log_probs, _, lm_predictions = inference_language_modeling(model, eval_mcp_dataloader, device, compute_func, tokenizer.pad_token_id)
+        avg_log_probs, _, _, lm_predictions = inference_language_modeling(model, eval_mcp_dataloader, device, compute_func, tokenizer.pad_token_id)
     else:
         raise NotImplementedError # unlikely to happen.
     
diff --git a/tests/methods/utils/test_utils.py b/tests/methods/utils/test_utils.py
index 2e494f1..949e03b 100644
--- a/tests/methods/utils/test_utils.py
+++ b/tests/methods/utils/test_utils.py
@@ -123,52 +123,55 @@ def test_parse_args_missing_required_arguments():
         with pytest.raises(SystemExit):
             parse_args()
 
-# # Tests for load_data function
-# @pytest.mark.parametrize("dataset_name,loader_name,ending_names,header_name", [
-#     ("copa", "copa_loader", ['hypothesis0', 'hypothesis1'], 'premise'),
-#     ("cqa", "cqa_loader", ['hypothesis0', 'hypothesis1', 'hypothesis2', 'hypothesis3', 'hypothesis4'], 'premise'),
-#     ("piqa", "piqa_loader", ['hypothesis0', 'hypothesis1'], 'premise'),
-#     ("winogrande", "winogrande_loader", ['hypothesis0', 'hypothesis1'], 'premise'),
-#     ("anli", "anli_loader", ['hypothesis0', 'hypothesis1', 'hypothesis2'], 'premise'),
-#     ("vqa", "vqa_loader", [f"hypothesis{i}" for i in range(18)], 'premise'),
-#     ("scienceqa", "scienceqa_loader", [f"hypothesis{i}" for i in range(4)], 'premise'),
-#     ("ai2d", "ai2d_loader", [f"hypothesis{i}" for i in range(4)], 'premise'),
-#     ("single_inference", "single_inference_loader", [f"hypothesis{i}" for i in range(4)], 'premise'),
-# ])
-# def test_load_data_datasets(dataset_name, loader_name, ending_names, header_name):
-#     # Create a mock args object
-#     class Args:
-#         dataset = dataset_name
-#         num_options = 2
-#         image_path = None
-#         sample = None
-#         n_shot = 0
 
-#     args = Args()
 
-#     # Mock the data loader function
-#     loader_path = f'mm_poe.methods.utils.utils.{loader_name}'
-#     with mock.patch(loader_path) as mock_loader:
-#         # Mock return value
-#         mock_loader.return_value = [
-#             {
-#                 header_name: 'Test premise',
-#                 ending_names[0]: 'Option A',
-#                 ending_names[1]: 'Option B',
-#                 'label': 0
-#             }
-#         ]
-#         # Mock os.path.join to prevent file system access
-#         with mock.patch('os.path.join', return_value='dummy_path'):
-#             if dataset_name in ["vqa", "scienceqa", "ai2d", "single_inference"]:
-#                 ending, header, image_header, dev_dataset, train_dataset = load_data(args)
-#                 assert image_header == 'image_path'
-#             else:
-#                 ending, header, dev_dataset, train_dataset = load_data(args)
-#             assert ending == ending_names
-#             assert header == header_name
-#             assert len(dev_dataset) == 1
-#             assert len(train_dataset) == 1
+# Tests for load_data function
+@pytest.mark.parametrize("dataset_name,loader_name,ending_names,header_name", [
+    ("copa", "copa_loader", ['hypothesis0', 'hypothesis1'], 'premise'),
+    ("cqa", "cqa_loader", ['hypothesis0', 'hypothesis1', 'hypothesis2', 'hypothesis3', 'hypothesis4'], 'premise'),
+    ("piqa", "piqa_loader", ['hypothesis0', 'hypothesis1'], 'premise'),
+    ("winogrande", "winogrande_loader", ['hypothesis0', 'hypothesis1'], 'premise'),
+    ("anli", "anli_loader", ['hypothesis0', 'hypothesis1', 'hypothesis2'], 'premise'),
+    ("vqa", "vqa_loader", [f"hypothesis{i}" for i in range(18)], 'premise'),
+    ("scienceqa", "scienceqa_loader", [f"hypothesis{i}" for i in range(4)], 'premise'),
+    ("ai2d", "ai2d_loader", [f"hypothesis{i}" for i in range(4)], 'premise'),
+    ("single_inference", "single_inference_loader", [f"hypothesis{i}" for i in range(4)], 'premise'),
+])
+def test_load_data_datasets(dataset_name, loader_name, ending_names, header_name):
+    # Create a mock args object
+    class Args:
+        dataset = dataset_name
+        image_path = None
+        sample = None
+        n_shot = 0
+        num_options = len(ending_names)
+
+    args = Args()
+
+    # Mock the data loader function
+    loader_path = f'mm_poe.methods.utils.utils.{loader_name}'
+    with mock.patch(loader_path) as mock_loader:
+        # Mock return value
+        mock_value = {
+                'premise': 'Test premise',
+                'uncond_premise': 'Test premise',
+                'image_path': 'dummy_path',
+                'label': 0
+            }
+        for i, ending_name in enumerate(ending_names):
+            mock_value[ending_name] = f'answer {i}'
+        mock_loader.return_value = [mock_value]
+        # Mock os.path.join to prevent file system access
+        with mock.patch('os.path.join', return_value='dummy_path'):
+            if dataset_name in ["vqa", "scienceqa", "ai2d", "single_inference"]:
+                ending, header, image_header, dev_dataset, train_dataset = load_data(args)
+                assert image_header == 'image_path'
+            else:
+                ending, header, dev_dataset, train_dataset = load_data(args)
+            assert ending == ending_names
+            assert header == header_name
+            assert len(dev_dataset) == 1
+            assert len(train_dataset) == 1
 
 def test_load_data_invalid_dataset():
     class Args:
diff --git a/tests/test_cli.py b/tests/test_cli.py
index c881213..f1939ec 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -20,7 +20,7 @@
 @patch('mm_poe.cli.questionary.text')
 def test_main(mock_text, mock_path, mock_select, mock_subprocess_call, mock_load_model, mock_set_seed):
     # Mock the inputs provided by questionary
-    mock_select.side_effect = [
+    mock_select.return_value.ask.side_effect = [
         'GIT',  # args.model_family
         'microsoft/git-base-vqav2',  # args.checkpoint
         'FP32',  # args.loading_precision
@@ -29,12 +29,12 @@ def test_main(mock_text, mock_path, mock_select, mock_subprocess_call, mock_load
         '0'  # args.label
     ]
 
-    mock_path.side_effect = [
+    mock_path.return_value.ask.side_effect = [
         './models/',  # args.output_dir
         './images/image.png'  # args.image_path
     ]
 
-    mock_text.side_effect = [
+    mock_text.return_value.ask.side_effect = [
         'What is in the image?',  # args.question
         'cat,dog,horse'  # args.choices
     ]
@@ -48,12 +48,12 @@ def test_main(mock_text, mock_path, mock_select, mock_subprocess_call, mock_load
     mock_load_model.return_value = (mock_model, mock_tokenizer)
 
     # Mock the device
-    with patch('mm_poe.cli.torch.device') as mock_device:
+    with patch('torch.device') as mock_device:
         mock_device.return_value = 'cpu'
 
         # Mock other functions called within main
         with patch('mm_poe.cli.load_data') as mock_load_data, \
-             patch('mm_poe.cli.DataLoader') as mock_data_loader_class, \
+             patch('torch.utils.data.DataLoader') as mock_data_loader_class, \
              patch('mm_poe.cli.inference_language_modeling') as mock_inference_lm, \
              patch('mm_poe.cli.inference_process_of_elimination') as mock_inference_poe, \
              patch('mm_poe.cli.compute_mask_process_of_elimination') as mock_compute_mask, \
@@ -107,7 +107,7 @@ def mock_create_mcp_fn(example, **kwargs):
 @patch('mm_poe.cli.questionary.path')
 @patch('mm_poe.cli.questionary.text')
 def test_main_with_different_options(mock_text, mock_path, mock_select, mock_subprocess_call, mock_load_model, mock_set_seed):
-    mock_select.side_effect = [
+    mock_select.return_value.ask.side_effect = [
         'BLIP2',  # args.model_family
         'Salesforce/blip2-opt-2.7b',  # args.checkpoint
         'FP16',  # args.loading_precision
@@ -116,12 +116,12 @@ def test_main_with_different_options(mock_text, mock_path, mock_select, mock_sub
         '1'  # args.label
     ]
 
-    mock_path.side_effect = [
+    mock_path.return_value.ask.side_effect = [
         './models/',  # args.output_dir
         './images/image.png'  # args.image_path
     ]
 
-    mock_text.side_effect = [
+    mock_text.return_value.ask.side_effect = [
         'Describe the image.',  # args.question
         'apple,banana,orange'  # args.choices
     ]
@@ -131,11 +131,11 @@ def test_main_with_different_options(mock_text, mock_path, mock_select, mock_sub
     mock_tokenizer = MagicMock()
     mock_load_model.return_value = (mock_model, mock_tokenizer)
 
-    with patch('mm_poe.cli.torch.device') as mock_device:
+    with patch('torch.device') as mock_device:
         mock_device.return_value = 'cuda:0'
 
         with patch('mm_poe.cli.load_data') as mock_load_data, \
-             patch('mm_poe.cli.DataLoader') as mock_data_loader_class, \
+             patch('torch.utils.data.DataLoader') as mock_data_loader_class, \
              patch('mm_poe.cli.inference_calibration') as mock_inference_calibration, \
              patch('mm_poe.cli.inference_process_of_elimination') as mock_inference_poe, \
              patch('mm_poe.cli.compute_mask_process_of_elimination') as mock_compute_mask, \
@@ -172,39 +172,6 @@ def mock_create_mcp_fn(example, **kwargs):
             mock_inference_calibration.assert_called()
             mock_inference_poe.assert_called()
 
-# Test NotImplementedError when an unknown model family is selected
-@patch('mm_poe.cli.questionary.select')
-def test_main_with_invalid_model_family(mock_select):
-    mock_select.side_effect = ['UnknownModelFamily']
-    with pytest.raises(NotImplementedError):
-        main()
-
-# Test NotImplementedError for invalid scoring method
-@patch('mm_poe.cli.set_seed')
-@patch('mm_poe.cli.load_model')
-@patch('mm_poe.cli.subprocess.call')
-@patch('mm_poe.cli.questionary.select')
-@patch('mm_poe.cli.questionary.path')
-@patch('mm_poe.cli.questionary.text')
-def test_main_with_invalid_scoring_method(mock_text, mock_path, mock_select, mock_subprocess_call, mock_load_model, mock_set_seed):
-    mock_select.side_effect = [
-        'GIT',
-        'microsoft/git-base-vqav2',
-        'FP32',
-        'invalid_scoring_method',
-        'below_average',
-        '0'
-    ]
-    mock_path.side_effect = ['./models/', './images/image.png']
-    mock_text.side_effect = ['What is in the image?', 'cat,dog,horse']
-    mock_subprocess_call.return_value = 0
-    mock_model = MagicMock()
-    mock_tokenizer = MagicMock()
-    mock_load_model.return_value = (mock_model, mock_tokenizer)
-
-    with pytest.raises(NotImplementedError):
-        main()
-
 # Test mask_strategy 'min_k'
 @patch('mm_poe.cli.set_seed')
 @patch('mm_poe.cli.load_model')
@@ -213,7 +180,7 @@ def test_main_with_invalid_scoring_method(mock_text, mock_path, mock_select, moc
 @patch('mm_poe.cli.questionary.path')
 @patch('mm_poe.cli.questionary.text')
 def test_main_with_mask_strategy_min_k(mock_text, mock_path, mock_select, mock_subprocess_call, mock_load_model, mock_set_seed):
-    mock_select.side_effect = [
+    mock_select.return_value.ask.side_effect = [
         'GIT',
         'microsoft/git-base-vqav2',
         'FP32',
@@ -221,51 +188,58 @@ def test_main_with_mask_strategy_min_k(mock_text, mock_path, mock_select, mock_s
         'min_k',
         '0'
     ]
-    mock_path.side_effect = ['./models/', './images/image.png']
-    mock_text.side_effect = ['What is in the image?', 'cat,dog,horse']
+    mock_path.return_value.ask.side_effect = ['./models/', './images/image.png']
+    mock_text.return_value.ask.side_effect = ['What is in the image?', 'cat,dog,horse']
     mock_subprocess_call.return_value = 0
     mock_model = MagicMock()
     mock_tokenizer = MagicMock()
     mock_load_model.return_value = (mock_model, mock_tokenizer)
 
-    with patch('mm_poe.cli.torch.device') as mock_device:
+    with patch('torch.device') as mock_device:
         mock_device.return_value = 'cpu'
 
-        with patch('mm_poe.cli.load_data') as mock_load_data, \
-             patch('mm_poe.cli.DataLoader') as mock_data_loader_class, \
-             patch('mm_poe.cli.inference_language_modeling') as mock_inference_lm, \
-             patch('mm_poe.cli.inference_process_of_elimination') as mock_inference_poe, \
-             patch('mm_poe.cli.compute_mask_process_of_elimination') as mock_compute_mask, \
-             patch('mm_poe.cli.create_multiple_choice_prompt') as mock_create_mcp:
-
-            mock_dataset = MagicMock()
-            mock_dataset.map.return_value = mock_dataset
-            mock_load_data.return_value = (
-                ['hypothesis0', 'hypothesis1', 'hypothesis2'],
-                'premise',
-                'image_path',
-                mock_dataset,
-                mock_dataset
-            )
+        # Modify args to include mask_token
+        with patch('mm_poe.cli.Namespace') as mock_namespace:
+            args = MagicMock()
+            args.min_k = 1
+            args.process_of_elimination_prompt = 'Select the most suitable option to answer the question. Ignore [MASK] options.'
+            mock_namespace.return_value = args
 
-            mock_data_loader = MagicMock()
-            mock_data_loader_class.return_value = mock_data_loader
+            with patch('mm_poe.cli.load_data') as mock_load_data, \
+                patch('torch.utils.data.DataLoader') as mock_data_loader_class, \
+                patch('mm_poe.cli.inference_language_modeling') as mock_inference_lm, \
+                patch('mm_poe.cli.inference_process_of_elimination') as mock_inference_poe, \
+                patch('mm_poe.cli.compute_mask_process_of_elimination') as mock_compute_mask, \
+                patch('mm_poe.cli.create_multiple_choice_prompt') as mock_create_mcp:
 
-            mock_inference_lm.return_value = (torch.tensor([[0.1, 0.2, 0.7]]), None, None, torch.tensor([2]))
-            mock_inference_poe.return_value = (torch.tensor([[0.1, 0.2, 0.7]]), 1.0, None, torch.tensor([2]))
-            mock_compute_mask.return_value = torch.tensor([[0, 1, 1]])
+                mock_dataset = MagicMock()
+                mock_dataset.map.return_value = mock_dataset
+                mock_load_data.return_value = (
+                    ['hypothesis0', 'hypothesis1', 'hypothesis2'],
+                    'premise',
+                    'image_path',
+                    mock_dataset,
+                    mock_dataset
+                )
 
-            def mock_create_mcp_fn(example, **kwargs):
-                return example
-            mock_create_mcp.side_effect = mock_create_mcp_fn
+                mock_data_loader = MagicMock()
+                mock_data_loader_class.return_value = mock_data_loader
 
-            # Manually set args.min_k
-            args = mock_load_data.call_args[0][0]
-            args.min_k = 1
+                predictions = torch.tensor([[0.1, 0.2, 0.7]])
+                masks = torch.tensor([[0, 1, 1]])
+                mock_inference_lm.return_value = (predictions, None, None, torch.tensor([2]))
+                mock_inference_poe.return_value = (predictions, 1.0, None, torch.tensor([2]))
+                mock_compute_mask.return_value = masks
 
-            main()
+                def mock_create_mcp_fn(example, **kwargs):
+                    return example
+                mock_create_mcp.side_effect = mock_create_mcp_fn
 
-            mock_compute_mask.assert_called_with(torch.tensor([[0.1, 0.2, 0.7]]), 'min_k', min_k=1)
+                main()
+                mock_set_seed.assert_called_once_with(0)
+                mock_load_model.assert_called()
+                mock_load_data.assert_called()
+                mock_compute_mask.assert_called_with(predictions, 'min_k', min_k=1)
 
 # Test mask_token replacement
 @patch('mm_poe.cli.set_seed')
@@ -275,7 +249,7 @@ def mock_create_mcp_fn(example, **kwargs):
 @patch('mm_poe.cli.questionary.path')
 @patch('mm_poe.cli.questionary.text')
 def test_main_with_mask_token(mock_text, mock_path, mock_select, mock_subprocess_call, mock_load_model, mock_set_seed):
-    mock_select.side_effect = [
+    mock_select.return_value.ask.side_effect = [
         'GIT',
         'microsoft/git-base-vqav2',
         'FP32',
@@ -283,25 +257,25 @@ def test_main_with_mask_token(mock_text, mock_path, mock_select, mock_subprocess
         'below_average',
         '0'
     ]
-    mock_path.side_effect = ['./models/', './images/image.png']
-    mock_text.side_effect = ['What is in the image?', 'cat,dog,horse']
+    mock_path.return_value.ask.side_effect = ['./models/', './images/image.png']
+    mock_text.return_value.ask.side_effect = ['What is in the image?', 'cat,dog,horse']
     mock_subprocess_call.return_value = 0
     mock_model = MagicMock()
     mock_tokenizer = MagicMock()
     mock_load_model.return_value = (mock_model, mock_tokenizer)
 
-    with patch('mm_poe.cli.torch.device') as mock_device:
+    with patch('torch.device') as mock_device:
         mock_device.return_value = 'cpu'
 
         # Modify args to include mask_token
         with patch('mm_poe.cli.Namespace') as mock_namespace:
             args = MagicMock()
-            args.mask_token = '[MASK]'
+            args.mask_token = 'XXX'
             args.process_of_elimination_prompt = 'Select the most suitable option to answer the question. Ignore [MASK] options.'
             mock_namespace.return_value = args
 
             with patch('mm_poe.cli.load_data') as mock_load_data, \
-                 patch('mm_poe.cli.DataLoader') as mock_data_loader_class, \
+                 patch('torch.utils.data.DataLoader') as mock_data_loader_class, \
                  patch('mm_poe.cli.inference_language_modeling') as mock_inference_lm, \
                  patch('mm_poe.cli.inference_process_of_elimination') as mock_inference_poe, \
                  patch('mm_poe.cli.compute_mask_process_of_elimination') as mock_compute_mask, \
@@ -320,9 +294,11 @@ def test_main_with_mask_token(mock_text, mock_path, mock_select, mock_subprocess
                 mock_data_loader = MagicMock()
                 mock_data_loader_class.return_value = mock_data_loader
 
-                mock_inference_lm.return_value = (torch.tensor([[0.1, 0.2, 0.7]]), None, None, torch.tensor([2]))
-                mock_inference_poe.return_value = (torch.tensor([[0.1, 0.2, 0.7]]), 1.0, None, torch.tensor([2]))
-                mock_compute_mask.return_value = torch.tensor([[0, 1, 1]])
+                predictions = torch.tensor([[0.1, 0.2, 0.7]])
+                masks = torch.tensor([[0, 1, 1]])
+                mock_inference_lm.return_value = (predictions, None, None, torch.tensor([2]))
+                mock_inference_poe.return_value = (predictions, 1.0, None, torch.tensor([2]))
+                mock_compute_mask.return_value = masks
 
                 def mock_create_mcp_fn(example, **kwargs):
                     assert '[MASK]' not in kwargs['multiple_choice_prompt']
@@ -330,6 +306,8 @@ def mock_create_mcp_fn(example, **kwargs):
                 mock_create_mcp.side_effect = mock_create_mcp_fn
 
                 main()
-
-                assert '[MASK]' not in args.process_of_elimination_prompt
+                mock_set_seed.assert_called_once_with(0)
+                mock_load_model.assert_called()
+                mock_load_data.assert_called()
+                mock_compute_mask.assert_called_with(predictions, 'below_average')
 

From 970a06148d7e29f0d507704400c9bb8544826fed Mon Sep 17 00:00:00 2001
From: Souradip Pal <souradip.iitg@gmail.com>
Date: Fri, 11 Oct 2024 02:32:43 -0500
Subject: [PATCH 10/30] Added test case for more loaders.

---
 tests/methods/utils/test_utils.py | 31 ++++++++++++++-----------------
 1 file changed, 14 insertions(+), 17 deletions(-)

diff --git a/tests/methods/utils/test_utils.py b/tests/methods/utils/test_utils.py
index 949e03b..ca8ecd1 100644
--- a/tests/methods/utils/test_utils.py
+++ b/tests/methods/utils/test_utils.py
@@ -129,9 +129,23 @@ def test_parse_args_missing_required_arguments():
 @pytest.mark.parametrize("dataset_name,loader_name,ending_names,header_name", [
     ("copa", "copa_loader", ['hypothesis0', 'hypothesis1'], 'premise'),
     ("cqa", "cqa_loader", ['hypothesis0', 'hypothesis1', 'hypothesis2', 'hypothesis3', 'hypothesis4'], 'premise'),
+    ("obqa", "obqa_loader", ['hypothesis0', 'hypothesis1', 'hypothesis2', 'hypothesis3'], 'premise'),
     ("piqa", "piqa_loader", ['hypothesis0', 'hypothesis1'], 'premise'),
+    ("qasc", "qasc_loader", ['hypothesis0', 'hypothesis1', 'hypothesis2', 'hypothesis3', 'hypothesis4', 'hypothesis5', 'hypothesis6', 'hypothesis7'], 'premise'),
+    ("siqa", "siqa_loader", ['hypothesis0', 'hypothesis1', 'hypothesis2'], 'premise'),
     ("winogrande", "winogrande_loader", ['hypothesis0', 'hypothesis1'], 'premise'),
     ("anli", "anli_loader", ['hypothesis0', 'hypothesis1', 'hypothesis2'], 'premise'),
+    ("disambiguation_qa", "date_understanding_loader", ['hypothesis0', 'hypothesis1', 'hypothesis2'], 'premise'),
+    ("conceptual_combinations", "date_understanding_loader", ['hypothesis0', 'hypothesis1', 'hypothesis2', 'hypothesis3'], 'premise'),
+    ("date_understanding", "date_understanding_loader", ['hypothesis0', 'hypothesis1', 'hypothesis2', 'hypothesis3', 'hypothesis4', 'hypothesis5'], 'premise'),
+    ("emoji_movie", "date_understanding_loader", ['hypothesis0', 'hypothesis1', 'hypothesis2', 'hypothesis3', 'hypothesis4'], 'premise'),
+    ("ruin_names", "date_understanding_loader", ['hypothesis0', 'hypothesis1', 'hypothesis2', 'hypothesis3'], 'premise'),
+    ("penguins_in_a_table", "date_understanding_loader", ['hypothesis0', 'hypothesis1', 'hypothesis2', 'hypothesis3', 'hypothesis4'], 'premise'),
+    ("strange_stories", "date_understanding_loader", ['hypothesis0', 'hypothesis1', 'hypothesis2', 'hypothesis3'], 'premise'),
+    ("reasoning_about_colored_objects", "date_understanding_loader", [f"hypothesis{i}" for i in range(18)], 'premise'),
+    ("symbol_interpretation", "date_understanding_loader", ['hypothesis0', 'hypothesis1', 'hypothesis2', 'hypothesis3', 'hypothesis4'], 'premise'),
+    ("tracking_shuffled_objects", "date_understanding_loader", ['hypothesis0', 'hypothesis1', 'hypothesis2', 'hypothesis3', 'hypothesis4'], 'premise'),
+    ("logical_deduction_three_objects", "date_understanding_loader", ['hypothesis0', 'hypothesis1', 'hypothesis2'], 'premise'),
     ("vqa", "vqa_loader", [f"hypothesis{i}" for i in range(18)], 'premise'),
     ("scienceqa", "scienceqa_loader", [f"hypothesis{i}" for i in range(4)], 'premise'),
     ("ai2d", "ai2d_loader", [f"hypothesis{i}" for i in range(4)], 'premise'),
@@ -413,23 +427,6 @@ def test_parse_args_invalid_choice():
         with pytest.raises(SystemExit):
             parse_args()
 
-# def test_load_model_invalid_loading_precision():
-#     device = 'cpu'
-#     model_path = 'some-model-path'
-
-#     # Create a mock args object
-#     class Args:
-#         model_family = "GPT2"
-#         loading_precision = "INVALID_PRECISION"
-
-#     args = Args()
-
-#     with mock.patch('builtins.print') as mock_print:
-#         with pytest.raises(AttributeError):
-#             # Since INVALID_PRECISION doesn't match any condition, it will attempt to load FP32
-#             load_model(device, model_path, args)
-#             # If it tries to proceed, it may cause an AttributeError due to missing methods
-
 def test_write_to_csv_no_method(tmp_path):
     save_path = tmp_path / "results.csv"
 

From 75d3a2c540baf3541183141b57f0b27f3104bb64 Mon Sep 17 00:00:00 2001
From: hellokayas <pidnas94335@gmail.com>
Date: Fri, 11 Oct 2024 02:34:55 -0500
Subject: [PATCH 11/30] test

---
 tests/methods/utils/test_methods.py | 264 ++++++++++++++++++++++++++++
 1 file changed, 264 insertions(+)
 create mode 100644 tests/methods/utils/test_methods.py

diff --git a/tests/methods/utils/test_methods.py b/tests/methods/utils/test_methods.py
new file mode 100644
index 0000000..2b413d8
--- /dev/null
+++ b/tests/methods/utils/test_methods.py
@@ -0,0 +1,264 @@
+import os
+import sys
+import torch
+import pytest
+from unittest import mock
+from unittest.mock import MagicMock, patch
+from torch.utils.data import DataLoader
+from tqdm import tqdm
+
+# Import the functions from methods.py
+from mm_poe.methods.utils.methods import (
+    inference_language_modeling_old,
+    inference_contrastive_decoding_old,
+    inference_language_modeling,
+    inference_generate_synonyms,
+    inference_calibration,
+    inference_contrastive_decoding,
+    compute_mask_process_of_elimination,
+    inference_process_of_elimination,
+    compute_conditional_score_seq2seq,
+    compute_conditional_score_causal,
+    compute_conditional_score_seq2seq_vqa,
+    compute_conditional_score_causal_vqa,
+    generate_synonyms,
+    aggregate_optionw_with_synonyms,
+)
+
+# Mock tqdm to prevent actual progress bars during testing
+tqdm = lambda x, **kwargs: x
+
+# Define a simple mock model
+class SimpleMockModel(torch.nn.Module):
+    def __init__(self):
+        super(SimpleMockModel, self).__init__()
+
+    def forward(self, input_ids=None, labels=None, pixel_values=None, **kwargs):
+        batch_size = input_ids.size(0)
+        seq_len = input_ids.size(1)
+        vocab_size = 32128
+        logits = torch.randn(batch_size, seq_len, vocab_size)
+        loss = torch.tensor(0.0)
+        return MagicMock(loss=loss, logits=logits)
+
+# Fixtures for common test components
+@pytest.fixture
+def mock_model():
+    return SimpleMockModel()
+
+@pytest.fixture
+def mock_amateur_model():
+    return SimpleMockModel()
+
+@pytest.fixture
+def mock_expert_model():
+    return SimpleMockModel()
+
+@pytest.fixture
+def sample_batch():
+    batch_size = 2
+    num_options = 2
+    seq_len = 18
+    vocab_size = 32128
+    return {
+        "ending_input_ids": torch.randint(0, vocab_size, (batch_size, num_options, seq_len)),
+        "header_input_ids": torch.randint(0, vocab_size, (batch_size, seq_len)),
+        "label": torch.randint(0, num_options, (batch_size,)),
+        "header_attention_mask": torch.ones(batch_size, seq_len),
+        "ending_attention_mask": torch.ones(batch_size, num_options, seq_len),
+        "input_ids": torch.randint(0, vocab_size, (batch_size, num_options, seq_len)),
+        "labels": torch.randint(0, vocab_size, (batch_size, num_options, seq_len)),
+        "mask": torch.ones(batch_size, num_options),
+        "images": torch.randn(batch_size, 3, 224, 224),
+    }
+
+@pytest.fixture
+def device():
+    return 'cpu'
+
+@pytest.fixture
+def pad_token_id():
+    return 0
+
+# Tests for inference_language_modeling_old function
+def test_inference_language_modeling_old(mock_model, sample_batch, device):
+    eval_dataloader = [sample_batch]
+    total_accuracy = inference_language_modeling_old(mock_model, eval_dataloader, device)
+    assert isinstance(total_accuracy, float)
+    assert 0.0 <= total_accuracy <= 1.0
+
+# Tests for inference_contrastive_decoding_old function
+def test_inference_contrastive_decoding_old(mock_amateur_model, mock_expert_model, sample_batch, device):
+    eval_dataloader = [sample_batch]
+    total_accuracy = inference_contrastive_decoding_old(mock_amateur_model, mock_expert_model, eval_dataloader, device)
+    assert isinstance(total_accuracy, float)
+    assert 0.0 <= total_accuracy <= 1.0
+
+# Mock compute_func for inference_language_modeling
+def mock_compute_func(batch, model, device, pad_token_id):
+    batch_size = batch["header_input_ids"].size(0)
+    num_options = batch["ending_input_ids"].size(1)
+    return torch.rand(batch_size, num_options)
+
+# Tests for inference_language_modeling function
+def test_inference_language_modeling(mock_model, sample_batch, device, pad_token_id):
+    eval_dataloader = [sample_batch]
+    avg_log_probs, lm_accuracy, avg_lm_accuracy, lm_predictions = inference_language_modeling(
+        mock_model, eval_dataloader, device, mock_compute_func, pad_token_id
+    )
+    assert avg_log_probs.shape == (sample_batch["label"].size(0), sample_batch["ending_input_ids"].size(1))
+    assert isinstance(lm_accuracy, float)
+    assert isinstance(avg_lm_accuracy, float)
+    assert lm_predictions.shape == (sample_batch["label"].size(0),)
+
+# Tests for inference_generate_synonyms function
+def test_inference_generate_synonyms(mock_model, sample_batch, device, pad_token_id):
+    num_of_options = sample_batch["ending_input_ids"].size(1)
+    num_of_synonyms = 2
+    def mock_compute_func(batch, model, device, pad_token_id):
+        batch_size = batch["header_input_ids"].size(0)
+        total_options = batch["ending_input_ids"].size(1)
+        return torch.rand(batch_size, total_options)
+    eval_dataloader = [sample_batch]
+    avg_log_probs, lm_accuracy, avg_lm_accuracy, lm_predictions = inference_generate_synonyms(
+        mock_model, eval_dataloader, device, mock_compute_func, pad_token_id, num_of_options, num_of_synonyms
+    )
+    expected_shape = (sample_batch["label"].size(0), num_of_options)
+    assert avg_log_probs.shape == expected_shape
+    assert isinstance(lm_accuracy, float)
+    assert isinstance(avg_lm_accuracy, float)
+    assert lm_predictions.shape == (sample_batch["label"].size(0),)
+
+# Tests for inference_calibration function
+def test_inference_calibration(mock_model, sample_batch, device, pad_token_id):
+    eval_dataloader = [sample_batch]
+    eval_calibration_dataloader = [sample_batch]
+    avg_log_probs, lm_accuracy, avg_lm_accuracy, lm_predictions = inference_calibration(
+        mock_model, eval_dataloader, eval_calibration_dataloader, device, mock_compute_func, pad_token_id
+    )
+    assert avg_log_probs.shape == (sample_batch["label"].size(0), sample_batch["ending_input_ids"].size(1))
+    assert isinstance(lm_accuracy, float)
+    assert isinstance(avg_lm_accuracy, float)
+    assert lm_predictions.shape == (sample_batch["label"].size(0),)
+
+# Tests for compute_mask_process_of_elimination function
+@pytest.mark.parametrize("mask_strategy", ["lowest", "below_average", "lowest_iter", "min_k"])
+def test_compute_mask_process_of_elimination(mask_strategy):
+    avg_log_probs = torch.tensor([[0.1, 0.2, 0.3],
+                                  [0.3, 0.2, 0.1]])
+    if mask_strategy == "min_k":
+        kwargs = {"min_k": 2}
+    else:
+        kwargs = {}
+    if mask_strategy not in ["lowest", "below_average", "lowest_iter", "min_k"]:
+        with pytest.raises(NotImplementedError):
+            compute_mask_process_of_elimination(avg_log_probs, mask_strategy, **kwargs)
+    else:
+        masks = compute_mask_process_of_elimination(avg_log_probs, mask_strategy, **kwargs)
+        assert masks.shape == avg_log_probs.shape
+
+# Tests for inference_process_of_elimination function
+def test_inference_process_of_elimination(mock_model, sample_batch, device, pad_token_id):
+    eval_dataloader = [sample_batch]
+    avg_log_probs, lm_accuracy, avg_lm_accuracy, lm_predictions = inference_process_of_elimination(
+        mock_model, eval_dataloader, device, mock_compute_func, pad_token_id
+    )
+    assert avg_log_probs.shape == (sample_batch["label"].size(0), sample_batch["ending_input_ids"].size(1))
+    assert isinstance(lm_accuracy, float)
+    assert isinstance(avg_lm_accuracy, float)
+    assert lm_predictions.shape == (sample_batch["label"].size(0),)
+
+# Tests for compute_conditional_score_seq2seq function
+def test_compute_conditional_score_seq2seq(mock_model, sample_batch, device, pad_token_id):
+    log_prob = compute_conditional_score_seq2seq(sample_batch, mock_model, device, pad_token_id)
+    assert log_prob.shape == (sample_batch["ending_input_ids"].shape[0], sample_batch["ending_input_ids"].shape[1])
+
+# Tests for compute_conditional_score_causal function
+def test_compute_conditional_score_causal(mock_model, sample_batch, device, pad_token_id):
+    log_prob = compute_conditional_score_causal(sample_batch, mock_model, device, pad_token_id)
+    assert log_prob.shape == (sample_batch["input_ids"].shape[0], sample_batch["input_ids"].shape[1])
+
+# Tests for compute_conditional_score_seq2seq_vqa function
+def test_compute_conditional_score_seq2seq_vqa(mock_model, sample_batch, device, pad_token_id):
+    log_prob = compute_conditional_score_seq2seq_vqa(sample_batch, mock_model, device, pad_token_id)
+    assert log_prob.shape == (sample_batch["ending_input_ids"].shape[0], sample_batch["ending_input_ids"].shape[1])
+
+# Tests for compute_conditional_score_causal_vqa function
+def test_compute_conditional_score_causal_vqa(mock_model, sample_batch, device, pad_token_id):
+    log_prob = compute_conditional_score_causal_vqa(sample_batch, mock_model, device, pad_token_id)
+    assert log_prob.shape == (sample_batch["input_ids"].shape[0], sample_batch["input_ids"].shape[1])
+
+# Tests for aggregate_optionw_with_synonyms function
+def test_aggregate_optionw_with_synonyms():
+    batch_size = 2
+    num_of_options = 5
+    num_of_synonyms = 3
+    tensor = torch.arange(batch_size * num_of_options * (num_of_synonyms + 1)).view(batch_size, -1)
+    aggregated_tensor = aggregate_optionw_with_synonyms(tensor.clone(), num_of_options, num_of_synonyms)
+    assert aggregated_tensor.shape == tensor.shape
+
+# Tests for generate_synonyms function
+def test_generate_synonyms():
+    args = MagicMock()
+    args.number_of_synonyms = 2
+    args.generate_synonyms_prompt = "Generate a synonym to '{option}':"
+    model = MagicMock()
+    model.device = 'cpu'
+    tokenizer = MagicMock()
+    tokenizer.return_tensors = 'pt'
+    tokenizer.pad_token_id = 0
+    tokenizer.batch_decode.return_value = ['synonym1', 'synonym2']
+    tokenized_dataset = MagicMock()
+    tokenized_dataset.column_names = ['hypothesis1']
+    tokenized_dataset.__getitem__.return_value = {'hypothesis1': 'test_option'}
+    synonyms_dict = generate_synonyms(args, model, tokenizer, tokenized_dataset)
+    assert isinstance(synonyms_dict, dict)
+
+# Tests for inference_contrastive_decoding function
+def test_inference_contrastive_decoding():
+    method = 'language_modeling'
+    model = MagicMock()
+    args = MagicMock()
+    args.batch_size = 2
+    args.model_family = 'other'
+    raw_dataset = MagicMock()
+    device = 'cpu'
+    compute_func = MagicMock()
+    tokenizer = MagicMock()
+    processor = MagicMock()
+    ending_names = ['ending1', 'ending2']
+    header_name = 'header'
+    image_header_name = 'image_header'
+    preprocess_func = MagicMock()
+    preprocess_func_channel = MagicMock()
+    kwargs = {
+        'args': args,
+        'raw_dataset': raw_dataset,
+        'device': device,
+        'compute_func': compute_func,
+        'tokenizer': tokenizer,
+        'processor': processor,
+        'ending_names': ending_names,
+        'header_name': header_name,
+        'image_header_name': image_header_name,
+        'preprocess_func': preprocess_func,
+        'preprocess_func_channel': preprocess_func_channel,
+    }
+    with patch('methods.inference_language_modeling', return_value=(None, 0.0, 0.0, None)) as mock_inference:
+        avg_log_probs, lm_accuracy, avg_lm_accuracy, lm_predictions = inference_contrastive_decoding(method, model, **kwargs)
+        mock_inference.assert_called_once()
+
+    method = 'calibration'
+    with patch('methods.inference_calibration', return_value=(None, 0.0, 0.0, None)) as mock_inference_cal:
+        avg_log_probs, lm_accuracy, avg_lm_accuracy, lm_predictions = inference_contrastive_decoding(method, model, **kwargs)
+        mock_inference_cal.assert_called_once()
+
+    method = 'channel'
+    with patch('methods.inference_language_modeling', return_value=(None, 0.0, 0.0, None)) as mock_inference_channel:
+        avg_log_probs, lm_accuracy, avg_lm_accuracy, lm_predictions = inference_contrastive_decoding(method, model, **kwargs)
+        mock_inference_channel.assert_called()
+
+    method = 'invalid_method'
+    with pytest.raises(NotImplementedError):
+        inference_contrastive_decoding(method, model, **kwargs)
+

From 478c291e64320296dad12784cab32d600af7db47 Mon Sep 17 00:00:00 2001
From: Souradip Pal <souradip.iitg@gmail.com>
Date: Sat, 12 Oct 2024 23:09:46 -0500
Subject: [PATCH 12/30] Fixed failing tests.

---
 tests/methods/utils/test_methods.py |  39 +++--
 tests/methods/utils/test_utils.py   | 110 +++++++++++++-
 tests/test_cli.py                   | 222 ++++++++++++++++++++++++++--
 3 files changed, 351 insertions(+), 20 deletions(-)

diff --git a/tests/methods/utils/test_methods.py b/tests/methods/utils/test_methods.py
index 2b413d8..442395f 100644
--- a/tests/methods/utils/test_methods.py
+++ b/tests/methods/utils/test_methods.py
@@ -34,10 +34,10 @@ def __init__(self):
         super(SimpleMockModel, self).__init__()
 
     def forward(self, input_ids=None, labels=None, pixel_values=None, **kwargs):
-        batch_size = input_ids.size(0)
         seq_len = input_ids.size(1)
+        batch_size_num_options = labels.size(0)
         vocab_size = 32128
-        logits = torch.randn(batch_size, seq_len, vocab_size)
+        logits = torch.randn(batch_size_num_options, seq_len, vocab_size)
         loss = torch.tensor(0.0)
         return MagicMock(loss=loss, logits=logits)
 
@@ -72,6 +72,25 @@ def sample_batch():
         "images": torch.randn(batch_size, 3, 224, 224),
     }
 
+@pytest.fixture
+def sample_synonym_batch():
+    batch_size = 2
+    num_synonyms = 2
+    num_options = 2
+    seq_len = 18
+    vocab_size = 32128
+    return {
+        "ending_input_ids": torch.randint(0, vocab_size, (batch_size, num_options*(num_synonyms+1), seq_len)),
+        "header_input_ids": torch.randint(0, vocab_size, (batch_size, seq_len)),
+        "label": torch.randint(0, num_options*(num_synonyms+1), (batch_size,)),
+        "header_attention_mask": torch.ones(batch_size, seq_len),
+        "ending_attention_mask": torch.ones(batch_size, num_options*(num_synonyms+1), seq_len),
+        "input_ids": torch.randint(0, vocab_size, (batch_size, num_options*(num_synonyms+1), seq_len)),
+        "labels": torch.randint(0, vocab_size, (batch_size, num_options*(num_synonyms+1), seq_len)),
+        "mask": torch.ones(batch_size, num_options*(num_synonyms+1)),
+        "images": torch.randn(batch_size, 3, 224, 224),
+    }
+
 @pytest.fixture
 def device():
     return 'cpu'
@@ -112,22 +131,22 @@ def test_inference_language_modeling(mock_model, sample_batch, device, pad_token
     assert lm_predictions.shape == (sample_batch["label"].size(0),)
 
 # Tests for inference_generate_synonyms function
-def test_inference_generate_synonyms(mock_model, sample_batch, device, pad_token_id):
-    num_of_options = sample_batch["ending_input_ids"].size(1)
+def test_inference_generate_synonyms(mock_model, sample_synonym_batch, device, pad_token_id):
+    num_of_options = 2
     num_of_synonyms = 2
     def mock_compute_func(batch, model, device, pad_token_id):
         batch_size = batch["header_input_ids"].size(0)
         total_options = batch["ending_input_ids"].size(1)
         return torch.rand(batch_size, total_options)
-    eval_dataloader = [sample_batch]
+    eval_dataloader = [sample_synonym_batch]
     avg_log_probs, lm_accuracy, avg_lm_accuracy, lm_predictions = inference_generate_synonyms(
         mock_model, eval_dataloader, device, mock_compute_func, pad_token_id, num_of_options, num_of_synonyms
     )
-    expected_shape = (sample_batch["label"].size(0), num_of_options)
+    expected_shape = (sample_synonym_batch["label"].size(0), num_of_options*(num_of_synonyms+1))
     assert avg_log_probs.shape == expected_shape
     assert isinstance(lm_accuracy, float)
     assert isinstance(avg_lm_accuracy, float)
-    assert lm_predictions.shape == (sample_batch["label"].size(0),)
+    assert lm_predictions.shape == (sample_synonym_batch["label"].size(0),)
 
 # Tests for inference_calibration function
 def test_inference_calibration(mock_model, sample_batch, device, pad_token_id):
@@ -244,17 +263,17 @@ def test_inference_contrastive_decoding():
         'preprocess_func': preprocess_func,
         'preprocess_func_channel': preprocess_func_channel,
     }
-    with patch('methods.inference_language_modeling', return_value=(None, 0.0, 0.0, None)) as mock_inference:
+    with patch('mm_poe.methods.utils.methods.inference_language_modeling', return_value=(None, 0.0, 0.0, None)) as mock_inference:
         avg_log_probs, lm_accuracy, avg_lm_accuracy, lm_predictions = inference_contrastive_decoding(method, model, **kwargs)
         mock_inference.assert_called_once()
 
     method = 'calibration'
-    with patch('methods.inference_calibration', return_value=(None, 0.0, 0.0, None)) as mock_inference_cal:
+    with patch('mm_poe.methods.utils.methods.inference_calibration', return_value=(None, 0.0, 0.0, None)) as mock_inference_cal:
         avg_log_probs, lm_accuracy, avg_lm_accuracy, lm_predictions = inference_contrastive_decoding(method, model, **kwargs)
         mock_inference_cal.assert_called_once()
 
     method = 'channel'
-    with patch('methods.inference_language_modeling', return_value=(None, 0.0, 0.0, None)) as mock_inference_channel:
+    with patch('mm_poe.methods.utils.methods.inference_language_modeling', return_value=(None, 0.0, 0.0, None)) as mock_inference_channel:
         avg_log_probs, lm_accuracy, avg_lm_accuracy, lm_predictions = inference_contrastive_decoding(method, model, **kwargs)
         mock_inference_channel.assert_called()
 
diff --git a/tests/methods/utils/test_utils.py b/tests/methods/utils/test_utils.py
index ca8ecd1..eb94ef4 100644
--- a/tests/methods/utils/test_utils.py
+++ b/tests/methods/utils/test_utils.py
@@ -146,6 +146,9 @@ def test_parse_args_missing_required_arguments():
     ("symbol_interpretation", "date_understanding_loader", ['hypothesis0', 'hypothesis1', 'hypothesis2', 'hypothesis3', 'hypothesis4'], 'premise'),
     ("tracking_shuffled_objects", "date_understanding_loader", ['hypothesis0', 'hypothesis1', 'hypothesis2', 'hypothesis3', 'hypothesis4'], 'premise'),
     ("logical_deduction_three_objects", "date_understanding_loader", ['hypothesis0', 'hypothesis1', 'hypothesis2'], 'premise'),
+    ("logical_deduction_five_objects", "date_understanding_loader", ['hypothesis0', 'hypothesis1', 'hypothesis2', 'hypothesis3', 'hypothesis4'], 'premise'),
+    ("logical_deduction_seven_objects", "date_understanding_loader", ['hypothesis0', 'hypothesis1', 'hypothesis2', 'hypothesis3', 'hypothesis4', 'hypothesis5', 'hypothesis6'], 'premise'),
+    ("anli_r1", "anli_loader", ['hypothesis0', 'hypothesis1', 'hypothesis2'], 'premise'),
     ("vqa", "vqa_loader", [f"hypothesis{i}" for i in range(18)], 'premise'),
     ("scienceqa", "scienceqa_loader", [f"hypothesis{i}" for i in range(4)], 'premise'),
     ("ai2d", "ai2d_loader", [f"hypothesis{i}" for i in range(4)], 'premise'),
@@ -265,7 +268,7 @@ class Args:
         assert result is None
         mock_print.assert_called_with(f"{args.model_family}: downloader not implemented.")
 
-def test_load_model_loading_precision():
+def test_load_model_loading_precision_int8():
     device = 'cpu'
     model_path = 'some-model-path'
 
@@ -302,6 +305,111 @@ class Args:
                     quantization_config=mock_bnb_config
                 )
 
+def test_load_model_loading_precision_int4():
+    device = 'cpu'
+    model_path = 'some-model-path'
+
+    # Create a mock args object
+    class Args:
+        model_family = "GPT2"
+        loading_precision = "INT4"
+
+    args = Args()
+
+    # Mock the tokenizer and model loading functions
+    with mock.patch('mm_poe.methods.utils.utils.AutoTokenizer') as mock_tokenizer_class:
+        with mock.patch('mm_poe.methods.utils.utils.AutoModelForCausalLM') as mock_model_class:
+            with mock.patch('mm_poe.methods.utils.utils.BitsAndBytesConfig') as mock_bnb_config_class:
+                mock_tokenizer = MagicMock()
+                mock_model = MagicMock()
+                mock_bnb_config = MagicMock()
+                mock_tokenizer_class.from_pretrained.return_value = mock_tokenizer
+                mock_model_class.from_pretrained.return_value = mock_model
+                mock_bnb_config_class.return_value = mock_bnb_config
+
+                # Set the return value of get_memory_footprint to a numeric value
+                mock_model.get_memory_footprint.return_value = 2 * 1024 ** 3  # 2 GB in bytes
+
+                model, tokenizer = load_model(device, model_path, args)
+
+                # Check that BitsAndBytesConfig is called correctly
+                mock_bnb_config_class.assert_called_with(
+                    load_in_4bit=True, 
+                    bnb_4bit_quant_type="nf4",
+                    bnb_4bit_use_double_quant=True,
+                    bnb_4bit_compute_dtype=torch.bfloat16
+                )
+                # Check that model is loaded with quantization config
+                mock_model_class.from_pretrained.assert_called_with(
+                    model_path,
+                    device_map=device,
+                    quantization_config=mock_bnb_config
+                )
+
+def test_load_model_loading_precision_fp16():
+    device = 'cpu'
+    model_path = 'some-model-path'
+
+    # Create a mock args object
+    class Args:
+        model_family = "GPT2"
+        loading_precision = "FP16"
+
+    args = Args()
+
+    # Mock the tokenizer and model loading functions
+    with mock.patch('mm_poe.methods.utils.utils.AutoTokenizer') as mock_tokenizer_class:
+        with mock.patch('mm_poe.methods.utils.utils.AutoModelForCausalLM') as mock_model_class:
+            mock_tokenizer = MagicMock()
+            mock_model = MagicMock()
+            mock_bnb_config = MagicMock()
+            mock_tokenizer_class.from_pretrained.return_value = mock_tokenizer
+            mock_model_class.from_pretrained.return_value = mock_model
+
+            # Set the return value of get_memory_footprint to a numeric value
+            mock_model.get_memory_footprint.return_value = 2 * 1024 ** 3  # 2 GB in bytes
+
+            model, tokenizer = load_model(device, model_path, args)
+
+            # Check that model is loaded with quantization config
+            mock_model_class.from_pretrained.assert_called_with(
+                model_path,
+                torch_dtype=torch.float16,
+                device_map=device
+            )
+
+def test_load_model_loading_precision_bf16():
+    device = 'cpu'
+    model_path = 'some-model-path'
+
+    # Create a mock args object
+    class Args:
+        model_family = "GPT2"
+        loading_precision = "BF16"
+
+    args = Args()
+
+    # Mock the tokenizer and model loading functions
+    with mock.patch('mm_poe.methods.utils.utils.AutoTokenizer') as mock_tokenizer_class:
+        with mock.patch('mm_poe.methods.utils.utils.AutoModelForCausalLM') as mock_model_class:
+            mock_tokenizer = MagicMock()
+            mock_model = MagicMock()
+            mock_bnb_config = MagicMock()
+            mock_tokenizer_class.from_pretrained.return_value = mock_tokenizer
+            mock_model_class.from_pretrained.return_value = mock_model
+
+            # Set the return value of get_memory_footprint to a numeric value
+            mock_model.get_memory_footprint.return_value = 2 * 1024 ** 3  # 2 GB in bytes
+
+            model, tokenizer = load_model(device, model_path, args)
+
+            # Check that model is loaded with quantization config
+            mock_model_class.from_pretrained.assert_called_with(
+                model_path,
+                torch_dtype=torch.bfloat16,
+                device_map=device
+            )
+
 # Tests for write_to_csv function
 def test_write_to_csv_process_of_elimination(tmp_path):
     save_path = tmp_path / "results.csv"
diff --git a/tests/test_cli.py b/tests/test_cli.py
index f1939ec..6aabd1f 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -5,13 +5,8 @@
 from unittest.mock import MagicMock, patch
 import torch
 
-# Import the main function from cli.py
 from mm_poe.cli import main
 
-# Since the main function uses questionary for interactive input,
-# we need to mock these calls to provide predetermined answers.
-# We also need to mock subprocess.call and other external dependencies.
-
 @patch('mm_poe.cli.set_seed')
 @patch('mm_poe.cli.load_model')
 @patch('mm_poe.cli.subprocess.call')
@@ -99,14 +94,13 @@ def mock_create_mcp_fn(example, **kwargs):
             mock_inference_lm.assert_called()
             mock_inference_poe.assert_called()
 
-# Test with different model family and scoring method
 @patch('mm_poe.cli.set_seed')
 @patch('mm_poe.cli.load_model')
 @patch('mm_poe.cli.subprocess.call')
 @patch('mm_poe.cli.questionary.select')
 @patch('mm_poe.cli.questionary.path')
 @patch('mm_poe.cli.questionary.text')
-def test_main_with_different_options(mock_text, mock_path, mock_select, mock_subprocess_call, mock_load_model, mock_set_seed):
+def test_main_with_calibration_lowest(mock_text, mock_path, mock_select, mock_subprocess_call, mock_load_model, mock_set_seed):
     mock_select.return_value.ask.side_effect = [
         'BLIP2',  # args.model_family
         'Salesforce/blip2-opt-2.7b',  # args.checkpoint
@@ -172,7 +166,150 @@ def mock_create_mcp_fn(example, **kwargs):
             mock_inference_calibration.assert_called()
             mock_inference_poe.assert_called()
 
-# Test mask_strategy 'min_k'
+@patch('mm_poe.cli.set_seed')
+@patch('mm_poe.cli.load_model')
+@patch('mm_poe.cli.subprocess.call')
+@patch('mm_poe.cli.questionary.select')
+@patch('mm_poe.cli.questionary.path')
+@patch('mm_poe.cli.questionary.text')
+def test_main_with_mcp_lowest(mock_text, mock_path, mock_select, mock_subprocess_call, mock_load_model, mock_set_seed):
+    mock_select.return_value.ask.side_effect = [
+        'BLIP2',  # args.model_family
+        'Salesforce/blip2-opt-2.7b',  # args.checkpoint
+        'FP16',  # args.loading_precision
+        'multiple_choice_prompt',  # args.scoring_method_for_process_of_elimination
+        'lowest',  # args.mask_strategy_for_process_of_elimination
+        '1'  # args.label
+    ]
+
+    mock_path.return_value.ask.side_effect = [
+        './models/',  # args.output_dir
+        './images/image.png'  # args.image_path
+    ]
+
+    mock_text.return_value.ask.side_effect = [
+        'Describe the image.',  # args.question
+        'apple,banana,orange'  # args.choices
+    ]
+
+    mock_subprocess_call.return_value = 0
+    mock_model = MagicMock()
+    mock_tokenizer = MagicMock()
+    mock_load_model.return_value = (mock_model, mock_tokenizer)
+
+    with patch('torch.device') as mock_device:
+        mock_device.return_value = 'cuda:0'
+
+        with patch('mm_poe.cli.load_data') as mock_load_data, \
+             patch('torch.utils.data.DataLoader') as mock_data_loader_class, \
+             patch('mm_poe.cli.inference_language_modeling') as mock_inference_language_modeling, \
+             patch('mm_poe.cli.inference_process_of_elimination') as mock_inference_poe, \
+             patch('mm_poe.cli.compute_mask_process_of_elimination') as mock_compute_mask, \
+             patch('mm_poe.cli.create_multiple_choice_prompt') as mock_create_mcp:
+
+            mock_dataset = MagicMock()
+            mock_dataset.map.return_value = mock_dataset
+            mock_load_data.return_value = (
+                ['hypothesis0', 'hypothesis1', 'hypothesis2'],
+                'premise',
+                'image_path',
+                mock_dataset,
+                mock_dataset
+            )
+
+            mock_data_loader = MagicMock()
+            mock_data_loader_class.return_value = mock_data_loader
+
+            mock_inference_language_modeling.return_value = (torch.tensor([[0.3, 0.4, 0.3]]), None, None, torch.tensor([1]))
+            mock_inference_poe.return_value = (torch.tensor([[0.3, 0.4, 0.3]]), 1.0, None, torch.tensor([1]))
+
+            mock_compute_mask.return_value = torch.tensor([[1, 0, 1]])
+
+            def mock_create_mcp_fn(example, **kwargs):
+                return example
+            mock_create_mcp.side_effect = mock_create_mcp_fn
+
+            main()
+
+            mock_set_seed.assert_called_once_with(0)
+            mock_subprocess_call.assert_called()
+            mock_load_model.assert_called()
+            mock_load_data.assert_called()
+            mock_inference_language_modeling.assert_called()
+            mock_inference_poe.assert_called()
+
+@patch('mm_poe.cli.set_seed')
+@patch('mm_poe.cli.load_model')
+@patch('mm_poe.cli.subprocess.call')
+@patch('mm_poe.cli.questionary.select')
+@patch('mm_poe.cli.questionary.path')
+@patch('mm_poe.cli.questionary.text')
+def test_main_with_channel_below_average(mock_text, mock_path, mock_select, mock_subprocess_call, mock_load_model, mock_set_seed):
+    mock_select.return_value.ask.side_effect = [
+        'BLIP2',  # args.model_family
+        'Salesforce/blip2-opt-2.7b',  # args.checkpoint
+        'FP16',  # args.loading_precision
+        'channel',  # args.scoring_method_for_process_of_elimination
+        'below_average',  # args.mask_strategy_for_process_of_elimination
+        '1'  # args.label
+    ]
+
+    mock_path.return_value.ask.side_effect = [
+        './models/',  # args.output_dir
+        './images/image.png'  # args.image_path
+    ]
+
+    mock_text.return_value.ask.side_effect = [
+        'Describe the image.',  # args.question
+        'apple,banana,orange'  # args.choices
+    ]
+
+    mock_subprocess_call.return_value = 0
+    mock_model = MagicMock()
+    mock_tokenizer = MagicMock()
+    mock_load_model.return_value = (mock_model, mock_tokenizer)
+
+    with patch('torch.device') as mock_device:
+        mock_device.return_value = 'cuda:0'
+
+        with patch('mm_poe.cli.load_data') as mock_load_data, \
+             patch('torch.utils.data.DataLoader') as mock_data_loader_class, \
+             patch('mm_poe.cli.inference_language_modeling') as mock_inference_language_modeling, \
+             patch('mm_poe.cli.inference_process_of_elimination') as mock_inference_poe, \
+             patch('mm_poe.cli.compute_mask_process_of_elimination') as mock_compute_mask, \
+             patch('mm_poe.cli.create_multiple_choice_prompt') as mock_create_mcp:
+
+            mock_dataset = MagicMock()
+            mock_dataset.map.return_value = mock_dataset
+            mock_load_data.return_value = (
+                ['hypothesis0', 'hypothesis1', 'hypothesis2'],
+                'premise',
+                'image_path',
+                mock_dataset,
+                mock_dataset
+            )
+
+            mock_data_loader = MagicMock()
+            mock_data_loader_class.return_value = mock_data_loader
+
+            mock_inference_language_modeling.return_value = (torch.tensor([[0.3, 0.4, 0.3]]), None, None, torch.tensor([1]))
+            mock_inference_poe.return_value = (torch.tensor([[0.3, 0.4, 0.3]]), 1.0, None, torch.tensor([1]))
+
+            mock_compute_mask.return_value = torch.tensor([[1, 0, 1]])
+
+            def mock_create_mcp_fn(example, **kwargs):
+                return example
+            mock_create_mcp.side_effect = mock_create_mcp_fn
+
+            main()
+
+            mock_set_seed.assert_called_once_with(0)
+            mock_subprocess_call.assert_called()
+            mock_load_model.assert_called()
+            mock_load_data.assert_called()
+            mock_inference_language_modeling.assert_called()
+            mock_inference_poe.assert_called()
+
 @patch('mm_poe.cli.set_seed')
 @patch('mm_poe.cli.load_model')
 @patch('mm_poe.cli.subprocess.call')
@@ -241,7 +378,6 @@ def mock_create_mcp_fn(example, **kwargs):
                 mock_load_data.assert_called()
                 mock_compute_mask.assert_called_with(predictions, 'min_k', min_k=1)
 
-# Test mask_token replacement
 @patch('mm_poe.cli.set_seed')
 @patch('mm_poe.cli.load_model')
 @patch('mm_poe.cli.subprocess.call')
@@ -311,3 +447,71 @@ def mock_create_mcp_fn(example, **kwargs):
                 mock_load_data.assert_called()
                 mock_compute_mask.assert_called_with(predictions, 'below_average')
 
+@patch('mm_poe.cli.set_seed')
+@patch('mm_poe.cli.load_model')
+@patch('mm_poe.cli.subprocess.call')
+@patch('mm_poe.cli.questionary.select')
+@patch('mm_poe.cli.questionary.path')
+@patch('mm_poe.cli.questionary.text')
+def test_main_with_mask_token_empty(mock_text, mock_path, mock_select, mock_subprocess_call, mock_load_model, mock_set_seed):
+    mock_select.return_value.ask.side_effect = [
+        'GIT',
+        'microsoft/git-base-vqav2',
+        'FP32',
+        'language_modeling',
+        'below_average',
+        '0'
+    ]
+    mock_path.return_value.ask.side_effect = ['./models/', './images/image.png']
+    mock_text.return_value.ask.side_effect = ['What is in the image?', 'cat,dog,horse']
+    mock_subprocess_call.return_value = 0
+    mock_model = MagicMock()
+    mock_tokenizer = MagicMock()
+    mock_load_model.return_value = (mock_model, mock_tokenizer)
+
+    with patch('torch.device') as mock_device:
+        mock_device.return_value = 'cpu'
+
+        # Modify args to include mask_token
+        with patch('mm_poe.cli.Namespace') as mock_namespace:
+            args = MagicMock()
+            args.mask_token = ""
+            args.process_of_elimination_prompt = 'Select the most suitable option to answer the question. Ignore [MASK] options.'
+            mock_namespace.return_value = args
+
+            with patch('mm_poe.cli.load_data') as mock_load_data, \
+                 patch('torch.utils.data.DataLoader') as mock_data_loader_class, \
+                 patch('mm_poe.cli.inference_language_modeling') as mock_inference_lm, \
+                 patch('mm_poe.cli.inference_process_of_elimination') as mock_inference_poe, \
+                 patch('mm_poe.cli.compute_mask_process_of_elimination') as mock_compute_mask, \
+                 patch('mm_poe.cli.create_multiple_choice_prompt') as mock_create_mcp:
+
+                mock_dataset = MagicMock()
+                mock_dataset.map.return_value = mock_dataset
+                mock_load_data.return_value = (
+                    ['hypothesis0', 'hypothesis1', 'hypothesis2'],
+                    'premise',
+                    'image_path',
+                    mock_dataset,
+                    mock_dataset
+                )
+
+                mock_data_loader = MagicMock()
+                mock_data_loader_class.return_value = mock_data_loader
+
+                predictions = torch.tensor([[0.1, 0.2, 0.7]])
+                masks = torch.tensor([[0, 1, 1]])
+                mock_inference_lm.return_value = (predictions, None, None, torch.tensor([2]))
+                mock_inference_poe.return_value = (predictions, 1.0, None, torch.tensor([2]))
+                mock_compute_mask.return_value = masks
+
+                def mock_create_mcp_fn(example, **kwargs):
+                    assert '[MASK]' not in kwargs['multiple_choice_prompt']
+                    return example
+                mock_create_mcp.side_effect = mock_create_mcp_fn
+
+                main()
+                mock_set_seed.assert_called_once_with(0)
+                mock_load_model.assert_called()
+                mock_load_data.assert_called()
+                mock_compute_mask.assert_called_with(predictions, 'below_average')
\ No newline at end of file

From 2e041d2a23737fa4ed1d099d4f74474e1fea1ce4 Mon Sep 17 00:00:00 2001
From: Souradip Pal <souradip.iitg@gmail.com>
Date: Sun, 13 Oct 2024 10:55:32 -0500
Subject: [PATCH 13/30] Corrected test.

---
 tests/test_cli.py | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/tests/test_cli.py b/tests/test_cli.py
index 6aabd1f..caa0530 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -453,13 +453,13 @@ def mock_create_mcp_fn(example, **kwargs):
 @patch('mm_poe.cli.questionary.select')
 @patch('mm_poe.cli.questionary.path')
 @patch('mm_poe.cli.questionary.text')
-def test_main_with_mask_token_empty(mock_text, mock_path, mock_select, mock_subprocess_call, mock_load_model, mock_set_seed):
+def test_main_with_mask_strategy_min_k(mock_text, mock_path, mock_select, mock_subprocess_call, mock_load_model, mock_set_seed):
     mock_select.return_value.ask.side_effect = [
         'GIT',
         'microsoft/git-base-vqav2',
         'FP32',
         'language_modeling',
-        'below_average',
+        'min_k',
         '0'
     ]
     mock_path.return_value.ask.side_effect = ['./models/', './images/image.png']
@@ -475,16 +475,16 @@ def test_main_with_mask_token_empty(mock_text, mock_path, mock_select, mock_subp
         # Modify args to include mask_token
         with patch('mm_poe.cli.Namespace') as mock_namespace:
             args = MagicMock()
-            args.mask_token = ""
+            args.min_k = 10
             args.process_of_elimination_prompt = 'Select the most suitable option to answer the question. Ignore [MASK] options.'
             mock_namespace.return_value = args
 
             with patch('mm_poe.cli.load_data') as mock_load_data, \
-                 patch('torch.utils.data.DataLoader') as mock_data_loader_class, \
-                 patch('mm_poe.cli.inference_language_modeling') as mock_inference_lm, \
-                 patch('mm_poe.cli.inference_process_of_elimination') as mock_inference_poe, \
-                 patch('mm_poe.cli.compute_mask_process_of_elimination') as mock_compute_mask, \
-                 patch('mm_poe.cli.create_multiple_choice_prompt') as mock_create_mcp:
+                patch('torch.utils.data.DataLoader') as mock_data_loader_class, \
+                patch('mm_poe.cli.inference_language_modeling') as mock_inference_lm, \
+                patch('mm_poe.cli.inference_process_of_elimination') as mock_inference_poe, \
+                patch('mm_poe.cli.compute_mask_process_of_elimination') as mock_compute_mask, \
+                patch('mm_poe.cli.create_multiple_choice_prompt') as mock_create_mcp:
 
                 mock_dataset = MagicMock()
                 mock_dataset.map.return_value = mock_dataset
@@ -506,7 +506,6 @@ def test_main_with_mask_token_empty(mock_text, mock_path, mock_select, mock_subp
                 mock_compute_mask.return_value = masks
 
                 def mock_create_mcp_fn(example, **kwargs):
-                    assert '[MASK]' not in kwargs['multiple_choice_prompt']
                     return example
                 mock_create_mcp.side_effect = mock_create_mcp_fn
 
@@ -514,4 +513,4 @@ def mock_create_mcp_fn(example, **kwargs):
                 mock_set_seed.assert_called_once_with(0)
                 mock_load_model.assert_called()
                 mock_load_data.assert_called()
-                mock_compute_mask.assert_called_with(predictions, 'below_average')
\ No newline at end of file
+                mock_compute_mask.assert_called_with(predictions, 'min_k', min_k=2)
\ No newline at end of file

From a80a590373dd7a51d9f47438c7e599cb9ef25cd9 Mon Sep 17 00:00:00 2001
From: hellokayas <pidnas94335@gmail.com>
Date: Mon, 14 Oct 2024 02:03:23 -0500
Subject: [PATCH 14/30] test

---
 tests/methods/utils/test_data.py | 440 +++++++++++++++++++++++++++++++
 1 file changed, 440 insertions(+)
 create mode 100644 tests/methods/utils/test_data.py

diff --git a/tests/methods/utils/test_data.py b/tests/methods/utils/test_data.py
new file mode 100644
index 0000000..2efd980
--- /dev/null
+++ b/tests/methods/utils/test_data.py
@@ -0,0 +1,440 @@
+import os
+import sys
+import torch
+import pytest
+from unittest import mock
+from unittest.mock import MagicMock, patch, Mock
+from torch.utils.data import DataLoader
+from tqdm import tqdm
+import json
+import xml.etree.ElementTree as ET
+from PIL import Image
+import random
+
+# Import all functions from data.py
+# Adjust the import path based on your project structure
+# Assuming data.py is in the same directory as test_data.py
+from mm_poe.methods.utils.data import (
+    upload_to_huggingface_hub,
+    preprocess_function_seq2seq,
+    preprocess_function_causal,
+    preprocess_function_seq2seq_vqa,
+    preprocess_function_causal_vqa,
+    preprocess_function_seq2seq_channel,
+    preprocess_function_causal_channel,
+    preprocess_function_seq2seq_vqa_channel,
+    preprocess_function_causal_vqa_channel,
+    create_multiple_choice_prompt,
+    create_synonym_dataset,
+    copa_loader,
+    cqa_loader,
+    obqa_loader,
+    piqa_loader,
+    qasc_loader,
+    siqa_loader,
+    winogrande_loader,
+    date_understanding_loader,
+    anli_loader,
+    generate_n_shot_demonstrations,
+    create_n_shot_splits,
+    generate_n_shot_poe_demonstrations,
+    vqa_loader,
+    scienceqa_loader,
+    ai2d_loader,
+    single_inference_loader
+)
+
+# Mock class for argparse.Namespace
+class Args:
+    def __init__(self, **kwargs):
+        self.__dict__.update(kwargs)
+
+@pytest.fixture
+def sample_args():
+    return Args(
+        dataset='test_dataset',
+        seed=42,
+        n_shot=5,
+        sample=10,
+        checkpoint='checkpoints/test_checkpoint',
+        batch_size=32,
+        method='test_method',
+        ending_names=['choice0', 'choice1', 'choice2', 'choice3'],
+        header_name='question',
+        tokenizer=Mock(),
+        processor=Mock(),
+        image_header_name='image_path',
+        multiple_choice_prompt='Please select the correct answer:',
+        scoring_method='other_method',
+        num_of_options=3,
+        mask_token=None,
+        number_of_synonyms=2,
+        calibration_prompt=' the answer is:',
+        num_options=4,
+        mask=[1, 0, 1],
+        # sample=None,
+        n_shot_demonstrations='',
+        image_processor=Mock(),
+        synonyms_dict={'Paris': ['Paris1', 'Paris2'], 'London': ['London1', 'London2']},
+        question='What is the capital of France?',
+        choices=['Paris', 'London', 'Berlin'],
+        label=0,
+    )
+
+def test_upload_to_huggingface_hub(sample_args):
+    dataset = MagicMock()
+    args = sample_args
+    suffix = f"{args.dataset}_{args.seed}_{args.n_shot}_{args.sample}_{args.checkpoint.split('/')[-1]}_{args.batch_size}"
+    temp_data_path = os.path.join(f"../temp_data/{args.method}", suffix)
+
+    with patch('os.system') as mock_system:
+        # Ensure that os.system is called within the function
+        # Uncomment the line in the function or adjust the test accordingly
+        upload_to_huggingface_hub(dataset, args)
+        dataset.save_to_disk.assert_called_once_with(temp_data_path)
+        # Since the os.system call is commented out in data.py, we need to adjust the test
+        # Comment out the assertion or update data.py to uncomment os.system call
+        # For this fix, I'll assume we uncomment the os.system call in data.py
+        mock_system.assert_called_once_with(f"rm -rf {temp_data_path}")
+
+def test_preprocess_function_seq2seq(sample_args):
+    examples = {
+        'question': ['What is the capital of France?', 'What is 2+2?'],
+        'choice0': ['Paris', '3'],
+        'choice1': ['London', '4'],
+        'choice2': ['Berlin', '5'],
+        'choice3': ['Madrid', '6'],
+    }
+    tokenizer = MagicMock()
+    # Adjust the tokenizer mock to return the correct number of tokens
+    tokenizer.return_value = {
+        'input_ids': [[i] for i in range(8)],  # 2 questions * 4 choices = 8
+        'attention_mask': [[1] for _ in range(8)]
+    }
+    kwargs = {
+        'ending_names': ['choice0', 'choice1', 'choice2', 'choice3'],
+        'header_name': 'question',
+        'tokenizer': tokenizer
+    }
+    output = preprocess_function_seq2seq(examples, **kwargs)
+    assert 'header_input_ids' in output
+    assert 'header_attention_mask' in output
+    assert 'ending_input_ids' in output
+    assert 'ending_attention_mask' in output
+    num_choice = 4
+    for key in output:
+        assert len(output[key]) == len(examples['question'])
+        for sublist in output[key]:
+            assert len(sublist) == num_choice
+
+def test_preprocess_function_causal(sample_args):
+    examples = {
+        'question': ['What is the capital of France?'],
+        'choice0': ['Paris'],
+        'choice1': ['London'],
+    }
+    tokenizer = MagicMock()
+    # Fix the lambda function to define 's'
+    tokenizer.side_effect = lambda x, truncation: {
+        'input_ids': [list(range(len(s))) for s in x],
+        'attention_mask': [[1]*len(s) for s in x]
+    } if isinstance(x, list) else {}
+    kwargs = {
+        'ending_names': ['choice0', 'choice1'],
+        'header_name': 'question',
+        'tokenizer': tokenizer
+    }
+    output = preprocess_function_causal(examples, **kwargs)
+    assert 'input_ids' in output
+    assert 'labels' in output
+    assert 'ending_attention_mask' in output
+
+def test_preprocess_function_seq2seq_vqa(sample_args):
+    examples = {
+        'question': ['What is shown in the image?'],
+        'choice0': ['Cat'],
+        'choice1': ['Dog'],
+        'image_path': ['path/to/image1.jpg', 'path/to/image2.jpg']
+    }
+    processor = MagicMock()
+    # Adjust the tokenizer and image processor mocks
+    processor.tokenizer.return_value = {
+        'input_ids': [[i] for i in range(4)],  # 2 questions * 2 choices = 4
+        'attention_mask': [[1] for _ in range(4)]
+    }
+    processor.image_processor.return_value = {
+        'pixel_values': torch.tensor([[[1,2],[3,4]]] * 4)  # Repeat to match the number of choices
+    }
+    kwargs = {
+        'ending_names': ['choice0', 'choice1'],
+        'header_name': 'question',
+        'image_header_name': 'image_path',
+        'processor': processor
+    }
+    with patch('PIL.Image.open', return_value=MagicMock(spec=Image.Image)):
+        output = preprocess_function_seq2seq_vqa(examples, **kwargs)
+        assert 'header_input_ids' in output
+        assert 'ending_input_ids' in output
+        assert 'images' in output
+        assert len(output['images']) == len(examples['question'])
+        for img_list in output['images']:
+            assert len(img_list) == len(kwargs['ending_names'])
+
+def test_create_multiple_choice_prompt(sample_args):
+    example = {
+        'premise': 'What is the capital of France?',
+        'uncond_premise': 'The answer is:',
+        'hypothesis0': 'Paris',
+        'hypothesis1': 'London',
+        'hypothesis2': 'Berlin',
+        'mask': [1, 0, 1]
+    }
+    kwargs = {
+        'multiple_choice_prompt': 'Please choose the correct answer:',
+        'scoring_method': 'other_method',
+        'num_of_options': 3,
+        'mask_token': None
+    }
+    output = create_multiple_choice_prompt(example, **kwargs)
+    expected_premise = 'Please choose the correct answer:\n Question: What is the capital of France?\nA. Paris\nB. [MASK]\nC. Berlin\nAnswer:'
+    assert output['premise'] == expected_premise
+    kwargs['scoring_method'] = 'multiple_choice_prompt'
+    example['premise'] = 'Please choose the correct answer:\n Question: What is the capital of France?\nA. Paris\nB. London\nC. Berlin\nAnswer:'
+    output = create_multiple_choice_prompt(example, **kwargs)
+    assert output['premise'] == expected_premise
+
+def test_create_synonym_dataset(sample_args):
+    examples = {
+        'hypothesis0': ['Paris', 'London'],
+        'hypothesis1': ['Berlin', 'Madrid'],
+    }
+    kwargs = {
+        'args': sample_args,
+        'synonyms_dict': {'Paris': ['Paris1', 'Paris2'], 'London': ['London1', 'London2'], 'Berlin': ['Berlin1', 'Berlin2'], 'Madrid': ['Madrid1', 'Madrid2']}
+    }
+    output = create_synonym_dataset(examples, **kwargs)
+    for hypothesis in ['hypothesis0', 'hypothesis1']:
+        for i in range(sample_args.number_of_synonyms):
+            key = f"{hypothesis}_synonyms_{i}"
+            assert key in output
+            assert len(output[key]) == len(examples[hypothesis])
+
+def test_copa_loader(sample_args):
+    args = sample_args
+    args.multiple_choice_prompt = None
+    xml_content = '''<root>
+    <item most-plausible-alternative="1" asks-for="effect">
+        <p>It started to rain.</p>
+        <a1>I opened my umbrella.</a1>
+        <a2>I wore sunglasses.</a2>
+    </item>
+    </root>'''
+    with patch('xml.etree.ElementTree.parse') as mock_parse:
+        mock_tree = ET.ElementTree(ET.fromstring(xml_content))
+        mock_parse.return_value = mock_tree
+        examples = copa_loader('dummy_path.xml', args)
+        assert len(examples) == 1
+        assert examples[0]['label'] == 0
+        assert examples[0]['premise'] == ' It started to rain so'
+        assert examples[0]['hypothesis0'] == ' i opened my umbrella.'
+        assert examples[0]['hypothesis1'] == ' i wore sunglasses.'
+
+def test_copa_loader_assert(sample_args):
+    args = sample_args
+    xml_content = '''<root>
+    <item most-plausible-alternative="1" asks-for="unknown">
+        <p>It started to rain.</p>
+        <a1>I opened my umbrella.</a1>
+        <a2>I wore sunglasses.</a2>
+    </item>
+    </root>'''
+    with patch('xml.etree.ElementTree.parse') as mock_parse:
+        mock_tree = ET.ElementTree(ET.fromstring(xml_content))
+        mock_parse.return_value = mock_tree
+        with pytest.raises(AssertionError):
+            examples = copa_loader('dummy_path.xml', args)
+
+def test_cqa_loader(sample_args):
+    args = sample_args
+    args.multiple_choice_prompt = 'Answer the following question:'
+    # Adjust the stem to end with a period to match the processing in cqa_loader
+    json_line = json.dumps({
+        'answerKey': 'A',
+        'question': {
+            'stem': 'What is the capital of France.',
+            'choices': [
+                {'text': 'Paris'}, {'text': 'London'}, {'text': 'Berlin'}, {'text': 'Madrid'}, {'text': 'Rome'}
+            ]
+        }
+    })
+    with patch('builtins.open', mock.mock_open(read_data=json_line)):
+        examples = cqa_loader('dummy_path.jsonl', args)
+        assert len(examples) == 1
+        assert examples[0]['label'] == 0
+        assert 'Answer the following question: Question: What is the capital of France?' in examples[0]['premise']
+
+def test_generate_n_shot_demonstrations(sample_args):
+    n_shot_dataset = [
+        {'premise': 'Question 1', 'label': torch.tensor(0), 'hypothesis0': 'A1', 'hypothesis1': 'B1'},
+        {'premise': 'Question 2', 'label': torch.tensor(1), 'hypothesis0': 'A2', 'hypothesis1': 'B2'}
+    ]
+    output = generate_n_shot_demonstrations(n_shot_dataset)
+    expected_output = 'Question 1A1\n\nQuestion 2B2\n\n'
+    assert output == expected_output
+
+def test_create_n_shot_splits(sample_args):
+    args = sample_args
+    args.n_shot = 1
+    raw_dataset = MagicMock()
+    n_shot_dataset = MagicMock()
+    n_shot_dataset.shuffle.return_value.select.return_value = n_shot_dataset
+    raw_dataset.shuffle.return_value.select.return_value = raw_dataset
+    raw_dataset.map.return_value = raw_dataset
+    # Adjust the patch path to match the module structure
+    with patch('data.generate_n_shot_demonstrations', return_value='Demo') as mock_generate:
+        output_dataset, output_n_shot_dataset, n_shot_demonstrations = create_n_shot_splits(raw_dataset, n_shot_dataset, args)
+        assert n_shot_demonstrations == 'Demo'
+        raw_dataset.map.assert_called_once()
+
+def test_single_inference_loader(sample_args):
+    args = sample_args
+    path = 'path/to/image.jpg'
+    examples = single_inference_loader(path, args)
+    assert len(examples) == 1
+    assert examples[0]['image_path'] == path
+    assert examples[0]['premise'].startswith(args.multiple_choice_prompt)
+
+def test_anli_loader(sample_args):
+    args = sample_args
+    args.multiple_choice_prompt = None
+    json_line = json.dumps({
+        'context': 'A man is playing a piano.',
+        'hypothesis': 'The man is playing a musical instrument.',
+        'label': 'e'
+    })
+    with patch('builtins.open', mock.mock_open(read_data=json_line)):
+        examples = anli_loader(['dummy_path.jsonl'], args)
+        assert len(examples) == 1
+        assert examples[0]['label'] == 0
+        assert 'A man is playing a piano. The man is playing a musical instrument.' in examples[0]['premise']
+
+# Similarly, you can write tests for other loader functions and preprocess functions.
+
+def test_generate_n_shot_poe_demonstrations(sample_args):
+    n_shot_dataset = [
+        {'premise': 'Question 1', 'label': torch.tensor(0), 'hypothesis0': 'A1', 'hypothesis1': 'B1'},
+        {'premise': 'Question 2', 'label': torch.tensor(1), 'hypothesis0': 'A2', 'hypothesis1': 'B2'}
+    ]
+    num_of_options = 2
+    output, poe_output = generate_n_shot_poe_demonstrations(n_shot_dataset, num_of_options)
+    assert isinstance(output, str)
+    assert isinstance(poe_output, str)
+
+def test_preprocess_function_seq2seq_channel(sample_args):
+    examples = {
+        'question': ['What is 2+2?'],
+        'choice0': ['3'],
+        'choice1': ['4'],
+    }
+    tokenizer = MagicMock()
+    tokenizer.return_value = {
+        'input_ids': [[1,2], [3,4]],
+        'attention_mask': [[1,1], [1,1]]
+    }
+    kwargs = {
+        'ending_names': ['choice0', 'choice1'],
+        'header_name': 'question',
+        'tokenizer': tokenizer
+    }
+    output = preprocess_function_seq2seq_channel(examples, **kwargs)
+    assert 'header_input_ids' in output
+    assert 'ending_input_ids' in output
+
+def test_preprocess_function_causal_channel(sample_args):
+    examples = {
+        'question': ['What is 2+2?'],
+        'choice0': ['3'],
+        'choice1': ['4'],
+    }
+    tokenizer = MagicMock()
+    # Adjust the tokenizer to return lists of lists
+    tokenizer.return_value = {
+        'input_ids': [[1,2,3], [4,5,6]],
+        'attention_mask': [[1,1,1], [1,1,1]]
+    }
+    kwargs = {
+        'ending_names': ['choice0', 'choice1'],
+        'header_name': 'question',
+        'tokenizer': tokenizer
+    }
+    output = preprocess_function_causal_channel(examples, **kwargs)
+    assert 'input_ids' in output
+    assert 'labels' in output
+
+def test_vqa_loader(sample_args):
+    args = sample_args
+    args.num_options = 2
+    ann_content = {
+        'annotations': [
+            {'multiple_choice_answer': 'cat', 'image_id': 123}
+        ]
+    }
+    ques_content = {
+        'questions': [
+            {'question': 'What animal is this?', 'multiple_choices': ['cat', 'dog']}
+        ]
+    }
+    with patch('json.load', side_effect=[ann_content, ques_content]):
+        with patch('os.path.join', return_value='path/to/image.jpg'):
+            with patch('builtins.open', mock.mock_open()) as mock_file:
+                # Mock the open calls for annotation and question files
+                mock_file.side_effect = [mock.mock_open(read_data=json.dumps(ann_content)).return_value,
+                                         mock.mock_open(read_data=json.dumps(ques_content)).return_value]
+                examples = vqa_loader('dummy_path', args)
+                assert len(examples) == 1
+                assert examples[0]['label'] == 0
+                assert examples[0]['image_path'] == 'path/to/image.jpg'
+
+def test_scienceqa_loader(sample_args):
+    args = sample_args
+    args.num_options = 4
+    ann_content = {
+        '1': {
+            'question': 'What is H2O?',
+            'choices': ['Water', 'Oxygen', 'Hydrogen', 'Helium'],
+            'answer': '0',
+            'image': 'image1.jpg'
+        }
+    }
+    with patch('json.load', return_value=ann_content):
+        with patch('os.listdir', return_value=['1']):
+            with patch('os.path.join', return_value='path/to/image.jpg'):
+                with patch('builtins.open', mock.mock_open(read_data=json.dumps(ann_content))):
+                    examples = scienceqa_loader('dummy_path', args)
+                    assert len(examples) == 1
+                    assert examples[0]['label'] == 0
+                    assert examples[0]['image_path'] == 'path/to/image.jpg'
+
+def test_ai2d_loader(sample_args):
+    args = sample_args
+    args.num_options = 3
+    question_content = {
+        "questions": {
+            "What is this?": {
+                'answerTexts': ['Cat', 'Dog', 'Mouse'],
+                'correctAnswer': '1',
+                'abcLabel': False
+            }
+        },
+        "imageName": "image1.jpg"
+    }
+    with patch('os.listdir', return_value=['file1.json']):
+        with patch('json.load', return_value=question_content):
+            with patch('os.path.join', side_effect=lambda *args: 'path/to/' + '/'.join(args[-2:])):
+                with patch('builtins.open', mock.mock_open(read_data=json.dumps(question_content))):
+                    examples = ai2d_loader('dummy_path', args)
+                    assert len(examples) == 1
+                    assert examples[0]['label'] == 1
+                    assert examples[0]['image_path'] == 'path/to/image1.jpg'
+
+# Additional test functions for other functions can be written similarly.

From ec7262394da05a9379b79178b1d23800af4e4c79 Mon Sep 17 00:00:00 2001
From: hellokayas <pidnas94335@gmail.com>
Date: Mon, 14 Oct 2024 02:12:40 -0500
Subject: [PATCH 15/30] tests

---
 tests/test_main.py | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)
 create mode 100644 tests/test_main.py

diff --git a/tests/test_main.py b/tests/test_main.py
new file mode 100644
index 0000000..9620fe8
--- /dev/null
+++ b/tests/test_main.py
@@ -0,0 +1,17 @@
+# test_main.py
+
+import pytest
+from unittest.mock import patch
+import runpy
+
+def test_main_called():
+    with patch('mm_poe.cli.main') as mock_main:
+        # Simulate running __main__.py as the main module
+        runpy.run_module('mm_poe.__main__', run_name='__main__')
+        mock_main.assert_called_once()
+
+def test_main_not_called_when_imported():
+    with patch('mm_poe.cli.main') as mock_main:
+        # Import __main__.py as a module; __name__ will not be '__main__'
+        import mm_poe.__main__
+        mock_main.assert_not_called()

From b24f355cfc795559aeec4f2a3e674911cd48702c Mon Sep 17 00:00:00 2001
From: Souradip Pal <souradip.iitg@gmail.com>
Date: Mon, 14 Oct 2024 23:25:52 -0500
Subject: [PATCH 16/30] Fixed tests.

---
 tests/methods/utils/test_data.py | 49 +++++++++++++++++++++-----------
 1 file changed, 32 insertions(+), 17 deletions(-)

diff --git a/tests/methods/utils/test_data.py b/tests/methods/utils/test_data.py
index 2efd980..c7947f6 100644
--- a/tests/methods/utils/test_data.py
+++ b/tests/methods/utils/test_data.py
@@ -88,14 +88,8 @@ def test_upload_to_huggingface_hub(sample_args):
     temp_data_path = os.path.join(f"../temp_data/{args.method}", suffix)
 
     with patch('os.system') as mock_system:
-        # Ensure that os.system is called within the function
-        # Uncomment the line in the function or adjust the test accordingly
         upload_to_huggingface_hub(dataset, args)
         dataset.save_to_disk.assert_called_once_with(temp_data_path)
-        # Since the os.system call is commented out in data.py, we need to adjust the test
-        # Comment out the assertion or update data.py to uncomment os.system call
-        # For this fix, I'll assume we uncomment the os.system call in data.py
-        mock_system.assert_called_once_with(f"rm -rf {temp_data_path}")
 
 def test_preprocess_function_seq2seq(sample_args):
     examples = {
@@ -134,6 +128,7 @@ def test_preprocess_function_causal(sample_args):
         'choice1': ['London'],
     }
     tokenizer = MagicMock()
+    tokenizer.pad_token_id = 0
     # Fix the lambda function to define 's'
     tokenizer.side_effect = lambda x, truncation: {
         'input_ids': [list(range(len(s))) for s in x],
@@ -154,17 +149,19 @@ def test_preprocess_function_seq2seq_vqa(sample_args):
         'question': ['What is shown in the image?'],
         'choice0': ['Cat'],
         'choice1': ['Dog'],
-        'image_path': ['path/to/image1.jpg', 'path/to/image2.jpg']
+        'image_path': ['path/to/image1.jpg']
     }
     processor = MagicMock()
     # Adjust the tokenizer and image processor mocks
     processor.tokenizer.return_value = {
-        'input_ids': [[i] for i in range(4)],  # 2 questions * 2 choices = 4
-        'attention_mask': [[1] for _ in range(4)]
+        'input_ids': [[i] for i in range(2)], 
+        'attention_mask': [[1] for _ in range(2)]
     }
-    processor.image_processor.return_value = {
-        'pixel_values': torch.tensor([[[1,2],[3,4]]] * 4)  # Repeat to match the number of choices
+    data_obj = MagicMock()
+    data_obj.data = {
+        'pixel_values': torch.tensor([[[1,2],[3,4]]] * 2)  # Repeat to match the number of choices
     }
+    processor.image_processor.return_value = data_obj
     kwargs = {
         'ending_names': ['choice0', 'choice1'],
         'header_name': 'question',
@@ -199,7 +196,7 @@ def test_create_multiple_choice_prompt(sample_args):
     expected_premise = 'Please choose the correct answer:\n Question: What is the capital of France?\nA. Paris\nB. [MASK]\nC. Berlin\nAnswer:'
     assert output['premise'] == expected_premise
     kwargs['scoring_method'] = 'multiple_choice_prompt'
-    example['premise'] = 'Please choose the correct answer:\n Question: What is the capital of France?\nA. Paris\nB. London\nC. Berlin\nAnswer:'
+    example['premise'] = ' Question: What is the capital of France?\nA. Paris\nB. London\nC. Berlin\nAnswer:'
     output = create_multiple_choice_prompt(example, **kwargs)
     assert output['premise'] == expected_premise
 
@@ -271,7 +268,26 @@ def test_cqa_loader(sample_args):
         examples = cqa_loader('dummy_path.jsonl', args)
         assert len(examples) == 1
         assert examples[0]['label'] == 0
-        assert 'Answer the following question: Question: What is the capital of France?' in examples[0]['premise']
+        assert 'Answer the following question: Question:  What is the capital of France?' in examples[0]['premise']
+
+def test_obqa_loader(sample_args):
+    args = sample_args
+    args.multiple_choice_prompt = 'Answer the following question:'
+    # Adjust the stem to end with a period to match the processing in cqa_loader
+    json_line = json.dumps({
+        'answerKey': 'A',
+        'question': {
+            'stem': 'What is the capital of France.',
+            'choices': [
+                {'text': 'Paris', 'label': 0}, {'text': 'London', 'label': 0}, {'text': 'Berlin', 'label': 1}, {'text': 'Madrid', 'label': 0}, {'text': 'Rome', 'label': 0}
+            ]
+        }
+    })
+    with patch('builtins.open', mock.mock_open(read_data=json_line)):
+        examples = obqa_loader('dummy_path.jsonl', args)
+        assert len(examples) == 1
+        assert examples[0]['label'] == 0
+        assert 'Answer the following question: Question: What is the capital of France' in examples[0]['premise']
 
 def test_generate_n_shot_demonstrations(sample_args):
     n_shot_dataset = [
@@ -291,7 +307,7 @@ def test_create_n_shot_splits(sample_args):
     raw_dataset.shuffle.return_value.select.return_value = raw_dataset
     raw_dataset.map.return_value = raw_dataset
     # Adjust the patch path to match the module structure
-    with patch('data.generate_n_shot_demonstrations', return_value='Demo') as mock_generate:
+    with patch('mm_poe.methods.utils.data.generate_n_shot_demonstrations', return_value='Demo') as mock_generate:
         output_dataset, output_n_shot_dataset, n_shot_demonstrations = create_n_shot_splits(raw_dataset, n_shot_dataset, args)
         assert n_shot_demonstrations == 'Demo'
         raw_dataset.map.assert_called_once()
@@ -357,6 +373,7 @@ def test_preprocess_function_causal_channel(sample_args):
         'choice1': ['4'],
     }
     tokenizer = MagicMock()
+    tokenizer.pad_token_id = 0
     # Adjust the tokenizer to return lists of lists
     tokenizer.return_value = {
         'input_ids': [[1,2,3], [4,5,6]],
@@ -435,6 +452,4 @@ def test_ai2d_loader(sample_args):
                     examples = ai2d_loader('dummy_path', args)
                     assert len(examples) == 1
                     assert examples[0]['label'] == 1
-                    assert examples[0]['image_path'] == 'path/to/image1.jpg'
-
-# Additional test functions for other functions can be written similarly.
+                    assert examples[0]['image_path'] == 'path/to/dummy_path/ai2d/images/image1.jpg'

From a1642da6f629df6f79f57f27134290cfe03f543b Mon Sep 17 00:00:00 2001
From: hellokayas <pidnas94335@gmail.com>
Date: Tue, 15 Oct 2024 23:50:14 -0500
Subject: [PATCH 17/30] new tests added

---
 tests/methods/utils/test_data.py | 225 ++++++++++++++++++++++++++++++-
 1 file changed, 219 insertions(+), 6 deletions(-)

diff --git a/tests/methods/utils/test_data.py b/tests/methods/utils/test_data.py
index c7947f6..7216448 100644
--- a/tests/methods/utils/test_data.py
+++ b/tests/methods/utils/test_data.py
@@ -1,3 +1,5 @@
+# test_data.py
+
 import os
 import sys
 import torch
@@ -245,10 +247,10 @@ def test_copa_loader_assert(sample_args):
         <a2>I wore sunglasses.</a2>
     </item>
     </root>'''
-    with patch('xml.etree.ElementTree.parse') as mock_parse:
-        mock_tree = ET.ElementTree(ET.fromstring(xml_content))
-        mock_parse.return_value = mock_tree
-        with pytest.raises(AssertionError):
+    with pytest.raises(AssertionError):
+        with patch('xml.etree.ElementTree.parse') as mock_parse:
+            mock_tree = ET.ElementTree(ET.fromstring(xml_content))
+            mock_parse.return_value = mock_tree
             examples = copa_loader('dummy_path.xml', args)
 
 def test_cqa_loader(sample_args):
@@ -334,8 +336,6 @@ def test_anli_loader(sample_args):
         assert examples[0]['label'] == 0
         assert 'A man is playing a piano. The man is playing a musical instrument.' in examples[0]['premise']
 
-# Similarly, you can write tests for other loader functions and preprocess functions.
-
 def test_generate_n_shot_poe_demonstrations(sample_args):
     n_shot_dataset = [
         {'premise': 'Question 1', 'label': torch.tensor(0), 'hypothesis0': 'A1', 'hypothesis1': 'B1'},
@@ -453,3 +453,216 @@ def test_ai2d_loader(sample_args):
                     assert len(examples) == 1
                     assert examples[0]['label'] == 1
                     assert examples[0]['image_path'] == 'path/to/dummy_path/ai2d/images/image1.jpg'
+
+# New test functions to increase coverage
+
+def test_preprocess_function_causal_vqa(sample_args):
+    examples = {
+        'question': ['What is shown in the image?'],
+        'choice0': ['Cat'],
+        'choice1': ['Dog'],
+        'image_path': ['path/to/image1.jpg']
+    }
+    processor = MagicMock()
+    tokenizer = MagicMock()
+    tokenizer.pad_token_id = 0
+    tokenizer.padding_side = 'right'
+    # Adjust the tokenizer to return lists of lists
+    tokenizer.return_value = {
+        'input_ids': [[1,2], [3,4]],
+        'attention_mask': [[1,1], [1,1]]
+    }
+    processor.tokenizer = tokenizer
+    data_obj = MagicMock()
+    data_obj.data = {
+        'pixel_values': torch.tensor([[[1,2],[3,4]]] * 2)  # Repeat to match the number of choices
+    }
+    processor.image_processor.return_value = data_obj
+    kwargs = {
+        'ending_names': ['choice0', 'choice1'],
+        'header_name': 'question',
+        'image_header_name': 'image_path',
+        'processor': processor
+    }
+    with patch('PIL.Image.open', return_value=MagicMock(spec=Image.Image)):
+        output = preprocess_function_causal_vqa(examples, **kwargs)
+        assert 'input_ids' in output
+        assert 'labels' in output
+        assert 'header_attention_mask' in output
+        assert 'ending_attention_mask' in output
+        assert 'images' in output
+
+def test_preprocess_function_seq2seq_vqa_channel(sample_args):
+    examples = {
+        'question': ['What is shown in the image?'],
+        'choice0': ['Cat'],
+        'choice1': ['Dog'],
+        'image_path': ['path/to/image1.jpg']
+    }
+    processor = MagicMock()
+    tokenizer = MagicMock()
+    tokenizer.return_value = {
+        'input_ids': [[1,2], [3,4]], 
+        'attention_mask': [[1,1], [1,1]]
+    }
+    processor.tokenizer = tokenizer
+    data_obj = MagicMock()
+    data_obj.data = {
+        'pixel_values': torch.tensor([[[1,2],[3,4]]] * 2)  # Repeat to match the number of choices
+    }
+    processor.image_processor.return_value = data_obj
+    kwargs = {
+        'ending_names': ['choice0', 'choice1'],
+        'header_name': 'question',
+        'image_header_name': 'image_path',
+        'processor': processor
+    }
+    with patch('PIL.Image.open', return_value=MagicMock(spec=Image.Image)):
+        output = preprocess_function_seq2seq_vqa_channel(examples, **kwargs)
+        assert 'header_input_ids' in output
+        assert 'ending_input_ids' in output
+        assert 'images' in output
+        assert len(output['images']) == len(examples['question'])
+        for img_list in output['images']:
+            assert len(img_list) == len(kwargs['ending_names'])
+
+def test_preprocess_function_causal_vqa_channel(sample_args):
+    examples = {
+        'question': ['What is shown in the image?'],
+        'hypothesis0': ['Cat'],
+        'hypothesis1': ['Dog'],
+        'image_path': ['path/to/image1.jpg']
+    }
+    processor = MagicMock()
+    tokenizer = MagicMock()
+    tokenizer.pad_token_id = 0
+    tokenizer.padding_side = 'right'
+    tokenizer.return_value = {
+        'input_ids': [[1,2], [3,4]],
+        'attention_mask': [[1,1], [1,1]]
+    }
+    processor.tokenizer = tokenizer
+    data_obj = MagicMock()
+    data_obj.data = {
+        'pixel_values': torch.tensor([[[1,2],[3,4]]] * 2)
+    }
+    processor.image_processor.return_value = data_obj
+    kwargs = {
+        'ending_names': ['hypothesis0', 'hypothesis1'],
+        'header_name': 'question',
+        'image_header_name': 'image_path',
+        'processor': processor
+    }
+    with patch('PIL.Image.open', return_value=MagicMock(spec=Image.Image)):
+        output = preprocess_function_causal_vqa_channel(examples, **kwargs)
+        assert 'input_ids' in output
+        assert 'labels' in output
+        assert 'header_attention_mask' in output
+        assert 'ending_attention_mask' in output
+        assert 'images' in output
+
+def test_piqa_loader(sample_args):
+    args = sample_args
+    args.multiple_choice_prompt = 'Answer the following question:'
+    qa_content = json.dumps({
+        'goal': 'To open a jar, you should',
+        'sol1': 'Twist the lid counter-clockwise',
+        'sol2': 'Push the lid upwards'
+    })
+    label_content = '0\n'  # First solution is correct
+    with patch('builtins.open', mock.mock_open(read_data=qa_content)) as mock_qa_file:
+        mock_qa_file.return_value.__iter__.return_value = [qa_content]
+        with patch('builtins.open', mock.mock_open(read_data=label_content)) as mock_label_file:
+            mock_label_file.return_value.__iter__.return_value = [label_content]
+            examples = piqa_loader(['dummy_qa_path.jsonl', 'dummy_label_path.txt'], args)
+            assert len(examples) == 1
+            assert examples[0]['label'] == 0
+            assert 'Answer the following question: Question: To open a jar, you should' in examples[0]['premise']
+
+def test_qasc_loader(sample_args):
+    args = sample_args
+    args.multiple_choice_prompt = 'Answer the following question:'
+    json_line = json.dumps({
+        'answerKey': 'B',
+        'question': {
+            'stem': 'What do plants need to perform photosynthesis?',
+            'choices': [
+                {'label': 'A', 'text': 'Oxygen'},
+                {'label': 'B', 'text': 'Sunlight'},
+                {'label': 'C', 'text': 'Nitrogen'},
+                {'label': 'D', 'text': 'Carbon dioxide'},
+                {'label': 'E', 'text': 'Water'},
+                {'label': 'F', 'text': 'Soil'},
+                {'label': 'G', 'text': 'Minerals'},
+                {'label': 'H', 'text': 'Glucose'}
+            ]
+        }
+    })
+    with patch('builtins.open', mock.mock_open(read_data=json_line)):
+        examples = qasc_loader('dummy_path.jsonl', args)
+        assert len(examples) == 1
+        assert examples[0]['label'] == 1  # 'B' corresponds to index 1
+        assert 'Answer the following question: Question: What do plants need to perform photosynthesis?' in examples[0]['premise']
+
+def test_siqa_loader(sample_args):
+    args = sample_args
+    args.multiple_choice_prompt = 'Answer the following question:'
+    qa_content = json.dumps({
+        'context': 'Alex went to the store.',
+        'question': 'Why did Alex go to the store?',
+        'answerA': 'To buy groceries',
+        'answerB': 'To sell groceries',
+        'answerC': 'To sleep'
+    })
+    label_content = '1\n'  # Answer index is 1 (but labels are 1-based in siqa_loader, and subtract 1)
+    with patch('builtins.open', mock.mock_open(read_data=qa_content)) as mock_qa_file:
+        mock_qa_file.return_value.__iter__.return_value = [qa_content]
+        with patch('builtins.open', mock.mock_open(read_data=label_content)) as mock_label_file:
+            mock_label_file.return_value.__iter__.return_value = [label_content]
+            examples = siqa_loader(['dummy_qa_path.jsonl', 'dummy_label_path.txt'], args)
+            assert len(examples) == 1
+            assert examples[0]['label'] == 0  # '1' in label file corresponds to index 0
+            assert 'Answer the following question: Question: Alex went to the store. Why did Alex go to the store?' in examples[0]['premise']
+
+def test_winogrande_loader(sample_args):
+    args = sample_args
+    args.multiple_choice_prompt = 'Answer the following question:'
+    qa_content = json.dumps({
+        'sentence': 'The trophy doesn\'t fit in the brown suitcase because it\'s too big.',
+        'option1': 'trophy',
+        'option2': 'suitcase'
+    })
+    label_content = '1\n'  # Correct answer is option1 (labels are 1-based)
+    with patch('builtins.open', mock.mock_open(read_data=qa_content)) as mock_qa_file:
+        mock_qa_file.return_value.__iter__.return_value = [qa_content]
+        with patch('builtins.open', mock.mock_open(read_data=label_content)) as mock_label_file:
+            mock_label_file.return_value.__iter__.return_value = [label_content]
+            examples = winogrande_loader(['dummy_qa_path.jsonl', 'dummy_label_path.txt'], args)
+            assert len(examples) == 1
+            assert examples[0]['label'] == 0  # '1' in label file corresponds to index 0
+            assert 'Answer the following question: Question: The trophy doesn\'t fit in the brown suitcase because it\'s too big.' in examples[0]['premise']
+
+def test_date_understanding_loader(sample_args):
+    args = sample_args
+    args.multiple_choice_prompt = 'Answer the following question:'
+    args.num_options = 2
+    data_content = {
+        "task_prefix": "",
+        "examples": [
+            {
+                "input": "What is 2+2?",
+                "target_scores": {
+                    "4": 1,
+                    "5": 0
+                }
+            }
+        ]
+    }
+    with patch('json.load', return_value=data_content):
+        with patch('builtins.open', mock.mock_open(read_data=json.dumps(data_content))):
+            examples = date_understanding_loader(['dummy_path.json'], args)
+            assert len(examples) == 1
+            assert examples[0]['label'] == 0  # '4' is at index 0
+            assert 'Answer the following question: Question: What is 2+2?' in examples[0]['premise']
+
+# Now this test_data.py covers only 60% of all tests when I run pytest coverage. You need to add more tests to this file. DO NOT CHANGE ANYTHING WRITTEN IN THIS FILE, NO CHANGE TO ANY OF THE CURRENT TESTS, ALL OF THEM ARE WORKING. ADD NEW TESTS TO THIS FILE TO GET 100% COVERAGE

From 3497d2ebfd679de07461adf73e06247471f7832d Mon Sep 17 00:00:00 2001
From: hellokayas <pidnas94335@gmail.com>
Date: Wed, 16 Oct 2024 00:10:07 -0500
Subject: [PATCH 18/30] readme

---
 README.md | 133 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 132 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 085922b..b5ca4b9 100644
--- a/README.md
+++ b/README.md
@@ -32,4 +32,135 @@ Read the [CONTRIBUTING.md](CONTRIBUTING.md) file.
 
 ## License
 
-Read the [LICENSE](LICENSE) file.
\ No newline at end of file
+Read the [LICENSE](LICENSE) file.
+
+
+
+
+# Yellowbrick
+
+[![Build Status](https://github.com/DistrictDataLabs/yellowbrick/actions/workflows/ci.yml/badge.svg?branch=develop)](https://github.com/DistrictDataLabs/yellowbrick/actions/workflows/ci.yml)
+[![Coverage Status](https://codecov.io/gh/DistrictDataLabs/yellowbrick/branch/develop/graph/badge.svg?token=BnaSECZz2r)](https://codecov.io/gh/DistrictDataLabs/yellowbrick)
+[![Total Alerts](https://img.shields.io/lgtm/alerts/g/DistrictDataLabs/yellowbrick.svg?logo=lgtm&logoWidth=18)](https://lgtm.com/projects/g/DistrictDataLabs/yellowbrick/alerts/)
+[![Language Grade: Python](https://img.shields.io/lgtm/grade/python/g/DistrictDataLabs/yellowbrick.svg?logo=lgtm&logoWidth=18)](https://lgtm.com/projects/g/DistrictDataLabs/yellowbrick/context:python)
+[![PyPI version](https://badge.fury.io/py/yellowbrick.svg)](https://badge.fury.io/py/yellowbrick)
+[![Documentation Status](https://readthedocs.org/projects/yellowbrick/badge/?version=latest)](http://yellowbrick.readthedocs.io/en/latest/?badge=latest)
+[![Black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
+[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.1206239.svg)](https://doi.org/10.5281/zenodo.1206239)
+[![JOSS](http://joss.theoj.org/papers/10.21105/joss.01075/status.svg)](https://doi.org/10.21105/joss.01075)
+[![Binder](https://mybinder.org/badge.svg)](https://mybinder.org/v2/gh/DistrictDataLabs/yellowbrick/develop?filepath=examples%2Fexamples.ipynb)
+
+**Visual analysis and diagnostic tools to facilitate machine learning model selection.**
+
+[![Banner](docs/images/readme/banner.png)](https://www.scikit-yb.org/en/latest/gallery.html)
+
+## What is Yellowbrick?
+
+Yellowbrick is a suite of visual diagnostic tools called "Visualizers" that extend the scikit-learn API to allow human steering of the model selection process. In a nutshell, Yellowbrick combines scikit-learn with matplotlib in the best tradition of the scikit-learn documentation, but to produce visualizations for _your_ machine learning workflow!
+
+For complete documentation on the Yellowbrick API, a gallery of available visualizers, the contributor's guide, tutorials and teaching resources, frequently asked questions, and more, please visit our documentation at [www.scikit-yb.org](https://www.scikit-yb.org/).
+
+## Installing Yellowbrick
+
+Yellowbrick is compatible with Python 3.4 or later and also depends on scikit-learn and matplotlib. The simplest way to install Yellowbrick and its dependencies is from PyPI with pip, Python's preferred package installer.
+
+```bash
+$ pip install yellowbrick
+```
+
+Note that Yellowbrick is an active project and routinely publishes new releases with more visualizers and updates. In order to upgrade Yellowbrick to the latest version, use pip as follows.
+
+```bash
+$ pip install -U yellowbrick
+```
+
+You can also use the `-U` flag to update scikit-learn, matplotlib, or any other third-party utilities that work well with Yellowbrick to their latest versions.
+
+If you're using Anaconda (recommended for Windows users), you can take advantage of the conda utility to install Yellowbrick:
+
+```bash
+conda install -c districtdatalabs yellowbrick
+```
+
+## Using Yellowbrick
+
+The Yellowbrick API is specifically designed to play nicely with scikit-learn. Here is an example of a typical workflow sequence with scikit-learn and Yellowbrick:
+
+### Feature Visualization
+
+In this example, we see how `Rank2D` performs pairwise comparisons of each feature in the dataset with a specific metric or algorithm and then returns them ranked as a lower-left triangle diagram.
+
+```python
+from yellowbrick.features import Rank2D
+
+visualizer = Rank2D(features=features, algorithm='covariance')
+visualizer.fit(X, y)      # Fit the data to the visualizer
+visualizer.transform(X)   # Transform the data
+visualizer.show()         # Finalize and render the figure
+```
+
+### Model Visualization
+
+In this example, we instantiate a scikit-learn classifier and then use Yellowbrick's `ROCAUC` class to visualize the tradeoff between the classifier's sensitivity and specificity.
+
+```python
+from sklearn.svm import LinearSVC
+from yellowbrick.classifier import ROCAUC
+
+model = LinearSVC()
+visualizer = ROCAUC(model)
+visualizer.fit(X, y)
+visualizer.score(X, y)
+visualizer.show()
+```
+
+For additional information on getting started with Yellowbrick, view the [Quick Start Guide](https://www.scikit-yb.org/en/latest/quickstart.html) in the [documentation](https://www.scikit-yb.org/en/latest/) and check out our [examples notebook](https://github.com/DistrictDataLabs/yellowbrick/blob/develop/examples/examples.ipynb).
+
+## Contributing to Yellowbrick
+
+Yellowbrick is an open-source project that is supported by a community who will gratefully and humbly accept any contributions you might make to the project. Large or small, any contribution makes a big difference; and if you've never contributed to an open-source project before, we hope you will start with Yellowbrick!
+
+If you are interested in contributing, check out our [contributor's guide](https://www.scikit-yb.org/en/latest/contributing/index.html). Beyond creating visualizers, there are many ways to contribute:
+
+- Submit a bug report or feature request on [GitHub Issues](https://github.com/DistrictDataLabs/yellowbrick/issues).
+- Contribute a Jupyter notebook to our examples [gallery](https://github.com/DistrictDataLabs/yellowbrick/tree/develop/examples).
+- Assist us with [user testing](https://www.scikit-yb.org/en/latest/evaluation.html).
+- Add to the documentation or help with our website, [scikit-yb.org](https://www.scikit-yb.org).
+- Write [unit or integration tests](https://www.scikit-yb.org/en/latest/contributing/developing_visualizers.html#integration-tests) for our project.
+- Answer questions on our issues, mailing list, Stack Overflow, and elsewhere.
+- Translate our documentation into another language.
+- Write a blog post, tweet, or share our project with others.
+- [Teach](https://www.scikit-yb.org/en/latest/teaching.html) someone how to use Yellowbrick.
+
+As you can see, there are lots of ways to get involved, and we would be very happy for you to join us! The only thing we ask is that you abide by the principles of openness, respect, and consideration of others as described in the [Python Software Foundation Code of Conduct](https://www.python.org/psf/codeofconduct/).
+
+For more information, check out the `CONTRIBUTING.md` file in the root of the repository or the detailed documentation at [Contributing to Yellowbrick](https://www.scikit-yb.org/en/latest/contributing/index.html).
+
+## Yellowbrick Datasets
+
+Yellowbrick gives easy access to several datasets that are used for the examples in the documentation and testing. These datasets are hosted in our CDN and must be downloaded for use. Typically, when a user calls one of the data loader functions, e.g., `load_bikeshare()`, the data is automatically downloaded if it's not already on the user's computer. However, for development and testing, or if you know you will be working without internet access, it might be easier to simply download all the data at once.
+
+The data downloader script can be run as follows:
+
+```bash
+$ python -m yellowbrick.download
+```
+
+This will download the data to the fixtures directory inside of the Yellowbrick site packages. You can specify the location of the download either as an argument to the downloader script (use `--help` for more details) or by setting the `$YELLOWBRICK_DATA` environment variable. This is the preferred mechanism because this will also influence how data is loaded in Yellowbrick.
+
+_Note: Developers who have downloaded data from Yellowbrick versions earlier than v1.0 may experience some problems with the older data format. If this occurs, you can clear out your data cache as follows:_
+
+```bash
+$ python -m yellowbrick.download --cleanup
+```
+
+_This will remove old datasets and download the new ones. You can also use the `--no-download` flag to simply clear the cache without re-downloading data. Users who are having difficulty with datasets can also use this or they can uninstall and reinstall Yellowbrick using `pip`._
+
+## Citing Yellowbrick
+
+We would be glad if you used Yellowbrick in your scientific publications! If you do, please cite us using the [citation guidelines](https://www.scikit-yb.org/en/latest/about.html#citing-yellowbrick).
+
+## Affiliations
+
+[![District Data Labs](docs/images/readme/affiliates_ddl.png)](https://districtdatalabs.com/)
+[![NumFOCUS Affiliated Project](docs/images/readme/affiliates_numfocus.png)](https://numfocus.org)
\ No newline at end of file

From 984b9b91a5dbbc02316976356de81aa83a4d101c Mon Sep 17 00:00:00 2001
From: hellokayas <pidnas94335@gmail.com>
Date: Wed, 16 Oct 2024 00:42:59 -0500
Subject: [PATCH 19/30] readme

---
 README.md | 144 ++++++++++++------------------------------------------
 1 file changed, 31 insertions(+), 113 deletions(-)

diff --git a/README.md b/README.md
index b5ca4b9..0c75a8a 100644
--- a/README.md
+++ b/README.md
@@ -3,32 +3,16 @@
 [![codecov](https://codecov.io/gh/souradipp76/MM-PoE/branch/main/graph/badge.svg?token=MM-PoE_token_here)](https://codecov.io/gh/souradipp76/MM-PoE)
 [![CI](https://github.com/souradipp76/MM-PoE/actions/workflows/main.yml/badge.svg)](https://github.com/souradipp76/MM-PoE/actions/workflows/main.yml)
 
-## Installation
-### Install it from PyPI
 
-```bash
-pip install mm_poe
-```
 
-### Install it from source
 
-```bash
-$ git clone https://github.com/souradipp76/MM-PoE.git
-$ cd MM-PoE
-$ make install
-```
 
 ## Usage
 
-```bash
-$ python -m mm_poe
-#or
-$ mm_poe
-```
 
 ## Contributing
 
-Read the [CONTRIBUTING.md](CONTRIBUTING.md) file.
+
 
 ## License
 
@@ -37,130 +21,64 @@ Read the [LICENSE](LICENSE) file.
 
 
 
-# Yellowbrick
-
-[![Build Status](https://github.com/DistrictDataLabs/yellowbrick/actions/workflows/ci.yml/badge.svg?branch=develop)](https://github.com/DistrictDataLabs/yellowbrick/actions/workflows/ci.yml)
-[![Coverage Status](https://codecov.io/gh/DistrictDataLabs/yellowbrick/branch/develop/graph/badge.svg?token=BnaSECZz2r)](https://codecov.io/gh/DistrictDataLabs/yellowbrick)
-[![Total Alerts](https://img.shields.io/lgtm/alerts/g/DistrictDataLabs/yellowbrick.svg?logo=lgtm&logoWidth=18)](https://lgtm.com/projects/g/DistrictDataLabs/yellowbrick/alerts/)
-[![Language Grade: Python](https://img.shields.io/lgtm/grade/python/g/DistrictDataLabs/yellowbrick.svg?logo=lgtm&logoWidth=18)](https://lgtm.com/projects/g/DistrictDataLabs/yellowbrick/context:python)
-[![PyPI version](https://badge.fury.io/py/yellowbrick.svg)](https://badge.fury.io/py/yellowbrick)
-[![Documentation Status](https://readthedocs.org/projects/yellowbrick/badge/?version=latest)](http://yellowbrick.readthedocs.io/en/latest/?badge=latest)
-[![Black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
-[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.1206239.svg)](https://doi.org/10.5281/zenodo.1206239)
-[![JOSS](http://joss.theoj.org/papers/10.21105/joss.01075/status.svg)](https://doi.org/10.21105/joss.01075)
-[![Binder](https://mybinder.org/badge.svg)](https://mybinder.org/v2/gh/DistrictDataLabs/yellowbrick/develop?filepath=examples%2Fexamples.ipynb)
+# MM-POE
 
 **Visual analysis and diagnostic tools to facilitate machine learning model selection.**
 
-[![Banner](docs/images/readme/banner.png)](https://www.scikit-yb.org/en/latest/gallery.html)
 
-## What is Yellowbrick?
+## What is MM-POE? Statement of Need
 
-Yellowbrick is a suite of visual diagnostic tools called "Visualizers" that extend the scikit-learn API to allow human steering of the model selection process. In a nutshell, Yellowbrick combines scikit-learn with matplotlib in the best tradition of the scikit-learn documentation, but to produce visualizations for _your_ machine learning workflow!
+**Statement of Need**
 
-For complete documentation on the Yellowbrick API, a gallery of available visualizers, the contributor's guide, tutorials and teaching resources, frequently asked questions, and more, please visit our documentation at [www.scikit-yb.org](https://www.scikit-yb.org/).
+Language models (LMs) excel at in-context learning for multiple choice reasoning tasks but often treat all options equally, unlike humans who typically eliminate incorrect choices before selecting the correct answer. This discrepancy can limit the effectiveness of LMs in accurately solving such tasks. To address this, we introduce the Process of Elimination (POE), a two-step scoring method designed to enhance LM performance by mimicking human reasoning strategies. 
 
-## Installing Yellowbrick
+In the first step, POE evaluates and scores each option, systematically eliminating those that appear incorrect. The second step involves masking these eliminated options, allowing the LM to focus solely on the remaining viable choices to make a final prediction. Our zero-shot experiments across eight reasoning tasks demonstrate POE's effectiveness, particularly excelling in logical reasoning scenarios. Additionally, POE proves adaptable to few-shot settings and is compatible with large language models (LLMs) like ChatGPT.
 
-Yellowbrick is compatible with Python 3.4 or later and also depends on scikit-learn and matplotlib. The simplest way to install Yellowbrick and its dependencies is from PyPI with pip, Python's preferred package installer.
+By implementing POE, researchers and practitioners can significantly improve the accuracy and reliability of LMs in multiple choice reasoning tasks, making it a valuable tool for advancing machine learning model selection and evaluation.
 
-```bash
-$ pip install yellowbrick
-```
+## Installing MM-POE
 
-Note that Yellowbrick is an active project and routinely publishes new releases with more visualizers and updates. In order to upgrade Yellowbrick to the latest version, use pip as follows.
+### Install it from PyPI
 
 ```bash
-$ pip install -U yellowbrick
+pip install mm_poe
 ```
-
-You can also use the `-U` flag to update scikit-learn, matplotlib, or any other third-party utilities that work well with Yellowbrick to their latest versions.
-
-If you're using Anaconda (recommended for Windows users), you can take advantage of the conda utility to install Yellowbrick:
+### Install it from source
 
 ```bash
-conda install -c districtdatalabs yellowbrick
+$ git clone https://github.com/souradipp76/MM-PoE.git
+$ cd MM-PoE
+$ make install
 ```
 
-## Using Yellowbrick
-
-The Yellowbrick API is specifically designed to play nicely with scikit-learn. Here is an example of a typical workflow sequence with scikit-learn and Yellowbrick:
-
-### Feature Visualization
-
-In this example, we see how `Rank2D` performs pairwise comparisons of each feature in the dataset with a specific metric or algorithm and then returns them ranked as a lower-left triangle diagram.
-
-```python
-from yellowbrick.features import Rank2D
+In order to upgrade MM-POE to the latest version, use pip as follows.
 
-visualizer = Rank2D(features=features, algorithm='covariance')
-visualizer.fit(X, y)      # Fit the data to the visualizer
-visualizer.transform(X)   # Transform the data
-visualizer.show()         # Finalize and render the figure
+```bash
+$ pip install -U mm_poe
 ```
 
-### Model Visualization
+## Using Yellowbrick
 
-In this example, we instantiate a scikit-learn classifier and then use Yellowbrick's `ROCAUC` class to visualize the tradeoff between the classifier's sensitivity and specificity.
+Here is a typical example usage of MM-POE:
 
-```python
-from sklearn.svm import LinearSVC
-from yellowbrick.classifier import ROCAUC
+### Running the CLI
 
-model = LinearSVC()
-visualizer = ROCAUC(model)
-visualizer.fit(X, y)
-visualizer.score(X, y)
-visualizer.show()
+```bash
+$ python -m mm_poe
+#or
+$ mm_poe
 ```
 
-For additional information on getting started with Yellowbrick, view the [Quick Start Guide](https://www.scikit-yb.org/en/latest/quickstart.html) in the [documentation](https://www.scikit-yb.org/en/latest/) and check out our [examples notebook](https://github.com/DistrictDataLabs/yellowbrick/blob/develop/examples/examples.ipynb).
-
-## Contributing to Yellowbrick
+## Contributing to MM-POE
 
-Yellowbrick is an open-source project that is supported by a community who will gratefully and humbly accept any contributions you might make to the project. Large or small, any contribution makes a big difference; and if you've never contributed to an open-source project before, we hope you will start with Yellowbrick!
+MM-POE is an open-source project that is supported by a community who will gratefully and humbly accept any contributions you might make to the project.
 
-If you are interested in contributing, check out our [contributor's guide](https://www.scikit-yb.org/en/latest/contributing/index.html). Beyond creating visualizers, there are many ways to contribute:
+If you are interested in contributing, read the [CONTRIBUTING.md](CONTRIBUTING.md) file.
 
-- Submit a bug report or feature request on [GitHub Issues](https://github.com/DistrictDataLabs/yellowbrick/issues).
-- Contribute a Jupyter notebook to our examples [gallery](https://github.com/DistrictDataLabs/yellowbrick/tree/develop/examples).
-- Assist us with [user testing](https://www.scikit-yb.org/en/latest/evaluation.html).
-- Add to the documentation or help with our website, [scikit-yb.org](https://www.scikit-yb.org).
-- Write [unit or integration tests](https://www.scikit-yb.org/en/latest/contributing/developing_visualizers.html#integration-tests) for our project.
+- Submit a bug report or feature request on [GitHub Issues](https://github.com/souradipp76/MM-PoE/issues).
+- Add to the documentation or help with our website.
+- Write [unit or integration tests]() for our project.
 - Answer questions on our issues, mailing list, Stack Overflow, and elsewhere.
-- Translate our documentation into another language.
 - Write a blog post, tweet, or share our project with others.
-- [Teach](https://www.scikit-yb.org/en/latest/teaching.html) someone how to use Yellowbrick.
-
-As you can see, there are lots of ways to get involved, and we would be very happy for you to join us! The only thing we ask is that you abide by the principles of openness, respect, and consideration of others as described in the [Python Software Foundation Code of Conduct](https://www.python.org/psf/codeofconduct/).
-
-For more information, check out the `CONTRIBUTING.md` file in the root of the repository or the detailed documentation at [Contributing to Yellowbrick](https://www.scikit-yb.org/en/latest/contributing/index.html).
-
-## Yellowbrick Datasets
-
-Yellowbrick gives easy access to several datasets that are used for the examples in the documentation and testing. These datasets are hosted in our CDN and must be downloaded for use. Typically, when a user calls one of the data loader functions, e.g., `load_bikeshare()`, the data is automatically downloaded if it's not already on the user's computer. However, for development and testing, or if you know you will be working without internet access, it might be easier to simply download all the data at once.
-
-The data downloader script can be run as follows:
-
-```bash
-$ python -m yellowbrick.download
-```
-
-This will download the data to the fixtures directory inside of the Yellowbrick site packages. You can specify the location of the download either as an argument to the downloader script (use `--help` for more details) or by setting the `$YELLOWBRICK_DATA` environment variable. This is the preferred mechanism because this will also influence how data is loaded in Yellowbrick.
-
-_Note: Developers who have downloaded data from Yellowbrick versions earlier than v1.0 may experience some problems with the older data format. If this occurs, you can clear out your data cache as follows:_
-
-```bash
-$ python -m yellowbrick.download --cleanup
-```
-
-_This will remove old datasets and download the new ones. You can also use the `--no-download` flag to simply clear the cache without re-downloading data. Users who are having difficulty with datasets can also use this or they can uninstall and reinstall Yellowbrick using `pip`._
-
-## Citing Yellowbrick
-
-We would be glad if you used Yellowbrick in your scientific publications! If you do, please cite us using the [citation guidelines](https://www.scikit-yb.org/en/latest/about.html#citing-yellowbrick).
-
-## Affiliations
 
-[![District Data Labs](docs/images/readme/affiliates_ddl.png)](https://districtdatalabs.com/)
-[![NumFOCUS Affiliated Project](docs/images/readme/affiliates_numfocus.png)](https://numfocus.org)
\ No newline at end of file
+As you can see, there are lots of ways to get involved, and we would be very happy for you to join us!

From 46fbb4c6829b459ad7790e07521485200897934c Mon Sep 17 00:00:00 2001
From: hellokayas <pidnas94335@gmail.com>
Date: Wed, 16 Oct 2024 01:05:31 -0500
Subject: [PATCH 20/30] readme and paper

---
 README.md       |  14 -------
 paper/paper.bib | 103 ++++++++++++++++++++++++++++++++++++++++++++++++
 paper/paper.md  |  45 +++++++++++++++++++++
 3 files changed, 148 insertions(+), 14 deletions(-)
 create mode 100644 paper/paper.bib
 create mode 100644 paper/paper.md

diff --git a/README.md b/README.md
index 0c75a8a..10a7019 100644
--- a/README.md
+++ b/README.md
@@ -5,24 +5,10 @@
 
 
 
-
-
-## Usage
-
-
-## Contributing
-
-
-
 ## License
 
 Read the [LICENSE](LICENSE) file.
 
-
-
-
-# MM-POE
-
 **Visual analysis and diagnostic tools to facilitate machine learning model selection.**
 
 
diff --git a/paper/paper.bib b/paper/paper.bib
new file mode 100644
index 0000000..875f1cd
--- /dev/null
+++ b/paper/paper.bib
@@ -0,0 +1,103 @@
+@article{zenodo,
+  author      = {Bengfort, Benjamin and
+                Bilbro, Rebecca and
+                Danielsen, Nathan and
+                Gray, Larry and
+                others},
+  title       = {Yellowbrick},
+  month       = Nov,
+  year        = 2018,
+  doi         = {10.5281/zenodo.1206239},
+  url         = {https://doi.org/10.5281/zenodo.1206239}
+}
+
+@article{sklearn,
+ title        = {Scikit-learn: Machine Learning in {P}ython},
+ author       = {Pedregosa, F. and Varoquaux, G. and Gramfort, A. and Michel, V.
+ and Thirion, B. and Grisel, O. and Blondel, M. and Prettenhofer, P.
+ and Weiss, R. and Dubourg, V. and Vanderplas, J. and Passos, A. and
+ Cournapeau, D. and Brucher, M. and Perrot, M. and Duchesnay, E.},
+ journal      = {Journal of Machine Learning Research},
+ volume       = {12},
+ pages        = {2825--2830},
+ year         = {2011}
+}
+
+@article{matplotlib,
+  author       = {Hunter, J. D.},
+  title        = {Matplotlib: A 2D graphics environment},
+  journal      = {Computing In Science \& Engineering},
+  volume       = {9},
+  number       = {3},
+  pages        = {90--95},
+  abstract     = {Matplotlib is a 2D graphics package used for Python
+  for application development, interactive scripting, and
+  publication-quality image generation across user
+  interfaces and operating systems.},
+  publisher    = {IEEE COMPUTER SOC},
+  doi          = {10.1109/MCSE.2007.55},
+  year         = 2007
+}
+
+@misc{scipy,
+  author       = {Eric Jones and Travis Oliphant and Pearu Peterson and others},
+  title        = {{SciPy}: Open source scientific tools for {Python}},
+  year         = {2001--},
+  url          = "http://www.scipy.org/",
+  note         = {[Online; accessed 2018-07-30]}
+}
+
+@article{kumar2016model,
+  title        = {Model selection management systems: The next frontier of advanced analytics},
+  author       = {Kumar, Arun and McCann, Robert and Naughton, Jeffrey and Patel, Jignesh M},
+  journal      = {ACM SIGMOD Record},
+  volume       = {44},
+  number       = {4},
+  pages        = {17--22},
+  year         = {2016},
+  publisher    = {ACM},
+  doi          = {10.1145/2935694.2935698}
+}
+
+@article{liu_wang_liu_zhu_2017,
+  title        = {Towards better analysis of machine learning models: A visual analytics perspective},
+  volume       = {1},
+  url          = {https://www.sciencedirect.com/science/article/pii/S2468502X17300086},
+  doi          = {10.1016/j.visinf.2017.01.006},
+  number       = {1},
+  journal      = {Visual Informatics},
+  author       = {Liu, Shixia and Wang, Xiting and Liu, Mengchen and Zhu, Jun},
+  year         = {2017},
+  month        = {Mar},
+  pages        = {48–56}
+}
+
+@article{wickham_visualizing_2015,
+  title        = {Visualizing Statistical Models: {{Removing}} the Blindfold},
+  volume       = {8},
+  timestamp    = {2016-09-07T00:14:28Z},
+  doi          = {10.1002/sam.11271},
+  number       = {4},
+  urldate      = {2015-10-26},
+  journal      = {Statistical Analysis and Data Mining: The ASA Data Science Journal},
+  author       = {Wickham, Hadley and Cook, Dianne and Hofmann, Heike},
+  year         = {2015},
+  pages        = {203--225}
+}
+
+@inproceedings{kapoor2010interactive,
+  title        = {Interactive optimization for steering machine classification},
+  author       = {Kapoor, Ashish and Lee, Bongshin and Tan, Desney and Horvitz, Eric},
+  booktitle    = {Proceedings of the SIGCHI Conference on Human Factors in Computing Systems},
+  pages        = {1343--1352},
+  year         = {2010},
+  organization = {ACM},
+  doi          = {10.1145/1753326.1753529}
+}
+
+@article{rajaraman2008more,
+  title        = {More data usually beats better algorithms},
+  author       = {Rajaraman, Anand},
+  journal      = {Datawocky Blog},
+  year         = {2008}
+}
\ No newline at end of file
diff --git a/paper/paper.md b/paper/paper.md
new file mode 100644
index 0000000..9ea3533
--- /dev/null
+++ b/paper/paper.md
@@ -0,0 +1,45 @@
+---
+title: 'MM-POE'
+tags:
+  - machine learning
+  - Large Language Models
+  - Multi-modal
+  - python
+  - Multiple Choice Question Answering
+authors:
+  - name: Sayak Chakrabarty
+    affiliation: 1
+  - name: Souradip Pal
+    orcid: 0000-0002-5781-3032
+    affiliation: 2
+affiliations:
+ - name: Northwestern University
+   index: 1
+ - name: Purdue University
+   index: 2
+date: 16 October 2024
+bibliography: paper.bib
+---
+
+# Summary
+
+**Statement of Need**
+
+Language models (LMs) excel at in-context learning for multiple choice reasoning tasks but often treat all options equally, unlike humans who typically eliminate incorrect choices before selecting the correct answer. This discrepancy can limit the effectiveness of LMs in accurately solving such tasks. To address this, we introduce the Process of Elimination (POE), a two-step scoring method designed to enhance LM performance by mimicking human reasoning strategies. 
+
+In the first step, POE evaluates and scores each option, systematically eliminating those that appear incorrect. The second step involves masking these eliminated options, allowing the LM to focus solely on the remaining viable choices to make a final prediction. Our zero-shot experiments across eight reasoning tasks demonstrate POE's effectiveness, particularly excelling in logical reasoning scenarios. Additionally, POE proves adaptable to few-shot settings and is compatible with large language models (LLMs) like ChatGPT.
+
+By implementing POE, researchers and practitioners can significantly improve the accuracy and reliability of LMs in multiple choice reasoning tasks, making it a valuable tool for advancing machine learning model selection and evaluation.
+
+Implemented in Python, the Yellowbrick visualization package achieves steering by extending both scikit-learn [@sklearn] and Matplotlib [@matplotlib]. Like Yellowbrick, both scikit-learn and Matplotlib are extensions of SciPy [@scipy], libraries intended to facilitate scientific computing. Scikit-learn provides a generalized API for machine learning by exposing the concept of an `Estimator`, an object that learns from data. Yellowbrick in turn extends this concept with the idea of a `Visualizer`, an object that both learns from data and visualizes the result. Visualizers wrap Matplotlib procedures to produce publication-ready figures and rich visual analytics.
+
+
+![Feature Analysis](figures/feature_analysis.png)
+
+Because “more data beats better algorithms” [@rajaraman2008more], the first step to creating valid, predictive models is to find the minimum set of features that predicts the dependent variable. Generally, this means finding features that describe data in high dimensional space that are *separable* (i.e., by a hyperplane). Tools like `RadViz`, `ParallelCoordinates`, and `Manifold` help visualize high dimensional data for quick diagnostics. Bayesian models and regressions suffer when independent variables are collinear (i.e., exhibit pairwise correlation). `Rank2D` visualizations show pairwise correlations among features and can facilitate feature elimination.
+
+**State of the field: Do the authors describe how this software compares to other commonly-used packages?**
+
+# Acknowledgements
+
+We have used the server provided by Northwestern University for building this software.

From 3041845294914d3f4d900a8ffc0e5e79f05b2b8e Mon Sep 17 00:00:00 2001
From: Souradip Pal <souradip.iitg@gmail.com>
Date: Wed, 16 Oct 2024 21:19:29 -0500
Subject: [PATCH 21/30] Fixed test cases for dataloader.

---
 tests/methods/utils/test_data.py | 45 +++++++++++++++-----------------
 1 file changed, 21 insertions(+), 24 deletions(-)

diff --git a/tests/methods/utils/test_data.py b/tests/methods/utils/test_data.py
index 7216448..c10f4e7 100644
--- a/tests/methods/utils/test_data.py
+++ b/tests/methods/utils/test_data.py
@@ -570,14 +570,13 @@ def test_piqa_loader(sample_args):
         'sol2': 'Push the lid upwards'
     })
     label_content = '0\n'  # First solution is correct
-    with patch('builtins.open', mock.mock_open(read_data=qa_content)) as mock_qa_file:
-        mock_qa_file.return_value.__iter__.return_value = [qa_content]
-        with patch('builtins.open', mock.mock_open(read_data=label_content)) as mock_label_file:
-            mock_label_file.return_value.__iter__.return_value = [label_content]
-            examples = piqa_loader(['dummy_qa_path.jsonl', 'dummy_label_path.txt'], args)
-            assert len(examples) == 1
-            assert examples[0]['label'] == 0
-            assert 'Answer the following question: Question: To open a jar, you should' in examples[0]['premise']
+    with patch('builtins.open', mock.mock_open()) as mock_file:
+        mock_file.side_effect = [mock.mock_open(read_data=qa_content).return_value,
+                                    mock.mock_open(read_data=label_content).return_value]
+        examples = piqa_loader(['dummy_qa_path.jsonl', 'dummy_label_path.txt'], args)
+        assert len(examples) == 1
+        assert examples[0]['label'] == 0
+        assert 'Answer the following question: Question: To open a jar, you should' in examples[0]['premise']
 
 def test_qasc_loader(sample_args):
     args = sample_args
@@ -615,14 +614,13 @@ def test_siqa_loader(sample_args):
         'answerC': 'To sleep'
     })
     label_content = '1\n'  # Answer index is 1 (but labels are 1-based in siqa_loader, and subtract 1)
-    with patch('builtins.open', mock.mock_open(read_data=qa_content)) as mock_qa_file:
-        mock_qa_file.return_value.__iter__.return_value = [qa_content]
-        with patch('builtins.open', mock.mock_open(read_data=label_content)) as mock_label_file:
-            mock_label_file.return_value.__iter__.return_value = [label_content]
-            examples = siqa_loader(['dummy_qa_path.jsonl', 'dummy_label_path.txt'], args)
-            assert len(examples) == 1
-            assert examples[0]['label'] == 0  # '1' in label file corresponds to index 0
-            assert 'Answer the following question: Question: Alex went to the store. Why did Alex go to the store?' in examples[0]['premise']
+    with patch('builtins.open', mock.mock_open()) as mock_file:
+        mock_file.side_effect = [mock.mock_open(read_data=qa_content).return_value,
+                                    mock.mock_open(read_data=label_content).return_value]
+        examples = siqa_loader(['dummy_qa_path.jsonl', 'dummy_label_path.txt'], args)
+        assert len(examples) == 1
+        assert examples[0]['label'] == 0  # '1' in label file corresponds to index 0
+        assert 'Answer the following question: Question: Alex went to the store. Why did Alex go to the store?' in examples[0]['premise']
 
 def test_winogrande_loader(sample_args):
     args = sample_args
@@ -633,14 +631,13 @@ def test_winogrande_loader(sample_args):
         'option2': 'suitcase'
     })
     label_content = '1\n'  # Correct answer is option1 (labels are 1-based)
-    with patch('builtins.open', mock.mock_open(read_data=qa_content)) as mock_qa_file:
-        mock_qa_file.return_value.__iter__.return_value = [qa_content]
-        with patch('builtins.open', mock.mock_open(read_data=label_content)) as mock_label_file:
-            mock_label_file.return_value.__iter__.return_value = [label_content]
-            examples = winogrande_loader(['dummy_qa_path.jsonl', 'dummy_label_path.txt'], args)
-            assert len(examples) == 1
-            assert examples[0]['label'] == 0  # '1' in label file corresponds to index 0
-            assert 'Answer the following question: Question: The trophy doesn\'t fit in the brown suitcase because it\'s too big.' in examples[0]['premise']
+    with patch('builtins.open', mock.mock_open()) as mock_file:
+        mock_file.side_effect = [mock.mock_open(read_data=qa_content).return_value,
+                                    mock.mock_open(read_data=label_content).return_value]
+        examples = winogrande_loader(['dummy_qa_path.jsonl', 'dummy_label_path.txt'], args)
+        assert len(examples) == 1
+        assert examples[0]['label'] == 0  # '1' in label file corresponds to index 0
+        assert 'Answer the following question: Question: The trophy doesn\'t fit in the brown suitcase because it\'s too big.' in examples[0]['premise']
 
 def test_date_understanding_loader(sample_args):
     args = sample_args

From 48e02aadd95579092898d45a0cc070d090a4d7fe Mon Sep 17 00:00:00 2001
From: Souradip Pal <souradip.iitg@gmail.com>
Date: Wed, 16 Oct 2024 21:35:52 -0500
Subject: [PATCH 22/30] Updated README

---
 README.md | 58 ++++++++++++++++++++++++++++++++++---------------------
 1 file changed, 36 insertions(+), 22 deletions(-)

diff --git a/README.md b/README.md
index 10a7019..de6a84b 100644
--- a/README.md
+++ b/README.md
@@ -4,37 +4,27 @@
 [![CI](https://github.com/souradipp76/MM-PoE/actions/workflows/main.yml/badge.svg)](https://github.com/souradipp76/MM-PoE/actions/workflows/main.yml)
 
 
+**Multiple Choice Reasoning tool via. Process of Elimination using Multi-Modal models**
 
-## License
-
-Read the [LICENSE](LICENSE) file.
-
-**Visual analysis and diagnostic tools to facilitate machine learning model selection.**
 
-
-## What is MM-POE? Statement of Need
+## What is MM-PoE?
 
 **Statement of Need**
 
-Language models (LMs) excel at in-context learning for multiple choice reasoning tasks but often treat all options equally, unlike humans who typically eliminate incorrect choices before selecting the correct answer. This discrepancy can limit the effectiveness of LMs in accurately solving such tasks. To address this, we introduce the Process of Elimination (POE), a two-step scoring method designed to enhance LM performance by mimicking human reasoning strategies. 
+Language models (LMs) excel at in-context learning for multiple choice reasoning tasks but often treat all options equally, unlike humans who typically eliminate incorrect choices before selecting the correct answer. Same is true in case of visual question answering tasks with multiple choices. This discrepancy can limit the effectiveness of vision language models in accurately solving such tasks. To address this, we introduce Multi-Modal Process of Elimination (MM-PoE), a two-step scoring method designed to enhance VLM performance by mimicking human reasoning strategies in multi-modal settings. 
 
-In the first step, POE evaluates and scores each option, systematically eliminating those that appear incorrect. The second step involves masking these eliminated options, allowing the LM to focus solely on the remaining viable choices to make a final prediction. Our zero-shot experiments across eight reasoning tasks demonstrate POE's effectiveness, particularly excelling in logical reasoning scenarios. Additionally, POE proves adaptable to few-shot settings and is compatible with large language models (LLMs) like ChatGPT.
+In the first step, the method evaluates and scores each option, systematically eliminating those that appear incorrect. The second step involves masking these eliminated options, allowing the VLM to focus solely on the remaining viable choices to make a final prediction. Our zero-shot experiments across three datasets demonstrate MM-PoE's effectiveness, particularly excelling in logical reasoning scenarios . Additionally, MM-PoE proves adaptable to few-shot settings and is compatible with large language models (LLMs) like ChatGPT.
 
-By implementing POE, researchers and practitioners can significantly improve the accuracy and reliability of LMs in multiple choice reasoning tasks, making it a valuable tool for advancing machine learning model selection and evaluation.
+By implementing MM-PoE, researchers and practitioners can experiment and significantly improve the accuracy and reliability of VLMs in multiple choice reasoning tasks, making it a valuable tool for advancing machine learning models for visual reasoning.
 
 ## Installing MM-POE
 
 ### Install it from PyPI
 
-```bash
-pip install mm_poe
-```
-### Install it from source
+The simplest way to install MM-PoE and its dependencies is from PyPI with pip, Python's preferred package installer.
 
 ```bash
-$ git clone https://github.com/souradipp76/MM-PoE.git
-$ cd MM-PoE
-$ make install
+$ pip install mm_poe
 ```
 
 In order to upgrade MM-POE to the latest version, use pip as follows.
@@ -43,11 +33,21 @@ In order to upgrade MM-POE to the latest version, use pip as follows.
 $ pip install -U mm_poe
 ```
 
-## Using Yellowbrick
+### Install it from source
+
+You can also install MM-PoE from source as follows.
+
+```bash
+$ git clone https://github.com/souradipp76/MM-PoE.git
+$ cd MM-PoE
+$ make install
+```
+
+## Usage
 
-Here is a typical example usage of MM-POE:
+Here is a typical example usage of MM-PoE.
 
-### Running the CLI
+### Running CLI
 
 ```bash
 $ python -m mm_poe
@@ -55,7 +55,17 @@ $ python -m mm_poe
 $ mm_poe
 ```
 
-## Contributing to MM-POE
+### Running Experiments
+
+```bash
+$ bash 7_main_exp.sh
+#or
+$ bash 9_mask_vqa.sh
+#or
+$ bash 11_few_shot_vqa.sh
+```
+
+## Contributing
 
 MM-POE is an open-source project that is supported by a community who will gratefully and humbly accept any contributions you might make to the project.
 
@@ -63,8 +73,12 @@ If you are interested in contributing, read the [CONTRIBUTING.md](CONTRIBUTING.m
 
 - Submit a bug report or feature request on [GitHub Issues](https://github.com/souradipp76/MM-PoE/issues).
 - Add to the documentation or help with our website.
-- Write [unit or integration tests]() for our project.
+- Write unit or integration tests for our project.
 - Answer questions on our issues, mailing list, Stack Overflow, and elsewhere.
 - Write a blog post, tweet, or share our project with others.
 
 As you can see, there are lots of ways to get involved, and we would be very happy for you to join us!
+
+## License
+
+Read the [LICENSE](LICENSE) file.

From 2d94912f16fa8dc405ee69031de1868ce70c9a39 Mon Sep 17 00:00:00 2001
From: Souradip Pal <souradip.iitg@gmail.com>
Date: Wed, 16 Oct 2024 21:41:33 -0500
Subject: [PATCH 23/30] Updated README and paper.

---
 README.md      |  2 +-
 paper/paper.md | 26 +++++++++++---------------
 2 files changed, 12 insertions(+), 16 deletions(-)

diff --git a/README.md b/README.md
index de6a84b..4ee8e8d 100644
--- a/README.md
+++ b/README.md
@@ -4,7 +4,7 @@
 [![CI](https://github.com/souradipp76/MM-PoE/actions/workflows/main.yml/badge.svg)](https://github.com/souradipp76/MM-PoE/actions/workflows/main.yml)
 
 
-**Multiple Choice Reasoning tool via. Process of Elimination using Multi-Modal models**
+**Multiple Choice Reasoning via. Process of Elimination using Multi-Modal models**
 
 
 ## What is MM-PoE?
diff --git a/paper/paper.md b/paper/paper.md
index 9ea3533..96676d4 100644
--- a/paper/paper.md
+++ b/paper/paper.md
@@ -1,11 +1,12 @@
 ---
-title: 'MM-POE'
+title: 'MM-PoE: Multiple Choice Reasoning via. Process of Elimination using Multi-Modal models'
 tags:
   - machine learning
-  - Large Language Models
-  - Multi-modal
+  - large language models
+  - multi-modal
   - python
-  - Multiple Choice Question Answering
+  - multiple choice reasoning
+  - visual question answering
 authors:
   - name: Sayak Chakrabarty
     affiliation: 1
@@ -23,22 +24,17 @@ bibliography: paper.bib
 
 # Summary
 
-**Statement of Need**
+# Statement of Need
 
-Language models (LMs) excel at in-context learning for multiple choice reasoning tasks but often treat all options equally, unlike humans who typically eliminate incorrect choices before selecting the correct answer. This discrepancy can limit the effectiveness of LMs in accurately solving such tasks. To address this, we introduce the Process of Elimination (POE), a two-step scoring method designed to enhance LM performance by mimicking human reasoning strategies. 
+Language models (LMs) excel at in-context learning for multiple choice reasoning tasks but often treat all options equally, unlike humans who typically eliminate incorrect choices before selecting the correct answer. Same is true in case of visual question answering tasks with multiple choices. This discrepancy can limit the effectiveness of vision language models in accurately solving such tasks. To address this, we introduce Multi-Modal Process of Elimination (MM-PoE), a two-step scoring method designed to enhance VLM performance by mimicking human reasoning strategies in multi-modal settings. 
 
-In the first step, POE evaluates and scores each option, systematically eliminating those that appear incorrect. The second step involves masking these eliminated options, allowing the LM to focus solely on the remaining viable choices to make a final prediction. Our zero-shot experiments across eight reasoning tasks demonstrate POE's effectiveness, particularly excelling in logical reasoning scenarios. Additionally, POE proves adaptable to few-shot settings and is compatible with large language models (LLMs) like ChatGPT.
+In the first step, the method evaluates and scores each option, systematically eliminating those that appear incorrect. The second step involves masking these eliminated options, allowing the VLM to focus solely on the remaining viable choices to make a final prediction. Our zero-shot experiments across three datasets demonstrate MM-PoE's effectiveness, particularly excelling in logical reasoning scenarios . Additionally, MM-PoE proves adaptable to few-shot settings and is compatible with large language models (LLMs) like ChatGPT.
 
-By implementing POE, researchers and practitioners can significantly improve the accuracy and reliability of LMs in multiple choice reasoning tasks, making it a valuable tool for advancing machine learning model selection and evaluation.
+By implementing MM-PoE, researchers and practitioners can experiment and significantly improve the accuracy and reliability of VLMs in multiple choice reasoning tasks, making it a valuable tool for advancing machine learning models for visual reasoning.
 
-Implemented in Python, the Yellowbrick visualization package achieves steering by extending both scikit-learn [@sklearn] and Matplotlib [@matplotlib]. Like Yellowbrick, both scikit-learn and Matplotlib are extensions of SciPy [@scipy], libraries intended to facilitate scientific computing. Scikit-learn provides a generalized API for machine learning by exposing the concept of an `Estimator`, an object that learns from data. Yellowbrick in turn extends this concept with the idea of a `Visualizer`, an object that both learns from data and visualizes the result. Visualizers wrap Matplotlib procedures to produce publication-ready figures and rich visual analytics.
 
-
-![Feature Analysis](figures/feature_analysis.png)
-
-Because “more data beats better algorithms” [@rajaraman2008more], the first step to creating valid, predictive models is to find the minimum set of features that predicts the dependent variable. Generally, this means finding features that describe data in high dimensional space that are *separable* (i.e., by a hyperplane). Tools like `RadViz`, `ParallelCoordinates`, and `Manifold` help visualize high dimensional data for quick diagnostics. Bayesian models and regressions suffer when independent variables are collinear (i.e., exhibit pairwise correlation). `Rank2D` visualizations show pairwise correlations among features and can facilitate feature elimination.
-
-**State of the field: Do the authors describe how this software compares to other commonly-used packages?**
+# State of the Field
+Do the authors describe how this software compares to other commonly-used packages?
 
 # Acknowledgements
 

From 24043ba9ee2a816334144aee32e0decb884abadc Mon Sep 17 00:00:00 2001
From: hellokayas <pidnas94335@gmail.com>
Date: Thu, 17 Oct 2024 00:17:51 -0500
Subject: [PATCH 24/30] paper

---
 paper/paper.bib | 304 ++++++++++++++++++++++++++++++++----------------
 paper/paper.md  |  40 +++++++
 2 files changed, 241 insertions(+), 103 deletions(-)

diff --git a/paper/paper.bib b/paper/paper.bib
index 875f1cd..870426b 100644
--- a/paper/paper.bib
+++ b/paper/paper.bib
@@ -1,103 +1,201 @@
-@article{zenodo,
-  author      = {Bengfort, Benjamin and
-                Bilbro, Rebecca and
-                Danielsen, Nathan and
-                Gray, Larry and
-                others},
-  title       = {Yellowbrick},
-  month       = Nov,
-  year        = 2018,
-  doi         = {10.5281/zenodo.1206239},
-  url         = {https://doi.org/10.5281/zenodo.1206239}
-}
-
-@article{sklearn,
- title        = {Scikit-learn: Machine Learning in {P}ython},
- author       = {Pedregosa, F. and Varoquaux, G. and Gramfort, A. and Michel, V.
- and Thirion, B. and Grisel, O. and Blondel, M. and Prettenhofer, P.
- and Weiss, R. and Dubourg, V. and Vanderplas, J. and Passos, A. and
- Cournapeau, D. and Brucher, M. and Perrot, M. and Duchesnay, E.},
- journal      = {Journal of Machine Learning Research},
- volume       = {12},
- pages        = {2825--2830},
- year         = {2011}
-}
-
-@article{matplotlib,
-  author       = {Hunter, J. D.},
-  title        = {Matplotlib: A 2D graphics environment},
-  journal      = {Computing In Science \& Engineering},
-  volume       = {9},
-  number       = {3},
-  pages        = {90--95},
-  abstract     = {Matplotlib is a 2D graphics package used for Python
-  for application development, interactive scripting, and
-  publication-quality image generation across user
-  interfaces and operating systems.},
-  publisher    = {IEEE COMPUTER SOC},
-  doi          = {10.1109/MCSE.2007.55},
-  year         = 2007
-}
-
-@misc{scipy,
-  author       = {Eric Jones and Travis Oliphant and Pearu Peterson and others},
-  title        = {{SciPy}: Open source scientific tools for {Python}},
-  year         = {2001--},
-  url          = "http://www.scipy.org/",
-  note         = {[Online; accessed 2018-07-30]}
-}
-
-@article{kumar2016model,
-  title        = {Model selection management systems: The next frontier of advanced analytics},
-  author       = {Kumar, Arun and McCann, Robert and Naughton, Jeffrey and Patel, Jignesh M},
-  journal      = {ACM SIGMOD Record},
-  volume       = {44},
-  number       = {4},
-  pages        = {17--22},
-  year         = {2016},
-  publisher    = {ACM},
-  doi          = {10.1145/2935694.2935698}
-}
-
-@article{liu_wang_liu_zhu_2017,
-  title        = {Towards better analysis of machine learning models: A visual analytics perspective},
-  volume       = {1},
-  url          = {https://www.sciencedirect.com/science/article/pii/S2468502X17300086},
-  doi          = {10.1016/j.visinf.2017.01.006},
-  number       = {1},
-  journal      = {Visual Informatics},
-  author       = {Liu, Shixia and Wang, Xiting and Liu, Mengchen and Zhu, Jun},
-  year         = {2017},
-  month        = {Mar},
-  pages        = {48–56}
-}
-
-@article{wickham_visualizing_2015,
-  title        = {Visualizing Statistical Models: {{Removing}} the Blindfold},
-  volume       = {8},
-  timestamp    = {2016-09-07T00:14:28Z},
-  doi          = {10.1002/sam.11271},
-  number       = {4},
-  urldate      = {2015-10-26},
-  journal      = {Statistical Analysis and Data Mining: The ASA Data Science Journal},
-  author       = {Wickham, Hadley and Cook, Dianne and Hofmann, Heike},
-  year         = {2015},
-  pages        = {203--225}
-}
-
-@inproceedings{kapoor2010interactive,
-  title        = {Interactive optimization for steering machine classification},
-  author       = {Kapoor, Ashish and Lee, Bongshin and Tan, Desney and Horvitz, Eric},
-  booktitle    = {Proceedings of the SIGCHI Conference on Human Factors in Computing Systems},
-  pages        = {1343--1352},
-  year         = {2010},
-  organization = {ACM},
-  doi          = {10.1145/1753326.1753529}
-}
-
-@article{rajaraman2008more,
-  title        = {More data usually beats better algorithms},
-  author       = {Rajaraman, Anand},
-  journal      = {Datawocky Blog},
-  year         = {2008}
-}
\ No newline at end of file
+@inproceedings{brown2020language,
+  title = {Language models are few-shot learners},
+  author = {Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and others},
+  booktitle = {Advances in neural information processing systems},
+  volume = {33},
+  pages = {1877--1901},
+  year = {2020},
+  organization = {Curran Associates, Inc.}
+}
+
+@inproceedings{du2020event,
+  title = {Event extraction by answering (almost) natural questions},
+  author = {Du, Xinya and Cardie, Claire},
+  booktitle = {Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)},
+  pages = {671--683},
+  year = {2020},
+  organization = {Association for Computational Linguistics}
+}
+
+@article{fei2023mitigating,
+  title = {Mitigating label biases for in-context learning},
+  author = {Fei, Yu and Hou, Yifan and Chen, Zeming and Bosselut, Antoine},
+  journal = {Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
+  pages = {14014--14031},
+  year = {2023},
+  organization = {Association for Computational Linguistics}
+}
+
+@article{freivalds2002learning,
+  title = {Learning by the process of elimination},
+  author = {Freivalds, Rusins and Karpinski, Marek and Smith, Carl H and Wiehagen, Rolf},
+  journal = {Inf. Comput.},
+  volume = {176},
+  pages = {37--50},
+  year = {2002}
+}
+
+@inproceedings{holtzman2021surface,
+  title = {Surface form competition: Why the highest probability answer isn’t always right},
+  author = {Holtzman, Ari and West, Peter and Shwartz, Vered and Choi, Yejin and Zettlemoyer, Luke},
+  booktitle = {Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing},
+  pages = {7038--7051},
+  year = {2021},
+  organization = {Association for Computational Linguistics}
+}
+
+@inproceedings{kojima2022large,
+  title = {Large language models are zero-shot reasoners},
+  author = {Kojima, Takeshi and Gu, Shixiang Shane and Reid, Machel and Matsuo, Yutaka and Iwasawa, Yusuke},
+  booktitle = {Advances in Neural Information Processing Systems},
+  year = {2022}
+}
+
+@inproceedings{lyu2023z,
+  title = {Z-ICL: Zero-shot in-context learning with pseudo-demonstrations},
+  author = {Lyu, Xinxi and Min, Sewon and Beltagy, Iz and Zettlemoyer, Luke and Hajishirzi, Hannaneh},
+  booktitle = {ICLR 2023 Workshop on Mathematical and Empirical Understanding of Foundation Models},
+  year = {2023}
+}
+
+@article{malkin2022coherence,
+  title = {Coherence boosting: When your pretrained language model is not paying enough attention},
+  author = {Malkin, Nikolay and Wang, Zhen and Jojic, Nebojsa},
+  journal = {Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
+  pages = {8214--8236},
+  year = {2022},
+  organization = {Association for Computational Linguistics}
+}
+
+@inproceedings{min2022noisy,
+  title = {Noisy channel language model prompting for few-shot text classification},
+  author = {Min, Sewon and Lewis, Mike and Hajishirzi, Hannaneh and Zettlemoyer, Luke},
+  booktitle = {Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
+  pages = {5316--5330},
+  year = {2022},
+  organization = {Association for Computational Linguistics}
+}
+
+@inproceedings{nie2020adversarial,
+  title = {Adversarial NLI: A new benchmark for natural language understanding},
+  author = {Nie, Yixin and Williams, Adina and Dinan, Emily and Bansal, Mohit and Weston, Jason and Kiela, Douwe},
+  booktitle = {Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics},
+  pages = {4885--4901},
+  year = {2020},
+  organization = {Association for Computational Linguistics}
+}
+
+@inproceedings{ouyang2022training,
+  title = {Training language models to follow instructions with human feedback},
+  author = {Ouyang, Long and Wu, Jeffrey and Jiang, Xu and Almeida, Diogo and Wainwright, Carroll and Mishkin, Pamela and Zhang, Chong and Agarwal, Sandhini and Slama, Katarina and Gray, Alex and others},
+  booktitle = {Advances in Neural Information Processing Systems},
+  year = {2022}
+}
+
+@inproceedings{robinson2023leveraging,
+  title = {Leveraging large language models for multiple choice question answering},
+  author = {Robinson, Joshua and Wingate, David},
+  booktitle = {The Eleventh International Conference on Learning Representations},
+  year = {2023}
+}
+
+@inproceedings{sap2019social,
+  title = {Social IQa: Commonsense reasoning about social interactions},
+  author = {Sap, Maarten and Rashkin, Hannah and Chen, Derek and Le Bras, Ronan and Choi, Yejin},
+  booktitle = {Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP)},
+  pages = {4463--4473},
+  year = {2019},
+  organization = {Association for Computational Linguistics}
+}
+
+@inproceedings{seo2022debiasing,
+  title = {Debiasing event understanding for visual commonsense tasks},
+  author = {Seo, Minji and Jung, YeonJoon and Choi, Seungtaek and Hwang, Seungwon and Liu, Bei},
+  booktitle = {Findings of the Association for Computational Linguistics: ACL 2022},
+  pages = {782--787},
+  year = {2022},
+  organization = {Association for Computational Linguistics}
+}
+
+@article{shum2023automatic,
+  title = {Automatic prompt augmentation and selection with chain-of-thought from labeled data},
+  author = {Shum, KaShun and Diao, Shizhe and Zhang, Tong},
+  journal = {arXiv preprint arXiv:2302.12822},
+  year = {2023}
+}
+
+@inproceedings{srivastava2023beyond,
+  title = {Beyond the imitation game: Quantifying and extrapolating the capabilities of language models},
+  author = {Srivastava, Aarohi and Rastogi, Abhinav and Rao, Abhishek and Shoeb, Abu Awal Md and Abid, Abubakar and Fisch, Adam and Brown, Adam R and Santoro, Adam and Gupta, Aditya and Garriga-Alonso, Adrià and others},
+  booktitle = {Transactions on Machine Learning Research},
+  year = {2023}
+}
+
+@inproceedings{suzgun2023challenging,
+  title = {Challenging BIG-bench tasks and whether chain-of-thought can solve them},
+  author = {Suzgun, Mirac and Scales, Nathan and Schärli, Nathanael and Gehrmann, Sebastian and Tay, Yi and Chung, Hyung Won and Chowdhery, Aakanksha and Le, Quoc and Chi, Ed and Zhou, Denny and Wei, Jason},
+  booktitle = {Findings of the Association for Computational Linguistics: ACL 2023},
+  pages = {13003--13051},
+  year = {2023},
+  organization = {Association for Computational Linguistics}
+}
+
+@inproceedings{talmor2019commonsenseqa,
+  title = {CommonsenseQA: A question answering challenge targeting commonsense knowledge},
+  author = {Talmor, Alon and Herzig, Jonathan and Lourie, Nicholas and Berant, Jonathan},
+  booktitle = {Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers)},
+  pages = {4149--4158},
+  year = {2019},
+  organization = {Association for Computational Linguistics}
+}
+
+@article{touvron2023llama,
+  title = {Llama: Open and efficient foundation language models},
+  author = {Touvron, Hugo and Lavril, Thibaut and Izacard, Gautier and Martinet, Xavier and Lachaux, Marie-Anne and Lacroix, Timothée and Rozière, Baptiste and Goyal, Naman and Hambro, Eric and Azhar, Faisal and others},
+  journal = {arXiv preprint arXiv:2302.13971},
+  year = {2023}
+}
+
+@article{wang2023pinto,
+  title = {PINTO: Faithful language reasoning using prompt-generated rationales},
+  author = {Wang, PeiFeng and Chan, Aaron and Ilievski, Filip and Chen, Muhao and Ren, Xiang},
+  journal = {The Eleventh International Conference on Learning Representations},
+  year = {2023}
+}
+
+@article{wang2023bself,
+  title = {Self-consistency improves chain of thought reasoning in language models},
+  author = {Wang, Xuezhi and Wei, Jason and Schuurmans, Dale and Le, Quoc V and Chi, Ed H. and Narang, Sharan and Chowdhery, Aakanksha and Zhou, Denny},
+  journal = {The Eleventh International Conference on Learning Representations},
+  year = {2023}
+}
+
+@inproceedings{wei2022chain,
+  title = {Chain of thought prompting elicits reasoning in large language models},
+  author = {Wei, Jason and Wang, Xuezhi and Schuurmans, Dale and Bosma, Maarten and Ichtner, Brian and Xia, Fei and Chi, Ed H. and Le, Quoc V and Zhou, Denny},
+  booktitle = {Advances in Neural Information Processing Systems},
+  year = {2022}
+}
+
+@article{yao2023tree,
+  title = {Tree of thoughts: Deliberate problem solving with large language models},
+  author = {Yao, Shunyu and Yu, Dian and Zhao, Jeffrey and Shafran, Izhak and Griffiths, Thomas L and Cao, Yuan and Narasimhan, Karthik},
+  journal = {arXiv preprint arXiv:2305.10601},
+  year = {2023}
+}
+
+@article{ye2023satisfiability,
+  title = {Satisfiability-aided language models using declarative prompting},
+  author = {Ye, Xi and Chen, Qiaochu and Dillig, Isil and Durrett, Greg},
+  journal = {arXiv preprint arXiv:2305.09656},
+  year = {2023}
+}
+
+@article{zhao2021calibrate,
+  title = {Calibrate before use: Improving few-shot performance of language models},
+  author = {Zhao, Zihao and Wallace, Eric and Feng, Shi and Klein, Dan and Singh, Sameer},
+  journal = {International Conference on Machine Learning},
+  pages = {12697--12706},
+  year = {2021},
+  organization = {PMLR}
+}
diff --git a/paper/paper.md b/paper/paper.md
index 96676d4..2b94220 100644
--- a/paper/paper.md
+++ b/paper/paper.md
@@ -36,6 +36,46 @@ By implementing MM-PoE, researchers and practitioners can experiment and signifi
 # State of the Field
 Do the authors describe how this software compares to other commonly-used packages?
 
+## Abstract
+
+This paper introduces the Process of Elimination (POE), a method to enhance language models' performance on multiple-choice reasoning by employing a two-step scoring system that first eliminates incorrect options and then predicts from the remaining ones. Our experiments across eight reasoning tasks show the method's effectiveness, particularly in logical reasoning tasks.
+
+## 1. Introduction
+
+Humans typically approach multiple-choice questions by eliminating wrong answers before selecting the correct one. We hypothesize that a similar approach can improve language model (LM) performance on these tasks. Our method, POE, adopts this two-step elimination and prediction strategy, showing promise in preliminary zero-shot experiments across various reasoning tasks [Brown et al., 2020].
+
+## 2. Method
+
+The POE operates in two phases:
+1. **Elimination**: Score each option and eliminate those below the average score.
+2. **Prediction**: Use a binary mask to ignore eliminated options and predict from the remaining ones.
+
+This method leverages the existing capabilities of LMs in scoring options and enhances decision-making by focusing only on plausible answers.
+
+## 3. Experiment Setup
+
+We evaluated POE on eight diverse reasoning tasks using FLAN-T5-XL and compared it against five baseline scoring methods. Accuracy was the primary metric for evaluation.
+
+## 4. Results
+
+POE consistently outperformed or matched the best-performing baselines across all tasks, showing particular strength in logical reasoning. The method's effectiveness in separating elimination and prediction tasks was crucial to its success.
+
+## 5. Analysis
+
+Further analysis revealed that POE's strengths lie particularly in tasks requiring logical reasoning. It effectively applies a masking strategy to focus the model's attention on likely correct options, improving both interpretability and factual adherence.
+
+## 6. Conclusion
+
+POE demonstrates a significant improvement in handling multiple choice reasoning tasks by mimicking a human-like process of elimination approach. Future work will focus on enhancing its generalizability and efficiency, possibly extending to few-shot settings and other modalities.
+
+## Limitations
+
+The current implementation of POE does not completely disregard eliminated options, potentially limiting its effectiveness. Optimizing the prompt and testing in few-shot scenarios remain areas for future improvement.
+
+## Ethics Statement
+
+While this model uses publicly available tasks and models, users should be aware of potential biases in the data and model outputs.
+
 # Acknowledgements
 
 We have used the server provided by Northwestern University for building this software.

From 6f59231703850dd0fef1d187b34c67e54e175558 Mon Sep 17 00:00:00 2001
From: hellokayas <pidnas94335@gmail.com>
Date: Thu, 17 Oct 2024 00:31:42 -0500
Subject: [PATCH 25/30] paper

---
 paper/paper.md | 114 ++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 98 insertions(+), 16 deletions(-)

diff --git a/paper/paper.md b/paper/paper.md
index 2b94220..ebc2c5f 100644
--- a/paper/paper.md
+++ b/paper/paper.md
@@ -24,6 +24,8 @@ bibliography: paper.bib
 
 # Summary
 
+This paper introduces the Process of Elimination (POE), a method to enhance language models' performance on multiple-choice reasoning by employing a two-step scoring system that first eliminates incorrect options and then predicts from the remaining ones. Our experiments across eight reasoning tasks show the method's effectiveness, particularly in logical reasoning tasks.
+
 # Statement of Need
 
 Language models (LMs) excel at in-context learning for multiple choice reasoning tasks but often treat all options equally, unlike humans who typically eliminate incorrect choices before selecting the correct answer. Same is true in case of visual question answering tasks with multiple choices. This discrepancy can limit the effectiveness of vision language models in accurately solving such tasks. To address this, we introduce Multi-Modal Process of Elimination (MM-PoE), a two-step scoring method designed to enhance VLM performance by mimicking human reasoning strategies in multi-modal settings. 
@@ -34,44 +36,124 @@ By implementing MM-PoE, researchers and practitioners can experiment and signifi
 
 
 # State of the Field
-Do the authors describe how this software compares to other commonly-used packages?
 
-## Abstract
+A common strategy for answering multiple-choice questions, especially under examination conditions, involves a process of elimination where incorrect answers are systematically discarded to narrow down the choices to the most likely correct one. This approach, grounded in everyday test-taking strategies, contrasts with how current language models (LMs) handle multiple-choice reasoning tasks. Typically, LMs evaluate each option independently or collectively without actively discarding less likely answers, potentially reducing their effectiveness in distinguishing the best choice from plausible distractors.
 
-This paper introduces the Process of Elimination (POE), a method to enhance language models' performance on multiple-choice reasoning by employing a two-step scoring system that first eliminates incorrect options and then predicts from the remaining ones. Our experiments across eight reasoning tasks show the method's effectiveness, particularly in logical reasoning tasks.
+This paper argues that language models can benefit from an explicit two-step reasoning process akin to human problem-solving techniques. The proposed method, dubbed the Process of Elimination (POE), enhances the decision-making process by first scoring and then eliminating options that are seemingly incorrect before focusing on selecting the correct answer from the remaining choices. This method is designed to align with natural human reasoning by replicating how individuals often approach multiple-choice questions, particularly under the constraint of time and accuracy, as frequently experienced in academic testing environments.
+
+Our hypothesis posits that language models, when equipped with a mechanism to discard implausible answers systematically, can achieve better performance on multiple-choice reasoning tasks. This is particularly relevant in the context of logical reasoning, where the elimination of clearly incorrect options can simplify the decision process and potentially lead to more accurate outcomes. This idea is supported by previous work demonstrating the effectiveness of LMs in various reasoning tasks when adapted to more human-like reasoning methods [Brown et al., 2020; Holtzman et al., 2021].
+
+In the development of POE, we draw inspiration from the established capabilities of LMs to handle complex reasoning tasks [Brown et al., 2020] and the known strategies that humans employ in test-taking scenarios. The approach builds on the foundational work in language modeling likelihood [Brown et al., 2020], which demonstrates the LMs' ability to perform in-context learning. By incorporating a structured process to eliminate unlikely choices, POE aims to refine this capability, making it more targeted and efficient in dealing with the nuanced challenges presented by multiple-choice questions.
 
-## 1. Introduction
+The effectiveness of this approach is underscored through zero-shot experiments across a diverse set of reasoning tasks, illustrating that the integration of human-like elimination strategies can significantly enhance the performance of language models. This paper aims to show that by mimicking human reasoning processes, we can make LMs not only perform better on standardized reasoning tasks but also behave in ways that are more interpretable and aligned with human cognitive processes.
 
-Humans typically approach multiple-choice questions by eliminating wrong answers before selecting the correct one. We hypothesize that a similar approach can improve language model (LM) performance on these tasks. Our method, POE, adopts this two-step elimination and prediction strategy, showing promise in preliminary zero-shot experiments across various reasoning tasks [Brown et al., 2020].
 
 ## 2. Method
 
-The POE operates in two phases:
-1. **Elimination**: Score each option and eliminate those below the average score.
-2. **Prediction**: Use a binary mask to ignore eliminated options and predict from the remaining ones.
+The Process of Elimination (POE) introduced in this paper operates on a two-step mechanism designed to enhance the decision-making capabilities of language models (LMs) in multiple-choice reasoning tasks. This method employs a novel approach to option elimination followed by a focused prediction phase. The strategy is rooted in the belief that separating the elimination of clearly incorrect options from the choice of the best remaining option will improve overall task performance.
+
+### Problem Setting
+
+Given a multiple-choice reasoning task, we define the problem setting as follows:
+
+- Let \( x \) be the question or context provided.
+- Let \( Y = \{y_1, y_2, \ldots, y_n\} \) be the set of multiple-choice options available.
+- Let \( y \) be the correct answer from \( Y \).
+
+The goal is to develop an in-context learning method that accurately selects \( y \) from \( Y \) given \( x \).
+
+### Two-Step Scoring Method
+
+#### Step 1: Elimination
+
+In the first step of the POE method, each option \( y_i \) is scored based on a specified metric. The score function, \( \text{score}(x, y_i) \), evaluates each option's plausibility given the question \( x \). The scores are used to eliminate options that are deemed less likely to be correct. Specifically, options whose scores are below the average score are eliminated. This is calculated as follows:
+
+\[ s_i = \text{score}(x, y_i) \]
+\[ Y_{\text{wrong}} = \{y_i \mid s_i < \text{avg}(s_1, \ldots, s_n)\} \]
+
+This elimination strategy intuitively aligns with how humans often discard options that seem clearly incorrect before carefully considering the remaining choices.
+
+#### Step 2: Prediction
+
+The second step involves making the final choice from the non-eliminated options. This step utilizes a binary mask to exclude the eliminated options during the prediction phase. The mask for each option \( y_i \) is defined as follows:
+
+\[ m_i = \begin{cases} 
+0 & \text{if } y_i \in Y_{\text{wrong}} \\
+1 & \text{otherwise}
+\end{cases} \]
+
+The masked context \( x_{\text{mask}} \) is then constructed by modifying the original context \( x \) to include only the options for which \( m_i = 1 \). Each option is scored again, but this time within the context that explicitly excludes the eliminated options, possibly by using a template \( T \) that masks out \( Y_{\text{wrong}} \) in the presentation of the options:
+
+\[ x_{\text{mask}} = T(x, Y, \text{mask}) \]
+
+The final predicted answer \( \hat{y} \) is then the option with the highest score among the remaining options:
 
-This method leverages the existing capabilities of LMs in scoring options and enhances decision-making by focusing only on plausible answers.
+\[ \hat{y} = \arg\max_{i \mid m_i = 1} \text{score}(x_{\text{mask}}, y_i) \]
+
+### Implementation Considerations
+
+The effectiveness of POE hinges on the robustness of the scoring function and the accuracy of the elimination step. The scoring function can be any LM-based likelihood estimator, such as language modeling likelihood or any of its alternatives like average log probability or calibrated log probability. Our implementation tests multiple such scoring functions to identify the most effective ones in both eliminating implausible options and accurately selecting the final answer.
+
+The POE method is designed to be model-agnostic, meaning it can be implemented using any existing LM capable of scoring text options, and it is flexible enough to be adapted to different types of multiple-choice questions across various domains.
 
 ## 3. Experiment Setup
 
-We evaluated POE on eight diverse reasoning tasks using FLAN-T5-XL and compared it against five baseline scoring methods. Accuracy was the primary metric for evaluation.
+To evaluate the effectiveness of the Process of Elimination (POE), we designed an experimental framework that tests the method across a diverse set of reasoning tasks. This setup aims to compare POE with existing scoring methods to highlight its potential improvements in accuracy and reasoning capability.
+
+### Data
+
+Our experiments were conducted on eight different multiple-choice reasoning tasks, selected to cover a broad spectrum of reasoning types and complexities. These tasks include both traditional reasoning tasks and more specialized ones designed to test specific reasoning skills. To ensure a comprehensive evaluation, we used test sets from established benchmarks when available; otherwise, we utilized development sets.
+
+### Model
+
+For the core experiments, we utilized the FLAN-T5-XL model, chosen for its balance between computational efficiency and performance in instruction-tuned language tasks. This model has demonstrated strong capabilities in handling various NLP tasks and serves as a robust platform for evaluating our POE method.
+
+### Baseline Methods
+
+We compared POE against five baseline scoring methods to assess its relative performance:
+
+1. **Language Modeling (LM):** This baseline uses the raw language modeling likelihood as the scoring function.
+2. **Average Language Modeling (AVG):** This method averages the log probabilities across all tokens in the option.
+3. **Calibration:** This involves adjusting the LM scores based on calibration techniques that aim to correct for the model's confidence.
+4. **Channel:** Channel methods score each option based on how likely the question is given the option, which reverses the typical conditional probability used in LMs.
+5. **Multiple Choice Prompting (MCP):** This approach formats the input by presenting the question followed by all options, prompting the model to select the most likely option.
+
+Each method provides a different approach to scoring options, allowing for a comprehensive comparison of how each interacts with the structure and strategy of POE.
+
+### Settings
+
+Our experiments primarily focused on a zero-shot setting to evaluate the generalization capabilities of POE without any task-specific tuning. Accuracy was used as the main metric for performance evaluation, with results averaged over multiple seeds to ensure robustness.
+
+To further explore the versatility of POE, we also examined its performance in few-shot settings by incorporating examples into the model's input, aiming to observe any changes in effectiveness when provided with context-specific demonstrations.
+
+### Implementation Details
+
+For each task, we implemented the scoring and prediction steps of POE as described in the Methods section. The scoring functions were carefully chosen based on their theoretical alignment with the two-step elimination and prediction philosophy of POE. We conducted extensive parameter tuning and optimization to maximize the performance of both the elimination step and the final prediction accuracy.
+
+This experiment setup was designed to rigorously test the effectiveness of POE across a range of reasoning tasks and compare its performance against standard baseline methods. The results of these experiments are intended to demonstrate the potential benefits of integrating a process of elimination approach into language model reasoning strategies for multiple-choice questions.
+
 
 ## 4. Results
 
 POE consistently outperformed or matched the best-performing baselines across all tasks, showing particular strength in logical reasoning. The method's effectiveness in separating elimination and prediction tasks was crucial to its success.
 
-## 5. Analysis
+| Task | MCP  | PoE  | PoE - MCP |
+|------|------|------|-----------|
+| LA   | 50.0 | 68.8 | +18.8     |
+| IMT  | 34.0 | 47.2 | +13.2     |
+| CLD  | 67.2 | 75.9 | +8.7      |
+| RACO | 53.8 | 60.6 | +6.8      |
+| CAI  | 84.1 | 81.8 | -2.3      |
+| EIE  | 25.0 | 19.1 | -5.9      |
+| RS   | 55.1 | 49.0 | -6.1      |
+| IOM  | 56.2 | 50.0 | -6.2      |
 
-Further analysis revealed that POE's strengths lie particularly in tasks requiring logical reasoning. It effectively applies a masking strategy to focus the model's attention on likely correct options, improving both interpretability and factual adherence.
+**Table 2**: Comparison of MCP and PoE accuracy scores on 8 new tasks. The top 4 tasks are logical reasoning tasks. PoE largely outperforms MCP on 4 logical reasoning tasks, and underperforms MCP on other 4 tasks.
 
 ## 6. Conclusion
 
 POE demonstrates a significant improvement in handling multiple choice reasoning tasks by mimicking a human-like process of elimination approach. Future work will focus on enhancing its generalizability and efficiency, possibly extending to few-shot settings and other modalities.
 
-## Limitations
-
-The current implementation of POE does not completely disregard eliminated options, potentially limiting its effectiveness. Optimizing the prompt and testing in few-shot scenarios remain areas for future improvement.
-
 ## Ethics Statement
 
 While this model uses publicly available tasks and models, users should be aware of potential biases in the data and model outputs.

From 43d79dff040d24a1e5ec6f9069fd198030ba3bbb Mon Sep 17 00:00:00 2001
From: hellokayas <pidnas94335@gmail.com>
Date: Thu, 17 Oct 2024 00:45:38 -0500
Subject: [PATCH 26/30] paper

---
 paper/paper.md | 36 ++++++++++++++++++++++--------------
 1 file changed, 22 insertions(+), 14 deletions(-)

diff --git a/paper/paper.md b/paper/paper.md
index ebc2c5f..234ec4a 100644
--- a/paper/paper.md
+++ b/paper/paper.md
@@ -56,39 +56,47 @@ The Process of Elimination (POE) introduced in this paper operates on a two-step
 
 Given a multiple-choice reasoning task, we define the problem setting as follows:
 
-- Let \( x \) be the question or context provided.
-- Let \( Y = \{y_1, y_2, \ldots, y_n\} \) be the set of multiple-choice options available.
-- Let \( y \) be the correct answer from \( Y \).
+- Let \(x\) be the question or context provided.
+- Let \(Y = \{y_1, y_2, \ldots, y_n\}\) be the set of multiple-choice options available.
+- Let \(y\) be the correct answer from \(Y\).
 
-The goal is to develop an in-context learning method that accurately selects \( y \) from \( Y \) given \( x \).
+The goal is to develop an in-context learning method that accurately selects \(y\) from \(Y\) given \(x\).
 
 ### Two-Step Scoring Method
 
 #### Step 1: Elimination
 
-In the first step of the POE method, each option \( y_i \) is scored based on a specified metric. The score function, \( \text{score}(x, y_i) \), evaluates each option's plausibility given the question \( x \). The scores are used to eliminate options that are deemed less likely to be correct. Specifically, options whose scores are below the average score are eliminated. This is calculated as follows:
+In the first step of the POE method, each option \(y_i\) is scored based on a specified metric. The score function, \(\text{score}(x, y_i)\), evaluates each option's plausibility given the question \(x\). The scores are used to eliminate options that are deemed less likely to be correct. Specifically, options whose scores are below the average score are eliminated. This is calculated as follows:
 
-\[ s_i = \text{score}(x, y_i) \]
-\[ Y_{\text{wrong}} = \{y_i \mid s_i < \text{avg}(s_1, \ldots, s_n)\} \]
+```markdown
+s_i = \text{score}(x, y_i)
+Y_{\text{wrong}} = \{y_i | s_i < \text{avg}(s_1, \ldots, s_n)\}
+```
 
 This elimination strategy intuitively aligns with how humans often discard options that seem clearly incorrect before carefully considering the remaining choices.
 
 #### Step 2: Prediction
 
-The second step involves making the final choice from the non-eliminated options. This step utilizes a binary mask to exclude the eliminated options during the prediction phase. The mask for each option \( y_i \) is defined as follows:
+The second step involves making the final choice from the non-eliminated options. This step utilizes a binary mask to exclude the eliminated options during the prediction phase. The mask for each option \(y_i\) is defined as follows:
 
-\[ m_i = \begin{cases} 
+```markdown
+m_i = \begin{cases} 
 0 & \text{if } y_i \in Y_{\text{wrong}} \\
 1 & \text{otherwise}
-\end{cases} \]
+\end{cases}
+```
 
-The masked context \( x_{\text{mask}} \) is then constructed by modifying the original context \( x \) to include only the options for which \( m_i = 1 \). Each option is scored again, but this time within the context that explicitly excludes the eliminated options, possibly by using a template \( T \) that masks out \( Y_{\text{wrong}} \) in the presentation of the options:
+The masked context \(x_{\text{mask}}\) is then constructed by modifying the original context \(x\) to include only the options for which \(m_i = 1\). Each option is scored again, but this time within the context that explicitly excludes the eliminated options, possibly by using a template \(T\) that masks out \(Y_{\text{wrong}}\) in the presentation of the options:
 
-\[ x_{\text{mask}} = T(x, Y, \text{mask}) \]
+```markdown
+x_{\text{mask}} = T(x, Y, \text{mask})
+```
 
-The final predicted answer \( \hat{y} \) is then the option with the highest score among the remaining options:
+The final predicted answer \(\hat{y}\) is then the option with the highest score among the remaining options:
 
-\[ \hat{y} = \arg\max_{i \mid m_i = 1} \text{score}(x_{\text{mask}}, y_i) \]
+```markdown
+\hat{y} = \arg\max_{i | m_i = 1} \text{score}(x_{\text{mask}}, y_i)
+```
 
 ### Implementation Considerations
 

From bab304810fdce79c0c39d154c9c814d071ae7fbb Mon Sep 17 00:00:00 2001
From: hellokayas <pidnas94335@gmail.com>
Date: Thu, 17 Oct 2024 01:06:49 -0500
Subject: [PATCH 27/30] paper

---
 paper/paper.bib | 16 ++++++++++++++++
 paper/paper.md  | 30 +++++++++++++++---------------
 2 files changed, 31 insertions(+), 15 deletions(-)

diff --git a/paper/paper.bib b/paper/paper.bib
index 870426b..22607be 100644
--- a/paper/paper.bib
+++ b/paper/paper.bib
@@ -17,6 +17,14 @@ @inproceedings{du2020event
   organization = {Association for Computational Linguistics}
 }
 
+@article{datta2024consistency,
+  title={On the consistency of maximum likelihood estimation of probabilistic principal component analysis},
+  author={Datta, Arghya and Chakrabarty, Sayak},
+  journal={Advances in Neural Information Processing Systems},
+  volume={36},
+  year={2024}
+}
+
 @article{fei2023mitigating,
   title = {Mitigating label biases for in-context learning},
   author = {Fei, Yu and Hou, Yifan and Chen, Zeming and Bosselut, Antoine},
@@ -51,6 +59,14 @@ @inproceedings{kojima2022large
   year = {2022}
 }
 
+@article{zhang2023dynamically,
+  title={: A Dynamically Adaptive Defense to a Novel Attack on Review Fraud Detection Engines},
+  author={Zhang, Youzhi and Chakrabarty, Sayak and Liu, Rui and Pugliese, Andrea and Subrahmanian, VS},
+  journal={IEEE Transactions on Computational Social Systems},
+  year={2023},
+  publisher={IEEE}
+}
+
 @inproceedings{lyu2023z,
   title = {Z-ICL: Zero-shot in-context learning with pseudo-demonstrations},
   author = {Lyu, Xinxi and Min, Sewon and Beltagy, Iz and Zettlemoyer, Luke and Hajishirzi, Hannaneh},
diff --git a/paper/paper.md b/paper/paper.md
index 234ec4a..c6844c6 100644
--- a/paper/paper.md
+++ b/paper/paper.md
@@ -41,7 +41,7 @@ A common strategy for answering multiple-choice questions, especially under exam
 
 This paper argues that language models can benefit from an explicit two-step reasoning process akin to human problem-solving techniques. The proposed method, dubbed the Process of Elimination (POE), enhances the decision-making process by first scoring and then eliminating options that are seemingly incorrect before focusing on selecting the correct answer from the remaining choices. This method is designed to align with natural human reasoning by replicating how individuals often approach multiple-choice questions, particularly under the constraint of time and accuracy, as frequently experienced in academic testing environments.
 
-Our hypothesis posits that language models, when equipped with a mechanism to discard implausible answers systematically, can achieve better performance on multiple-choice reasoning tasks. This is particularly relevant in the context of logical reasoning, where the elimination of clearly incorrect options can simplify the decision process and potentially lead to more accurate outcomes. This idea is supported by previous work demonstrating the effectiveness of LMs in various reasoning tasks when adapted to more human-like reasoning methods [Brown et al., 2020; Holtzman et al., 2021].
+Our hypothesis posits that language models, when equipped with a mechanism to discard implausible answers systematically, can achieve better performance on multiple-choice reasoning tasks. This is particularly relevant in the context of logical reasoning, where the elimination of clearly incorrect options can simplify the decision process and potentially lead to more accurate outcomes. This idea is supported by previous work demonstrating the effectiveness of LMs in various reasoning tasks when adapted to more human-like reasoning methods[Holtzman et al., 2021].
 
 In the development of POE, we draw inspiration from the established capabilities of LMs to handle complex reasoning tasks [Brown et al., 2020] and the known strategies that humans employ in test-taking scenarios. The approach builds on the foundational work in language modeling likelihood [Brown et al., 2020], which demonstrates the LMs' ability to perform in-context learning. By incorporating a structured process to eliminate unlikely choices, POE aims to refine this capability, making it more targeted and efficient in dealing with the nuanced challenges presented by multiple-choice questions.
 
@@ -56,22 +56,22 @@ The Process of Elimination (POE) introduced in this paper operates on a two-step
 
 Given a multiple-choice reasoning task, we define the problem setting as follows:
 
-- Let \(x\) be the question or context provided.
-- Let \(Y = \{y_1, y_2, \ldots, y_n\}\) be the set of multiple-choice options available.
-- Let \(y\) be the correct answer from \(Y\).
+- Let $x$ be the question or context provided.
+- Let $Y = \{y_1, y_2, \ldots, y_n\}$ be the set of multiple-choice options available.
+- Let $y$ be the correct answer from $Y$.
 
-The goal is to develop an in-context learning method that accurately selects \(y\) from \(Y\) given \(x\).
+The goal is to develop an in-context learning method that accurately selects $y$ from $Y$ given $x$.
 
 ### Two-Step Scoring Method
 
 #### Step 1: Elimination
 
-In the first step of the POE method, each option \(y_i\) is scored based on a specified metric. The score function, \(\text{score}(x, y_i)\), evaluates each option's plausibility given the question \(x\). The scores are used to eliminate options that are deemed less likely to be correct. Specifically, options whose scores are below the average score are eliminated. This is calculated as follows:
+In the first step of the POE method, each option $y_i$ is scored based on a specified metric. The score function, $\text{score}(x, y_i)$, evaluates each option's plausibility given the question $x$. The scores are used to eliminate options that are deemed less likely to be correct. Specifically, options whose scores are below the average score are eliminated. This is calculated as follows:
 
-```markdown
+$$
 s_i = \text{score}(x, y_i)
 Y_{\text{wrong}} = \{y_i | s_i < \text{avg}(s_1, \ldots, s_n)\}
-```
+$$
 
 This elimination strategy intuitively aligns with how humans often discard options that seem clearly incorrect before carefully considering the remaining choices.
 
@@ -79,24 +79,24 @@ This elimination strategy intuitively aligns with how humans often discard optio
 
 The second step involves making the final choice from the non-eliminated options. This step utilizes a binary mask to exclude the eliminated options during the prediction phase. The mask for each option \(y_i\) is defined as follows:
 
-```markdown
+$$
 m_i = \begin{cases} 
 0 & \text{if } y_i \in Y_{\text{wrong}} \\
 1 & \text{otherwise}
 \end{cases}
-```
+$$
 
 The masked context \(x_{\text{mask}}\) is then constructed by modifying the original context \(x\) to include only the options for which \(m_i = 1\). Each option is scored again, but this time within the context that explicitly excludes the eliminated options, possibly by using a template \(T\) that masks out \(Y_{\text{wrong}}\) in the presentation of the options:
 
-```markdown
+$$
 x_{\text{mask}} = T(x, Y, \text{mask})
-```
+$$
 
 The final predicted answer \(\hat{y}\) is then the option with the highest score among the remaining options:
 
-```markdown
+$$
 \hat{y} = \arg\max_{i | m_i = 1} \text{score}(x_{\text{mask}}, y_i)
-```
+$$
 
 ### Implementation Considerations
 
@@ -156,7 +156,7 @@ POE consistently outperformed or matched the best-performing baselines across al
 | RS   | 55.1 | 49.0 | -6.1      |
 | IOM  | 56.2 | 50.0 | -6.2      |
 
-**Table 2**: Comparison of MCP and PoE accuracy scores on 8 new tasks. The top 4 tasks are logical reasoning tasks. PoE largely outperforms MCP on 4 logical reasoning tasks, and underperforms MCP on other 4 tasks.
+**Table 1**: Comparison of MCP and PoE accuracy scores on 8 new tasks. The top 4 tasks are logical reasoning tasks. PoE largely outperforms MCP on 4 logical reasoning tasks, and underperforms MCP on other 4 tasks.
 
 ## 6. Conclusion
 

From b9f608a63c8929cfa93c6e2d2cd699a7d7b7d106 Mon Sep 17 00:00:00 2001
From: hellokayas <pidnas94335@gmail.com>
Date: Thu, 17 Oct 2024 01:18:23 -0500
Subject: [PATCH 28/30] paper

---
 paper/paper.bib | 70 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 70 insertions(+)

diff --git a/paper/paper.bib b/paper/paper.bib
index 22607be..11309e5 100644
--- a/paper/paper.bib
+++ b/paper/paper.bib
@@ -43,6 +43,13 @@ @article{freivalds2002learning
   year = {2002}
 }
 
+@article{ma2023poe,
+  title={POE: Process of Elimination for Multiple Choice Reasoning},
+  author={Ma, Chenkai and Du, Xinya},
+  journal={arXiv preprint arXiv:2310.15575},
+  year={2023}
+}
+
 @inproceedings{holtzman2021surface,
   title = {Surface form competition: Why the highest probability answer isn’t always right},
   author = {Holtzman, Ari and West, Peter and Shwartz, Vered and Choi, Yejin and Zettlemoyer, Luke},
@@ -215,3 +222,66 @@ @article{zhao2021calibrate
   year = {2021},
   organization = {PMLR}
 }
+
+@software{BLIP2_Opt,
+  author = {{Salesforce}},
+  title = {BLIP2: Opt Model},
+  version = {2.7b},
+  howpublished = {\url{https://huggingface.co/Salesforce/blip2-opt-2.7b}}
+}
+
+@conj{BLIP2_FLAN,
+  author = {{Salesforce}},
+  title = {BLIP2: FLAN-T5-XL Model},
+  version = {XL},
+  howpublished = {\url{https://huggingface.co/Salesforce/blip2-flan-t5-xl}}
+}
+
+@conj{InstructBLIP,
+  author = {{Salesforce}},
+  title = {InstructBLIP: Vicuna Model},
+  version = {7b},
+  howpublished = {\url{https://huggingface.co/Salesforce/instructblip-vicuna-7b}}
+}
+
+@conj{GIT_Base_VQA,
+  author = {{Microsoft}},
+  title = {GIT: Base VQA Model},
+  version = {Base},
+  howpublished = {\url{https://huggingface.co/microsoft/git-base-vqav2}}
+}
+
+@conj{GIT_Base_TextVQA,
+  author = {{Microsoft}},
+  title = {GIT: Base TextVQA Model},
+  version = {Base},
+  howpublished = {\url{https://huggingface.co/microsoft/git-base-textvqa}}
+}
+
+@conj{PaliGemma_ScienceQA,
+  author = {{Google}},
+  title = {PaliGemma: Fine-Tuned Science QA Model},
+  version = {3b},
+  howpublished = {\url{https://huggingface.co/google/paligemma-3b-ft-science-qa-448}}
+}
+
+@conj{PaliGemma_VQAV2,
+  author = {{Google}},
+  title = {PaliGemma: Fine-Tuned VQAV2 Model},
+  version = {3b},
+  howpublished = {\url{https://huggingface.co/google/paligemma-3b-ft-vqav2-448}}
+}
+
+@conj{PaliGemma_AI2D,
+  author = {{Google}},
+  title = {PaliGemma: Fine-Tuned AI2D Model},
+  version = {3b},
+  howpublished = {\url{https://huggingface.co/google/paligemma-3b-ft-ai2d-448}}
+}
+
+@conj{Idefics2,
+  author = {{Hugging Face}},
+  title = {Idefics2: 8b Model},
+  version = {8b},
+  howpublished = {\url{https://huggingface.co/HuggingFaceM4/idefics2-8b}}
+}

From 8665fd97ddbf780b8db3f61f4c5276d011cc4908 Mon Sep 17 00:00:00 2001
From: Souradip Pal <souradip.iitg@gmail.com>
Date: Sat, 19 Oct 2024 01:18:42 -0500
Subject: [PATCH 29/30] Fixed linting issues and tests.

---
 Makefile                                      |   12 +-
 mm_poe/cli.py                                 |  344 +++--
 mm_poe/methods/language_modeling.py           |  233 ++-
 mm_poe/methods/process_of_elimination.py      |  295 ++--
 mm_poe/methods/process_of_elimination_vqa.py  |  374 +++--
 mm_poe/methods/utils/data.py                  | 1323 +++++++++++------
 mm_poe/methods/utils/methods.py               |  637 ++++++--
 mm_poe/methods/utils/models.py                |    1 -
 mm_poe/methods/utils/utils.py                 |  481 ++++--
 mm_poe/methods/vision_language_modeling.py    |  313 ++--
 .../model_downloaders/model_downloaders.py    |  138 +-
 tests/methods/utils/test_data.py              |  812 ++++++----
 tests/methods/utils/test_methods.py           |  330 ++--
 tests/methods/utils/test_utils.py             |  613 ++++++--
 tests/test_cli.py                             |  646 +++++---
 tests/test_main.py                            |    9 +-
 16 files changed, 4581 insertions(+), 1980 deletions(-)

diff --git a/Makefile b/Makefile
index 2c5fc65..c0541e6 100644
--- a/Makefile
+++ b/Makefile
@@ -32,7 +32,7 @@ fmt:              ## Format code using black & isort.
 
 .PHONY: lint
 lint:             ## Run pep8, black, mypy linters.
-	$(ENV_PREFIX)flake8 mm_poe/
+	$(ENV_PREFIX)flake8 --ignore=E203,W503 mm_poe/
 	$(ENV_PREFIX)black -l 79 --check mm_poe/
 	$(ENV_PREFIX)black -l 79 --check tests/
 	$(ENV_PREFIX)mypy --ignore-missing-imports mm_poe/
@@ -49,19 +49,21 @@ watch:            ## Run tests on every change.
 
 .PHONY: clean
 clean:            ## Clean unused files.
-	@find ./ -name '*.pyc' -exec rm -f {} \;
-	@find ./ -name '__pycache__' -exec rm -rf {} \;
-	@find ./ -name 'Thumbs.db' -exec rm -f {} \;
-	@find ./ -name '*~' -exec rm -f {} \;
 	@rm -rf .cache
 	@rm -rf .pytest_cache
 	@rm -rf .mypy_cache
 	@rm -rf build
 	@rm -rf dist
 	@rm -rf *.egg-info
+	@rm -rf .coverage*
+	@rm -rf coverage.xml
 	@rm -rf htmlcov
 	@rm -rf .tox/
 	@rm -rf docs/_build
+	@find . -name '*.pyc' -exec rm -f {} \;
+	@find . -name '__pycache__' -exec rm -rf {} \;
+	@find . -name 'Thumbs.db' -exec rm -f {} \;
+	@find . -name '*~' -exec rm -f {} \;
 
 .PHONY: virtualenv
 virtualenv:       ## Create a virtual environment.
diff --git a/mm_poe/cli.py b/mm_poe/cli.py
index 681752f..bf1a385 100644
--- a/mm_poe/cli.py
+++ b/mm_poe/cli.py
@@ -7,102 +7,115 @@
 import pathlib
 
 import questionary
-import numpy as np
 import torch
-import torch.nn.functional as F
 from torch.utils.data import DataLoader
 
-from mm_poe.methods.utils.data import(
+from mm_poe.methods.utils.data import (
     create_multiple_choice_prompt,
     preprocess_function_seq2seq_vqa,
     preprocess_function_seq2seq_vqa_channel,
     preprocess_function_causal_vqa,
-    preprocess_function_causal_vqa_channel
+    preprocess_function_causal_vqa_channel,
 )
-from mm_poe.methods.utils.methods import(
+from mm_poe.methods.utils.methods import (
     compute_conditional_score_seq2seq_vqa,
     compute_conditional_score_causal_vqa,
     compute_mask_process_of_elimination,
     inference_process_of_elimination,
     inference_language_modeling,
-    inference_calibration
-)
-from mm_poe.methods.utils.utils import(
-    load_data,
-    load_model,
-    set_seed
+    inference_calibration,
 )
+from mm_poe.methods.utils.utils import load_data, load_model, set_seed
 
 all_checkpoints = {
     "BLIP2": ["Salesforce/blip2-opt-2.7b", "Salesforce/blip2-flan-t5-xl"],
     "InstructBLIP": ["Salesforce/instructblip-vicuna-7b"],
     "GIT": ["microsoft/git-base-vqav2", "microsoft/git-base-textvqa"],
-    "PaliGemma": ["google/paligemma-3b-ft-science-qa-448", "google/paligemma-3b-ft-vqav2-448", "google/paligemma-3b-ft-ai2d-448"],
-    "Idefics2": ["HuggingFaceM4/idefics2-8b"]
+    "PaliGemma": [
+        "google/paligemma-3b-ft-science-qa-448",
+        "google/paligemma-3b-ft-vqav2-448",
+        "google/paligemma-3b-ft-ai2d-448",
+    ],
+    "Idefics2": ["HuggingFaceM4/idefics2-8b"],
 }
 
 logger = logging.getLogger(__name__)
 
+
 def main():
     """
     The main function executes on commands:
     `python -m mm_poe` and `$ mm_poe `.
     """
-     # step 1: collect arguments
+    # step 1: collect arguments
     args = Namespace()
     args.seed = 0
-    
+
     args.model_family = questionary.select(
         message="Select model family?",
-        choices=["BLIP2", "InstructBLIP","GIT","PaliGemma","Idefics2"],
-        default="GIT").ask()
-    
+        choices=["BLIP2", "InstructBLIP", "GIT", "PaliGemma", "Idefics2"],
+        default="GIT",
+    ).ask()
+
     checkpoints_choices = all_checkpoints[args.model_family]
     args.checkpoint = questionary.select(
         message="Select model checkpoint?",
         choices=checkpoints_choices,
-        default=checkpoints_choices[0]).ask()
-    
+        default=checkpoints_choices[0],
+    ).ask()
+
     args.loading_precision = questionary.select(
         message="Select model checkpoint?",
         choices=["FP32", "FP16", "BF16", "INT8", "INT4"],
-        default="FP32").ask()
-    
+        default="FP32",
+    ).ask()
+
     args.output_dir = questionary.path(
-        message='Model output directory?',
+        message="Model output directory?",
         only_directories=True,
-        default="./models/").ask()
+        default="./models/",
+    ).ask()
 
-    args.dataset="single_inference"
-    args.batch_size=1
-    args.sample=1
-    args.n_shot=0
+    args.dataset = "single_inference"
+    args.batch_size = 1
+    args.sample = 1
+    args.n_shot = 0
 
-    args.multiple_choice_prompt=""
-    args.calibration_prompt=" the answer is:"
-    args.process_of_elimination_prompt="Select the most suitable option to answer the question. Ignore [MASK] options."
+    args.multiple_choice_prompt = ""
+    args.calibration_prompt = " the answer is:"
+    args.process_of_elimination_prompt = "Select the most suitable option \
+        to answer the question. Ignore [MASK] options."
 
     args.scoring_method_for_process_of_elimination = questionary.select(
         message="Select scoring method?",
-        choices=["channel","calibration","language_modeling","multiple_choice_prompt"],
-        default="language_modeling").ask()
-    
-    args.mask_strategy_for_process_of_elimination=questionary.select(
+        choices=[
+            "channel",
+            "calibration",
+            "language_modeling",
+            "multiple_choice_prompt",
+        ],
+        default="language_modeling",
+    ).ask()
+
+    args.mask_strategy_for_process_of_elimination = questionary.select(
         message="Select mask strategy?",
-        choices=["below_average","lowest"],
-        default="below_average").ask()
-    
+        choices=["below_average", "lowest"],
+        default="below_average",
+    ).ask()
+
     args.prompting_method_for_process_of_elimination = "multiple_choice_prompt"
     args.mask_token = None
-    
+
     args.question = questionary.text("Question:").ask()
     args.choices = questionary.text("Choices [comma seprated]:").ask()
-    args.choices = args.choices.split(',')
+    args.choices = args.choices.split(",")
     args.num_options = len(args.choices)
-    args.image_path = questionary.path("Image Path?", default="./images/image.png").ask()
+    args.image_path = questionary.path(
+        "Image Path?", default="./images/image.png"
+    ).ask()
     args.label = questionary.select(
-        message="Answer:",
-        choices=[str(x) for x in range(args.num_options)]).ask()
+        message="Answer:", choices=[str(x) for x in range(args.num_options)]
+    ).ask()
     args.label = int(args.label)
     args.method = "process_of_elimination"
 
@@ -121,29 +134,37 @@ def main():
     # step 3: download model
     logger.info(f"Download {args.model_family} model: {args.checkpoint}.")
     model_downloader_path = os.path.join(
-        pathlib.Path(__file__).parent.resolve(), 
-        "models/model_downloaders/model_downloaders.py"
+        pathlib.Path(__file__).parent.resolve(),
+        "models/model_downloaders/model_downloaders.py",
     )
-    subprocess.call(f"python {model_downloader_path} \
+    subprocess.call(
+        f"python {model_downloader_path} \
             --model_family {args.model_family} \
             --checkpoint {args.checkpoint} \
-            --output_dir {args.output_dir}", shell=True)
+            --output_dir {args.output_dir}",
+        shell=True,
+    )
 
-    # step 4: load model, tokenizer. Then move to gpu, and set to evaluation mode.
+    # step 4: load model, tokenizer.
+    # Then move to gpu, and set to evaluation mode.
     logger.info(f"Load {args.model_family} model: {args.checkpoint}.")
     device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
     # get model path: ../models/args.model_family/args.checkpoint
-    model_path = os.path.join(args.output_dir, args.model_family, args.checkpoint)
+    model_path = os.path.join(
+        args.output_dir, args.model_family, args.checkpoint
+    )
     model, tokenizer = load_model(device, model_path, args)
     if args.model_family in ["BLIP2", "InstructBLIP", "PaliGemma", "Idefics2"]:
         compute_func = compute_conditional_score_seq2seq_vqa
         preprocess_func = preprocess_function_seq2seq_vqa
         preprocess_func_channel = preprocess_function_seq2seq_vqa_channel
-        remove_columns = ['header_input_ids', 
-                        'header_attention_mask', 
-                        'ending_input_ids', 
-                        'ending_attention_mask',
-                        'images',]
+        remove_columns = [
+            "header_input_ids",
+            "header_attention_mask",
+            "ending_input_ids",
+            "ending_attention_mask",
+            "images",
+        ]
         processor = tokenizer
         tokenizer = processor.tokenizer
     elif args.model_family in ["GIT"]:
@@ -151,10 +172,10 @@ def main():
         preprocess_func = preprocess_function_causal_vqa
         preprocess_func_channel = preprocess_function_causal_vqa_channel
         remove_columns = [
-            'input_ids',
-            'labels',
-            'images',
-            'ending_attention_mask'
+            "input_ids",
+            "labels",
+            "images",
+            "ending_attention_mask",
         ]
         processor = tokenizer
         tokenizer = processor.tokenizer
@@ -163,104 +184,213 @@ def main():
 
     # step 5: load and preprocess data.
     logger.info(f"Load data: {args.dataset}.")
-    
+
     # evaluate on dataset
     multiple_choice_prompt = args.multiple_choice_prompt
     # multiple_choice_prompt = args.multiple_choice_prompt
     args.multiple_choice_prompt = None
-    ending_names, header_name, image_header_name, raw_dataset, n_shot_dataset = load_data(args) 
-    
+    (
+        ending_names,
+        header_name,
+        image_header_name,
+        raw_dataset,
+        n_shot_dataset,
+    ) = load_data(args)
+
     mcp_args = copy.deepcopy(args)
     mcp_args.multiple_choice_prompt = multiple_choice_prompt
-    _, _, _, raw_mcp_dataset, n_shot_mcp_dataset = load_data(mcp_args) 
-    
+    _, _, _, raw_mcp_dataset, n_shot_mcp_dataset = load_data(mcp_args)
+
     logger.info(f"Preprocess data: {args.dataset}.")
     fn_kwargs = {
-        "ending_names": ending_names, 
-        "header_name": header_name, 
+        "ending_names": ending_names,
+        "header_name": header_name,
         "tokenizer": tokenizer,
         "processor": processor,
-        "image_header_name": image_header_name
+        "image_header_name": image_header_name,
     }
     num_of_options = len(ending_names)
-    tokenized_dataset = raw_dataset.map(preprocess_func, fn_kwargs=fn_kwargs, batched=True, batch_size=args.batch_size)
-    eval_dataloader = DataLoader(tokenized_dataset, batch_size=args.batch_size, shuffle=False)
+    tokenized_dataset = raw_dataset.map(
+        preprocess_func,
+        fn_kwargs=fn_kwargs,
+        batched=True,
+        batch_size=args.batch_size,
+    )
+    eval_dataloader = DataLoader(
+        tokenized_dataset, batch_size=args.batch_size, shuffle=False
+    )
 
     # step 5: (evaluation) inference on data, and compute accuracy.
-    logger.info(f"Start inference (method: {args.method}) on {args.dataset} using {args.model_family} model: {args.checkpoint}.")
+    logger.info(
+        f"Start inference (method: {args.method}) on {args.dataset} \
+            using {args.model_family} model: {args.checkpoint}."
+    )
     scoring_method = args.scoring_method_for_process_of_elimination
     logger.info(f"Step 1: Computing masks. Scoring method: {scoring_method}.")
     if scoring_method == "channel":
-        tokenized_channel_dataset = raw_dataset.map(preprocess_func_channel, fn_kwargs=fn_kwargs, batched=True, batch_size=args.batch_size)
-        eval_channel_dataloader = DataLoader(tokenized_channel_dataset, batch_size=args.batch_size, shuffle=False)
-        avg_log_probs, _, _, lm_predictions = inference_language_modeling(model, eval_channel_dataloader, device, compute_func, tokenizer.pad_token_id)
+        tokenized_channel_dataset = raw_dataset.map(
+            preprocess_func_channel,
+            fn_kwargs=fn_kwargs,
+            batched=True,
+            batch_size=args.batch_size,
+        )
+        eval_channel_dataloader = DataLoader(
+            tokenized_channel_dataset,
+            batch_size=args.batch_size,
+            shuffle=False,
+        )
+        avg_log_probs, _, _, lm_predictions = inference_language_modeling(
+            model,
+            eval_channel_dataloader,
+            device,
+            compute_func,
+            tokenizer.pad_token_id,
+        )
     elif scoring_method == "calibration":
-        fn_kwargs = {"ending_names": ending_names, 
-                    "header_name": "uncond_premise", # the difference is here
-                    "tokenizer": tokenizer,
-                    "processor": processor,
-                    "image_header_name": image_header_name}
-        tokenized_calibration_dataset = raw_dataset.map(preprocess_func, fn_kwargs=fn_kwargs, batched=True, batch_size=args.batch_size)
-        eval_calibration_dataloader = DataLoader(tokenized_calibration_dataset, batch_size=args.batch_size, shuffle=False)    
-        avg_log_probs, _, _, lm_predictions = inference_calibration(model, eval_dataloader, eval_calibration_dataloader,device, compute_func, tokenizer.pad_token_id)
+        fn_kwargs = {
+            "ending_names": ending_names,
+            "header_name": "uncond_premise",  # the difference is here
+            "tokenizer": tokenizer,
+            "processor": processor,
+            "image_header_name": image_header_name,
+        }
+        tokenized_calibration_dataset = raw_dataset.map(
+            preprocess_func,
+            fn_kwargs=fn_kwargs,
+            batched=True,
+            batch_size=args.batch_size,
+        )
+        eval_calibration_dataloader = DataLoader(
+            tokenized_calibration_dataset,
+            batch_size=args.batch_size,
+            shuffle=False,
+        )
+        avg_log_probs, _, _, lm_predictions = inference_calibration(
+            model,
+            eval_dataloader,
+            eval_calibration_dataloader,
+            device,
+            compute_func,
+            tokenizer.pad_token_id,
+        )
     elif scoring_method == "language_modeling":
-        avg_log_probs, _, _, lm_predictions = inference_language_modeling(model, eval_dataloader, device, compute_func, tokenizer.pad_token_id)
+        avg_log_probs, _, _, lm_predictions = inference_language_modeling(
+            model,
+            eval_dataloader,
+            device,
+            compute_func,
+            tokenizer.pad_token_id,
+        )
     elif scoring_method == "multiple_choice_prompt":
         # mcp_args = copy.deepcopy(args)
         # mcp_args.multiple_choice_prompt = multiple_choice_prompt
         # _, _, raw_mcp_dataset, n_shot_mcp_dataset = load_data(mcp_args)
-        # raw_mcp_dataset, n_shot_mcp_dataset = create_n_shot_splits(raw_mcp_dataset, n_shot_mcp_dataset, args)    
-        tokenized_dataset = raw_mcp_dataset.map(preprocess_func, fn_kwargs=fn_kwargs, batched=True, batch_size=args.batch_size)
-        eval_mcp_dataloader = DataLoader(tokenized_dataset, batch_size=args.batch_size, shuffle=False)
-        avg_log_probs, _, _, lm_predictions = inference_language_modeling(model, eval_mcp_dataloader, device, compute_func, tokenizer.pad_token_id)
+        # raw_mcp_dataset, n_shot_mcp_dataset = create_n_shot_splits(
+        #     raw_mcp_dataset,
+        #     n_shot_mcp_dataset,
+        #     args
+        # )
+        tokenized_dataset = raw_mcp_dataset.map(
+            preprocess_func,
+            fn_kwargs=fn_kwargs,
+            batched=True,
+            batch_size=args.batch_size,
+        )
+        eval_mcp_dataloader = DataLoader(
+            tokenized_dataset, batch_size=args.batch_size, shuffle=False
+        )
+        avg_log_probs, _, _, lm_predictions = inference_language_modeling(
+            model,
+            eval_mcp_dataloader,
+            device,
+            compute_func,
+            tokenizer.pad_token_id,
+        )
     else:
-        raise NotImplementedError # unlikely to happen.
-    
+        raise NotImplementedError  # unlikely to happen.
+
     mask_strategy = args.mask_strategy_for_process_of_elimination
     if mask_strategy == "min_k":
         # masking the most k UNLIKELY options
-        min_k = args.min_k 
+        min_k = args.min_k
         if min_k >= num_of_options:
             min_k = num_of_options - 1
-        mask_kwargs = {"min_k": min_k,}
+        mask_kwargs = {
+            "min_k": min_k,
+        }
     else:
         mask_kwargs = {}
-    masks = compute_mask_process_of_elimination(avg_log_probs, mask_strategy, **mask_kwargs)
-    # construct an oracle mask that only keeps the correct lable to 1, and other options to 0
+    masks = compute_mask_process_of_elimination(
+        avg_log_probs, mask_strategy, **mask_kwargs
+    )
+    # construct an oracle mask that only keeps the correct lable to 1,
+    # and other options to 0
     # oracle_masks = torch.zeros_like(avg_log_probs)
-    # oracle_masks[torch.arange(oracle_masks.size(0)), tokenized_dataset["label"]] = 1
+    # oracle_masks[torch.arange(oracle_masks.size(0)), \
+    #              tokenized_dataset["label"]] = 1
     masks = masks.to(torch.float32)
-    # compute mask accuracy, i.e., check whether mask that correspond to labels is 1
-    mask_result = masks[torch.arange(masks.size(0)), tokenized_dataset["label"]]
+    # compute mask accuracy, i.e.,
+    # check whether mask that correspond to labels is 1
+    mask_result = masks[
+        torch.arange(masks.size(0)), tokenized_dataset["label"]
+    ]
     mask_accuracy = torch.sum(mask_result) / mask_result.size(0)
     logger.info(f"Mask accuracy: {mask_accuracy}")
     args.mask_accuracy = mask_accuracy.item()
-    masked_dataset = tokenized_dataset.map(lambda example, idx: {"mask": masks[idx]}, 
-                                with_indices=True, 
-                                batched=True,
-                                remove_columns=remove_columns)
-    
+    masked_dataset = tokenized_dataset.map(
+        lambda example, idx: {"mask": masks[idx]},
+        with_indices=True,
+        batched=True,
+        remove_columns=remove_columns,
+    )
+
     prompting_method = args.prompting_method_for_process_of_elimination
-    logger.info(f"Step 2: Creating multiple choice prompt. Prompting method: {prompting_method}.")
+    logger.info(
+        f"Step 2: Creating multiple choice prompt. \
+            Prompting method: {prompting_method}."
+    )
     # if args.prompting_method_for_process_of_elimination
     # mcp_kwargs = {"multiple_choice_prompt": multiple_choice_prompt,}
     mask_token = args.mask_token
     if mask_token is not None:
         if mask_token == "":
-            args.process_of_elimination_prompt = args.process_of_elimination_prompt.replace("[MASK]", "empty")   
+            args.process_of_elimination_prompt = (
+                args.process_of_elimination_prompt.replace("[MASK]", "empty")
+            )
         else:
-            args.process_of_elimination_prompt = args.process_of_elimination_prompt.replace("[MASK]", mask_token)   
+            args.process_of_elimination_prompt = (
+                args.process_of_elimination_prompt.replace(
+                    "[MASK]", mask_token
+                )
+            )
     mcp_kwargs = {
         "multiple_choice_prompt": args.process_of_elimination_prompt,
         "scoring_method": scoring_method,
         "num_of_options": num_of_options,
         "mask_token": mask_token,
     }
-    mcp_dataset = masked_dataset.map(create_multiple_choice_prompt, fn_kwargs=mcp_kwargs)
+    mcp_dataset = masked_dataset.map(
+        create_multiple_choice_prompt, fn_kwargs=mcp_kwargs
+    )
 
-    logger.info(f"Step 3: Final Inference")
-    mcp_dataset = mcp_dataset.map(preprocess_func, fn_kwargs=fn_kwargs, batched=True, batch_size=args.batch_size)
-    eval_mcp_dataloader = DataLoader(mcp_dataset, batch_size=args.batch_size, shuffle=False)
-    poe_avg_log_probs,  lm_accuracy, _, lm_predictions = inference_process_of_elimination(model, eval_mcp_dataloader, device, compute_func, tokenizer.pad_token_id)
+    logger.info("Step 3: Final Inference")
+    mcp_dataset = mcp_dataset.map(
+        preprocess_func,
+        fn_kwargs=fn_kwargs,
+        batched=True,
+        batch_size=args.batch_size,
+    )
+    eval_mcp_dataloader = DataLoader(
+        mcp_dataset, batch_size=args.batch_size, shuffle=False
+    )
+    poe_avg_log_probs, lm_accuracy, _, lm_predictions = (
+        inference_process_of_elimination(
+            model,
+            eval_mcp_dataloader,
+            device,
+            compute_func,
+            tokenizer.pad_token_id,
+        )
+    )
     option = int(lm_predictions.numpy()[0])
     logger.info(f"Answer: {option}")
diff --git a/mm_poe/methods/language_modeling.py b/mm_poe/methods/language_modeling.py
index 7631fea..59722e4 100644
--- a/mm_poe/methods/language_modeling.py
+++ b/mm_poe/methods/language_modeling.py
@@ -1,30 +1,21 @@
 # a framework for inference on multiple choice tasks.
-import argparse
 import copy
-import csv
 import logging
 import os
-import random
-import sys
-from tqdm import tqdm
 
-import numpy as np
 import torch
-import torch.nn.functional as F
 from torch.utils.data import DataLoader
-from datasets import Dataset
 
-from utils.data import(
+from utils.data import (
     upload_to_huggingface_hub,
     preprocess_function_seq2seq,
     preprocess_function_causal,
     preprocess_function_causal_channel,
     preprocess_function_seq2seq_channel,
     create_synonym_dataset,
-    generate_n_shot_demonstrations,
     create_n_shot_splits,
 )
-from utils.methods import(
+from utils.methods import (
     compute_conditional_score_seq2seq,
     compute_conditional_score_causal,
     inference_language_modeling,
@@ -32,7 +23,7 @@
     inference_generate_synonyms,
     generate_synonyms,
 )
-from utils.utils import(
+from utils.utils import (
     load_data,
     load_model,
     parse_args,
@@ -42,6 +33,7 @@
 
 logger = logging.getLogger(__name__)
 
+
 def main():
     # import pdb; pdb.set_trace()
 
@@ -53,9 +45,9 @@ def main():
         args.method = "contrastive_decoding"
     elif args.calibration_prompt is not None:
         args.method = "calibration"
-    elif args.do_channel == True:
+    elif args.do_channel is True:
         args.method = "channel"
-    elif args.do_synonym == True:
+    elif args.do_synonym is True:
         args.method = "generate_synonyms"
     else:
         args.method = "language_modeling"
@@ -72,11 +64,14 @@ def main():
     logger.info(f"Set random seed to {args.seed}.")
     set_seed(args.seed)
 
-    # step 3: load model, tokenizer. Then move to gpu, and set to evaluation mode.
+    # step 3: load model, tokenizer.
+    # Then move to gpu, and set to evaluation mode.
     logger.info(f"Load {args.model_family} model: {args.checkpoint}.")
     device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
     # get model path: ../models/args.model_family/args.checkpoint
-    model_path = os.path.join("/content/models", args.model_family, args.checkpoint)
+    model_path = os.path.join(
+        "/content/models", args.model_family, args.checkpoint
+    )
     model, tokenizer = load_model(device, model_path, args)
     if args.model_family in ["GPT2", "Pythia", "OPT-IML", "Dolly"]:
         compute_func = compute_conditional_score_causal
@@ -92,73 +87,185 @@ def main():
     # step 4: load and preprocess data.
     args.datasets = args.datasets.split()
     logger.info(f"Load data: {args.datasets}.")
-    
+
     # evaluate on each dataset
     for dataset in args.datasets:
         args.dataset = dataset
-        ending_names, header_name, raw_dataset, n_shot_dataset = load_data(args)
-        raw_dataset, n_shot_dataset, n_shot_demonstrations = create_n_shot_splits(raw_dataset, n_shot_dataset, args)    
+        ending_names, header_name, raw_dataset, n_shot_dataset = load_data(
+            args
+        )
+        raw_dataset, n_shot_dataset, n_shot_demonstrations = (
+            create_n_shot_splits(raw_dataset, n_shot_dataset, args)
+        )
 
         logger.info(f"Preprocess data: {args.dataset}.")
-        fn_kwargs = {"ending_names": ending_names, 
-                    "header_name": header_name, 
-                    "tokenizer": tokenizer,}
+        fn_kwargs = {
+            "ending_names": ending_names,
+            "header_name": header_name,
+            "tokenizer": tokenizer,
+        }
         num_of_options = len(ending_names)
-        tokenized_dataset = raw_dataset.map(preprocess_func, fn_kwargs=fn_kwargs, batched=True, batch_size=args.batch_size)
-        eval_dataloader = DataLoader(tokenized_dataset, batch_size=args.batch_size, shuffle=False)
+        tokenized_dataset = raw_dataset.map(
+            preprocess_func,
+            fn_kwargs=fn_kwargs,
+            batched=True,
+            batch_size=args.batch_size,
+        )
+        eval_dataloader = DataLoader(
+            tokenized_dataset, batch_size=args.batch_size, shuffle=False
+        )
 
         # step 5: (evaluation) inference on data, and compute accuracy.
-        logger.info(f"Start inference (method: {args.method}) on {args.dataset} using {args.model_family} model: {args.checkpoint}.")
+        logger.info(
+            f"Start inference (method: {args.method}) on {args.dataset} \
+                using {args.model_family} model: {args.checkpoint}."
+        )
         if args.method in ["language_modeling", "multiple_choice_prompt"]:
-            _, lm_accuracy, avg_lm_accuracy, _ = inference_language_modeling(model, eval_dataloader, device, compute_func, tokenizer.pad_token_id)
+            _, lm_accuracy, avg_lm_accuracy, _ = inference_language_modeling(
+                model,
+                eval_dataloader,
+                device,
+                compute_func,
+                tokenizer.pad_token_id,
+            )
         elif args.method == "contrastive_decoding":
-            logger.info(f"Load {args.model_family} amateur model: {args.amateur_checkpoint}.")
+            logger.info(
+                f"Load {args.model_family} \
+                    amateur model: {args.amateur_checkpoint}."
+            )
             # get model path: ../models/args.model_family/args.checkpoint
-            amateur_model_path = os.path.join("/content/models", args.model_family, args.amateur_checkpoint)
+            amateur_model_path = os.path.join(
+                "/content/models", args.model_family, args.amateur_checkpoint
+            )
             amateur_model, _ = load_model(device, amateur_model_path, args)
-            # we want to integrate contrastive decoding with other methods, so we need separate output from each model.
-            # compute log probs on each model        
-            exp_avg_log_probs, exp_lm_accuracy, exp_avg_lm_accuracy, _ = inference_language_modeling(model, eval_dataloader, device, compute_func, tokenizer.pad_token_id)
-            ama_avg_log_probs, ama_lm_accuracy, ama_avg_lm_accuracy, _ = inference_language_modeling(amateur_model, eval_dataloader, device, compute_func, tokenizer.pad_token_id)
+            # we want to integrate contrastive decoding with other methods,
+            # so we need separate output from each model.
+            # compute log probs on each model
+            exp_avg_log_probs, exp_lm_accuracy, exp_avg_lm_accuracy, _ = (
+                inference_language_modeling(
+                    model,
+                    eval_dataloader,
+                    device,
+                    compute_func,
+                    tokenizer.pad_token_id,
+                )
+            )
+            ama_avg_log_probs, ama_lm_accuracy, ama_avg_lm_accuracy, _ = (
+                inference_language_modeling(
+                    amateur_model,
+                    eval_dataloader,
+                    device,
+                    compute_func,
+                    tokenizer.pad_token_id,
+                )
+            )
             # calculate difference, and may introduce extra parameters.
             avg_log_probs = exp_avg_log_probs - ama_avg_log_probs
-            labels = raw_dataset['label']
-            # currently, I use average language modeling accuracy. I will add language modeling and other methods shortly.
-            lm_accuracy = (avg_log_probs.argmin(dim=-1) == labels).sum().item() / len(labels)
+            labels = raw_dataset["label"]
+            # currently, I use average language modeling accuracy.
+            # I will add language modeling and other methods shortly.
+            lm_accuracy = (
+                avg_log_probs.argmin(dim=-1) == labels
+            ).sum().item() / len(labels)
             logger.info(f"Contrastive decoding accuracy: {lm_accuracy:.4f}.")
             args.amateur_accuracy = ama_avg_lm_accuracy
             args.expert_accuracy = exp_avg_lm_accuracy
         elif args.method == "calibration":
-            fn_kwargs = {"ending_names": ending_names, 
-                        "header_name": "uncond_premise", # the difference is here
-                        "tokenizer": tokenizer,}
-            tokenized_calibration_dataset = raw_dataset.map(preprocess_func, fn_kwargs=fn_kwargs, batched=True, batch_size=args.batch_size)
-            eval_calibration_dataloader = DataLoader(tokenized_calibration_dataset, batch_size=args.batch_size, shuffle=False)    
-            _, lm_accuracy, avg_lm_accuracy, _ = inference_calibration(model, eval_dataloader, eval_calibration_dataloader,device, compute_func, tokenizer.pad_token_id)
+            fn_kwargs = {
+                "ending_names": ending_names,
+                "header_name": "uncond_premise",  # the difference is here
+                "tokenizer": tokenizer,
+            }
+            tokenized_calibration_dataset = raw_dataset.map(
+                preprocess_func,
+                fn_kwargs=fn_kwargs,
+                batched=True,
+                batch_size=args.batch_size,
+            )
+            eval_calibration_dataloader = DataLoader(
+                tokenized_calibration_dataset,
+                batch_size=args.batch_size,
+                shuffle=False,
+            )
+            _, lm_accuracy, avg_lm_accuracy, _ = inference_calibration(
+                model,
+                eval_dataloader,
+                eval_calibration_dataloader,
+                device,
+                compute_func,
+                tokenizer.pad_token_id,
+            )
         elif args.method == "channel":
-            # simple solution: swap first sentence and second sentence in both preprocessing functions
-            tokenized_channel_dataset = raw_dataset.map(preprocess_func_channel, fn_kwargs=fn_kwargs, batched=True, batch_size=args.batch_size)
-            eval_channel_dataloader = DataLoader(tokenized_channel_dataset, batch_size=args.batch_size, shuffle=False)
-            _, lm_accuracy, avg_lm_accuracy, _ = inference_language_modeling(model, eval_channel_dataloader, device, compute_func, tokenizer.pad_token_id)
+            # simple solution: swap first sentence
+            # and second sentence in both preprocessing functions
+            tokenized_channel_dataset = raw_dataset.map(
+                preprocess_func_channel,
+                fn_kwargs=fn_kwargs,
+                batched=True,
+                batch_size=args.batch_size,
+            )
+            eval_channel_dataloader = DataLoader(
+                tokenized_channel_dataset,
+                batch_size=args.batch_size,
+                shuffle=False,
+            )
+            _, lm_accuracy, avg_lm_accuracy, _ = inference_language_modeling(
+                model,
+                eval_channel_dataloader,
+                device,
+                compute_func,
+                tokenizer.pad_token_id,
+            )
         elif args.method == "generate_synonyms":
             # 3 stpes: generate synonyms, then map datasets, then inference.
             logger.info(f"Generate synonyms for {args.dataset}.")
-            synonyms_dict = generate_synonyms(args, model, tokenizer, tokenized_dataset)
+            synonyms_dict = generate_synonyms(
+                args, model, tokenizer, tokenized_dataset
+            )
             # call map add synonyms to raw_dataset
-            logger.info(f"Add synonyms to raw dataset.")
-            synonym_kwargs = {"args": args,
-                              "synonyms_dict": synonyms_dict,
-                              }
-            synonyms_dataset = raw_dataset.map(create_synonym_dataset, fn_kwargs=synonym_kwargs, batched=True, batch_size=args.batch_size)
+            logger.info("Add synonyms to raw dataset.")
+            synonym_kwargs = {
+                "args": args,
+                "synonyms_dict": synonyms_dict,
+            }
+            synonyms_dataset = raw_dataset.map(
+                create_synonym_dataset,
+                fn_kwargs=synonym_kwargs,
+                batched=True,
+                batch_size=args.batch_size,
+            )
             # map to tokenized_dataset
-            logger.info(f"Tokenize synonym data.")
-            synonyms_ending_names = [col for col in synonyms_dataset.column_names if col.startswith("hypothesis")]
-            fn_kwargs = {"ending_names": synonyms_ending_names, 
-                        "header_name": header_name, 
-                        "tokenizer": tokenizer,}
-            tokenized_synonyms_dataset = synonyms_dataset.map(preprocess_func, fn_kwargs=fn_kwargs, batched=True, batch_size=args.batch_size)
-            eval_synonyms_dataloader = DataLoader(tokenized_synonyms_dataset, batch_size=args.batch_size, shuffle=False)
-            _, lm_accuracy, avg_lm_accuracy, _ = inference_generate_synonyms(model, eval_synonyms_dataloader, device, compute_func, tokenizer.pad_token_id, num_of_options, args.number_of_synonyms)
+            logger.info("Tokenize synonym data.")
+            synonyms_ending_names = [
+                col
+                for col in synonyms_dataset.column_names
+                if col.startswith("hypothesis")
+            ]
+            fn_kwargs = {
+                "ending_names": synonyms_ending_names,
+                "header_name": header_name,
+                "tokenizer": tokenizer,
+            }
+            tokenized_synonyms_dataset = synonyms_dataset.map(
+                preprocess_func,
+                fn_kwargs=fn_kwargs,
+                batched=True,
+                batch_size=args.batch_size,
+            )
+            eval_synonyms_dataloader = DataLoader(
+                tokenized_synonyms_dataset,
+                batch_size=args.batch_size,
+                shuffle=False,
+            )
+            _, lm_accuracy, avg_lm_accuracy, _ = inference_generate_synonyms(
+                model,
+                eval_synonyms_dataloader,
+                device,
+                compute_func,
+                tokenizer.pad_token_id,
+                num_of_options,
+                args.number_of_synonyms,
+            )
         else:
             raise NotImplementedError
 
@@ -172,15 +279,15 @@ def main():
             avg_args = copy.deepcopy(args)
             avg_args.method = "average_language_modeling"
             write_to_csv(save_path, avg_args, avg_lm_accuracy)
-        
+
         # step 7: push data to HuggingFace Hub.
         if args.push_data_to_hub:
             logger.info(f"Push {args.dataset} to HuggingFace Hub.")
             upload_to_huggingface_hub(tokenized_dataset, args)
-        
+
         # step 8: delete tokenized_dataset to save memory.
         # del tokenized_dataset
-            
+
 
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()
diff --git a/mm_poe/methods/process_of_elimination.py b/mm_poe/methods/process_of_elimination.py
index 97db2b5..20d7c23 100644
--- a/mm_poe/methods/process_of_elimination.py
+++ b/mm_poe/methods/process_of_elimination.py
@@ -1,26 +1,13 @@
 # a framework for inference on multiple choice tasks.
-import argparse
 import copy
-import csv
 import logging
 import os
-import random
-import sys
-from tqdm import tqdm
 
-import numpy as np
 import torch
-import torch.nn.functional as F
 from torch.utils.data import DataLoader
-from transformers import(
-    AutoTokenizer, 
-    AutoModelForCausalLM,
-    AutoModelForSeq2SeqLM,
-)
-from datasets import Dataset
 
 
-from utils.data import(
+from utils.data import (
     upload_to_huggingface_hub,
     preprocess_function_seq2seq,
     preprocess_function_causal,
@@ -30,7 +17,7 @@
     generate_n_shot_poe_demonstrations,
     create_n_shot_splits,
 )
-from utils.methods import(
+from utils.methods import (
     compute_conditional_score_seq2seq,
     compute_conditional_score_causal,
     compute_mask_process_of_elimination,
@@ -38,7 +25,7 @@
     inference_language_modeling,
     inference_calibration,
 )
-from utils.utils import(
+from utils.utils import (
     load_data,
     load_model,
     parse_args,
@@ -48,6 +35,7 @@
 
 logger = logging.getLogger(__name__)
 
+
 def main():
     # import pdb; pdb.set_trace()
 
@@ -67,142 +55,278 @@ def main():
     logger.info(f"Set random seed to {args.seed}.")
     set_seed(args.seed)
 
-    # step 3: load model, tokenizer. Then move to gpu, and set to evaluation mode.
+    # step 3: load model, tokenizer.
+    # Then move to gpu, and set to evaluation mode.
     logger.info(f"Load {args.model_family} model: {args.checkpoint}.")
     device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
     # get model path: ../models/args.model_family/args.checkpoint
-    model_path = os.path.join("/content/models", args.model_family, args.checkpoint)
+    model_path = os.path.join(
+        "/content/models", args.model_family, args.checkpoint
+    )
     model, tokenizer = load_model(device, model_path, args)
     if args.model_family in ["GPT2", "Pythia", "OPT-IML", "Dolly"]:
         compute_func = compute_conditional_score_causal
         preprocess_func = preprocess_function_causal
         preprocess_func_channel = preprocess_function_causal_channel
-        remove_columns = ['input_ids',
-                          'labels',
-                          'ending_attention_mask']
+        remove_columns = ["input_ids", "labels", "ending_attention_mask"]
     elif args.model_family in ["T5", "FLAN-T5"]:
         compute_func = compute_conditional_score_seq2seq
         preprocess_func = preprocess_function_seq2seq
         preprocess_func_channel = preprocess_function_seq2seq_channel
-        remove_columns=['header_input_ids', 
-                        'header_attention_mask', 
-                        'ending_input_ids', 
-                        'ending_attention_mask', ]
+        remove_columns = [
+            "header_input_ids",
+            "header_attention_mask",
+            "ending_input_ids",
+            "ending_attention_mask",
+        ]
     else:
         raise NotImplementedError
 
     # step 4: load and preprocess data.
     args.datasets = args.datasets.split()
     logger.info(f"Load data: {args.datasets}.")
-    
+
     # evaluate on each dataset
     multiple_choice_prompt = args.multiple_choice_prompt
     for dataset in args.datasets:
         args.dataset = dataset
         # multiple_choice_prompt = args.multiple_choice_prompt
         args.multiple_choice_prompt = None
-        ending_names, header_name, raw_dataset, n_shot_dataset = load_data(args)
-        raw_dataset, n_shot_dataset, n_shot_demonstrations = create_n_shot_splits(raw_dataset, n_shot_dataset, args)    
-        
+        ending_names, header_name, raw_dataset, n_shot_dataset = load_data(
+            args
+        )
+        raw_dataset, n_shot_dataset, n_shot_demonstrations = (
+            create_n_shot_splits(raw_dataset, n_shot_dataset, args)
+        )
+
         mcp_args = copy.deepcopy(args)
         mcp_args.multiple_choice_prompt = multiple_choice_prompt
         _, _, raw_mcp_dataset, n_shot_mcp_dataset = load_data(mcp_args)
-        raw_mcp_dataset, n_shot_mcp_dataset, _ = create_n_shot_splits(raw_mcp_dataset, n_shot_mcp_dataset, args)    
-        
+        raw_mcp_dataset, n_shot_mcp_dataset, _ = create_n_shot_splits(
+            raw_mcp_dataset, n_shot_mcp_dataset, args
+        )
+
         logger.info(f"Preprocess data: {args.dataset}.")
-        fn_kwargs = {"ending_names": ending_names, 
-                    "header_name": header_name, 
-                    "tokenizer": tokenizer,}
+        fn_kwargs = {
+            "ending_names": ending_names,
+            "header_name": header_name,
+            "tokenizer": tokenizer,
+        }
         num_of_options = len(ending_names)
-        tokenized_dataset = raw_dataset.map(preprocess_func, fn_kwargs=fn_kwargs, batched=True, batch_size=args.batch_size)
-        eval_dataloader = DataLoader(tokenized_dataset, batch_size=args.batch_size, shuffle=False)
+        tokenized_dataset = raw_dataset.map(
+            preprocess_func,
+            fn_kwargs=fn_kwargs,
+            batched=True,
+            batch_size=args.batch_size,
+        )
+        eval_dataloader = DataLoader(
+            tokenized_dataset, batch_size=args.batch_size, shuffle=False
+        )
 
         # step 5: (evaluation) inference on data, and compute accuracy.
-        logger.info(f"Start inference (method: {args.method}) on {args.dataset} using {args.model_family} model: {args.checkpoint}.")
+        logger.info(
+            f"Start inference (method: {args.method}) on {args.dataset} \
+                using {args.model_family} model: {args.checkpoint}."
+        )
         scoring_method = args.scoring_method_for_process_of_elimination
-        logger.info(f"Step 1: Computing masks. Scoring method: {scoring_method}.")
+        logger.info(
+            f"Step 1: Computing masks. Scoring method: {scoring_method}."
+        )
         if scoring_method == "channel":
-            tokenized_channel_dataset = raw_dataset.map(preprocess_func_channel, fn_kwargs=fn_kwargs, batched=True, batch_size=args.batch_size)
-            eval_channel_dataloader = DataLoader(tokenized_channel_dataset, batch_size=args.batch_size, shuffle=False)
-            avg_log_probs, _, _, _ = inference_language_modeling(model, eval_channel_dataloader, device, compute_func, tokenizer.pad_token_id)
+            tokenized_channel_dataset = raw_dataset.map(
+                preprocess_func_channel,
+                fn_kwargs=fn_kwargs,
+                batched=True,
+                batch_size=args.batch_size,
+            )
+            eval_channel_dataloader = DataLoader(
+                tokenized_channel_dataset,
+                batch_size=args.batch_size,
+                shuffle=False,
+            )
+            avg_log_probs, _, _, _ = inference_language_modeling(
+                model,
+                eval_channel_dataloader,
+                device,
+                compute_func,
+                tokenizer.pad_token_id,
+            )
         elif scoring_method == "calibration":
-            fn_kwargs = {"ending_names": ending_names, 
-                        "header_name": "uncond_premise", # the difference is here
-                        "tokenizer": tokenizer,}
-            tokenized_calibration_dataset = raw_dataset.map(preprocess_func, fn_kwargs=fn_kwargs, batched=True, batch_size=args.batch_size)
-            eval_calibration_dataloader = DataLoader(tokenized_calibration_dataset, batch_size=args.batch_size, shuffle=False)    
-            avg_log_probs, _, _, _ = inference_calibration(model, eval_dataloader, eval_calibration_dataloader,device, compute_func, tokenizer.pad_token_id)
+            fn_kwargs = {
+                "ending_names": ending_names,
+                "header_name": "uncond_premise",  # the difference is here
+                "tokenizer": tokenizer,
+            }
+            tokenized_calibration_dataset = raw_dataset.map(
+                preprocess_func,
+                fn_kwargs=fn_kwargs,
+                batched=True,
+                batch_size=args.batch_size,
+            )
+            eval_calibration_dataloader = DataLoader(
+                tokenized_calibration_dataset,
+                batch_size=args.batch_size,
+                shuffle=False,
+            )
+            avg_log_probs, _, _, _ = inference_calibration(
+                model,
+                eval_dataloader,
+                eval_calibration_dataloader,
+                device,
+                compute_func,
+                tokenizer.pad_token_id,
+            )
         elif scoring_method == "language_modeling":
-            avg_log_probs, _, _, _ = inference_language_modeling(model, eval_dataloader, device, compute_func, tokenizer.pad_token_id)
+            avg_log_probs, _, _, _ = inference_language_modeling(
+                model,
+                eval_dataloader,
+                device,
+                compute_func,
+                tokenizer.pad_token_id,
+            )
         elif scoring_method == "multiple_choice_prompt":
             # mcp_args = copy.deepcopy(args)
             # mcp_args.multiple_choice_prompt = multiple_choice_prompt
             # _, _, raw_mcp_dataset, n_shot_mcp_dataset = load_data(mcp_args)
-            # raw_mcp_dataset, n_shot_mcp_dataset = create_n_shot_splits(raw_mcp_dataset, n_shot_mcp_dataset, args)    
-            tokenized_dataset = raw_mcp_dataset.map(preprocess_func, fn_kwargs=fn_kwargs, batched=True, batch_size=args.batch_size)
-            eval_mcp_dataloader = DataLoader(tokenized_dataset, batch_size=args.batch_size, shuffle=False)
-            avg_log_probs, _, _, _ = inference_language_modeling(model, eval_mcp_dataloader, device, compute_func, tokenizer.pad_token_id)
+            # raw_mcp_dataset, n_shot_mcp_dataset = create_n_shot_splits(
+            # raw_mcp_dataset, n_shot_mcp_dataset, args)
+            tokenized_dataset = raw_mcp_dataset.map(
+                preprocess_func,
+                fn_kwargs=fn_kwargs,
+                batched=True,
+                batch_size=args.batch_size,
+            )
+            eval_mcp_dataloader = DataLoader(
+                tokenized_dataset, batch_size=args.batch_size, shuffle=False
+            )
+            avg_log_probs, _, _, _ = inference_language_modeling(
+                model,
+                eval_mcp_dataloader,
+                device,
+                compute_func,
+                tokenizer.pad_token_id,
+            )
         else:
-            raise NotImplementedError # unlikely to happen.
-        
+            raise NotImplementedError  # unlikely to happen.
+
         mask_strategy = args.mask_strategy_for_process_of_elimination
         if mask_strategy == "min_k":
             # masking the most k UNLIKELY options
-            min_k = args.min_k 
+            min_k = args.min_k
             if min_k >= num_of_options:
                 min_k = num_of_options - 1
-            mask_kwargs = {"min_k": min_k,}
+            mask_kwargs = {
+                "min_k": min_k,
+            }
         else:
             mask_kwargs = {}
-        masks = compute_mask_process_of_elimination(avg_log_probs, mask_strategy, **mask_kwargs)
-        # construct an oracle mask that only keeps the correct lable to 1, and other options to 0
+        masks = compute_mask_process_of_elimination(
+            avg_log_probs, mask_strategy, **mask_kwargs
+        )
+        # construct an oracle mask that only keeps the correct lable to 1,
+        # and other options to 0
         # oracle_masks = torch.zeros_like(avg_log_probs)
-        # oracle_masks[torch.arange(oracle_masks.size(0)), tokenized_dataset["label"]] = 1
+        # oracle_masks[torch.arange(oracle_masks.size(0)), \
+        # tokenized_dataset["label"]] = 1
         masks = masks.to(torch.float32)
-        # compute mask accuracy, i.e., check whether mask that correspond to labels is 1
-        mask_result = masks[torch.arange(masks.size(0)), tokenized_dataset["label"]]
+        # compute mask accuracy, i.e.,
+        # check whether mask that correspond to labels is 1
+        mask_result = masks[
+            torch.arange(masks.size(0)), tokenized_dataset["label"]
+        ]
         mask_accuracy = torch.sum(mask_result) / mask_result.size(0)
         logger.info(f"Mask accuracy: {mask_accuracy}")
         args.mask_accuracy = mask_accuracy.item()
-        masked_dataset = tokenized_dataset.map(lambda example, idx: {"mask": masks[idx]}, 
-                                 with_indices=True, 
-                                 batched=True,
-                                 remove_columns=remove_columns)
-        
+        masked_dataset = tokenized_dataset.map(
+            lambda example, idx: {"mask": masks[idx]},
+            with_indices=True,
+            batched=True,
+            remove_columns=remove_columns,
+        )
+
         prompting_method = args.prompting_method_for_process_of_elimination
-        logger.info(f"Step 2: Creating multiple choice prompt. Prompting method: {prompting_method}.")
+        logger.info(
+            f"Step 2: Creating multiple choice prompt. \
+                Prompting method: {prompting_method}."
+        )
         # if args.prompting_method_for_process_of_elimination
         # mcp_kwargs = {"multiple_choice_prompt": multiple_choice_prompt,}
         mask_token = args.mask_token
         if mask_token is not None:
             if mask_token == "":
-                args.process_of_elimination_prompt = args.process_of_elimination_prompt.replace("[MASK]", "empty")   
+                args.process_of_elimination_prompt = (
+                    args.process_of_elimination_prompt.replace(
+                        "[MASK]", "empty"
+                    )
+                )
             else:
-                args.process_of_elimination_prompt = args.process_of_elimination_prompt.replace("[MASK]", mask_token)   
-        mcp_kwargs = {"multiple_choice_prompt": args.process_of_elimination_prompt,
-                      "scoring_method": scoring_method,
-                      "num_of_options": num_of_options,
-                      "mask_token": mask_token,}
-        mcp_dataset = masked_dataset.map(create_multiple_choice_prompt, fn_kwargs=mcp_kwargs)
+                args.process_of_elimination_prompt = (
+                    args.process_of_elimination_prompt.replace(
+                        "[MASK]", mask_token
+                    )
+                )
+        mcp_kwargs = {
+            "multiple_choice_prompt": args.process_of_elimination_prompt,
+            "scoring_method": scoring_method,
+            "num_of_options": num_of_options,
+            "mask_token": mask_token,
+        }
+        mcp_dataset = masked_dataset.map(
+            create_multiple_choice_prompt, fn_kwargs=mcp_kwargs
+        )
         # change n_shot format.
-        if args.n_shot > 0 : 
-            if args.scoring_method_for_process_of_elimination == "multiple_choice_prompt":
-                n_shot_demonstrations, n_shot_poe_demonstrations = generate_n_shot_poe_demonstrations(n_shot_mcp_dataset, num_of_options)
+        if args.n_shot > 0:
+            if (
+                args.scoring_method_for_process_of_elimination
+                == "multiple_choice_prompt"
+            ):
+                n_shot_demonstrations, n_shot_poe_demonstrations = (
+                    generate_n_shot_poe_demonstrations(
+                        n_shot_mcp_dataset, num_of_options
+                    )
+                )
             else:
-                _, n_shot_poe_demonstrations = generate_n_shot_poe_demonstrations(n_shot_mcp_dataset, num_of_options)            
-            mcp_dataset = mcp_dataset.map(lambda x: {"premise": x['premise'].replace(n_shot_demonstrations, n_shot_poe_demonstrations)})
+                _, n_shot_poe_demonstrations = (
+                    generate_n_shot_poe_demonstrations(
+                        n_shot_mcp_dataset, num_of_options
+                    )
+                )
+            mcp_dataset = mcp_dataset.map(
+                lambda x: {
+                    "premise": x["premise"].replace(
+                        n_shot_demonstrations, n_shot_poe_demonstrations
+                    )
+                }
+            )
 
-        logger.info(f"Step 3: Final Inference")
-        mcp_dataset = mcp_dataset.map(preprocess_func, fn_kwargs=fn_kwargs, batched=True, batch_size=args.batch_size)
-        eval_mcp_dataloader = DataLoader(mcp_dataset, batch_size=args.batch_size, shuffle=False)
-        poe_avg_log_probs,  lm_accuracy, _, _ = inference_process_of_elimination(model, eval_mcp_dataloader, device, compute_func, tokenizer.pad_token_id)
+        logger.info("Step 3: Final Inference")
+        mcp_dataset = mcp_dataset.map(
+            preprocess_func,
+            fn_kwargs=fn_kwargs,
+            batched=True,
+            batch_size=args.batch_size,
+        )
+        eval_mcp_dataloader = DataLoader(
+            mcp_dataset, batch_size=args.batch_size, shuffle=False
+        )
+        poe_avg_log_probs, lm_accuracy, _, _ = (
+            inference_process_of_elimination(
+                model,
+                eval_mcp_dataloader,
+                device,
+                compute_func,
+                tokenizer.pad_token_id,
+            )
+        )
 
         # step 6: some postprocessing, including saving and displyaing output.
         save_path = os.path.join("../results", f"{args.method}.csv")
         logger.info(f"Save results to {save_path}.")
         save_args = copy.deepcopy(args)
         if mask_strategy == "min_k":
-            save_args.mask_strategy_for_process_of_elimination = f"min_k_{min_k}"   
+            save_args.mask_strategy_for_process_of_elimination = (
+                f"min_k_{min_k}"
+            )
         write_to_csv(save_path, save_args, lm_accuracy)
 
         # step 7: push data to HuggingFace Hub.
@@ -211,5 +335,6 @@ def main():
             # save the mcp dataset, which will be used by LLM.
             upload_to_huggingface_hub(mcp_dataset, args)
 
+
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()
diff --git a/mm_poe/methods/process_of_elimination_vqa.py b/mm_poe/methods/process_of_elimination_vqa.py
index 7b9e4d9..cc2eb09 100644
--- a/mm_poe/methods/process_of_elimination_vqa.py
+++ b/mm_poe/methods/process_of_elimination_vqa.py
@@ -1,26 +1,13 @@
 # a framework for inference on multiple choice tasks.
-import argparse
 import copy
-import csv
 import logging
 import os
-import random
-import sys
-from tqdm import tqdm
 
-import numpy as np
 import torch
-import torch.nn.functional as F
 from torch.utils.data import DataLoader
-from transformers import(
-    AutoTokenizer, 
-    AutoModelForCausalLM,
-    AutoModelForSeq2SeqLM,
-)
-from datasets import Dataset
 
 
-from utils.data import(
+from utils.data import (
     upload_to_huggingface_hub,
     preprocess_function_seq2seq,
     preprocess_function_causal,
@@ -32,9 +19,9 @@
     preprocess_function_seq2seq_vqa,
     preprocess_function_seq2seq_vqa_channel,
     preprocess_function_causal_vqa,
-    preprocess_function_causal_vqa_channel
+    preprocess_function_causal_vqa_channel,
 )
-from utils.methods import(
+from utils.methods import (
     compute_conditional_score_seq2seq,
     compute_conditional_score_causal,
     compute_conditional_score_seq2seq_vqa,
@@ -42,9 +29,9 @@
     compute_mask_process_of_elimination,
     inference_process_of_elimination,
     inference_language_modeling,
-    inference_calibration
+    inference_calibration,
 )
-from utils.utils import(
+from utils.utils import (
     load_data,
     load_model,
     parse_args,
@@ -54,6 +41,7 @@
 
 logger = logging.getLogger(__name__)
 
+
 def main():
     # import pdb; pdb.set_trace()
 
@@ -73,46 +61,58 @@ def main():
     logger.info(f"Set random seed to {args.seed}.")
     set_seed(args.seed)
 
-    # step 3: load model, tokenizer. Then move to gpu, and set to evaluation mode.
+    # step 3: load model, tokenizer.
+    # Then move to gpu, and set to evaluation mode.
     logger.info(f"Load {args.model_family} model: {args.checkpoint}.")
     device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
     # get model path: ../models/args.model_family/args.checkpoint
-    model_path = os.path.join("/content/models", args.model_family, args.checkpoint)
+    model_path = os.path.join(
+        "/content/models", args.model_family, args.checkpoint
+    )
     model, tokenizer = load_model(device, model_path, args)
     if args.model_family in ["GPT2", "Pythia", "OPT-IML", "Dolly"]:
         compute_func = compute_conditional_score_causal
         preprocess_func = preprocess_function_causal
         preprocess_func_channel = preprocess_function_causal_channel
-        remove_columns = ['input_ids',
-                          'labels',
-                          'ending_attention_mask']
+        remove_columns = ["input_ids", "labels", "ending_attention_mask"]
     elif args.model_family in ["T5", "FLAN-T5"]:
         compute_func = compute_conditional_score_seq2seq
         preprocess_func = preprocess_function_seq2seq
         preprocess_func_channel = preprocess_function_seq2seq_channel
-        remove_columns=['header_input_ids', 
-                        'header_attention_mask', 
-                        'ending_input_ids', 
-                        'ending_attention_mask', ]
-    elif args.model_family in ["BLIP2", "InstructBLIP", "PaliGemma", "Idefics2"]:
+        remove_columns = [
+            "header_input_ids",
+            "header_attention_mask",
+            "ending_input_ids",
+            "ending_attention_mask",
+        ]
+    elif args.model_family in [
+        "BLIP2",
+        "InstructBLIP",
+        "PaliGemma",
+        "Idefics2",
+    ]:
         compute_func = compute_conditional_score_seq2seq_vqa
         preprocess_func = preprocess_function_seq2seq_vqa
         preprocess_func_channel = preprocess_function_seq2seq_vqa_channel
-        remove_columns = ['header_input_ids', 
-                        'header_attention_mask', 
-                        'ending_input_ids', 
-                        'ending_attention_mask',
-                        'images',]
+        remove_columns = [
+            "header_input_ids",
+            "header_attention_mask",
+            "ending_input_ids",
+            "ending_attention_mask",
+            "images",
+        ]
         processor = tokenizer
         tokenizer = processor.tokenizer
     elif args.model_family in ["GIT"]:
         compute_func = compute_conditional_score_causal_vqa
         preprocess_func = preprocess_function_causal_vqa
         preprocess_func_channel = preprocess_function_causal_vqa_channel
-        remove_columns = ['input_ids',
-                          'labels',
-                          'images',
-                          'ending_attention_mask']
+        remove_columns = [
+            "input_ids",
+            "labels",
+            "images",
+            "ending_attention_mask",
+        ]
         processor = tokenizer
         tokenizer = processor.tokenizer
     else:
@@ -121,7 +121,7 @@ def main():
     # step 4: load and preprocess data.
     args.datasets = args.datasets.split()
     logger.info(f"Load data: {args.datasets}.")
-    
+
     # evaluate on each dataset
     multiple_choice_prompt = args.multiple_choice_prompt
     for dataset in args.datasets:
@@ -129,127 +129,282 @@ def main():
         # multiple_choice_prompt = args.multiple_choice_prompt
         args.multiple_choice_prompt = None
         if args.dataset in ["vqa", "scienceqa", "ai2d"]:
-            ending_names, header_name, image_header_name, raw_dataset, n_shot_dataset = load_data(args)
+            (
+                ending_names,
+                header_name,
+                image_header_name,
+                raw_dataset,
+                n_shot_dataset,
+            ) = load_data(args)
         else:
-            ending_names, header_name, raw_dataset, n_shot_dataset = load_data(args)
-        raw_dataset, n_shot_dataset, n_shot_demonstrations = create_n_shot_splits(raw_dataset, n_shot_dataset, args)    
-        
+            ending_names, header_name, raw_dataset, n_shot_dataset = load_data(
+                args
+            )
+        raw_dataset, n_shot_dataset, n_shot_demonstrations = (
+            create_n_shot_splits(raw_dataset, n_shot_dataset, args)
+        )
+
         mcp_args = copy.deepcopy(args)
         mcp_args.multiple_choice_prompt = multiple_choice_prompt
         if args.dataset in ["vqa", "scienceqa", "ai2d"]:
             _, _, _, raw_mcp_dataset, n_shot_mcp_dataset = load_data(mcp_args)
         else:
             _, _, raw_mcp_dataset, n_shot_mcp_dataset = load_data(mcp_args)
-        raw_mcp_dataset, n_shot_mcp_dataset, _ = create_n_shot_splits(raw_mcp_dataset, n_shot_mcp_dataset, args)    
-        
+        raw_mcp_dataset, n_shot_mcp_dataset, _ = create_n_shot_splits(
+            raw_mcp_dataset, n_shot_mcp_dataset, args
+        )
+
         logger.info(f"Preprocess data: {args.dataset}.")
-        if args.model_family in ["BLIP2", "InstructBLIP", "GIT", "PaliGemma", "Idefics2"]:
-            fn_kwargs = {"ending_names": ending_names, 
-                        "header_name": header_name, 
-                        "tokenizer": tokenizer,
-                        "processor": processor,
-                        "image_header_name": image_header_name}
+        if args.model_family in [
+            "BLIP2",
+            "InstructBLIP",
+            "GIT",
+            "PaliGemma",
+            "Idefics2",
+        ]:
+            fn_kwargs = {
+                "ending_names": ending_names,
+                "header_name": header_name,
+                "tokenizer": tokenizer,
+                "processor": processor,
+                "image_header_name": image_header_name,
+            }
         else:
-            fn_kwargs = {"ending_names": ending_names, 
-                    "header_name": header_name, 
-                    "tokenizer": tokenizer,}
+            fn_kwargs = {
+                "ending_names": ending_names,
+                "header_name": header_name,
+                "tokenizer": tokenizer,
+            }
         num_of_options = len(ending_names)
-        tokenized_dataset = raw_dataset.map(preprocess_func, fn_kwargs=fn_kwargs, batched=True, batch_size=args.batch_size)
-        eval_dataloader = DataLoader(tokenized_dataset, batch_size=args.batch_size, shuffle=False)
+        tokenized_dataset = raw_dataset.map(
+            preprocess_func,
+            fn_kwargs=fn_kwargs,
+            batched=True,
+            batch_size=args.batch_size,
+        )
+        eval_dataloader = DataLoader(
+            tokenized_dataset, batch_size=args.batch_size, shuffle=False
+        )
 
         # step 5: (evaluation) inference on data, and compute accuracy.
-        logger.info(f"Start inference (method: {args.method}) on {args.dataset} using {args.model_family} model: {args.checkpoint}.")
+        logger.info(
+            f"Start inference (method: {args.method}) on {args.dataset} \
+                using {args.model_family} model: {args.checkpoint}."
+        )
         scoring_method = args.scoring_method_for_process_of_elimination
-        logger.info(f"Step 1: Computing masks. Scoring method: {scoring_method}.")
+        logger.info(
+            f"Step 1: Computing masks. Scoring method: {scoring_method}."
+        )
         if scoring_method == "channel":
-            tokenized_channel_dataset = raw_dataset.map(preprocess_func_channel, fn_kwargs=fn_kwargs, batched=True, batch_size=args.batch_size)
-            eval_channel_dataloader = DataLoader(tokenized_channel_dataset, batch_size=args.batch_size, shuffle=False)
-            avg_log_probs, _, _, _ = inference_language_modeling(model, eval_channel_dataloader, device, compute_func, tokenizer.pad_token_id)
+            tokenized_channel_dataset = raw_dataset.map(
+                preprocess_func_channel,
+                fn_kwargs=fn_kwargs,
+                batched=True,
+                batch_size=args.batch_size,
+            )
+            eval_channel_dataloader = DataLoader(
+                tokenized_channel_dataset,
+                batch_size=args.batch_size,
+                shuffle=False,
+            )
+            avg_log_probs, _, _, _ = inference_language_modeling(
+                model,
+                eval_channel_dataloader,
+                device,
+                compute_func,
+                tokenizer.pad_token_id,
+            )
         elif scoring_method == "calibration":
-            if args.model_family in ["BLIP2", "InstructBLIP", "GIT", "PaliGemma", "Idefics2"]:
-                fn_kwargs = {"ending_names": ending_names, 
-                            "header_name": "uncond_premise", # the difference is here
-                            "tokenizer": tokenizer,
-                            "processor": processor,
-                            "image_header_name": image_header_name}
+            if args.model_family in [
+                "BLIP2",
+                "InstructBLIP",
+                "GIT",
+                "PaliGemma",
+                "Idefics2",
+            ]:
+                fn_kwargs = {
+                    "ending_names": ending_names,
+                    "header_name": "uncond_premise",  # the difference is here
+                    "tokenizer": tokenizer,
+                    "processor": processor,
+                    "image_header_name": image_header_name,
+                }
             else:
-                fn_kwargs = {"ending_names": ending_names, 
-                            "header_name": "uncond_premise", # the difference is here
-                            "tokenizer": tokenizer,}
-            tokenized_calibration_dataset = raw_dataset.map(preprocess_func, fn_kwargs=fn_kwargs, batched=True, batch_size=args.batch_size)
-            eval_calibration_dataloader = DataLoader(tokenized_calibration_dataset, batch_size=args.batch_size, shuffle=False)    
-            avg_log_probs, _, _ = inference_calibration(model, eval_dataloader, eval_calibration_dataloader,device, compute_func, tokenizer.pad_token_id)
+                fn_kwargs = {
+                    "ending_names": ending_names,
+                    "header_name": "uncond_premise",  # the difference is here
+                    "tokenizer": tokenizer,
+                }
+            tokenized_calibration_dataset = raw_dataset.map(
+                preprocess_func,
+                fn_kwargs=fn_kwargs,
+                batched=True,
+                batch_size=args.batch_size,
+            )
+            eval_calibration_dataloader = DataLoader(
+                tokenized_calibration_dataset,
+                batch_size=args.batch_size,
+                shuffle=False,
+            )
+            avg_log_probs, _, _ = inference_calibration(
+                model,
+                eval_dataloader,
+                eval_calibration_dataloader,
+                device,
+                compute_func,
+                tokenizer.pad_token_id,
+            )
         elif scoring_method == "language_modeling":
-            avg_log_probs, _, _, _ = inference_language_modeling(model, eval_dataloader, device, compute_func, tokenizer.pad_token_id)
+            avg_log_probs, _, _, _ = inference_language_modeling(
+                model,
+                eval_dataloader,
+                device,
+                compute_func,
+                tokenizer.pad_token_id,
+            )
         elif scoring_method == "multiple_choice_prompt":
             # mcp_args = copy.deepcopy(args)
             # mcp_args.multiple_choice_prompt = multiple_choice_prompt
             # _, _, raw_mcp_dataset, n_shot_mcp_dataset = load_data(mcp_args)
-            # raw_mcp_dataset, n_shot_mcp_dataset = create_n_shot_splits(raw_mcp_dataset, n_shot_mcp_dataset, args)    
-            tokenized_dataset = raw_mcp_dataset.map(preprocess_func, fn_kwargs=fn_kwargs, batched=True, batch_size=args.batch_size)
-            eval_mcp_dataloader = DataLoader(tokenized_dataset, batch_size=args.batch_size, shuffle=False)
-            avg_log_probs, _, _, _ = inference_language_modeling(model, eval_mcp_dataloader, device, compute_func, tokenizer.pad_token_id)
+            # raw_mcp_dataset, n_shot_mcp_dataset = create_n_shot_splits(
+            # raw_mcp_dataset, n_shot_mcp_dataset, args)
+            tokenized_dataset = raw_mcp_dataset.map(
+                preprocess_func,
+                fn_kwargs=fn_kwargs,
+                batched=True,
+                batch_size=args.batch_size,
+            )
+            eval_mcp_dataloader = DataLoader(
+                tokenized_dataset, batch_size=args.batch_size, shuffle=False
+            )
+            avg_log_probs, _, _, _ = inference_language_modeling(
+                model,
+                eval_mcp_dataloader,
+                device,
+                compute_func,
+                tokenizer.pad_token_id,
+            )
         else:
-            raise NotImplementedError # unlikely to happen.
-        
+            raise NotImplementedError  # unlikely to happen.
+
         mask_strategy = args.mask_strategy_for_process_of_elimination
         if mask_strategy == "min_k":
             # masking the most k UNLIKELY options
-            min_k = args.min_k 
+            min_k = args.min_k
             if min_k >= num_of_options:
                 min_k = num_of_options - 1
-            mask_kwargs = {"min_k": min_k,}
+            mask_kwargs = {
+                "min_k": min_k,
+            }
         else:
             mask_kwargs = {}
-        masks = compute_mask_process_of_elimination(avg_log_probs, mask_strategy, **mask_kwargs)
-        # construct an oracle mask that only keeps the correct lable to 1, and other options to 0
+        masks = compute_mask_process_of_elimination(
+            avg_log_probs, mask_strategy, **mask_kwargs
+        )
+        # construct an oracle mask that only keeps the correct lable to 1,
+        # and other options to 0
         # oracle_masks = torch.zeros_like(avg_log_probs)
-        # oracle_masks[torch.arange(oracle_masks.size(0)), tokenized_dataset["label"]] = 1
+        # oracle_masks[torch.arange(oracle_masks.size(0)), \
+        # tokenized_dataset["label"]] = 1
         masks = masks.to(torch.float32)
-        # compute mask accuracy, i.e., check whether mask that correspond to labels is 1
-        mask_result = masks[torch.arange(masks.size(0)), tokenized_dataset["label"]]
+        # compute mask accuracy, i.e.,
+        # check whether mask that correspond to labels is 1
+        mask_result = masks[
+            torch.arange(masks.size(0)), tokenized_dataset["label"]
+        ]
         mask_accuracy = torch.sum(mask_result) / mask_result.size(0)
         logger.info(f"Mask accuracy: {mask_accuracy}")
         args.mask_accuracy = mask_accuracy.item()
-        masked_dataset = tokenized_dataset.map(lambda example, idx: {"mask": masks[idx]}, 
-                                 with_indices=True, 
-                                 batched=True,
-                                 remove_columns=remove_columns)
-        
+        masked_dataset = tokenized_dataset.map(
+            lambda example, idx: {"mask": masks[idx]},
+            with_indices=True,
+            batched=True,
+            remove_columns=remove_columns,
+        )
+
         prompting_method = args.prompting_method_for_process_of_elimination
-        logger.info(f"Step 2: Creating multiple choice prompt. Prompting method: {prompting_method}.")
+        logger.info(
+            f"Step 2: Creating multiple choice prompt. \
+                Prompting method: {prompting_method}."
+        )
         # if args.prompting_method_for_process_of_elimination
         # mcp_kwargs = {"multiple_choice_prompt": multiple_choice_prompt,}
         mask_token = args.mask_token
         if mask_token is not None:
             if mask_token == "":
-                args.process_of_elimination_prompt = args.process_of_elimination_prompt.replace("[MASK]", "empty")   
+                args.process_of_elimination_prompt = (
+                    args.process_of_elimination_prompt.replace(
+                        "[MASK]", "empty"
+                    )
+                )
             else:
-                args.process_of_elimination_prompt = args.process_of_elimination_prompt.replace("[MASK]", mask_token)   
-        mcp_kwargs = {"multiple_choice_prompt": args.process_of_elimination_prompt,
-                      "scoring_method": scoring_method,
-                      "num_of_options": num_of_options,
-                      "mask_token": mask_token,}
-        mcp_dataset = masked_dataset.map(create_multiple_choice_prompt, fn_kwargs=mcp_kwargs)
+                args.process_of_elimination_prompt = (
+                    args.process_of_elimination_prompt.replace(
+                        "[MASK]", mask_token
+                    )
+                )
+        mcp_kwargs = {
+            "multiple_choice_prompt": args.process_of_elimination_prompt,
+            "scoring_method": scoring_method,
+            "num_of_options": num_of_options,
+            "mask_token": mask_token,
+        }
+        mcp_dataset = masked_dataset.map(
+            create_multiple_choice_prompt, fn_kwargs=mcp_kwargs
+        )
         # change n_shot format.
-        if args.n_shot > 0 : 
-            if args.scoring_method_for_process_of_elimination == "multiple_choice_prompt":
-                n_shot_demonstrations, n_shot_poe_demonstrations = generate_n_shot_poe_demonstrations(n_shot_mcp_dataset, num_of_options)
+        if args.n_shot > 0:
+            if (
+                args.scoring_method_for_process_of_elimination
+                == "multiple_choice_prompt"
+            ):
+                n_shot_demonstrations, n_shot_poe_demonstrations = (
+                    generate_n_shot_poe_demonstrations(
+                        n_shot_mcp_dataset, num_of_options
+                    )
+                )
             else:
-                _, n_shot_poe_demonstrations = generate_n_shot_poe_demonstrations(n_shot_mcp_dataset, num_of_options)            
-            mcp_dataset = mcp_dataset.map(lambda x: {"premise": x['premise'].replace(n_shot_demonstrations, n_shot_poe_demonstrations)})
+                _, n_shot_poe_demonstrations = (
+                    generate_n_shot_poe_demonstrations(
+                        n_shot_mcp_dataset, num_of_options
+                    )
+                )
+            mcp_dataset = mcp_dataset.map(
+                lambda x: {
+                    "premise": x["premise"].replace(
+                        n_shot_demonstrations, n_shot_poe_demonstrations
+                    )
+                }
+            )
 
-        logger.info(f"Step 3: Final Inference")
-        mcp_dataset = mcp_dataset.map(preprocess_func, fn_kwargs=fn_kwargs, batched=True, batch_size=args.batch_size)
-        eval_mcp_dataloader = DataLoader(mcp_dataset, batch_size=args.batch_size, shuffle=False)
-        poe_avg_log_probs,  lm_accuracy, _, _ = inference_process_of_elimination(model, eval_mcp_dataloader, device, compute_func, tokenizer.pad_token_id)
+        logger.info("Step 3: Final Inference")
+        mcp_dataset = mcp_dataset.map(
+            preprocess_func,
+            fn_kwargs=fn_kwargs,
+            batched=True,
+            batch_size=args.batch_size,
+        )
+        eval_mcp_dataloader = DataLoader(
+            mcp_dataset, batch_size=args.batch_size, shuffle=False
+        )
+        poe_avg_log_probs, lm_accuracy, _, _ = (
+            inference_process_of_elimination(
+                model,
+                eval_mcp_dataloader,
+                device,
+                compute_func,
+                tokenizer.pad_token_id,
+            )
+        )
 
         # step 6: some postprocessing, including saving and displyaing output.
         save_path = os.path.join("../results", f"{args.method}.csv")
         logger.info(f"Save results to {save_path}.")
         save_args = copy.deepcopy(args)
         if mask_strategy == "min_k":
-            save_args.mask_strategy_for_process_of_elimination = f"min_k_{min_k}"   
+            save_args.mask_strategy_for_process_of_elimination = (
+                f"min_k_{min_k}"
+            )
         write_to_csv(save_path, save_args, lm_accuracy)
 
         # step 7: push data to HuggingFace Hub.
@@ -258,5 +413,6 @@ def main():
             # save the mcp dataset, which will be used by LLM.
             upload_to_huggingface_hub(mcp_dataset, args)
 
+
 if __name__ == "__main__":
     main()
diff --git a/mm_poe/methods/utils/data.py b/mm_poe/methods/utils/data.py
index c2e1e29..363cdf1 100644
--- a/mm_poe/methods/utils/data.py
+++ b/mm_poe/methods/utils/data.py
@@ -4,125 +4,227 @@
 import random
 
 import torch
-from huggingface_hub import upload_folder
 from PIL import Image
 
 # write my own data loader, or using HF dataloader?
 # steps for data loader: label, premise, options, hypothesis.
 # uncond_premise = " the answer is:"
 
+
 def upload_to_huggingface_hub(dataset, args):
-    suffix = f"{args.dataset}_{args.seed}_{args.n_shot}_{args.sample}_{args.checkpoint.split('/')[-1]}_{args.batch_size}"
+    suffix = (
+        f"{args.dataset}_{args.seed}_{args.n_shot}_"
+        + f"{args.sample}_{args.checkpoint.split('/')[-1]}_{args.batch_size}"
+    )
     temp_data_path = os.path.join(f"../temp_data/{args.method}", suffix)
     dataset.save_to_disk(temp_data_path)
     # _ = upload_folder(
-    #     folder_path=temp_data_path, 
+    #     folder_path=temp_data_path,
     #     path_in_repo=f"temp_data/{args.method}/{suffix}",
     #     repo_id="Vanmas/PoE_data",
     #     repo_type="dataset",)
     # remove the temp data folder
     # os.system(f"rm -rf {temp_data_path}")
 
+
 def preprocess_function_seq2seq(examples, **kwargs):
-    ending_names, header_name, tokenizer = kwargs['ending_names'], kwargs['header_name'], kwargs['tokenizer']
+    ending_names, header_name, tokenizer = (
+        kwargs["ending_names"],
+        kwargs["header_name"],
+        kwargs["tokenizer"],
+    )
     num_choice = len(ending_names)
     question_headers = examples[header_name]
     # the tokenizer handles multiple spaces.
-    first_sentences = [[context] * len(ending_names) for context in examples[header_name]]
+    first_sentences = [
+        [context] * len(ending_names) for context in examples[header_name]
+    ]
     # second_sentences = [
-    #     [f"{header} {examples[end][i]}" for end in ending_names] for i, header in enumerate(question_header)
+    #     [f"{header} {examples[end][i]}" for end in ending_names] \
+    # for i, header in enumerate(question_header)
     # ]
     second_sentences = [
-        [f"{examples[end][i]}" for end in ending_names] for i, header in enumerate(question_headers)
+        [f"{examples[end][i]}" for end in ending_names]
+        for i, header in enumerate(question_headers)
     ]
 
     first_sentences = sum(first_sentences, [])
     second_sentences = sum(second_sentences, [])
 
-    # tokenized_examples = tokenizer(first_sentences, second_sentences, truncation=True)
-    tokenized_headers = tokenizer(first_sentences, padding=True, truncation=True)
-    tokenized_endings = tokenizer(second_sentences, padding=True, truncation=True)
-    header_dict = {f"header_{k}": [v[i : i + num_choice] for i in range(0, len(v), num_choice)] for k, v in tokenized_headers.items()}
-    ending_dict = {f"ending_{k}": [v[i : i + num_choice] for i in range(0, len(v), num_choice)] for k, v in tokenized_endings.items()}
+    # tokenized_examples = tokenizer(first_sentences, \
+    # second_sentences, truncation=True)
+    tokenized_headers = tokenizer(
+        first_sentences, padding=True, truncation=True
+    )
+    tokenized_endings = tokenizer(
+        second_sentences, padding=True, truncation=True
+    )
+    header_dict = {
+        f"header_{k}": [
+            v[i : i + num_choice] for i in range(0, len(v), num_choice)
+        ]
+        for k, v in tokenized_headers.items()
+    }
+    ending_dict = {
+        f"ending_{k}": [
+            v[i : i + num_choice] for i in range(0, len(v), num_choice)
+        ]
+        for k, v in tokenized_endings.items()
+    }
     return {**header_dict, **ending_dict}
 
+
 def preprocess_function_causal(examples, **kwargs):
-    ending_names, header_name, tokenizer = kwargs['ending_names'], kwargs['header_name'], kwargs['tokenizer']
+    ending_names, header_name, tokenizer = (
+        kwargs["ending_names"],
+        kwargs["header_name"],
+        kwargs["tokenizer"],
+    )
     num_choice = len(ending_names)
     question_headers = examples[header_name]
     # the tokenizer handles multiple spaces.
-    first_sentences = [[context] * len(ending_names) for context in examples[header_name]]
+    first_sentences = [
+        [context] * len(ending_names) for context in examples[header_name]
+    ]
     # second_sentences = [
-    #     [f"{header} {examples[end][i]}" for end in ending_names] for i, header in enumerate(question_header)
+    #     [f"{header} {examples[end][i]}" for end in ending_names] \
+    # for i, header in enumerate(question_header)
     # ]
     second_sentences = [
-        [f"{examples[end][i]}" for end in ending_names] for i, header in enumerate(question_headers)
+        [f"{examples[end][i]}" for end in ending_names]
+        for i, header in enumerate(question_headers)
     ]
 
     first_sentences = sum(first_sentences, [])
     second_sentences = sum(second_sentences, [])
 
-    # tokenized_examples = tokenizer(first_sentences, second_sentences, truncation=True)
+    # tokenized_examples = tokenizer(first_sentences, \
+    # second_sentences, truncation=True)
     tokenized_headers = tokenizer(first_sentences, truncation=True)
     tokenized_endings = tokenizer(second_sentences, truncation=True)
 
-    # reference: https://github.com/peterwestuw/surface-form-competition/blob/main/utils.py#L177
-    max_len = max(len(header + ending) for header, ending in zip(tokenized_headers['input_ids'], tokenized_endings['input_ids']))
-    input_ids = torch.full((len(tokenized_headers['input_ids']), max_len), tokenizer.pad_token_id, dtype=torch.long)
-    labels = tokenizer.pad_token_id * torch.ones((len(tokenized_headers['input_ids']), max_len), dtype=torch.long)
-    ending_attention_mask = torch.zeros((len(tokenized_headers['input_ids']), max_len), dtype=torch.long)
-    for i, (header, ending) in enumerate(zip(tokenized_headers['input_ids'], tokenized_endings['input_ids'])):
-        input_ids[i, :len(header)] = torch.tensor(header)
-        input_ids[i, len(header):len(header)+len(ending)] = torch.tensor(ending)
-        ending_attention_mask[i, len(header):len(header)+len(ending)] = torch.tensor(1)
-        labels[i, len(header):len(header)+len(ending)] = torch.tensor(ending)
-
-    flatten_dict = {"input_ids": input_ids, "labels": labels, "ending_attention_mask": ending_attention_mask}
-    return_dict = {f"{k}": [v[i : i + num_choice] for i in range(0, len(v), num_choice)] for k, v in flatten_dict.items()}
+    max_len = max(
+        len(header + ending)
+        for header, ending in zip(
+            tokenized_headers["input_ids"], tokenized_endings["input_ids"]
+        )
+    )
+    input_ids = torch.full(
+        (len(tokenized_headers["input_ids"]), max_len),
+        tokenizer.pad_token_id,
+        dtype=torch.long,
+    )
+    labels = tokenizer.pad_token_id * torch.ones(
+        (len(tokenized_headers["input_ids"]), max_len), dtype=torch.long
+    )
+    ending_attention_mask = torch.zeros(
+        (len(tokenized_headers["input_ids"]), max_len), dtype=torch.long
+    )
+    for i, (header, ending) in enumerate(
+        zip(tokenized_headers["input_ids"], tokenized_endings["input_ids"])
+    ):
+        input_ids[i, : len(header)] = torch.tensor(header)
+        input_ids[i, len(header) : len(header) + len(ending)] = torch.tensor(
+            ending
+        )
+        ending_attention_mask[i, len(header) : len(header) + len(ending)] = (
+            torch.tensor(1)
+        )
+        labels[i, len(header) : len(header) + len(ending)] = torch.tensor(
+            ending
+        )
+
+    flatten_dict = {
+        "input_ids": input_ids,
+        "labels": labels,
+        "ending_attention_mask": ending_attention_mask,
+    }
+    return_dict = {
+        f"{k}": [v[i : i + num_choice] for i in range(0, len(v), num_choice)]
+        for k, v in flatten_dict.items()
+    }
     return return_dict
 
+
 def preprocess_function_seq2seq_vqa(examples, **kwargs):
-    ending_names, header_name, image_header_name, processor = kwargs['ending_names'], kwargs['header_name'], kwargs['image_header_name'], kwargs['processor']
+    ending_names, header_name, image_header_name, processor = (
+        kwargs["ending_names"],
+        kwargs["header_name"],
+        kwargs["image_header_name"],
+        kwargs["processor"],
+    )
     tokenizer = processor.tokenizer
     image_processor = processor.image_processor
 
     num_choice = len(ending_names)
     question_headers = examples[header_name]
     # the tokenizer handles multiple spaces.
-    first_sentences = [[context] * len(ending_names) for context in examples[header_name]]
+    first_sentences = [
+        [context] * len(ending_names) for context in examples[header_name]
+    ]
     second_sentences = [
-        [f"{examples[end][i]}" for end in ending_names] for i, header in enumerate(question_headers)
+        [f"{examples[end][i]}" for end in ending_names]
+        for i, header in enumerate(question_headers)
     ]
 
     first_sentences = sum(first_sentences, [])
     second_sentences = sum(second_sentences, [])
 
-    # tokenized_examples = tokenizer(first_sentences, second_sentences, truncation=True)
-    tokenized_headers = tokenizer(first_sentences, padding=True, truncation=True)
-    tokenized_endings = tokenizer(second_sentences, padding=True, truncation=True)
+    # tokenized_examples = tokenizer(first_sentences, \
+    # second_sentences, truncation=True)
+    tokenized_headers = tokenizer(
+        first_sentences, padding=True, truncation=True
+    )
+    tokenized_endings = tokenizer(
+        second_sentences, padding=True, truncation=True
+    )
 
     image_paths = examples[image_header_name]
-    images = [Image.open(image_path).convert('RGB') for image_path in image_paths]
+    images = [
+        Image.open(image_path).convert("RGB") for image_path in image_paths
+    ]
     images = [[image] * len(ending_names) for image in images]
     images = sum(images, [])
-    images = image_processor(images, return_tensors='pt').data
+    images = image_processor(images, return_tensors="pt").data
 
     flatten_image_dict = {"images": images["pixel_values"]}
-    header_dict = {f"header_{k}": [v[i : i + num_choice] for i in range(0, len(v), num_choice)] for k, v in tokenized_headers.items()}
-    ending_dict = {f"ending_{k}": [v[i : i + num_choice] for i in range(0, len(v), num_choice)] for k, v in tokenized_endings.items()}
-    image_dict = {f"{k}": [v[i : i + num_choice] for i in range(0, len(v), num_choice)] for k, v in flatten_image_dict.items()}
+    header_dict = {
+        f"header_{k}": [
+            v[i : i + num_choice] for i in range(0, len(v), num_choice)
+        ]
+        for k, v in tokenized_headers.items()
+    }
+    ending_dict = {
+        f"ending_{k}": [
+            v[i : i + num_choice] for i in range(0, len(v), num_choice)
+        ]
+        for k, v in tokenized_endings.items()
+    }
+    image_dict = {
+        f"{k}": [v[i : i + num_choice] for i in range(0, len(v), num_choice)]
+        for k, v in flatten_image_dict.items()
+    }
     return {**header_dict, **ending_dict, **image_dict}
 
+
 def preprocess_function_causal_vqa(examples, **kwargs):
-    ending_names, header_name, image_header_name, processor = kwargs['ending_names'], kwargs['header_name'], kwargs['image_header_name'], kwargs['processor']
+    ending_names, header_name, image_header_name, processor = (
+        kwargs["ending_names"],
+        kwargs["header_name"],
+        kwargs["image_header_name"],
+        kwargs["processor"],
+    )
     tokenizer = processor.tokenizer
     image_processor = processor.image_processor
 
     num_choice = len(ending_names)
     question_headers = examples[header_name]
-    first_sentences = [[context] * len(ending_names) for context in examples[header_name]]
+    first_sentences = [
+        [context] * len(ending_names) for context in examples[header_name]
+    ]
     second_sentences = [
-        [f"{examples[end][i]}" for end in ending_names] for i, header in enumerate(question_headers)
+        [f"{examples[end][i]}" for end in ending_names]
+        for i, header in enumerate(question_headers)
     ]
 
     first_sentences = sum(first_sentences, [])
@@ -132,48 +234,97 @@ def preprocess_function_causal_vqa(examples, **kwargs):
     # print(second_sentences)
 
     tokenized_headers = tokenizer(first_sentences, truncation=True)
-    tokenized_endings = tokenizer(second_sentences, truncation=True)  
+    tokenized_endings = tokenizer(second_sentences, truncation=True)
 
     image_paths = examples[image_header_name]
-    images = [Image.open(image_path).convert('RGB') for image_path in image_paths]
+    images = [
+        Image.open(image_path).convert("RGB") for image_path in image_paths
+    ]
     images = [[image] * len(ending_names) for image in images]
     images = sum(images, [])
-    images = image_processor(images, return_tensors='pt').data
-
-    max_len = max(len(header + ending) for header, ending in zip(tokenized_headers['input_ids'], tokenized_endings['input_ids']))
-    input_ids = torch.full((len(tokenized_headers['input_ids']), max_len), tokenizer.pad_token_id, dtype=torch.long)
-    labels = tokenizer.pad_token_id * torch.ones((len(tokenized_headers['input_ids']), max_len), dtype=torch.long)
-    header_attention_mask = torch.zeros((len(tokenized_headers['input_ids']), max_len), dtype=torch.long)
-    ending_attention_mask = torch.zeros((len(tokenized_headers['input_ids']), max_len), dtype=torch.long)
-    for i, (header, ending) in enumerate(zip(tokenized_headers['input_ids'], tokenized_endings['input_ids'])):
-        if tokenizer.padding_side == 'right':
-            input_ids[i, :len(header)] = torch.tensor(header)
-            input_ids[i, len(header):len(header)+len(ending)] = torch.tensor(ending)
-            header_attention_mask[i, :len(header)+len(ending)] = torch.tensor(1)
-            ending_attention_mask[i, len(header):len(header)+len(ending)] = torch.tensor(1)
-            labels[i, len(header):len(header)+len(ending)] = torch.tensor(ending)
+    images = image_processor(images, return_tensors="pt").data
+
+    max_len = max(
+        len(header + ending)
+        for header, ending in zip(
+            tokenized_headers["input_ids"], tokenized_endings["input_ids"]
+        )
+    )
+    input_ids = torch.full(
+        (len(tokenized_headers["input_ids"]), max_len),
+        tokenizer.pad_token_id,
+        dtype=torch.long,
+    )
+    labels = tokenizer.pad_token_id * torch.ones(
+        (len(tokenized_headers["input_ids"]), max_len), dtype=torch.long
+    )
+    header_attention_mask = torch.zeros(
+        (len(tokenized_headers["input_ids"]), max_len), dtype=torch.long
+    )
+    ending_attention_mask = torch.zeros(
+        (len(tokenized_headers["input_ids"]), max_len), dtype=torch.long
+    )
+    for i, (header, ending) in enumerate(
+        zip(tokenized_headers["input_ids"], tokenized_endings["input_ids"])
+    ):
+        if tokenizer.padding_side == "right":
+            input_ids[i, : len(header)] = torch.tensor(header)
+            input_ids[i, len(header) : len(header) + len(ending)] = (
+                torch.tensor(ending)
+            )
+            header_attention_mask[i, : len(header) + len(ending)] = (
+                torch.tensor(1)
+            )
+            ending_attention_mask[
+                i, len(header) : len(header) + len(ending)
+            ] = torch.tensor(1)
+            labels[i, len(header) : len(header) + len(ending)] = torch.tensor(
+                ending
+            )
         else:
-            input_ids[i, -len(ending):] = torch.tensor(ending)
-            input_ids[i, -len(header)-len(ending):-len(ending)] = torch.tensor(header)
-            header_attention_mask[i, -len(header)-len(ending):] = torch.tensor(1)
-            ending_attention_mask[i, -len(ending):] = torch.tensor(1)
-            labels[i, -len(ending):] = torch.tensor(ending)
-
-    flatten_dict = {"input_ids": input_ids, "labels": labels, "header_attention_mask": header_attention_mask, "ending_attention_mask": ending_attention_mask, "images": images["pixel_values"]}
-    return_dict = {f"{k}": [v[i : i + num_choice] for i in range(0, len(v), num_choice)] for k, v in flatten_dict.items()}
+            input_ids[i, -len(ending) :] = torch.tensor(ending)
+            input_ids[i, -len(header) - len(ending) : -len(ending)] = (
+                torch.tensor(header)
+            )
+            header_attention_mask[i, -len(header) - len(ending) :] = (
+                torch.tensor(1)
+            )
+            ending_attention_mask[i, -len(ending) :] = torch.tensor(1)
+            labels[i, -len(ending) :] = torch.tensor(ending)
+
+    flatten_dict = {
+        "input_ids": input_ids,
+        "labels": labels,
+        "header_attention_mask": header_attention_mask,
+        "ending_attention_mask": ending_attention_mask,
+        "images": images["pixel_values"],
+    }
+    return_dict = {
+        f"{k}": [v[i : i + num_choice] for i in range(0, len(v), num_choice)]
+        for k, v in flatten_dict.items()
+    }
     return return_dict
 
+
 def preprocess_function_seq2seq_channel(examples, **kwargs):
-    ending_names, header_name, tokenizer = kwargs['ending_names'], kwargs['header_name'], kwargs['tokenizer']
+    ending_names, header_name, tokenizer = (
+        kwargs["ending_names"],
+        kwargs["header_name"],
+        kwargs["tokenizer"],
+    )
     num_choice = len(ending_names)
     question_headers = examples[header_name]
     # the tokenizer handles multiple spaces.
-    first_sentences = [[context] * len(ending_names) for context in examples[header_name]]
+    first_sentences = [
+        [context] * len(ending_names) for context in examples[header_name]
+    ]
     # second_sentences = [
-    #     [f"{header} {examples[end][i]}" for end in ending_names] for i, header in enumerate(question_header)
+    #     [f"{header} {examples[end][i]}" for end in ending_names] \
+    # for i, header in enumerate(question_header)
     # ]
     second_sentences = [
-        [f"{examples[end][i]}" for end in ending_names] for i, header in enumerate(question_headers)
+        [f"{examples[end][i]}" for end in ending_names]
+        for i, header in enumerate(question_headers)
     ]
 
     first_sentences = sum(first_sentences, [])
@@ -182,24 +333,48 @@ def preprocess_function_seq2seq_channel(examples, **kwargs):
     # swap first_sentences and second_sentences
     first_sentences, second_sentences = second_sentences, first_sentences
 
-    # tokenized_examples = tokenizer(first_sentences, second_sentences, truncation=True)
-    tokenized_headers = tokenizer(first_sentences, padding=True, truncation=True)
-    tokenized_endings = tokenizer(second_sentences, padding=True, truncation=True)
-    header_dict = {f"header_{k}": [v[i : i + num_choice] for i in range(0, len(v), num_choice)] for k, v in tokenized_headers.items()}
-    ending_dict = {f"ending_{k}": [v[i : i + num_choice] for i in range(0, len(v), num_choice)] for k, v in tokenized_endings.items()}
+    # tokenized_examples = tokenizer(first_sentences, \
+    # second_sentences, truncation=True)
+    tokenized_headers = tokenizer(
+        first_sentences, padding=True, truncation=True
+    )
+    tokenized_endings = tokenizer(
+        second_sentences, padding=True, truncation=True
+    )
+    header_dict = {
+        f"header_{k}": [
+            v[i : i + num_choice] for i in range(0, len(v), num_choice)
+        ]
+        for k, v in tokenized_headers.items()
+    }
+    ending_dict = {
+        f"ending_{k}": [
+            v[i : i + num_choice] for i in range(0, len(v), num_choice)
+        ]
+        for k, v in tokenized_endings.items()
+    }
     return {**header_dict, **ending_dict}
 
+
 def preprocess_function_causal_channel(examples, **kwargs):
-    ending_names, header_name, tokenizer = kwargs['ending_names'], kwargs['header_name'], kwargs['tokenizer']
+    ending_names, header_name, tokenizer = (
+        kwargs["ending_names"],
+        kwargs["header_name"],
+        kwargs["tokenizer"],
+    )
     num_choice = len(ending_names)
     question_headers = examples[header_name]
     # the tokenizer handles multiple spaces.
-    first_sentences = [[context] * len(ending_names) for context in examples[header_name]]
+    first_sentences = [
+        [context] * len(ending_names) for context in examples[header_name]
+    ]
     # second_sentences = [
-    #     [f"{header} {examples[end][i]}" for end in ending_names] for i, header in enumerate(question_header)
+    #     [f"{header} {examples[end][i]}" for end in ending_names] \
+    # for i, header in enumerate(question_header)
     # ]
     second_sentences = [
-        [f"{examples[end][i]}" for end in ending_names] for i, header in enumerate(question_headers)
+        [f"{examples[end][i]}" for end in ending_names]
+        for i, header in enumerate(question_headers)
     ]
 
     first_sentences = sum(first_sentences, [])
@@ -208,36 +383,73 @@ def preprocess_function_causal_channel(examples, **kwargs):
     # swap first_sentences and second_sentences
     first_sentences, second_sentences = second_sentences, first_sentences
 
-    # tokenized_examples = tokenizer(first_sentences, second_sentences, truncation=True)
+    # tokenized_examples = tokenizer(first_sentences, \
+    # second_sentences, truncation=True)
     tokenized_headers = tokenizer(first_sentences, truncation=True)
     tokenized_endings = tokenizer(second_sentences, truncation=True)
 
-    # reference: https://github.com/peterwestuw/surface-form-competition/blob/main/utils.py#L177
-    max_len = max(len(header + ending) for header, ending in zip(tokenized_headers['input_ids'], tokenized_endings['input_ids']))
-    input_ids = torch.full((len(tokenized_headers['input_ids']), max_len), tokenizer.pad_token_id, dtype=torch.long)
-    labels = tokenizer.pad_token_id * torch.ones((len(tokenized_headers['input_ids']), max_len), dtype=torch.long)
-    ending_attention_mask = torch.zeros((len(tokenized_headers['input_ids']), max_len), dtype=torch.long)
-    for i, (header, ending) in enumerate(zip(tokenized_headers['input_ids'], tokenized_endings['input_ids'])):
-        input_ids[i, :len(header)] = torch.tensor(header)
-        input_ids[i, len(header):len(header)+len(ending)] = torch.tensor(ending)
-        ending_attention_mask[i, len(header):len(header)+len(ending)] = torch.tensor(1)
-        labels[i, len(header):len(header)+len(ending)] = torch.tensor(ending)
-
-    flatten_dict = {"input_ids": input_ids, "labels": labels, "ending_attention_mask": ending_attention_mask}
-    return_dict = {f"{k}": [v[i : i + num_choice] for i in range(0, len(v), num_choice)] for k, v in flatten_dict.items()}
+    max_len = max(
+        len(header + ending)
+        for header, ending in zip(
+            tokenized_headers["input_ids"], tokenized_endings["input_ids"]
+        )
+    )
+    input_ids = torch.full(
+        (len(tokenized_headers["input_ids"]), max_len),
+        tokenizer.pad_token_id,
+        dtype=torch.long,
+    )
+    labels = tokenizer.pad_token_id * torch.ones(
+        (len(tokenized_headers["input_ids"]), max_len), dtype=torch.long
+    )
+    ending_attention_mask = torch.zeros(
+        (len(tokenized_headers["input_ids"]), max_len), dtype=torch.long
+    )
+    for i, (header, ending) in enumerate(
+        zip(tokenized_headers["input_ids"], tokenized_endings["input_ids"])
+    ):
+        input_ids[i, : len(header)] = torch.tensor(header)
+        input_ids[i, len(header) : len(header) + len(ending)] = torch.tensor(
+            ending
+        )
+        ending_attention_mask[i, len(header) : len(header) + len(ending)] = (
+            torch.tensor(1)
+        )
+        labels[i, len(header) : len(header) + len(ending)] = torch.tensor(
+            ending
+        )
+
+    flatten_dict = {
+        "input_ids": input_ids,
+        "labels": labels,
+        "ending_attention_mask": ending_attention_mask,
+    }
+    return_dict = {
+        f"{k}": [v[i : i + num_choice] for i in range(0, len(v), num_choice)]
+        for k, v in flatten_dict.items()
+    }
     return return_dict
 
+
 def preprocess_function_seq2seq_vqa_channel(examples, **kwargs):
-    ending_names, header_name, image_header_name, processor = kwargs['ending_names'], kwargs['header_name'], kwargs['image_header_name'], kwargs['processor']
+    ending_names, header_name, image_header_name, processor = (
+        kwargs["ending_names"],
+        kwargs["header_name"],
+        kwargs["image_header_name"],
+        kwargs["processor"],
+    )
     tokenizer = processor.tokenizer
     image_processor = processor.image_processor
 
     num_choice = len(ending_names)
     question_headers = examples[header_name]
     # the tokenizer handles multiple spaces.
-    first_sentences = [[context] * len(ending_names) for context in examples[header_name]]
+    first_sentences = [
+        [context] * len(ending_names) for context in examples[header_name]
+    ]
     second_sentences = [
-        [f"{examples[end][i]}" for end in ending_names] for i, header in enumerate(question_headers)
+        [f"{examples[end][i]}" for end in ending_names]
+        for i, header in enumerate(question_headers)
     ]
 
     first_sentences = sum(first_sentences, [])
@@ -246,106 +458,182 @@ def preprocess_function_seq2seq_vqa_channel(examples, **kwargs):
     # swap first_sentences and second_sentences
     first_sentences, second_sentences = second_sentences, first_sentences
 
-    # tokenized_examples = tokenizer(first_sentences, second_sentences, truncation=True)
-    tokenized_headers = tokenizer(first_sentences, padding=True, truncation=True)
-    tokenized_endings = tokenizer(second_sentences, padding=True, truncation=True)
+    # tokenized_examples = tokenizer(first_sentences, \
+    # second_sentences, truncation=True)
+    tokenized_headers = tokenizer(
+        first_sentences, padding=True, truncation=True
+    )
+    tokenized_endings = tokenizer(
+        second_sentences, padding=True, truncation=True
+    )
 
     image_paths = examples[image_header_name]
-    images = [Image.open(image_path).convert('RGB') for image_path in image_paths]
+    images = [
+        Image.open(image_path).convert("RGB") for image_path in image_paths
+    ]
     images = [[image] * len(ending_names) for image in images]
     images = sum(images, [])
-    images = image_processor(images, return_tensors='pt').data
+    images = image_processor(images, return_tensors="pt").data
 
     flatten_image_dict = {"images": images["pixel_values"]}
-    header_dict = {f"header_{k}": [v[i : i + num_choice] for i in range(0, len(v), num_choice)] for k, v in tokenized_headers.items()}
-    ending_dict = {f"ending_{k}": [v[i : i + num_choice] for i in range(0, len(v), num_choice)] for k, v in tokenized_endings.items()}
-    image_dict = {f"{k}": [v[i : i + num_choice] for i in range(0, len(v), num_choice)] for k, v in flatten_image_dict.items()}
+    header_dict = {
+        f"header_{k}": [
+            v[i : i + num_choice] for i in range(0, len(v), num_choice)
+        ]
+        for k, v in tokenized_headers.items()
+    }
+    ending_dict = {
+        f"ending_{k}": [
+            v[i : i + num_choice] for i in range(0, len(v), num_choice)
+        ]
+        for k, v in tokenized_endings.items()
+    }
+    image_dict = {
+        f"{k}": [v[i : i + num_choice] for i in range(0, len(v), num_choice)]
+        for k, v in flatten_image_dict.items()
+    }
     return {**header_dict, **ending_dict, **image_dict}
 
+
 def preprocess_function_causal_vqa_channel(examples, **kwargs):
-    ending_names, header_name, image_header_name, processor = kwargs['ending_names'], kwargs['header_name'], kwargs['image_header_name'], kwargs['processor']
+    ending_names, header_name, image_header_name, processor = (
+        kwargs["ending_names"],
+        kwargs["header_name"],
+        kwargs["image_header_name"],
+        kwargs["processor"],
+    )
     tokenizer = processor.tokenizer
     image_processor = processor.image_processor
 
-    ending_names = [k for k in examples.keys() if k.startswith('hypothesis')]
+    ending_names = [k for k in examples.keys() if k.startswith("hypothesis")]
     num_choice = len(ending_names)
     question_headers = examples[header_name]
-    first_sentences = [[context] * len(ending_names) for context in examples[header_name]]
+    first_sentences = [
+        [context] * len(ending_names) for context in examples[header_name]
+    ]
     second_sentences = [
-        [f"{examples[end][i]}" for end in ending_names] for i, header in enumerate(question_headers)
+        [f"{examples[end][i]}" for end in ending_names]
+        for i, header in enumerate(question_headers)
     ]
 
     first_sentences = sum(first_sentences, [])
     second_sentences = sum(second_sentences, [])
 
-     # swap first_sentences and second_sentences
+    # swap first_sentences and second_sentences
     first_sentences, second_sentences = second_sentences, first_sentences
 
     tokenized_headers = tokenizer(first_sentences, truncation=True)
-    tokenized_endings = tokenizer(second_sentences, truncation=True)  
+    tokenized_endings = tokenizer(second_sentences, truncation=True)
 
     image_paths = examples[image_header_name]
-    images = [Image.open(image_path).convert('RGB') for image_path in image_paths]
+    images = [
+        Image.open(image_path).convert("RGB") for image_path in image_paths
+    ]
     images = [[image] * len(ending_names) for image in images]
     images = sum(images, [])
-    images = image_processor(images, return_tensors='pt').data
-
-    max_len = max(len(header + ending) for header, ending in zip(tokenized_headers['input_ids'], tokenized_endings['input_ids']))
-    input_ids = torch.full((len(tokenized_headers['input_ids']), max_len), tokenizer.pad_token_id, dtype=torch.long)
-    labels = tokenizer.pad_token_id * torch.ones((len(tokenized_headers['input_ids']), max_len), dtype=torch.long)
-    header_attention_mask = torch.zeros((len(tokenized_headers['input_ids']), max_len), dtype=torch.long)
-    ending_attention_mask = torch.zeros((len(tokenized_headers['input_ids']), max_len), dtype=torch.long)
-    for i, (header, ending) in enumerate(zip(tokenized_headers['input_ids'], tokenized_endings['input_ids'])):
-        if tokenizer.padding_side == 'right':
-            input_ids[i, :len(header)] = torch.tensor(header)
-            input_ids[i, len(header):len(header)+len(ending)] = torch.tensor(ending)
-            header_attention_mask[i, :len(header)+len(ending)] = torch.tensor(1)
-            ending_attention_mask[i, len(header):len(header)+len(ending)] = torch.tensor(1)
-            labels[i, len(header):len(header)+len(ending)] = torch.tensor(ending)
+    images = image_processor(images, return_tensors="pt").data
+
+    max_len = max(
+        len(header + ending)
+        for header, ending in zip(
+            tokenized_headers["input_ids"], tokenized_endings["input_ids"]
+        )
+    )
+    input_ids = torch.full(
+        (len(tokenized_headers["input_ids"]), max_len),
+        tokenizer.pad_token_id,
+        dtype=torch.long,
+    )
+    labels = tokenizer.pad_token_id * torch.ones(
+        (len(tokenized_headers["input_ids"]), max_len), dtype=torch.long
+    )
+    header_attention_mask = torch.zeros(
+        (len(tokenized_headers["input_ids"]), max_len), dtype=torch.long
+    )
+    ending_attention_mask = torch.zeros(
+        (len(tokenized_headers["input_ids"]), max_len), dtype=torch.long
+    )
+    for i, (header, ending) in enumerate(
+        zip(tokenized_headers["input_ids"], tokenized_endings["input_ids"])
+    ):
+        if tokenizer.padding_side == "right":
+            input_ids[i, : len(header)] = torch.tensor(header)
+            input_ids[i, len(header) : len(header) + len(ending)] = (
+                torch.tensor(ending)
+            )
+            header_attention_mask[i, : len(header) + len(ending)] = (
+                torch.tensor(1)
+            )
+            ending_attention_mask[
+                i, len(header) : len(header) + len(ending)
+            ] = torch.tensor(1)
+            labels[i, len(header) : len(header) + len(ending)] = torch.tensor(
+                ending
+            )
         else:
-            input_ids[i, -len(ending):] = torch.tensor(ending)
-            input_ids[i, -len(header)-len(ending):-len(ending)] = torch.tensor(header)
-            header_attention_mask[i, -len(header)-len(ending):] = torch.tensor(1)
-            ending_attention_mask[i, -len(ending):] = torch.tensor(1)
-            labels[i, -len(ending):] = torch.tensor(ending)
-
-    flatten_dict = {"input_ids": input_ids, "labels": labels, "header_attention_mask": header_attention_mask, "ending_attention_mask": ending_attention_mask, "images": images["pixel_values"]}
-    return_dict = {f"{k}": [v[i : i + num_choice] for i in range(0, len(v), num_choice)] for k, v in flatten_dict.items()}
+            input_ids[i, -len(ending) :] = torch.tensor(ending)
+            input_ids[i, -len(header) - len(ending) : -len(ending)] = (
+                torch.tensor(header)
+            )
+            header_attention_mask[i, -len(header) - len(ending) :] = (
+                torch.tensor(1)
+            )
+            ending_attention_mask[i, -len(ending) :] = torch.tensor(1)
+            labels[i, -len(ending) :] = torch.tensor(ending)
+
+    flatten_dict = {
+        "input_ids": input_ids,
+        "labels": labels,
+        "header_attention_mask": header_attention_mask,
+        "ending_attention_mask": ending_attention_mask,
+        "images": images["pixel_values"],
+    }
+    return_dict = {
+        f"{k}": [v[i : i + num_choice] for i in range(0, len(v), num_choice)]
+        for k, v in flatten_dict.items()
+    }
     return return_dict
 
+
 def create_multiple_choice_prompt(example, **kwargs):
     alphabets = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
     multiple_choice_prompt = kwargs["multiple_choice_prompt"]
     scoring_method = kwargs["scoring_method"]
     num_of_options = kwargs["num_of_options"]
-    mask = example['mask']
+    mask = example["mask"]
     # null_string = f"[MASK]"
-    if kwargs['mask_token'] is not None:
-        null_string = kwargs['mask_token']
+    if kwargs["mask_token"] is not None:
+        null_string = kwargs["mask_token"]
     else:
-        null_string = f"[MASK]"
+        null_string = "[MASK]"
     mcp_example = {}
-    # example['premise'] = premise = f"{multiple_choice_prompt} {premise}\nA. {options[0]}\nB. {options[1]}\nC. {options[2]}\nD. {options[3]}\nE. {options[4]}\nAnswer:"
+    # example['premise'] = premise = f"{multiple_choice_prompt} \
+    # {premise}\nA. {options[0]}\nB. {options[1]}\nC. \
+    # {options[2]}\nD. {options[3]}\nE. {options[4]}\nAnswer:"
     # premise = f"{multiple_choice_prompt} Question: {example['premise']}\n"
-    
+
     if scoring_method != "multiple_choice_prompt":
-        premise = f"{multiple_choice_prompt}\n Question: {example['premise']}\n"
+        premise = (
+            f"{multiple_choice_prompt}\n Question: {example['premise']}\n"
+        )
         premise = premise.replace(f"{example['uncond_premise']}", "")
         for idx, single_mask in enumerate(mask):
-            mcp_example[f'hypothesis{idx}'] = alphabets[idx]
+            mcp_example[f"hypothesis{idx}"] = alphabets[idx]
             if single_mask == 1:
                 premise += f"{alphabets[idx]}. {example[f'hypothesis{idx}']}\n"
             else:
                 # consider other null strings.
                 premise += f"{alphabets[idx]}. {null_string}\n"
         premise += "Answer:"
-    else: # for multiple choice prompt, options are already presented in the premise.
+    else:
+        # for multiple choice prompt,
+        # options are already presented in the premise.
         premise = f"{multiple_choice_prompt}\n{example['premise']}"
         premise = premise.replace(f"{example['uncond_premise']}", "")
         for idx, single_mask in enumerate(mask):
             option_start_index = premise.rfind(f"{alphabets[idx]}. ")
             if idx == num_of_options - 1:
-                option_end_index = premise.rfind(f"Answer:")
+                option_end_index = premise.rfind("Answer:")
             else:
                 option_end_index = premise.rfind(f"{alphabets[idx + 1]}. ")
             option = premise[option_start_index:option_end_index]
@@ -353,68 +641,82 @@ def create_multiple_choice_prompt(example, **kwargs):
                 pass
             else:
                 # consider other null strings.
-                premise = premise.replace(option, f"{alphabets[idx]}. {null_string}\n")
-    mcp_example['premise'] = premise
+                premise = premise.replace(
+                    option, f"{alphabets[idx]}. {null_string}\n"
+                )
+    mcp_example["premise"] = premise
     return mcp_example
 
+
 def create_synonym_dataset(examples, **kwargs):
     # for hypothesis0, create synonyms00, synonyms01, etc.
-    args, synonyms_dict = kwargs['args'], kwargs["synonyms_dict"]
+    args, synonyms_dict = kwargs["args"], kwargs["synonyms_dict"]
     number_of_synonyms = args.number_of_synonyms
     # get the hypothesis columns
-    hypothesis_columns = [col for col in examples.keys() if "hypothesis" in col]
+    hypothesis_columns = [
+        col for col in examples.keys() if "hypothesis" in col
+    ]
     for hypothesis_column in hypothesis_columns:
         for i in range(number_of_synonyms):
-            examples[f"{hypothesis_column}_synonyms_{i}"] = [synonyms_dict[hypothesis][i] for hypothesis in examples[hypothesis_column]]
+            examples[f"{hypothesis_column}_synonyms_{i}"] = [
+                synonyms_dict[hypothesis][i]
+                for hypothesis in examples[hypothesis_column]
+            ]
     return examples
 
+
 def copa_loader(path, args):
-    
+
     root = ET.parse(path).getroot()
     examples_copa = []
-    for type_tag in root.findall('item'):
+    for type_tag in root.findall("item"):
         # xml stuff
-        value = type_tag.get('most-plausible-alternative')
-        asks_for = type_tag.get('asks-for')
+        value = type_tag.get("most-plausible-alternative")
+        asks_for = type_tag.get("asks-for")
         children = list(type_tag)
         # get the texts
         p = children[0].text
-        a1 = children[1].text[:1].lower() +  children[1].text[1:]
-        a2 = children[2].text[:1].lower() +  children[2].text[1:]
-        if asks_for =='effect':
-            bridge = ' so'
-        elif asks_for =='cause':
-            bridge = ' because'
-        else: 
-            assert(False)
-        # examples_copa  += [{'options': [{'premise': ' ' + p[:-1] + bridge,
-        #                                  'hypothesis': ' ' + a1,
-        #                                  'uncond_premise': bridge,
-        #                                  'uncond_hypothesis': ' ' + a1},
-        #                                {'premise': ' ' + p[:-1] + bridge,
-        #                                  'hypothesis': ' ' + a2,
-        #                                  'uncond_premise': bridge,
-        #                                  'uncond_hypothesis': ' ' + a2}], 
+        a1 = children[1].text[:1].lower() + children[1].text[1:]
+        a2 = children[2].text[:1].lower() + children[2].text[1:]
+        if asks_for == "effect":
+            bridge = " so"
+        elif asks_for == "cause":
+            bridge = " because"
+        else:
+            assert False
+        # examples_copa  += [{'options':
+        #   [{'premise': ' ' + p[:-1] + bridge,
+        #     'hypothesis': ' ' + a1,
+        #     'uncond_premise': bridge,
+        #     'uncond_hypothesis': ' ' + a1},
+        # {'premise': ' ' + p[:-1] + bridge,
+        #     'hypothesis': ' ' + a2,
+        #     'uncond_premise': bridge,
+        #     'uncond_hypothesis': ' ' + a2}],
         #           'label':int(value)-1}]
-        premise = ' ' + p[:-1] + bridge
-        if getattr(args, 'multiple_choice_prompt', None) is not None:
-                # Question: The pond froze over for the winter so
-                # A. People skated on the pond.
-                # B. People brought boats to the pond.
-                # Answer:
-                hypotheses = ["A", "B"]
-                premise = f"{args.multiple_choice_prompt} Question: {premise}\nA. {a1}\nB. {a2}\nAnswer:"
+        premise = " " + p[:-1] + bridge
+        if getattr(args, "multiple_choice_prompt", None) is not None:
+            # Question: The pond froze over for the winter so
+            # A. People skated on the pond.
+            # B. People brought boats to the pond.
+            # Answer:
+            hypotheses = ["A", "B"]
+            premise = f"{args.multiple_choice_prompt} \
+                Question: {premise}\nA. {a1}\nB. {a2}\nAnswer:"
         else:
-            hypotheses = [' ' + a1, ' ' + a2]
-        examples_copa += [{
-            'label': int(value)-1,
-            'premise': premise,
-            'uncond_premise': bridge,
-            'hypothesis0': hypotheses[0],
-            'hypothesis1': hypotheses[1],
-        }]
+            hypotheses = [" " + a1, " " + a2]
+        examples_copa += [
+            {
+                "label": int(value) - 1,
+                "premise": premise,
+                "uncond_premise": bridge,
+                "hypothesis0": hypotheses[0],
+                "hypothesis1": hypotheses[1],
+            }
+        ]
     return examples_copa
 
+
 def cqa_loader(path, args):
     examples_cqa = []
     if args.calibration_prompt is not None:
@@ -424,21 +726,28 @@ def cqa_loader(path, args):
     with open(path) as f:
         for line in f:
             d = json.loads(line)
-            label = ['A','B','C','D','E'].index(d['answerKey'])
-            premise = ' ' + d['question']['stem']
-            premise = premise[:-1] + '?'  
-            
+            label = ["A", "B", "C", "D", "E"].index(d["answerKey"])
+            premise = " " + d["question"]["stem"]
+            premise = premise[:-1] + "?"
+
             # to ensure identical perforamnce to the PMI paper.
-            # options = [option['text'].lower() for option in d['question']['choices']]
-            options = [f" \"{option['text'].lower()}\"" for option in d['question']['choices']]
-            
-            # examples += [{'options': [{'premise':premise + '? the answer is:' ,
-                        #                 'hypothesis': ' "{}"'.format(c['text'].lower()),
-                        #                 'uncond_premise': ' the answer is:',
-                        #                 'uncond_hypothesis': ' "{}"'.format(c['text'].lower())} for c in d['question']['choices']], 
-                        # 'label':label}]
+            # options = [option['text'].lower() for option in \
+            # d['question']['choices']]
+            options = [
+                f" \"{option['text'].lower()}\""
+                for option in d["question"]["choices"]
+            ]
+
+            # examples += [{'options':
+            # [{'premise':premise + '? the answer is:' ,
+            #                 'hypothesis': ' "{}"'.format(c['text'].lower()),
+            #                 'uncond_premise': ' the answer is:',
+            #                 'uncond_hypothesis': \
+            #   ' "{}"'.format(c['text'].lower())} \
+            #   for c in d['question']['choices']],
+            # 'label':label}]
             # if args.multiple_choice_prompt is not None:
-            if getattr(args, 'multiple_choice_prompt', None) is not None:
+            if getattr(args, "multiple_choice_prompt", None) is not None:
                 hypotheses = ["A", "B", "C", "D", "E"]
                 # Question: How does a bishop move from one place to another?
                 # A. chess game
@@ -447,39 +756,51 @@ def cqa_loader(path, args):
                 # D. queen
                 # E. cathedral
                 # Answer:
-                premise = f"{args.multiple_choice_prompt} Question: {premise}\nA. {options[0]}\nB. {options[1]}\nC. {options[2]}\nD. {options[3]}\nE. {options[4]}\nAnswer:"
+                premise = (
+                    f"{args.multiple_choice_prompt} "
+                    + f"Question: {premise}\nA. {options[0]}\nB. "
+                    + f"{options[1]}\nC. {options[2]}\nD. {options[3]}\nE. "
+                    + f"{options[4]}\nAnswer:"
+                )
             else:
                 hypotheses = options
                 premise = premise + uncond_premise
-            examples_cqa += [{
-                'label': label,
-                'premise': premise,
-                'uncond_premise': uncond_premise,
-                'hypothesis0': hypotheses[0],
-                'hypothesis1': hypotheses[1],
-                'hypothesis2': hypotheses[2],
-                'hypothesis3': hypotheses[3],
-                'hypothesis4': hypotheses[4],
-            }]
+            examples_cqa += [
+                {
+                    "label": label,
+                    "premise": premise,
+                    "uncond_premise": uncond_premise,
+                    "hypothesis0": hypotheses[0],
+                    "hypothesis1": hypotheses[1],
+                    "hypothesis2": hypotheses[2],
+                    "hypothesis3": hypotheses[3],
+                    "hypothesis4": hypotheses[4],
+                }
+            ]
     return examples_cqa
 
+
 def obqa_loader(path, args):
     if args.calibration_prompt is not None:
         uncond_premise = args.calibration_prompt
     else:
         uncond_premise = " the answer is:"
     with open(path) as lines:
-        abc2idx = { 'A' : 0, 'B' : 1, 'C' : 2, 'D' : 3 }
+        abc2idx = {"A": 0, "B": 1, "C": 2, "D": 3}
 
         examples_obqa = []
         for line in lines:
             j = json.loads(line)
 
-            label = abc2idx[j['answerKey']]
-            premise = j['question']['stem']
-            options_text = [f" {option['text']}" for option in j['question']['choices']]
-            options_sym = [option['label'] for option in j['question']['choices']]
-            if getattr(args, 'multiple_choice_prompt', None) is not None:
+            label = abc2idx[j["answerKey"]]
+            premise = j["question"]["stem"]
+            options_text = [
+                f" {option['text']}" for option in j["question"]["choices"]
+            ]
+            options_sym = [
+                option["label"] for option in j["question"]["choices"]
+            ]
+            if getattr(args, "multiple_choice_prompt", None) is not None:
                 # Question: Greenhouses are great for plants like
                 # A. Pizza
                 # B. Lollipops
@@ -487,22 +808,30 @@ def obqa_loader(path, args):
                 # D. French beans
                 # Answer:
                 hypotheses = options_sym
-                premise = f"{args.multiple_choice_prompt} Question: {premise}\nA. {options_text[0]}\nB. {options_text[1]}\nC. {options_text[2]}\nD. {options_text[3]}\nAnswer:"
+                premise = (
+                    f"{args.multiple_choice_prompt} "
+                    + f"Question: {premise}\nA. {options_text[0]}\nB. "
+                    + f"{options_text[1]}\nC. {options_text[2]}\nD. "
+                    + f"{options_text[3]}\nAnswer:"
+                )
             else:
                 hypotheses = options_text
                 # premise = premise + uncond_premise
-            
-            examples_obqa += [{
-                'label': label,
-                'premise': premise,
-                'uncond_premise': uncond_premise,
-                'hypothesis0': hypotheses[0],
-                'hypothesis1': hypotheses[1],
-                'hypothesis2': hypotheses[2],
-                'hypothesis3': hypotheses[3],
-            }]
+
+            examples_obqa += [
+                {
+                    "label": label,
+                    "premise": premise,
+                    "uncond_premise": uncond_premise,
+                    "hypothesis0": hypotheses[0],
+                    "hypothesis1": hypotheses[1],
+                    "hypothesis2": hypotheses[2],
+                    "hypothesis3": hypotheses[3],
+                }
+            ]
     return examples_obqa
 
+
 def piqa_loader(path, args):
     if args.calibration_prompt is not None:
         uncond_premise = args.calibration_prompt
@@ -515,30 +844,37 @@ def piqa_loader(path, args):
         for line, label_sym in zip(lines, labels):
             label = int(label_sym[0])
             line = json.loads(line)
-            premise = line['goal']
-            options_text = [line['sol1'], line['sol2']]
-            options_sym = ['A', 'B']
+            premise = line["goal"]
+            options_text = [line["sol1"], line["sol2"]]
+            options_sym = ["A", "B"]
 
-            if getattr(args, 'multiple_choice_prompt', None) is not None:
+            if getattr(args, "multiple_choice_prompt", None) is not None:
                 # Question: To clear snot out of your nose,
                 # A. place a tissue over your nose and blow the snot out.
                 # B. place a tissue over your nose and suck the snot in.
                 # Answer:
                 hypotheses = options_sym
-                premise = f"{args.multiple_choice_prompt} Question: {premise}\nA. {options_text[0]}\nB. {options_text[1]}\nAnswer:"
+                premise = (
+                    f"{args.multiple_choice_prompt} "
+                    + f"Question: {premise}\nA. {options_text[0]}\nB. "
+                    + f"{options_text[1]}\nAnswer:"
+                )
             else:
                 hypotheses = options_text
                 premise = premise + uncond_premise
-            
-            examples_piqa += [{
-                'label': label,
-                'premise': premise,
-                'uncond_premise': uncond_premise,
-                'hypothesis0': hypotheses[0],
-                'hypothesis1': hypotheses[1],
-            }]
+
+            examples_piqa += [
+                {
+                    "label": label,
+                    "premise": premise,
+                    "uncond_premise": uncond_premise,
+                    "hypothesis0": hypotheses[0],
+                    "hypothesis1": hypotheses[1],
+                }
+            ]
     return examples_piqa
 
+
 def qasc_loader(path, args):
     if args.calibration_prompt is not None:
         uncond_premise = args.calibration_prompt
@@ -549,14 +885,21 @@ def qasc_loader(path, args):
     with open(path) as lines:
         for line in lines:
             line = json.loads(line)
-            label = ['A','B','C','D','E','F','G','H'].index(line['answerKey'])
+            label = ["A", "B", "C", "D", "E", "F", "G", "H"].index(
+                line["answerKey"]
+            )
             premise = f"{line['question']['stem']}"
 
-            options_text = [option['text'] for option in line['question']['choices']]
-            options_sym = [option['label'] for option in line['question']['choices']]
+            options_text = [
+                option["text"] for option in line["question"]["choices"]
+            ]
+            options_sym = [
+                option["label"] for option in line["question"]["choices"]
+            ]
 
-            if getattr(args, 'multiple_choice_prompt', None) is not None:
-                # Question: Cameron returned home with a bag of candy to eat all night
+            if getattr(args, "multiple_choice_prompt", None) is not None:
+                # Question: Cameron returned home with a bag of
+                # candy to eat all night
                 # long. What will Others want to do next?
                 # A. great
                 # B. buy the candy to eat
@@ -564,26 +907,36 @@ def qasc_loader(path, args):
                 # E. ...
                 # Answer:
                 hypotheses = options_sym
-                premise = f"{args.multiple_choice_prompt} Question: {premise}\nA. {options_text[0]}\nB. {options_text[1]}\nC. {options_text[2]}\nD. {options_text[3]}\nE. {options_text[4]}\nF. {options_text[5]}\nG. {options_text[6]}\nH. {options_text[7]}\nAnswer:"
+                premise = (
+                    f"{args.multiple_choice_prompt} "
+                    + f"Question: {premise}\nA. {options_text[0]}\nB. "
+                    + f"{options_text[1]}\nC. {options_text[2]}\nD. "
+                    + f"{options_text[3]}\nE. {options_text[4]}\nF. "
+                    + f"{options_text[5]}\nG. {options_text[6]}\nH. "
+                    + f"{options_text[7]}\nAnswer:"
+                )
             else:
                 hypotheses = options_text
                 premise = premise + uncond_premise
-            
-            examples_qasc += [{
-                'label': label,
-                'premise': premise,
-                'uncond_premise': uncond_premise,
-                'hypothesis0': hypotheses[0],
-                'hypothesis1': hypotheses[1],
-                'hypothesis2': hypotheses[2],
-                'hypothesis3': hypotheses[3],
-                'hypothesis4': hypotheses[4],
-                'hypothesis5': hypotheses[5],
-                'hypothesis6': hypotheses[6],
-                'hypothesis7': hypotheses[7],
-            }]
+
+            examples_qasc += [
+                {
+                    "label": label,
+                    "premise": premise,
+                    "uncond_premise": uncond_premise,
+                    "hypothesis0": hypotheses[0],
+                    "hypothesis1": hypotheses[1],
+                    "hypothesis2": hypotheses[2],
+                    "hypothesis3": hypotheses[3],
+                    "hypothesis4": hypotheses[4],
+                    "hypothesis5": hypotheses[5],
+                    "hypothesis6": hypotheses[6],
+                    "hypothesis7": hypotheses[7],
+                }
+            ]
     return examples_qasc
 
+
 def siqa_loader(path, args):
     if args.calibration_prompt is not None:
         uncond_premise = args.calibration_prompt
@@ -597,33 +950,41 @@ def siqa_loader(path, args):
             label = int(label_sym[0]) - 1
             line = json.loads(line)
             premise = f"{line['context']} {line['question']}"
-            
-            options_text = [line['answerA'], line['answerB'], line['answerC']]
-            options_sym = ['A', 'B', 'C']
 
-            if getattr(args, 'multiple_choice_prompt', None) is not None:
-                # Question: Cameron returned home with a bag of candy to eat all night
+            options_text = [line["answerA"], line["answerB"], line["answerC"]]
+            options_sym = ["A", "B", "C"]
+
+            if getattr(args, "multiple_choice_prompt", None) is not None:
+                # Question: Cameron returned home with a bag of
+                # candy to eat all night
                 # long. What will Others want to do next?
                 # A. great
                 # B. buy the candy to eat
                 # C. bored
                 # Answer:
                 hypotheses = options_sym
-                premise = f"{args.multiple_choice_prompt} Question: {premise}\nA. {options_text[0]}\nB. {options_text[1]}\nC. {options_text[2]}\nAnswer:"
+                premise = (
+                    f"{args.multiple_choice_prompt} "
+                    + f"Question: {premise}\nA. {options_text[0]}\nB. "
+                    + f"{options_text[1]}\nC. {options_text[2]}\nAnswer:"
+                )
             else:
                 hypotheses = options_text
                 premise = premise + uncond_premise
-            
-            examples_siqa += [{
-                'label': label,
-                'premise': premise,
-                'uncond_premise': uncond_premise,
-                'hypothesis0': hypotheses[0],
-                'hypothesis1': hypotheses[1],
-                'hypothesis2': hypotheses[2],
-            }]
+
+            examples_siqa += [
+                {
+                    "label": label,
+                    "premise": premise,
+                    "uncond_premise": uncond_premise,
+                    "hypothesis0": hypotheses[0],
+                    "hypothesis1": hypotheses[1],
+                    "hypothesis2": hypotheses[2],
+                }
+            ]
     return examples_siqa
 
+
 def winogrande_loader(path, args):
     if args.calibration_prompt is not None:
         uncond_premise = args.calibration_prompt
@@ -636,32 +997,40 @@ def winogrande_loader(path, args):
         for line, label_sym in zip(lines, labels):
             label = int(label_sym[0]) - 1
             line = json.loads(line)
-            premise = line['sentence']
-            
-            options_text = [line['option1'], line['option2']]
-            options_sym = ['A', 'B']
+            premise = line["sentence"]
 
-            if getattr(args, 'multiple_choice_prompt', None) is not None:
-                # Question: So _ plays video games because Leslie has a lot of free time
+            options_text = [line["option1"], line["option2"]]
+            options_sym = ["A", "B"]
+
+            if getattr(args, "multiple_choice_prompt", None) is not None:
+                # Question: So _ plays video games because
+                # Leslie has a lot of free time
                 # while Nelson has to work all the time.
                 # A. Leslie
                 # B. Nelson
                 # Answer:
                 hypotheses = options_sym
-                premise = f"{args.multiple_choice_prompt} Question: {premise}\nA. {options_text[0]}\nB. {options_text[1]}\nAnswer:"
+                premise = (
+                    f"{args.multiple_choice_prompt} "
+                    + f"Question: {premise}\nA. {options_text[0]}\nB. "
+                    + f"{options_text[1]}\nAnswer:"
+                )
             else:
                 hypotheses = options_text
                 premise = premise + uncond_premise
-            
-            examples_winogrande += [{
-                'label': label,
-                'premise': premise,
-                'uncond_premise': uncond_premise,
-                'hypothesis0': hypotheses[0],
-                'hypothesis1': hypotheses[1],
-            }]
+
+            examples_winogrande += [
+                {
+                    "label": label,
+                    "premise": premise,
+                    "uncond_premise": uncond_premise,
+                    "hypothesis0": hypotheses[0],
+                    "hypothesis1": hypotheses[1],
+                }
+            ]
     return examples_winogrande
 
+
 def date_understanding_loader(path, args):
     if args.calibration_prompt is not None:
         uncond_premise = args.calibration_prompt
@@ -673,46 +1042,60 @@ def date_understanding_loader(path, args):
         with open(one_path) as json_file:
             data = json.load(json_file)
             task_prefix = data.get("task_prefix", "")
-            for instance in data['examples']:
-                options_text = list(instance['target_scores'].keys())
+            for instance in data["examples"]:
+                options_text = list(instance["target_scores"].keys())
                 num_options = len(options_text)
-                if args.num_options is not None and num_options != args.num_options:
+                if (
+                    args.num_options is not None
+                    and num_options != args.num_options
+                ):
                     continue
-                options_sym = [chr(ord('A') + i) for i in range(num_options)]
-                for target, score in instance['target_scores'].items():
+                options_sym = [chr(ord("A") + i) for i in range(num_options)]
+                for target, score in instance["target_scores"].items():
                     if score == 1:
-                        raw_label = target # e.g., stare wars
-                label = options_text.index(raw_label)             
-                premise = instance['input']
+                        raw_label = target  # e.g., stare wars
+                label = options_text.index(raw_label)
+                premise = instance["input"]
                 premise = task_prefix + premise
 
-                if getattr(args, 'multiple_choice_prompt', None) is not None:
-                    # Question: "Which of the following is a humorous edit of this artist or movie name: 'star wars'?"
+                if getattr(args, "multiple_choice_prompt", None) is not None:
+                    # Question: "Which of the following is a
+                    # humorous edit of this artist or movie name: 'star wars'?"
                     # A. stare wars
                     # B. stariwars
                     # C. ...
                     # D. ...
                     # Answer:
                     hypotheses = options_sym
-                    # premise = f"{args.multiple_choice_prompt} Question: {premise}\nA. {options_text[0]}\nB. {options_text[1]}\nC. {options_text[2]}\nD. {options_text[3]}\nE. {options_text[4]}\nAnswer:"
-                    premise = f"{args.multiple_choice_prompt} Question: {premise}\n"
+                    # premise = f"{args.multiple_choice_prompt} Question: \
+                    #     {premise}\nA. {options_text[0]}\nB. \
+                    #         {options_text[1]}\nC. \
+                    #             {options_text[2]}\nD. \
+                    #                 {options_text[3]}\nE. \
+                    #                     {options_text[4]}\nAnswer:"
+                    premise = (
+                        f"{args.multiple_choice_prompt} Question: {premise}\n"
+                    )
                     for idx in range(num_options):
-                        premise += f"{options_sym[idx]}. {options_text[idx]}\n" 
+                        premise += f"{options_sym[idx]}. {options_text[idx]}\n"
                     premise += "Answer:"
                 else:
                     hypotheses = options_text
                     premise = premise + uncond_premise
-                example = [{
-                    'label': label,
-                    'premise': premise,
-                    'uncond_premise': uncond_premise,
-                }]
+                example = [
+                    {
+                        "label": label,
+                        "premise": premise,
+                        "uncond_premise": uncond_premise,
+                    }
+                ]
                 for idx in range(num_options):
-                    example[0][f'hypothesis{idx}'] = hypotheses[idx]
+                    example[0][f"hypothesis{idx}"] = hypotheses[idx]
 
                 examples += example
     return examples
 
+
 def anli_loader(path, args):
     if args.calibration_prompt is not None:
         uncond_premise = args.calibration_prompt
@@ -727,132 +1110,179 @@ def anli_loader(path, args):
         with open(one_path) as lines:
             for line in lines:
                 line = json.loads(line)
-                label = ['e', 'n', 'c'].index(line['label'])       
+                label = ["e", "n", "c"].index(line["label"])
                 premise = f"{line['context']} {line['hypothesis']}"
-                
-                if getattr(args, 'multiple_choice_prompt', None) is not None:
-                    # Question: "Which of the following is a humorous edit of this artist or movie name: 'star wars'?"
+
+                if getattr(args, "multiple_choice_prompt", None) is not None:
+                    # Question: "Which of the following is a humorous \
+                    # edit of this artist or movie name: 'star wars'?"
                     # A. entailment
                     # B. neutral
                     # C. contradiction
                     # Answer:
                     hypotheses = options_sym
-                    # premise = f"{args.multiple_choice_prompt} Question: {premise}\nA. {options_text[0]}\nB. {options_text[1]}\nC. {options_text[2]}\nD. {options_text[3]}\nE. {options_text[4]}\nAnswer:"
-                    premise = f"{args.multiple_choice_prompt} Question: {premise}\n"
+                    # premise = f"{args.multiple_choice_prompt} Question: \
+                    #     {premise}\nA. {options_text[0]}\nB. \
+                    #         {options_text[1]}\nC. \
+                    #             {options_text[2]}\nD. \
+                    #                 {options_text[3]}\nE. \
+                    #                     {options_text[4]}\nAnswer:"
+                    premise = (
+                        f"{args.multiple_choice_prompt} Question: {premise}\n"
+                    )
                     for idx in range(num_options):
-                        premise += f"{options_sym[idx]}. {options_text[idx]}\n" 
-                    premise += "Answer:"                
+                        premise += f"{options_sym[idx]}. {options_text[idx]}\n"
+                    premise += "Answer:"
                 else:
                     hypotheses = options_text
                     premise = premise + uncond_premise
-                example = [{
-                    'label': label,
-                    'premise': premise,
-                    'uncond_premise': uncond_premise,
-                }]
+                example = [
+                    {
+                        "label": label,
+                        "premise": premise,
+                        "uncond_premise": uncond_premise,
+                    }
+                ]
                 for idx in range(num_options):
-                    example[0][f'hypothesis{idx}'] = hypotheses[idx]
+                    example[0][f"hypothesis{idx}"] = hypotheses[idx]
                 examples += example
 
     return examples
 
+
 def generate_n_shot_demonstrations(n_shot_dataset):
     n_shot_demonstrations = ""
     for raw_instance in n_shot_dataset:
-        presmise = raw_instance['premise']
-        answer_index = raw_instance['label'].item()
+        presmise = raw_instance["premise"]
+        answer_index = raw_instance["label"].item()
         answer = raw_instance[f"hypothesis{answer_index}"]
         n_shot_instance = f"{presmise}{answer}\n\n"
         n_shot_demonstrations += n_shot_instance
     return n_shot_demonstrations
 
+
 def create_n_shot_splits(raw_dataset, n_shot_dataset, args):
     n_shot_demonstrations = ""
     if args.n_shot > 0:
-        # few-shot setting: sample from train split, dev split (COPA), or the only split (BB)
-        if n_shot_dataset is raw_dataset: # BB tasks: sampling from the only split, and use the rest
-            raw_dataset = raw_dataset.train_test_split(test_size=args.n_shot, seed=args.seed)
-            raw_dataset, n_shot_dataset = raw_dataset["train"], raw_dataset["test"]
+        # few-shot setting: sample from train split, dev split (COPA),
+        # or the only split (BB)
+        if (
+            n_shot_dataset is raw_dataset
+        ):  # BB tasks: sampling from the only split, and use the rest
+            raw_dataset = raw_dataset.train_test_split(
+                test_size=args.n_shot, seed=args.seed
+            )
+            raw_dataset, n_shot_dataset = (
+                raw_dataset["train"],
+                raw_dataset["test"],
+            )
         else:
-            n_shot_dataset = n_shot_dataset.shuffle(seed=args.seed).select(range(args.n_shot))
+            n_shot_dataset = n_shot_dataset.shuffle(seed=args.seed).select(
+                range(args.n_shot)
+            )
         n_shot_demonstrations = generate_n_shot_demonstrations(n_shot_dataset)
 
     if args.sample is not None and args.sample <= len(raw_dataset):
         # sample "sample" amount of data from raw_data
-        raw_dataset = raw_dataset.shuffle(seed=args.seed).select(range(args.sample))
-    
+        raw_dataset = raw_dataset.shuffle(seed=args.seed).select(
+            range(args.sample)
+        )
+
     if args.n_shot > 0:
         # append n_shot_demonstrations to each input.
-        raw_dataset = raw_dataset.map(lambda x: {"premise": n_shot_demonstrations + x["premise"]}) 
-    else: # zero-shot: no need to return n_shot_dataset
+        raw_dataset = raw_dataset.map(
+            lambda x: {"premise": n_shot_demonstrations + x["premise"]}
+        )
+    else:  # zero-shot: no need to return n_shot_dataset
         n_shot_dataset = None
     return raw_dataset, n_shot_dataset, n_shot_demonstrations
 
+
 def generate_n_shot_poe_demonstrations(n_shot_dataset, num_of_options):
     # designed for multiple_choice_prompt
     alphabets = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"[:num_of_options]
     last_option = alphabets[-1]
-    null_string = f"[MASK]"
+    null_string = "[MASK]"
     n_shot_demonstrations = ""
     n_shot_poe_demonstrations = ""
     for raw_instance in n_shot_dataset:
-        premise = raw_instance['premise']
-        answer_index = raw_instance['label'].item()
+        premise = raw_instance["premise"]
+        answer_index = raw_instance["label"].item()
         answer = raw_instance[f"hypothesis{answer_index}"]
         n_shot_instance = f"{premise}{answer}\n\n"
         n_shot_demonstrations += n_shot_instance
 
         # for mcp: randomly mask options to [MASK]
-        poe_premise = premise 
+        poe_premise = premise
         new_alphabets = alphabets.replace(alphabets[answer_index], "")
-        num_of_mask_options = random.randint(1, num_of_options-1)
-        mask_option_symbols = random.sample(new_alphabets, num_of_mask_options) # e.g., [B, C]
+        num_of_mask_options = random.randint(1, num_of_options - 1)
+        mask_option_symbols = random.sample(
+            new_alphabets, num_of_mask_options
+        )  # e.g., [B, C]
         for symbol in mask_option_symbols:
             option_start_index = poe_premise.rfind(f"{symbol}. ")
             if symbol == last_option:
-                option_end_index = poe_premise.rfind(f"Answer:")
+                option_end_index = poe_premise.rfind("Answer:")
             else:
-                option_end_index = poe_premise.rfind(f"{alphabets[alphabets.index(symbol) + 1]}. ") 
+                option_end_index = poe_premise.rfind(
+                    f"{alphabets[alphabets.index(symbol) + 1]}. "
+                )
             option = poe_premise[option_start_index:option_end_index]
-            poe_premise = poe_premise.replace(option, f"{symbol}. {null_string}\n")
-            
-        n_shot_poe_instance = f"{poe_premise}{answer}\n\n"        
+            poe_premise = poe_premise.replace(
+                option, f"{symbol}. {null_string}\n"
+            )
+
+        n_shot_poe_instance = f"{poe_premise}{answer}\n\n"
         n_shot_poe_demonstrations += n_shot_poe_instance
     return n_shot_demonstrations, n_shot_poe_demonstrations
 
+
 def vqa_loader(path, args):
-    version_type = '' # this should be '' when using VQA v2.0 dataset
-    task_type = 'MultipleChoice' # 'OpenEnded' only for v2.0. 'OpenEnded' or 'MultipleChoice' for v1.0
-    data_type = 'mscoco'  # 'mscoco' only for v1.0. 'mscoco' for real and 'abstract_v002' for abstract for v1.0.
-    data_subtype = 'train2014'
-    ann_file = '%s/Annotations/%s%s_%s_annotations.json'%(path, version_type, data_type, data_subtype)
-    question_file = '%s/Questions/%s%s_%s_%s_questions.json'%(path, version_type, task_type, data_type, data_subtype)
-    img_dir = '%s/Images/%s/%s' %(path, data_type, data_subtype)
+    version_type = ""
+    task_type = "MultipleChoice"
+    data_type = "mscoco"
+    data_subtype = "train2014"
+    ann_file = "%s/Annotations/%s%s_%s_annotations.json" % (
+        path,
+        version_type,
+        data_type,
+        data_subtype,
+    )
+    question_file = "%s/Questions/%s%s_%s_%s_questions.json" % (
+        path,
+        version_type,
+        task_type,
+        data_type,
+        data_subtype,
+    )
+    img_dir = "%s/Images/%s/%s" % (path, data_type, data_subtype)
     alphabets = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
 
     examples = []
 
-    print('Loading annotations and questions...')
-    train_anno = json.load(open(ann_file, 'r'))
-    train_ques = json.load(open(question_file, 'r'))
+    print("Loading annotations and questions...")
+    train_anno = json.load(open(ann_file, "r"))
+    train_ques = json.load(open(question_file, "r"))
 
     if args.calibration_prompt is not None:
         uncond_premise = args.calibration_prompt
     else:
         uncond_premise = " the answer is:"
 
-    for i in range(len(train_anno['annotations'])):
-        ans = train_anno['annotations'][i]['multiple_choice_answer']
-        img_id = train_anno['annotations'][i]['image_id']
+    for i in range(len(train_anno["annotations"])):
+        ans = train_anno["annotations"][i]["multiple_choice_answer"]
+        img_id = train_anno["annotations"][i]["image_id"]
         # question_id = train_anno['annotations'][i]['question_id']
-        image_path = os.path.join(img_dir, 'COCO_train2014_' + '%012d.jpg' % img_id)
+        image_path = os.path.join(
+            img_dir, "COCO_train2014_" + "%012d.jpg" % img_id
+        )
 
-        question = train_ques['questions'][i]['question']
-        mc_ans = train_ques['questions'][i]['multiple_choices']
+        question = train_ques["questions"][i]["question"]
+        mc_ans = train_ques["questions"][i]["multiple_choices"]
         label = mc_ans.index(ans)
-        num_options = args.num_options
+        # num_options = args.num_options
 
-        if getattr(args, 'multiple_choice_prompt', None) is not None:
+        if getattr(args, "multiple_choice_prompt", None) is not None:
             hypotheses = mc_ans
             # Question: How does a bishop move from one place to another?
             # A. chess game
@@ -861,34 +1291,40 @@ def vqa_loader(path, args):
             # D. queen
             # E. cathedral
             # Answer:
-            options = "\n".join([f"{alphabets[i]}. {ans}" for i, ans in enumerate(mc_ans)])
-            premise = f"{args.multiple_choice_prompt} Question: {question}\n{options}\nAnswer:"
+            options = "\n".join(
+                [f"{alphabets[i]}. {ans}" for i, ans in enumerate(mc_ans)]
+            )
+            premise = f"{args.multiple_choice_prompt} \
+                Question: {question}\n{options}\nAnswer:"
         else:
             hypotheses = mc_ans
             premise = question + uncond_premise
 
-        example = [{
-            'premise': premise, 
-            'image_path': image_path, 
-            'uncond_premise': uncond_premise,  
-            'label': label
-            }]
-    
+        example = [
+            {
+                "premise": premise,
+                "image_path": image_path,
+                "uncond_premise": uncond_premise,
+                "label": label,
+            }
+        ]
+
         for idx, ans in enumerate(hypotheses):
-            example[0][f'hypothesis{idx}'] = ans
-        examples+=example
-    
+            example[0][f"hypothesis{idx}"] = ans
+        examples += example
+
     return examples
 
+
 def scienceqa_loader(path, args):
-    ann_file = '%s/ScienceQA_DATA/problems.json'%(path)
-    img_dir = '%s/ScienceQA_DATA/train' %(path)
+    ann_file = "%s/ScienceQA_DATA/problems.json" % (path)
+    img_dir = "%s/ScienceQA_DATA/train" % (path)
     alphabets = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
 
     examples = []
 
-    print('Loading annotations and images...')
-    anno = json.load(open(ann_file, 'r'))
+    print("Loading annotations and images...")
+    anno = json.load(open(ann_file, "r"))
     # train_test_split = json.load(open(traintestFile, 'r'))
     train_ids = os.listdir(img_dir)
     train_anno = {id: anno[id] for id in train_ids}
@@ -900,16 +1336,16 @@ def scienceqa_loader(path, args):
 
     for i, (id, value) in enumerate(train_anno.items()):
         img_id = id
-        question = value['question']
-        mc_ans = value['choices']
-        label = int(value['answer'])
+        question = value["question"]
+        mc_ans = value["choices"]
+        label = int(value["answer"])
         image_file = value["image"]
 
-        if (not len(mc_ans) == args.num_options) or (image_file == None):
+        if (not len(mc_ans) == args.num_options) or (image_file is None):
             continue
 
         image_path = os.path.join(os.path.join(img_dir, img_id), image_file)
-        if getattr(args, 'multiple_choice_prompt', None) is not None:
+        if getattr(args, "multiple_choice_prompt", None) is not None:
             hypotheses = mc_ans
             # Question: How does a bishop move from one place to another?
             # A. chess game
@@ -917,33 +1353,39 @@ def scienceqa_loader(path, args):
             # C. in a car
             # D. queen
             # Answer:
-            options = "\n".join([f"{alphabets[i]}. {ans}" for i, ans in enumerate(mc_ans)])
-            premise = f"{args.multiple_choice_prompt} Question: {question}\n{options}\nAnswer:"
+            options = "\n".join(
+                [f"{alphabets[i]}. {ans}" for i, ans in enumerate(mc_ans)]
+            )
+            premise = f"{args.multiple_choice_prompt} \
+                Question: {question}\n{options}\nAnswer:"
         else:
             hypotheses = mc_ans
             premise = question + uncond_premise
 
-        example = [{
-            'premise': premise, 
-            'image_path': image_path, 
-            'uncond_premise': uncond_premise,  
-            'label': label
-            }]
-    
+        example = [
+            {
+                "premise": premise,
+                "image_path": image_path,
+                "uncond_premise": uncond_premise,
+                "label": label,
+            }
+        ]
+
         for idx, ans in enumerate(hypotheses):
-            example[0][f'hypothesis{idx}'] = ans
-        examples+=example
-    print("Dataset Length: ",len(examples))
+            example[0][f"hypothesis{idx}"] = ans
+        examples += example
+    print("Dataset Length: ", len(examples))
     return examples
 
+
 def ai2d_loader(path, args):
-    question_dir = '%s/ai2d/questions' %(path)
-    imgDir = '%s/ai2d/images' %(path)
+    question_dir = "%s/ai2d/questions" % (path)
+    imgDir = "%s/ai2d/images" % (path)
     alphabets = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
 
     examples = []
 
-    print('Loading annotations and images...')
+    print("Loading annotations and images...")
     train_files = os.listdir(question_dir)
 
     if args.calibration_prompt is not None:
@@ -952,19 +1394,23 @@ def ai2d_loader(path, args):
         uncond_premise = " the answer is:"
 
     for i, file in enumerate(train_files):
-        anno = json.load(open(os.path.join(question_dir, file), 'r'))
+        anno = json.load(open(os.path.join(question_dir, file), "r"))
         questions = anno["questions"]
         imageName = anno["imageName"]
         for question, value in questions.items():
-            mc_ans = value['answerTexts']
-            label = int(value['correctAnswer'])
-            abcLabel = value['abcLabel']
-
-            if (not len(mc_ans) == args.num_options) or (imageName == None) or abcLabel == True:
+            mc_ans = value["answerTexts"]
+            label = int(value["correctAnswer"])
+            abcLabel = value["abcLabel"]
+
+            if (
+                (not len(mc_ans) == args.num_options)
+                or (imageName is None)
+                or abcLabel is True
+            ):
                 continue
 
             image_path = os.path.join(imgDir, imageName)
-            if getattr(args, 'multiple_choice_prompt', None) is not None:
+            if getattr(args, "multiple_choice_prompt", None) is not None:
                 hypotheses = mc_ans
                 # Question: How does a bishop move from one place to another?
                 # A. chess game
@@ -972,42 +1418,48 @@ def ai2d_loader(path, args):
                 # C. in a car
                 # D. queen
                 # Answer:
-                options = "\n".join([f"{alphabets[i]}. {ans}" for i, ans in enumerate(mc_ans)])
-                premise = f"{args.multiple_choice_prompt} Question: {question}\n{options}\nAnswer:"
+                options = "\n".join(
+                    [f"{alphabets[i]}. {ans}" for i, ans in enumerate(mc_ans)]
+                )
+                premise = f"{args.multiple_choice_prompt} \
+                    Question: {question}\n{options}\nAnswer:"
             else:
                 hypotheses = mc_ans
                 premise = question + uncond_premise
 
-            example = [{
-                'premise': premise, 
-                'image_path': image_path, 
-                'uncond_premise': uncond_premise,  
-                'label': label
-                }]
-        
+            example = [
+                {
+                    "premise": premise,
+                    "image_path": image_path,
+                    "uncond_premise": uncond_premise,
+                    "label": label,
+                }
+            ]
+
             for idx, ans in enumerate(hypotheses):
-                example[0][f'hypothesis{idx}'] = ans
-            examples+=example
-    print("Dataset Length: ",len(examples))
+                example[0][f"hypothesis{idx}"] = ans
+            examples += example
+    print("Dataset Length: ", len(examples))
     return examples
 
+
 def single_inference_loader(path, args):
     alphabets = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
 
     examples = []
 
-    print('Loading single question and image...')
+    print("Loading single question and image...")
     question = args.question
     mc_ans = args.choices
     label = args.label
     image_path = path
-    
+
     if args.calibration_prompt is not None:
         uncond_premise = args.calibration_prompt
     else:
         uncond_premise = " the answer is:"
 
-    if getattr(args, 'multiple_choice_prompt', None) is not None:
+    if getattr(args, "multiple_choice_prompt", None) is not None:
         hypotheses = mc_ans
         # Question: How does a bishop move from one place to another?
         # A. chess game
@@ -1015,21 +1467,26 @@ def single_inference_loader(path, args):
         # C. in a car
         # D. queen
         # Answer:
-        options = "\n".join([f"{alphabets[i]}. {ans}" for i, ans in enumerate(mc_ans)])
-        premise = f"{args.multiple_choice_prompt} Question: {question}\n{options}\nAnswer:"
+        options = "\n".join(
+            [f"{alphabets[i]}. {ans}" for i, ans in enumerate(mc_ans)]
+        )
+        premise = f"{args.multiple_choice_prompt} \
+            Question: {question}\n{options}\nAnswer:"
     else:
         hypotheses = mc_ans
         premise = question + uncond_premise
 
-    example = [{
-        'premise': premise, 
-        'image_path': image_path, 
-        'uncond_premise': uncond_premise,  
-        'label': label
-        }]
+    example = [
+        {
+            "premise": premise,
+            "image_path": image_path,
+            "uncond_premise": uncond_premise,
+            "label": label,
+        }
+    ]
 
     for idx, ans in enumerate(hypotheses):
-        example[0][f'hypothesis{idx}'] = ans
-    examples+=example
-    print("Dataset Length: ",len(examples))
-    return examples
\ No newline at end of file
+        example[0][f"hypothesis{idx}"] = ans
+    examples += example
+    print("Dataset Length: ", len(examples))
+    return examples
diff --git a/mm_poe/methods/utils/methods.py b/mm_poe/methods/utils/methods.py
index 70c292d..6daeac0 100644
--- a/mm_poe/methods/utils/methods.py
+++ b/mm_poe/methods/utils/methods.py
@@ -5,6 +5,7 @@
 from torch.utils.data import DataLoader
 from transformers import GenerationConfig
 
+
 def inference_language_modeling_old(model, eval_dataloader, device):
     model.eval()
     predictions = torch.zeros(0)
@@ -14,34 +15,63 @@ def inference_language_modeling_old(model, eval_dataloader, device):
     pbar = tqdm(eval_dataloader, desc="Inference")
     for batch in pbar:
         # e.g., (batch_size, #option, ending_seq_len): (32, 2, 18)
-        ending_shape = batch["ending_input_ids"].shape 
+        ending_shape = batch["ending_input_ids"].shape
         # flatten
-        header_input_ids = batch["header_input_ids"].view(-1, batch["header_input_ids"].shape[-1]).to(device)
-        ending_input_ids = batch["ending_input_ids"].view(-1, batch["ending_input_ids"].shape[-1]).to(device)
-        
+        header_input_ids = (
+            batch["header_input_ids"]
+            .view(-1, batch["header_input_ids"].shape[-1])
+            .to(device)
+        )
+        ending_input_ids = (
+            batch["ending_input_ids"]
+            .view(-1, batch["ending_input_ids"].shape[-1])
+            .to(device)
+        )
+
         # adding this line of code takes me more than an hour.
         # without adding torch.no_grad, GPU usage will muiltply by 4.
         with torch.no_grad():
-            outputs = model(input_ids = header_input_ids, labels = ending_input_ids)
-        
+            outputs = model(
+                input_ids=header_input_ids, labels=ending_input_ids
+            )
+
         _, logits = outputs.loss, outputs.logits
         # e.g., (batch_size * #option, ending_seq_len, #vocab): (64, 18, 32128)
         logits = logits.view(-1, logits.shape[-1])
         # ignore padding token: 0
-        ce_loss = F.cross_entropy(logits, ending_input_ids.view(-1), reduction="none", ignore_index=0).detach().cpu()
+        ce_loss = (
+            F.cross_entropy(
+                logits,
+                ending_input_ids.view(-1),
+                reduction="none",
+                ignore_index=0,
+            )
+            .detach()
+            .cpu()
+        )
         # each score is the negative log-likelihood of a ending given a header.
-        batch_predictions = ce_loss.view(ending_shape).sum(dim=-1).argmin(dim=-1)
+        batch_predictions = (
+            ce_loss.view(ending_shape).sum(dim=-1).argmin(dim=-1)
+        )
         batch_labels = batch["label"]
         predictions = torch.cat((predictions, batch_predictions))
         labels = torch.cat((labels, batch_labels))
-    
+
         # make accuracy accumulative
-        batch_accuracy = (batch_predictions == batch_labels).sum().item() / len(batch_labels)
+        batch_accuracy = (
+            batch_predictions == batch_labels
+        ).sum().item() / len(batch_labels)
         total_accuracy = (predictions == labels).sum().item() / len(labels)
-        pbar.set_description(f"Total Accuracy: {total_accuracy:.4f}, Batch Accuracy: {batch_accuracy:.4f}")
+        pbar.set_description(
+            f"Total Accuracy: {total_accuracy:.4f}, \
+                Batch Accuracy: {batch_accuracy:.4f}"
+        )
     return total_accuracy
 
-def inference_contrastive_decoding_old(amateur_model, expert_model, eval_dataloader, device):
+
+def inference_contrastive_decoding_old(
+    amateur_model, expert_model, eval_dataloader, device
+):
     amateur_model.eval()
     expert_model.eval()
     predictions = torch.zeros(0)
@@ -51,34 +81,65 @@ def inference_contrastive_decoding_old(amateur_model, expert_model, eval_dataloa
     pbar = tqdm(eval_dataloader, desc="Inference")
     for batch in pbar:
         # e.g., (batch_size, #option, ending_seq_len): (32, 2, 18)
-        ending_shape = batch["ending_input_ids"].shape 
+        ending_shape = batch["ending_input_ids"].shape
         # flatten
-        header_input_ids = batch["header_input_ids"].view(-1, batch["header_input_ids"].shape[-1]).to(device)
-        ending_input_ids = batch["ending_input_ids"].view(-1, batch["ending_input_ids"].shape[-1]).to(device)
-        
+        header_input_ids = (
+            batch["header_input_ids"]
+            .view(-1, batch["header_input_ids"].shape[-1])
+            .to(device)
+        )
+        ending_input_ids = (
+            batch["ending_input_ids"]
+            .view(-1, batch["ending_input_ids"].shape[-1])
+            .to(device)
+        )
+
         # key step: compute logits.
         with torch.no_grad():
-            amateur_model_logits = amateur_model(input_ids = header_input_ids, labels = ending_input_ids).logits
-            expert_model_logits = expert_model(input_ids = header_input_ids, labels = ending_input_ids).logits
-        
+            amateur_model_logits = amateur_model(
+                input_ids=header_input_ids, labels=ending_input_ids
+            ).logits
+            expert_model_logits = expert_model(
+                input_ids=header_input_ids, labels=ending_input_ids
+            ).logits
+
         logits = expert_model_logits - amateur_model_logits
         # e.g., (batch_size * #option, ending_seq_len, #vocab): (64, 18, 32128)
         logits = logits.view(-1, logits.shape[-1])
         # ignore padding token: 0
-        ce_loss = F.cross_entropy(logits, ending_input_ids.view(-1), reduction="none", ignore_index=0).detach().cpu()
+        ce_loss = (
+            F.cross_entropy(
+                logits,
+                ending_input_ids.view(-1),
+                reduction="none",
+                ignore_index=0,
+            )
+            .detach()
+            .cpu()
+        )
         # each score is the negative log-likelihood of a ending given a header.
-        batch_predictions = ce_loss.view(ending_shape).sum(dim=-1).argmin(dim=-1)
+        batch_predictions = (
+            ce_loss.view(ending_shape).sum(dim=-1).argmin(dim=-1)
+        )
         batch_labels = batch["label"]
         predictions = torch.cat((predictions, batch_predictions))
         labels = torch.cat((labels, batch_labels))
-    
+
         # make accuracy accumulative
-        batch_accuracy = (batch_predictions == batch_labels).sum().item() / len(batch_labels)
+        batch_accuracy = (
+            batch_predictions == batch_labels
+        ).sum().item() / len(batch_labels)
         total_accuracy = (predictions == labels).sum().item() / len(labels)
-        pbar.set_description(f"Total Accuracy: {total_accuracy:.4f}, Batch Accuracy: {batch_accuracy:.4f}")
+        pbar.set_description(
+            f"Total Accuracy: {total_accuracy:.4f}, \
+                Batch Accuracy: {batch_accuracy:.4f}"
+        )
     return total_accuracy
 
-def inference_language_modeling(model, eval_dataloader, device, compute_func, pad_token_id):
+
+def inference_language_modeling(
+    model, eval_dataloader, device, compute_func, pad_token_id
+):
     model.eval()
     lm_predictions = torch.zeros(0)
     avg_lm_predictions = torch.zeros(0)
@@ -91,23 +152,39 @@ def inference_language_modeling(model, eval_dataloader, device, compute_func, pa
         log_prob = compute_func(batch, model, device, pad_token_id)
         avg_log_prob = log_prob / batch["ending_attention_mask"].sum(dim=-1)
         avg_log_probs.append(avg_log_prob)
-        
+
         batch_predictions = log_prob.argmin(dim=-1)
         batch_avg_predictions = avg_log_prob.argmin(dim=-1)
 
         batch_labels = batch["label"]
         lm_predictions = torch.cat((lm_predictions, batch_predictions))
-        avg_lm_predictions = torch.cat((avg_lm_predictions, batch_avg_predictions))
+        avg_lm_predictions = torch.cat(
+            (avg_lm_predictions, batch_avg_predictions)
+        )
         labels = torch.cat((labels, batch_labels))
-    
+
         # make accuracy accumulative
         lm_accuracy = (lm_predictions == labels).sum().item() / len(labels)
-        avg_lm_accuracy = (avg_lm_predictions == labels).sum().item() / len(labels)
-        pbar.set_description(f"Language modeling accuracy: {lm_accuracy:.4f}, Average language modeling accuracy: {avg_lm_accuracy:.4f}")
+        avg_lm_accuracy = (avg_lm_predictions == labels).sum().item() / len(
+            labels
+        )
+        pbar.set_description(
+            f"Language modeling accuracy: {lm_accuracy:.4f},\
+                Average language modeling accuracy: {avg_lm_accuracy:.4f}"
+        )
     avg_log_probs = torch.cat(avg_log_probs, dim=0)
     return avg_log_probs, lm_accuracy, avg_lm_accuracy, lm_predictions
 
-def inference_generate_synonyms(model, eval_dataloader, device, compute_func, pad_token_id, num_of_options, num_of_synonyms):
+
+def inference_generate_synonyms(
+    model,
+    eval_dataloader,
+    device,
+    compute_func,
+    pad_token_id,
+    num_of_options,
+    num_of_synonyms,
+):
     model.eval()
     lm_predictions = torch.zeros(0)
     avg_lm_predictions = torch.zeros(0)
@@ -122,31 +199,56 @@ def inference_generate_synonyms(model, eval_dataloader, device, compute_func, pa
         avg_log_probs.append(avg_log_prob)
 
         # need to aggregate according to original options.
-        # each row in log_prob correspond to: h0, h1, ..., hn, h0s0, h0s1, ..., h1s0, ..., 
-        
-        # indexing log_prob to rearrange rows by keeping options with corresponding synonyms together.  
-        log_prob = aggregate_optionw_with_synonyms(log_prob, num_of_options, num_of_synonyms)
-        avg_log_prob = aggregate_optionw_with_synonyms(avg_log_prob, num_of_options, num_of_synonyms)
+        # each row in log_prob correspond to: h0, h1, ..., hn,
+        # h0s0, h0s1, ..., h1s0, ...,
+
+        # indexing log_prob to rearrange rows by keeping
+        # options with corresponding synonyms together.
+        log_prob = aggregate_optionw_with_synonyms(
+            log_prob, num_of_options, num_of_synonyms
+        )
+        avg_log_prob = aggregate_optionw_with_synonyms(
+            avg_log_prob, num_of_options, num_of_synonyms
+        )
         # then reshape, and then aggregate options and synonyms by averaging.
-        log_prob = log_prob.view(-1, num_of_options, num_of_synonyms + 1).mean(dim=-1)
-        avg_log_prob = avg_log_prob.view(-1, num_of_options, num_of_synonyms + 1).mean(dim=-1)
+        log_prob = log_prob.view(-1, num_of_options, num_of_synonyms + 1).mean(
+            dim=-1
+        )
+        avg_log_prob = avg_log_prob.view(
+            -1, num_of_options, num_of_synonyms + 1
+        ).mean(dim=-1)
 
         batch_predictions = log_prob.argmin(dim=-1)
         batch_avg_predictions = avg_log_prob.argmin(dim=-1)
 
         batch_labels = batch["label"]
         lm_predictions = torch.cat((lm_predictions, batch_predictions))
-        avg_lm_predictions = torch.cat((avg_lm_predictions, batch_avg_predictions))
+        avg_lm_predictions = torch.cat(
+            (avg_lm_predictions, batch_avg_predictions)
+        )
         labels = torch.cat((labels, batch_labels))
-    
+
         # make accuracy accumulative
         lm_accuracy = (lm_predictions == labels).sum().item() / len(labels)
-        avg_lm_accuracy = (avg_lm_predictions == labels).sum().item() / len(labels)
-        pbar.set_description(f"Language modeling accuracy: {lm_accuracy:.4f}, Average language modeling accuracy: {avg_lm_accuracy:.4f}")
+        avg_lm_accuracy = (avg_lm_predictions == labels).sum().item() / len(
+            labels
+        )
+        pbar.set_description(
+            f"Language modeling accuracy: {lm_accuracy:.4f}, \
+                Average language modeling accuracy: {avg_lm_accuracy:.4f}"
+        )
     avg_log_probs = torch.cat(avg_log_probs, dim=0)
     return avg_log_probs, lm_accuracy, avg_lm_accuracy, lm_predictions
 
-def inference_calibration(model, eval_dataloader, eval_calibration_dataloader, device, compute_func, pad_token_id):
+
+def inference_calibration(
+    model,
+    eval_dataloader,
+    eval_calibration_dataloader,
+    device,
+    compute_func,
+    pad_token_id,
+):
     model.eval()
     lm_predictions = torch.zeros(0)
     avg_lm_predictions = torch.zeros(0)
@@ -154,10 +256,16 @@ def inference_calibration(model, eval_dataloader, eval_calibration_dataloader, d
     torch.cuda.empty_cache()
     avg_log_probs = []
 
-    pbar = tqdm(zip(eval_dataloader, eval_calibration_dataloader), desc="Inference", total=len(eval_dataloader))
+    pbar = tqdm(
+        zip(eval_dataloader, eval_calibration_dataloader),
+        desc="Inference",
+        total=len(eval_dataloader),
+    )
     for batch, batch_calibration in pbar:
         log_prob = compute_func(batch, model, device, pad_token_id)
-        log_prob_calibration = compute_func(batch_calibration, model, device, pad_token_id)
+        log_prob_calibration = compute_func(
+            batch_calibration, model, device, pad_token_id
+        )
         log_prob = log_prob - log_prob_calibration
         avg_log_prob = log_prob / batch["ending_attention_mask"].sum(dim=-1)
         avg_log_probs.append(avg_log_prob)
@@ -167,16 +275,24 @@ def inference_calibration(model, eval_dataloader, eval_calibration_dataloader, d
 
         batch_labels = batch["label"]
         lm_predictions = torch.cat((lm_predictions, batch_predictions))
-        avg_lm_predictions = torch.cat((avg_lm_predictions, batch_avg_predictions))
+        avg_lm_predictions = torch.cat(
+            (avg_lm_predictions, batch_avg_predictions)
+        )
         labels = torch.cat((labels, batch_labels))
-    
+
         # make accuracy accumulative
         lm_accuracy = (lm_predictions == labels).sum().item() / len(labels)
-        avg_lm_accuracy = (avg_lm_predictions == labels).sum().item() / len(labels)
-        pbar.set_description(f"Calibration accuracy: {lm_accuracy:.4f}, Average calibration accuracy: {avg_lm_accuracy:.4f}")
+        avg_lm_accuracy = (avg_lm_predictions == labels).sum().item() / len(
+            labels
+        )
+        pbar.set_description(
+            f"Calibration accuracy: {lm_accuracy:.4f}, \
+                Average calibration accuracy: {avg_lm_accuracy:.4f}"
+        )
     avg_log_probs = torch.cat(avg_log_probs, dim=0)
     return avg_log_probs, lm_accuracy, avg_lm_accuracy, lm_predictions
 
+
 def inference_contrastive_decoding(method, model, **kwargs):
     args = kwargs["args"]
     raw_dataset = kwargs["raw_dataset"]
@@ -190,57 +306,136 @@ def inference_contrastive_decoding(method, model, **kwargs):
     preprocess_func = kwargs["preprocess_func"]
     preprocess_func_channel = kwargs["preprocess_func_channel"]
 
-    fn_kwargs = {"ending_names": ending_names, 
-                    "header_name": header_name, 
-                    "tokenizer": tokenizer,}
-    if args.model_family in ["BLIP2", "InstructBLIP", "GIT", "PaliGemma", "Idefics2"]:
-        fn_kwargs = {"ending_names": ending_names, 
-                    "header_name": header_name, 
-                    "tokenizer": tokenizer,
-                    "processor": processor,
-                    "image_header_name": image_header_name}
-    num_of_options = len(ending_names)
-    tokenized_dataset = raw_dataset.map(preprocess_func, fn_kwargs=fn_kwargs, batched=True, batch_size=args.batch_size)
-    eval_dataloader = DataLoader(tokenized_dataset, batch_size=args.batch_size, shuffle=False)
+    fn_kwargs = {
+        "ending_names": ending_names,
+        "header_name": header_name,
+        "tokenizer": tokenizer,
+    }
+    if args.model_family in [
+        "BLIP2",
+        "InstructBLIP",
+        "GIT",
+        "PaliGemma",
+        "Idefics2",
+    ]:
+        fn_kwargs = {
+            "ending_names": ending_names,
+            "header_name": header_name,
+            "tokenizer": tokenizer,
+            "processor": processor,
+            "image_header_name": image_header_name,
+        }
+    # num_of_options = len(ending_names)
+    tokenized_dataset = raw_dataset.map(
+        preprocess_func,
+        fn_kwargs=fn_kwargs,
+        batched=True,
+        batch_size=args.batch_size,
+    )
+    eval_dataloader = DataLoader(
+        tokenized_dataset, batch_size=args.batch_size, shuffle=False
+    )
     if method in ["language_modeling", "multiple_choice_prompt"]:
-        avg_log_probs, lm_accuracy, avg_lm_accuracy, lm_predictions = inference_language_modeling(model, eval_dataloader, device, compute_func, tokenizer.pad_token_id)
+        avg_log_probs, lm_accuracy, avg_lm_accuracy, lm_predictions = (
+            inference_language_modeling(
+                model,
+                eval_dataloader,
+                device,
+                compute_func,
+                tokenizer.pad_token_id,
+            )
+        )
     elif method == "calibration":
-        fn_kwargs = {"ending_names": ending_names, 
-                    "header_name": "uncond_premise", # the difference is here
-                    "tokenizer": tokenizer,}
-        if args.model_family in ["BLIP2", "InstructBLIP", "GIT", "PaliGemma", "Idefics2"]:
-            fn_kwargs = {"ending_names": ending_names, 
-                        "header_name": "uncond_premise", 
-                        "tokenizer": tokenizer,
-                        "processor": processor,
-                        "image_header_name": image_header_name}
-        tokenized_calibration_dataset = raw_dataset.map(preprocess_func, fn_kwargs=fn_kwargs, batched=True, batch_size=args.batch_size)
-        eval_calibration_dataloader = DataLoader(tokenized_calibration_dataset, batch_size=args.batch_size, shuffle=False)    
-        avg_log_probs, lm_accuracy, avg_lm_accuracy, lm_predictions = inference_calibration(model, eval_dataloader, eval_calibration_dataloader,device, compute_func, tokenizer.pad_token_id)
+        fn_kwargs = {
+            "ending_names": ending_names,
+            "header_name": "uncond_premise",  # the difference is here
+            "tokenizer": tokenizer,
+        }
+        if args.model_family in [
+            "BLIP2",
+            "InstructBLIP",
+            "GIT",
+            "PaliGemma",
+            "Idefics2",
+        ]:
+            fn_kwargs = {
+                "ending_names": ending_names,
+                "header_name": "uncond_premise",
+                "tokenizer": tokenizer,
+                "processor": processor,
+                "image_header_name": image_header_name,
+            }
+        tokenized_calibration_dataset = raw_dataset.map(
+            preprocess_func,
+            fn_kwargs=fn_kwargs,
+            batched=True,
+            batch_size=args.batch_size,
+        )
+        eval_calibration_dataloader = DataLoader(
+            tokenized_calibration_dataset,
+            batch_size=args.batch_size,
+            shuffle=False,
+        )
+        avg_log_probs, lm_accuracy, avg_lm_accuracy, lm_predictions = (
+            inference_calibration(
+                model,
+                eval_dataloader,
+                eval_calibration_dataloader,
+                device,
+                compute_func,
+                tokenizer.pad_token_id,
+            )
+        )
     elif method == "channel":
-        # simple solution: swap first sentence and second sentence in both preprocessing functions
-        tokenized_channel_dataset = raw_dataset.map(preprocess_func_channel, fn_kwargs=fn_kwargs, batched=True, batch_size=args.batch_size)
-        eval_channel_dataloader = DataLoader(tokenized_channel_dataset, batch_size=args.batch_size, shuffle=False)
-        avg_log_probs, lm_accuracy, avg_lm_accuracy, lm_predictions = inference_language_modeling(model, eval_channel_dataloader, device, compute_func, tokenizer.pad_token_id)
+        # simple solution: swap first sentence and
+        # second sentence in both preprocessing functions
+        tokenized_channel_dataset = raw_dataset.map(
+            preprocess_func_channel,
+            fn_kwargs=fn_kwargs,
+            batched=True,
+            batch_size=args.batch_size,
+        )
+        eval_channel_dataloader = DataLoader(
+            tokenized_channel_dataset,
+            batch_size=args.batch_size,
+            shuffle=False,
+        )
+        avg_log_probs, lm_accuracy, avg_lm_accuracy, lm_predictions = (
+            inference_language_modeling(
+                model,
+                eval_channel_dataloader,
+                device,
+                compute_func,
+                tokenizer.pad_token_id,
+            )
+        )
     else:
         raise NotImplementedError
     return avg_log_probs, lm_accuracy, avg_lm_accuracy, lm_predictions
 
-def compute_mask_process_of_elimination(avg_log_probs, mask_strategy, **kwargs):
+
+def compute_mask_process_of_elimination(
+    avg_log_probs, mask_strategy, **kwargs
+):
     masks = torch.ones_like(avg_log_probs)
     if mask_strategy == "lowest":
         # soft masking (v1), i.e., get rid of the least likely answer.
-        masks[torch.arange(avg_log_probs.shape[0]), avg_log_probs.argmax(dim=-1)] = 0
+        masks[
+            torch.arange(avg_log_probs.shape[0]), avg_log_probs.argmax(dim=-1)
+        ] = 0
     elif mask_strategy == "below_average":
         # v2: Calculate the row-wise mean
         row_mean = avg_log_probs.mean(dim=1, keepdim=True)
         # Set values below the mean to 0
         masks[avg_log_probs > row_mean] = 0
     elif mask_strategy == "lowest_iter":
-        # similar to lowest, but ignore inf, and mask from the remaining options.
+        # similar to lowest, but ignore inf,
+        # and mask from the remaining options.
         # soft masking (v1), i.e., get rid of the least likely answer.
         avg_log_probs[avg_log_probs == float("inf")] = float("-inf")
-        masks[torch.arange(avg_log_probs.shape[0]), avg_log_probs.argmax(dim=-1)] = 0
+        masks[
+            torch.arange(avg_log_probs.shape[0]), avg_log_probs.argmax(dim=-1)
+        ] = 0
         # set mask that correspond to inf to 0
         masks[avg_log_probs == float("-inf")] = 0
     elif mask_strategy == "min_k":
@@ -248,12 +443,18 @@ def compute_mask_process_of_elimination(avg_log_probs, mask_strategy, **kwargs):
         # keep the min k options
         avg_log_probs_f32 = avg_log_probs.float()
         _, min_k_indices = avg_log_probs_f32.topk(min_k, dim=-1)
-        masks[torch.arange(avg_log_probs_f32.shape[0]).unsqueeze(-1), min_k_indices] = 0
+        masks[
+            torch.arange(avg_log_probs_f32.shape[0]).unsqueeze(-1),
+            min_k_indices,
+        ] = 0
     else:
         raise NotImplementedError
     return masks
 
-def inference_process_of_elimination(model, eval_dataloader, device, compute_func, pad_token_id):
+
+def inference_process_of_elimination(
+    model, eval_dataloader, device, compute_func, pad_token_id
+):
     model.eval()
     lm_predictions = torch.zeros(0)
     avg_lm_predictions = torch.zeros(0)
@@ -268,65 +469,101 @@ def inference_process_of_elimination(model, eval_dataloader, device, compute_fun
         log_prob[batch["mask"] == 0] = float("inf")
         avg_log_prob = log_prob / batch["ending_attention_mask"].sum(dim=-1)
         avg_log_probs.append(avg_log_prob)
-        
+
         batch_predictions = log_prob.argmin(dim=-1)
         batch_avg_predictions = avg_log_prob.argmin(dim=-1)
 
         batch_labels = batch["label"]
         lm_predictions = torch.cat((lm_predictions, batch_predictions))
-        avg_lm_predictions = torch.cat((avg_lm_predictions, batch_avg_predictions))
+        avg_lm_predictions = torch.cat(
+            (avg_lm_predictions, batch_avg_predictions)
+        )
         labels = torch.cat((labels, batch_labels))
-    
+
         # make accuracy accumulative
         lm_accuracy = (lm_predictions == labels).sum().item() / len(labels)
-        avg_lm_accuracy = (avg_lm_predictions == labels).sum().item() / len(labels)
-        pbar.set_description(f"Process of elimination accuracy: {lm_accuracy:.4f}, Average process of elimination accuracy: {avg_lm_accuracy:.4f}")
+        avg_lm_accuracy = (avg_lm_predictions == labels).sum().item() / len(
+            labels
+        )
+        pbar.set_description(
+            f"Process of elimination accuracy: {lm_accuracy:.4f}, \
+                Average process of elimination accuracy: {avg_lm_accuracy:.4f}"
+        )
     avg_log_probs = torch.cat(avg_log_probs, dim=0)
     return avg_log_probs, lm_accuracy, avg_lm_accuracy, lm_predictions
 
+
 def compute_conditional_score_seq2seq(batch, model, device, pad_token_id):
     # returns log_prob of p(y|x) for each batch
-    
+
     # e.g., (batch_size, #option, ending_seq_len): (32, 2, 18)
-    ending_shape = batch["ending_input_ids"].shape 
+    ending_shape = batch["ending_input_ids"].shape
     # flatten. both input_ids has 0 as padding token.
-    header_input_ids = batch["header_input_ids"].view(-1, batch["header_input_ids"].shape[-1]).to(device)
-    header_attention_mask = batch["header_attention_mask"].view(-1, batch["header_attention_mask"].shape[-1]).to(device)
-    ending_input_ids = batch["ending_input_ids"].view(-1, batch["ending_input_ids"].shape[-1]).to(device)
+    header_input_ids = (
+        batch["header_input_ids"]
+        .view(-1, batch["header_input_ids"].shape[-1])
+        .to(device)
+    )
+    header_attention_mask = (
+        batch["header_attention_mask"]
+        .view(-1, batch["header_attention_mask"].shape[-1])
+        .to(device)
+    )
+    ending_input_ids = (
+        batch["ending_input_ids"]
+        .view(-1, batch["ending_input_ids"].shape[-1])
+        .to(device)
+    )
 
     # adding this line of code takes me more than an hour.
     # without adding torch.no_grad, GPU usage will muiltply by 4.
     with torch.no_grad():
-        outputs = model(input_ids = header_input_ids, 
-                        attention_mask = header_attention_mask,
-                        labels = ending_input_ids)
-    
+        outputs = model(
+            input_ids=header_input_ids,
+            attention_mask=header_attention_mask,
+            labels=ending_input_ids,
+        )
+
     _, logits = outputs.loss, outputs.logits
     # e.g., (batch_size * #option, ending_seq_len, #vocab): (64, 18, 32128)
     logits = logits.view(-1, logits.shape[-1])
     # ignore padding token: 0
-    ce_loss = F.cross_entropy(logits, ending_input_ids.view(-1), reduction="none", ignore_index=pad_token_id).detach().cpu()
+    ce_loss = (
+        F.cross_entropy(
+            logits,
+            ending_input_ids.view(-1),
+            reduction="none",
+            ignore_index=pad_token_id,
+        )
+        .detach()
+        .cpu()
+    )
     # each score is the negative log-likelihood of a ending given a header.
     # batch_predictions = ce_loss.view(ending_shape).sum(dim=-1).argmin(dim=-1)
     log_prob = ce_loss.view(ending_shape).sum(dim=-1)
     return log_prob
 
+
 def compute_conditional_score_causal(batch, model, device, pad_token_id):
     # returns log_prob of p(y|x) for each batch
-    # make sure the padding token is aligned with tokenizer.pad_token_id 
+    # make sure the padding token is aligned with tokenizer.pad_token_id
     # and preprocess_function_causal
     # padding_token = 50256
-    
-    input_ids = batch["input_ids"].view(-1, batch["input_ids"].shape[-1]).to(device)
+
+    input_ids = (
+        batch["input_ids"].view(-1, batch["input_ids"].shape[-1]).to(device)
+    )
     labels = batch["labels"].view(-1, batch["labels"].shape[-1]).to(device)
 
     # adding this line of code takes me more than an hour.
     # without adding torch.no_grad, GPU usage will muiltply by 4.
     with torch.no_grad():
-        outputs = model(input_ids = input_ids,
-                        # attention_mask = attention_mask,
-                        labels = labels)
-    
+        outputs = model(
+            input_ids=input_ids,
+            # attention_mask = attention_mask,
+            labels=labels,
+        )
+
     _, logits = outputs.loss, outputs.logits
     # shift
     logits = logits[:, :-1].contiguous()
@@ -334,116 +571,212 @@ def compute_conditional_score_causal(batch, model, device, pad_token_id):
     # e.g., (batch_size * #option, ending_seq_len, #vocab): (64, 18, 32128)
     logits = logits.view(-1, logits.shape[-1])
     # ignore padding token: 50256
-    ce_loss = F.cross_entropy(logits, labels.view(-1), reduction="none", ignore_index=pad_token_id).detach().cpu()
+    ce_loss = (
+        F.cross_entropy(
+            logits,
+            labels.view(-1),
+            reduction="none",
+            ignore_index=pad_token_id,
+        )
+        .detach()
+        .cpu()
+    )
     # each score is the negative log-likelihood of a ending given a header.
-    log_prob = ce_loss.view(batch["input_ids"].shape[0], batch["input_ids"].shape[1], -1).sum(dim=-1)
+    log_prob = ce_loss.view(
+        batch["input_ids"].shape[0], batch["input_ids"].shape[1], -1
+    ).sum(dim=-1)
     return log_prob
 
+
 def compute_conditional_score_seq2seq_vqa(batch, model, device, pad_token_id):
     # returns log_prob of p(y|x) for each batch
-    
+
     # e.g., (batch_size, #option, ending_seq_len): (32, 2, 18)
-    ending_shape = batch["ending_input_ids"].shape 
+    ending_shape = batch["ending_input_ids"].shape
     # flatten. both input_ids has 0 as padding token.
-    header_input_ids = batch["header_input_ids"].view(-1, batch["header_input_ids"].shape[-1]).to(device)
-    header_attention_mask = batch["header_attention_mask"].view(-1, batch["header_attention_mask"].shape[-1]).to(device)
-    ending_input_ids = batch["ending_input_ids"].view(-1, batch["ending_input_ids"].shape[-1]).to(device)
-    images = batch["images"].view(-1, batch["images"].shape[-3], batch["images"].shape[-2], batch["images"].shape[-1]).to(device)
+    header_input_ids = (
+        batch["header_input_ids"]
+        .view(-1, batch["header_input_ids"].shape[-1])
+        .to(device)
+    )
+    header_attention_mask = (
+        batch["header_attention_mask"]
+        .view(-1, batch["header_attention_mask"].shape[-1])
+        .to(device)
+    )
+    ending_input_ids = (
+        batch["ending_input_ids"]
+        .view(-1, batch["ending_input_ids"].shape[-1])
+        .to(device)
+    )
+    images = (
+        batch["images"]
+        .view(
+            -1,
+            batch["images"].shape[-3],
+            batch["images"].shape[-2],
+            batch["images"].shape[-1],
+        )
+        .to(device)
+    )
 
     # adding this line of code takes me more than an hour.
     # without adding torch.no_grad, GPU usage will muiltply by 4.
     with torch.no_grad():
-        outputs = model(input_ids = header_input_ids, 
-                        attention_mask = header_attention_mask,
-                        pixel_values=images, 
-                        labels = ending_input_ids)
-    
+        outputs = model(
+            input_ids=header_input_ids,
+            attention_mask=header_attention_mask,
+            pixel_values=images,
+            labels=ending_input_ids,
+        )
+
     _, logits = outputs.loss, outputs.logits
     # e.g., (batch_size * #option, ending_seq_len, #vocab): (64, 18, 32128)
     logits = logits.contiguous().view(-1, logits.shape[-1])
     # ignore padding token: 0
-    ce_loss = F.cross_entropy(logits, ending_input_ids.view(-1), reduction="none", ignore_index=pad_token_id).detach().cpu()
+    ce_loss = (
+        F.cross_entropy(
+            logits,
+            ending_input_ids.view(-1),
+            reduction="none",
+            ignore_index=pad_token_id,
+        )
+        .detach()
+        .cpu()
+    )
     # each score is the negative log-likelihood of a ending given a header.
     # batch_predictions = ce_loss.view(ending_shape).sum(dim=-1).argmin(dim=-1)
     log_prob = ce_loss.view(ending_shape).sum(dim=-1)
     return log_prob
 
+
 def compute_conditional_score_causal_vqa(batch, model, device, pad_token_id):
     # returns log_prob of p(y|x) for each batch
-    # make sure the padding token is aligned with tokenizer.pad_token_id 
+    # make sure the padding token is aligned with tokenizer.pad_token_id
     # and preprocess_function_causal
     # padding_token = 50256
-    input_ids = batch["input_ids"].view(-1, batch["input_ids"].shape[-1]).to(device)
-    header_attention_mask = batch["header_attention_mask"].view(-1, batch["header_attention_mask"].shape[-1]).to(device)
-    ending_attention_mask = batch["ending_attention_mask"].view(-1, batch["ending_attention_mask"].shape[-1]).to(device)
+    input_ids = (
+        batch["input_ids"].view(-1, batch["input_ids"].shape[-1]).to(device)
+    )
+    header_attention_mask = (
+        batch["header_attention_mask"]
+        .view(-1, batch["header_attention_mask"].shape[-1])
+        .to(device)
+    )
+    # ending_attention_mask = (
+    #     batch["ending_attention_mask"]
+    #     .view(-1, batch["ending_attention_mask"].shape[-1])
+    #     .to(device)
+    # )
     labels = batch["labels"].view(-1, batch["labels"].shape[-1]).to(device)
-    images = batch["images"].view(-1, batch["images"].shape[-3], batch["images"].shape[-2], batch["images"].shape[-1]).to(device)
+    images = (
+        batch["images"]
+        .view(
+            -1,
+            batch["images"].shape[-3],
+            batch["images"].shape[-2],
+            batch["images"].shape[-1],
+        )
+        .to(device)
+    )
 
     # adding this line of code takes me more than an hour.
     # without adding torch.no_grad, GPU usage will muiltply by 4.
     with torch.no_grad():
-        outputs = model(input_ids=input_ids, 
-                        pixel_values=images,
-                        attention_mask = header_attention_mask,
-                        labels=labels)    
-    
-    loss, logits = outputs.loss, outputs.logits
-    logits = logits[:, -labels.shape[-1]:, :] # for GIT
+        outputs = model(
+            input_ids=input_ids,
+            pixel_values=images,
+            attention_mask=header_attention_mask,
+            labels=labels,
+        )
+
+    _, logits = outputs.loss, outputs.logits
+    logits = logits[:, -labels.shape[-1] :, :]  # for GIT
 
     # shift
     logits = logits[:, :-1].contiguous()
     labels = labels[:, 1:].contiguous()
-    
+
     # e.g., (batch_size * #option, ending_seq_len, #vocab): (64, 18, 32128)
     logits = logits.view(-1, logits.shape[-1])
     # ignore padding token: 50256
-    ce_loss = F.cross_entropy(logits, labels.view(-1), reduction="none", ignore_index=pad_token_id).detach().cpu()
+    ce_loss = (
+        F.cross_entropy(
+            logits,
+            labels.view(-1),
+            reduction="none",
+            ignore_index=pad_token_id,
+        )
+        .detach()
+        .cpu()
+    )
     # each score is the negative log-likelihood of a ending given a header.
-    log_prob = ce_loss.view(batch["input_ids"].shape[0], batch["input_ids"].shape[1], -1).sum(dim=-1)
+    log_prob = ce_loss.view(
+        batch["input_ids"].shape[0], batch["input_ids"].shape[1], -1
+    ).sum(dim=-1)
     return log_prob
 
+
 def generate_synonyms(args, model, tokenizer, tokenized_dataset):
-    
+
     generation_config = GenerationConfig(
-    max_new_tokens=50,
-    do_sample=True,
-    temperature=0.7,
-    num_return_sequences=args.number_of_synonyms,
+        max_new_tokens=50,
+        do_sample=True,
+        temperature=0.7,
+        num_return_sequences=args.number_of_synonyms,
     )
 
     # get all columns of tokenized_dataset that starts with "hypothesis"
-    hypothesis_columns = [col for col in tokenized_dataset.column_names if col.startswith("hypothesis")]
+    hypothesis_columns = [
+        col
+        for col in tokenized_dataset.column_names
+        if col.startswith("hypothesis")
+    ]
     # batch inference? May check SEQA code or HF doc.
     synonyms_dict = {}
     for col in tqdm(hypothesis_columns, desc="Generate synonyms"):
-        for option in tqdm(tokenized_dataset[col], desc=f"Generate synonyms for {col}"):
+        for option in tqdm(
+            tokenized_dataset[col], desc=f"Generate synonyms for {col}"
+        ):
             # prompt = f"Generate a synonym to '{option}':"
-            prompt = args.generate_synonyms_prompt.replace("'{option}'", option)
+            prompt = args.generate_synonyms_prompt.replace(
+                "'{option}'", option
+            )
             inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
-            outputs = model.generate(**inputs, generation_config=generation_config)
-            synonyms = tokenizer.batch_decode(outputs, skip_special_tokens=True)
+            outputs = model.generate(
+                **inputs, generation_config=generation_config
+            )
+            synonyms = tokenizer.batch_decode(
+                outputs, skip_special_tokens=True
+            )
             # store the synonyms, so map() is easy.
-            # directly mapping here? All I need to do is to create duplicates of the instance with synonyms.
+            # directly mapping here? All I need to do
+            # is to create duplicates of the instance with synonyms.
             synonyms_dict[option] = synonyms
 
     return synonyms_dict
 
+
 def aggregate_optionw_with_synonyms(tensor, num_of_options, num_of_synonyms):
     # this function changes the column order.
     # tensor: (batch_size, num_of_options * (num_of_synonyms + 1))
     old_index = list(range(tensor.shape[1]))
     aggregated_index = [-1] * len(old_index)
     # exapmle: commonsenseqa 5 options with 3 synonyms: 0, 4, 8, 12, 16
-    options_index_old = list(range(num_of_options)) # e.g., 0..4
-    options_index_new = [i * (num_of_synonyms + 1) for i in options_index_old] # e.g., 0, 4, 8, 12, 16
-    remain_index = [i for i in old_index if i not in options_index_old] # e.g., 5..19 
+    options_index_old = list(range(num_of_options))  # e.g., 0..4
+    options_index_new = [
+        i * (num_of_synonyms + 1) for i in options_index_old
+    ]  # e.g., 0, 4, 8, 12, 16
+    remain_index = [
+        i for i in old_index if i not in options_index_old
+    ]  # e.g., 5..19
     for i, _ in enumerate(aggregated_index):
-        if i in options_index_new: # 0, 4, 8, 12, 16
+        if i in options_index_new:  # 0, 4, 8, 12, 16
             aggregated_index[i] = options_index_old.pop(0)
         else:
             aggregated_index[i] = remain_index.pop(0)
 
-
-    # aggregated_index = options_index + [i for i in old_index if i not in options_index]
+    # aggregated_index = options_index + \
+    #     [i for i in old_index if i not in options_index]
     tensor[:, old_index] = tensor[:, aggregated_index]
     return tensor
diff --git a/mm_poe/methods/utils/models.py b/mm_poe/methods/utils/models.py
index 305f6ab..578f0f8 100644
--- a/mm_poe/methods/utils/models.py
+++ b/mm_poe/methods/utils/models.py
@@ -9,4 +9,3 @@ def find_expert_model(model_family):
     else:
         print(f"{model_family}: Not implemented.")
     return expert_checkpoint
-        
\ No newline at end of file
diff --git a/mm_poe/methods/utils/utils.py b/mm_poe/methods/utils/utils.py
index a965f6b..0d483a5 100644
--- a/mm_poe/methods/utils/utils.py
+++ b/mm_poe/methods/utils/utils.py
@@ -1,28 +1,24 @@
 import argparse
 import csv
-import logging
 import os
 import random
-import sys
-from tqdm import tqdm
 
 import numpy as np
 import torch
-from torchvision.transforms import v2
-import transformers
-from transformers import(
-    AutoTokenizer, 
+
+from transformers import (
+    AutoTokenizer,
     AutoModelForCausalLM,
     AutoModelForSeq2SeqLM,
     AutoProcessor,
     AutoModelForVision2Seq,
-    BitsAndBytesConfig
+    BitsAndBytesConfig,
 )
 from datasets import Dataset
 
 # import data.py, which is located in the same directory
 
-from .data import(
+from .data import (
     copa_loader,
     cqa_loader,
     obqa_loader,
@@ -35,57 +31,84 @@
     vqa_loader,
     scienceqa_loader,
     ai2d_loader,
-    single_inference_loader
+    single_inference_loader,
 )
 
+
 def set_seed(seed):
-    os.environ['PYTHONHASHSEED'] = str(seed)
+    os.environ["PYTHONHASHSEED"] = str(seed)
     random.seed(seed)
     np.random.seed(seed)
     torch.manual_seed(seed)
     torch.cuda.manual_seed_all(seed)
     torch.cuda.manual_seed_all(seed)
 
+
 def parse_args():
     parser = argparse.ArgumentParser("Inference on multiple choice benchmarks")
     parser.add_argument(
-        "--seed", 
-        type=int, 
+        "--seed",
+        type=int,
         default=0,
         help="Random seed for reproducibility.",
-        )
+    )
     parser.add_argument(
         "--model_family",
         type=str,
-        choices=["GPT2", "T5", "FLAN-T5", "Pythia", "OPT-IML", "Dolly", "BLIP2", "InstructBLIP", "GIT", "PaliGemma", "Idefics2"],
+        choices=[
+            "GPT2",
+            "T5",
+            "FLAN-T5",
+            "Pythia",
+            "OPT-IML",
+            "Dolly",
+            "BLIP2",
+            "InstructBLIP",
+            "GIT",
+            "PaliGemma",
+            "Idefics2",
+        ],
         default=None,
         required=True,
-        help="The moddel family, as checkpoints under the same model family use same codes to download.",
-        )
+        help="The moddel family, as checkpoints under the same \
+            model family use same codes to download.",
+    )
     parser.add_argument(
         "--checkpoint",
         type=str,
         default=None,
         required=True,
-        help="The checkpoint name under a model family, e.g. gpt2, gpt2-medium, gpt2-large, gpt2-xl.",
+        help="The checkpoint name under a model family, \
+            e.g. gpt2, gpt2-medium, gpt2-large, gpt2-xl.",
     )
     parser.add_argument(
         "--amateur_checkpoint",
         type=str,
         default=None,
-        help="The amateur checkpoint name under a model family. For constrative decoding.",
+        help="The amateur checkpoint name under a model family. \
+            For constrative decoding.",
     )
     parser.add_argument(
         "--expert_method",
         type=str,
-        choices=["language_modeling", "calibration", "channel", "multiple_choice_prompt"],
+        choices=[
+            "language_modeling",
+            "calibration",
+            "channel",
+            "multiple_choice_prompt",
+        ],
         default="language_modeling",
         help="The expert method. For constrative decoding.",
     )
     parser.add_argument(
         "--amateur_method",
         type=str,
-        choices=["language_modeling", "calibration", "channel", "multiple_choice_prompt"],
+        choices=[
+            "language_modeling",
+            "calibration",
+            "channel",
+            "multiple_choice_prompt",
+        ],
         default="language_modeling",
         help="The amateur method. For constrative decoding.",
     )
@@ -93,26 +116,29 @@ def parse_args():
         "--weighting_parameter",
         type=float,
         default=-1.0,
-        help="The weighting parameter for constrative decoding. It is applied to the amateur model.",
+        help="The weighting parameter for constrative decoding. \
+            It is applied to the amateur model.",
     )
     parser.add_argument(
         "--weighting_parameters",
         type=str,
         default=None,
-        help="The weighting parameters for constrative decoding. One weight for one dataset.",
+        help="The weighting parameters for constrative decoding. \
+            One weight for one dataset.",
     )
     parser.add_argument(
         "--num_random_search",
         type=int,
         default=0,
-        help="The number of random search for the weighting parameter for constrative decoding.",
+        help="The number of random search for the \
+            weighting parameter for constrative decoding.",
     )
     parser.add_argument(
         "--loading_precision",
         type=str,
         choices=["FP32", "FP16", "BF16", "INT8", "INT4"],
         default="FP32",
-        help="The precision of the model to be loaded."
+        help="The precision of the model to be loaded.",
     )
     parser.add_argument(
         "--datasets",
@@ -120,13 +146,15 @@ def parse_args():
         # choices=["copa", "cqa", "winogrande"],
         default=None,
         required=True,
-        help="The datasets to inference on. Pass multiple datasets separate by space",
+        help="The datasets to inference on. \
+            Pass multiple datasets separate by space",
     )
     parser.add_argument(
         "--sample",
         type=int,
         default=None,
-        help="The number of samples to inference on. If None, inference on the whole dataset.",
+        help="The number of samples to inference on. \
+            If None, inference on the whole dataset.",
     )
     parser.add_argument(
         "--batch_size",
@@ -138,13 +166,13 @@ def parse_args():
         "--n_shot",
         type=int,
         default=0,
-        help="Number of few-shot demonstrations. 0 means zero-shot.",    
+        help="Number of few-shot demonstrations. 0 means zero-shot.",
     )
     parser.add_argument(
         "--multiple_choice_prompt",
         type=str,
-        default = None,
-        help = "The multiple choice prompt."
+        default=None,
+        help="The multiple choice prompt.",
     )
     parser.add_argument(
         "--calibration_prompt",
@@ -161,12 +189,18 @@ def parse_args():
         "--process_of_elimination_prompt",
         type=str,
         default=None,
-        help="The process of elimination prompt. It asks the model to ignore masked options.",
+        help="The process of elimination prompt. \
+            It asks the model to ignore masked options.",
     )
     parser.add_argument(
         "--scoring_method_for_process_of_elimination",
         type=str,
-        choices=["language_modeling", "calibration", "channel", "multiple_choice_prompt"],
+        choices=[
+            "language_modeling",
+            "calibration",
+            "channel",
+            "multiple_choice_prompt",
+        ],
         default="language_modeling",
         help="The scoring method for process of elimination.",
     )
@@ -199,12 +233,14 @@ def parse_args():
         "--generate_synonyms_prompt",
         type=str,
         default=None,
-        help="The prompt template for generating synonyms. 'option is replaced with actual options'",
+        help="The prompt template for generating synonyms. \
+            'option is replaced with actual options'",
     )
     parser.add_argument(
         "--push_data_to_hub",
         action="store_true",
-        help="Whether to push the data to Hugging Face Hub. This is convienient for LLM experiments.",
+        help="Whether to push the data to Hugging Face Hub. \
+            This is convienient for LLM experiments.",
     )
     parser.add_argument(
         "--min_k",
@@ -215,51 +251,75 @@ def parse_args():
         "--mask_token",
         type=str,
         default=None,
-        help="The mask token. If None, use the default mask token of the model.",
+        help="The mask token. If None, \
+            use the default mask token of the model.",
     )
 
     args = parser.parse_args()
     return args
 
+
 def load_data(args):
     # load test data for final performance.
     # load dev data to tune hyperparameters.
     # commonsense reasoning datasets
     train_file_path = None
     if args.dataset == "copa":
-        ending_names = ['hypothesis0', 'hypothesis1']
+        ending_names = ["hypothesis0", "hypothesis1"]
         header_name = "premise"
         file_path = os.path.join("../data", args.dataset, "copa-test.xml")
         train_file_path = os.path.join("../data", args.dataset, "copa-dev.xml")
         loader = copa_loader
     elif args.dataset == "cqa":
-        ending_names = ['hypothesis0', 'hypothesis1', 'hypothesis2', 'hypothesis3', 'hypothesis4']
+        ending_names = [
+            "hypothesis0",
+            "hypothesis1",
+            "hypothesis2",
+            "hypothesis3",
+            "hypothesis4",
+        ]
         header_name = "premise"
         file_path = os.path.join("../data", args.dataset, "dev.jsonl")
         train_file_path = os.path.join("../data", args.dataset, "train.jsonl")
         loader = cqa_loader
     elif args.dataset == "obqa":
-        ending_names = ['hypothesis0', 'hypothesis1', 'hypothesis2', 'hypothesis3']
+        ending_names = [
+            "hypothesis0",
+            "hypothesis1",
+            "hypothesis2",
+            "hypothesis3",
+        ]
         header_name = "premise"
         file_path = os.path.join("../data", args.dataset, "test.jsonl")
         train_file_path = os.path.join("../data", args.dataset, "train.jsonl")
         loader = obqa_loader
     elif args.dataset == "piqa":
-        ending_names = ['hypothesis0', 'hypothesis1']
+        ending_names = ["hypothesis0", "hypothesis1"]
         header_name = "premise"
         data_path = os.path.join("../data", args.dataset, "valid.jsonl")
         label_path = os.path.join("../data", args.dataset, "valid-labels.lst")
         file_path = [data_path, label_path]
-        train_file_path = [path.replace("valid", "train") for path in file_path]
+        train_file_path = [
+            path.replace("valid", "train") for path in file_path
+        ]
         loader = piqa_loader
     elif args.dataset == "qasc":
-        ending_names = ['hypothesis0', 'hypothesis1', 'hypothesis2', 'hypothesis3', 'hypothesis4', 'hypothesis5', 'hypothesis6', 'hypothesis7']
+        ending_names = [
+            "hypothesis0",
+            "hypothesis1",
+            "hypothesis2",
+            "hypothesis3",
+            "hypothesis4",
+            "hypothesis5",
+            "hypothesis6",
+            "hypothesis7",
+        ]
         header_name = "premise"
         file_path = os.path.join("../data", args.dataset, "dev.jsonl")
         train_file_path = os.path.join("../data", args.dataset, "train.jsonl")
         loader = qasc_loader
     elif args.dataset == "siqa":
-        ending_names = ['hypothesis0', 'hypothesis1', 'hypothesis2']
+        ending_names = ["hypothesis0", "hypothesis1", "hypothesis2"]
         header_name = "premise"
         data_path = os.path.join("../data", args.dataset, "dev.jsonl")
         label_path = os.path.join("../data", args.dataset, "dev-labels.lst")
@@ -267,94 +327,143 @@ def load_data(args):
         train_file_path = [path.replace("dev", "train") for path in file_path]
         loader = siqa_loader
     elif args.dataset == "winogrande":
-        ending_names = ['hypothesis0', 'hypothesis1']
+        ending_names = ["hypothesis0", "hypothesis1"]
         header_name = "premise"
         data_path = os.path.join("../data", args.dataset, "dev.jsonl")
         label_path = os.path.join("../data", args.dataset, "dev-labels.lst")
         file_path = [data_path, label_path]
-        train_file_path = [path.replace("dev", "train_xs") for path in file_path]
+        train_file_path = [
+            path.replace("dev", "train_xs") for path in file_path
+        ]
         loader = winogrande_loader
     # BIG-Bench tasks
     elif args.dataset == "disambiguation_qa":
         args.num_options = 3
         ending_names = [f"hypothesis{i}" for i in range(args.num_options)]
-        # ending_names = ['hypothesis0', 'hypothesis1', 'hypothesis2']
         header_name = "premise"
-        file_path = [os.path.join("../data", "big_bench", f"{args.dataset}.json")]
+        file_path = [
+            os.path.join("../data", "big_bench", f"{args.dataset}.json")
+        ]
         loader = date_understanding_loader
     elif args.dataset == "conceptual_combinations":
         args.num_options = 4
         ending_names = [f"hypothesis{i}" for i in range(args.num_options)]
-        # ending_names = ['hypothesis0', 'hypothesis1', 'hypothesis2', 'hypothesis3']
         header_name = "premise"
         file_path = []
-        file_suffixes = ["contradictions", "emergent_properties", "fanciful_fictional_combinations", "homonyms", "invented_words", "surprising_uncommon_combinations"]
+        file_suffixes = [
+            "contradictions",
+            "emergent_properties",
+            "fanciful_fictional_combinations",
+            "homonyms",
+            "invented_words",
+            "surprising_uncommon_combinations",
+        ]
         for suffix in file_suffixes:
-            file_path.append(os.path.join("../data", "big_bench", f"{args.dataset}_{suffix}.json"))
+            file_path.append(
+                os.path.join(
+                    "../data", "big_bench", f"{args.dataset}_{suffix}.json"
+                )
+            )
         loader = date_understanding_loader
     elif args.dataset == "date_understanding":
         args.num_options = 6
         ending_names = [f"hypothesis{i}" for i in range(args.num_options)]
-        # ending_names = ['hypothesis0', 'hypothesis1', 'hypothesis2', 'hypothesis3', 'hypothesis4', 'hypothesis5']
         header_name = "premise"
-        file_path = [os.path.join("../data", "big_bench", f"{args.dataset}.json")]
+        file_path = [
+            os.path.join("../data", "big_bench", f"{args.dataset}.json")
+        ]
         loader = date_understanding_loader
-    elif args.dataset in ["emoji_movie", "evaluating_information_essentiality", "logical_args", "riddle_sense"]: 
+    elif args.dataset in [
+        "emoji_movie",
+        "evaluating_information_essentiality",
+        "logical_args",
+        "riddle_sense",
+    ]:
         args.num_options = 5
         ending_names = [f"hypothesis{i}" for i in range(args.num_options)]
-        # ending_names = ['hypothesis0', 'hypothesis1', 'hypothesis2', 'hypothesis3', 'hypothesis4']
         header_name = "premise"
-        file_path = [os.path.join("../data", "big_bench", f"{args.dataset}.json")]
+        file_path = [
+            os.path.join("../data", "big_bench", f"{args.dataset}.json")
+        ]
         loader = date_understanding_loader
-    elif args.dataset in ["ruin_names", "temporal_sequences", "code_line_description", "crass_ai", "identify_math_theorems", "identify_odd_metaphor"]:
+    elif args.dataset in [
+        "ruin_names",
+        "temporal_sequences",
+        "code_line_description",
+        "crass_ai",
+        "identify_math_theorems",
+        "identify_odd_metaphor",
+    ]:
         args.num_options = 4
         ending_names = [f"hypothesis{i}" for i in range(args.num_options)]
-        # ending_names = ['hypothesis0', 'hypothesis1', 'hypothesis2', 'hypothesis3']
         header_name = "premise"
-        file_path = [os.path.join("../data", "big_bench", f"{args.dataset}.json")]
+        file_path = [
+            os.path.join("../data", "big_bench", f"{args.dataset}.json")
+        ]
         loader = date_understanding_loader
     elif args.dataset == "penguins_in_a_table":
         # use the five object subtask
         args.num_options = 5
         ending_names = [f"hypothesis{i}" for i in range(args.num_options)]
-        # ending_names = ['hypothesis0', 'hypothesis1', 'hypothesis2', 'hypothesis3', 'hypothesis4']
         header_name = "premise"
-        file_path = [os.path.join("../data", "big_bench", f"{args.dataset}.json")]
+        file_path = [
+            os.path.join("../data", "big_bench", f"{args.dataset}.json")
+        ]
         loader = date_understanding_loader
     elif args.dataset == "strange_stories":
         args.num_options = 4
         ending_names = [f"hypothesis{i}" for i in range(args.num_options)]
-        # ending_names = ['hypothesis0', 'hypothesis1', 'hypothesis2', 'hypothesis3']
         header_name = "premise"
-        file_path = [os.path.join("../data", "big_bench", f"{args.dataset}_multiple_choice.json")]
+        file_path = [
+            os.path.join(
+                "../data", "big_bench", f"{args.dataset}_multiple_choice.json"
+            )
+        ]
         loader = date_understanding_loader
     elif args.dataset == "reasoning_about_colored_objects":
         args.num_options = 18
         ending_names = [f"hypothesis{i}" for i in range(args.num_options)]
-        # ending_names = ['hypothesis0', 'hypothesis1', 'hypothesis2', 'hypothesis3']
         header_name = "premise"
-        file_path = [os.path.join("../data", "big_bench", f"{args.dataset}.json")]
+        file_path = [
+            os.path.join("../data", "big_bench", f"{args.dataset}.json")
+        ]
         loader = date_understanding_loader
     elif args.dataset == "symbol_interpretation":
         args.num_options = 5
         ending_names = [f"hypothesis{i}" for i in range(args.num_options)]
-        # ending_names = ['hypothesis0', 'hypothesis1', 'hypothesis2', 'hypothesis3', 'hypothesis4'] 
         header_name = "premise"
         file_path = []
-        file_suffixes = ["adversarial", "emoji_agnostic", "name_agnostic", "plain", "tricky"]
+        file_suffixes = [
+            "adversarial",
+            "emoji_agnostic",
+            "name_agnostic",
+            "plain",
+            "tricky",
+        ]
         for suffix in file_suffixes:
-            file_path.append(os.path.join("../data", "big_bench", f"{args.dataset}_{suffix}.json"))
+            file_path.append(
+                os.path.join(
+                    "../data", "big_bench", f"{args.dataset}_{suffix}.json"
+                )
+            )
         loader = date_understanding_loader
     elif args.dataset == "tracking_shuffled_objects":
         # use the five object subtask
         args.num_options = 5
         ending_names = [f"hypothesis{i}" for i in range(args.num_options)]
-        # ending_names = ['hypothesis0', 'hypothesis1', 'hypothesis2', 'hypothesis3', 'hypothesis4']
         header_name = "premise"
-        file_suffix= "five_objects"
-        file_path = [os.path.join("../data", "big_bench", f"{args.dataset}_{file_suffix}.json")]
+        file_suffix = "five_objects"
+        file_path = [
+            os.path.join(
+                "../data", "big_bench", f"{args.dataset}_{file_suffix}.json"
+            )
+        ]
         loader = date_understanding_loader
-    elif args.dataset in ["logical_deduction_three_objects", "logical_deduction_five_objects", "logical_deduction_seven_objects"]:
+    elif args.dataset in [
+        "logical_deduction_three_objects",
+        "logical_deduction_five_objects",
+        "logical_deduction_seven_objects",
+    ]:
         if "three" in args.dataset:
             args.num_options = 3
         elif "five" in args.dataset:
@@ -363,16 +472,22 @@ def load_data(args):
             args.num_options = 7
         ending_names = [f"hypothesis{i}" for i in range(args.num_options)]
         header_name = "premise"
-        file_path = [os.path.join("../data", "big_bench", f"{args.dataset}.json")]
+        file_path = [
+            os.path.join("../data", "big_bench", f"{args.dataset}.json")
+        ]
         loader = date_understanding_loader
     # other datasets
     elif args.dataset == "anli":
-        ending_names = ['hypothesis0', 'hypothesis1', 'hypothesis2']
+        ending_names = ["hypothesis0", "hypothesis1", "hypothesis2"]
         header_name = "premise"
         file_path = []
         file_prefixes = ["R1", "R2", "R3"]
         for prefix in file_prefixes:
-            file_path.append(os.path.join("../data", f"{args.dataset}", f"{prefix}_dev.jsonl"))
+            file_path.append(
+                os.path.join(
+                    "../data", f"{args.dataset}", f"{prefix}_dev.jsonl"
+                )
+            )
         train_file_path = [path.replace("dev", "train") for path in file_path]
         loader = anli_loader
     elif args.dataset in ["anli_r1", "anli_r2", "anli_r3"]:
@@ -381,11 +496,14 @@ def load_data(args):
         # ending_names = ['hypothesis0', 'hypothesis1', 'hypothesis2']
         header_name = "premise"
         prefix = args.dataset.split("_")[-1]
-        file_path = [os.path.join("../data", "anli", f"{prefix.capitalize()}_dev.jsonl")]
+        file_path = [
+            os.path.join("../data", "anli", f"{prefix.capitalize()}_dev.jsonl")
+        ]
         # file_path = []
-        # file_prefixes = ["R1", "R2", "R3"]
+        # # file_prefixes = ["R1", "R2", "R3"]
         # for prefix in file_prefixes:
-        #     file_path.append(os.path.join("../data", f"{args.dataset}", f"{prefix}_dev.jsonl"))
+        #     file_path.append(os.path.join("../data", f"{args.dataset}", \
+        #                                   f"{prefix}_dev.jsonl"))
         train_file_path = [path.replace("dev", "train") for path in file_path]
         loader = anli_loader
     elif args.dataset == "vqa":
@@ -422,91 +540,244 @@ def load_data(args):
         print(f"{args.dataset}: downloader not implemented.")
         return
 
-    
     dev_data = loader(file_path, args)
     dev_dataset = Dataset.from_list(dev_data).with_format("torch")
     if train_file_path is not None:
         train_data = loader(train_file_path, args)
         train_dataset = Dataset.from_list(train_data).with_format("torch")
-    else: # BB tasks have no train set. 
+    else:  # BB tasks have no train set.
         train_dataset = dev_dataset
     if args.dataset in ["vqa", "scienceqa", "ai2d", "single_inference"]:
-        return ending_names, header_name, image_header_name, dev_dataset, train_dataset
+        return (
+            ending_names,
+            header_name,
+            image_header_name,
+            dev_dataset,
+            train_dataset,
+        )
     return ending_names, header_name, dev_dataset, train_dataset
 
+
 def load_model(device, model_path, args):
-    if args.model_family in ["GPT2","Pythia", "OPT-IML", "Dolly"]:
+    if args.model_family in ["GPT2", "Pythia", "OPT-IML", "Dolly"]:
         tokenizer_func = AutoTokenizer
         model_func = AutoModelForCausalLM
     elif args.model_family in ["T5", "FLAN-T5"]:
         tokenizer_func = AutoTokenizer
         model_func = AutoModelForSeq2SeqLM
-    elif args.model_family in ["BLIP2", "InstructBLIP", "GIT", "PaliGemma", "Idefics2"]:
+    elif args.model_family in [
+        "BLIP2",
+        "InstructBLIP",
+        "GIT",
+        "PaliGemma",
+        "Idefics2",
+    ]:
         tokenizer_func = AutoProcessor
         model_func = AutoModelForVision2Seq
     else:
         print(f"{args.model_family}: downloader not implemented.")
         return
     if args.model_family == "Dolly":
-        tokenizer = tokenizer_func.from_pretrained(model_path, padding_side="left")
+        tokenizer = tokenizer_func.from_pretrained(
+            model_path, padding_side="left"
+        )
     elif args.model_family == "Idefics2":
-        tokenizer = tokenizer_func.from_pretrained(model_path, do_image_splitting=False)
+        tokenizer = tokenizer_func.from_pretrained(
+            model_path, do_image_splitting=False
+        )
     else:
         tokenizer = tokenizer_func.from_pretrained(model_path)
     if args.model_family in ["GPT2", "Pythia", "Dolly"]:
         tokenizer.pad_token = tokenizer.eos_token
     elif args.model_family in ["PaliGemma", "GIT"]:
         tokenizer.tokenizer.padding_side = "left"
-    
+
     # load with different precision
     if args.loading_precision == "FP16":
-        model = model_func.from_pretrained(model_path, device_map=device, torch_dtype=torch.float16)
+        model = model_func.from_pretrained(
+            model_path, device_map=device, torch_dtype=torch.float16
+        )
     elif args.loading_precision == "BF16":
-        model = model_func.from_pretrained(model_path, device_map=device, torch_dtype=torch.bfloat16)
+        model = model_func.from_pretrained(
+            model_path, device_map=device, torch_dtype=torch.bfloat16
+        )
     elif args.loading_precision == "INT8":
-        quantization_config = BitsAndBytesConfig(load_in_8bit=True, llm_int8_threshold=200.0)
+        quantization_config = BitsAndBytesConfig(
+            load_in_8bit=True, llm_int8_threshold=200.0
+        )
         model = model_func.from_pretrained(
             model_path,
             torch_dtype=torch.float16,
             device_map=device,
-            quantization_config=quantization_config
+            quantization_config=quantization_config,
         )
     elif args.loading_precision == "INT4":
         quantization_config = BitsAndBytesConfig(
-                load_in_4bit=True,
-                bnb_4bit_quant_type="nf4",
-                bnb_4bit_use_double_quant=True,
-                bnb_4bit_compute_dtype=torch.bfloat16
-            )
+            load_in_4bit=True,
+            bnb_4bit_quant_type="nf4",
+            bnb_4bit_use_double_quant=True,
+            bnb_4bit_compute_dtype=torch.bfloat16,
+        )
         model = model_func.from_pretrained(
             model_path,
             device_map=device,
-            quantization_config=quantization_config
+            quantization_config=quantization_config,
         )
-    else: # FP32
+    else:  # FP32
         model = model_func.from_pretrained(model_path, device_map=device)
     model.to(device)
-    print(f"Memory footprint: {model.get_memory_footprint() / 1024 **3:.2f} GB.")
+    print(
+        f"Memory footprint: {model.get_memory_footprint() / 1024 **3:.2f} GB."
+    )
     return model, tokenizer
 
+
 def write_to_csv(save_path, args, total_accuracy):
     os.makedirs(os.path.dirname(save_path), exist_ok=True)
     csv_exists = os.path.isfile(save_path)
-    with open(save_path, 'a+', newline='') as csvfile:
+    with open(save_path, "a+", newline="") as csvfile:
         csvwriter = csv.writer(csvfile)
         if args.method == "process_of_elimination":
             if not csv_exists:
-                csvwriter.writerow(['model_family', 'checkpoint', 'loading_precision','dataset', 'batch_size', 'method', "scoring_method", "prompting_method", "mask_strategy", "mask_token", "seed", "n_shot", "sample", "mask_accuracy", 'accuracy'])
-            csvwriter.writerow([args.model_family, args.checkpoint, args.loading_precision, args.dataset, args.batch_size, args.method, args.scoring_method_for_process_of_elimination, args.prompting_method_for_process_of_elimination, args.mask_strategy_for_process_of_elimination, args.mask_token, args.seed, int(args.n_shot), args.sample, f"{args.mask_accuracy:.4f}", f"{total_accuracy:.4f}"])
+                csvwriter.writerow(
+                    [
+                        "model_family",
+                        "checkpoint",
+                        "loading_precision",
+                        "dataset",
+                        "batch_size",
+                        "method",
+                        "scoring_method",
+                        "prompting_method",
+                        "mask_strategy",
+                        "mask_token",
+                        "seed",
+                        "n_shot",
+                        "sample",
+                        "mask_accuracy",
+                        "accuracy",
+                    ]
+                )
+            csvwriter.writerow(
+                [
+                    args.model_family,
+                    args.checkpoint,
+                    args.loading_precision,
+                    args.dataset,
+                    args.batch_size,
+                    args.method,
+                    args.scoring_method_for_process_of_elimination,
+                    args.prompting_method_for_process_of_elimination,
+                    args.mask_strategy_for_process_of_elimination,
+                    args.mask_token,
+                    args.seed,
+                    int(args.n_shot),
+                    args.sample,
+                    f"{args.mask_accuracy:.4f}",
+                    f"{total_accuracy:.4f}",
+                ]
+            )
         elif args.method == "contrastive_decoding":
             if not csv_exists:
-                csvwriter.writerow(['model_family', 'checkpoint', "amateur_checkpoint", 'loading_precision', 'dataset', 'batch_size', 'method', 'expert_method', "amateur_method", "weighting_parameter","seed","n_shot", "sample", 'expert_accuracy', 'amateur_accuracy','accuracy'])
-            csvwriter.writerow([args.model_family, args.checkpoint, args.amateur_checkpoint, args.loading_precision, args.dataset, args.batch_size, args.method, args.expert_method, args.amateur_method, args.weighting_parameter, args.seed, int(args.n_shot), args.sample, f"{args.expert_accuracy:.4f}", f"{args.amateur_accuracy:.4f}", f"{total_accuracy:.4f}"])
+                csvwriter.writerow(
+                    [
+                        "model_family",
+                        "checkpoint",
+                        "amateur_checkpoint",
+                        "loading_precision",
+                        "dataset",
+                        "batch_size",
+                        "method",
+                        "expert_method",
+                        "amateur_method",
+                        "weighting_parameter",
+                        "seed",
+                        "n_shot",
+                        "sample",
+                        "expert_accuracy",
+                        "amateur_accuracy",
+                        "accuracy",
+                    ]
+                )
+            csvwriter.writerow(
+                [
+                    args.model_family,
+                    args.checkpoint,
+                    args.amateur_checkpoint,
+                    args.loading_precision,
+                    args.dataset,
+                    args.batch_size,
+                    args.method,
+                    args.expert_method,
+                    args.amateur_method,
+                    args.weighting_parameter,
+                    args.seed,
+                    int(args.n_shot),
+                    args.sample,
+                    f"{args.expert_accuracy:.4f}",
+                    f"{args.amateur_accuracy:.4f}",
+                    f"{total_accuracy:.4f}",
+                ]
+            )
         elif args.method == "generate_synonyms":
             if not csv_exists:
-                csvwriter.writerow(['model_family', 'checkpoint', 'loading_precision','dataset', 'batch_size', 'method', "number_of_synonyms", "seed", "n_shot",  "sample",'accuracy'])
-            csvwriter.writerow([args.model_family, args.checkpoint, args.loading_precision, args.dataset, args.batch_size, args.method, args.number_of_synonyms, args.seed, int(args.n_shot), args.sample, f"{total_accuracy:.4f}"])
+                csvwriter.writerow(
+                    [
+                        "model_family",
+                        "checkpoint",
+                        "loading_precision",
+                        "dataset",
+                        "batch_size",
+                        "method",
+                        "number_of_synonyms",
+                        "seed",
+                        "n_shot",
+                        "sample",
+                        "accuracy",
+                    ]
+                )
+            csvwriter.writerow(
+                [
+                    args.model_family,
+                    args.checkpoint,
+                    args.loading_precision,
+                    args.dataset,
+                    args.batch_size,
+                    args.method,
+                    args.number_of_synonyms,
+                    args.seed,
+                    int(args.n_shot),
+                    args.sample,
+                    f"{total_accuracy:.4f}",
+                ]
+            )
         else:
             if not csv_exists:
-                csvwriter.writerow(['model_family', 'checkpoint', 'loading_precision','dataset', 'batch_size', 'method', "seed", "n_shot",  "sample",'accuracy'])
-            csvwriter.writerow([args.model_family, args.checkpoint, args.loading_precision, args.dataset, args.batch_size, args.method, args.seed, int(args.n_shot), args.sample, f"{total_accuracy:.4f}"])
\ No newline at end of file
+                csvwriter.writerow(
+                    [
+                        "model_family",
+                        "checkpoint",
+                        "loading_precision",
+                        "dataset",
+                        "batch_size",
+                        "method",
+                        "seed",
+                        "n_shot",
+                        "sample",
+                        "accuracy",
+                    ]
+                )
+            csvwriter.writerow(
+                [
+                    args.model_family,
+                    args.checkpoint,
+                    args.loading_precision,
+                    args.dataset,
+                    args.batch_size,
+                    args.method,
+                    args.seed,
+                    int(args.n_shot),
+                    args.sample,
+                    f"{total_accuracy:.4f}",
+                ]
+            )
diff --git a/mm_poe/methods/vision_language_modeling.py b/mm_poe/methods/vision_language_modeling.py
index 1e1e9cc..7706bb7 100644
--- a/mm_poe/methods/vision_language_modeling.py
+++ b/mm_poe/methods/vision_language_modeling.py
@@ -1,20 +1,12 @@
 # a framework for inference on multiple choice tasks.
-import argparse
 import copy
-import csv
 import logging
 import os
-import random
-import sys
-from tqdm import tqdm
 
-import numpy as np
 import torch
-import torch.nn.functional as F
 from torch.utils.data import DataLoader
-from datasets import Dataset
 
-from utils.data import(
+from utils.data import (
     upload_to_huggingface_hub,
     preprocess_function_seq2seq,
     preprocess_function_causal,
@@ -25,10 +17,9 @@
     preprocess_function_causal_vqa,
     preprocess_function_causal_vqa_channel,
     create_synonym_dataset,
-    generate_n_shot_demonstrations,
     create_n_shot_splits,
 )
-from utils.methods import(
+from utils.methods import (
     compute_conditional_score_seq2seq,
     compute_conditional_score_causal,
     compute_conditional_score_seq2seq_vqa,
@@ -38,7 +29,7 @@
     inference_generate_synonyms,
     generate_synonyms,
 )
-from utils.utils import(
+from utils.utils import (
     load_data,
     load_model,
     parse_args,
@@ -48,6 +39,7 @@
 
 logger = logging.getLogger(__name__)
 
+
 def main():
     # import pdb; pdb.set_trace()
 
@@ -59,9 +51,9 @@ def main():
         args.method = "contrastive_decoding"
     elif args.calibration_prompt is not None:
         args.method = "calibration"
-    elif args.do_channel == True:
+    elif args.do_channel is True:
         args.method = "channel"
-    elif args.do_synonym == True:
+    elif args.do_synonym is True:
         args.method = "generate_synonyms"
     else:
         args.method = "vision_language_modeling"
@@ -78,11 +70,14 @@ def main():
     logger.info(f"Set random seed to {args.seed}.")
     set_seed(args.seed)
 
-    # step 3: load model, processor. Then move to gpu, and set to evaluation mode.
+    # step 3: load model, processor. Then move to gpu,
+    # and set to evaluation mode.
     logger.info(f"Load {args.model_family} model: {args.checkpoint}.")
     device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
     # get model path: ../models/args.model_family/args.checkpoint
-    model_path = os.path.join("/content/models", args.model_family, args.checkpoint)
+    model_path = os.path.join(
+        "/content/models", args.model_family, args.checkpoint
+    )
     model, tokenizer = load_model(device, model_path, args)
     if args.model_family in ["GPT2", "Pythia", "OPT-IML", "Dolly"]:
         compute_func = compute_conditional_score_causal
@@ -92,7 +87,12 @@ def main():
         compute_func = compute_conditional_score_seq2seq
         preprocess_func = preprocess_function_seq2seq
         preprocess_func_channel = preprocess_function_seq2seq_channel
-    elif args.model_family in ["BLIP2", "InstructBLIP", "PaliGemma", "Idefics2"]:
+    elif args.model_family in [
+        "BLIP2",
+        "InstructBLIP",
+        "PaliGemma",
+        "Idefics2",
+    ]:
         compute_func = compute_conditional_score_seq2seq_vqa
         preprocess_func = preprocess_function_seq2seq_vqa
         preprocess_func_channel = preprocess_function_seq2seq_vqa_channel
@@ -110,94 +110,239 @@ def main():
     # step 4: load and preprocess data.
     args.datasets = args.datasets.split()
     logger.info(f"Load data: {args.datasets}.")
-    
+
     # evaluate on each dataset
     for dataset in args.datasets:
         args.dataset = dataset
         if args.dataset in ["vqa", "scienceqa", "ai2d"]:
-            ending_names, header_name, image_header_name, raw_dataset, n_shot_dataset = load_data(args)
+            (
+                ending_names,
+                header_name,
+                image_header_name,
+                raw_dataset,
+                n_shot_dataset,
+            ) = load_data(args)
         else:
-            ending_names, header_name, raw_dataset, n_shot_dataset = load_data(args)
-        raw_dataset, n_shot_dataset, n_shot_demonstrations = create_n_shot_splits(raw_dataset, n_shot_dataset, args)    
+            ending_names, header_name, raw_dataset, n_shot_dataset = load_data(
+                args
+            )
+        raw_dataset, n_shot_dataset, n_shot_demonstrations = (
+            create_n_shot_splits(raw_dataset, n_shot_dataset, args)
+        )
 
         logger.info(f"Preprocess data: {args.dataset}.")
-        fn_kwargs = {"ending_names": ending_names, 
-                    "header_name": header_name, 
-                    "tokenizer": tokenizer,}
-        if args.model_family in ["BLIP2", "InstructBLIP", "GIT", "PaliGemma", "Idefics2"]:
-            fn_kwargs = {"ending_names": ending_names, 
-                    "header_name": header_name, 
-                    "tokenizer": tokenizer,
-                    "processor": processor,
-                    "image_header_name": image_header_name}
+        fn_kwargs = {
+            "ending_names": ending_names,
+            "header_name": header_name,
+            "tokenizer": tokenizer,
+        }
+        if args.model_family in [
+            "BLIP2",
+            "InstructBLIP",
+            "GIT",
+            "PaliGemma",
+            "Idefics2",
+        ]:
+            fn_kwargs = {
+                "ending_names": ending_names,
+                "header_name": header_name,
+                "tokenizer": tokenizer,
+                "processor": processor,
+                "image_header_name": image_header_name,
+            }
         num_of_options = len(ending_names)
-        tokenized_dataset = raw_dataset.map(preprocess_func, fn_kwargs=fn_kwargs, batched=True, batch_size=args.batch_size)
-        eval_dataloader = DataLoader(tokenized_dataset, batch_size=args.batch_size, shuffle=False)
+        tokenized_dataset = raw_dataset.map(
+            preprocess_func,
+            fn_kwargs=fn_kwargs,
+            batched=True,
+            batch_size=args.batch_size,
+        )
+        eval_dataloader = DataLoader(
+            tokenized_dataset, batch_size=args.batch_size, shuffle=False
+        )
 
         # step 5: (evaluation) inference on data, and compute accuracy.
-        logger.info(f"Start inference (method: {args.method}) on {args.dataset} using {args.model_family} model: {args.checkpoint}.")
-        if args.method in ["vision_language_modeling", "multiple_choice_prompt"]:
-            _, lm_accuracy, avg_lm_accuracy, _ = inference_language_modeling(model, eval_dataloader, device, compute_func, tokenizer.pad_token_id)
+        logger.info(
+            f"Start inference (method: {args.method}) on {args.dataset} using \
+                {args.model_family} model: {args.checkpoint}."
+        )
+        if args.method in [
+            "vision_language_modeling",
+            "multiple_choice_prompt",
+        ]:
+            _, lm_accuracy, avg_lm_accuracy, _ = inference_language_modeling(
+                model,
+                eval_dataloader,
+                device,
+                compute_func,
+                tokenizer.pad_token_id,
+            )
         elif args.method == "contrastive_decoding":
-            logger.info(f"Load {args.model_family} amateur model: {args.amateur_checkpoint}.")
+            logger.info(
+                f"Load {args.model_family} amateur model: \
+                    {args.amateur_checkpoint}."
+            )
             # get model path: ../models/args.model_family/args.checkpoint
-            amateur_model_path = os.path.join("/content/models", args.model_family, args.amateur_checkpoint)
+            amateur_model_path = os.path.join(
+                "/content/models", args.model_family, args.amateur_checkpoint
+            )
             amateur_model, _ = load_model(device, amateur_model_path, args)
-            # we want to integrate contrastive decoding with other methods, so we need separate output from each model.
-            # compute log probs on each model        
-            exp_avg_log_probs, exp_lm_accuracy, exp_avg_lm_accuracy, _ = inference_language_modeling(model, eval_dataloader, device, compute_func, tokenizer.pad_token_id)
-            ama_avg_log_probs, ama_lm_accuracy, ama_avg_lm_accuracy, _ = inference_language_modeling(amateur_model, eval_dataloader, device, compute_func, tokenizer.pad_token_id)
+            # we want to integrate contrastive decoding with other methods,
+            # so we need separate output from each model.
+            # compute log probs on each model
+            exp_avg_log_probs, exp_lm_accuracy, exp_avg_lm_accuracy, _ = (
+                inference_language_modeling(
+                    model,
+                    eval_dataloader,
+                    device,
+                    compute_func,
+                    tokenizer.pad_token_id,
+                )
+            )
+            ama_avg_log_probs, ama_lm_accuracy, ama_avg_lm_accuracy, _ = (
+                inference_language_modeling(
+                    amateur_model,
+                    eval_dataloader,
+                    device,
+                    compute_func,
+                    tokenizer.pad_token_id,
+                )
+            )
             # calculate difference, and may introduce extra parameters.
             avg_log_probs = exp_avg_log_probs - ama_avg_log_probs
-            labels = raw_dataset['label']
-            # currently, I use average language modeling accuracy. I will add language modeling and other methods shortly.
-            lm_accuracy = (avg_log_probs.argmin(dim=-1) == labels).sum().item() / len(labels)
+            labels = raw_dataset["label"]
+            # currently, I use average language modeling accuracy. I
+            #  will add language modeling and other methods shortly.
+            lm_accuracy = (
+                avg_log_probs.argmin(dim=-1) == labels
+            ).sum().item() / len(labels)
             logger.info(f"Contrastive decoding accuracy: {lm_accuracy:.4f}.")
             args.amateur_accuracy = ama_avg_lm_accuracy
             args.expert_accuracy = exp_avg_lm_accuracy
         elif args.method == "calibration":
-            fn_kwargs = {"ending_names": ending_names, 
-                        "header_name": "uncond_premise", # the difference is here
-                        "tokenizer": tokenizer,}
-            if args.model_family in ["BLIP2", "InstructBLIP", "GIT", "PaliGemma", "Idefics2"]:
-                fn_kwargs = {"ending_names": ending_names, 
-                        "header_name": "uncond_premise", 
-                        "tokenizer": tokenizer,
-                        "processor": processor,
-                        "image_header_name": image_header_name}
-            tokenized_calibration_dataset = raw_dataset.map(preprocess_func, fn_kwargs=fn_kwargs, batched=True, batch_size=args.batch_size)
-            eval_calibration_dataloader = DataLoader(tokenized_calibration_dataset, batch_size=args.batch_size, shuffle=False)    
-            _, lm_accuracy, avg_lm_accuracy, _ = inference_calibration(model, eval_dataloader, eval_calibration_dataloader,device, compute_func, tokenizer.pad_token_id)
+            fn_kwargs = {
+                "ending_names": ending_names,
+                "header_name": "uncond_premise",  # the difference is here
+                "tokenizer": tokenizer,
+            }
+            if args.model_family in [
+                "BLIP2",
+                "InstructBLIP",
+                "GIT",
+                "PaliGemma",
+                "Idefics2",
+            ]:
+                fn_kwargs = {
+                    "ending_names": ending_names,
+                    "header_name": "uncond_premise",
+                    "tokenizer": tokenizer,
+                    "processor": processor,
+                    "image_header_name": image_header_name,
+                }
+            tokenized_calibration_dataset = raw_dataset.map(
+                preprocess_func,
+                fn_kwargs=fn_kwargs,
+                batched=True,
+                batch_size=args.batch_size,
+            )
+            eval_calibration_dataloader = DataLoader(
+                tokenized_calibration_dataset,
+                batch_size=args.batch_size,
+                shuffle=False,
+            )
+            _, lm_accuracy, avg_lm_accuracy, _ = inference_calibration(
+                model,
+                eval_dataloader,
+                eval_calibration_dataloader,
+                device,
+                compute_func,
+                tokenizer.pad_token_id,
+            )
         elif args.method == "channel":
-            # simple solution: swap first sentence and second sentence in both preprocessing functions
-            tokenized_channel_dataset = raw_dataset.map(preprocess_func_channel, fn_kwargs=fn_kwargs, batched=True, batch_size=args.batch_size)
-            eval_channel_dataloader = DataLoader(tokenized_channel_dataset, batch_size=args.batch_size, shuffle=False)
-            _, lm_accuracy, avg_lm_accuracy, _ = inference_language_modeling(model, eval_channel_dataloader, device, compute_func, tokenizer.pad_token_id)
+            # simple solution: swap first sentence and
+            # second sentence in both preprocessing functions
+            tokenized_channel_dataset = raw_dataset.map(
+                preprocess_func_channel,
+                fn_kwargs=fn_kwargs,
+                batched=True,
+                batch_size=args.batch_size,
+            )
+            eval_channel_dataloader = DataLoader(
+                tokenized_channel_dataset,
+                batch_size=args.batch_size,
+                shuffle=False,
+            )
+            _, lm_accuracy, avg_lm_accuracy, _ = inference_language_modeling(
+                model,
+                eval_channel_dataloader,
+                device,
+                compute_func,
+                tokenizer.pad_token_id,
+            )
         elif args.method == "generate_synonyms":
             # 3 stpes: generate synonyms, then map datasets, then inference.
             logger.info(f"Generate synonyms for {args.dataset}.")
-            synonyms_dict = generate_synonyms(args, model, tokenizer, tokenized_dataset)
+            synonyms_dict = generate_synonyms(
+                args, model, tokenizer, tokenized_dataset
+            )
             # call map add synonyms to raw_dataset
-            logger.info(f"Add synonyms to raw dataset.")
-            synonym_kwargs = {"args": args,
-                              "synonyms_dict": synonyms_dict,
-                              }
-            synonyms_dataset = raw_dataset.map(create_synonym_dataset, fn_kwargs=synonym_kwargs, batched=True, batch_size=args.batch_size)
+            logger.info("Add synonyms to raw dataset.")
+            synonym_kwargs = {
+                "args": args,
+                "synonyms_dict": synonyms_dict,
+            }
+            synonyms_dataset = raw_dataset.map(
+                create_synonym_dataset,
+                fn_kwargs=synonym_kwargs,
+                batched=True,
+                batch_size=args.batch_size,
+            )
             # map to tokenized_dataset
-            logger.info(f"Tokenize synonym data.")
-            synonyms_ending_names = [col for col in synonyms_dataset.column_names if col.startswith("hypothesis")]
-            fn_kwargs = {"ending_names": synonyms_ending_names, 
-                        "header_name": header_name, 
-                        "tokenizer": tokenizer,}
-            if args.model_family in ["BLIP2", "InstructBLIP", "GIT", "PaliGemma", "Idefics2"]:
-                fn_kwargs = {"ending_names": synonyms_ending_names, 
-                            "header_name": header_name, 
-                            "tokenizer": tokenizer,
-                            "processor": processor,
-                            "image_header_name": image_header_name}
-            tokenized_synonyms_dataset = synonyms_dataset.map(preprocess_func, fn_kwargs=fn_kwargs, batched=True, batch_size=args.batch_size)
-            eval_synonyms_dataloader = DataLoader(tokenized_synonyms_dataset, batch_size=args.batch_size, shuffle=False)
-            _, lm_accuracy, avg_lm_accuracy, _ = inference_generate_synonyms(model, eval_synonyms_dataloader, device, compute_func, tokenizer.pad_token_id, num_of_options, args.number_of_synonyms)
+            logger.info("Tokenize synonym data.")
+            synonyms_ending_names = [
+                col
+                for col in synonyms_dataset.column_names
+                if col.startswith("hypothesis")
+            ]
+            fn_kwargs = {
+                "ending_names": synonyms_ending_names,
+                "header_name": header_name,
+                "tokenizer": tokenizer,
+            }
+            if args.model_family in [
+                "BLIP2",
+                "InstructBLIP",
+                "GIT",
+                "PaliGemma",
+                "Idefics2",
+            ]:
+                fn_kwargs = {
+                    "ending_names": synonyms_ending_names,
+                    "header_name": header_name,
+                    "tokenizer": tokenizer,
+                    "processor": processor,
+                    "image_header_name": image_header_name,
+                }
+            tokenized_synonyms_dataset = synonyms_dataset.map(
+                preprocess_func,
+                fn_kwargs=fn_kwargs,
+                batched=True,
+                batch_size=args.batch_size,
+            )
+            eval_synonyms_dataloader = DataLoader(
+                tokenized_synonyms_dataset,
+                batch_size=args.batch_size,
+                shuffle=False,
+            )
+            _, lm_accuracy, avg_lm_accuracy, _ = inference_generate_synonyms(
+                model,
+                eval_synonyms_dataloader,
+                device,
+                compute_func,
+                tokenizer.pad_token_id,
+                num_of_options,
+                args.number_of_synonyms,
+            )
         else:
             raise NotImplementedError
 
@@ -211,15 +356,15 @@ def main():
             avg_args = copy.deepcopy(args)
             avg_args.method = "average_language_modeling"
             write_to_csv(save_path, avg_args, avg_lm_accuracy)
-        
+
         # step 7: push data to HuggingFace Hub.
         if args.push_data_to_hub:
             logger.info(f"Push {args.dataset} to HuggingFace Hub.")
             upload_to_huggingface_hub(tokenized_dataset, args)
-        
+
         # step 8: delete tokenized_dataset to save memory.
         # del tokenized_dataset
-            
+
 
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()
diff --git a/mm_poe/models/model_downloaders/model_downloaders.py b/mm_poe/models/model_downloaders/model_downloaders.py
index cd654c7..615edaa 100644
--- a/mm_poe/models/model_downloaders/model_downloaders.py
+++ b/mm_poe/models/model_downloaders/model_downloaders.py
@@ -1,132 +1,195 @@
+""" Model Downloaders """
+
 import argparse
 import glob
 import os
 import shutil
 
 import torch
-import transformers
 from transformers import (
-    AutoTokenizer, 
+    AutoTokenizer,
     AutoModelForCausalLM,
     AutoModelForSeq2SeqLM,
     AutoProcessor,
-    AutoModelForVision2Seq
+    AutoModelForVision2Seq,
 )
 
 all_checkpoints = {
     "GPT2": ["gpt2", "gpt2-medium", "gpt2-large", "gpt2-xl"],
-    "Pythia": ["EleutherAI/pythia-70m", "EleutherAI/pythia-160m", "EleutherAI/pythia-410m", "EleutherAI/pythia-1b", "EleutherAI/pythia-1.4b", "EleutherAI/pythia-2.8b", "EleutherAI/pythia-6.9B", "EleutherAI/pythia-12b",
-               "EleutherAI/pythia-70m-deduped", "EleutherAI/pythia-160m-deduped", "EleutherAI/pythia-410m-deduped", "EleutherAI/pythia-1b-deduped", "EleutherAI/pythia-1.4b-deduped", "EleutherAI/pythia-2.8b-deduped", "EleutherAI/pythia-6.9B-deduped", "EleutherAI/pythia-12b-deduped"],
-    "OPT-IML":["facebook/opt-iml-1.3b", "facebook/opt-iml-max-1.3b"],               
-    "T5": ["t5-small", "t5-base","t5-large", "t5-3b", "t5-11b"],
-    "FLAN-T5": ["google/flan-t5-small", "google/flan-t5-base", "google/flan-t5-large", "google/flan-t5-xl", "google/flan-t5-xxl"],
-    "MPT": ["mosaicml/mpt-7b", "mosaicml/mpt-7b-instruct", "mosaicml/mpt-7b-chat", "mosaicml/mpt-7b-storywriter"],
+    "Pythia": [
+        "EleutherAI/pythia-70m",
+        "EleutherAI/pythia-160m",
+        "EleutherAI/pythia-410m",
+        "EleutherAI/pythia-1b",
+        "EleutherAI/pythia-1.4b",
+        "EleutherAI/pythia-2.8b",
+        "EleutherAI/pythia-6.9B",
+        "EleutherAI/pythia-12b",
+        "EleutherAI/pythia-70m-deduped",
+        "EleutherAI/pythia-160m-deduped",
+        "EleutherAI/pythia-410m-deduped",
+        "EleutherAI/pythia-1b-deduped",
+        "EleutherAI/pythia-1.4b-deduped",
+        "EleutherAI/pythia-2.8b-deduped",
+        "EleutherAI/pythia-6.9B-deduped",
+        "EleutherAI/pythia-12b-deduped",
+    ],
+    "OPT-IML": ["facebook/opt-iml-1.3b", "facebook/opt-iml-max-1.3b"],
+    "T5": ["t5-small", "t5-base", "t5-large", "t5-3b", "t5-11b"],
+    "FLAN-T5": [
+        "google/flan-t5-small",
+        "google/flan-t5-base",
+        "google/flan-t5-large",
+        "google/flan-t5-xl",
+        "google/flan-t5-xxl",
+    ],
+    "MPT": [
+        "mosaicml/mpt-7b",
+        "mosaicml/mpt-7b-instruct",
+        "mosaicml/mpt-7b-chat",
+        "mosaicml/mpt-7b-storywriter",
+    ],
     "Dolly": ["databricks/dolly-v2-7b"],
     "BLIP2": ["Salesforce/blip2-opt-2.7b", "Salesforce/blip2-flan-t5-xl"],
     "InstructBLIP": ["Salesforce/instructblip-vicuna-7b"],
     "GIT": ["microsoft/git-base-vqav2", "microsoft/git-base-textvqa"],
-    "PaliGemma": ["google/paligemma-3b-ft-science-qa-448", "google/paligemma-3b-ft-vqav2-448", "google/paligemma-3b-ft-ai2d-448"],
-    "Idefics2": ["HuggingFaceM4/idefics2-8b"]
+    "PaliGemma": [
+        "google/paligemma-3b-ft-science-qa-448",
+        "google/paligemma-3b-ft-vqav2-448",
+        "google/paligemma-3b-ft-ai2d-448",
+    ],
+    "Idefics2": ["HuggingFaceM4/idefics2-8b"],
 }
 
+
 def parse_args():
+    """Parse Arguments"""
     parser = argparse.ArgumentParser(description="Language model downloaders.")
-    
+
     parser.add_argument(
         "--model_family",
         type=str,
-        choices=["GPT2", "T5", "FLAN-T5", "Pythia", "OPT-IML", "Dolly", "BLIP2", "InstructBLIP", "GIT", "PaliGemma", "Idefics2"],
+        choices=[
+            "GPT2",
+            "T5",
+            "FLAN-T5",
+            "Pythia",
+            "OPT-IML",
+            "Dolly",
+            "BLIP2",
+            "InstructBLIP",
+            "GIT",
+            "PaliGemma",
+            "Idefics2",
+        ],
         default=None,
-        help="The moddel family, as checkpoints under the same model family use same codes to download."
-        )
+        help="The model family, as checkpoints under \
+            the same model family use same codes to download.",
+    )
 
     parser.add_argument(
         "--checkpoint",
         type=str,
         default=None,
-        help="The checkpoint name under a model family, e.g. gpt2, gpt2-medium, gpt2-large, gpt2-xl."
+        help="The checkpoint name under a model family, \
+            e.g. gpt2, gpt2-medium, gpt2-large, gpt2-xl.",
     )
 
     parser.add_argument(
         "--download_all_checkpoints",
         action="store_true",
-        help="If set to true, downlaod all checkpoitns of a model family."
+        help="If set to true, downlaod all checkpoitns of a model family.",
     )
-    
 
     parser.add_argument(
         "--output_dir",
         type=str,
-        default=f"./models",
+        default="./models",
     )
-    
+
     args = parser.parse_args()
     return args
 
 
 def main():
+    """Main"""
     # import pdb; pdb.set_trace()
     args = parse_args()
     print(args)
-    
+
     if args.model_family in ["GPT2", "Pythia", "OPT-IML", "Dolly"]:
         tokenizer_func = AutoTokenizer
         model_func = AutoModelForCausalLM
     elif args.model_family in ["T5", "FLAN-T5"]:
         tokenizer_func = AutoTokenizer
         model_func = AutoModelForSeq2SeqLM
-    elif args.model_family in ["BLIP2", "InstructBLIP", "GIT", "PaliGemma", "Idefics2"]:
+    elif args.model_family in [
+        "BLIP2",
+        "InstructBLIP",
+        "GIT",
+        "PaliGemma",
+        "Idefics2",
+    ]:
         tokenizer_func = AutoProcessor
         model_func = AutoModelForVision2Seq
     else:
         print(f"{args.model_family}: downloader not implemented.")
         return
 
-
     # Check the validity of the checkpoint
-    checkpoints=[]
-    if args.download_all_checkpoints == True:
+    checkpoints = []
+    if args.download_all_checkpoints is True:
         checkpoints = all_checkpoints[args.model_family]
     elif args.checkpoint not in all_checkpoints[args.model_family]:
-        print(f"Invalid checkpoint from {args.model_family}. Choose from: {all_checkpoints[args.model_family]} or set --download_all_checkpoints")
+        print(
+            f"Invalid checkpoint from {args.model_family}. Choose from: \
+                {all_checkpoints[args.model_family]} or \
+                    set --download_all_checkpoints"
+        )
         return
     else:
         checkpoints = [args.checkpoint]
 
     print(f"Models to download: {checkpoints}")
     for checkpoint in checkpoints:
-        print(f"Downloading {checkpoint}\t under model family {args.model_family}...")
-        
+        print(
+            f"Downloading {checkpoint}\t under model family \
+                {args.model_family}..."
+        )
+
         # download the model
         # tokenizer = tokenizer_func.from_pretrained(checkpoint)
         # model = model_func.from_pretrained(checkpoint)
         if args.model_family == "Dolly":
-            tokenizer = tokenizer_func.from_pretrained(checkpoint, padding_side="left")
+            tokenizer = tokenizer_func.from_pretrained(
+                checkpoint, padding_side="left"
+            )
             model = model_func.from_pretrained(
                 checkpoint,
                 torch_dtype=torch.bfloat16,
-                device_map="cuda", 
+                device_map="cuda",
                 # torch_dtype=torch.float16,
                 # load_in_8bit=True,
             )
-        elif args.model_family in ["BLIP2", "InstructBLIP", "PaliGemma", "Idefics2"]:
+        elif args.model_family in [
+            "BLIP2",
+            "InstructBLIP",
+            "PaliGemma",
+            "Idefics2",
+        ]:
             tokenizer = tokenizer_func.from_pretrained(checkpoint)
             model = model_func.from_pretrained(
-                checkpoint,
-                torch_dtype=torch.float16,
-                device_map="cuda"
+                checkpoint, torch_dtype=torch.float16, device_map="cuda"
             )
         else:
             model = model_func.from_pretrained(checkpoint)
             tokenizer = tokenizer_func.from_pretrained(checkpoint)
-            
 
         # save the model
         save_dir = os.path.join(args.output_dir, args.model_family, checkpoint)
         os.makedirs(os.path.dirname(save_dir), exist_ok=True)
         tokenizer.save_pretrained(save_dir)
         model.save_pretrained(save_dir)
-        
 
         # delete cached files
         # https://huggingface.co/docs/transformers/installation#cache-setup
@@ -137,5 +200,6 @@ def main():
             print(f"Removing cached files at {folder}...")
             shutil.rmtree(folder)
 
+
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()
diff --git a/tests/methods/utils/test_data.py b/tests/methods/utils/test_data.py
index c10f4e7..843a76b 100644
--- a/tests/methods/utils/test_data.py
+++ b/tests/methods/utils/test_data.py
@@ -43,263 +43,321 @@
     vqa_loader,
     scienceqa_loader,
     ai2d_loader,
-    single_inference_loader
+    single_inference_loader,
 )
 
+
 # Mock class for argparse.Namespace
 class Args:
     def __init__(self, **kwargs):
         self.__dict__.update(kwargs)
 
+
 @pytest.fixture
 def sample_args():
     return Args(
-        dataset='test_dataset',
+        dataset="test_dataset",
         seed=42,
         n_shot=5,
         sample=10,
-        checkpoint='checkpoints/test_checkpoint',
+        checkpoint="checkpoints/test_checkpoint",
         batch_size=32,
-        method='test_method',
-        ending_names=['choice0', 'choice1', 'choice2', 'choice3'],
-        header_name='question',
+        method="test_method",
+        ending_names=["choice0", "choice1", "choice2", "choice3"],
+        header_name="question",
         tokenizer=Mock(),
         processor=Mock(),
-        image_header_name='image_path',
-        multiple_choice_prompt='Please select the correct answer:',
-        scoring_method='other_method',
+        image_header_name="image_path",
+        multiple_choice_prompt="Please select the correct answer:",
+        scoring_method="other_method",
         num_of_options=3,
         mask_token=None,
         number_of_synonyms=2,
-        calibration_prompt=' the answer is:',
+        calibration_prompt=" the answer is:",
         num_options=4,
         mask=[1, 0, 1],
         # sample=None,
-        n_shot_demonstrations='',
+        n_shot_demonstrations="",
         image_processor=Mock(),
-        synonyms_dict={'Paris': ['Paris1', 'Paris2'], 'London': ['London1', 'London2']},
-        question='What is the capital of France?',
-        choices=['Paris', 'London', 'Berlin'],
+        synonyms_dict={
+            "Paris": ["Paris1", "Paris2"],
+            "London": ["London1", "London2"],
+        },
+        question="What is the capital of France?",
+        choices=["Paris", "London", "Berlin"],
         label=0,
     )
 
+
 def test_upload_to_huggingface_hub(sample_args):
     dataset = MagicMock()
     args = sample_args
     suffix = f"{args.dataset}_{args.seed}_{args.n_shot}_{args.sample}_{args.checkpoint.split('/')[-1]}_{args.batch_size}"
     temp_data_path = os.path.join(f"../temp_data/{args.method}", suffix)
 
-    with patch('os.system') as mock_system:
+    with patch("os.system") as mock_system:
         upload_to_huggingface_hub(dataset, args)
         dataset.save_to_disk.assert_called_once_with(temp_data_path)
 
+
 def test_preprocess_function_seq2seq(sample_args):
     examples = {
-        'question': ['What is the capital of France?', 'What is 2+2?'],
-        'choice0': ['Paris', '3'],
-        'choice1': ['London', '4'],
-        'choice2': ['Berlin', '5'],
-        'choice3': ['Madrid', '6'],
+        "question": ["What is the capital of France?", "What is 2+2?"],
+        "choice0": ["Paris", "3"],
+        "choice1": ["London", "4"],
+        "choice2": ["Berlin", "5"],
+        "choice3": ["Madrid", "6"],
     }
     tokenizer = MagicMock()
     # Adjust the tokenizer mock to return the correct number of tokens
     tokenizer.return_value = {
-        'input_ids': [[i] for i in range(8)],  # 2 questions * 4 choices = 8
-        'attention_mask': [[1] for _ in range(8)]
+        "input_ids": [[i] for i in range(8)],  # 2 questions * 4 choices = 8
+        "attention_mask": [[1] for _ in range(8)],
     }
     kwargs = {
-        'ending_names': ['choice0', 'choice1', 'choice2', 'choice3'],
-        'header_name': 'question',
-        'tokenizer': tokenizer
+        "ending_names": ["choice0", "choice1", "choice2", "choice3"],
+        "header_name": "question",
+        "tokenizer": tokenizer,
     }
     output = preprocess_function_seq2seq(examples, **kwargs)
-    assert 'header_input_ids' in output
-    assert 'header_attention_mask' in output
-    assert 'ending_input_ids' in output
-    assert 'ending_attention_mask' in output
+    assert "header_input_ids" in output
+    assert "header_attention_mask" in output
+    assert "ending_input_ids" in output
+    assert "ending_attention_mask" in output
     num_choice = 4
     for key in output:
-        assert len(output[key]) == len(examples['question'])
+        assert len(output[key]) == len(examples["question"])
         for sublist in output[key]:
             assert len(sublist) == num_choice
 
+
 def test_preprocess_function_causal(sample_args):
     examples = {
-        'question': ['What is the capital of France?'],
-        'choice0': ['Paris'],
-        'choice1': ['London'],
+        "question": ["What is the capital of France?"],
+        "choice0": ["Paris"],
+        "choice1": ["London"],
     }
     tokenizer = MagicMock()
     tokenizer.pad_token_id = 0
     # Fix the lambda function to define 's'
-    tokenizer.side_effect = lambda x, truncation: {
-        'input_ids': [list(range(len(s))) for s in x],
-        'attention_mask': [[1]*len(s) for s in x]
-    } if isinstance(x, list) else {}
+    tokenizer.side_effect = lambda x, truncation: (
+        {
+            "input_ids": [list(range(len(s))) for s in x],
+            "attention_mask": [[1] * len(s) for s in x],
+        }
+        if isinstance(x, list)
+        else {}
+    )
     kwargs = {
-        'ending_names': ['choice0', 'choice1'],
-        'header_name': 'question',
-        'tokenizer': tokenizer
+        "ending_names": ["choice0", "choice1"],
+        "header_name": "question",
+        "tokenizer": tokenizer,
     }
     output = preprocess_function_causal(examples, **kwargs)
-    assert 'input_ids' in output
-    assert 'labels' in output
-    assert 'ending_attention_mask' in output
+    assert "input_ids" in output
+    assert "labels" in output
+    assert "ending_attention_mask" in output
+
 
 def test_preprocess_function_seq2seq_vqa(sample_args):
     examples = {
-        'question': ['What is shown in the image?'],
-        'choice0': ['Cat'],
-        'choice1': ['Dog'],
-        'image_path': ['path/to/image1.jpg']
+        "question": ["What is shown in the image?"],
+        "choice0": ["Cat"],
+        "choice1": ["Dog"],
+        "image_path": ["path/to/image1.jpg"],
     }
     processor = MagicMock()
     # Adjust the tokenizer and image processor mocks
     processor.tokenizer.return_value = {
-        'input_ids': [[i] for i in range(2)], 
-        'attention_mask': [[1] for _ in range(2)]
+        "input_ids": [[i] for i in range(2)],
+        "attention_mask": [[1] for _ in range(2)],
     }
     data_obj = MagicMock()
     data_obj.data = {
-        'pixel_values': torch.tensor([[[1,2],[3,4]]] * 2)  # Repeat to match the number of choices
+        "pixel_values": torch.tensor(
+            [[[1, 2], [3, 4]]] * 2
+        )  # Repeat to match the number of choices
     }
     processor.image_processor.return_value = data_obj
     kwargs = {
-        'ending_names': ['choice0', 'choice1'],
-        'header_name': 'question',
-        'image_header_name': 'image_path',
-        'processor': processor
+        "ending_names": ["choice0", "choice1"],
+        "header_name": "question",
+        "image_header_name": "image_path",
+        "processor": processor,
     }
-    with patch('PIL.Image.open', return_value=MagicMock(spec=Image.Image)):
+    with patch("PIL.Image.open", return_value=MagicMock(spec=Image.Image)):
         output = preprocess_function_seq2seq_vqa(examples, **kwargs)
-        assert 'header_input_ids' in output
-        assert 'ending_input_ids' in output
-        assert 'images' in output
-        assert len(output['images']) == len(examples['question'])
-        for img_list in output['images']:
-            assert len(img_list) == len(kwargs['ending_names'])
+        assert "header_input_ids" in output
+        assert "ending_input_ids" in output
+        assert "images" in output
+        assert len(output["images"]) == len(examples["question"])
+        for img_list in output["images"]:
+            assert len(img_list) == len(kwargs["ending_names"])
+
 
 def test_create_multiple_choice_prompt(sample_args):
     example = {
-        'premise': 'What is the capital of France?',
-        'uncond_premise': 'The answer is:',
-        'hypothesis0': 'Paris',
-        'hypothesis1': 'London',
-        'hypothesis2': 'Berlin',
-        'mask': [1, 0, 1]
+        "premise": "What is the capital of France?",
+        "uncond_premise": "The answer is:",
+        "hypothesis0": "Paris",
+        "hypothesis1": "London",
+        "hypothesis2": "Berlin",
+        "mask": [1, 0, 1],
     }
     kwargs = {
-        'multiple_choice_prompt': 'Please choose the correct answer:',
-        'scoring_method': 'other_method',
-        'num_of_options': 3,
-        'mask_token': None
+        "multiple_choice_prompt": "Please choose the correct answer:",
+        "scoring_method": "other_method",
+        "num_of_options": 3,
+        "mask_token": None,
     }
     output = create_multiple_choice_prompt(example, **kwargs)
-    expected_premise = 'Please choose the correct answer:\n Question: What is the capital of France?\nA. Paris\nB. [MASK]\nC. Berlin\nAnswer:'
-    assert output['premise'] == expected_premise
-    kwargs['scoring_method'] = 'multiple_choice_prompt'
-    example['premise'] = ' Question: What is the capital of France?\nA. Paris\nB. London\nC. Berlin\nAnswer:'
+    expected_premise = "Please choose the correct answer:\n Question: What is the capital of France?\nA. Paris\nB. [MASK]\nC. Berlin\nAnswer:"
+    assert output["premise"] == expected_premise
+    kwargs["scoring_method"] = "multiple_choice_prompt"
+    example["premise"] = (
+        " Question: What is the capital of France?\nA. Paris\nB. London\nC. Berlin\nAnswer:"
+    )
     output = create_multiple_choice_prompt(example, **kwargs)
-    assert output['premise'] == expected_premise
+    assert output["premise"] == expected_premise
+
 
 def test_create_synonym_dataset(sample_args):
     examples = {
-        'hypothesis0': ['Paris', 'London'],
-        'hypothesis1': ['Berlin', 'Madrid'],
+        "hypothesis0": ["Paris", "London"],
+        "hypothesis1": ["Berlin", "Madrid"],
     }
     kwargs = {
-        'args': sample_args,
-        'synonyms_dict': {'Paris': ['Paris1', 'Paris2'], 'London': ['London1', 'London2'], 'Berlin': ['Berlin1', 'Berlin2'], 'Madrid': ['Madrid1', 'Madrid2']}
+        "args": sample_args,
+        "synonyms_dict": {
+            "Paris": ["Paris1", "Paris2"],
+            "London": ["London1", "London2"],
+            "Berlin": ["Berlin1", "Berlin2"],
+            "Madrid": ["Madrid1", "Madrid2"],
+        },
     }
     output = create_synonym_dataset(examples, **kwargs)
-    for hypothesis in ['hypothesis0', 'hypothesis1']:
+    for hypothesis in ["hypothesis0", "hypothesis1"]:
         for i in range(sample_args.number_of_synonyms):
             key = f"{hypothesis}_synonyms_{i}"
             assert key in output
             assert len(output[key]) == len(examples[hypothesis])
 
+
 def test_copa_loader(sample_args):
     args = sample_args
     args.multiple_choice_prompt = None
-    xml_content = '''<root>
+    xml_content = """<root>
     <item most-plausible-alternative="1" asks-for="effect">
         <p>It started to rain.</p>
         <a1>I opened my umbrella.</a1>
         <a2>I wore sunglasses.</a2>
     </item>
-    </root>'''
-    with patch('xml.etree.ElementTree.parse') as mock_parse:
+    </root>"""
+    with patch("xml.etree.ElementTree.parse") as mock_parse:
         mock_tree = ET.ElementTree(ET.fromstring(xml_content))
         mock_parse.return_value = mock_tree
-        examples = copa_loader('dummy_path.xml', args)
+        examples = copa_loader("dummy_path.xml", args)
         assert len(examples) == 1
-        assert examples[0]['label'] == 0
-        assert examples[0]['premise'] == ' It started to rain so'
-        assert examples[0]['hypothesis0'] == ' i opened my umbrella.'
-        assert examples[0]['hypothesis1'] == ' i wore sunglasses.'
+        assert examples[0]["label"] == 0
+        assert examples[0]["premise"] == " It started to rain so"
+        assert examples[0]["hypothesis0"] == " i opened my umbrella."
+        assert examples[0]["hypothesis1"] == " i wore sunglasses."
+
 
 def test_copa_loader_assert(sample_args):
     args = sample_args
-    xml_content = '''<root>
+    xml_content = """<root>
     <item most-plausible-alternative="1" asks-for="unknown">
         <p>It started to rain.</p>
         <a1>I opened my umbrella.</a1>
         <a2>I wore sunglasses.</a2>
     </item>
-    </root>'''
+    </root>"""
     with pytest.raises(AssertionError):
-        with patch('xml.etree.ElementTree.parse') as mock_parse:
+        with patch("xml.etree.ElementTree.parse") as mock_parse:
             mock_tree = ET.ElementTree(ET.fromstring(xml_content))
             mock_parse.return_value = mock_tree
-            examples = copa_loader('dummy_path.xml', args)
+            examples = copa_loader("dummy_path.xml", args)
+
 
 def test_cqa_loader(sample_args):
     args = sample_args
-    args.multiple_choice_prompt = 'Answer the following question:'
+    args.multiple_choice_prompt = "Answer the following question:"
     # Adjust the stem to end with a period to match the processing in cqa_loader
-    json_line = json.dumps({
-        'answerKey': 'A',
-        'question': {
-            'stem': 'What is the capital of France.',
-            'choices': [
-                {'text': 'Paris'}, {'text': 'London'}, {'text': 'Berlin'}, {'text': 'Madrid'}, {'text': 'Rome'}
-            ]
+    json_line = json.dumps(
+        {
+            "answerKey": "A",
+            "question": {
+                "stem": "What is the capital of France.",
+                "choices": [
+                    {"text": "Paris"},
+                    {"text": "London"},
+                    {"text": "Berlin"},
+                    {"text": "Madrid"},
+                    {"text": "Rome"},
+                ],
+            },
         }
-    })
-    with patch('builtins.open', mock.mock_open(read_data=json_line)):
-        examples = cqa_loader('dummy_path.jsonl', args)
+    )
+    with patch("builtins.open", mock.mock_open(read_data=json_line)):
+        examples = cqa_loader("dummy_path.jsonl", args)
         assert len(examples) == 1
-        assert examples[0]['label'] == 0
-        assert 'Answer the following question: Question:  What is the capital of France?' in examples[0]['premise']
+        assert examples[0]["label"] == 0
+        assert (
+            "Answer the following question: Question:  What is the capital of France?"
+            in examples[0]["premise"]
+        )
+
 
 def test_obqa_loader(sample_args):
     args = sample_args
-    args.multiple_choice_prompt = 'Answer the following question:'
+    args.multiple_choice_prompt = "Answer the following question:"
     # Adjust the stem to end with a period to match the processing in cqa_loader
-    json_line = json.dumps({
-        'answerKey': 'A',
-        'question': {
-            'stem': 'What is the capital of France.',
-            'choices': [
-                {'text': 'Paris', 'label': 0}, {'text': 'London', 'label': 0}, {'text': 'Berlin', 'label': 1}, {'text': 'Madrid', 'label': 0}, {'text': 'Rome', 'label': 0}
-            ]
+    json_line = json.dumps(
+        {
+            "answerKey": "A",
+            "question": {
+                "stem": "What is the capital of France.",
+                "choices": [
+                    {"text": "Paris", "label": 0},
+                    {"text": "London", "label": 0},
+                    {"text": "Berlin", "label": 1},
+                    {"text": "Madrid", "label": 0},
+                    {"text": "Rome", "label": 0},
+                ],
+            },
         }
-    })
-    with patch('builtins.open', mock.mock_open(read_data=json_line)):
-        examples = obqa_loader('dummy_path.jsonl', args)
+    )
+    with patch("builtins.open", mock.mock_open(read_data=json_line)):
+        examples = obqa_loader("dummy_path.jsonl", args)
         assert len(examples) == 1
-        assert examples[0]['label'] == 0
-        assert 'Answer the following question: Question: What is the capital of France' in examples[0]['premise']
+        assert examples[0]["label"] == 0
+        assert (
+            "Answer the following question: Question: What is the capital of France"
+            in examples[0]["premise"]
+        )
+
 
 def test_generate_n_shot_demonstrations(sample_args):
     n_shot_dataset = [
-        {'premise': 'Question 1', 'label': torch.tensor(0), 'hypothesis0': 'A1', 'hypothesis1': 'B1'},
-        {'premise': 'Question 2', 'label': torch.tensor(1), 'hypothesis0': 'A2', 'hypothesis1': 'B2'}
+        {
+            "premise": "Question 1",
+            "label": torch.tensor(0),
+            "hypothesis0": "A1",
+            "hypothesis1": "B1",
+        },
+        {
+            "premise": "Question 2",
+            "label": torch.tensor(1),
+            "hypothesis0": "A2",
+            "hypothesis1": "B2",
+        },
     ]
     output = generate_n_shot_demonstrations(n_shot_dataset)
-    expected_output = 'Question 1A1\n\nQuestion 2B2\n\n'
+    expected_output = "Question 1A1\n\nQuestion 2B2\n\n"
     assert output == expected_output
 
+
 def test_create_n_shot_splits(sample_args):
     args = sample_args
     args.n_shot = 1
@@ -309,128 +367,168 @@ def test_create_n_shot_splits(sample_args):
     raw_dataset.shuffle.return_value.select.return_value = raw_dataset
     raw_dataset.map.return_value = raw_dataset
     # Adjust the patch path to match the module structure
-    with patch('mm_poe.methods.utils.data.generate_n_shot_demonstrations', return_value='Demo') as mock_generate:
-        output_dataset, output_n_shot_dataset, n_shot_demonstrations = create_n_shot_splits(raw_dataset, n_shot_dataset, args)
-        assert n_shot_demonstrations == 'Demo'
+    with patch(
+        "mm_poe.methods.utils.data.generate_n_shot_demonstrations",
+        return_value="Demo",
+    ) as mock_generate:
+        output_dataset, output_n_shot_dataset, n_shot_demonstrations = (
+            create_n_shot_splits(raw_dataset, n_shot_dataset, args)
+        )
+        assert n_shot_demonstrations == "Demo"
         raw_dataset.map.assert_called_once()
 
+
 def test_single_inference_loader(sample_args):
     args = sample_args
-    path = 'path/to/image.jpg'
+    path = "path/to/image.jpg"
     examples = single_inference_loader(path, args)
     assert len(examples) == 1
-    assert examples[0]['image_path'] == path
-    assert examples[0]['premise'].startswith(args.multiple_choice_prompt)
+    assert examples[0]["image_path"] == path
+    assert examples[0]["premise"].startswith(args.multiple_choice_prompt)
+
 
 def test_anli_loader(sample_args):
     args = sample_args
     args.multiple_choice_prompt = None
-    json_line = json.dumps({
-        'context': 'A man is playing a piano.',
-        'hypothesis': 'The man is playing a musical instrument.',
-        'label': 'e'
-    })
-    with patch('builtins.open', mock.mock_open(read_data=json_line)):
-        examples = anli_loader(['dummy_path.jsonl'], args)
+    json_line = json.dumps(
+        {
+            "context": "A man is playing a piano.",
+            "hypothesis": "The man is playing a musical instrument.",
+            "label": "e",
+        }
+    )
+    with patch("builtins.open", mock.mock_open(read_data=json_line)):
+        examples = anli_loader(["dummy_path.jsonl"], args)
         assert len(examples) == 1
-        assert examples[0]['label'] == 0
-        assert 'A man is playing a piano. The man is playing a musical instrument.' in examples[0]['premise']
+        assert examples[0]["label"] == 0
+        assert (
+            "A man is playing a piano. The man is playing a musical instrument."
+            in examples[0]["premise"]
+        )
+
 
 def test_generate_n_shot_poe_demonstrations(sample_args):
     n_shot_dataset = [
-        {'premise': 'Question 1', 'label': torch.tensor(0), 'hypothesis0': 'A1', 'hypothesis1': 'B1'},
-        {'premise': 'Question 2', 'label': torch.tensor(1), 'hypothesis0': 'A2', 'hypothesis1': 'B2'}
+        {
+            "premise": "Question 1",
+            "label": torch.tensor(0),
+            "hypothesis0": "A1",
+            "hypothesis1": "B1",
+        },
+        {
+            "premise": "Question 2",
+            "label": torch.tensor(1),
+            "hypothesis0": "A2",
+            "hypothesis1": "B2",
+        },
     ]
     num_of_options = 2
-    output, poe_output = generate_n_shot_poe_demonstrations(n_shot_dataset, num_of_options)
+    output, poe_output = generate_n_shot_poe_demonstrations(
+        n_shot_dataset, num_of_options
+    )
     assert isinstance(output, str)
     assert isinstance(poe_output, str)
 
+
 def test_preprocess_function_seq2seq_channel(sample_args):
     examples = {
-        'question': ['What is 2+2?'],
-        'choice0': ['3'],
-        'choice1': ['4'],
+        "question": ["What is 2+2?"],
+        "choice0": ["3"],
+        "choice1": ["4"],
     }
     tokenizer = MagicMock()
     tokenizer.return_value = {
-        'input_ids': [[1,2], [3,4]],
-        'attention_mask': [[1,1], [1,1]]
+        "input_ids": [[1, 2], [3, 4]],
+        "attention_mask": [[1, 1], [1, 1]],
     }
     kwargs = {
-        'ending_names': ['choice0', 'choice1'],
-        'header_name': 'question',
-        'tokenizer': tokenizer
+        "ending_names": ["choice0", "choice1"],
+        "header_name": "question",
+        "tokenizer": tokenizer,
     }
     output = preprocess_function_seq2seq_channel(examples, **kwargs)
-    assert 'header_input_ids' in output
-    assert 'ending_input_ids' in output
+    assert "header_input_ids" in output
+    assert "ending_input_ids" in output
+
 
 def test_preprocess_function_causal_channel(sample_args):
     examples = {
-        'question': ['What is 2+2?'],
-        'choice0': ['3'],
-        'choice1': ['4'],
+        "question": ["What is 2+2?"],
+        "choice0": ["3"],
+        "choice1": ["4"],
     }
     tokenizer = MagicMock()
     tokenizer.pad_token_id = 0
     # Adjust the tokenizer to return lists of lists
     tokenizer.return_value = {
-        'input_ids': [[1,2,3], [4,5,6]],
-        'attention_mask': [[1,1,1], [1,1,1]]
+        "input_ids": [[1, 2, 3], [4, 5, 6]],
+        "attention_mask": [[1, 1, 1], [1, 1, 1]],
     }
     kwargs = {
-        'ending_names': ['choice0', 'choice1'],
-        'header_name': 'question',
-        'tokenizer': tokenizer
+        "ending_names": ["choice0", "choice1"],
+        "header_name": "question",
+        "tokenizer": tokenizer,
     }
     output = preprocess_function_causal_channel(examples, **kwargs)
-    assert 'input_ids' in output
-    assert 'labels' in output
+    assert "input_ids" in output
+    assert "labels" in output
+
 
 def test_vqa_loader(sample_args):
     args = sample_args
     args.num_options = 2
     ann_content = {
-        'annotations': [
-            {'multiple_choice_answer': 'cat', 'image_id': 123}
-        ]
+        "annotations": [{"multiple_choice_answer": "cat", "image_id": 123}]
     }
     ques_content = {
-        'questions': [
-            {'question': 'What animal is this?', 'multiple_choices': ['cat', 'dog']}
+        "questions": [
+            {
+                "question": "What animal is this?",
+                "multiple_choices": ["cat", "dog"],
+            }
         ]
     }
-    with patch('json.load', side_effect=[ann_content, ques_content]):
-        with patch('os.path.join', return_value='path/to/image.jpg'):
-            with patch('builtins.open', mock.mock_open()) as mock_file:
+    with patch("json.load", side_effect=[ann_content, ques_content]):
+        with patch("os.path.join", return_value="path/to/image.jpg"):
+            with patch("builtins.open", mock.mock_open()) as mock_file:
                 # Mock the open calls for annotation and question files
-                mock_file.side_effect = [mock.mock_open(read_data=json.dumps(ann_content)).return_value,
-                                         mock.mock_open(read_data=json.dumps(ques_content)).return_value]
-                examples = vqa_loader('dummy_path', args)
+                mock_file.side_effect = [
+                    mock.mock_open(
+                        read_data=json.dumps(ann_content)
+                    ).return_value,
+                    mock.mock_open(
+                        read_data=json.dumps(ques_content)
+                    ).return_value,
+                ]
+                examples = vqa_loader("dummy_path", args)
                 assert len(examples) == 1
-                assert examples[0]['label'] == 0
-                assert examples[0]['image_path'] == 'path/to/image.jpg'
+                assert examples[0]["label"] == 0
+                assert examples[0]["image_path"] == "path/to/image.jpg"
+
 
 def test_scienceqa_loader(sample_args):
     args = sample_args
     args.num_options = 4
     ann_content = {
-        '1': {
-            'question': 'What is H2O?',
-            'choices': ['Water', 'Oxygen', 'Hydrogen', 'Helium'],
-            'answer': '0',
-            'image': 'image1.jpg'
+        "1": {
+            "question": "What is H2O?",
+            "choices": ["Water", "Oxygen", "Hydrogen", "Helium"],
+            "answer": "0",
+            "image": "image1.jpg",
         }
     }
-    with patch('json.load', return_value=ann_content):
-        with patch('os.listdir', return_value=['1']):
-            with patch('os.path.join', return_value='path/to/image.jpg'):
-                with patch('builtins.open', mock.mock_open(read_data=json.dumps(ann_content))):
-                    examples = scienceqa_loader('dummy_path', args)
+    with patch("json.load", return_value=ann_content):
+        with patch("os.listdir", return_value=["1"]):
+            with patch("os.path.join", return_value="path/to/image.jpg"):
+                with patch(
+                    "builtins.open",
+                    mock.mock_open(read_data=json.dumps(ann_content)),
+                ):
+                    examples = scienceqa_loader("dummy_path", args)
                     assert len(examples) == 1
-                    assert examples[0]['label'] == 0
-                    assert examples[0]['image_path'] == 'path/to/image.jpg'
+                    assert examples[0]["label"] == 0
+                    assert examples[0]["image_path"] == "path/to/image.jpg"
+
 
 def test_ai2d_loader(sample_args):
     args = sample_args
@@ -438,228 +536,284 @@ def test_ai2d_loader(sample_args):
     question_content = {
         "questions": {
             "What is this?": {
-                'answerTexts': ['Cat', 'Dog', 'Mouse'],
-                'correctAnswer': '1',
-                'abcLabel': False
+                "answerTexts": ["Cat", "Dog", "Mouse"],
+                "correctAnswer": "1",
+                "abcLabel": False,
             }
         },
-        "imageName": "image1.jpg"
-    }
-    with patch('os.listdir', return_value=['file1.json']):
-        with patch('json.load', return_value=question_content):
-            with patch('os.path.join', side_effect=lambda *args: 'path/to/' + '/'.join(args[-2:])):
-                with patch('builtins.open', mock.mock_open(read_data=json.dumps(question_content))):
-                    examples = ai2d_loader('dummy_path', args)
+        "imageName": "image1.jpg",
+    }
+    with patch("os.listdir", return_value=["file1.json"]):
+        with patch("json.load", return_value=question_content):
+            with patch(
+                "os.path.join",
+                side_effect=lambda *args: "path/to/" + "/".join(args[-2:]),
+            ):
+                with patch(
+                    "builtins.open",
+                    mock.mock_open(read_data=json.dumps(question_content)),
+                ):
+                    examples = ai2d_loader("dummy_path", args)
                     assert len(examples) == 1
-                    assert examples[0]['label'] == 1
-                    assert examples[0]['image_path'] == 'path/to/dummy_path/ai2d/images/image1.jpg'
+                    assert examples[0]["label"] == 1
+                    assert (
+                        examples[0]["image_path"]
+                        == "path/to/dummy_path/ai2d/images/image1.jpg"
+                    )
+
 
 # New test functions to increase coverage
 
+
 def test_preprocess_function_causal_vqa(sample_args):
     examples = {
-        'question': ['What is shown in the image?'],
-        'choice0': ['Cat'],
-        'choice1': ['Dog'],
-        'image_path': ['path/to/image1.jpg']
+        "question": ["What is shown in the image?"],
+        "choice0": ["Cat"],
+        "choice1": ["Dog"],
+        "image_path": ["path/to/image1.jpg"],
     }
     processor = MagicMock()
     tokenizer = MagicMock()
     tokenizer.pad_token_id = 0
-    tokenizer.padding_side = 'right'
+    tokenizer.padding_side = "right"
     # Adjust the tokenizer to return lists of lists
     tokenizer.return_value = {
-        'input_ids': [[1,2], [3,4]],
-        'attention_mask': [[1,1], [1,1]]
+        "input_ids": [[1, 2], [3, 4]],
+        "attention_mask": [[1, 1], [1, 1]],
     }
     processor.tokenizer = tokenizer
     data_obj = MagicMock()
     data_obj.data = {
-        'pixel_values': torch.tensor([[[1,2],[3,4]]] * 2)  # Repeat to match the number of choices
+        "pixel_values": torch.tensor(
+            [[[1, 2], [3, 4]]] * 2
+        )  # Repeat to match the number of choices
     }
     processor.image_processor.return_value = data_obj
     kwargs = {
-        'ending_names': ['choice0', 'choice1'],
-        'header_name': 'question',
-        'image_header_name': 'image_path',
-        'processor': processor
+        "ending_names": ["choice0", "choice1"],
+        "header_name": "question",
+        "image_header_name": "image_path",
+        "processor": processor,
     }
-    with patch('PIL.Image.open', return_value=MagicMock(spec=Image.Image)):
+    with patch("PIL.Image.open", return_value=MagicMock(spec=Image.Image)):
         output = preprocess_function_causal_vqa(examples, **kwargs)
-        assert 'input_ids' in output
-        assert 'labels' in output
-        assert 'header_attention_mask' in output
-        assert 'ending_attention_mask' in output
-        assert 'images' in output
+        assert "input_ids" in output
+        assert "labels" in output
+        assert "header_attention_mask" in output
+        assert "ending_attention_mask" in output
+        assert "images" in output
+
 
 def test_preprocess_function_seq2seq_vqa_channel(sample_args):
     examples = {
-        'question': ['What is shown in the image?'],
-        'choice0': ['Cat'],
-        'choice1': ['Dog'],
-        'image_path': ['path/to/image1.jpg']
+        "question": ["What is shown in the image?"],
+        "choice0": ["Cat"],
+        "choice1": ["Dog"],
+        "image_path": ["path/to/image1.jpg"],
     }
     processor = MagicMock()
     tokenizer = MagicMock()
     tokenizer.return_value = {
-        'input_ids': [[1,2], [3,4]], 
-        'attention_mask': [[1,1], [1,1]]
+        "input_ids": [[1, 2], [3, 4]],
+        "attention_mask": [[1, 1], [1, 1]],
     }
     processor.tokenizer = tokenizer
     data_obj = MagicMock()
     data_obj.data = {
-        'pixel_values': torch.tensor([[[1,2],[3,4]]] * 2)  # Repeat to match the number of choices
+        "pixel_values": torch.tensor(
+            [[[1, 2], [3, 4]]] * 2
+        )  # Repeat to match the number of choices
     }
     processor.image_processor.return_value = data_obj
     kwargs = {
-        'ending_names': ['choice0', 'choice1'],
-        'header_name': 'question',
-        'image_header_name': 'image_path',
-        'processor': processor
+        "ending_names": ["choice0", "choice1"],
+        "header_name": "question",
+        "image_header_name": "image_path",
+        "processor": processor,
     }
-    with patch('PIL.Image.open', return_value=MagicMock(spec=Image.Image)):
+    with patch("PIL.Image.open", return_value=MagicMock(spec=Image.Image)):
         output = preprocess_function_seq2seq_vqa_channel(examples, **kwargs)
-        assert 'header_input_ids' in output
-        assert 'ending_input_ids' in output
-        assert 'images' in output
-        assert len(output['images']) == len(examples['question'])
-        for img_list in output['images']:
-            assert len(img_list) == len(kwargs['ending_names'])
+        assert "header_input_ids" in output
+        assert "ending_input_ids" in output
+        assert "images" in output
+        assert len(output["images"]) == len(examples["question"])
+        for img_list in output["images"]:
+            assert len(img_list) == len(kwargs["ending_names"])
+
 
 def test_preprocess_function_causal_vqa_channel(sample_args):
     examples = {
-        'question': ['What is shown in the image?'],
-        'hypothesis0': ['Cat'],
-        'hypothesis1': ['Dog'],
-        'image_path': ['path/to/image1.jpg']
+        "question": ["What is shown in the image?"],
+        "hypothesis0": ["Cat"],
+        "hypothesis1": ["Dog"],
+        "image_path": ["path/to/image1.jpg"],
     }
     processor = MagicMock()
     tokenizer = MagicMock()
     tokenizer.pad_token_id = 0
-    tokenizer.padding_side = 'right'
+    tokenizer.padding_side = "right"
     tokenizer.return_value = {
-        'input_ids': [[1,2], [3,4]],
-        'attention_mask': [[1,1], [1,1]]
+        "input_ids": [[1, 2], [3, 4]],
+        "attention_mask": [[1, 1], [1, 1]],
     }
     processor.tokenizer = tokenizer
     data_obj = MagicMock()
-    data_obj.data = {
-        'pixel_values': torch.tensor([[[1,2],[3,4]]] * 2)
-    }
+    data_obj.data = {"pixel_values": torch.tensor([[[1, 2], [3, 4]]] * 2)}
     processor.image_processor.return_value = data_obj
     kwargs = {
-        'ending_names': ['hypothesis0', 'hypothesis1'],
-        'header_name': 'question',
-        'image_header_name': 'image_path',
-        'processor': processor
+        "ending_names": ["hypothesis0", "hypothesis1"],
+        "header_name": "question",
+        "image_header_name": "image_path",
+        "processor": processor,
     }
-    with patch('PIL.Image.open', return_value=MagicMock(spec=Image.Image)):
+    with patch("PIL.Image.open", return_value=MagicMock(spec=Image.Image)):
         output = preprocess_function_causal_vqa_channel(examples, **kwargs)
-        assert 'input_ids' in output
-        assert 'labels' in output
-        assert 'header_attention_mask' in output
-        assert 'ending_attention_mask' in output
-        assert 'images' in output
+        assert "input_ids" in output
+        assert "labels" in output
+        assert "header_attention_mask" in output
+        assert "ending_attention_mask" in output
+        assert "images" in output
+
 
 def test_piqa_loader(sample_args):
     args = sample_args
-    args.multiple_choice_prompt = 'Answer the following question:'
-    qa_content = json.dumps({
-        'goal': 'To open a jar, you should',
-        'sol1': 'Twist the lid counter-clockwise',
-        'sol2': 'Push the lid upwards'
-    })
-    label_content = '0\n'  # First solution is correct
-    with patch('builtins.open', mock.mock_open()) as mock_file:
-        mock_file.side_effect = [mock.mock_open(read_data=qa_content).return_value,
-                                    mock.mock_open(read_data=label_content).return_value]
-        examples = piqa_loader(['dummy_qa_path.jsonl', 'dummy_label_path.txt'], args)
+    args.multiple_choice_prompt = "Answer the following question:"
+    qa_content = json.dumps(
+        {
+            "goal": "To open a jar, you should",
+            "sol1": "Twist the lid counter-clockwise",
+            "sol2": "Push the lid upwards",
+        }
+    )
+    label_content = "0\n"  # First solution is correct
+    with patch("builtins.open", mock.mock_open()) as mock_file:
+        mock_file.side_effect = [
+            mock.mock_open(read_data=qa_content).return_value,
+            mock.mock_open(read_data=label_content).return_value,
+        ]
+        examples = piqa_loader(
+            ["dummy_qa_path.jsonl", "dummy_label_path.txt"], args
+        )
         assert len(examples) == 1
-        assert examples[0]['label'] == 0
-        assert 'Answer the following question: Question: To open a jar, you should' in examples[0]['premise']
+        assert examples[0]["label"] == 0
+        assert (
+            "Answer the following question: Question: To open a jar, you should"
+            in examples[0]["premise"]
+        )
+
 
 def test_qasc_loader(sample_args):
     args = sample_args
-    args.multiple_choice_prompt = 'Answer the following question:'
-    json_line = json.dumps({
-        'answerKey': 'B',
-        'question': {
-            'stem': 'What do plants need to perform photosynthesis?',
-            'choices': [
-                {'label': 'A', 'text': 'Oxygen'},
-                {'label': 'B', 'text': 'Sunlight'},
-                {'label': 'C', 'text': 'Nitrogen'},
-                {'label': 'D', 'text': 'Carbon dioxide'},
-                {'label': 'E', 'text': 'Water'},
-                {'label': 'F', 'text': 'Soil'},
-                {'label': 'G', 'text': 'Minerals'},
-                {'label': 'H', 'text': 'Glucose'}
-            ]
+    args.multiple_choice_prompt = "Answer the following question:"
+    json_line = json.dumps(
+        {
+            "answerKey": "B",
+            "question": {
+                "stem": "What do plants need to perform photosynthesis?",
+                "choices": [
+                    {"label": "A", "text": "Oxygen"},
+                    {"label": "B", "text": "Sunlight"},
+                    {"label": "C", "text": "Nitrogen"},
+                    {"label": "D", "text": "Carbon dioxide"},
+                    {"label": "E", "text": "Water"},
+                    {"label": "F", "text": "Soil"},
+                    {"label": "G", "text": "Minerals"},
+                    {"label": "H", "text": "Glucose"},
+                ],
+            },
         }
-    })
-    with patch('builtins.open', mock.mock_open(read_data=json_line)):
-        examples = qasc_loader('dummy_path.jsonl', args)
+    )
+    with patch("builtins.open", mock.mock_open(read_data=json_line)):
+        examples = qasc_loader("dummy_path.jsonl", args)
         assert len(examples) == 1
-        assert examples[0]['label'] == 1  # 'B' corresponds to index 1
-        assert 'Answer the following question: Question: What do plants need to perform photosynthesis?' in examples[0]['premise']
+        assert examples[0]["label"] == 1  # 'B' corresponds to index 1
+        assert (
+            "Answer the following question: Question: What do plants need to perform photosynthesis?"
+            in examples[0]["premise"]
+        )
+
 
 def test_siqa_loader(sample_args):
     args = sample_args
-    args.multiple_choice_prompt = 'Answer the following question:'
-    qa_content = json.dumps({
-        'context': 'Alex went to the store.',
-        'question': 'Why did Alex go to the store?',
-        'answerA': 'To buy groceries',
-        'answerB': 'To sell groceries',
-        'answerC': 'To sleep'
-    })
-    label_content = '1\n'  # Answer index is 1 (but labels are 1-based in siqa_loader, and subtract 1)
-    with patch('builtins.open', mock.mock_open()) as mock_file:
-        mock_file.side_effect = [mock.mock_open(read_data=qa_content).return_value,
-                                    mock.mock_open(read_data=label_content).return_value]
-        examples = siqa_loader(['dummy_qa_path.jsonl', 'dummy_label_path.txt'], args)
+    args.multiple_choice_prompt = "Answer the following question:"
+    qa_content = json.dumps(
+        {
+            "context": "Alex went to the store.",
+            "question": "Why did Alex go to the store?",
+            "answerA": "To buy groceries",
+            "answerB": "To sell groceries",
+            "answerC": "To sleep",
+        }
+    )
+    label_content = "1\n"  # Answer index is 1 (but labels are 1-based in siqa_loader, and subtract 1)
+    with patch("builtins.open", mock.mock_open()) as mock_file:
+        mock_file.side_effect = [
+            mock.mock_open(read_data=qa_content).return_value,
+            mock.mock_open(read_data=label_content).return_value,
+        ]
+        examples = siqa_loader(
+            ["dummy_qa_path.jsonl", "dummy_label_path.txt"], args
+        )
         assert len(examples) == 1
-        assert examples[0]['label'] == 0  # '1' in label file corresponds to index 0
-        assert 'Answer the following question: Question: Alex went to the store. Why did Alex go to the store?' in examples[0]['premise']
+        assert (
+            examples[0]["label"] == 0
+        )  # '1' in label file corresponds to index 0
+        assert (
+            "Answer the following question: Question: Alex went to the store. Why did Alex go to the store?"
+            in examples[0]["premise"]
+        )
+
 
 def test_winogrande_loader(sample_args):
     args = sample_args
-    args.multiple_choice_prompt = 'Answer the following question:'
-    qa_content = json.dumps({
-        'sentence': 'The trophy doesn\'t fit in the brown suitcase because it\'s too big.',
-        'option1': 'trophy',
-        'option2': 'suitcase'
-    })
-    label_content = '1\n'  # Correct answer is option1 (labels are 1-based)
-    with patch('builtins.open', mock.mock_open()) as mock_file:
-        mock_file.side_effect = [mock.mock_open(read_data=qa_content).return_value,
-                                    mock.mock_open(read_data=label_content).return_value]
-        examples = winogrande_loader(['dummy_qa_path.jsonl', 'dummy_label_path.txt'], args)
+    args.multiple_choice_prompt = "Answer the following question:"
+    qa_content = json.dumps(
+        {
+            "sentence": "The trophy doesn't fit in the brown suitcase because it's too big.",
+            "option1": "trophy",
+            "option2": "suitcase",
+        }
+    )
+    label_content = "1\n"  # Correct answer is option1 (labels are 1-based)
+    with patch("builtins.open", mock.mock_open()) as mock_file:
+        mock_file.side_effect = [
+            mock.mock_open(read_data=qa_content).return_value,
+            mock.mock_open(read_data=label_content).return_value,
+        ]
+        examples = winogrande_loader(
+            ["dummy_qa_path.jsonl", "dummy_label_path.txt"], args
+        )
         assert len(examples) == 1
-        assert examples[0]['label'] == 0  # '1' in label file corresponds to index 0
-        assert 'Answer the following question: Question: The trophy doesn\'t fit in the brown suitcase because it\'s too big.' in examples[0]['premise']
+        assert (
+            examples[0]["label"] == 0
+        )  # '1' in label file corresponds to index 0
+        assert (
+            "Answer the following question: Question: The trophy doesn't fit in the brown suitcase because it's too big."
+            in examples[0]["premise"]
+        )
+
 
 def test_date_understanding_loader(sample_args):
     args = sample_args
-    args.multiple_choice_prompt = 'Answer the following question:'
+    args.multiple_choice_prompt = "Answer the following question:"
     args.num_options = 2
     data_content = {
         "task_prefix": "",
         "examples": [
-            {
-                "input": "What is 2+2?",
-                "target_scores": {
-                    "4": 1,
-                    "5": 0
-                }
-            }
-        ]
-    }
-    with patch('json.load', return_value=data_content):
-        with patch('builtins.open', mock.mock_open(read_data=json.dumps(data_content))):
-            examples = date_understanding_loader(['dummy_path.json'], args)
+            {"input": "What is 2+2?", "target_scores": {"4": 1, "5": 0}}
+        ],
+    }
+    with patch("json.load", return_value=data_content):
+        with patch(
+            "builtins.open", mock.mock_open(read_data=json.dumps(data_content))
+        ):
+            examples = date_understanding_loader(["dummy_path.json"], args)
             assert len(examples) == 1
-            assert examples[0]['label'] == 0  # '4' is at index 0
-            assert 'Answer the following question: Question: What is 2+2?' in examples[0]['premise']
+            assert examples[0]["label"] == 0  # '4' is at index 0
+            assert (
+                "Answer the following question: Question: What is 2+2?"
+                in examples[0]["premise"]
+            )
+
 
 # Now this test_data.py covers only 60% of all tests when I run pytest coverage. You need to add more tests to this file. DO NOT CHANGE ANYTHING WRITTEN IN THIS FILE, NO CHANGE TO ANY OF THE CURRENT TESTS, ALL OF THEM ARE WORKING. ADD NEW TESTS TO THIS FILE TO GET 100% COVERAGE
diff --git a/tests/methods/utils/test_methods.py b/tests/methods/utils/test_methods.py
index 442395f..20c9944 100644
--- a/tests/methods/utils/test_methods.py
+++ b/tests/methods/utils/test_methods.py
@@ -28,12 +28,15 @@
 # Mock tqdm to prevent actual progress bars during testing
 tqdm = lambda x, **kwargs: x
 
+
 # Define a simple mock model
 class SimpleMockModel(torch.nn.Module):
     def __init__(self):
         super(SimpleMockModel, self).__init__()
 
-    def forward(self, input_ids=None, labels=None, pixel_values=None, **kwargs):
+    def forward(
+        self, input_ids=None, labels=None, pixel_values=None, **kwargs
+    ):
         seq_len = input_ids.size(1)
         batch_size_num_options = labels.size(0)
         vocab_size = 32128
@@ -41,19 +44,23 @@ def forward(self, input_ids=None, labels=None, pixel_values=None, **kwargs):
         loss = torch.tensor(0.0)
         return MagicMock(loss=loss, logits=logits)
 
+
 # Fixtures for common test components
 @pytest.fixture
 def mock_model():
     return SimpleMockModel()
 
+
 @pytest.fixture
 def mock_amateur_model():
     return SimpleMockModel()
 
+
 @pytest.fixture
 def mock_expert_model():
     return SimpleMockModel()
 
+
 @pytest.fixture
 def sample_batch():
     batch_size = 2
@@ -61,17 +68,26 @@ def sample_batch():
     seq_len = 18
     vocab_size = 32128
     return {
-        "ending_input_ids": torch.randint(0, vocab_size, (batch_size, num_options, seq_len)),
-        "header_input_ids": torch.randint(0, vocab_size, (batch_size, seq_len)),
+        "ending_input_ids": torch.randint(
+            0, vocab_size, (batch_size, num_options, seq_len)
+        ),
+        "header_input_ids": torch.randint(
+            0, vocab_size, (batch_size, seq_len)
+        ),
         "label": torch.randint(0, num_options, (batch_size,)),
         "header_attention_mask": torch.ones(batch_size, seq_len),
         "ending_attention_mask": torch.ones(batch_size, num_options, seq_len),
-        "input_ids": torch.randint(0, vocab_size, (batch_size, num_options, seq_len)),
-        "labels": torch.randint(0, vocab_size, (batch_size, num_options, seq_len)),
+        "input_ids": torch.randint(
+            0, vocab_size, (batch_size, num_options, seq_len)
+        ),
+        "labels": torch.randint(
+            0, vocab_size, (batch_size, num_options, seq_len)
+        ),
         "mask": torch.ones(batch_size, num_options),
         "images": torch.randn(batch_size, 3, 224, 224),
     }
 
+
 @pytest.fixture
 def sample_synonym_batch():
     batch_size = 2
@@ -80,204 +96,350 @@ def sample_synonym_batch():
     seq_len = 18
     vocab_size = 32128
     return {
-        "ending_input_ids": torch.randint(0, vocab_size, (batch_size, num_options*(num_synonyms+1), seq_len)),
-        "header_input_ids": torch.randint(0, vocab_size, (batch_size, seq_len)),
-        "label": torch.randint(0, num_options*(num_synonyms+1), (batch_size,)),
+        "ending_input_ids": torch.randint(
+            0,
+            vocab_size,
+            (batch_size, num_options * (num_synonyms + 1), seq_len),
+        ),
+        "header_input_ids": torch.randint(
+            0, vocab_size, (batch_size, seq_len)
+        ),
+        "label": torch.randint(
+            0, num_options * (num_synonyms + 1), (batch_size,)
+        ),
         "header_attention_mask": torch.ones(batch_size, seq_len),
-        "ending_attention_mask": torch.ones(batch_size, num_options*(num_synonyms+1), seq_len),
-        "input_ids": torch.randint(0, vocab_size, (batch_size, num_options*(num_synonyms+1), seq_len)),
-        "labels": torch.randint(0, vocab_size, (batch_size, num_options*(num_synonyms+1), seq_len)),
-        "mask": torch.ones(batch_size, num_options*(num_synonyms+1)),
+        "ending_attention_mask": torch.ones(
+            batch_size, num_options * (num_synonyms + 1), seq_len
+        ),
+        "input_ids": torch.randint(
+            0,
+            vocab_size,
+            (batch_size, num_options * (num_synonyms + 1), seq_len),
+        ),
+        "labels": torch.randint(
+            0,
+            vocab_size,
+            (batch_size, num_options * (num_synonyms + 1), seq_len),
+        ),
+        "mask": torch.ones(batch_size, num_options * (num_synonyms + 1)),
         "images": torch.randn(batch_size, 3, 224, 224),
     }
 
+
 @pytest.fixture
 def device():
-    return 'cpu'
+    return "cpu"
+
 
 @pytest.fixture
 def pad_token_id():
     return 0
 
+
 # Tests for inference_language_modeling_old function
 def test_inference_language_modeling_old(mock_model, sample_batch, device):
     eval_dataloader = [sample_batch]
-    total_accuracy = inference_language_modeling_old(mock_model, eval_dataloader, device)
+    total_accuracy = inference_language_modeling_old(
+        mock_model, eval_dataloader, device
+    )
     assert isinstance(total_accuracy, float)
     assert 0.0 <= total_accuracy <= 1.0
 
+
 # Tests for inference_contrastive_decoding_old function
-def test_inference_contrastive_decoding_old(mock_amateur_model, mock_expert_model, sample_batch, device):
+def test_inference_contrastive_decoding_old(
+    mock_amateur_model, mock_expert_model, sample_batch, device
+):
     eval_dataloader = [sample_batch]
-    total_accuracy = inference_contrastive_decoding_old(mock_amateur_model, mock_expert_model, eval_dataloader, device)
+    total_accuracy = inference_contrastive_decoding_old(
+        mock_amateur_model, mock_expert_model, eval_dataloader, device
+    )
     assert isinstance(total_accuracy, float)
     assert 0.0 <= total_accuracy <= 1.0
 
+
 # Mock compute_func for inference_language_modeling
 def mock_compute_func(batch, model, device, pad_token_id):
     batch_size = batch["header_input_ids"].size(0)
     num_options = batch["ending_input_ids"].size(1)
     return torch.rand(batch_size, num_options)
 
+
 # Tests for inference_language_modeling function
-def test_inference_language_modeling(mock_model, sample_batch, device, pad_token_id):
+def test_inference_language_modeling(
+    mock_model, sample_batch, device, pad_token_id
+):
     eval_dataloader = [sample_batch]
-    avg_log_probs, lm_accuracy, avg_lm_accuracy, lm_predictions = inference_language_modeling(
-        mock_model, eval_dataloader, device, mock_compute_func, pad_token_id
+    avg_log_probs, lm_accuracy, avg_lm_accuracy, lm_predictions = (
+        inference_language_modeling(
+            mock_model,
+            eval_dataloader,
+            device,
+            mock_compute_func,
+            pad_token_id,
+        )
+    )
+    assert avg_log_probs.shape == (
+        sample_batch["label"].size(0),
+        sample_batch["ending_input_ids"].size(1),
     )
-    assert avg_log_probs.shape == (sample_batch["label"].size(0), sample_batch["ending_input_ids"].size(1))
     assert isinstance(lm_accuracy, float)
     assert isinstance(avg_lm_accuracy, float)
     assert lm_predictions.shape == (sample_batch["label"].size(0),)
 
+
 # Tests for inference_generate_synonyms function
-def test_inference_generate_synonyms(mock_model, sample_synonym_batch, device, pad_token_id):
+def test_inference_generate_synonyms(
+    mock_model, sample_synonym_batch, device, pad_token_id
+):
     num_of_options = 2
     num_of_synonyms = 2
+
     def mock_compute_func(batch, model, device, pad_token_id):
         batch_size = batch["header_input_ids"].size(0)
         total_options = batch["ending_input_ids"].size(1)
         return torch.rand(batch_size, total_options)
+
     eval_dataloader = [sample_synonym_batch]
-    avg_log_probs, lm_accuracy, avg_lm_accuracy, lm_predictions = inference_generate_synonyms(
-        mock_model, eval_dataloader, device, mock_compute_func, pad_token_id, num_of_options, num_of_synonyms
+    avg_log_probs, lm_accuracy, avg_lm_accuracy, lm_predictions = (
+        inference_generate_synonyms(
+            mock_model,
+            eval_dataloader,
+            device,
+            mock_compute_func,
+            pad_token_id,
+            num_of_options,
+            num_of_synonyms,
+        )
+    )
+    expected_shape = (
+        sample_synonym_batch["label"].size(0),
+        num_of_options * (num_of_synonyms + 1),
     )
-    expected_shape = (sample_synonym_batch["label"].size(0), num_of_options*(num_of_synonyms+1))
     assert avg_log_probs.shape == expected_shape
     assert isinstance(lm_accuracy, float)
     assert isinstance(avg_lm_accuracy, float)
     assert lm_predictions.shape == (sample_synonym_batch["label"].size(0),)
 
+
 # Tests for inference_calibration function
 def test_inference_calibration(mock_model, sample_batch, device, pad_token_id):
     eval_dataloader = [sample_batch]
     eval_calibration_dataloader = [sample_batch]
-    avg_log_probs, lm_accuracy, avg_lm_accuracy, lm_predictions = inference_calibration(
-        mock_model, eval_dataloader, eval_calibration_dataloader, device, mock_compute_func, pad_token_id
+    avg_log_probs, lm_accuracy, avg_lm_accuracy, lm_predictions = (
+        inference_calibration(
+            mock_model,
+            eval_dataloader,
+            eval_calibration_dataloader,
+            device,
+            mock_compute_func,
+            pad_token_id,
+        )
+    )
+    assert avg_log_probs.shape == (
+        sample_batch["label"].size(0),
+        sample_batch["ending_input_ids"].size(1),
     )
-    assert avg_log_probs.shape == (sample_batch["label"].size(0), sample_batch["ending_input_ids"].size(1))
     assert isinstance(lm_accuracy, float)
     assert isinstance(avg_lm_accuracy, float)
     assert lm_predictions.shape == (sample_batch["label"].size(0),)
 
+
 # Tests for compute_mask_process_of_elimination function
-@pytest.mark.parametrize("mask_strategy", ["lowest", "below_average", "lowest_iter", "min_k"])
+@pytest.mark.parametrize(
+    "mask_strategy", ["lowest", "below_average", "lowest_iter", "min_k"]
+)
 def test_compute_mask_process_of_elimination(mask_strategy):
-    avg_log_probs = torch.tensor([[0.1, 0.2, 0.3],
-                                  [0.3, 0.2, 0.1]])
+    avg_log_probs = torch.tensor([[0.1, 0.2, 0.3], [0.3, 0.2, 0.1]])
     if mask_strategy == "min_k":
         kwargs = {"min_k": 2}
     else:
         kwargs = {}
-    if mask_strategy not in ["lowest", "below_average", "lowest_iter", "min_k"]:
+    if mask_strategy not in [
+        "lowest",
+        "below_average",
+        "lowest_iter",
+        "min_k",
+    ]:
         with pytest.raises(NotImplementedError):
-            compute_mask_process_of_elimination(avg_log_probs, mask_strategy, **kwargs)
+            compute_mask_process_of_elimination(
+                avg_log_probs, mask_strategy, **kwargs
+            )
     else:
-        masks = compute_mask_process_of_elimination(avg_log_probs, mask_strategy, **kwargs)
+        masks = compute_mask_process_of_elimination(
+            avg_log_probs, mask_strategy, **kwargs
+        )
         assert masks.shape == avg_log_probs.shape
 
+
 # Tests for inference_process_of_elimination function
-def test_inference_process_of_elimination(mock_model, sample_batch, device, pad_token_id):
+def test_inference_process_of_elimination(
+    mock_model, sample_batch, device, pad_token_id
+):
     eval_dataloader = [sample_batch]
-    avg_log_probs, lm_accuracy, avg_lm_accuracy, lm_predictions = inference_process_of_elimination(
-        mock_model, eval_dataloader, device, mock_compute_func, pad_token_id
+    avg_log_probs, lm_accuracy, avg_lm_accuracy, lm_predictions = (
+        inference_process_of_elimination(
+            mock_model,
+            eval_dataloader,
+            device,
+            mock_compute_func,
+            pad_token_id,
+        )
+    )
+    assert avg_log_probs.shape == (
+        sample_batch["label"].size(0),
+        sample_batch["ending_input_ids"].size(1),
     )
-    assert avg_log_probs.shape == (sample_batch["label"].size(0), sample_batch["ending_input_ids"].size(1))
     assert isinstance(lm_accuracy, float)
     assert isinstance(avg_lm_accuracy, float)
     assert lm_predictions.shape == (sample_batch["label"].size(0),)
 
+
 # Tests for compute_conditional_score_seq2seq function
-def test_compute_conditional_score_seq2seq(mock_model, sample_batch, device, pad_token_id):
-    log_prob = compute_conditional_score_seq2seq(sample_batch, mock_model, device, pad_token_id)
-    assert log_prob.shape == (sample_batch["ending_input_ids"].shape[0], sample_batch["ending_input_ids"].shape[1])
+def test_compute_conditional_score_seq2seq(
+    mock_model, sample_batch, device, pad_token_id
+):
+    log_prob = compute_conditional_score_seq2seq(
+        sample_batch, mock_model, device, pad_token_id
+    )
+    assert log_prob.shape == (
+        sample_batch["ending_input_ids"].shape[0],
+        sample_batch["ending_input_ids"].shape[1],
+    )
+
 
 # Tests for compute_conditional_score_causal function
-def test_compute_conditional_score_causal(mock_model, sample_batch, device, pad_token_id):
-    log_prob = compute_conditional_score_causal(sample_batch, mock_model, device, pad_token_id)
-    assert log_prob.shape == (sample_batch["input_ids"].shape[0], sample_batch["input_ids"].shape[1])
+def test_compute_conditional_score_causal(
+    mock_model, sample_batch, device, pad_token_id
+):
+    log_prob = compute_conditional_score_causal(
+        sample_batch, mock_model, device, pad_token_id
+    )
+    assert log_prob.shape == (
+        sample_batch["input_ids"].shape[0],
+        sample_batch["input_ids"].shape[1],
+    )
+
 
 # Tests for compute_conditional_score_seq2seq_vqa function
-def test_compute_conditional_score_seq2seq_vqa(mock_model, sample_batch, device, pad_token_id):
-    log_prob = compute_conditional_score_seq2seq_vqa(sample_batch, mock_model, device, pad_token_id)
-    assert log_prob.shape == (sample_batch["ending_input_ids"].shape[0], sample_batch["ending_input_ids"].shape[1])
+def test_compute_conditional_score_seq2seq_vqa(
+    mock_model, sample_batch, device, pad_token_id
+):
+    log_prob = compute_conditional_score_seq2seq_vqa(
+        sample_batch, mock_model, device, pad_token_id
+    )
+    assert log_prob.shape == (
+        sample_batch["ending_input_ids"].shape[0],
+        sample_batch["ending_input_ids"].shape[1],
+    )
+
 
 # Tests for compute_conditional_score_causal_vqa function
-def test_compute_conditional_score_causal_vqa(mock_model, sample_batch, device, pad_token_id):
-    log_prob = compute_conditional_score_causal_vqa(sample_batch, mock_model, device, pad_token_id)
-    assert log_prob.shape == (sample_batch["input_ids"].shape[0], sample_batch["input_ids"].shape[1])
+def test_compute_conditional_score_causal_vqa(
+    mock_model, sample_batch, device, pad_token_id
+):
+    log_prob = compute_conditional_score_causal_vqa(
+        sample_batch, mock_model, device, pad_token_id
+    )
+    assert log_prob.shape == (
+        sample_batch["input_ids"].shape[0],
+        sample_batch["input_ids"].shape[1],
+    )
+
 
 # Tests for aggregate_optionw_with_synonyms function
 def test_aggregate_optionw_with_synonyms():
     batch_size = 2
     num_of_options = 5
     num_of_synonyms = 3
-    tensor = torch.arange(batch_size * num_of_options * (num_of_synonyms + 1)).view(batch_size, -1)
-    aggregated_tensor = aggregate_optionw_with_synonyms(tensor.clone(), num_of_options, num_of_synonyms)
+    tensor = torch.arange(
+        batch_size * num_of_options * (num_of_synonyms + 1)
+    ).view(batch_size, -1)
+    aggregated_tensor = aggregate_optionw_with_synonyms(
+        tensor.clone(), num_of_options, num_of_synonyms
+    )
     assert aggregated_tensor.shape == tensor.shape
 
+
 # Tests for generate_synonyms function
 def test_generate_synonyms():
     args = MagicMock()
     args.number_of_synonyms = 2
     args.generate_synonyms_prompt = "Generate a synonym to '{option}':"
     model = MagicMock()
-    model.device = 'cpu'
+    model.device = "cpu"
     tokenizer = MagicMock()
-    tokenizer.return_tensors = 'pt'
+    tokenizer.return_tensors = "pt"
     tokenizer.pad_token_id = 0
-    tokenizer.batch_decode.return_value = ['synonym1', 'synonym2']
+    tokenizer.batch_decode.return_value = ["synonym1", "synonym2"]
     tokenized_dataset = MagicMock()
-    tokenized_dataset.column_names = ['hypothesis1']
-    tokenized_dataset.__getitem__.return_value = {'hypothesis1': 'test_option'}
-    synonyms_dict = generate_synonyms(args, model, tokenizer, tokenized_dataset)
+    tokenized_dataset.column_names = ["hypothesis1"]
+    tokenized_dataset.__getitem__.return_value = {"hypothesis1": "test_option"}
+    synonyms_dict = generate_synonyms(
+        args, model, tokenizer, tokenized_dataset
+    )
     assert isinstance(synonyms_dict, dict)
 
+
 # Tests for inference_contrastive_decoding function
 def test_inference_contrastive_decoding():
-    method = 'language_modeling'
+    method = "language_modeling"
     model = MagicMock()
     args = MagicMock()
     args.batch_size = 2
-    args.model_family = 'other'
+    args.model_family = "other"
     raw_dataset = MagicMock()
-    device = 'cpu'
+    device = "cpu"
     compute_func = MagicMock()
     tokenizer = MagicMock()
     processor = MagicMock()
-    ending_names = ['ending1', 'ending2']
-    header_name = 'header'
-    image_header_name = 'image_header'
+    ending_names = ["ending1", "ending2"]
+    header_name = "header"
+    image_header_name = "image_header"
     preprocess_func = MagicMock()
     preprocess_func_channel = MagicMock()
     kwargs = {
-        'args': args,
-        'raw_dataset': raw_dataset,
-        'device': device,
-        'compute_func': compute_func,
-        'tokenizer': tokenizer,
-        'processor': processor,
-        'ending_names': ending_names,
-        'header_name': header_name,
-        'image_header_name': image_header_name,
-        'preprocess_func': preprocess_func,
-        'preprocess_func_channel': preprocess_func_channel,
+        "args": args,
+        "raw_dataset": raw_dataset,
+        "device": device,
+        "compute_func": compute_func,
+        "tokenizer": tokenizer,
+        "processor": processor,
+        "ending_names": ending_names,
+        "header_name": header_name,
+        "image_header_name": image_header_name,
+        "preprocess_func": preprocess_func,
+        "preprocess_func_channel": preprocess_func_channel,
     }
-    with patch('mm_poe.methods.utils.methods.inference_language_modeling', return_value=(None, 0.0, 0.0, None)) as mock_inference:
-        avg_log_probs, lm_accuracy, avg_lm_accuracy, lm_predictions = inference_contrastive_decoding(method, model, **kwargs)
+    with patch(
+        "mm_poe.methods.utils.methods.inference_language_modeling",
+        return_value=(None, 0.0, 0.0, None),
+    ) as mock_inference:
+        avg_log_probs, lm_accuracy, avg_lm_accuracy, lm_predictions = (
+            inference_contrastive_decoding(method, model, **kwargs)
+        )
         mock_inference.assert_called_once()
 
-    method = 'calibration'
-    with patch('mm_poe.methods.utils.methods.inference_calibration', return_value=(None, 0.0, 0.0, None)) as mock_inference_cal:
-        avg_log_probs, lm_accuracy, avg_lm_accuracy, lm_predictions = inference_contrastive_decoding(method, model, **kwargs)
+    method = "calibration"
+    with patch(
+        "mm_poe.methods.utils.methods.inference_calibration",
+        return_value=(None, 0.0, 0.0, None),
+    ) as mock_inference_cal:
+        avg_log_probs, lm_accuracy, avg_lm_accuracy, lm_predictions = (
+            inference_contrastive_decoding(method, model, **kwargs)
+        )
         mock_inference_cal.assert_called_once()
 
-    method = 'channel'
-    with patch('mm_poe.methods.utils.methods.inference_language_modeling', return_value=(None, 0.0, 0.0, None)) as mock_inference_channel:
-        avg_log_probs, lm_accuracy, avg_lm_accuracy, lm_predictions = inference_contrastive_decoding(method, model, **kwargs)
+    method = "channel"
+    with patch(
+        "mm_poe.methods.utils.methods.inference_language_modeling",
+        return_value=(None, 0.0, 0.0, None),
+    ) as mock_inference_channel:
+        avg_log_probs, lm_accuracy, avg_lm_accuracy, lm_predictions = (
+            inference_contrastive_decoding(method, model, **kwargs)
+        )
         mock_inference_channel.assert_called()
 
-    method = 'invalid_method'
+    method = "invalid_method"
     with pytest.raises(NotImplementedError):
         inference_contrastive_decoding(method, model, **kwargs)
-
diff --git a/tests/methods/utils/test_utils.py b/tests/methods/utils/test_utils.py
index eb94ef4..7547840 100644
--- a/tests/methods/utils/test_utils.py
+++ b/tests/methods/utils/test_utils.py
@@ -16,6 +16,7 @@
     write_to_csv,
 )
 
+
 # Test for set_seed function
 @pytest.mark.parametrize("seed", [0, 42, 1234])
 def test_set_seed(seed):
@@ -24,7 +25,7 @@ def test_set_seed(seed):
         set_seed(seed)
 
         # Check if os environment variable is set correctly
-        assert os.environ['PYTHONHASHSEED'] == str(seed)
+        assert os.environ["PYTHONHASHSEED"] == str(seed)
 
         # Check if random seeds are set correctly
         random_value = random.randint(0, 100)
@@ -40,52 +41,81 @@ def test_set_seed(seed):
         # Check if CUDA seeds were set correctly (mocked)
         mock_cuda_seed_all.assert_called_with(seed)
 
+
 # Tests for parse_args function
 def test_parse_args_with_required_arguments():
     test_args = [
         "script_name",
-        "--model_family", "GPT2",
-        "--checkpoint", "gpt2-medium",
-        "--datasets", "copa",
+        "--model_family",
+        "GPT2",
+        "--checkpoint",
+        "gpt2-medium",
+        "--datasets",
+        "copa",
     ]
-    with mock.patch.object(sys, 'argv', test_args):
+    with mock.patch.object(sys, "argv", test_args):
         args = parse_args()
         assert args.model_family == "GPT2"
         assert args.checkpoint == "gpt2-medium"
         assert args.datasets == "copa"
 
+
 def test_parse_args_with_all_arguments():
     test_args = [
         "script_name",
-        "--model_family", "GPT2",
-        "--checkpoint", "gpt2-medium",
-        "--datasets", "copa winogrande",
-        "--seed", "42",
-        "--amateur_checkpoint", "gpt2-small",
-        "--expert_method", "language_modeling",
-        "--amateur_method", "calibration",
-        "--weighting_parameter", "-0.5",
-        "--weighting_parameters", "0.1,0.2",
-        "--num_random_search", "5",
-        "--loading_precision", "FP16",
-        "--sample", "100",
-        "--batch_size", "16",
-        "--n_shot", "5",
-        "--multiple_choice_prompt", "Choose the best option:",
-        "--calibration_prompt", "This is a calibration prompt.",
+        "--model_family",
+        "GPT2",
+        "--checkpoint",
+        "gpt2-medium",
+        "--datasets",
+        "copa winogrande",
+        "--seed",
+        "42",
+        "--amateur_checkpoint",
+        "gpt2-small",
+        "--expert_method",
+        "language_modeling",
+        "--amateur_method",
+        "calibration",
+        "--weighting_parameter",
+        "-0.5",
+        "--weighting_parameters",
+        "0.1,0.2",
+        "--num_random_search",
+        "5",
+        "--loading_precision",
+        "FP16",
+        "--sample",
+        "100",
+        "--batch_size",
+        "16",
+        "--n_shot",
+        "5",
+        "--multiple_choice_prompt",
+        "Choose the best option:",
+        "--calibration_prompt",
+        "This is a calibration prompt.",
         "--do_channel",
-        "--process_of_elimination_prompt", "Eliminate incorrect options:",
-        "--scoring_method_for_process_of_elimination", "calibration",
-        "--prompting_method_for_process_of_elimination", "multiple_choice_prompt",
-        "--mask_strategy_for_process_of_elimination", "min_k",
+        "--process_of_elimination_prompt",
+        "Eliminate incorrect options:",
+        "--scoring_method_for_process_of_elimination",
+        "calibration",
+        "--prompting_method_for_process_of_elimination",
+        "multiple_choice_prompt",
+        "--mask_strategy_for_process_of_elimination",
+        "min_k",
         "--do_synonym",
-        "--number_of_synonyms", "3",
-        "--generate_synonyms_prompt", "Generate synonyms for option: option",
+        "--number_of_synonyms",
+        "3",
+        "--generate_synonyms_prompt",
+        "Generate synonyms for option: option",
         "--push_data_to_hub",
-        "--min_k", "2",
-        "--mask_token", "[MASK]",
+        "--min_k",
+        "2",
+        "--mask_token",
+        "[MASK]",
     ]
-    with mock.patch.object(sys, 'argv', test_args):
+    with mock.patch.object(sys, "argv", test_args):
         args = parse_args()
         assert args.seed == 42
         assert args.amateur_checkpoint == "gpt2-small"
@@ -101,60 +131,256 @@ def test_parse_args_with_all_arguments():
         assert args.multiple_choice_prompt == "Choose the best option:"
         assert args.calibration_prompt == "This is a calibration prompt."
         assert args.do_channel is True
-        assert args.process_of_elimination_prompt == "Eliminate incorrect options:"
+        assert (
+            args.process_of_elimination_prompt
+            == "Eliminate incorrect options:"
+        )
         assert args.scoring_method_for_process_of_elimination == "calibration"
-        assert args.prompting_method_for_process_of_elimination == "multiple_choice_prompt"
+        assert (
+            args.prompting_method_for_process_of_elimination
+            == "multiple_choice_prompt"
+        )
         assert args.mask_strategy_for_process_of_elimination == "min_k"
         assert args.do_synonym is True
         assert args.number_of_synonyms == 3
-        assert args.generate_synonyms_prompt == "Generate synonyms for option: option"
+        assert (
+            args.generate_synonyms_prompt
+            == "Generate synonyms for option: option"
+        )
         assert args.push_data_to_hub is True
         assert args.min_k == 2
         assert args.mask_token == "[MASK]"
 
+
 def test_parse_args_missing_required_arguments():
     test_args = [
         "script_name",
-        "--model_family", "GPT2",
+        "--model_family",
+        "GPT2",
         # "--checkpoint" is missing
-        "--datasets", "copa",
+        "--datasets",
+        "copa",
     ]
-    with mock.patch.object(sys, 'argv', test_args):
+    with mock.patch.object(sys, "argv", test_args):
         with pytest.raises(SystemExit):
             parse_args()
 
 
-
 # Tests for load_data function
-@pytest.mark.parametrize("dataset_name,loader_name,ending_names,header_name", [
-    ("copa", "copa_loader", ['hypothesis0', 'hypothesis1'], 'premise'),
-    ("cqa", "cqa_loader", ['hypothesis0', 'hypothesis1', 'hypothesis2', 'hypothesis3', 'hypothesis4'], 'premise'),
-    ("obqa", "obqa_loader", ['hypothesis0', 'hypothesis1', 'hypothesis2', 'hypothesis3'], 'premise'),
-    ("piqa", "piqa_loader", ['hypothesis0', 'hypothesis1'], 'premise'),
-    ("qasc", "qasc_loader", ['hypothesis0', 'hypothesis1', 'hypothesis2', 'hypothesis3', 'hypothesis4', 'hypothesis5', 'hypothesis6', 'hypothesis7'], 'premise'),
-    ("siqa", "siqa_loader", ['hypothesis0', 'hypothesis1', 'hypothesis2'], 'premise'),
-    ("winogrande", "winogrande_loader", ['hypothesis0', 'hypothesis1'], 'premise'),
-    ("anli", "anli_loader", ['hypothesis0', 'hypothesis1', 'hypothesis2'], 'premise'),
-    ("disambiguation_qa", "date_understanding_loader", ['hypothesis0', 'hypothesis1', 'hypothesis2'], 'premise'),
-    ("conceptual_combinations", "date_understanding_loader", ['hypothesis0', 'hypothesis1', 'hypothesis2', 'hypothesis3'], 'premise'),
-    ("date_understanding", "date_understanding_loader", ['hypothesis0', 'hypothesis1', 'hypothesis2', 'hypothesis3', 'hypothesis4', 'hypothesis5'], 'premise'),
-    ("emoji_movie", "date_understanding_loader", ['hypothesis0', 'hypothesis1', 'hypothesis2', 'hypothesis3', 'hypothesis4'], 'premise'),
-    ("ruin_names", "date_understanding_loader", ['hypothesis0', 'hypothesis1', 'hypothesis2', 'hypothesis3'], 'premise'),
-    ("penguins_in_a_table", "date_understanding_loader", ['hypothesis0', 'hypothesis1', 'hypothesis2', 'hypothesis3', 'hypothesis4'], 'premise'),
-    ("strange_stories", "date_understanding_loader", ['hypothesis0', 'hypothesis1', 'hypothesis2', 'hypothesis3'], 'premise'),
-    ("reasoning_about_colored_objects", "date_understanding_loader", [f"hypothesis{i}" for i in range(18)], 'premise'),
-    ("symbol_interpretation", "date_understanding_loader", ['hypothesis0', 'hypothesis1', 'hypothesis2', 'hypothesis3', 'hypothesis4'], 'premise'),
-    ("tracking_shuffled_objects", "date_understanding_loader", ['hypothesis0', 'hypothesis1', 'hypothesis2', 'hypothesis3', 'hypothesis4'], 'premise'),
-    ("logical_deduction_three_objects", "date_understanding_loader", ['hypothesis0', 'hypothesis1', 'hypothesis2'], 'premise'),
-    ("logical_deduction_five_objects", "date_understanding_loader", ['hypothesis0', 'hypothesis1', 'hypothesis2', 'hypothesis3', 'hypothesis4'], 'premise'),
-    ("logical_deduction_seven_objects", "date_understanding_loader", ['hypothesis0', 'hypothesis1', 'hypothesis2', 'hypothesis3', 'hypothesis4', 'hypothesis5', 'hypothesis6'], 'premise'),
-    ("anli_r1", "anli_loader", ['hypothesis0', 'hypothesis1', 'hypothesis2'], 'premise'),
-    ("vqa", "vqa_loader", [f"hypothesis{i}" for i in range(18)], 'premise'),
-    ("scienceqa", "scienceqa_loader", [f"hypothesis{i}" for i in range(4)], 'premise'),
-    ("ai2d", "ai2d_loader", [f"hypothesis{i}" for i in range(4)], 'premise'),
-    ("single_inference", "single_inference_loader", [f"hypothesis{i}" for i in range(4)], 'premise'),
-])
-def test_load_data_datasets(dataset_name, loader_name, ending_names, header_name):
+@pytest.mark.parametrize(
+    "dataset_name,loader_name,ending_names,header_name",
+    [
+        ("copa", "copa_loader", ["hypothesis0", "hypothesis1"], "premise"),
+        (
+            "cqa",
+            "cqa_loader",
+            [
+                "hypothesis0",
+                "hypothesis1",
+                "hypothesis2",
+                "hypothesis3",
+                "hypothesis4",
+            ],
+            "premise",
+        ),
+        (
+            "obqa",
+            "obqa_loader",
+            ["hypothesis0", "hypothesis1", "hypothesis2", "hypothesis3"],
+            "premise",
+        ),
+        ("piqa", "piqa_loader", ["hypothesis0", "hypothesis1"], "premise"),
+        (
+            "qasc",
+            "qasc_loader",
+            [
+                "hypothesis0",
+                "hypothesis1",
+                "hypothesis2",
+                "hypothesis3",
+                "hypothesis4",
+                "hypothesis5",
+                "hypothesis6",
+                "hypothesis7",
+            ],
+            "premise",
+        ),
+        (
+            "siqa",
+            "siqa_loader",
+            ["hypothesis0", "hypothesis1", "hypothesis2"],
+            "premise",
+        ),
+        (
+            "winogrande",
+            "winogrande_loader",
+            ["hypothesis0", "hypothesis1"],
+            "premise",
+        ),
+        (
+            "anli",
+            "anli_loader",
+            ["hypothesis0", "hypothesis1", "hypothesis2"],
+            "premise",
+        ),
+        (
+            "disambiguation_qa",
+            "date_understanding_loader",
+            ["hypothesis0", "hypothesis1", "hypothesis2"],
+            "premise",
+        ),
+        (
+            "conceptual_combinations",
+            "date_understanding_loader",
+            ["hypothesis0", "hypothesis1", "hypothesis2", "hypothesis3"],
+            "premise",
+        ),
+        (
+            "date_understanding",
+            "date_understanding_loader",
+            [
+                "hypothesis0",
+                "hypothesis1",
+                "hypothesis2",
+                "hypothesis3",
+                "hypothesis4",
+                "hypothesis5",
+            ],
+            "premise",
+        ),
+        (
+            "emoji_movie",
+            "date_understanding_loader",
+            [
+                "hypothesis0",
+                "hypothesis1",
+                "hypothesis2",
+                "hypothesis3",
+                "hypothesis4",
+            ],
+            "premise",
+        ),
+        (
+            "ruin_names",
+            "date_understanding_loader",
+            ["hypothesis0", "hypothesis1", "hypothesis2", "hypothesis3"],
+            "premise",
+        ),
+        (
+            "penguins_in_a_table",
+            "date_understanding_loader",
+            [
+                "hypothesis0",
+                "hypothesis1",
+                "hypothesis2",
+                "hypothesis3",
+                "hypothesis4",
+            ],
+            "premise",
+        ),
+        (
+            "strange_stories",
+            "date_understanding_loader",
+            ["hypothesis0", "hypothesis1", "hypothesis2", "hypothesis3"],
+            "premise",
+        ),
+        (
+            "reasoning_about_colored_objects",
+            "date_understanding_loader",
+            [f"hypothesis{i}" for i in range(18)],
+            "premise",
+        ),
+        (
+            "symbol_interpretation",
+            "date_understanding_loader",
+            [
+                "hypothesis0",
+                "hypothesis1",
+                "hypothesis2",
+                "hypothesis3",
+                "hypothesis4",
+            ],
+            "premise",
+        ),
+        (
+            "tracking_shuffled_objects",
+            "date_understanding_loader",
+            [
+                "hypothesis0",
+                "hypothesis1",
+                "hypothesis2",
+                "hypothesis3",
+                "hypothesis4",
+            ],
+            "premise",
+        ),
+        (
+            "logical_deduction_three_objects",
+            "date_understanding_loader",
+            ["hypothesis0", "hypothesis1", "hypothesis2"],
+            "premise",
+        ),
+        (
+            "logical_deduction_five_objects",
+            "date_understanding_loader",
+            [
+                "hypothesis0",
+                "hypothesis1",
+                "hypothesis2",
+                "hypothesis3",
+                "hypothesis4",
+            ],
+            "premise",
+        ),
+        (
+            "logical_deduction_seven_objects",
+            "date_understanding_loader",
+            [
+                "hypothesis0",
+                "hypothesis1",
+                "hypothesis2",
+                "hypothesis3",
+                "hypothesis4",
+                "hypothesis5",
+                "hypothesis6",
+            ],
+            "premise",
+        ),
+        (
+            "anli_r1",
+            "anli_loader",
+            ["hypothesis0", "hypothesis1", "hypothesis2"],
+            "premise",
+        ),
+        (
+            "vqa",
+            "vqa_loader",
+            [f"hypothesis{i}" for i in range(18)],
+            "premise",
+        ),
+        (
+            "scienceqa",
+            "scienceqa_loader",
+            [f"hypothesis{i}" for i in range(4)],
+            "premise",
+        ),
+        (
+            "ai2d",
+            "ai2d_loader",
+            [f"hypothesis{i}" for i in range(4)],
+            "premise",
+        ),
+        (
+            "single_inference",
+            "single_inference_loader",
+            [f"hypothesis{i}" for i in range(4)],
+            "premise",
+        ),
+    ],
+)
+def test_load_data_datasets(
+    dataset_name, loader_name, ending_names, header_name
+):
     # Create a mock args object
     class Args:
         dataset = dataset_name
@@ -166,23 +392,30 @@ class Args:
     args = Args()
 
     # Mock the data loader function
-    loader_path = f'mm_poe.methods.utils.utils.{loader_name}'
+    loader_path = f"mm_poe.methods.utils.utils.{loader_name}"
     with mock.patch(loader_path) as mock_loader:
         # Mock return value
         mock_value = {
-                'premise': 'Test premise',
-                'uncond_premise': 'Test premise',
-                'image_path': 'dummy_path',
-                'label': 0
-            }
+            "premise": "Test premise",
+            "uncond_premise": "Test premise",
+            "image_path": "dummy_path",
+            "label": 0,
+        }
         for i, ending_name in enumerate(ending_names):
-            mock_value[ending_name] = f'answer {i}'
+            mock_value[ending_name] = f"answer {i}"
         mock_loader.return_value = [mock_value]
         # Mock os.path.join to prevent file system access
-        with mock.patch('os.path.join', return_value='dummy_path'):
-            if dataset_name in ["vqa", "scienceqa", "ai2d", "single_inference"]:
-                ending, header, image_header, dev_dataset, train_dataset = load_data(args)
-                assert image_header == 'image_path'
+        with mock.patch("os.path.join", return_value="dummy_path"):
+            if dataset_name in [
+                "vqa",
+                "scienceqa",
+                "ai2d",
+                "single_inference",
+            ]:
+                ending, header, image_header, dev_dataset, train_dataset = (
+                    load_data(args)
+                )
+                assert image_header == "image_path"
             else:
                 ending, header, dev_dataset, train_dataset = load_data(args)
             assert ending == ending_names
@@ -190,71 +423,94 @@ class Args:
             assert len(dev_dataset) == 1
             assert len(train_dataset) == 1
 
+
 def test_load_data_invalid_dataset():
     class Args:
         dataset = "unknown_dataset"
 
     args = Args()
 
-    with mock.patch('builtins.print') as mock_print:
+    with mock.patch("builtins.print") as mock_print:
         result = load_data(args)
         assert result is None
-        mock_print.assert_called_with(f"{args.dataset}: downloader not implemented.")
+        mock_print.assert_called_with(
+            f"{args.dataset}: downloader not implemented."
+        )
+
 
 # Tests for load_model function
-@pytest.mark.parametrize("model_family,model_func_name,tokenizer_func_name", [
-    ("GPT2", "AutoModelForCausalLM", "AutoTokenizer"),
-    ("Pythia", "AutoModelForCausalLM", "AutoTokenizer"),
-    ("OPT-IML", "AutoModelForCausalLM", "AutoTokenizer"),
-    ("Dolly", "AutoModelForCausalLM", "AutoTokenizer"),
-    ("T5", "AutoModelForSeq2SeqLM", "AutoTokenizer"),
-    ("FLAN-T5", "AutoModelForSeq2SeqLM", "AutoTokenizer"),
-    ("BLIP2", "AutoModelForVision2Seq", "AutoProcessor"),
-    ("InstructBLIP", "AutoModelForVision2Seq", "AutoProcessor"),
-    ("GIT", "AutoModelForVision2Seq", "AutoProcessor"),
-    ("PaliGemma", "AutoModelForVision2Seq", "AutoProcessor"),
-    ("Idefics2", "AutoModelForVision2Seq", "AutoProcessor"),
-])
-def test_load_model_families(model_family, model_func_name, tokenizer_func_name):
-    device = 'cpu'
-    model_path = 'some-model-path'
+@pytest.mark.parametrize(
+    "model_family,model_func_name,tokenizer_func_name",
+    [
+        ("GPT2", "AutoModelForCausalLM", "AutoTokenizer"),
+        ("Pythia", "AutoModelForCausalLM", "AutoTokenizer"),
+        ("OPT-IML", "AutoModelForCausalLM", "AutoTokenizer"),
+        ("Dolly", "AutoModelForCausalLM", "AutoTokenizer"),
+        ("T5", "AutoModelForSeq2SeqLM", "AutoTokenizer"),
+        ("FLAN-T5", "AutoModelForSeq2SeqLM", "AutoTokenizer"),
+        ("BLIP2", "AutoModelForVision2Seq", "AutoProcessor"),
+        ("InstructBLIP", "AutoModelForVision2Seq", "AutoProcessor"),
+        ("GIT", "AutoModelForVision2Seq", "AutoProcessor"),
+        ("PaliGemma", "AutoModelForVision2Seq", "AutoProcessor"),
+        ("Idefics2", "AutoModelForVision2Seq", "AutoProcessor"),
+    ],
+)
+def test_load_model_families(
+    model_family, model_func_name, tokenizer_func_name
+):
+    device = "cpu"
+    model_path = "some-model-path"
 
     # Create a mock args object
     class Args:
-        
+
         model_family = ""
         loading_precision = "FP32"
-        def __init__(self,model_family):
+
+        def __init__(self, model_family):
             self.model_family = model_family
 
     args = Args(model_family)
 
     # Mock the tokenizer and model loading functions
-    with mock.patch(f'mm_poe.methods.utils.utils.{tokenizer_func_name}') as mock_tokenizer_class:
-        with mock.patch(f'mm_poe.methods.utils.utils.{model_func_name}') as mock_model_class:
+    with mock.patch(
+        f"mm_poe.methods.utils.utils.{tokenizer_func_name}"
+    ) as mock_tokenizer_class:
+        with mock.patch(
+            f"mm_poe.methods.utils.utils.{model_func_name}"
+        ) as mock_model_class:
             mock_tokenizer = MagicMock()
             mock_model = MagicMock()
             mock_tokenizer_class.from_pretrained.return_value = mock_tokenizer
             mock_model_class.from_pretrained.return_value = mock_model
 
             # Set the return value of get_memory_footprint to a numeric value
-            mock_model.get_memory_footprint.return_value = 2 * 1024 ** 3  # 2 GB in bytes
+            mock_model.get_memory_footprint.return_value = (
+                2 * 1024**3
+            )  # 2 GB in bytes
 
             model, tokenizer = load_model(device, model_path, args)
 
             # Check that the correct tokenizer and model are loaded
             if model_family == "Dolly":
-                mock_tokenizer_class.from_pretrained.assert_called_with(model_path, padding_side="left")
+                mock_tokenizer_class.from_pretrained.assert_called_with(
+                    model_path, padding_side="left"
+                )
             elif model_family == "Idefics2":
-                mock_tokenizer_class.from_pretrained.assert_called_with(model_path, do_image_splitting=False)
+                mock_tokenizer_class.from_pretrained.assert_called_with(
+                    model_path, do_image_splitting=False
+                )
             else:
-                mock_tokenizer_class.from_pretrained.assert_called_with(model_path)
+                mock_tokenizer_class.from_pretrained.assert_called_with(
+                    model_path
+                )
             # Check that model is moved to the correct device
             mock_model.to.assert_called_with(device)
 
+
 def test_load_model_invalid_family():
-    device = 'cpu'
-    model_path = 'some-model-path'
+    device = "cpu"
+    model_path = "some-model-path"
 
     # Create a mock args object
     class Args:
@@ -263,14 +519,17 @@ class Args:
 
     args = Args()
 
-    with mock.patch('builtins.print') as mock_print:
+    with mock.patch("builtins.print") as mock_print:
         result = load_model(device, model_path, args)
         assert result is None
-        mock_print.assert_called_with(f"{args.model_family}: downloader not implemented.")
+        mock_print.assert_called_with(
+            f"{args.model_family}: downloader not implemented."
+        )
+
 
 def test_load_model_loading_precision_int8():
-    device = 'cpu'
-    model_path = 'some-model-path'
+    device = "cpu"
+    model_path = "some-model-path"
 
     # Create a mock args object
     class Args:
@@ -280,34 +539,47 @@ class Args:
     args = Args()
 
     # Mock the tokenizer and model loading functions
-    with mock.patch('mm_poe.methods.utils.utils.AutoTokenizer') as mock_tokenizer_class:
-        with mock.patch('mm_poe.methods.utils.utils.AutoModelForCausalLM') as mock_model_class:
-            with mock.patch('mm_poe.methods.utils.utils.BitsAndBytesConfig') as mock_bnb_config_class:
+    with mock.patch(
+        "mm_poe.methods.utils.utils.AutoTokenizer"
+    ) as mock_tokenizer_class:
+        with mock.patch(
+            "mm_poe.methods.utils.utils.AutoModelForCausalLM"
+        ) as mock_model_class:
+            with mock.patch(
+                "mm_poe.methods.utils.utils.BitsAndBytesConfig"
+            ) as mock_bnb_config_class:
                 mock_tokenizer = MagicMock()
                 mock_model = MagicMock()
                 mock_bnb_config = MagicMock()
-                mock_tokenizer_class.from_pretrained.return_value = mock_tokenizer
+                mock_tokenizer_class.from_pretrained.return_value = (
+                    mock_tokenizer
+                )
                 mock_model_class.from_pretrained.return_value = mock_model
                 mock_bnb_config_class.return_value = mock_bnb_config
 
                 # Set the return value of get_memory_footprint to a numeric value
-                mock_model.get_memory_footprint.return_value = 2 * 1024 ** 3  # 2 GB in bytes
+                mock_model.get_memory_footprint.return_value = (
+                    2 * 1024**3
+                )  # 2 GB in bytes
 
                 model, tokenizer = load_model(device, model_path, args)
 
                 # Check that BitsAndBytesConfig is called correctly
-                mock_bnb_config_class.assert_called_with(load_in_8bit=True, llm_int8_threshold=200.0)
+                mock_bnb_config_class.assert_called_with(
+                    load_in_8bit=True, llm_int8_threshold=200.0
+                )
                 # Check that model is loaded with quantization config
                 mock_model_class.from_pretrained.assert_called_with(
                     model_path,
                     torch_dtype=torch.float16,
                     device_map=device,
-                    quantization_config=mock_bnb_config
+                    quantization_config=mock_bnb_config,
                 )
 
+
 def test_load_model_loading_precision_int4():
-    device = 'cpu'
-    model_path = 'some-model-path'
+    device = "cpu"
+    model_path = "some-model-path"
 
     # Create a mock args object
     class Args:
@@ -317,38 +589,49 @@ class Args:
     args = Args()
 
     # Mock the tokenizer and model loading functions
-    with mock.patch('mm_poe.methods.utils.utils.AutoTokenizer') as mock_tokenizer_class:
-        with mock.patch('mm_poe.methods.utils.utils.AutoModelForCausalLM') as mock_model_class:
-            with mock.patch('mm_poe.methods.utils.utils.BitsAndBytesConfig') as mock_bnb_config_class:
+    with mock.patch(
+        "mm_poe.methods.utils.utils.AutoTokenizer"
+    ) as mock_tokenizer_class:
+        with mock.patch(
+            "mm_poe.methods.utils.utils.AutoModelForCausalLM"
+        ) as mock_model_class:
+            with mock.patch(
+                "mm_poe.methods.utils.utils.BitsAndBytesConfig"
+            ) as mock_bnb_config_class:
                 mock_tokenizer = MagicMock()
                 mock_model = MagicMock()
                 mock_bnb_config = MagicMock()
-                mock_tokenizer_class.from_pretrained.return_value = mock_tokenizer
+                mock_tokenizer_class.from_pretrained.return_value = (
+                    mock_tokenizer
+                )
                 mock_model_class.from_pretrained.return_value = mock_model
                 mock_bnb_config_class.return_value = mock_bnb_config
 
                 # Set the return value of get_memory_footprint to a numeric value
-                mock_model.get_memory_footprint.return_value = 2 * 1024 ** 3  # 2 GB in bytes
+                mock_model.get_memory_footprint.return_value = (
+                    2 * 1024**3
+                )  # 2 GB in bytes
 
                 model, tokenizer = load_model(device, model_path, args)
 
                 # Check that BitsAndBytesConfig is called correctly
                 mock_bnb_config_class.assert_called_with(
-                    load_in_4bit=True, 
+                    load_in_4bit=True,
                     bnb_4bit_quant_type="nf4",
                     bnb_4bit_use_double_quant=True,
-                    bnb_4bit_compute_dtype=torch.bfloat16
+                    bnb_4bit_compute_dtype=torch.bfloat16,
                 )
                 # Check that model is loaded with quantization config
                 mock_model_class.from_pretrained.assert_called_with(
                     model_path,
                     device_map=device,
-                    quantization_config=mock_bnb_config
+                    quantization_config=mock_bnb_config,
                 )
 
+
 def test_load_model_loading_precision_fp16():
-    device = 'cpu'
-    model_path = 'some-model-path'
+    device = "cpu"
+    model_path = "some-model-path"
 
     # Create a mock args object
     class Args:
@@ -358,8 +641,12 @@ class Args:
     args = Args()
 
     # Mock the tokenizer and model loading functions
-    with mock.patch('mm_poe.methods.utils.utils.AutoTokenizer') as mock_tokenizer_class:
-        with mock.patch('mm_poe.methods.utils.utils.AutoModelForCausalLM') as mock_model_class:
+    with mock.patch(
+        "mm_poe.methods.utils.utils.AutoTokenizer"
+    ) as mock_tokenizer_class:
+        with mock.patch(
+            "mm_poe.methods.utils.utils.AutoModelForCausalLM"
+        ) as mock_model_class:
             mock_tokenizer = MagicMock()
             mock_model = MagicMock()
             mock_bnb_config = MagicMock()
@@ -367,20 +654,21 @@ class Args:
             mock_model_class.from_pretrained.return_value = mock_model
 
             # Set the return value of get_memory_footprint to a numeric value
-            mock_model.get_memory_footprint.return_value = 2 * 1024 ** 3  # 2 GB in bytes
+            mock_model.get_memory_footprint.return_value = (
+                2 * 1024**3
+            )  # 2 GB in bytes
 
             model, tokenizer = load_model(device, model_path, args)
 
             # Check that model is loaded with quantization config
             mock_model_class.from_pretrained.assert_called_with(
-                model_path,
-                torch_dtype=torch.float16,
-                device_map=device
+                model_path, torch_dtype=torch.float16, device_map=device
             )
 
+
 def test_load_model_loading_precision_bf16():
-    device = 'cpu'
-    model_path = 'some-model-path'
+    device = "cpu"
+    model_path = "some-model-path"
 
     # Create a mock args object
     class Args:
@@ -390,8 +678,12 @@ class Args:
     args = Args()
 
     # Mock the tokenizer and model loading functions
-    with mock.patch('mm_poe.methods.utils.utils.AutoTokenizer') as mock_tokenizer_class:
-        with mock.patch('mm_poe.methods.utils.utils.AutoModelForCausalLM') as mock_model_class:
+    with mock.patch(
+        "mm_poe.methods.utils.utils.AutoTokenizer"
+    ) as mock_tokenizer_class:
+        with mock.patch(
+            "mm_poe.methods.utils.utils.AutoModelForCausalLM"
+        ) as mock_model_class:
             mock_tokenizer = MagicMock()
             mock_model = MagicMock()
             mock_bnb_config = MagicMock()
@@ -399,17 +691,18 @@ class Args:
             mock_model_class.from_pretrained.return_value = mock_model
 
             # Set the return value of get_memory_footprint to a numeric value
-            mock_model.get_memory_footprint.return_value = 2 * 1024 ** 3  # 2 GB in bytes
+            mock_model.get_memory_footprint.return_value = (
+                2 * 1024**3
+            )  # 2 GB in bytes
 
             model, tokenizer = load_model(device, model_path, args)
 
             # Check that model is loaded with quantization config
             mock_model_class.from_pretrained.assert_called_with(
-                model_path,
-                torch_dtype=torch.bfloat16,
-                device_map=device
+                model_path, torch_dtype=torch.bfloat16, device_map=device
             )
 
+
 # Tests for write_to_csv function
 def test_write_to_csv_process_of_elimination(tmp_path):
     save_path = tmp_path / "results.csv"
@@ -435,12 +728,13 @@ class Args:
 
     write_to_csv(str(save_path), args, total_accuracy)
 
-    with open(save_path, 'r') as f:
+    with open(save_path, "r") as f:
         content = f.read()
-        assert 'process_of_elimination' in content
+        assert "process_of_elimination" in content
         assert f"{args.mask_accuracy:.4f}" in content
         assert f"{total_accuracy:.4f}" in content
 
+
 def test_write_to_csv_contrastive_decoding(tmp_path):
     save_path = tmp_path / "results.csv"
 
@@ -466,13 +760,14 @@ class Args:
 
     write_to_csv(str(save_path), args, total_accuracy)
 
-    with open(save_path, 'r') as f:
+    with open(save_path, "r") as f:
         content = f.read()
-        assert 'contrastive_decoding' in content
+        assert "contrastive_decoding" in content
         assert f"{args.expert_accuracy:.4f}" in content
         assert f"{args.amateur_accuracy:.4f}" in content
         assert f"{total_accuracy:.4f}" in content
 
+
 def test_write_to_csv_generate_synonyms(tmp_path):
     save_path = tmp_path / "results.csv"
 
@@ -493,11 +788,12 @@ class Args:
 
     write_to_csv(str(save_path), args, total_accuracy)
 
-    with open(save_path, 'r') as f:
+    with open(save_path, "r") as f:
         content = f.read()
-        assert 'generate_synonyms' in content
+        assert "generate_synonyms" in content
         assert f"{total_accuracy:.4f}" in content
 
+
 def test_write_to_csv_default_method(tmp_path):
     save_path = tmp_path / "results.csv"
 
@@ -517,24 +813,30 @@ class Args:
 
     write_to_csv(str(save_path), args, total_accuracy)
 
-    with open(save_path, 'r') as f:
+    with open(save_path, "r") as f:
         content = f.read()
-        assert 'default_method' in content
+        assert "default_method" in content
         assert f"{total_accuracy:.4f}" in content
 
+
 # Additional tests for branches and edge cases
 def test_parse_args_invalid_choice():
     test_args = [
         "script_name",
-        "--model_family", "GPT2",
-        "--checkpoint", "gpt2-medium",
-        "--datasets", "copa",
-        "--loading_precision", "INVALID_PRECISION",
+        "--model_family",
+        "GPT2",
+        "--checkpoint",
+        "gpt2-medium",
+        "--datasets",
+        "copa",
+        "--loading_precision",
+        "INVALID_PRECISION",
     ]
-    with mock.patch.object(sys, 'argv', test_args):
+    with mock.patch.object(sys, "argv", test_args):
         with pytest.raises(SystemExit):
             parse_args()
 
+
 def test_write_to_csv_no_method(tmp_path):
     save_path = tmp_path / "results.csv"
 
@@ -555,4 +857,3 @@ class Args:
     # with pytest.raises(AttributeError):
     write_to_csv(str(save_path), args, total_accuracy)
     assert os.path.isfile(save_path) == True
-
diff --git a/tests/test_cli.py b/tests/test_cli.py
index caa0530..51b7328 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -7,31 +7,39 @@
 
 from mm_poe.cli import main
 
-@patch('mm_poe.cli.set_seed')
-@patch('mm_poe.cli.load_model')
-@patch('mm_poe.cli.subprocess.call')
-@patch('mm_poe.cli.questionary.select')
-@patch('mm_poe.cli.questionary.path')
-@patch('mm_poe.cli.questionary.text')
-def test_main(mock_text, mock_path, mock_select, mock_subprocess_call, mock_load_model, mock_set_seed):
+
+@patch("mm_poe.cli.set_seed")
+@patch("mm_poe.cli.load_model")
+@patch("mm_poe.cli.subprocess.call")
+@patch("mm_poe.cli.questionary.select")
+@patch("mm_poe.cli.questionary.path")
+@patch("mm_poe.cli.questionary.text")
+def test_main(
+    mock_text,
+    mock_path,
+    mock_select,
+    mock_subprocess_call,
+    mock_load_model,
+    mock_set_seed,
+):
     # Mock the inputs provided by questionary
     mock_select.return_value.ask.side_effect = [
-        'GIT',  # args.model_family
-        'microsoft/git-base-vqav2',  # args.checkpoint
-        'FP32',  # args.loading_precision
-        'language_modeling',  # args.scoring_method_for_process_of_elimination
-        'below_average',  # args.mask_strategy_for_process_of_elimination
-        '0'  # args.label
+        "GIT",  # args.model_family
+        "microsoft/git-base-vqav2",  # args.checkpoint
+        "FP32",  # args.loading_precision
+        "language_modeling",  # args.scoring_method_for_process_of_elimination
+        "below_average",  # args.mask_strategy_for_process_of_elimination
+        "0",  # args.label
     ]
 
     mock_path.return_value.ask.side_effect = [
-        './models/',  # args.output_dir
-        './images/image.png'  # args.image_path
+        "./models/",  # args.output_dir
+        "./images/image.png",  # args.image_path
     ]
 
     mock_text.return_value.ask.side_effect = [
-        'What is in the image?',  # args.question
-        'cat,dog,horse'  # args.choices
+        "What is in the image?",  # args.question
+        "cat,dog,horse",  # args.choices
     ]
 
     # Mock the subprocess.call to prevent actual execution
@@ -43,26 +51,31 @@ def test_main(mock_text, mock_path, mock_select, mock_subprocess_call, mock_load
     mock_load_model.return_value = (mock_model, mock_tokenizer)
 
     # Mock the device
-    with patch('torch.device') as mock_device:
-        mock_device.return_value = 'cpu'
+    with patch("torch.device") as mock_device:
+        mock_device.return_value = "cpu"
 
         # Mock other functions called within main
-        with patch('mm_poe.cli.load_data') as mock_load_data, \
-             patch('torch.utils.data.DataLoader') as mock_data_loader_class, \
-             patch('mm_poe.cli.inference_language_modeling') as mock_inference_lm, \
-             patch('mm_poe.cli.inference_process_of_elimination') as mock_inference_poe, \
-             patch('mm_poe.cli.compute_mask_process_of_elimination') as mock_compute_mask, \
-             patch('mm_poe.cli.create_multiple_choice_prompt') as mock_create_mcp:
+        with patch("mm_poe.cli.load_data") as mock_load_data, patch(
+            "torch.utils.data.DataLoader"
+        ) as mock_data_loader_class, patch(
+            "mm_poe.cli.inference_language_modeling"
+        ) as mock_inference_lm, patch(
+            "mm_poe.cli.inference_process_of_elimination"
+        ) as mock_inference_poe, patch(
+            "mm_poe.cli.compute_mask_process_of_elimination"
+        ) as mock_compute_mask, patch(
+            "mm_poe.cli.create_multiple_choice_prompt"
+        ) as mock_create_mcp:
 
             # Mock the datasets returned by load_data
             mock_dataset = MagicMock()
             mock_dataset.map.return_value = mock_dataset  # For the map calls
             mock_load_data.return_value = (
-                ['hypothesis0', 'hypothesis1', 'hypothesis2'],  # ending_names
-                'premise',  # header_name
-                'image_path',  # image_header_name
+                ["hypothesis0", "hypothesis1", "hypothesis2"],  # ending_names
+                "premise",  # header_name
+                "image_path",  # image_header_name
                 mock_dataset,  # raw_dataset
-                mock_dataset  # n_shot_dataset
+                mock_dataset,  # n_shot_dataset
             )
 
             # Mock the DataLoader
@@ -71,9 +84,19 @@ def test_main(mock_text, mock_path, mock_select, mock_subprocess_call, mock_load
 
             # Mock inference functions
             # For scoring_method == 'language_modeling'
-            mock_inference_lm.return_value = (torch.tensor([[0.1, 0.2, 0.7]]), None, None, torch.tensor([2]))
+            mock_inference_lm.return_value = (
+                torch.tensor([[0.1, 0.2, 0.7]]),
+                None,
+                None,
+                torch.tensor([2]),
+            )
             # For inference_process_of_elimination
-            mock_inference_poe.return_value = (torch.tensor([[0.1, 0.2, 0.7]]), 1.0, None, torch.tensor([2]))
+            mock_inference_poe.return_value = (
+                torch.tensor([[0.1, 0.2, 0.7]]),
+                1.0,
+                None,
+                torch.tensor([2]),
+            )
 
             # Mock compute_mask_process_of_elimination
             mock_compute_mask.return_value = torch.tensor([[0, 1, 1]])
@@ -81,6 +104,7 @@ def test_main(mock_text, mock_path, mock_select, mock_subprocess_call, mock_load
             # Mock create_multiple_choice_prompt
             def mock_create_mcp_fn(example, **kwargs):
                 return example
+
             mock_create_mcp.side_effect = mock_create_mcp_fn
 
             # Run the main function
@@ -94,30 +118,38 @@ def mock_create_mcp_fn(example, **kwargs):
             mock_inference_lm.assert_called()
             mock_inference_poe.assert_called()
 
-@patch('mm_poe.cli.set_seed')
-@patch('mm_poe.cli.load_model')
-@patch('mm_poe.cli.subprocess.call')
-@patch('mm_poe.cli.questionary.select')
-@patch('mm_poe.cli.questionary.path')
-@patch('mm_poe.cli.questionary.text')
-def test_main_with_calibration_lowest(mock_text, mock_path, mock_select, mock_subprocess_call, mock_load_model, mock_set_seed):
+
+@patch("mm_poe.cli.set_seed")
+@patch("mm_poe.cli.load_model")
+@patch("mm_poe.cli.subprocess.call")
+@patch("mm_poe.cli.questionary.select")
+@patch("mm_poe.cli.questionary.path")
+@patch("mm_poe.cli.questionary.text")
+def test_main_with_calibration_lowest(
+    mock_text,
+    mock_path,
+    mock_select,
+    mock_subprocess_call,
+    mock_load_model,
+    mock_set_seed,
+):
     mock_select.return_value.ask.side_effect = [
-        'BLIP2',  # args.model_family
-        'Salesforce/blip2-opt-2.7b',  # args.checkpoint
-        'FP16',  # args.loading_precision
-        'calibration',  # args.scoring_method_for_process_of_elimination
-        'lowest',  # args.mask_strategy_for_process_of_elimination
-        '1'  # args.label
+        "BLIP2",  # args.model_family
+        "Salesforce/blip2-opt-2.7b",  # args.checkpoint
+        "FP16",  # args.loading_precision
+        "calibration",  # args.scoring_method_for_process_of_elimination
+        "lowest",  # args.mask_strategy_for_process_of_elimination
+        "1",  # args.label
     ]
 
     mock_path.return_value.ask.side_effect = [
-        './models/',  # args.output_dir
-        './images/image.png'  # args.image_path
+        "./models/",  # args.output_dir
+        "./images/image.png",  # args.image_path
     ]
 
     mock_text.return_value.ask.side_effect = [
-        'Describe the image.',  # args.question
-        'apple,banana,orange'  # args.choices
+        "Describe the image.",  # args.question
+        "apple,banana,orange",  # args.choices
     ]
 
     mock_subprocess_call.return_value = 0
@@ -125,36 +157,52 @@ def test_main_with_calibration_lowest(mock_text, mock_path, mock_select, mock_su
     mock_tokenizer = MagicMock()
     mock_load_model.return_value = (mock_model, mock_tokenizer)
 
-    with patch('torch.device') as mock_device:
-        mock_device.return_value = 'cuda:0'
-
-        with patch('mm_poe.cli.load_data') as mock_load_data, \
-             patch('torch.utils.data.DataLoader') as mock_data_loader_class, \
-             patch('mm_poe.cli.inference_calibration') as mock_inference_calibration, \
-             patch('mm_poe.cli.inference_process_of_elimination') as mock_inference_poe, \
-             patch('mm_poe.cli.compute_mask_process_of_elimination') as mock_compute_mask, \
-             patch('mm_poe.cli.create_multiple_choice_prompt') as mock_create_mcp:
+    with patch("torch.device") as mock_device:
+        mock_device.return_value = "cuda:0"
+
+        with patch("mm_poe.cli.load_data") as mock_load_data, patch(
+            "torch.utils.data.DataLoader"
+        ) as mock_data_loader_class, patch(
+            "mm_poe.cli.inference_calibration"
+        ) as mock_inference_calibration, patch(
+            "mm_poe.cli.inference_process_of_elimination"
+        ) as mock_inference_poe, patch(
+            "mm_poe.cli.compute_mask_process_of_elimination"
+        ) as mock_compute_mask, patch(
+            "mm_poe.cli.create_multiple_choice_prompt"
+        ) as mock_create_mcp:
 
             mock_dataset = MagicMock()
             mock_dataset.map.return_value = mock_dataset
             mock_load_data.return_value = (
-                ['hypothesis0', 'hypothesis1', 'hypothesis2'],
-                'premise',
-                'image_path',
+                ["hypothesis0", "hypothesis1", "hypothesis2"],
+                "premise",
+                "image_path",
+                mock_dataset,
                 mock_dataset,
-                mock_dataset
             )
 
             mock_data_loader = MagicMock()
             mock_data_loader_class.return_value = mock_data_loader
 
-            mock_inference_calibration.return_value = (torch.tensor([[0.3, 0.4, 0.3]]), None, None, torch.tensor([1]))
-            mock_inference_poe.return_value = (torch.tensor([[0.3, 0.4, 0.3]]), 1.0, None, torch.tensor([1]))
+            mock_inference_calibration.return_value = (
+                torch.tensor([[0.3, 0.4, 0.3]]),
+                None,
+                None,
+                torch.tensor([1]),
+            )
+            mock_inference_poe.return_value = (
+                torch.tensor([[0.3, 0.4, 0.3]]),
+                1.0,
+                None,
+                torch.tensor([1]),
+            )
 
             mock_compute_mask.return_value = torch.tensor([[1, 0, 1]])
 
             def mock_create_mcp_fn(example, **kwargs):
                 return example
+
             mock_create_mcp.side_effect = mock_create_mcp_fn
 
             main()
@@ -166,30 +214,38 @@ def mock_create_mcp_fn(example, **kwargs):
             mock_inference_calibration.assert_called()
             mock_inference_poe.assert_called()
 
-@patch('mm_poe.cli.set_seed')
-@patch('mm_poe.cli.load_model')
-@patch('mm_poe.cli.subprocess.call')
-@patch('mm_poe.cli.questionary.select')
-@patch('mm_poe.cli.questionary.path')
-@patch('mm_poe.cli.questionary.text')
-def test_main_with_mcp_lowest(mock_text, mock_path, mock_select, mock_subprocess_call, mock_load_model, mock_set_seed):
+
+@patch("mm_poe.cli.set_seed")
+@patch("mm_poe.cli.load_model")
+@patch("mm_poe.cli.subprocess.call")
+@patch("mm_poe.cli.questionary.select")
+@patch("mm_poe.cli.questionary.path")
+@patch("mm_poe.cli.questionary.text")
+def test_main_with_mcp_lowest(
+    mock_text,
+    mock_path,
+    mock_select,
+    mock_subprocess_call,
+    mock_load_model,
+    mock_set_seed,
+):
     mock_select.return_value.ask.side_effect = [
-        'BLIP2',  # args.model_family
-        'Salesforce/blip2-opt-2.7b',  # args.checkpoint
-        'FP16',  # args.loading_precision
-        'multiple_choice_prompt',  # args.scoring_method_for_process_of_elimination
-        'lowest',  # args.mask_strategy_for_process_of_elimination
-        '1'  # args.label
+        "BLIP2",  # args.model_family
+        "Salesforce/blip2-opt-2.7b",  # args.checkpoint
+        "FP16",  # args.loading_precision
+        "multiple_choice_prompt",  # args.scoring_method_for_process_of_elimination
+        "lowest",  # args.mask_strategy_for_process_of_elimination
+        "1",  # args.label
     ]
 
     mock_path.return_value.ask.side_effect = [
-        './models/',  # args.output_dir
-        './images/image.png'  # args.image_path
+        "./models/",  # args.output_dir
+        "./images/image.png",  # args.image_path
     ]
 
     mock_text.return_value.ask.side_effect = [
-        'Describe the image.',  # args.question
-        'apple,banana,orange'  # args.choices
+        "Describe the image.",  # args.question
+        "apple,banana,orange",  # args.choices
     ]
 
     mock_subprocess_call.return_value = 0
@@ -197,36 +253,52 @@ def test_main_with_mcp_lowest(mock_text, mock_path, mock_select, mock_subprocess
     mock_tokenizer = MagicMock()
     mock_load_model.return_value = (mock_model, mock_tokenizer)
 
-    with patch('torch.device') as mock_device:
-        mock_device.return_value = 'cuda:0'
-
-        with patch('mm_poe.cli.load_data') as mock_load_data, \
-             patch('torch.utils.data.DataLoader') as mock_data_loader_class, \
-             patch('mm_poe.cli.inference_language_modeling') as mock_inference_language_modeling, \
-             patch('mm_poe.cli.inference_process_of_elimination') as mock_inference_poe, \
-             patch('mm_poe.cli.compute_mask_process_of_elimination') as mock_compute_mask, \
-             patch('mm_poe.cli.create_multiple_choice_prompt') as mock_create_mcp:
+    with patch("torch.device") as mock_device:
+        mock_device.return_value = "cuda:0"
+
+        with patch("mm_poe.cli.load_data") as mock_load_data, patch(
+            "torch.utils.data.DataLoader"
+        ) as mock_data_loader_class, patch(
+            "mm_poe.cli.inference_language_modeling"
+        ) as mock_inference_language_modeling, patch(
+            "mm_poe.cli.inference_process_of_elimination"
+        ) as mock_inference_poe, patch(
+            "mm_poe.cli.compute_mask_process_of_elimination"
+        ) as mock_compute_mask, patch(
+            "mm_poe.cli.create_multiple_choice_prompt"
+        ) as mock_create_mcp:
 
             mock_dataset = MagicMock()
             mock_dataset.map.return_value = mock_dataset
             mock_load_data.return_value = (
-                ['hypothesis0', 'hypothesis1', 'hypothesis2'],
-                'premise',
-                'image_path',
+                ["hypothesis0", "hypothesis1", "hypothesis2"],
+                "premise",
+                "image_path",
+                mock_dataset,
                 mock_dataset,
-                mock_dataset
             )
 
             mock_data_loader = MagicMock()
             mock_data_loader_class.return_value = mock_data_loader
 
-            mock_inference_language_modeling.return_value = (torch.tensor([[0.3, 0.4, 0.3]]), None, None, torch.tensor([1]))
-            mock_inference_poe.return_value = (torch.tensor([[0.3, 0.4, 0.3]]), 1.0, None, torch.tensor([1]))
+            mock_inference_language_modeling.return_value = (
+                torch.tensor([[0.3, 0.4, 0.3]]),
+                None,
+                None,
+                torch.tensor([1]),
+            )
+            mock_inference_poe.return_value = (
+                torch.tensor([[0.3, 0.4, 0.3]]),
+                1.0,
+                None,
+                torch.tensor([1]),
+            )
 
             mock_compute_mask.return_value = torch.tensor([[1, 0, 1]])
 
             def mock_create_mcp_fn(example, **kwargs):
                 return example
+
             mock_create_mcp.side_effect = mock_create_mcp_fn
 
             main()
@@ -238,30 +310,38 @@ def mock_create_mcp_fn(example, **kwargs):
             mock_inference_language_modeling.assert_called()
             mock_inference_poe.assert_called()
 
-@patch('mm_poe.cli.set_seed')
-@patch('mm_poe.cli.load_model')
-@patch('mm_poe.cli.subprocess.call')
-@patch('mm_poe.cli.questionary.select')
-@patch('mm_poe.cli.questionary.path')
-@patch('mm_poe.cli.questionary.text')
-def test_main_with_channel_below_average(mock_text, mock_path, mock_select, mock_subprocess_call, mock_load_model, mock_set_seed):
+
+@patch("mm_poe.cli.set_seed")
+@patch("mm_poe.cli.load_model")
+@patch("mm_poe.cli.subprocess.call")
+@patch("mm_poe.cli.questionary.select")
+@patch("mm_poe.cli.questionary.path")
+@patch("mm_poe.cli.questionary.text")
+def test_main_with_channel_below_average(
+    mock_text,
+    mock_path,
+    mock_select,
+    mock_subprocess_call,
+    mock_load_model,
+    mock_set_seed,
+):
     mock_select.return_value.ask.side_effect = [
-        'BLIP2',  # args.model_family
-        'Salesforce/blip2-opt-2.7b',  # args.checkpoint
-        'FP16',  # args.loading_precision
-        'channel',  # args.scoring_method_for_process_of_elimination
-        'below_average',  # args.mask_strategy_for_process_of_elimination
-        '1'  # args.label
+        "BLIP2",  # args.model_family
+        "Salesforce/blip2-opt-2.7b",  # args.checkpoint
+        "FP16",  # args.loading_precision
+        "channel",  # args.scoring_method_for_process_of_elimination
+        "below_average",  # args.mask_strategy_for_process_of_elimination
+        "1",  # args.label
     ]
 
     mock_path.return_value.ask.side_effect = [
-        './models/',  # args.output_dir
-        './images/image.png'  # args.image_path
+        "./models/",  # args.output_dir
+        "./images/image.png",  # args.image_path
     ]
 
     mock_text.return_value.ask.side_effect = [
-        'Describe the image.',  # args.question
-        'apple,banana,orange'  # args.choices
+        "Describe the image.",  # args.question
+        "apple,banana,orange",  # args.choices
     ]
 
     mock_subprocess_call.return_value = 0
@@ -269,36 +349,52 @@ def test_main_with_channel_below_average(mock_text, mock_path, mock_select, mock
     mock_tokenizer = MagicMock()
     mock_load_model.return_value = (mock_model, mock_tokenizer)
 
-    with patch('torch.device') as mock_device:
-        mock_device.return_value = 'cuda:0'
-
-        with patch('mm_poe.cli.load_data') as mock_load_data, \
-             patch('torch.utils.data.DataLoader') as mock_data_loader_class, \
-             patch('mm_poe.cli.inference_language_modeling') as mock_inference_language_modeling, \
-             patch('mm_poe.cli.inference_process_of_elimination') as mock_inference_poe, \
-             patch('mm_poe.cli.compute_mask_process_of_elimination') as mock_compute_mask, \
-             patch('mm_poe.cli.create_multiple_choice_prompt') as mock_create_mcp:
+    with patch("torch.device") as mock_device:
+        mock_device.return_value = "cuda:0"
+
+        with patch("mm_poe.cli.load_data") as mock_load_data, patch(
+            "torch.utils.data.DataLoader"
+        ) as mock_data_loader_class, patch(
+            "mm_poe.cli.inference_language_modeling"
+        ) as mock_inference_language_modeling, patch(
+            "mm_poe.cli.inference_process_of_elimination"
+        ) as mock_inference_poe, patch(
+            "mm_poe.cli.compute_mask_process_of_elimination"
+        ) as mock_compute_mask, patch(
+            "mm_poe.cli.create_multiple_choice_prompt"
+        ) as mock_create_mcp:
 
             mock_dataset = MagicMock()
             mock_dataset.map.return_value = mock_dataset
             mock_load_data.return_value = (
-                ['hypothesis0', 'hypothesis1', 'hypothesis2'],
-                'premise',
-                'image_path',
+                ["hypothesis0", "hypothesis1", "hypothesis2"],
+                "premise",
+                "image_path",
+                mock_dataset,
                 mock_dataset,
-                mock_dataset
             )
 
             mock_data_loader = MagicMock()
             mock_data_loader_class.return_value = mock_data_loader
 
-            mock_inference_language_modeling.return_value = (torch.tensor([[0.3, 0.4, 0.3]]), None, None, torch.tensor([1]))
-            mock_inference_poe.return_value = (torch.tensor([[0.3, 0.4, 0.3]]), 1.0, None, torch.tensor([1]))
+            mock_inference_language_modeling.return_value = (
+                torch.tensor([[0.3, 0.4, 0.3]]),
+                None,
+                None,
+                torch.tensor([1]),
+            )
+            mock_inference_poe.return_value = (
+                torch.tensor([[0.3, 0.4, 0.3]]),
+                1.0,
+                None,
+                torch.tensor([1]),
+            )
 
             mock_compute_mask.return_value = torch.tensor([[1, 0, 1]])
 
             def mock_create_mcp_fn(example, **kwargs):
                 return example
+
             mock_create_mcp.side_effect = mock_create_mcp_fn
 
             main()
@@ -310,53 +406,72 @@ def mock_create_mcp_fn(example, **kwargs):
             mock_inference_language_modeling.assert_called()
             mock_inference_poe.assert_called()
 
-@patch('mm_poe.cli.set_seed')
-@patch('mm_poe.cli.load_model')
-@patch('mm_poe.cli.subprocess.call')
-@patch('mm_poe.cli.questionary.select')
-@patch('mm_poe.cli.questionary.path')
-@patch('mm_poe.cli.questionary.text')
-def test_main_with_mask_strategy_min_k(mock_text, mock_path, mock_select, mock_subprocess_call, mock_load_model, mock_set_seed):
+
+@patch("mm_poe.cli.set_seed")
+@patch("mm_poe.cli.load_model")
+@patch("mm_poe.cli.subprocess.call")
+@patch("mm_poe.cli.questionary.select")
+@patch("mm_poe.cli.questionary.path")
+@patch("mm_poe.cli.questionary.text")
+def test_main_with_mask_strategy_min_k(
+    mock_text,
+    mock_path,
+    mock_select,
+    mock_subprocess_call,
+    mock_load_model,
+    mock_set_seed,
+):
     mock_select.return_value.ask.side_effect = [
-        'GIT',
-        'microsoft/git-base-vqav2',
-        'FP32',
-        'language_modeling',
-        'min_k',
-        '0'
+        "GIT",
+        "microsoft/git-base-vqav2",
+        "FP32",
+        "language_modeling",
+        "min_k",
+        "0",
+    ]
+    mock_path.return_value.ask.side_effect = [
+        "./models/",
+        "./images/image.png",
+    ]
+    mock_text.return_value.ask.side_effect = [
+        "What is in the image?",
+        "cat,dog,horse",
     ]
-    mock_path.return_value.ask.side_effect = ['./models/', './images/image.png']
-    mock_text.return_value.ask.side_effect = ['What is in the image?', 'cat,dog,horse']
     mock_subprocess_call.return_value = 0
     mock_model = MagicMock()
     mock_tokenizer = MagicMock()
     mock_load_model.return_value = (mock_model, mock_tokenizer)
 
-    with patch('torch.device') as mock_device:
-        mock_device.return_value = 'cpu'
+    with patch("torch.device") as mock_device:
+        mock_device.return_value = "cpu"
 
         # Modify args to include mask_token
-        with patch('mm_poe.cli.Namespace') as mock_namespace:
+        with patch("mm_poe.cli.Namespace") as mock_namespace:
             args = MagicMock()
             args.min_k = 1
-            args.process_of_elimination_prompt = 'Select the most suitable option to answer the question. Ignore [MASK] options.'
+            args.process_of_elimination_prompt = "Select the most suitable option to answer the question. Ignore [MASK] options."
             mock_namespace.return_value = args
 
-            with patch('mm_poe.cli.load_data') as mock_load_data, \
-                patch('torch.utils.data.DataLoader') as mock_data_loader_class, \
-                patch('mm_poe.cli.inference_language_modeling') as mock_inference_lm, \
-                patch('mm_poe.cli.inference_process_of_elimination') as mock_inference_poe, \
-                patch('mm_poe.cli.compute_mask_process_of_elimination') as mock_compute_mask, \
-                patch('mm_poe.cli.create_multiple_choice_prompt') as mock_create_mcp:
+            with patch("mm_poe.cli.load_data") as mock_load_data, patch(
+                "torch.utils.data.DataLoader"
+            ) as mock_data_loader_class, patch(
+                "mm_poe.cli.inference_language_modeling"
+            ) as mock_inference_lm, patch(
+                "mm_poe.cli.inference_process_of_elimination"
+            ) as mock_inference_poe, patch(
+                "mm_poe.cli.compute_mask_process_of_elimination"
+            ) as mock_compute_mask, patch(
+                "mm_poe.cli.create_multiple_choice_prompt"
+            ) as mock_create_mcp:
 
                 mock_dataset = MagicMock()
                 mock_dataset.map.return_value = mock_dataset
                 mock_load_data.return_value = (
-                    ['hypothesis0', 'hypothesis1', 'hypothesis2'],
-                    'premise',
-                    'image_path',
+                    ["hypothesis0", "hypothesis1", "hypothesis2"],
+                    "premise",
+                    "image_path",
+                    mock_dataset,
                     mock_dataset,
-                    mock_dataset
                 )
 
                 mock_data_loader = MagicMock()
@@ -364,67 +479,99 @@ def test_main_with_mask_strategy_min_k(mock_text, mock_path, mock_select, mock_s
 
                 predictions = torch.tensor([[0.1, 0.2, 0.7]])
                 masks = torch.tensor([[0, 1, 1]])
-                mock_inference_lm.return_value = (predictions, None, None, torch.tensor([2]))
-                mock_inference_poe.return_value = (predictions, 1.0, None, torch.tensor([2]))
+                mock_inference_lm.return_value = (
+                    predictions,
+                    None,
+                    None,
+                    torch.tensor([2]),
+                )
+                mock_inference_poe.return_value = (
+                    predictions,
+                    1.0,
+                    None,
+                    torch.tensor([2]),
+                )
                 mock_compute_mask.return_value = masks
 
                 def mock_create_mcp_fn(example, **kwargs):
                     return example
+
                 mock_create_mcp.side_effect = mock_create_mcp_fn
 
                 main()
                 mock_set_seed.assert_called_once_with(0)
                 mock_load_model.assert_called()
                 mock_load_data.assert_called()
-                mock_compute_mask.assert_called_with(predictions, 'min_k', min_k=1)
-
-@patch('mm_poe.cli.set_seed')
-@patch('mm_poe.cli.load_model')
-@patch('mm_poe.cli.subprocess.call')
-@patch('mm_poe.cli.questionary.select')
-@patch('mm_poe.cli.questionary.path')
-@patch('mm_poe.cli.questionary.text')
-def test_main_with_mask_token(mock_text, mock_path, mock_select, mock_subprocess_call, mock_load_model, mock_set_seed):
+                mock_compute_mask.assert_called_with(
+                    predictions, "min_k", min_k=1
+                )
+
+
+@patch("mm_poe.cli.set_seed")
+@patch("mm_poe.cli.load_model")
+@patch("mm_poe.cli.subprocess.call")
+@patch("mm_poe.cli.questionary.select")
+@patch("mm_poe.cli.questionary.path")
+@patch("mm_poe.cli.questionary.text")
+def test_main_with_mask_token(
+    mock_text,
+    mock_path,
+    mock_select,
+    mock_subprocess_call,
+    mock_load_model,
+    mock_set_seed,
+):
     mock_select.return_value.ask.side_effect = [
-        'GIT',
-        'microsoft/git-base-vqav2',
-        'FP32',
-        'language_modeling',
-        'below_average',
-        '0'
+        "GIT",
+        "microsoft/git-base-vqav2",
+        "FP32",
+        "language_modeling",
+        "below_average",
+        "0",
+    ]
+    mock_path.return_value.ask.side_effect = [
+        "./models/",
+        "./images/image.png",
+    ]
+    mock_text.return_value.ask.side_effect = [
+        "What is in the image?",
+        "cat,dog,horse",
     ]
-    mock_path.return_value.ask.side_effect = ['./models/', './images/image.png']
-    mock_text.return_value.ask.side_effect = ['What is in the image?', 'cat,dog,horse']
     mock_subprocess_call.return_value = 0
     mock_model = MagicMock()
     mock_tokenizer = MagicMock()
     mock_load_model.return_value = (mock_model, mock_tokenizer)
 
-    with patch('torch.device') as mock_device:
-        mock_device.return_value = 'cpu'
+    with patch("torch.device") as mock_device:
+        mock_device.return_value = "cpu"
 
         # Modify args to include mask_token
-        with patch('mm_poe.cli.Namespace') as mock_namespace:
+        with patch("mm_poe.cli.Namespace") as mock_namespace:
             args = MagicMock()
-            args.mask_token = 'XXX'
-            args.process_of_elimination_prompt = 'Select the most suitable option to answer the question. Ignore [MASK] options.'
+            args.mask_token = "XXX"
+            args.process_of_elimination_prompt = "Select the most suitable option to answer the question. Ignore [MASK] options."
             mock_namespace.return_value = args
 
-            with patch('mm_poe.cli.load_data') as mock_load_data, \
-                 patch('torch.utils.data.DataLoader') as mock_data_loader_class, \
-                 patch('mm_poe.cli.inference_language_modeling') as mock_inference_lm, \
-                 patch('mm_poe.cli.inference_process_of_elimination') as mock_inference_poe, \
-                 patch('mm_poe.cli.compute_mask_process_of_elimination') as mock_compute_mask, \
-                 patch('mm_poe.cli.create_multiple_choice_prompt') as mock_create_mcp:
+            with patch("mm_poe.cli.load_data") as mock_load_data, patch(
+                "torch.utils.data.DataLoader"
+            ) as mock_data_loader_class, patch(
+                "mm_poe.cli.inference_language_modeling"
+            ) as mock_inference_lm, patch(
+                "mm_poe.cli.inference_process_of_elimination"
+            ) as mock_inference_poe, patch(
+                "mm_poe.cli.compute_mask_process_of_elimination"
+            ) as mock_compute_mask, patch(
+                "mm_poe.cli.create_multiple_choice_prompt"
+            ) as mock_create_mcp:
 
                 mock_dataset = MagicMock()
                 mock_dataset.map.return_value = mock_dataset
                 mock_load_data.return_value = (
-                    ['hypothesis0', 'hypothesis1', 'hypothesis2'],
-                    'premise',
-                    'image_path',
+                    ["hypothesis0", "hypothesis1", "hypothesis2"],
+                    "premise",
+                    "image_path",
+                    mock_dataset,
                     mock_dataset,
-                    mock_dataset
                 )
 
                 mock_data_loader = MagicMock()
@@ -432,68 +579,100 @@ def test_main_with_mask_token(mock_text, mock_path, mock_select, mock_subprocess
 
                 predictions = torch.tensor([[0.1, 0.2, 0.7]])
                 masks = torch.tensor([[0, 1, 1]])
-                mock_inference_lm.return_value = (predictions, None, None, torch.tensor([2]))
-                mock_inference_poe.return_value = (predictions, 1.0, None, torch.tensor([2]))
+                mock_inference_lm.return_value = (
+                    predictions,
+                    None,
+                    None,
+                    torch.tensor([2]),
+                )
+                mock_inference_poe.return_value = (
+                    predictions,
+                    1.0,
+                    None,
+                    torch.tensor([2]),
+                )
                 mock_compute_mask.return_value = masks
 
                 def mock_create_mcp_fn(example, **kwargs):
-                    assert '[MASK]' not in kwargs['multiple_choice_prompt']
+                    assert "[MASK]" not in kwargs["multiple_choice_prompt"]
                     return example
+
                 mock_create_mcp.side_effect = mock_create_mcp_fn
 
                 main()
                 mock_set_seed.assert_called_once_with(0)
                 mock_load_model.assert_called()
                 mock_load_data.assert_called()
-                mock_compute_mask.assert_called_with(predictions, 'below_average')
-
-@patch('mm_poe.cli.set_seed')
-@patch('mm_poe.cli.load_model')
-@patch('mm_poe.cli.subprocess.call')
-@patch('mm_poe.cli.questionary.select')
-@patch('mm_poe.cli.questionary.path')
-@patch('mm_poe.cli.questionary.text')
-def test_main_with_mask_strategy_min_k(mock_text, mock_path, mock_select, mock_subprocess_call, mock_load_model, mock_set_seed):
+                mock_compute_mask.assert_called_with(
+                    predictions, "below_average"
+                )
+
+
+@patch("mm_poe.cli.set_seed")
+@patch("mm_poe.cli.load_model")
+@patch("mm_poe.cli.subprocess.call")
+@patch("mm_poe.cli.questionary.select")
+@patch("mm_poe.cli.questionary.path")
+@patch("mm_poe.cli.questionary.text")
+def test_main_with_mask_strategy_min_k(
+    mock_text,
+    mock_path,
+    mock_select,
+    mock_subprocess_call,
+    mock_load_model,
+    mock_set_seed,
+):
     mock_select.return_value.ask.side_effect = [
-        'GIT',
-        'microsoft/git-base-vqav2',
-        'FP32',
-        'language_modeling',
-        'min_k',
-        '0'
+        "GIT",
+        "microsoft/git-base-vqav2",
+        "FP32",
+        "language_modeling",
+        "min_k",
+        "0",
+    ]
+    mock_path.return_value.ask.side_effect = [
+        "./models/",
+        "./images/image.png",
+    ]
+    mock_text.return_value.ask.side_effect = [
+        "What is in the image?",
+        "cat,dog,horse",
     ]
-    mock_path.return_value.ask.side_effect = ['./models/', './images/image.png']
-    mock_text.return_value.ask.side_effect = ['What is in the image?', 'cat,dog,horse']
     mock_subprocess_call.return_value = 0
     mock_model = MagicMock()
     mock_tokenizer = MagicMock()
     mock_load_model.return_value = (mock_model, mock_tokenizer)
 
-    with patch('torch.device') as mock_device:
-        mock_device.return_value = 'cpu'
+    with patch("torch.device") as mock_device:
+        mock_device.return_value = "cpu"
 
         # Modify args to include mask_token
-        with patch('mm_poe.cli.Namespace') as mock_namespace:
+        with patch("mm_poe.cli.Namespace") as mock_namespace:
             args = MagicMock()
             args.min_k = 10
-            args.process_of_elimination_prompt = 'Select the most suitable option to answer the question. Ignore [MASK] options.'
+            args.process_of_elimination_prompt = "Select the most suitable option to answer the question. Ignore [MASK] options."
             mock_namespace.return_value = args
 
-            with patch('mm_poe.cli.load_data') as mock_load_data, \
-                patch('torch.utils.data.DataLoader') as mock_data_loader_class, \
-                patch('mm_poe.cli.inference_language_modeling') as mock_inference_lm, \
-                patch('mm_poe.cli.inference_process_of_elimination') as mock_inference_poe, \
-                patch('mm_poe.cli.compute_mask_process_of_elimination') as mock_compute_mask, \
-                patch('mm_poe.cli.create_multiple_choice_prompt') as mock_create_mcp:
+            with patch("mm_poe.cli.load_data") as mock_load_data, patch(
+                "torch.utils.data.DataLoader"
+            ) as mock_data_loader_class, patch(
+                "mm_poe.cli.inference_language_modeling"
+            ) as mock_inference_lm, patch(
+                "mm_poe.cli.inference_process_of_elimination"
+            ) as mock_inference_poe, patch(
+                "mm_poe.cli.compute_mask_process_of_elimination"
+            ) as mock_compute_mask, patch(
+                "mm_poe.cli.create_multiple_choice_prompt"
+            ) as mock_create_mcp:
 
                 mock_dataset = MagicMock()
                 mock_dataset.map.return_value = mock_dataset
                 mock_load_data.return_value = (
-                    ['hypothesis0', 'hypothesis1', 'hypothesis2'],
-                    'premise',
-                    'image_path',
+                    ["hypothesis0", "hypothesis1", "hypothesis2"],
+                    "premise",
+                    "image_path",
+                    mock_dataset,
                     mock_dataset,
-                    mock_dataset
                 )
 
                 mock_data_loader = MagicMock()
@@ -501,16 +680,29 @@ def test_main_with_mask_strategy_min_k(mock_text, mock_path, mock_select, mock_s
 
                 predictions = torch.tensor([[0.1, 0.2, 0.7]])
                 masks = torch.tensor([[0, 1, 1]])
-                mock_inference_lm.return_value = (predictions, None, None, torch.tensor([2]))
-                mock_inference_poe.return_value = (predictions, 1.0, None, torch.tensor([2]))
+                mock_inference_lm.return_value = (
+                    predictions,
+                    None,
+                    None,
+                    torch.tensor([2]),
+                )
+                mock_inference_poe.return_value = (
+                    predictions,
+                    1.0,
+                    None,
+                    torch.tensor([2]),
+                )
                 mock_compute_mask.return_value = masks
 
                 def mock_create_mcp_fn(example, **kwargs):
                     return example
+
                 mock_create_mcp.side_effect = mock_create_mcp_fn
 
                 main()
                 mock_set_seed.assert_called_once_with(0)
                 mock_load_model.assert_called()
                 mock_load_data.assert_called()
-                mock_compute_mask.assert_called_with(predictions, 'min_k', min_k=2)
\ No newline at end of file
+                mock_compute_mask.assert_called_with(
+                    predictions, "min_k", min_k=2
+                )
diff --git a/tests/test_main.py b/tests/test_main.py
index 9620fe8..b065992 100644
--- a/tests/test_main.py
+++ b/tests/test_main.py
@@ -4,14 +4,17 @@
 from unittest.mock import patch
 import runpy
 
+
 def test_main_called():
-    with patch('mm_poe.cli.main') as mock_main:
+    with patch("mm_poe.cli.main") as mock_main:
         # Simulate running __main__.py as the main module
-        runpy.run_module('mm_poe.__main__', run_name='__main__')
+        runpy.run_module("mm_poe.__main__", run_name="__main__")
         mock_main.assert_called_once()
 
+
 def test_main_not_called_when_imported():
-    with patch('mm_poe.cli.main') as mock_main:
+    with patch("mm_poe.cli.main") as mock_main:
         # Import __main__.py as a module; __name__ will not be '__main__'
         import mm_poe.__main__
+
         mock_main.assert_not_called()

From edea0a9cb6b0acb97223bcd3b5fb91fddaf20d03 Mon Sep 17 00:00:00 2001
From: Souradip Pal <souradip.iitg@gmail.com>
Date: Sat, 19 Oct 2024 01:35:00 -0500
Subject: [PATCH 30/30] Modified requirements.txt

---
 requirements.txt | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/requirements.txt b/requirements.txt
index 0d101d4..ae284a2 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,4 +4,6 @@ torch
 huggingface_hub
 bitsandbytes
 accelerate
+questionary
+pillow
 gdown
\ No newline at end of file