From e3610b149f8609c0dd1996eeb9d113f4758116d5 Mon Sep 17 00:00:00 2001
From: wheynelau <waynelau15045@gmail.com>
Date: Tue, 11 Jun 2024 13:16:42 +0800
Subject: [PATCH 01/12] fix: random.sample on a set is deprecated since Python 3.9

Sampling from a set has been deprecated since Python 3.9 and is
removed in Python 3.11, so the population is now converted to a
sorted list before sampling. Also adds a --no_tqdm flag, type hints,
and leave=True on the progress bar.
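
A minimal sketch of the failure and the fix (illustrative values,
not part of the patch):

    import random

    random.seed(0)
    population = set(range(10))

    # DeprecationWarning on Python 3.9/3.10, TypeError on 3.11+:
    # random.sample(population - {0}, 3)

    # Convert to an ordered sequence first; sorted() keeps the draw
    # reproducible under a fixed seed:
    shots = random.sample(sorted(population - {0}), 3)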
---
 evaluate.py | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/evaluate.py b/evaluate.py
index a8be841..4263534 100644
--- a/evaluate.py
+++ b/evaluate.py
@@ -7,6 +7,8 @@
 import argparse
 import datetime
 
+from typing import List, Dict, Tuple
+
 random.seed(0)
 
 if torch.cuda.is_available():
@@ -14,18 +16,17 @@
 else:
     device = "cpu"
 
-
 def parse_args():
     parser = argparse.ArgumentParser()
     parser.add_argument("--model_path", type=str, help="Path to pretrained model", required=True)
     parser.add_argument("--name", type=str,help="Output File Name", default="model_name", required=True)
     parser.add_argument("--run_full", type=str,help="run 0, 1, 3 shots", default="true")
-
+    parser.add_argument("--no_tqdm", type=bool, help='whether to disable tqdm', default= False)
     parser.add_argument("--output_folder", type=str, default="output")
     args = parser.parse_args()
     return args
 
-def read_ttbhs():
+def read_ttbhs() -> List[Dict]:
     questions = []
     with open("quiz-tatabahasa.jsonl") as fopen:
         for no, l in enumerate(fopen):
@@ -47,7 +48,7 @@ def read_ttbhs():
     print(f"Running {len(questions)} questions")
     return questions
 
-def read_bmpt3():
+def read_bmpt3() -> List[Dict]:
     with open('BM-A-pt3') as fopen:
         text = fopen.read()
     
@@ -68,7 +69,7 @@ def read_bmpt3():
     
     return questions
 
-def convert_prompt(row, answer = False):
+def convert_prompt(row, answer = False) -> str:
     if answer:
         prompt = f"""
 objektif: {row['objektif']}
@@ -86,20 +87,20 @@ def convert_prompt(row, answer = False):
 def most_common(l):
     return max(set(l), key=l.count)
 
-def evaluate(questions):
+def evaluate(questions:List[Dict]) -> float:
     filtered = [q for q in questions if 'output' in q]
     correct = 0
     for q in filtered:
         correct += most_common(q['output']) == q['jawapan']
     return (correct / len(filtered)) * 100
 
-def run_test(args, model, tokenizer, questions, n_shots):
+def run_test(args, model, tokenizer, questions, n_shots) -> Tuple[List[Dict], float]:
 
-    for i in tqdm(range(len(questions))):
+    for i in tqdm(range(len(questions)), leave=True, disable = args.no_tqdm):
         prompts = []
         if n_shots:
             arange = set(range(len(questions)))
-            shots = random.sample(arange - {i}, n_shots)
+            shots = random.sample(sorted(arange - {i}), n_shots)
             for no, s in enumerate(shots):
                 prompts.append(f'Contoh soalan {no + 1}\n' + convert_prompt(questions[s], answer = True))
         prompts.append(convert_prompt(questions[i]))

From 90110a41d1d6884c1c04470e7d36235939c3d739 Mon Sep 17 00:00:00 2001
From: wheynelau <waynelau15045@gmail.com>
Date: Tue, 11 Jun 2024 13:21:36 +0800
Subject: [PATCH 02/12] feat: change run_full and no_tqdm to store_true flags

---
 evaluate.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/evaluate.py b/evaluate.py
index 4263534..933ac4b 100644
--- a/evaluate.py
+++ b/evaluate.py
@@ -20,8 +20,8 @@ def parse_args():
     parser = argparse.ArgumentParser()
     parser.add_argument("--model_path", type=str, help="Path to pretrained model", required=True)
     parser.add_argument("--name", type=str,help="Output File Name", default="model_name", required=True)
-    parser.add_argument("--run_full", type=str,help="run 0, 1, 3 shots", default="true")
-    parser.add_argument("--no_tqdm", type=bool, help='whether to disable tqdm', default= False)
+    parser.add_argument("--run_full", type=bool, help="run 0, 1, 3 shots", action = "store_true")
+    parser.add_argument("--no_tqdm", type=bool, help='whether to disable tqdm', action = "store_true")
     parser.add_argument("--output_folder", type=str, default="output")
     args = parser.parse_args()
     return args

From 776df525fa14f7bf587a0342d49bbcada0959368 Mon Sep 17 00:00:00 2001
From: wheynelau <waynelau15045@gmail.com>
Date: Tue, 11 Jun 2024 13:26:43 +0800
Subject: [PATCH 03/12] fix: repair store_true flags, rename --no_tqdm to --tqdm, and warn for run_full

---
 evaluate.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/evaluate.py b/evaluate.py
index 933ac4b..8205166 100644
--- a/evaluate.py
+++ b/evaluate.py
@@ -20,8 +20,8 @@ def parse_args():
     parser = argparse.ArgumentParser()
     parser.add_argument("--model_path", type=str, help="Path to pretrained model", required=True)
     parser.add_argument("--name", type=str,help="Output File Name", default="model_name", required=True)
-    parser.add_argument("--run_full", type=bool, help="run 0, 1, 3 shots", action = "store_true")
-    parser.add_argument("--no_tqdm", type=bool, help='whether to disable tqdm', action = "store_true")
+    parser.add_argument("--run_full", help="run 0, 1, 3 shots", action = "store_true")
+    parser.add_argument("--tqdm", help='whether to run tqdm', action = "store_true")
     parser.add_argument("--output_folder", type=str, default="output")
     args = parser.parse_args()
     return args
@@ -161,6 +161,9 @@ def main():
     config['full_run'] = args.run_full
     config['timestamp'] = timestamp
     
+    if not args.run_full:
+        print("The recent change sets the default value of run_full to `False`, if this is not intended "
+              "run `evaluate.py` with `--run_full`")
 
     qns_ttbhs = read_ttbhs()
     qns_bmpt3 = read_bmpt3()

From bc3f00190f2f8ada4ee102637c0927697b2b9ba4 Mon Sep 17 00:00:00 2001
From: wheynelau <waynelau15045@gmail.com>
Date: Tue, 11 Jun 2024 13:29:26 +0800
Subject: [PATCH 04/12] fix: use the renamed --tqdm flag in run_test

---
 evaluate.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/evaluate.py b/evaluate.py
index 8205166..499a79e 100644
--- a/evaluate.py
+++ b/evaluate.py
@@ -96,7 +96,7 @@ def evaluate(questions:List[Dict]) -> float:
 
 def run_test(args, model, tokenizer, questions, n_shots) -> Tuple[List[Dict], float]:
 
-    for i in tqdm(range(len(questions)), leave=True, disable = args.no_tqdm):
+    for i in tqdm(range(len(questions)), leave=True, disable = args.tqdm):
         prompts = []
         if n_shots:
             arange = set(range(len(questions)))

From 67196b2667ee68f632b7ac10bc3c8607571998bc Mon Sep 17 00:00:00 2001
From: wheynelau <waynelau15045@gmail.com>
Date: Tue, 11 Jun 2024 13:36:43 +0800
Subject: [PATCH 05/12] fix: invert tqdm disable logic; add docs and type hints

---
 evaluate.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/evaluate.py b/evaluate.py
index 499a79e..b83683e 100644
--- a/evaluate.py
+++ b/evaluate.py
@@ -84,7 +84,7 @@ def convert_prompt(row, answer = False) -> str:
     """
     return prompt.strip()
 
-def most_common(l):
+def most_common(l:List) -> str:
     return max(set(l), key=l.count)
 
 def evaluate(questions:List[Dict]) -> float:
@@ -96,7 +96,9 @@ def evaluate(questions:List[Dict]) -> float:
 
 def run_test(args, model, tokenizer, questions, n_shots) -> Tuple[List[Dict], float]:
 
-    for i in tqdm(range(len(questions)), leave=True, disable = args.tqdm):
+    # disable = not args.tqdm: if args.tqdm is True, disable is False (bar shown)
+    #                          if args.tqdm is False, disable is True (bar hidden)
+    for i in tqdm(range(len(questions)), leave=True, disable = not args.tqdm):
         prompts = []
         if n_shots:
             arange = set(range(len(questions)))
@@ -205,6 +207,7 @@ def main():
             json.dump(merged, fopen, indent=4)
     
     else: #3 shot for 5 qns - for debugging
+        #TODO: Can we remove the for loop below? 
         #tatabahasa
         for i in [3]:
             q, s = run_test(args, 

From faf54bc9eceba3891f7d0fbe4aa4af5a8d9a7ee3 Mon Sep 17 00:00:00 2001
From: wheynelau <waynelau15045@gmail.com>
Date: Tue, 11 Jun 2024 13:43:28 +0800
Subject: [PATCH 06/12] feat: print scores as a markdown table when pandas is available

---
 evaluate.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/evaluate.py b/evaluate.py
index b83683e..741680b 100644
--- a/evaluate.py
+++ b/evaluate.py
@@ -236,8 +236,11 @@ def main():
             conf = {"config": config}
             merged = {**data, **conf}
             json.dump(merged, fopen, indent=4)
-    
-    print(scores)
+    try:
+        import pandas as pd
+        print(pd.DataFrame(scores).to_markdown())
+    except ImportError:
+        print(scores)
 
 if __name__ == "__main__":
     main()

From e6100184a7562601e7fcb4cb773b5b7e2684adb0 Mon Sep 17 00:00:00 2001
From: wheynelau <waynelau15045@gmail.com>
Date: Tue, 11 Jun 2024 13:56:31 +0800
Subject: [PATCH 07/12] feat: print question counts for both benchmarks

---
 evaluate.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/evaluate.py b/evaluate.py
index 741680b..35fa21f 100644
--- a/evaluate.py
+++ b/evaluate.py
@@ -16,6 +16,8 @@
 else:
     device = "cpu"
 
+# TODO: Convert prints to logs
+
 def parse_args():
     parser = argparse.ArgumentParser()
     parser.add_argument("--model_path", type=str, help="Path to pretrained model", required=True)
@@ -45,7 +47,7 @@ def read_ttbhs() -> List[Dict]:
                 'jawapan': jawapan,
             }
             questions.append(data)
-    print(f"Running {len(questions)} questions")
+    print(f"TTBHS: Running {len(questions)} questions")
     return questions
 
 def read_bmpt3() -> List[Dict]:
@@ -66,7 +68,7 @@ def read_bmpt3() -> List[Dict]:
             'jawapan': jawapan,
         }
         questions.append(data)
-    
+    print(f"BM-A-PT3: Running {len(questions)} questions")
     return questions
 
 def convert_prompt(row, answer = False) -> str:

From 440bf34a1c0d77cb14430e0f4f7767fc77b3be3e Mon Sep 17 00:00:00 2001
From: wheynelau <waynelau15045@gmail.com>
Date: Thu, 13 Jun 2024 08:18:24 +0000
Subject: [PATCH 08/12] feat!: add multiple debugging arguments

BREAKING CHANGE: may break previous workflows due to new arguments and changed default values
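
New arguments: --num_fewshots (comma-separated shot counts, default
"0,1,3"), --repeat (generations per prompt, default 5), --limit (cap
on the number of questions, 0 = all), --device (device map override)
and --deterministic (disable sampling, use the first n_shots, and
force repeat to 1; for debugging only). A typical debugging
invocation might look like this (the model path is a placeholder):

    python evaluate.py --model_path ./model --name debug-run \
        --run_full --limit 5 --repeat 1 --deterministic --tqdm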
---
 evaluate.py | 115 +++++++++++++++++++++++++++++++++-------------------
 1 file changed, 74 insertions(+), 41 deletions(-)

diff --git a/evaluate.py b/evaluate.py
index 35fa21f..71ba933 100644
--- a/evaluate.py
+++ b/evaluate.py
@@ -1,4 +1,5 @@
-from transformers import AutoTokenizer, AutoModelForCausalLM
+from doctest import debug
+from transformers import AutoTokenizer, AutoModelForCausalLM, set_seed
 import torch
 from tqdm import tqdm
 import os
@@ -6,10 +7,15 @@
 import json
 import argparse
 import datetime
-
+import logging
 from typing import List, Dict, Tuple
 
-random.seed(0)
+logging.basicConfig(level=logging.INFO,
+                    format='%(asctime)s - %(levelname)s - %(message)s',
+                    handlers=[logging.StreamHandler()])
+
+
+logger = logging.getLogger(__name__)
 
 if torch.cuda.is_available():
     device = "cuda"
@@ -24,7 +30,12 @@ def parse_args():
     parser.add_argument("--name", type=str,help="Output File Name", default="model_name", required=True)
     parser.add_argument("--run_full", help="run 0, 1, 3 shots", action = "store_true")
     parser.add_argument("--tqdm", help='whether to run tqdm', action = "store_true")
+    parser.add_argument("--num_fewshots", help="Number of few shots as a string etc '0,1,2'", default = "0,1,3")
     parser.add_argument("--output_folder", type=str, default="output")
+    parser.add_argument("--repeat",type = int, default=5, help= "Number of loops per prompt")
+    parser.add_argument("--limit",type = int, default=0, help = 'run N number of samples')
+    parser.add_argument("--device", type=str, help = "device map")
+    parser.add_argument("--deterministic", help="disable sampling", action = "store_true")
     args = parser.parse_args()
     return args
 
@@ -47,7 +58,7 @@ def read_ttbhs() -> List[Dict]:
                 'jawapan': jawapan,
             }
             questions.append(data)
-    print(f"TTBHS: Running {len(questions)} questions")
+    logging.info(f"TTBHS: Running {len(questions)} questions")
     return questions
 
 def read_bmpt3() -> List[Dict]:
@@ -96,67 +107,89 @@ def evaluate(questions:List[Dict]) -> float:
         correct += most_common(q['output']) == q['jawapan']
     return (correct / len(filtered)) * 100
 
-def run_test(args, model, tokenizer, questions, n_shots) -> Tuple[List[Dict], float]:
+def run_test(args, model, tokenizer, questions, n_shots, n_repeat:int = 5) -> Tuple[List[Dict], float]:
 
+    generate_kwargs = dict(
+                    max_new_tokens=3,
+                    top_p=0.95,
+                    top_k=50,
+                    temperature=0.5,
+                    # if --deterministic is set, then do_sample = False
+                    do_sample=not args.deterministic,
+                    num_beams=1,
+                    repetition_penalty=1.05,
+                )
     # disable = not args.tqdm: if args.tqdm is True, disable is False (bar shown)
     #                          if args.tqdm is False, disable is True (bar hidden)
+    set_seed(1234)
+
     for i in tqdm(range(len(questions)), leave=True, disable = not args.tqdm):
         prompts = []
+        
         if n_shots:
             arange = set(range(len(questions)))
-            shots = random.sample(sorted(arange - {i}), n_shots)
+            if args.deterministic:
+                shots = sorted(arange - {i})[:n_shots]
+            else:
+                shots = random.sample(sorted(arange - {i}), n_shots)
             for no, s in enumerate(shots):
                 prompts.append(f'Contoh soalan {no + 1}\n' + convert_prompt(questions[s], answer = True))
         prompts.append(convert_prompt(questions[i]))
         prompt = '\n\n'.join(prompts)
-        inputs = tokenizer([prompt], return_tensors='pt', add_special_tokens=False).to('cuda')
+        inputs = tokenizer([prompt], return_tensors='pt', add_special_tokens=False).to(args.device)
         inputs.pop('token_type_ids', None)
-        repeat = []
-        for _ in range(5):
+        repeat, debug_output, debug_toks = [], [], []
+        for _ in range(n_repeat):
             try:
-                generate_kwargs = dict(
-                    inputs,
-                    max_new_tokens=3,
-                    top_p=0.95,
-                    top_k=50,
-                    temperature=0.5,
-                    do_sample=True,
-                    num_beams=1,
-                    repetition_penalty=1.05,
-                )
-                r = model.generate(**generate_kwargs)
-                r = tokenizer.decode(r[0]).split('jawapan:')[-1].strip().split()
+                r = model.generate(**inputs,**generate_kwargs)
+                r = tokenizer.decode(r[0]).split('jawapan:')[-1]
+                debug_output.append(r)
+                r = r.strip().split()
                 repeat.append(r[0].replace('.', '').replace('</s>', '').split('\\')[0].split('/')[0])
         
             except Exception as e:
-                print(e)
+                print(e, r)
                 pass
-        
+        questions[i]['input_tok'] = inputs.input_ids.tolist()[0]
+        questions[i]['prompt'] = prompt
         questions[i]['output'] = repeat
+        questions[i]['debug'] = debug_output
 
     # with open(f'{args.output_folder}/output-{n_shots}shot-{args.name}.json', 'w') as fopen:
     #     json.dump(questions, fopen)
 
     score = evaluate(questions)
 
-    # print (f"{n_shots}shot: {score}")
+    # logging.error (f"{n_shots}shot: {score}")
 
     return questions, score
 
 def main():
 
     args = parse_args()
+    logging.basicConfig(filename=f'eval.log', level=logging.INFO)
+    logger.info(args)
 
     timestamp = datetime.datetime.now().replace(microsecond=0).isoformat().replace(":", "_")
 
     os.makedirs(args.output_folder + '/' + timestamp, exist_ok=True)
 
+    if not args.run_full:
+        logger.warning("The recent change sets the default value of run_full to `False`, if this is not intended "
+              "run `evaluate.py` with `--run_full`")
+    if args.deterministic:
+        logger.warning("No sampling should only be used for debugging purposes.\n"
+                       "This will disable random n_shots sampling and use the first n_shots instead.\n"
+                       "Repeats will also be disabled.")
+        args.repeat = 1
+        logger.warning("Setting repeat to 1")
+
     tokenizer = AutoTokenizer.from_pretrained(args.model_path, trust_remote_code = True)
     model = AutoModelForCausalLM.from_pretrained(
         args.model_path,
         trust_remote_code = True,
         torch_dtype = torch.float16,
-        device_map=device
+        device_map=args.device if args.device else device
     )
 
     config = {}
@@ -164,10 +197,6 @@ def main():
     config['model_args'] = args.model_path
     config['full_run'] = args.run_full
     config['timestamp'] = timestamp
-    
-    if not args.run_full:
-        print("The recent change sets the default value of run_full to `False`, if this is not intended "
-              "run `evaluate.py` with `--run_full`")
 
     qns_ttbhs = read_ttbhs()
     qns_bmpt3 = read_bmpt3()
@@ -177,30 +206,34 @@ def main():
     ttbhs_scores = {}
     bmpt3_scores = {}
 
+    num_fewshots = [int(x) for x in args.num_fewshots.split(',')]
+
 
     if args.run_full: #Full 0,1,3 for both tests
-        #tatabahasa
-        for i in [0,1,3]:
+        # === tatabahasa ===
+        for i in num_fewshots:
             q, s = run_test(args, 
                 model=model, 
                 tokenizer=tokenizer,
-                questions=qns_ttbhs,
-                n_shots=i)
+                questions=qns_ttbhs[:args.limit] if args.limit else qns_ttbhs,
+                n_shots=i,
+               n_repeat=args.repeat)
             with open(f'{args.output_folder}/{timestamp}/output-tatabahasa-{i}shot-{args.name}.json', 'w') as fopen:
-                json.dump(q, fopen)
+                json.dump(q, fopen, indent=4)
             ttbhs_scores[f'n_shot={i}'] = s
-        #bmpt3
-        for i in [0,1,3]:
+        scores['tatabahasa'] = ttbhs_scores
+        # === bmpt3 ===
+        for i in num_fewshots:
             q, s = run_test(args, 
                 model=model, 
                 tokenizer=tokenizer,
-                questions=qns_bmpt3,
-                n_shots=i)
+                questions=qns_bmpt3[:args.limit] if args.limit else qns_bmpt3,
+                n_shots=i,
+                n_repeat=args.repeat)
             with open(f'{args.output_folder}/{timestamp}/output-bmpt3-{i}shot-{args.name}.json', 'w') as fopen:
-                json.dump(q, fopen)
+                json.dump(q, fopen, indent= 4)
             bmpt3_scores[f'n_shot={i}'] = s
-        
-        scores['tatabahasa'] = ttbhs_scores
+
         scores['bmpt3'] = bmpt3_scores
         with open(f'{args.output_folder}/{timestamp}/score.json', 'w') as fopen:
             data = {"results": scores}

From 2d97664ec6c9076019a07e25de2a4ec001222b78 Mon Sep 17 00:00:00 2001
From: wheynelau <waynelau15045@gmail.com>
Date: Thu, 13 Jun 2024 08:19:51 +0000
Subject: [PATCH 09/12] chore: add output folder

---
 .gitignore | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index 0b30d5b..2229174 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,4 +3,5 @@
 *quiz-tatabahasa.jsonl
 
 .DS_Store
-*.pbs
\ No newline at end of file
+*.pbs
+output
\ No newline at end of file

From 6e01cc4246ba7f6ea81288ba1b8b7121131e0e25 Mon Sep 17 00:00:00 2001
From: wheynelau <waynelau15045@gmail.com>
Date: Thu, 13 Jun 2024 08:22:57 +0000
Subject: [PATCH 10/12] chore: lint

---
 evaluate.py | 298 ++++++++++++++++++++++++++++++++--------------------
 1 file changed, 182 insertions(+), 116 deletions(-)

diff --git a/evaluate.py b/evaluate.py
index 71ba933..8e8cae4 100644
--- a/evaluate.py
+++ b/evaluate.py
@@ -10,9 +10,11 @@
 import logging
 from typing import List, Dict, Tuple
 
-logging.basicConfig(level=logging.INFO,
-                    format='%(asctime)s - %(levelname)s - %(message)s',
-                    handlers=[logging.StreamHandler()])
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s - %(levelname)s - %(message)s",
+    handlers=[logging.StreamHandler()],
+)
 
 
 logger = logging.getLogger(__name__)
@@ -24,65 +26,83 @@
 
 # TODO: Convert prints to logs
 
+
 def parse_args():
     parser = argparse.ArgumentParser()
-    parser.add_argument("--model_path", type=str, help="Path to pretrained model", required=True)
-    parser.add_argument("--name", type=str,help="Output File Name", default="model_name", required=True)
-    parser.add_argument("--run_full", help="run 0, 1, 3 shots", action = "store_true")
-    parser.add_argument("--tqdm", help='whether to run tqdm', action = "store_true")
-    parser.add_argument("--num_fewshots", help="Number of few shots as a string etc '0,1,2'", default = "0,1,3")
+    parser.add_argument(
+        "--model_path", type=str, help="Path to pretrained model", required=True
+    )
+    parser.add_argument(
+        "--name", type=str, help="Output File Name", default="model_name", required=True
+    )
+    parser.add_argument("--run_full", help="run 0, 1, 3 shots", action="store_true")
+    parser.add_argument("--tqdm", help="whether to run tqdm", action="store_true")
+    parser.add_argument(
+        "--num_fewshots",
+        help="Number of few shots as a string etc '0,1,2'",
+        default="0,1,3",
+    )
     parser.add_argument("--output_folder", type=str, default="output")
-    parser.add_argument("--repeat",type = int, default=5, help= "Number of loops per prompt")
-    parser.add_argument("--limit",type = int, default=0, help = 'run N number of samples')
-    parser.add_argument("--device", type=str, help = "device map")
-    parser.add_argument("--deterministic", help="disable sampling", action = "store_true")
+    parser.add_argument(
+        "--repeat", type=int, default=5, help="Number of loops per prompt"
+    )
+    parser.add_argument("--limit", type=int, default=0, help="run N number of samples")
+    parser.add_argument("--device", type=str, help="device map")
+    parser.add_argument("--deterministic", help="disable sampling", action="store_true")
     args = parser.parse_args()
     return args
 
+
 def read_ttbhs() -> List[Dict]:
     questions = []
     with open("quiz-tatabahasa.jsonl") as fopen:
         for no, l in enumerate(fopen):
             l = json.loads(l)
-            soalan = [l['question']]
+            soalan = [l["question"]]
             jawapan = None
-            for c, k in l['choices'].items():
+            for c, k in l["choices"].items():
                 soalan.append(f"{c}. {k['text']}")
-                if k['answer']:
+                if k["answer"]:
                     jawapan = c
-            
+
             data = {
-                'no': no,
-                'objektif': 'Jawab soalan yang diberikan' if l['instruction'] is None else l['instruction'],
-                'soalan': '\n'.join(soalan),
-                'jawapan': jawapan,
+                "no": no,
+                "objektif": (
+                    "Jawab soalan yang diberikan"
+                    if l["instruction"] is None
+                    else l["instruction"]
+                ),
+                "soalan": "\n".join(soalan),
+                "jawapan": jawapan,
             }
             questions.append(data)
     logging.info(f"TTBHS: Running {len(questions)} questions")
     return questions
 
+
 def read_bmpt3() -> List[Dict]:
-    with open('BM-A-pt3') as fopen:
+    with open("BM-A-pt3") as fopen:
         text = fopen.read()
-    
+
     questions = []
-    for t in text.split('no: ')[1:]:
+    for t in text.split("no: ")[1:]:
         t = t.strip()
-        no = t.split('\n')[0]
-        objektif = t.split('objektif: ')[1].split('\n')[0]
-        soalan = t.split('soalan:')[1].split('jawapan:')[0].strip()
-        jawapan = t.split('jawapan: ')[1].split(',')[0].strip()
+        no = t.split("\n")[0]
+        objektif = t.split("objektif: ")[1].split("\n")[0]
+        soalan = t.split("soalan:")[1].split("jawapan:")[0].strip()
+        jawapan = t.split("jawapan: ")[1].split(",")[0].strip()
         data = {
-            'no': no,
-            'objektif': objektif,
-            'soalan': soalan,
-            'jawapan': jawapan,
+            "no": no,
+            "objektif": objektif,
+            "soalan": soalan,
+            "jawapan": jawapan,
         }
         questions.append(data)
     print(f"BM-A-PT3: Running {len(questions)} questions")
     return questions
 
-def convert_prompt(row, answer = False) -> str:
+
+def convert_prompt(row, answer=False) -> str:
     if answer:
         prompt = f"""
 objektif: {row['objektif']}
@@ -97,35 +117,42 @@ def convert_prompt(row, answer = False) -> str:
     """
     return prompt.strip()
 
-def most_common(l:List) -> str:
+
+def most_common(l: List) -> str:
     return max(set(l), key=l.count)
 
-def evaluate(questions:List[Dict]) -> float:
-    filtered = [q for q in questions if 'output' in q]
+
+def evaluate(questions: List[Dict]) -> float:
+    filtered = [q for q in questions if "output" in q]
     correct = 0
     for q in filtered:
-        correct += most_common(q['output']) == q['jawapan']
+        correct += most_common(q["output"]) == q["jawapan"]
     return (correct / len(filtered)) * 100
 
-def run_test(args, model, tokenizer, questions, n_shots, n_repeat:int = 5) -> Tuple[List[Dict], float]:
+
+def run_test(
+    args, model, tokenizer, questions, n_shots, n_repeat: int = 5
+) -> Tuple[List[Dict], float]:
 
     generate_kwargs = dict(
-                    max_new_tokens=3,
-                    top_p=0.95,
-                    top_k=50,
-                    temperature=0.5,
-                    # if --deterministic is set, then do_sample = False
-                    do_sample=not args.deterministic,
-                    num_beams=1,
-                    repetition_penalty=1.05,
-                )
+        max_new_tokens=3,
+        top_p=0.95,
+        top_k=50,
+        temperature=0.5,
+        # if --deterministic is set, then do_sample = False
+        do_sample=not args.deterministic,
+        num_beams=1,
+        repetition_penalty=1.05,
+    )
+
+    if args.deterministic:
+        # try to be deterministic every round
+        set_seed(1234)
     # disable = not args.tqdm: if args.tqdm is True, disable is False (bar shown)
     #                          if args.tqdm is False, disable is True (bar hidden)
-    set_seed(1234)
-
-    for i in tqdm(range(len(questions)), leave=True, disable = not args.tqdm):
+    for i in tqdm(range(len(questions)), leave=True, disable=not args.tqdm):
         prompts = []
-        
+
         if n_shots:
             arange = set(range(len(questions)))
             if args.deterministic:
@@ -133,27 +160,38 @@ def run_test(args, model, tokenizer, questions, n_shots, n_repeat:int = 5) -> Tu
             else:
                 shots = random.sample(sorted(arange - {i}), n_shots)
             for no, s in enumerate(shots):
-                prompts.append(f'Contoh soalan {no + 1}\n' + convert_prompt(questions[s], answer = True))
+                prompts.append(
+                    f"Contoh soalan {no + 1}\n"
+                    + convert_prompt(questions[s], answer=True)
+                )
         prompts.append(convert_prompt(questions[i]))
-        prompt = '\n\n'.join(prompts)
-        inputs = tokenizer([prompt], return_tensors='pt', add_special_tokens=False).to(args.device)
-        inputs.pop('token_type_ids', None)
+        prompt = "\n\n".join(prompts)
+        inputs = tokenizer([prompt], return_tensors="pt", add_special_tokens=False).to(
+            args.device
+        )
+        inputs.pop("token_type_ids", None)
         repeat, debug_output, debug_toks = [], [], []
         for _ in range(n_repeat):
             try:
-                r = model.generate(**inputs,**generate_kwargs)
-                r = tokenizer.decode(r[0]).split('jawapan:')[-1]
+                r = model.generate(**inputs, **generate_kwargs)
+                r = tokenizer.decode(r[0]).split("jawapan:")[-1]
                 debug_output.append(r)
                 r = r.strip().split()
-                repeat.append(r[0].replace('.', '').replace('</s>', '').split('\\')[0].split('/')[0])
-        
+                repeat.append(
+                    r[0]
+                    .replace(".", "")
+                    .replace("</s>", "")
+                    .split("\\")[0]
+                    .split("/")[0]
+                )
+
             except Exception as e:
                 print(e, r)
                 pass
-        questions[i]['input_tok'] = inputs.input_ids.tolist()[0]
-        questions[i]['prompt'] = prompt
-        questions[i]['output'] = repeat
-        questions[i]['debug'] = debug_output
+        questions[i]["input_tok"] = inputs.input_ids.tolist()[0]
+        questions[i]["prompt"] = prompt
+        questions[i]["output"] = repeat
+        questions[i]["debug"] = debug_output
 
     # with open(f'{args.output_folder}/output-{n_shots}shot-{args.name}.json', 'w') as fopen:
     #     json.dump(questions, fopen)
@@ -164,39 +202,46 @@ def run_test(args, model, tokenizer, questions, n_shots, n_repeat:int = 5) -> Tu
 
     return questions, score
 
+
 def main():
 
     args = parse_args()
-    logging.basicConfig(filename=f'eval.log', level=logging.INFO)
+    logging.basicConfig(filename=f"eval.log", level=logging.INFO)
     logger.info(args)
 
-    timestamp = datetime.datetime.now().replace(microsecond=0).isoformat().replace(":", "_")
+    timestamp = (
+        datetime.datetime.now().replace(microsecond=0).isoformat().replace(":", "_")
+    )
 
-    os.makedirs(args.output_folder + '/' + timestamp, exist_ok=True)
+    os.makedirs(args.output_folder + "/" + timestamp, exist_ok=True)
 
     if not args.run_full:
-        logger.warning("The recent change sets the default value of run_full to `False`, if this is not intended "
-              "run `evaluate.py` with `--run_full`")
+        logger.warning(
+            "The recent change sets the default value of run_full to `False`, if this is not intended "
+            "run `evaluate.py` with `--run_full`"
+        )
     if args.deterministic:
-        logger.warning("No sampling should only be used for debugging purposes.\n"
-                       "This will disable random n_shots sampling and use the first n_shots instead.\n"
-                       "Repeats will also be disabled.")
+        logger.warning(
+            "No sampling should only be used for debugging purposes.\n"
+            "This will disable random n_shots sampling and use the first n_shots instead.\n"
+            "Repeats will also be disabled."
+        )
         args.repeat = 1
         logger.warning("Setting repeat to 1")
 
-    tokenizer = AutoTokenizer.from_pretrained(args.model_path, trust_remote_code = True)
+    tokenizer = AutoTokenizer.from_pretrained(args.model_path, trust_remote_code=True)
     model = AutoModelForCausalLM.from_pretrained(
         args.model_path,
-        trust_remote_code = True,
-        torch_dtype = torch.float16,
-        device_map=args.device if args.device else device
+        trust_remote_code=True,
+        torch_dtype=torch.float16,
+        device_map=args.device if args.device else device,
     )
 
     config = {}
-    config['run_name'] = args.name
-    config['model_args'] = args.model_path
-    config['full_run'] = args.run_full
-    config['timestamp'] = timestamp
+    config["run_name"] = args.name
+    config["model_args"] = args.model_path
+    config["full_run"] = args.run_full
+    config["timestamp"] = timestamp
 
     qns_ttbhs = read_ttbhs()
     qns_bmpt3 = read_bmpt3()
@@ -206,76 +251,97 @@ def main():
     ttbhs_scores = {}
     bmpt3_scores = {}
 
-    num_fewshots = [int(x) for x in args.num_fewshots.split(',')]
+    num_fewshots = [int(x) for x in args.num_fewshots.split(",")]
 
-
-    if args.run_full: #Full 0,1,3 for both tests
+    if args.run_full:  # Full 0,1,3 for both tests
         # === tatabahasa ===
         for i in num_fewshots:
-            q, s = run_test(args, 
-                model=model, 
+            q, s = run_test(
+                args,
+                model=model,
                 tokenizer=tokenizer,
-                questions=qns_ttbhs[:args.limit] if args.limit else qns_ttbhs,
+                questions=qns_ttbhs[: args.limit] if args.limit else qns_ttbhs,
                 n_shots=i,
-               n_repeat=args.repeat)
-            with open(f'{args.output_folder}/{timestamp}/output-tatabahasa-{i}shot-{args.name}.json', 'w') as fopen:
+                n_repeat=args.repeat,
+            )
+            with open(
+                f"{args.output_folder}/{timestamp}/output-tatabahasa-{i}shot-{args.name}.json",
+                "w",
+            ) as fopen:
                 json.dump(q, fopen, indent=4)
-            ttbhs_scores[f'n_shot={i}'] = s
-        scores['tatabahasa'] = ttbhs_scores
+            ttbhs_scores[f"n_shot={i}"] = s
+        scores["tatabahasa"] = ttbhs_scores
         # === bmpt3 ===
         for i in num_fewshots:
-            q, s = run_test(args, 
-                model=model, 
+            q, s = run_test(
+                args,
+                model=model,
                 tokenizer=tokenizer,
-                questions=qns_bmpt3[:args.limit] if args.limit else qns_bmpt3,
+                questions=qns_bmpt3[: args.limit] if args.limit else qns_bmpt3,
                 n_shots=i,
-                n_repeat=args.repeat)
-            with open(f'{args.output_folder}/{timestamp}/output-bmpt3-{i}shot-{args.name}.json', 'w') as fopen:
-                json.dump(q, fopen, indent= 4)
-            bmpt3_scores[f'n_shot={i}'] = s
+                n_repeat=args.repeat,
+            )
+            with open(
+                f"{args.output_folder}/{timestamp}/output-bmpt3-{i}shot-{args.name}.json",
+                "w",
+            ) as fopen:
+                json.dump(q, fopen, indent=4)
+            bmpt3_scores[f"n_shot={i}"] = s
 
-        scores['bmpt3'] = bmpt3_scores
-        with open(f'{args.output_folder}/{timestamp}/score.json', 'w') as fopen:
+        scores["bmpt3"] = bmpt3_scores
+        with open(f"{args.output_folder}/{timestamp}/score.json", "w") as fopen:
             data = {"results": scores}
             conf = {"config": config}
             merged = {**data, **conf}
             json.dump(merged, fopen, indent=4)
-    
-    else: #3 shot for 5 qns - for debugging
-        #TODO: Can we remove the for loop below? 
-        #tatabahasa
+
+    else:  # 3 shot for 5 qns - for debugging
+        # TODO: Can we remove the for loop below?
+        # tatabahasa
         for i in [3]:
-            q, s = run_test(args, 
-                model=model, 
+            q, s = run_test(
+                args,
+                model=model,
                 tokenizer=tokenizer,
                 questions=qns_ttbhs[:5],
-                n_shots=i)
-            with open(f'{args.output_folder}/{timestamp}/output-tatabahasa-{i}shot-{args.name}.json', 'w') as fopen:
+                n_shots=i,
+            )
+            with open(
+                f"{args.output_folder}/{timestamp}/output-tatabahasa-{i}shot-{args.name}.json",
+                "w",
+            ) as fopen:
                 json.dump(q, fopen)
-            ttbhs_scores[f'n_shot={i}'] = s
-        #bmpt3
+            ttbhs_scores[f"n_shot={i}"] = s
+        # bmpt3
         for i in [3]:
-            q, s = run_test(args, 
-                model=model, 
+            q, s = run_test(
+                args,
+                model=model,
                 tokenizer=tokenizer,
                 questions=qns_bmpt3[:5],
-                n_shots=i)
-            with open(f'{args.output_folder}/{timestamp}/output-bmpt3-{i}shot-{args.name}.json', 'w') as fopen:
+                n_shots=i,
+            )
+            with open(
+                f"{args.output_folder}/{timestamp}/output-bmpt3-{i}shot-{args.name}.json",
+                "w",
+            ) as fopen:
                 json.dump(q, fopen)
-            bmpt3_scores[f'n_shot={i}'] = s
-        
-        scores['tatabahasa'] = ttbhs_scores
-        scores['bmpt3'] = bmpt3_scores
-        with open(f'{args.output_folder}/{timestamp}/score.json', 'w') as fopen:
+            bmpt3_scores[f"n_shot={i}"] = s
+
+        scores["tatabahasa"] = ttbhs_scores
+        scores["bmpt3"] = bmpt3_scores
+        with open(f"{args.output_folder}/{timestamp}/score.json", "w") as fopen:
             data = {"results": scores}
             conf = {"config": config}
             merged = {**data, **conf}
             json.dump(merged, fopen, indent=4)
     try:
         import pandas as pd
+
         print(pd.DataFrame(scores).to_markdown())
     except ImportError:
         print(scores)
 
+
 if __name__ == "__main__":
     main()

From 4161cf8819739d7f6184d5657e30efdfb481cbf2 Mon Sep 17 00:00:00 2001
From: wheynelau <waynelau15045@gmail.com>
Date: Fri, 14 Jun 2024 05:20:09 +0000
Subject: [PATCH 11/12] refactor: simplify deterministic shot selection

---
 evaluate.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/evaluate.py b/evaluate.py
index 8e8cae4..74dd7b8 100644
--- a/evaluate.py
+++ b/evaluate.py
@@ -155,10 +155,11 @@ def run_test(
 
         if n_shots:
             arange = set(range(len(questions)))
-            if args.deterministic:
-                shots = sorted(arange - {i})[:n_shots]
-            else:
-                shots = random.sample(sorted(arange - {i}), n_shots)
+            shots = (
+                sorted(arange - {i})[:n_shots]  # deterministic: use the first n_shots
+                if args.deterministic
+                else random.sample(sorted(arange - {i}), n_shots)  # otherwise sample n_shots
+            )
             for no, s in enumerate(shots):
                 prompts.append(
                     f"Contoh soalan {no + 1}\n"

From 99ba3def1de8e5f5b3210606444773338e027873 Mon Sep 17 00:00:00 2001
From: wheynelau <waynelau15045@gmail.com>
Date: Wed, 19 Jun 2024 06:43:07 +0000
Subject: [PATCH 12/12] fix: check for file separator in --name
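
A --name containing "/" (e.g. a HF-style "org/model") would otherwise
be interpreted as a nested path when building output file names, so
"/" is now replaced with "_". The output folder is also created with
os.path.join, --device now defaults to 'cuda', and the pandas
markdown print is disabled for now.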

---
 evaluate.py | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/evaluate.py b/evaluate.py
index 74dd7b8..5dbc906 100644
--- a/evaluate.py
+++ b/evaluate.py
@@ -39,7 +39,7 @@ def parse_args():
     parser.add_argument("--tqdm", help="whether to run tqdm", action="store_true")
     parser.add_argument(
         "--num_fewshots",
-        help="Number of few shots as a string etc '0,1,2'",
+        help="Number of few shots as a string etc '0,1,3'",
         default="0,1,3",
     )
     parser.add_argument("--output_folder", type=str, default="output")
@@ -47,9 +47,12 @@ def parse_args():
         "--repeat", type=int, default=5, help="Number of loops per prompt"
     )
     parser.add_argument("--limit", type=int, default=0, help="run N number of samples")
-    parser.add_argument("--device", type=str, help="device map")
+    parser.add_argument("--device", type=str, help="device map", default='cuda')
     parser.add_argument("--deterministic", help="disable sampling", action="store_true")
     args = parser.parse_args()
+    if "/" in args.name:
+        logging.warning("name should not contain /")
+        args.name = args.name.replace("/", "_")
     return args
 
 
@@ -187,7 +190,7 @@ def run_test(
                 )
 
             except Exception as e:
-                print(e, r)
+                print(e)
                 pass
         questions[i]["input_tok"] = inputs.input_ids.tolist()[0]
         questions[i]["prompt"] = prompt
@@ -214,7 +217,7 @@ def main():
         datetime.datetime.now().replace(microsecond=0).isoformat().replace(":", "_")
     )
 
-    os.makedirs(args.output_folder + "/" + timestamp, exist_ok=True)
+    os.makedirs(os.path.join(args.output_folder,timestamp), exist_ok=True)
 
     if not args.run_full:
         logger.warning(
@@ -336,12 +339,12 @@ def main():
             conf = {"config": config}
             merged = {**data, **conf}
             json.dump(merged, fopen, indent=4)
-    try:
-        import pandas as pd
+    # try:
+    #     import pandas as pd
 
-        print(pd.DataFrame(scores).to_markdown())
-    except ImportError:
-        print(scores)
+    #     print(pd.DataFrame(scores).to_markdown())
+    # except ImportError:
+    print(scores)
 
 
 if __name__ == "__main__":