From e3610b149f8609c0dd1996eeb9d113f4758116d5 Mon Sep 17 00:00:00 2001 From: wheynelau <waynelau15045@gmail.com> Date: Tue, 11 Jun 2024 13:16:42 +0800 Subject: [PATCH 01/12] fix: python>3.9 sample does not work on set Sampling from a set deprecated since Python 3.9 and will be removed in a subsequent version. --- evaluate.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/evaluate.py b/evaluate.py index a8be841..4263534 100644 --- a/evaluate.py +++ b/evaluate.py @@ -7,6 +7,8 @@ import argparse import datetime +from typing import List, Dict, Tuple + random.seed(0) if torch.cuda.is_available(): @@ -14,18 +16,17 @@ else: device = "cpu" - def parse_args(): parser = argparse.ArgumentParser() parser.add_argument("--model_path", type=str, help="Path to pretrained model", required=True) parser.add_argument("--name", type=str,help="Output File Name", default="model_name", required=True) parser.add_argument("--run_full", type=str,help="run 0, 1, 3 shots", default="true") - + parser.add_argument("--no_tqdm", type=bool, help='whether to disable tqdm', default= False) parser.add_argument("--output_folder", type=str, default="output") args = parser.parse_args() return args -def read_ttbhs(): +def read_ttbhs() -> List[Dict]: questions = [] with open("quiz-tatabahasa.jsonl") as fopen: for no, l in enumerate(fopen): @@ -47,7 +48,7 @@ def read_ttbhs(): print(f"Running {len(questions)} questions") return questions -def read_bmpt3(): +def read_bmpt3() -> List[Dict]: with open('BM-A-pt3') as fopen: text = fopen.read() @@ -68,7 +69,7 @@ def read_bmpt3(): return questions -def convert_prompt(row, answer = False): +def convert_prompt(row, answer = False) -> str: if answer: prompt = f""" objektif: {row['objektif']} @@ -86,20 +87,20 @@ def convert_prompt(row, answer = False): def most_common(l): return max(set(l), key=l.count) -def evaluate(questions): +def evaluate(questions:List[Dict]) -> float: filtered = [q for q in questions if 'output' in q] correct = 0 for q in filtered: correct += most_common(q['output']) == q['jawapan'] return (correct / len(filtered)) * 100 -def run_test(args, model, tokenizer, questions, n_shots): +def run_test(args, model, tokenizer, questions, n_shots) -> Tuple[List[Dict], float]: - for i in tqdm(range(len(questions))): + for i in tqdm(range(len(questions)), leave=True, disable = args.no_tqdm): prompts = [] if n_shots: arange = set(range(len(questions))) - shots = random.sample(arange - {i}, n_shots) + shots = random.sample(sorted(arange - {i}), n_shots) for no, s in enumerate(shots): prompts.append(f'Contoh soalan {no + 1}\n' + convert_prompt(questions[s], answer = True)) prompts.append(convert_prompt(questions[i])) From 90110a41d1d6884c1c04470e7d36235939c3d739 Mon Sep 17 00:00:00 2001 From: wheynelau <waynelau15045@gmail.com> Date: Tue, 11 Jun 2024 13:21:36 +0800 Subject: [PATCH 02/12] docs and feat: add docs and change flags in args --- evaluate.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/evaluate.py b/evaluate.py index 4263534..933ac4b 100644 --- a/evaluate.py +++ b/evaluate.py @@ -20,8 +20,8 @@ def parse_args(): parser = argparse.ArgumentParser() parser.add_argument("--model_path", type=str, help="Path to pretrained model", required=True) parser.add_argument("--name", type=str,help="Output File Name", default="model_name", required=True) - parser.add_argument("--run_full", type=str,help="run 0, 1, 3 shots", default="true") - parser.add_argument("--no_tqdm", type=bool, help='whether to disable tqdm', default= 
False) + parser.add_argument("--run_full", type=bool, help="run 0, 1, 3 shots", action = "store_true") + parser.add_argument("--no_tqdm", type=bool, help='whether to disable tqdm', action = "store_true") parser.add_argument("--output_folder", type=str, default="output") args = parser.parse_args() return args From 776df525fa14f7bf587a0342d49bbcada0959368 Mon Sep 17 00:00:00 2001 From: wheynelau <waynelau15045@gmail.com> Date: Tue, 11 Jun 2024 13:26:43 +0800 Subject: [PATCH 03/12] doc: add warning for run_full --- evaluate.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/evaluate.py b/evaluate.py index 933ac4b..8205166 100644 --- a/evaluate.py +++ b/evaluate.py @@ -20,8 +20,8 @@ def parse_args(): parser = argparse.ArgumentParser() parser.add_argument("--model_path", type=str, help="Path to pretrained model", required=True) parser.add_argument("--name", type=str,help="Output File Name", default="model_name", required=True) - parser.add_argument("--run_full", type=bool, help="run 0, 1, 3 shots", action = "store_true") - parser.add_argument("--no_tqdm", type=bool, help='whether to disable tqdm', action = "store_true") + parser.add_argument("--run_full", help="run 0, 1, 3 shots", action = "store_true") + parser.add_argument("--tqdm", help='whether to run tqdm', action = "store_true") parser.add_argument("--output_folder", type=str, default="output") args = parser.parse_args() return args @@ -161,6 +161,9 @@ def main(): config['full_run'] = args.run_full config['timestamp'] = timestamp + if not args.run_full: + print("The recent change sets the default value of run_full to `False`, if this is not intended " + "run `evaluate.py` with `--run_full`") qns_ttbhs = read_ttbhs() qns_bmpt3 = read_bmpt3() From bc3f00190f2f8ada4ee102637c0927697b2b9ba4 Mon Sep 17 00:00:00 2001 From: wheynelau <waynelau15045@gmail.com> Date: Tue, 11 Jun 2024 13:29:26 +0800 Subject: [PATCH 04/12] fix: no_tqdm to tqdm --- evaluate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/evaluate.py b/evaluate.py index 8205166..499a79e 100644 --- a/evaluate.py +++ b/evaluate.py @@ -96,7 +96,7 @@ def evaluate(questions:List[Dict]) -> float: def run_test(args, model, tokenizer, questions, n_shots) -> Tuple[List[Dict], float]: - for i in tqdm(range(len(questions)), leave=True, disable = args.no_tqdm): + for i in tqdm(range(len(questions)), leave=True, disable = args.tqdm): prompts = [] if n_shots: arange = set(range(len(questions))) From 67196b2667ee68f632b7ac10bc3c8607571998bc Mon Sep 17 00:00:00 2001 From: wheynelau <waynelau15045@gmail.com> Date: Tue, 11 Jun 2024 13:36:43 +0800 Subject: [PATCH 05/12] docs: add docs for tqdm and some functions --- evaluate.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/evaluate.py b/evaluate.py index 499a79e..b83683e 100644 --- a/evaluate.py +++ b/evaluate.py @@ -84,7 +84,7 @@ def convert_prompt(row, answer = False) -> str: """ return prompt.strip() -def most_common(l): +def most_common(l:List) -> str: return max(set(l), key=l.count) def evaluate(questions:List[Dict]) -> float: @@ -96,7 +96,9 @@ def evaluate(questions:List[Dict]) -> float: def run_test(args, model, tokenizer, questions, n_shots) -> Tuple[List[Dict], float]: - for i in tqdm(range(len(questions)), leave=True, disable = args.tqdm): + # not args.tqdm => if true, then disable = False => enable tqdm + # => if false, then disable = True => disable tqdm + for i in tqdm(range(len(questions)), leave=True, disable = not args.tqdm): prompts = [] if n_shots: arange = 
set(range(len(questions))) @@ -205,6 +207,7 @@ def main(): json.dump(merged, fopen, indent=4) else: #3 shot for 5 qns - for debugging + #TODO: Can we remove the for loop below? #tatabahasa for i in [3]: q, s = run_test(args, From faf54bc9eceba3891f7d0fbe4aa4af5a8d9a7ee3 Mon Sep 17 00:00:00 2001 From: wheynelau <waynelau15045@gmail.com> Date: Tue, 11 Jun 2024 13:43:28 +0800 Subject: [PATCH 06/12] feat: add markdown print for readability --- evaluate.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/evaluate.py b/evaluate.py index b83683e..741680b 100644 --- a/evaluate.py +++ b/evaluate.py @@ -236,8 +236,11 @@ def main(): conf = {"config": config} merged = {**data, **conf} json.dump(merged, fopen, indent=4) - - print(scores) + try: + import pandas as pd + print(pd.DataFrame(scores).to_markdown()) + except ImportError: + print(scores) if __name__ == "__main__": main() From e6100184a7562601e7fcb4cb773b5b7e2684adb0 Mon Sep 17 00:00:00 2001 From: wheynelau <waynelau15045@gmail.com> Date: Tue, 11 Jun 2024 13:56:31 +0800 Subject: [PATCH 07/12] feat: add print questions for both benchmarks --- evaluate.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/evaluate.py b/evaluate.py index 741680b..35fa21f 100644 --- a/evaluate.py +++ b/evaluate.py @@ -16,6 +16,8 @@ else: device = "cpu" +# TODO: Convert prints to logs + def parse_args(): parser = argparse.ArgumentParser() parser.add_argument("--model_path", type=str, help="Path to pretrained model", required=True) @@ -45,7 +47,7 @@ def read_ttbhs() -> List[Dict]: 'jawapan': jawapan, } questions.append(data) - print(f"Running {len(questions)} questions") + print(f"TTBHS: Running {len(questions)} questions") return questions def read_bmpt3() -> List[Dict]: @@ -66,7 +68,7 @@ def read_bmpt3() -> List[Dict]: 'jawapan': jawapan, } questions.append(data) - + print(f"BM-A-PT3: Running {len(questions)} questions") return questions def convert_prompt(row, answer = False) -> str: From 440bf34a1c0d77cb14430e0f4f7767fc77b3be3e Mon Sep 17 00:00:00 2001 From: wheynelau <waynelau15045@gmail.com> Date: Thu, 13 Jun 2024 08:18:24 +0000 Subject: [PATCH 08/12] feat!: multiple debugging arguments BREAKING CHANGE: Potentially breaking for previous workflows due to new arguments and changes to default values --- evaluate.py | 115 +++++++++++++++++++++++++++++++++------------------- 1 file changed, 74 insertions(+), 41 deletions(-) diff --git a/evaluate.py b/evaluate.py index 35fa21f..71ba933 100644 --- a/evaluate.py +++ b/evaluate.py @@ -1,4 +1,5 @@ -from transformers import AutoTokenizer, AutoModelForCausalLM +from doctest import debug +from transformers import AutoTokenizer, AutoModelForCausalLM, set_seed import torch from tqdm import tqdm import os @@ -6,10 +7,15 @@ import json import argparse import datetime - +import logging from typing import List, Dict, Tuple -random.seed(0) +logging.basicConfig(level=logging.INFO, + format='%(asctime)s - %(levelname)s - %(message)s', + handlers=[logging.StreamHandler()]) + + +logger = logging.getLogger(__name__) if torch.cuda.is_available(): device = "cuda" @@ -24,7 +30,12 @@ def parse_args(): parser.add_argument("--name", type=str,help="Output File Name", default="model_name", required=True) parser.add_argument("--run_full", help="run 0, 1, 3 shots", action = "store_true") parser.add_argument("--tqdm", help='whether to run tqdm', action = "store_true") + parser.add_argument("--num_fewshots", help="Number of few shots as a string etc '0,1,2'", default = "0,1,3") 
parser.add_argument("--output_folder", type=str, default="output") + parser.add_argument("--repeat",type = int, default=5, help= "Number of loops per prompt") + parser.add_argument("--limit",type = int, default=0, help = 'run N number of samples') + parser.add_argument("--device", type=str, help = "device map") + parser.add_argument("--deterministic", help="disable sampling", action = "store_true") args = parser.parse_args() return args @@ -47,7 +58,7 @@ def read_ttbhs() -> List[Dict]: 'jawapan': jawapan, } questions.append(data) - print(f"TTBHS: Running {len(questions)} questions") + logging.info(f"TTBHS: Running {len(questions)} questions") return questions def read_bmpt3() -> List[Dict]: @@ -96,67 +107,89 @@ def evaluate(questions:List[Dict]) -> float: correct += most_common(q['output']) == q['jawapan'] return (correct / len(filtered)) * 100 -def run_test(args, model, tokenizer, questions, n_shots) -> Tuple[List[Dict], float]: +def run_test(args, model, tokenizer, questions, n_shots, n_repeat:int = 5) -> Tuple[List[Dict], float]: + generate_kwargs = dict( + max_new_tokens=3, + top_p=0.95, + top_k=50, + temperature=0.5, + # if no_sample is true, then do_sample = False + do_sample=not args.deterministic, + num_beams=1, + repetition_penalty=1.05, + ) # not args.tqdm => if true, then disable = False => enable tqdm # => if false, then disable = True => disable tqdm + set_seed(1234) + for i in tqdm(range(len(questions)), leave=True, disable = not args.tqdm): prompts = [] + if n_shots: arange = set(range(len(questions))) - shots = random.sample(sorted(arange - {i}), n_shots) + if args.deterministic: + shots = sorted(arange - {i})[:n_shots] + else: + shots = random.sample(sorted(arange - {i}), n_shots) for no, s in enumerate(shots): prompts.append(f'Contoh soalan {no + 1}\n' + convert_prompt(questions[s], answer = True)) prompts.append(convert_prompt(questions[i])) prompt = '\n\n'.join(prompts) - inputs = tokenizer([prompt], return_tensors='pt', add_special_tokens=False).to('cuda') + inputs = tokenizer([prompt], return_tensors='pt', add_special_tokens=False).to(args.device) inputs.pop('token_type_ids', None) - repeat = [] - for _ in range(5): + repeat, debug_output, debug_toks = [], [], [] + for _ in range(n_repeat): try: - generate_kwargs = dict( - inputs, - max_new_tokens=3, - top_p=0.95, - top_k=50, - temperature=0.5, - do_sample=True, - num_beams=1, - repetition_penalty=1.05, - ) - r = model.generate(**generate_kwargs) - r = tokenizer.decode(r[0]).split('jawapan:')[-1].strip().split() + r = model.generate(**inputs,**generate_kwargs) + r = tokenizer.decode(r[0]).split('jawapan:')[-1] + debug_output.append(r) + r = r.strip().split() repeat.append(r[0].replace('.', '').replace('</s>', '').split('\\')[0].split('/')[0]) except Exception as e: - print(e) + print(e, r) pass - + questions[i]['input_tok'] = inputs.input_ids.tolist()[0] + questions[i]['prompt'] = prompt questions[i]['output'] = repeat + questions[i]['debug'] = debug_output # with open(f'{args.output_folder}/output-{n_shots}shot-{args.name}.json', 'w') as fopen: # json.dump(questions, fopen) score = evaluate(questions) - # print (f"{n_shots}shot: {score}") + # logging.error (f"{n_shots}shot: {score}") return questions, score def main(): args = parse_args() + logging.basicConfig(filename=f'eval.log', level=logging.INFO) + logger.info(args) timestamp = datetime.datetime.now().replace(microsecond=0).isoformat().replace(":", "_") os.makedirs(args.output_folder + '/' + timestamp, exist_ok=True) + if not args.run_full: + logger.warning("The 
recent change sets the default value of run_full to `False`, if this is not intended " + "run `evaluate.py` with `--run_full`") + if args.deterministic: + logger.warning("No sampling should only be used for debugging purposes.\n" + "This will disable random n_shots sampling and use the first n_shots instead.\n" + "Repeats will also be disabled.") + args.repeat = 1 + logger.warning("Setting repeat to 1") + tokenizer = AutoTokenizer.from_pretrained(args.model_path, trust_remote_code = True) model = AutoModelForCausalLM.from_pretrained( args.model_path, trust_remote_code = True, torch_dtype = torch.float16, - device_map=device + device_map=args.device if args.device else device ) config = {} @@ -164,10 +197,6 @@ def main(): config['model_args'] = args.model_path config['full_run'] = args.run_full config['timestamp'] = timestamp - - if not args.run_full: - print("The recent change sets the default value of run_full to `False`, if this is not intended " - "run `evaluate.py` with `--run_full`") qns_ttbhs = read_ttbhs() qns_bmpt3 = read_bmpt3() @@ -177,30 +206,34 @@ def main(): ttbhs_scores = {} bmpt3_scores = {} + num_fewshots = [int(x) for x in args.num_fewshots.split(',')] + if args.run_full: #Full 0,1,3 for both tests - #tatabahasa - for i in [0,1,3]: + # === tatabahasa === + for i in num_fewshots: q, s = run_test(args, model=model, tokenizer=tokenizer, - questions=qns_ttbhs, - n_shots=i) + questions=qns_ttbhs[:args.limit] if args.limit else qns_ttbhs, + n_shots=i, + n_repeat=args.repeat) with open(f'{args.output_folder}/{timestamp}/output-tatabahasa-{i}shot-{args.name}.json', 'w') as fopen: - json.dump(q, fopen) + json.dump(q, fopen, indent=4) ttbhs_scores[f'n_shot={i}'] = s - #bmpt3 - for i in [0,1,3]: + scores['tatabahasa'] = ttbhs_scores + # === bmpt3 === + for i in num_fewshots: q, s = run_test(args, model=model, tokenizer=tokenizer, - questions=qns_bmpt3, - n_shots=i) + questions=qns_bmpt3[:args.limit] if args.limit else qns_bmpt3, + n_shots=i, + n_repeat=args.repeat) with open(f'{args.output_folder}/{timestamp}/output-bmpt3-{i}shot-{args.name}.json', 'w') as fopen: - json.dump(q, fopen) + json.dump(q, fopen, indent= 4) bmpt3_scores[f'n_shot={i}'] = s - - scores['tatabahasa'] = ttbhs_scores + scores['bmpt3'] = bmpt3_scores with open(f'{args.output_folder}/{timestamp}/score.json', 'w') as fopen: data = {"results": scores} From 2d97664ec6c9076019a07e25de2a4ec001222b78 Mon Sep 17 00:00:00 2001 From: wheynelau <waynelau15045@gmail.com> Date: Thu, 13 Jun 2024 08:19:51 +0000 Subject: [PATCH 09/12] chore: add output folder --- .gitignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 0b30d5b..2229174 100644 --- a/.gitignore +++ b/.gitignore @@ -3,4 +3,5 @@ *quiz-tatabahasa.jsonl .DS_Store -*.pbs \ No newline at end of file +*.pbs +output \ No newline at end of file From 6e01cc4246ba7f6ea81288ba1b8b7121131e0e25 Mon Sep 17 00:00:00 2001 From: wheynelau <waynelau15045@gmail.com> Date: Thu, 13 Jun 2024 08:22:57 +0000 Subject: [PATCH 10/12] chore: lint --- evaluate.py | 298 ++++++++++++++++++++++++++++++++-------------------- 1 file changed, 182 insertions(+), 116 deletions(-) diff --git a/evaluate.py b/evaluate.py index 71ba933..8e8cae4 100644 --- a/evaluate.py +++ b/evaluate.py @@ -10,9 +10,11 @@ import logging from typing import List, Dict, Tuple -logging.basicConfig(level=logging.INFO, - format='%(asctime)s - %(levelname)s - %(message)s', - handlers=[logging.StreamHandler()]) +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s - 
%(levelname)s - %(message)s", + handlers=[logging.StreamHandler()], +) logger = logging.getLogger(__name__) @@ -24,65 +26,83 @@ # TODO: Convert prints to logs + def parse_args(): parser = argparse.ArgumentParser() - parser.add_argument("--model_path", type=str, help="Path to pretrained model", required=True) - parser.add_argument("--name", type=str,help="Output File Name", default="model_name", required=True) - parser.add_argument("--run_full", help="run 0, 1, 3 shots", action = "store_true") - parser.add_argument("--tqdm", help='whether to run tqdm', action = "store_true") - parser.add_argument("--num_fewshots", help="Number of few shots as a string etc '0,1,2'", default = "0,1,3") + parser.add_argument( + "--model_path", type=str, help="Path to pretrained model", required=True + ) + parser.add_argument( + "--name", type=str, help="Output File Name", default="model_name", required=True + ) + parser.add_argument("--run_full", help="run 0, 1, 3 shots", action="store_true") + parser.add_argument("--tqdm", help="whether to run tqdm", action="store_true") + parser.add_argument( + "--num_fewshots", + help="Number of few shots as a string etc '0,1,2'", + default="0,1,3", + ) parser.add_argument("--output_folder", type=str, default="output") - parser.add_argument("--repeat",type = int, default=5, help= "Number of loops per prompt") - parser.add_argument("--limit",type = int, default=0, help = 'run N number of samples') - parser.add_argument("--device", type=str, help = "device map") - parser.add_argument("--deterministic", help="disable sampling", action = "store_true") + parser.add_argument( + "--repeat", type=int, default=5, help="Number of loops per prompt" + ) + parser.add_argument("--limit", type=int, default=0, help="run N number of samples") + parser.add_argument("--device", type=str, help="device map") + parser.add_argument("--deterministic", help="disable sampling", action="store_true") args = parser.parse_args() return args + def read_ttbhs() -> List[Dict]: questions = [] with open("quiz-tatabahasa.jsonl") as fopen: for no, l in enumerate(fopen): l = json.loads(l) - soalan = [l['question']] + soalan = [l["question"]] jawapan = None - for c, k in l['choices'].items(): + for c, k in l["choices"].items(): soalan.append(f"{c}. 
{k['text']}") - if k['answer']: + if k["answer"]: jawapan = c - + data = { - 'no': no, - 'objektif': 'Jawab soalan yang diberikan' if l['instruction'] is None else l['instruction'], - 'soalan': '\n'.join(soalan), - 'jawapan': jawapan, + "no": no, + "objektif": ( + "Jawab soalan yang diberikan" + if l["instruction"] is None + else l["instruction"] + ), + "soalan": "\n".join(soalan), + "jawapan": jawapan, } questions.append(data) logging.info(f"TTBHS: Running {len(questions)} questions") return questions + def read_bmpt3() -> List[Dict]: - with open('BM-A-pt3') as fopen: + with open("BM-A-pt3") as fopen: text = fopen.read() - + questions = [] - for t in text.split('no: ')[1:]: + for t in text.split("no: ")[1:]: t = t.strip() - no = t.split('\n')[0] - objektif = t.split('objektif: ')[1].split('\n')[0] - soalan = t.split('soalan:')[1].split('jawapan:')[0].strip() - jawapan = t.split('jawapan: ')[1].split(',')[0].strip() + no = t.split("\n")[0] + objektif = t.split("objektif: ")[1].split("\n")[0] + soalan = t.split("soalan:")[1].split("jawapan:")[0].strip() + jawapan = t.split("jawapan: ")[1].split(",")[0].strip() data = { - 'no': no, - 'objektif': objektif, - 'soalan': soalan, - 'jawapan': jawapan, + "no": no, + "objektif": objektif, + "soalan": soalan, + "jawapan": jawapan, } questions.append(data) print(f"BM-A-PT3: Running {len(questions)} questions") return questions -def convert_prompt(row, answer = False) -> str: + +def convert_prompt(row, answer=False) -> str: if answer: prompt = f""" objektif: {row['objektif']} @@ -97,35 +117,42 @@ def convert_prompt(row, answer = False) -> str: """ return prompt.strip() -def most_common(l:List) -> str: + +def most_common(l: List) -> str: return max(set(l), key=l.count) -def evaluate(questions:List[Dict]) -> float: - filtered = [q for q in questions if 'output' in q] + +def evaluate(questions: List[Dict]) -> float: + filtered = [q for q in questions if "output" in q] correct = 0 for q in filtered: - correct += most_common(q['output']) == q['jawapan'] + correct += most_common(q["output"]) == q["jawapan"] return (correct / len(filtered)) * 100 -def run_test(args, model, tokenizer, questions, n_shots, n_repeat:int = 5) -> Tuple[List[Dict], float]: + +def run_test( + args, model, tokenizer, questions, n_shots, n_repeat: int = 5 +) -> Tuple[List[Dict], float]: generate_kwargs = dict( - max_new_tokens=3, - top_p=0.95, - top_k=50, - temperature=0.5, - # if no_sample is true, then do_sample = False - do_sample=not args.deterministic, - num_beams=1, - repetition_penalty=1.05, - ) + max_new_tokens=3, + top_p=0.95, + top_k=50, + temperature=0.5, + # if no_sample is true, then do_sample = False + do_sample=not args.deterministic, + num_beams=1, + repetition_penalty=1.05, + ) + + if args.deterministic: + # try to be deterministic every round + set_seed(1234) # not args.tqdm => if true, then disable = False => enable tqdm # => if false, then disable = True => disable tqdm - set_seed(1234) - - for i in tqdm(range(len(questions)), leave=True, disable = not args.tqdm): + for i in tqdm(range(len(questions)), leave=True, disable=not args.tqdm): prompts = [] - + if n_shots: arange = set(range(len(questions))) if args.deterministic: @@ -133,27 +160,38 @@ def run_test(args, model, tokenizer, questions, n_shots, n_repeat:int = 5) -> Tu else: shots = random.sample(sorted(arange - {i}), n_shots) for no, s in enumerate(shots): - prompts.append(f'Contoh soalan {no + 1}\n' + convert_prompt(questions[s], answer = True)) + prompts.append( + f"Contoh soalan {no + 1}\n" + + 
convert_prompt(questions[s], answer=True) + ) prompts.append(convert_prompt(questions[i])) - prompt = '\n\n'.join(prompts) - inputs = tokenizer([prompt], return_tensors='pt', add_special_tokens=False).to(args.device) - inputs.pop('token_type_ids', None) + prompt = "\n\n".join(prompts) + inputs = tokenizer([prompt], return_tensors="pt", add_special_tokens=False).to( + args.device + ) + inputs.pop("token_type_ids", None) repeat, debug_output, debug_toks = [], [], [] for _ in range(n_repeat): try: - r = model.generate(**inputs,**generate_kwargs) - r = tokenizer.decode(r[0]).split('jawapan:')[-1] + r = model.generate(**inputs, **generate_kwargs) + r = tokenizer.decode(r[0]).split("jawapan:")[-1] debug_output.append(r) r = r.strip().split() - repeat.append(r[0].replace('.', '').replace('</s>', '').split('\\')[0].split('/')[0]) - + repeat.append( + r[0] + .replace(".", "") + .replace("</s>", "") + .split("\\")[0] + .split("/")[0] + ) + except Exception as e: print(e, r) pass - questions[i]['input_tok'] = inputs.input_ids.tolist()[0] - questions[i]['prompt'] = prompt - questions[i]['output'] = repeat - questions[i]['debug'] = debug_output + questions[i]["input_tok"] = inputs.input_ids.tolist()[0] + questions[i]["prompt"] = prompt + questions[i]["output"] = repeat + questions[i]["debug"] = debug_output # with open(f'{args.output_folder}/output-{n_shots}shot-{args.name}.json', 'w') as fopen: # json.dump(questions, fopen) @@ -164,39 +202,46 @@ def run_test(args, model, tokenizer, questions, n_shots, n_repeat:int = 5) -> Tu return questions, score + def main(): args = parse_args() - logging.basicConfig(filename=f'eval.log', level=logging.INFO) + logging.basicConfig(filename=f"eval.log", level=logging.INFO) logger.info(args) - timestamp = datetime.datetime.now().replace(microsecond=0).isoformat().replace(":", "_") + timestamp = ( + datetime.datetime.now().replace(microsecond=0).isoformat().replace(":", "_") + ) - os.makedirs(args.output_folder + '/' + timestamp, exist_ok=True) + os.makedirs(args.output_folder + "/" + timestamp, exist_ok=True) if not args.run_full: - logger.warning("The recent change sets the default value of run_full to `False`, if this is not intended " - "run `evaluate.py` with `--run_full`") + logger.warning( + "The recent change sets the default value of run_full to `False`, if this is not intended " + "run `evaluate.py` with `--run_full`" + ) if args.deterministic: - logger.warning("No sampling should only be used for debugging purposes.\n" - "This will disable random n_shots sampling and use the first n_shots instead.\n" - "Repeats will also be disabled.") + logger.warning( + "No sampling should only be used for debugging purposes.\n" + "This will disable random n_shots sampling and use the first n_shots instead.\n" + "Repeats will also be disabled." 
+ ) args.repeat = 1 logger.warning("Setting repeat to 1") - tokenizer = AutoTokenizer.from_pretrained(args.model_path, trust_remote_code = True) + tokenizer = AutoTokenizer.from_pretrained(args.model_path, trust_remote_code=True) model = AutoModelForCausalLM.from_pretrained( args.model_path, - trust_remote_code = True, - torch_dtype = torch.float16, - device_map=args.device if args.device else device + trust_remote_code=True, + torch_dtype=torch.float16, + device_map=args.device if args.device else device, ) config = {} - config['run_name'] = args.name - config['model_args'] = args.model_path - config['full_run'] = args.run_full - config['timestamp'] = timestamp + config["run_name"] = args.name + config["model_args"] = args.model_path + config["full_run"] = args.run_full + config["timestamp"] = timestamp qns_ttbhs = read_ttbhs() qns_bmpt3 = read_bmpt3() @@ -206,76 +251,97 @@ def main(): ttbhs_scores = {} bmpt3_scores = {} - num_fewshots = [int(x) for x in args.num_fewshots.split(',')] + num_fewshots = [int(x) for x in args.num_fewshots.split(",")] - - if args.run_full: #Full 0,1,3 for both tests + if args.run_full: # Full 0,1,3 for both tests # === tatabahasa === for i in num_fewshots: - q, s = run_test(args, - model=model, + q, s = run_test( + args, + model=model, tokenizer=tokenizer, - questions=qns_ttbhs[:args.limit] if args.limit else qns_ttbhs, + questions=qns_ttbhs[: args.limit] if args.limit else qns_ttbhs, n_shots=i, - n_repeat=args.repeat) - with open(f'{args.output_folder}/{timestamp}/output-tatabahasa-{i}shot-{args.name}.json', 'w') as fopen: + n_repeat=args.repeat, + ) + with open( + f"{args.output_folder}/{timestamp}/output-tatabahasa-{i}shot-{args.name}.json", + "w", + ) as fopen: json.dump(q, fopen, indent=4) - ttbhs_scores[f'n_shot={i}'] = s - scores['tatabahasa'] = ttbhs_scores + ttbhs_scores[f"n_shot={i}"] = s + scores["tatabahasa"] = ttbhs_scores # === bmpt3 === for i in num_fewshots: - q, s = run_test(args, - model=model, + q, s = run_test( + args, + model=model, tokenizer=tokenizer, - questions=qns_bmpt3[:args.limit] if args.limit else qns_bmpt3, + questions=qns_bmpt3[: args.limit] if args.limit else qns_bmpt3, n_shots=i, - n_repeat=args.repeat) - with open(f'{args.output_folder}/{timestamp}/output-bmpt3-{i}shot-{args.name}.json', 'w') as fopen: - json.dump(q, fopen, indent= 4) - bmpt3_scores[f'n_shot={i}'] = s + n_repeat=args.repeat, + ) + with open( + f"{args.output_folder}/{timestamp}/output-bmpt3-{i}shot-{args.name}.json", + "w", + ) as fopen: + json.dump(q, fopen, indent=4) + bmpt3_scores[f"n_shot={i}"] = s - scores['bmpt3'] = bmpt3_scores - with open(f'{args.output_folder}/{timestamp}/score.json', 'w') as fopen: + scores["bmpt3"] = bmpt3_scores + with open(f"{args.output_folder}/{timestamp}/score.json", "w") as fopen: data = {"results": scores} conf = {"config": config} merged = {**data, **conf} json.dump(merged, fopen, indent=4) - - else: #3 shot for 5 qns - for debugging - #TODO: Can we remove the for loop below? - #tatabahasa + + else: # 3 shot for 5 qns - for debugging + # TODO: Can we remove the for loop below? 
+ # tatabahasa for i in [3]: - q, s = run_test(args, - model=model, + q, s = run_test( + args, + model=model, tokenizer=tokenizer, questions=qns_ttbhs[:5], - n_shots=i) - with open(f'{args.output_folder}/{timestamp}/output-tatabahasa-{i}shot-{args.name}.json', 'w') as fopen: + n_shots=i, + ) + with open( + f"{args.output_folder}/{timestamp}/output-tatabahasa-{i}shot-{args.name}.json", + "w", + ) as fopen: json.dump(q, fopen) - ttbhs_scores[f'n_shot={i}'] = s - #bmpt3 + ttbhs_scores[f"n_shot={i}"] = s + # bmpt3 for i in [3]: - q, s = run_test(args, - model=model, + q, s = run_test( + args, + model=model, tokenizer=tokenizer, questions=qns_bmpt3[:5], - n_shots=i) - with open(f'{args.output_folder}/{timestamp}/output-bmpt3-{i}shot-{args.name}.json', 'w') as fopen: + n_shots=i, + ) + with open( + f"{args.output_folder}/{timestamp}/output-bmpt3-{i}shot-{args.name}.json", + "w", + ) as fopen: json.dump(q, fopen) - bmpt3_scores[f'n_shot={i}'] = s - - scores['tatabahasa'] = ttbhs_scores - scores['bmpt3'] = bmpt3_scores - with open(f'{args.output_folder}/{timestamp}/score.json', 'w') as fopen: + bmpt3_scores[f"n_shot={i}"] = s + + scores["tatabahasa"] = ttbhs_scores + scores["bmpt3"] = bmpt3_scores + with open(f"{args.output_folder}/{timestamp}/score.json", "w") as fopen: data = {"results": scores} conf = {"config": config} merged = {**data, **conf} json.dump(merged, fopen, indent=4) try: import pandas as pd + print(pd.DataFrame(scores).to_markdown()) except ImportError: print(scores) + if __name__ == "__main__": main() From 4161cf8819739d7f6184d5657e30efdfb481cbf2 Mon Sep 17 00:00:00 2001 From: wheynelau <waynelau15045@gmail.com> Date: Fri, 14 Jun 2024 05:20:09 +0000 Subject: [PATCH 11/12] feat: add deterministic flag --- evaluate.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/evaluate.py b/evaluate.py index 8e8cae4..74dd7b8 100644 --- a/evaluate.py +++ b/evaluate.py @@ -155,10 +155,11 @@ def run_test( if n_shots: arange = set(range(len(questions))) - if args.deterministic: - shots = sorted(arange - {i})[:n_shots] - else: - shots = random.sample(sorted(arange - {i}), n_shots) + shots = ( + sorted(arange - {i})[:n_shots] # if deterministic, then use the first n_shots + if args.deterministic else + random.sample(sorted(arange - {i}), n_shots) # else sample n_shots + ) for no, s in enumerate(shots): prompts.append( f"Contoh soalan {no + 1}\n" From 99ba3def1de8e5f5b3210606444773338e027873 Mon Sep 17 00:00:00 2001 From: wheynelau <waynelau15045@gmail.com> Date: Wed, 19 Jun 2024 06:43:07 +0000 Subject: [PATCH 12/12] check for file sep --- evaluate.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/evaluate.py b/evaluate.py index 74dd7b8..5dbc906 100644 --- a/evaluate.py +++ b/evaluate.py @@ -39,7 +39,7 @@ def parse_args(): parser.add_argument("--tqdm", help="whether to run tqdm", action="store_true") parser.add_argument( "--num_fewshots", - help="Number of few shots as a string etc '0,1,2'", + help="Number of few shots as a string etc '0,1,3'", default="0,1,3", ) parser.add_argument("--output_folder", type=str, default="output") @@ -47,9 +47,12 @@ def parse_args(): "--repeat", type=int, default=5, help="Number of loops per prompt" ) parser.add_argument("--limit", type=int, default=0, help="run N number of samples") - parser.add_argument("--device", type=str, help="device map") + parser.add_argument("--device", type=str, help="device map", default='cuda') parser.add_argument("--deterministic", help="disable sampling", 
action="store_true") args = parser.parse_args() + if "/" in args.name: + logging.warning("name should not contain /") + args.name = args.name.replace("/", "_") return args @@ -187,7 +190,7 @@ def run_test( ) except Exception as e: - print(e, r) + print(e) pass questions[i]["input_tok"] = inputs.input_ids.tolist()[0] questions[i]["prompt"] = prompt @@ -214,7 +217,7 @@ def main(): datetime.datetime.now().replace(microsecond=0).isoformat().replace(":", "_") ) - os.makedirs(args.output_folder + "/" + timestamp, exist_ok=True) + os.makedirs(os.path.join(args.output_folder,timestamp), exist_ok=True) if not args.run_full: logger.warning( @@ -336,12 +339,12 @@ def main(): conf = {"config": config} merged = {**data, **conf} json.dump(merged, fopen, indent=4) - try: - import pandas as pd + # try: + # import pandas as pd - print(pd.DataFrame(scores).to_markdown()) - except ImportError: - print(scores) + # print(pd.DataFrame(scores).to_markdown()) + # except ImportError: + print(scores) if __name__ == "__main__":