Modify .gitignore and fix the bug when running humaneval #23

Open · wants to merge 8 commits into base: main
4 changes: 4 additions & 0 deletions .gitignore
@@ -1,2 +1,6 @@
/.idea/
*.jsonl
*.csv

__pycache__/
data/
6 changes: 6 additions & 0 deletions .vscode/settings.json
@@ -0,0 +1,6 @@
{
"[python]": {
"editor.defaultFormatter": "ms-python.black-formatter"
},
"python.formatting.provider": "none"
}
15 changes: 10 additions & 5 deletions bbh.py
@@ -51,6 +51,7 @@ def evaluate(model: EvalModel, data: BBHData, ntrain: int) -> dict:
data_test = BBHData(samples=data.samples[ntrain:])
is_correct = []

prompts = []
for i in range(len(data_test.samples)):
# get prompt and make sure it fits
k = int(ntrain)
@@ -63,11 +64,15 @@ def evaluate(model: EvalModel, data: BBHData, ntrain: int) -> dict:
train_prompt = gen_prompt(data_train, k)
prompt = train_prompt + prompt_end

label = data_test.samples[i].target
pred = model.run(prompt)
is_correct.append(pred.strip().startswith(label))
if i == 0:
print(dict(prompt=prompt, label=label, pred=pred))
prompts.append(prompt)

preds = model.run(prompts)
labels = [s.target for s in data_test.samples]

print(dict(prompt=prompts[0], label=labels[0], pred=preds[0]))
is_correct.extend(
[pred.strip().startswith(label) for pred, label in zip(preds, labels)]
)

return dict(score=sum(is_correct) / len(is_correct))

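Note: the rewrite above, like the matching changes to crass.py, drop.py, and mmlu.py below, assumes that EvalModel.run can take a list of prompts and return one completion per prompt in the same order. A minimal sketch of such a batched interface, in case it helps review; the class internals here are hypothetical and only the call pattern comes from the diff:

from typing import List, Union

class EvalModel:
    # Hypothetical skeleton: only the batched run(...) call pattern is implied by this PR.
    def run(self, prompts: Union[str, List[str]], **gen_kwargs) -> Union[str, List[str]]:
        # Accept either a single prompt or a batch, mirroring how the evaluation
        # scripts now build up a list of prompts before calling the model once.
        if isinstance(prompts, str):
            return self._generate([prompts], **gen_kwargs)[0]
        return self._generate(prompts, **gen_kwargs)

    def _generate(self, prompts: List[str], **gen_kwargs) -> List[str]:
        # Placeholder: a real implementation would tokenize the whole batch and
        # decode it in one pass, preserving the input order.
        return ["" for _ in prompts]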
14 changes: 10 additions & 4 deletions crass.py
@@ -141,6 +141,8 @@ def evaluate(model: EvalModel, data_train: CrassData, data_test: CrassData) -> dict:

progress = tqdm(data_test.samples)
sample: CrassSample

prompts, labels = [], []
for sample in progress:
# get prompt and make sure it fits
k = int(len(data_train.samples))
@@ -154,12 +156,16 @@ def evaluate(model: EvalModel, data_train: CrassData, data_test: CrassData) -> dict:
prompt = train_prompt + prompt_end

label = sample.get_answer_label()
pred = model.run(prompt).strip()
prompts.append(prompt)
labels.append(label)

preds = model.run(prompts)
preds = [i.strip() for i in preds]
for label, pred in zip(labels, preds):
is_correct.append(pred.startswith(label))
score = sum(is_correct) / len(is_correct)
progress.set_postfix(score=score)
print(dict(prompt=prompt, label=label, pred=pred))

score = sum(is_correct) / len(is_correct)
print(dict(prompt=prompts[0], label=labels[0], pred=preds[0]))
return dict(score=score)


16 changes: 12 additions & 4 deletions drop.py
@@ -103,6 +103,9 @@ def evaluate(model: EvalModel, data: DropData, ntrain: int) -> dict:

progress = tqdm(data_test.samples)
sample: DropSample

prompts = []
labels = []
for sample in progress:
# get prompt and make sure it fits
k = int(ntrain)
@@ -116,11 +119,16 @@ def evaluate(model: EvalModel, data: DropData, ntrain: int) -> dict:
prompt = train_prompt + prompt_end

label = sample.get_answers()[0]
pred = model.run(prompt).strip()
prompts.append(prompt)
labels.append(label)

preds = model.run(prompts)
preds = [i.strip() for i in preds]
for label, pred in zip(labels, preds):
is_correct.append(pred.startswith(label))
score = sum(is_correct) / len(is_correct)
progress.set_postfix(score=score)
print(dict(prompt=prompt, label=label, pred=pred))

score = sum(is_correct) / len(is_correct)
print(dict(prompt=prompts[0], label=labels[0], pred=preds[0]))

return dict(score=score)

38 changes: 24 additions & 14 deletions human_eval/main.py
@@ -78,29 +78,39 @@ def evaluate(model: EvalModel, data_path: str, **kwargs) -> dict:
dataset = read_problems(data_path)
n_sample = kwargs.get("n_sample", 1)
best_temperature = {1: 0.1, 10: 0.6, 100: 0.8}
temperature = best_temperature[n_sample]

samples = []
progress_bar = tqdm(total=len(dataset) * n_sample, desc="Generating samples")

prompts = []
task_ids = []
for task_id in dataset:
for i in range(n_sample):
prompt = dataset[task_id]["prompt"]
prompt = gen_prompt(prompt, model)
temperature = best_temperature[n_sample]
if temperature > 0:
completion = model.run(prompt, temperature=temperature, do_sample=True)
else:
completion = model.run(prompt)

completion = fix_indents(completion)
sample = dict(task_id=task_id, completion=filter_code(completion, model))
if i == 0:
print("Prompt: ", "-" * 100)
print(prompt)
print("Completion: ", "-" * 100)
print(filter_code(completion, model))
samples.append(sample)
prompts.append(prompt)
task_ids.append(task_id)
progress_bar.update(1)
progress_bar.close()

if temperature > 0:
completions = model.run(prompts, temperature=temperature, do_sample=True)
else:
completions = model.run(prompts)

for i, (prompt, completion, task_id) in enumerate(
zip(prompts, completions, task_ids)
):
completion = fix_indents(completion)
sample = dict(task_id=task_id, completion=filter_code(completion, model))
if i == 0:
print("Prompt: ", "-" * 100)
print(prompt)
print("Completion: ", "-" * 100)
print(filter_code(completion, model))
samples.append(sample)

model_name = model.model_path.replace("/", "_")
pred_filename = f"humaneval_{model_name}_predictions.jsonl"
write_jsonl(pred_filename, samples)
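The humaneval fix follows the same batching pattern: prompts and task ids are collected first, the sampling temperature is looked up once from best_temperature (keyed by n_sample, i.e. pass@1/10/100), and a single model.run call produces the completions that zip(prompts, completions, task_ids) pairs back up, which relies on the batched call preserving order. A small sketch of that generation step, assuming the batched run interface sketched above:

from typing import List

BEST_TEMPERATURE = {1: 0.1, 10: 0.6, 100: 0.8}  # values copied from the diff

def generate_batch(model, prompts: List[str], n_sample: int) -> List[str]:
    # Sketch only: model.run accepting a list of prompts is an assumption
    # carried over from the other evaluation scripts in this PR.
    temperature = BEST_TEMPERATURE[n_sample]
    if temperature > 0:
        return model.run(prompts, temperature=temperature, do_sample=True)
    return model.run(prompts)  # greedy fallback, mirroring the diff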
14 changes: 14 additions & 0 deletions main.py
@@ -1,5 +1,8 @@
from fire import Fire

from pathlib import Path
import json

import bbh
import crass
import drop
@@ -9,6 +12,11 @@


def main(task_name: str, **kwargs):
rslt_path = kwargs.pop('rslt_path', None)
if rslt_path is not None and Path(rslt_path).exists():
print(f"Already have file in {rslt_path}. Exist.")
exit(0)

task_map = dict(
mmlu=mmlu.main,
bbh=bbh.main,
@@ -46,6 +54,12 @@ def main(task_name: str, **kwargs):

results = {name: round(score * 100, 2) for name, score in results.items()}
print(results)

if rslt_path is not None:
Path(rslt_path).parent.mkdir(exist_ok=True, parents=True)
with open(rslt_path, 'w') as f:
json.dump(results, f)

return results


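Because main is exposed through Fire, the new rslt_path keyword needs no extra argument parsing: a --rslt_path flag on the command line simply lands in **kwargs. A standalone sketch of that plumbing with the evaluation itself stubbed out; the example path and placeholder score are illustrative, not taken from the repository:

import json
from pathlib import Path

from fire import Fire

def main(task_name: str, **kwargs):
    # Fire turns `python this_script.py humaneval --rslt_path outputs/humaneval.json`
    # into main("humaneval", rslt_path="outputs/humaneval.json").
    rslt_path = kwargs.pop("rslt_path", None)
    if rslt_path is not None and Path(rslt_path).exists():
        print(f"Already have file in {rslt_path}. Exiting.")
        return

    results = {task_name: 0.0}  # placeholder for the real evaluation scores

    if rslt_path is not None:
        Path(rslt_path).parent.mkdir(exist_ok=True, parents=True)
        with open(rslt_path, "w") as f:
            json.dump(results, f)
    return results

if __name__ == "__main__":
    Fire(main)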
14 changes: 9 additions & 5 deletions mmlu.py
@@ -137,6 +137,8 @@ def evaluate(args, subject, model: EvalModel, dev_df, test_df):
cors = []
all_probs = []

labels = []
prompts = []
for i in range(test_df.shape[0]):
# get prompt and make sure it fits
k = args.ntrain
@@ -150,11 +152,13 @@ def evaluate(args, subject, model: EvalModel, dev_df, test_df):
prompt = train_prompt + prompt_end

label = test_df.iloc[i, test_df.shape[1] - 1]
pred = model.run(prompt)
probs = [0 for _ in get_choices()]
cor = pred.strip().startswith(label)
cors.append(cor)
all_probs.append(probs)
prompts.append(prompt)
labels.append(label)

preds = model.run(prompts)
probs = [0 for _ in get_choices()]
cors = [pred.strip().startswith(label) for pred, label in zip(preds, labels)]
all_probs.extend([probs for _ in preds])

acc = np.mean(cors)
cors = np.array(cors)