Commit
* upload results
* add a readme
* chore: Update upload_results.py script to use shell syntax
* Update upload_results.py
* Update upload_results.py
Showing 2 changed files with 216 additions and 0 deletions.
@@ -0,0 +1,15 @@
## Usage

### Upload Results

```sh
python upload_results.py -f <log_folder> -m <model_name> [-F]
```

Passing `-F` uploads the results automatically, without asking for confirmation. Without it, the script prints the results and asks for confirmation before uploading.

Example:

```sh
python upload_results.py -f logs/0706_0959_model_outputs_gpt4v_model_args_c974bc -m gpt-4o -F
```
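For comparison, the same run without `-F` (reusing the log folder from the example above) first previews the detailed results and the per-subtask scores, then waits for a `y`/`n` confirmation before anything is pushed, as the `__main__` block of `upload_results.py` below shows:

```sh
python upload_results.py -f logs/0706_0959_model_outputs_gpt4v_model_args_c974bc -m gpt-4o
```

The script also accepts `-l <log_folder>` (default `logs`) to choose where the per-subtask CSV of scores is written.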
@@ -0,0 +1,201 @@
import os
import json
import argparse
import datasets
import numpy as np
import pandas as pd
from datasets import Dataset, load_dataset

from PIL import Image
from typing import Dict, List, Union
from tqdm import tqdm


EPS = 1e-6

RESULT_FEATURES = {
    "id": datasets.Value("int32"),
    "images": datasets.Sequence(datasets.Image()),
    "question": datasets.Value("string"),
    "ground_truth": datasets.Value("string"),
    "criteria": datasets.Value("string"),
    "subtask": datasets.Value("string"),
    "response": datasets.Value("string"),
    "score": datasets.Value("int32"),
    "reason": datasets.Value("string"),
}

SUBTASKS = [
    "Basic Understanding",
    "Contextual Analysis",
    "Deeper Implications",
    "Broader Implications",
    "Further Insights",
]


def load_images(config) -> Dict[int, List[Image.Image]]:
    dataset = datasets.load_dataset(
        config["dataset_path"], config["dataset_name"], split=config["test_split"]
    )
    images = {}
    for data in tqdm(dataset, desc="Loading images"):
        images[data["id"]] = data["images"]
    return images


def get_hf_results(results, detailed_results):
    # Re-attach the benchmark images (keyed by sample id) to each per-sample log entry.
    live_bench_images = load_images(results["configs"]["live_bench"])
    mapping = {k: [] for k in RESULT_FEATURES.keys()}
    for result in tqdm(detailed_results["logs"], desc="Loading results"):
        doc = result["doc"]
        res = {}
        res["id"] = doc["id"]
        res["images"] = live_bench_images[doc["id"]]
        res["question"] = doc["question"]
        res["ground_truth"] = doc["answer"]
        res["criteria"] = doc["criteria"]
        res["subtask"] = doc["subtask"]
        res["response"] = result["filtered_resps"][0]
        res["score"] = result["gpt4_eval_score"]["rating"]
        res["reason"] = result["gpt4_eval_score"]["explanation"]
        for k, v in res.items():
            mapping[k].append(v)
    result_dataset = datasets.Dataset.from_dict(
        mapping=mapping, features=datasets.Features(RESULT_FEATURES)
    )
    return result_dataset


def preview_results(results, heading: str):
    HEADING = "=" * 15 + " " + heading + " " + "=" * 15
    ENDING = "=" * len(HEADING)
    print(HEADING)
    print(results)
    print(ENDING)


def calculate_score(results: Dataset):
    results = results.to_pandas()

    sum_score, count = 0, 0
    score = {}
    for subtask in SUBTASKS:
        score[subtask] = []
    for index, result in tqdm(
        results.iterrows(), total=len(results), desc="Calculating score"
    ):
        # Samples with a score of -1 are excluded from the aggregate.
        if result["score"] == -1:
            continue
        sum_score += result["score"] / 10
        count += 1
        subtask = result["subtask"]
        # Fold unknown subtask labels into "Further Insights".
        if subtask not in SUBTASKS:
            subtask = "Further Insights"
        score[subtask].append(result["score"] / 10)
    res = [
        (subtask, len(score[subtask]), np.mean(score[subtask]) * 100)
        for subtask in SUBTASKS
    ]
    res.append(("Total", count, sum_score / count * 100))
    res = pd.DataFrame(res, columns=["Subtask", "Count", "Score"])

    return res


def get_results(folder):
    # Expected layout: <folder>/live_bench.json (per-sample logs) and <folder>/results.json (aggregates).
    detailed_file = os.path.join(folder, "live_bench.json")
    results_file = os.path.join(folder, "results.json")

    with open(results_file, "r") as f:
        results = json.load(f)

    assert (
        "live_bench" in results["configs"]
    ), "No live_bench config found in results.json"
    final_score = results["results"]["live_bench"]["gpt4_eval_score,none"]
    model_configs = results["model_configs"]
    version = results["configs"]["live_bench"]["metadata"]["version"]

    assert (
        model_configs["limit"] is None
    ), "Model limit is not None, please check if the model is tested on the full dataset"

    with open(detailed_file, "r") as f:
        detailed_results = json.load(f)

    hf_results = get_hf_results(results, detailed_results)
    preview_results(hf_results.to_pandas().iloc[0], "Detailed Results")
    score = calculate_score(hf_results)
    preview_results(score, "Final Score")

    # Sanity check: the recomputed total must match the score reported in results.json.
    assert (
        abs(score[score["Subtask"] == "Total"]["Score"] - final_score) <= EPS
    ).all(), "Final score does not match the calculated score"

    return hf_results, score, version


def upload_results(
    hf_results: Dataset,
    score: pd.DataFrame,
    model_name,
    dataset_version,
    log_folder="logs",
):
    # Per-sample results go to the detailed-results dataset, one split per model.
    hf_results.push_to_hub(
        "lmms-lab/LiveBenchDetailedResults",
        config_name=dataset_version,
        split=model_name.replace("-", "_"),
    )
    if not os.path.exists(log_folder):
        os.makedirs(log_folder)
    score_path = os.path.abspath(
        os.path.join(log_folder, f"{dataset_version}_{model_name}.csv")
    )
    score.to_csv(score_path, index=False)
    print(f"Results saved to {score_path}")
    score_dict = {item["Subtask"]: item["Score"] for index, item in score.iterrows()}
    score_dict["Model Name"] = model_name
    try:
        hf_score = datasets.load_dataset(
            "lmms-lab/LiveBenchResults", dataset_version, split="test"
        )
    except Exception:
        # If the leaderboard cannot be loaded (e.g. this version has no results yet), start empty.
        hf_score = Dataset.from_dict(
            {subtask: [] for subtask in ["Model Name", "Total"] + SUBTASKS}
        )
    hf_score = hf_score.add_item(score_dict)
    df_score = pd.DataFrame(hf_score)
    # Keep only the most recent entry per model.
    df_score = df_score.drop_duplicates(subset=["Model Name"], keep="last")
    df_score = df_score[["Model Name", "Total"] + SUBTASKS]
    hf_score = Dataset.from_pandas(df_score)
    hf_score.push_to_hub("lmms-lab/LiveBenchResults", dataset_version, split="test")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--folder", "-f", type=str, required=True, help="Results folder"
    )
    parser.add_argument("--name", "-m", type=str, required=True, help="Model name")
    parser.add_argument(
        "--log_folder", "-l", type=str, default="logs", help="Log folder"
    )
    parser.add_argument("--force", "-F", action="store_true", help="Force upload")
    args = parser.parse_args()
    hf_results, score, version = get_results(args.folder)
    print(
        f"Results will be uploaded with model name {args.name} and model version {version}"
    )
    # Without -F, ask for explicit confirmation before pushing anything to the Hub.
    if args.force is False:
        print("Are you sure you want to upload the results? (y/n)", end=" ")
        while True:
            choice = input().lower()
            if choice == "y":
                break
            elif choice == "n":
                exit()
            else:
                print("Invalid choice, please enter 'y' or 'n'")
    upload_results(hf_results, score, args.name, version, args.log_folder)
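For orientation, `get_results` above reads exactly two files from the folder passed via `-f`: `results.json` (the aggregate `gpt4_eval_score,none`, the `live_bench` config carrying the dataset version, and `model_configs`) and `live_bench.json` (the per-sample `logs` entries with `doc`, `filtered_resps`, and `gpt4_eval_score`). Assuming the folder name from the README example, a quick check of the expected contents could look like:

```sh
ls logs/0706_0959_model_outputs_gpt4v_model_args_c974bc
# expected to contain at least:
# live_bench.json  results.json
```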