Skip to content

Commit

Permalink
update time
Browse files Browse the repository at this point in the history
  • Loading branch information
pufanyi committed Aug 26, 2024
1 parent f0b1ee2 commit ed928e9
Show file tree
Hide file tree
Showing 8 changed files with 40 additions and 40 deletions.
6 changes: 3 additions & 3 deletions tools/live_bench/create_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,6 @@
dataset = LiveBench()
dataset.capture(websites=website, driver_kwargs={"headless": True}, screen_shoter="single_screen", shoter_kwargs={"screen_size": (1024, 1024)}, qa_generator="gpt4v", scorer="claude", checker="gemini")

website = load_websites_from_file("/data/pufanyi/project/lmms-eval/temp/images")
dataset.capture(websites=website, screen_shoter="human", qa_generator="gpt4v", scorer="claude", checker="gemini", driver_kwargs={}, shoter_kwargs={}, generator_kwargs={})
dataset.upload()
# website = load_websites_from_file("/data/pufanyi/project/lmms-eval/temp/images")
# dataset.capture(websites=website, screen_shoter="human", qa_generator="gpt4v", scorer="claude", checker="gemini", driver_kwargs={}, shoter_kwargs={}, generator_kwargs={})
# dataset.upload()
4 changes: 2 additions & 2 deletions tools/live_bench/data_summary.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
}
],
"source": [
"data = load_dataset(\"lmms-lab/LiveBench\", \"2024-07\")"
"data = load_dataset(\"lmms-lab/LiveBench\", \"2024-08\")"
]
},
{
Expand Down Expand Up @@ -298,7 +298,7 @@
}
],
"source": [
"data.push_to_hub(\"lmms-lab/LiveBench\", \"2024-07\", split=\"test\")"
"data.push_to_hub(\"lmms-lab/LiveBench\", \"2024-08\", split=\"test\")"
]
},
{
Expand Down
6 changes: 3 additions & 3 deletions tools/live_bench/example.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -293,7 +293,7 @@
"text": [
"Map: 100%|██████████| 9/9 [00:00<00:00, 243.32 examples/s]?, ?it/s]\n",
"Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 194.00ba/s]\n",
"Uploading the dataset shards: 100%|██████████| 1/1 [00:02<00:00, 2.07s/it]\n"
"Uploading the dataset shards: 100%|██████████| 1/1 [00:02<00:00, 2.08s/it]\n"
]
}
],
Expand Down Expand Up @@ -354,7 +354,7 @@
"source": [
"import datasets\n",
"\n",
"data = datasets.load_dataset(\"lmms-lab/LiveBench\", \"2024-07\")"
"data = datasets.load_dataset(\"lmms-lab/LiveBench\", \"2024-08\")"
]
},
{
Expand Down Expand Up @@ -446,7 +446,7 @@
}
],
"source": [
"data.push_to_hub(\"lmms-lab/LiveBench\", \"2024-07\")"
"data.push_to_hub(\"lmms-lab/LiveBench\", \"2024-08\")"
]
},
{
Expand Down
2 changes: 1 addition & 1 deletion tools/live_bench/filter.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
"metadata": {},
"outputs": [],
"source": [
"data = load_dataset(\"lmms-lab/LiveBench\", \"2024-07\")"
"data = load_dataset(\"lmms-lab/LiveBench\", \"2024-08\")"
]
},
{
Expand Down
42 changes: 21 additions & 21 deletions tools/live_bench/live_bench/view.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
"source": [
"from datasets import load_dataset\n",
"\n",
"dataset = load_dataset(\"lmms-lab/LiveBench\", \"2024-07\")"
"dataset = load_dataset(\"lmms-lab/LiveBench\", \"2024-08\")"
]
},
{
Expand Down Expand Up @@ -103,7 +103,7 @@
" <td>Basic Understanding</td>\n",
" <td>claude</td>\n",
" <td>gemini</td>\n",
" <td>2024-07-20 14:02:22</td>\n",
" <td>2024-08-20 14:02:22</td>\n",
" <td>single_screen</td>\n",
" <td>(1024, 1024)</td>\n",
" <td>10</td>\n",
Expand All @@ -121,7 +121,7 @@
" <td>Deeper Implications</td>\n",
" <td>claude</td>\n",
" <td>gemini</td>\n",
" <td>2024-07-20 14:02:22</td>\n",
" <td>2024-08-20 14:02:22</td>\n",
" <td>single_screen</td>\n",
" <td>(1024, 1024)</td>\n",
" <td>10</td>\n",
Expand All @@ -139,7 +139,7 @@
" <td>Contextual Analysis</td>\n",
" <td>claude</td>\n",
" <td>gemini</td>\n",
" <td>2024-07-20 14:02:22</td>\n",
" <td>2024-08-20 14:02:22</td>\n",
" <td>single_screen</td>\n",
" <td>(1024, 1024)</td>\n",
" <td>10</td>\n",
Expand All @@ -157,7 +157,7 @@
" <td>Deeper Implications</td>\n",
" <td>claude</td>\n",
" <td>None</td>\n",
" <td>2024-07-20 14:02:22</td>\n",
" <td>2024-08-20 14:02:22</td>\n",
" <td>single_screen</td>\n",
" <td>(1024, 1024)</td>\n",
" <td>10</td>\n",
Expand All @@ -175,7 +175,7 @@
" <td>Contextual Analysis</td>\n",
" <td>claude</td>\n",
" <td>gemini</td>\n",
" <td>2024-07-20 14:02:22</td>\n",
" <td>2024-08-20 14:02:22</td>\n",
" <td>single_screen</td>\n",
" <td>(1024, 1024)</td>\n",
" <td>8</td>\n",
Expand Down Expand Up @@ -211,7 +211,7 @@
" <td>Contextual Analysis</td>\n",
" <td>gpt4v</td>\n",
" <td>gemini</td>\n",
" <td>2024-07-21 20:23:39</td>\n",
" <td>2024-08-21 20:23:39</td>\n",
" <td>human</td>\n",
" <td>None</td>\n",
" <td>6</td>\n",
Expand All @@ -229,7 +229,7 @@
" <td>Contextual Analysis</td>\n",
" <td>gpt4v</td>\n",
" <td>gemini</td>\n",
" <td>2024-07-21 20:27:57</td>\n",
" <td>2024-08-21 20:27:57</td>\n",
" <td>human</td>\n",
" <td>None</td>\n",
" <td>6</td>\n",
Expand All @@ -247,7 +247,7 @@
" <td>Contextual Analysis</td>\n",
" <td>gpt4v</td>\n",
" <td>gemini</td>\n",
" <td>2024-07-21 20:27:57</td>\n",
" <td>2024-08-21 20:27:57</td>\n",
" <td>human</td>\n",
" <td>None</td>\n",
" <td>7</td>\n",
Expand All @@ -265,7 +265,7 @@
" <td>Contextual Analysis</td>\n",
" <td>gpt4v</td>\n",
" <td>gemini</td>\n",
" <td>2024-07-21 20:27:57</td>\n",
" <td>2024-08-21 20:27:57</td>\n",
" <td>human</td>\n",
" <td>None</td>\n",
" <td>6</td>\n",
Expand All @@ -283,7 +283,7 @@
" <td>Basic Understanding</td>\n",
" <td>gpt4v</td>\n",
" <td>gemini</td>\n",
" <td>2024-07-21 20:27:57</td>\n",
" <td>2024-08-21 20:27:57</td>\n",
" <td>human</td>\n",
" <td>None</td>\n",
" <td>5</td>\n",
Expand Down Expand Up @@ -362,17 +362,17 @@
"319 Scoring Criteria (Total 10 points):\\n\\n- Ident... Basic Understanding \n",
"\n",
" data_generator checker date_time screen_shoter screen_size \\\n",
"0 claude gemini 2024-07-20 14:02:22 single_screen (1024, 1024) \n",
"1 claude gemini 2024-07-20 14:02:22 single_screen (1024, 1024) \n",
"2 claude gemini 2024-07-20 14:02:22 single_screen (1024, 1024) \n",
"3 claude None 2024-07-20 14:02:22 single_screen (1024, 1024) \n",
"4 claude gemini 2024-07-20 14:02:22 single_screen (1024, 1024) \n",
"0 claude gemini 2024-08-20 14:02:22 single_screen (1024, 1024) \n",
"1 claude gemini 2024-08-20 14:02:22 single_screen (1024, 1024) \n",
"2 claude gemini 2024-08-20 14:02:22 single_screen (1024, 1024) \n",
"3 claude None 2024-08-20 14:02:22 single_screen (1024, 1024) \n",
"4 claude gemini 2024-08-20 14:02:22 single_screen (1024, 1024) \n",
".. ... ... ... ... ... \n",
"315 gpt4v gemini 2024-07-21 20:23:39 human None \n",
"316 gpt4v gemini 2024-07-21 20:27:57 human None \n",
"317 gpt4v gemini 2024-07-21 20:27:57 human None \n",
"318 gpt4v gemini 2024-07-21 20:27:57 human None \n",
"319 gpt4v gemini 2024-07-21 20:27:57 human None \n",
"315 gpt4v gemini 2024-08-21 20:23:39 human None \n",
"316 gpt4v gemini 2024-08-21 20:27:57 human None \n",
"317 gpt4v gemini 2024-08-21 20:27:57 human None \n",
"318 gpt4v gemini 2024-08-21 20:27:57 human None \n",
"319 gpt4v gemini 2024-08-21 20:27:57 human None \n",
"\n",
" score reason scorer_name \n",
"0 10 The answer is correct and can be directly veri... claude \n",
Expand Down
6 changes: 3 additions & 3 deletions tools/live_bench/refine_all_results.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@


if __name__ == "__main__":
hf_data = load_dataset("lmms-lab/LiveBench", "2024-07", split="test")
hf_data = load_dataset("lmms-lab/LiveBench", "2024-08", split="test")
finalizer = QuestionFinalizer()

def load_results():
Expand Down Expand Up @@ -32,5 +32,5 @@ def load_results():
final_data[item].append(value)
# final_data = Dataset.from_generator(load_results)
final_data = Dataset.from_dict(final_data, features=hf_data.features)
final_data.save_to_disk("logs/2024-07-final")
final_data.push_to_hub("lmms-lab/LiveBench", "2024-07")
final_data.save_to_disk("logs/2024-08-final")
final_data.push_to_hub("lmms-lab/LiveBench", "2024-08")
2 changes: 1 addition & 1 deletion tools/live_bench/script/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,5 +11,5 @@ python upload_results.py -f <log_folder> -m <model_name> [-F]
Example:

```sh
python upload_results.py -f logs/0706_0959_model_outputs_gpt4v_model_args_c974bc -m gpt-4o -F
python upload_results.py -f logs/0806_0959_model_outputs_gpt4v_model_args_c974bc -m gpt-4o -F
```
12 changes: 6 additions & 6 deletions tools/live_bench/script/modify.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
"source": [
"import datasets\n",
"\n",
"data = datasets.load_dataset(\"lmms-lab/LiveBenchResults\", \"2024-07\")"
"data = datasets.load_dataset(\"lmms-lab/LiveBenchResults\", \"2024-08\")"
]
},
{
Expand All @@ -35,7 +35,7 @@
"metadata": {},
"outputs": [],
"source": [
"df.to_csv(\"2024-07.csv\", index=False)"
"df.to_csv(\"2024-08.csv\", index=False)"
]
},
{
Expand All @@ -46,7 +46,7 @@
"source": [
"import pandas as pd\n",
"\n",
"df = pd.read_csv(\"2024-07.csv\")"
"df = pd.read_csv(\"2024-08.csv\")"
]
},
{
Expand Down Expand Up @@ -234,7 +234,7 @@
}
],
"source": [
"data.push_to_hub(\"lmms-lab/LiveBenchResults\", \"2024-07\", split=\"test\")"
"data.push_to_hub(\"lmms-lab/LiveBenchResults\", \"2024-08\", split=\"test\")"
]
},
{
Expand All @@ -243,7 +243,7 @@
"metadata": {},
"outputs": [],
"source": [
"data = datasets.load_dataset(\"lmms-lab/LiveBenchDetailedResults\", \"2024-07\")"
"data = datasets.load_dataset(\"lmms-lab/LiveBenchDetailedResults\", \"2024-08\")"
]
},
{
Expand Down Expand Up @@ -426,7 +426,7 @@
}
],
"source": [
"data.push_to_hub(\"lmms-lab/LiveBenchDetailedResults\", \"2024-07\")"
"data.push_to_hub(\"lmms-lab/LiveBenchDetailedResults\", \"2024-08\")"
]
},
{
Expand Down

0 comments on commit ed928e9

Please sign in to comment.