diff --git a/.gitignore b/.gitignore index f07543c..64d52a7 100644 --- a/.gitignore +++ b/.gitignore @@ -210,3 +210,5 @@ result_dirs/gsm/gemma-2-27b-it@vllm.json result_dirs/zebra-grid/gemma-2-27b-it@vllm.json result_dirs_parsed/ state_of_limit/task_summary_*.json +result_dirs/zebra-grid/bon_32_v2/gpt-4o-mini-2024-07-18.json +result_dirs/zebra-grid/bon_64/gpt-4o-mini-2024-07-18.json diff --git a/result_dirs/zebra-grid.summary.json b/result_dirs/zebra-grid.summary.json index 70e5cbf..b3b9828 100644 --- a/result_dirs/zebra-grid.summary.json +++ b/result_dirs/zebra-grid.summary.json @@ -1,706 +1,41 @@ [ { - "Model": "o1-preview-2024-09-12", - "Mode": "greedy", - "Puzzle Acc": "71.40", - "Cell Acc": "75.14", - "No answer": "0.30", - "Easy Puzzle Acc": "98.57", - "Hard Puzzle Acc": "60.83", - "Total Puzzles": 1000, - "Reason Lens": "1565.88" - }, - { - "Model": "o1-preview-2024-09-12-v2", - "Mode": "greedy", - "Puzzle Acc": "70.40", - "Cell Acc": "74.18", - "No answer": "0.40", - "Easy Puzzle Acc": "98.21", - "Hard Puzzle Acc": "59.58", - "Total Puzzles": 1000, - "Reason Lens": "1559.71" - }, - { - "Model": "o1-mini-2024-09-12-v3", - "Mode": "greedy", - "Puzzle Acc": "59.70", - "Cell Acc": "70.32", - "No answer": "1.00", - "Easy Puzzle Acc": "86.07", - "Hard Puzzle Acc": "49.44", - "Total Puzzles": 1000, - "Reason Lens": "1166.38" - }, - { - "Model": "o1-mini-2024-09-12-v2", - "Mode": "greedy", - "Puzzle Acc": "56.80", - "Cell Acc": "69.87", - "No answer": "1.30", - "Easy Puzzle Acc": "82.86", - "Hard Puzzle Acc": "46.67", - "Total Puzzles": 1000, - "Reason Lens": "1164.95" - }, - { - "Model": "o1-mini-2024-09-12", - "Mode": "greedy", - "Puzzle Acc": "52.60", - "Cell Acc": "52.29", - "No answer": "0.80", - "Easy Puzzle Acc": "87.14", - "Hard Puzzle Acc": "39.17", - "Total Puzzles": 1000, - "Reason Lens": "993.28" - }, - { - "Model": "claude-3-5-sonnet-20240620", - "Mode": "greedy", - "Puzzle Acc": "33.40", - "Cell Acc": "54.34", - "No answer": "0.00", - "Easy Puzzle Acc": "87.50", - "Hard Puzzle Acc": "12.36", - "Total Puzzles": 1000, - "Reason Lens": "1141.94" - }, - { - "Model": "claude-3-5-sonnet-20240620", - "Mode": "sampling", - "Puzzle Acc": "33.40", - "Cell Acc": "53.01", - "No answer": "0.10", - "Easy Puzzle Acc": "88.21", - "Hard Puzzle Acc": "12.08", - "Total Puzzles": 1000, - "Reason Lens": "1153.83" - }, - { - "Model": "Llama-3.1-405B-Inst-fp8@together", - "Mode": "greedy", - "Puzzle Acc": "32.60", - "Cell Acc": "45.80", - "No answer": "12.50", - "Easy Puzzle Acc": "87.14", - "Hard Puzzle Acc": "11.39", - "Total Puzzles": 1000, - "Reason Lens": "314.66" - }, - { - "Model": "Llama-3.1-405B-Inst-fp8@together", - "Mode": "sampling", - "Puzzle Acc": "32.60", - "Cell Acc": "47.04", - "No answer": "10.80", - "Easy Puzzle Acc": "86.07", - "Hard Puzzle Acc": "11.81", - "Total Puzzles": 1000, - "Reason Lens": "439.96" - }, - { - "Model": "gpt-4o-2024-08-06", - "Mode": "greedy", - "Puzzle Acc": "31.70", - "Cell Acc": "50.34", - "No answer": "3.60", - "Easy Puzzle Acc": "84.64", - "Hard Puzzle Acc": "11.11", - "Total Puzzles": 1000, - "Reason Lens": "1106.51" - }, - { - "Model": "gpt-4o-2024-05-13", - "Mode": "sampling", - "Puzzle Acc": "30.80", - "Cell Acc": "46.19", - "No answer": "6.60", - "Easy Puzzle Acc": "81.07", - "Hard Puzzle Acc": "11.25", - "Total Puzzles": 1000, - "Reason Lens": "1549.74" - }, - { - "Model": "gemini-1.5-pro-exp-0827", - "Mode": "greedy", - "Puzzle Acc": "30.50", - "Cell Acc": "50.84", - "No answer": "0.80", - "Easy Puzzle Acc": "79.64", - "Hard Puzzle Acc": 
"11.39", - "Total Puzzles": 1000, - "Reason Lens": "1594.47" - }, - { - "Model": "Llama-3.1-405B-Inst@sambanova", - "Mode": "greedy", - "Puzzle Acc": "30.10", - "Cell Acc": "39.06", - "No answer": "24.70", - "Easy Puzzle Acc": "84.64", - "Hard Puzzle Acc": "8.89", - "Total Puzzles": 1000, - "Reason Lens": "2001.12" - }, - { - "Model": "chatgpt-4o-latest-24-09-07", - "Mode": "greedy", - "Puzzle Acc": "29.90", - "Cell Acc": "48.83", - "No answer": "4.20", - "Easy Puzzle Acc": "81.43", - "Hard Puzzle Acc": "9.86", - "Total Puzzles": 1000, - "Reason Lens": "1539.99" - }, - { - "Model": "Mistral-Large-2", - "Mode": "greedy", - "Puzzle Acc": "29.00", - "Cell Acc": "47.64", - "No answer": "1.70", - "Easy Puzzle Acc": "80.36", - "Hard Puzzle Acc": "9.03", - "Total Puzzles": 1000, - "Reason Lens": "1592.39" - }, - { - "Model": "gpt-4-turbo-2024-04-09", - "Mode": "greedy", - "Puzzle Acc": "28.40", - "Cell Acc": "47.90", - "No answer": "0.10", - "Easy Puzzle Acc": "80.71", - "Hard Puzzle Acc": "8.06", - "Total Puzzles": 1000, - "Reason Lens": "1148.46" - }, - { - "Model": "gpt-4o-2024-05-13", - "Mode": "greedy", - "Puzzle Acc": "28.20", - "Cell Acc": "38.72", - "No answer": "19.30", - "Easy Puzzle Acc": "77.86", - "Hard Puzzle Acc": "8.89", - "Total Puzzles": 1000, - "Reason Lens": "1643.51" - }, - { - "Model": "gpt-4-0314", - "Mode": "greedy", - "Puzzle Acc": "27.10", - "Cell Acc": "47.43", - "No answer": "0.20", - "Easy Puzzle Acc": "77.14", - "Hard Puzzle Acc": "7.64", - "Total Puzzles": 1000, - "Reason Lens": "1203.17" - }, - { - "Model": "claude-3-opus-20240229", - "Mode": "greedy", - "Puzzle Acc": "27.00", - "Cell Acc": "48.91", - "No answer": "0.00", - "Easy Puzzle Acc": "78.21", - "Hard Puzzle Acc": "7.08", - "Total Puzzles": 1000, - "Reason Lens": "855.72" - }, - { - "Model": "Qwen2.5-72B-Instruct", - "Mode": "greedy", - "Puzzle Acc": "26.60", - "Cell Acc": "40.92", - "No answer": "11.90", - "Easy Puzzle Acc": "76.43", - "Hard Puzzle Acc": "7.22", - "Total Puzzles": 1000, - "Reason Lens": "1795.90" - }, - { - "Model": "gpt-4-turbo-2024-04-09", - "Mode": "sampling", - "Puzzle Acc": "26.40", - "Cell Acc": "47.93", - "No answer": "0.00", - "Easy Puzzle Acc": "74.29", - "Hard Puzzle Acc": "7.78", - "Total Puzzles": 1000, - "Reason Lens": "1165.90" - }, - { - "Model": "Qwen2.5-32B-Instruct", - "Mode": "greedy", - "Puzzle Acc": "26.10", - "Cell Acc": "43.39", - "No answer": "6.30", - "Easy Puzzle Acc": "77.50", - "Hard Puzzle Acc": "6.11", - "Total Puzzles": 1000, - "Reason Lens": "1333.07" - }, - { - "Model": "gemini-1.5-pro-exp-0801", - "Mode": "greedy", - "Puzzle Acc": "25.20", - "Cell Acc": "48.50", + "Model": "gpt-4o-mini-2024-07-18", + "Mode": "bon_64", + "Puzzle Acc": "47.90", + "Cell Acc": "73.42", "No answer": "0.00", - "Easy Puzzle Acc": "72.50", - "Hard Puzzle Acc": "6.81", + "Easy Puzzle Acc": "97.14", + "Hard Puzzle Acc": "28.75", "Total Puzzles": 1000, - "Reason Lens": "1389.75" - }, - { - "Model": "Llama-3.1-405B-Inst@hyperbolic", - "Mode": "greedy", - "Puzzle Acc": "25.00", - "Cell Acc": "46.62", - "No answer": "6.25", - "Easy Puzzle Acc": "66.67", - "Hard Puzzle Acc": "15.38", - "Total Puzzles": 16, - "Reason Lens": "1517.13" - }, - { - "Model": "gemini-1.5-flash-exp-0827", - "Mode": "greedy", - "Puzzle Acc": "25.00", - "Cell Acc": "43.56", - "No answer": "8.50", - "Easy Puzzle Acc": "70.71", - "Hard Puzzle Acc": "7.22", - "Total Puzzles": 1000, - "Reason Lens": "1705.11" - }, - { - "Model": "Meta-Llama-3.1-70B-Instruct", - "Mode": "greedy", - "Puzzle Acc": "24.90", - "Cell Acc": 
"27.98", - "No answer": "43.00", - "Easy Puzzle Acc": "73.57", - "Hard Puzzle Acc": "5.97", - "Total Puzzles": 1000, - "Reason Lens": "1483.68" - }, - { - "Model": "deepseek-v2-chat-0628", - "Mode": "greedy", - "Puzzle Acc": "22.70", - "Cell Acc": "42.46", - "No answer": "5.20", - "Easy Puzzle Acc": "68.57", - "Hard Puzzle Acc": "4.86", - "Total Puzzles": 1000, - "Reason Lens": "1260.23" - }, - { - "Model": "deepseek-v2.5-0908", - "Mode": "greedy", - "Puzzle Acc": "22.10", - "Cell Acc": "38.01", - "No answer": "12.70", - "Easy Puzzle Acc": "68.21", - "Hard Puzzle Acc": "4.17", - "Total Puzzles": 1000, - "Reason Lens": "1294.46" - }, - { - "Model": "Qwen2-72B-Instruct", - "Mode": "greedy", - "Puzzle Acc": "21.40", - "Cell Acc": "38.32", - "No answer": "10.20", - "Easy Puzzle Acc": "63.93", - "Hard Puzzle Acc": "4.86", - "Total Puzzles": 1000, - "Reason Lens": "1813.82" - }, - { - "Model": "deepseek-v2-coder-0614", - "Mode": "greedy", - "Puzzle Acc": "21.10", - "Cell Acc": "41.58", - "No answer": "4.90", - "Easy Puzzle Acc": "64.64", - "Hard Puzzle Acc": "4.17", - "Total Puzzles": 1000, - "Reason Lens": "1324.55" - }, - { - "Model": "deepseek-v2-coder-0724", - "Mode": "greedy", - "Puzzle Acc": "20.50", - "Cell Acc": "42.35", - "No answer": "3.40", - "Easy Puzzle Acc": "61.79", - "Hard Puzzle Acc": "4.44", - "Total Puzzles": 1000, - "Reason Lens": "1230.63" + "Reason Lens": "901.94", + "N_Mode": "best_of_n", + "N_Size": 64 }, { "Model": "gpt-4o-mini-2024-07-18", - "Mode": "greedy", - "Puzzle Acc": "20.10", - "Cell Acc": "41.26", - "No answer": "0.10", - "Easy Puzzle Acc": "62.50", - "Hard Puzzle Acc": "3.61", - "Total Puzzles": 1000, - "Reason Lens": "943.52" - }, - { - "Model": "gemini-1.5-pro", - "Mode": "sampling", - "Puzzle Acc": "19.70", - "Cell Acc": "45.24", - "No answer": "0.40", - "Easy Puzzle Acc": "60.00", - "Hard Puzzle Acc": "4.03", - "Total Puzzles": 1000, - "Reason Lens": "1356.77" - }, - { - "Model": "gemini-1.5-flash", - "Mode": "greedy", - "Puzzle Acc": "19.40", - "Cell Acc": "31.77", - "No answer": "22.70", - "Easy Puzzle Acc": "59.29", - "Hard Puzzle Acc": "3.89", - "Total Puzzles": 1000, - "Reason Lens": "1538.18" - }, - { - "Model": "gemini-1.5-pro", - "Mode": "greedy", - "Puzzle Acc": "19.40", - "Cell Acc": "44.59", - "No answer": "0.80", - "Easy Puzzle Acc": "55.71", - "Hard Puzzle Acc": "5.28", - "Total Puzzles": 1000, - "Reason Lens": "1336.17" - }, - { - "Model": "yi-large-preview", - "Mode": "greedy", - "Puzzle Acc": "18.90", - "Cell Acc": "42.61", - "No answer": "1.40", - "Easy Puzzle Acc": "58.93", - "Hard Puzzle Acc": "3.33", - "Total Puzzles": 1000, - "Reason Lens": "833.36" - }, - { - "Model": "yi-large", - "Mode": "greedy", - "Puzzle Acc": "18.80", - "Cell Acc": "39.83", - "No answer": "1.80", - "Easy Puzzle Acc": "58.21", - "Hard Puzzle Acc": "3.47", - "Total Puzzles": 1000, - "Reason Lens": "757.01" - }, - { - "Model": "claude-3-sonnet-20240229", - "Mode": "greedy", - "Puzzle Acc": "18.70", - "Cell Acc": "43.66", + "Mode": "bon_32", + "Puzzle Acc": "42.70", + "Cell Acc": "68.86", "No answer": "0.00", - "Easy Puzzle Acc": "58.93", - "Hard Puzzle Acc": "3.06", - "Total Puzzles": 1000, - "Reason Lens": "1095.37" - }, - { - "Model": "Qwen2-72B-Instruct", - "Mode": "sampling", - "Puzzle Acc": "18.70", - "Cell Acc": "40.57", - "No answer": "3.20", - "Easy Puzzle Acc": "57.50", - "Hard Puzzle Acc": "3.61", - "Total Puzzles": 1000, - "Reason Lens": "1894.72" - }, - { - "Model": "gemini-1.5-flash", - "Mode": "sampling", - "Puzzle Acc": "18.40", - "Cell Acc": 
"36.03", - "No answer": "12.80", - "Easy Puzzle Acc": "57.86", - "Hard Puzzle Acc": "3.06", + "Easy Puzzle Acc": "97.50", + "Hard Puzzle Acc": "21.39", "Total Puzzles": 1000, - "Reason Lens": "1713.03" + "Reason Lens": "980.51", + "N_Mode": "best_of_n", + "N_Size": 32 }, { - "Model": "Meta-Llama-3-70B-Instruct", - "Mode": "greedy", - "Puzzle Acc": "16.80", - "Cell Acc": "42.31", - "No answer": "0.20", - "Easy Puzzle Acc": "52.86", - "Hard Puzzle Acc": "2.78", - "Total Puzzles": 1000, - "Reason Lens": "809.95" - }, - { - "Model": "Athene-70B", - "Mode": "greedy", - "Puzzle Acc": "16.70", - "Cell Acc": "32.98", - "No answer": "21.10", - "Easy Puzzle Acc": "52.50", - "Hard Puzzle Acc": "2.78", - "Total Puzzles": 1000, - "Reason Lens": "391.19" - }, - { - "Model": "gemma-2-27b-it", - "Mode": "greedy", - "Puzzle Acc": "16.30", - "Cell Acc": "41.18", - "No answer": "1.10", - "Easy Puzzle Acc": "50.71", - "Hard Puzzle Acc": "2.92", - "Total Puzzles": 1000, - "Reason Lens": "1014.56" - }, - { - "Model": "claude-3-haiku-20240307", - "Mode": "greedy", - "Puzzle Acc": "14.30", - "Cell Acc": "37.87", - "No answer": "0.10", - "Easy Puzzle Acc": "47.86", - "Hard Puzzle Acc": "1.25", - "Total Puzzles": 1000, - "Reason Lens": "1015.06" - }, - { - "Model": "command-r-plus", - "Mode": "greedy", - "Puzzle Acc": "13.90", - "Cell Acc": "39.01", - "No answer": "0.20", - "Easy Puzzle Acc": "44.64", - "Hard Puzzle Acc": "1.94", - "Total Puzzles": 1000, - "Reason Lens": "810.53" - }, - { - "Model": "reka-core-20240501", - "Mode": "greedy", - "Puzzle Acc": "13.00", - "Cell Acc": "33.88", - "No answer": "4.00", - "Easy Puzzle Acc": "43.21", - "Hard Puzzle Acc": "1.25", - "Total Puzzles": 1000, - "Reason Lens": "1078.29" - }, - { - "Model": "gemma-2-9b-it", - "Mode": "greedy", - "Puzzle Acc": "12.80", - "Cell Acc": "36.79", + "Model": "gpt-4o-mini-2024-07-18", + "Mode": "bon_32_v2", + "Puzzle Acc": "42.60", + "Cell Acc": "69.39", "No answer": "0.00", - "Easy Puzzle Acc": "41.79", - "Hard Puzzle Acc": "1.53", - "Total Puzzles": 1000, - "Reason Lens": "849.84" - }, - { - "Model": "Meta-Llama-3.1-8B-Instruct", - "Mode": "greedy", - "Puzzle Acc": "12.80", - "Cell Acc": "13.68", - "No answer": "61.50", - "Easy Puzzle Acc": "43.57", - "Hard Puzzle Acc": "0.83", - "Total Puzzles": 1000, - "Reason Lens": "1043.90" - }, - { - "Model": "Qwen2.5-7B-Instruct", - "Mode": "greedy", - "Puzzle Acc": "12.00", - "Cell Acc": "30.67", - "No answer": "9.50", - "Easy Puzzle Acc": "38.93", - "Hard Puzzle Acc": "1.53", - "Total Puzzles": 1000, - "Reason Lens": "850.93" - }, - { - "Model": "Meta-Llama-3-8B-Instruct", - "Mode": "greedy", - "Puzzle Acc": "11.90", - "Cell Acc": "23.70", - "No answer": "29.20", - "Easy Puzzle Acc": "40.71", - "Hard Puzzle Acc": "0.69", - "Total Puzzles": 1000, - "Reason Lens": "1216.40" - }, - { - "Model": "Mistral-Nemo-Instruct-2407", - "Mode": "greedy", - "Puzzle Acc": "11.80", - "Cell Acc": "34.93", - "No answer": "1.60", - "Easy Puzzle Acc": "38.93", - "Hard Puzzle Acc": "1.25", - "Total Puzzles": 1000, - "Reason Lens": "925.88" - }, - { - "Model": "Phi-3-mini-4k-instruct", - "Mode": "greedy", - "Puzzle Acc": "11.60", - "Cell Acc": "13.50", - "No answer": "59.00", - "Easy Puzzle Acc": "38.21", - "Hard Puzzle Acc": "1.25", - "Total Puzzles": 1000, - "Reason Lens": "790.29" - }, - { - "Model": "Yi-1.5-34B-Chat", - "Mode": "greedy", - "Puzzle Acc": "11.50", - "Cell Acc": "32.73", - "No answer": "4.40", - "Easy Puzzle Acc": "37.50", - "Hard Puzzle Acc": "1.39", - "Total Puzzles": 1000, - "Reason Lens": "869.65" 
- }, - { - "Model": "Meta-Llama-3-8B-Instruct", - "Mode": "sampling", - "Puzzle Acc": "11.00", - "Cell Acc": "26.11", - "No answer": "22.30", - "Easy Puzzle Acc": "36.79", - "Hard Puzzle Acc": "0.97", - "Total Puzzles": 1000, - "Reason Lens": "1282.40" - }, - { - "Model": "gpt-3.5-turbo-0125", - "Mode": "greedy", - "Puzzle Acc": "10.10", - "Cell Acc": "33.06", - "No answer": "0.10", - "Easy Puzzle Acc": "33.57", - "Hard Puzzle Acc": "0.97", - "Total Puzzles": 1000, - "Reason Lens": "820.66" - }, - { - "Model": "command-r", - "Mode": "greedy", - "Puzzle Acc": "9.90", - "Cell Acc": "32.66", - "No answer": "1.50", - "Easy Puzzle Acc": "32.14", - "Hard Puzzle Acc": "1.25", - "Total Puzzles": 1000, - "Reason Lens": "1005.17" - }, - { - "Model": "reka-flash-20240226", - "Mode": "greedy", - "Puzzle Acc": "9.30", - "Cell Acc": "25.67", - "No answer": "18.70", - "Easy Puzzle Acc": "30.71", - "Hard Puzzle Acc": "0.97", - "Total Puzzles": 1000, - "Reason Lens": "1074.80" - }, - { - "Model": "mathstral-7B-v0.1", - "Mode": "greedy", - "Puzzle Acc": "9.00", - "Cell Acc": "20.42", - "No answer": "36.00", - "Easy Puzzle Acc": "30.00", - "Hard Puzzle Acc": "0.83", - "Total Puzzles": 1000, - "Reason Lens": "1148.16" - }, - { - "Model": "Mixtral-8x7B-Instruct-v0.1", - "Mode": "greedy", - "Puzzle Acc": "8.70", - "Cell Acc": "26.47", - "No answer": "20.30", - "Easy Puzzle Acc": "28.93", - "Hard Puzzle Acc": "0.83", - "Total Puzzles": 1000, - "Reason Lens": "1177.21" - }, - { - "Model": "Qwen2-7B-Instruct", - "Mode": "greedy", - "Puzzle Acc": "8.40", - "Cell Acc": "22.06", - "No answer": "24.40", - "Easy Puzzle Acc": "29.29", - "Hard Puzzle Acc": "0.28", - "Total Puzzles": 1000, - "Reason Lens": "1473.23" - }, - { - "Model": "Phi-3.5-mini-instruct", - "Mode": "greedy", - "Puzzle Acc": "6.40", - "Cell Acc": "5.98", - "No answer": "80.60", - "Easy Puzzle Acc": "21.79", - "Hard Puzzle Acc": "0.42", - "Total Puzzles": 1000, - "Reason Lens": "718.43" - }, - { - "Model": "Qwen2.5-3B-Instruct", - "Mode": "greedy", - "Puzzle Acc": "4.80", - "Cell Acc": "11.44", - "No answer": "56.70", - "Easy Puzzle Acc": "17.14", - "Hard Puzzle Acc": "0.00", - "Total Puzzles": 1000, - "Reason Lens": "906.58" - }, - { - "Model": "gemma-2-2b-it", - "Mode": "greedy", - "Puzzle Acc": "4.20", - "Cell Acc": "9.97", - "No answer": "57.20", - "Easy Puzzle Acc": "14.29", - "Hard Puzzle Acc": "0.28", - "Total Puzzles": 1000, - "Reason Lens": "1032.89" - }, - { - "Model": "Yi-1.5-9B-Chat", - "Mode": "greedy", - "Puzzle Acc": "2.30", - "Cell Acc": "7.53", - "No answer": "11.30", - "Easy Puzzle Acc": "8.21", - "Hard Puzzle Acc": "0.00", + "Easy Puzzle Acc": "96.79", + "Hard Puzzle Acc": "21.53", "Total Puzzles": 1000, - "Reason Lens": "1592.60" + "Reason Lens": "978.38", + "N_Mode": "best_of_n", + "N_Size": 32 } ] \ No newline at end of file diff --git a/result_dirs/zebra-grid.summary.md b/result_dirs/zebra-grid.summary.md index fda6f0e..298561d 100644 --- a/result_dirs/zebra-grid.summary.md +++ b/result_dirs/zebra-grid.summary.md @@ -1,66 +1,5 @@ -| Model | Mode | Puzzle Acc | Easy Puzzle Acc | Hard Puzzle Acc | Cell Acc | No answer | Total Puzzles | Reason Lens | -|----------------------------------|----------|--------------|-------------------|-------------------|------------|-------------|-----------------|---------------| -| o1-preview-2024-09-12 | greedy | 71.4 | 98.57 | 60.83 | 75.14 | 0.3 | 1000 | 1565.88 | -| o1-preview-2024-09-12-v2 | greedy | 70.4 | 98.21 | 59.58 | 74.18 | 0.4 | 1000 | 1559.71 | -| o1-mini-2024-09-12-v3 | greedy | 59.7 
| 86.07 | 49.44 | 70.32 | 1 | 1000 | 1166.38 | -| o1-mini-2024-09-12-v2 | greedy | 56.8 | 82.86 | 46.67 | 69.87 | 1.3 | 1000 | 1164.95 | -| o1-mini-2024-09-12 | greedy | 52.6 | 87.14 | 39.17 | 52.29 | 0.8 | 1000 | 993.28 | -| claude-3-5-sonnet-20240620 | greedy | 33.4 | 87.5 | 12.36 | 54.34 | 0 | 1000 | 1141.94 | -| claude-3-5-sonnet-20240620 | sampling | 33.4 | 88.21 | 12.08 | 53.01 | 0.1 | 1000 | 1153.83 | -| Llama-3.1-405B-Inst-fp8@together | greedy | 32.6 | 87.14 | 11.39 | 45.8 | 12.5 | 1000 | 314.66 | -| Llama-3.1-405B-Inst-fp8@together | sampling | 32.6 | 86.07 | 11.81 | 47.04 | 10.8 | 1000 | 439.96 | -| gpt-4o-2024-08-06 | greedy | 31.7 | 84.64 | 11.11 | 50.34 | 3.6 | 1000 | 1106.51 | -| gpt-4o-2024-05-13 | sampling | 30.8 | 81.07 | 11.25 | 46.19 | 6.6 | 1000 | 1549.74 | -| gemini-1.5-pro-exp-0827 | greedy | 30.5 | 79.64 | 11.39 | 50.84 | 0.8 | 1000 | 1594.47 | -| Llama-3.1-405B-Inst@sambanova | greedy | 30.1 | 84.64 | 8.89 | 39.06 | 24.7 | 1000 | 2001.12 | -| chatgpt-4o-latest-24-09-07 | greedy | 29.9 | 81.43 | 9.86 | 48.83 | 4.2 | 1000 | 1539.99 | -| Mistral-Large-2 | greedy | 29 | 80.36 | 9.03 | 47.64 | 1.7 | 1000 | 1592.39 | -| gpt-4-turbo-2024-04-09 | greedy | 28.4 | 80.71 | 8.06 | 47.9 | 0.1 | 1000 | 1148.46 | -| gpt-4o-2024-05-13 | greedy | 28.2 | 77.86 | 8.89 | 38.72 | 19.3 | 1000 | 1643.51 | -| gpt-4-0314 | greedy | 27.1 | 77.14 | 7.64 | 47.43 | 0.2 | 1000 | 1203.17 | -| claude-3-opus-20240229 | greedy | 27 | 78.21 | 7.08 | 48.91 | 0 | 1000 | 855.72 | -| Qwen2.5-72B-Instruct | greedy | 26.6 | 76.43 | 7.22 | 40.92 | 11.9 | 1000 | 1795.9 | -| gpt-4-turbo-2024-04-09 | sampling | 26.4 | 74.29 | 7.78 | 47.93 | 0 | 1000 | 1165.9 | -| Qwen2.5-32B-Instruct | greedy | 26.1 | 77.5 | 6.11 | 43.39 | 6.3 | 1000 | 1333.07 | -| gemini-1.5-pro-exp-0801 | greedy | 25.2 | 72.5 | 6.81 | 48.5 | 0 | 1000 | 1389.75 | -| Llama-3.1-405B-Inst@hyperbolic | greedy | 25 | 66.67 | 15.38 | 46.62 | 6.25 | 16 | 1517.13 | -| gemini-1.5-flash-exp-0827 | greedy | 25 | 70.71 | 7.22 | 43.56 | 8.5 | 1000 | 1705.11 | -| Meta-Llama-3.1-70B-Instruct | greedy | 24.9 | 73.57 | 5.97 | 27.98 | 43 | 1000 | 1483.68 | -| deepseek-v2-chat-0628 | greedy | 22.7 | 68.57 | 4.86 | 42.46 | 5.2 | 1000 | 1260.23 | -| deepseek-v2.5-0908 | greedy | 22.1 | 68.21 | 4.17 | 38.01 | 12.7 | 1000 | 1294.46 | -| Qwen2-72B-Instruct | greedy | 21.4 | 63.93 | 4.86 | 38.32 | 10.2 | 1000 | 1813.82 | -| deepseek-v2-coder-0614 | greedy | 21.1 | 64.64 | 4.17 | 41.58 | 4.9 | 1000 | 1324.55 | -| deepseek-v2-coder-0724 | greedy | 20.5 | 61.79 | 4.44 | 42.35 | 3.4 | 1000 | 1230.63 | -| gpt-4o-mini-2024-07-18 | greedy | 20.1 | 62.5 | 3.61 | 41.26 | 0.1 | 1000 | 943.52 | -| gemini-1.5-pro | sampling | 19.7 | 60 | 4.03 | 45.24 | 0.4 | 1000 | 1356.77 | -| gemini-1.5-flash | greedy | 19.4 | 59.29 | 3.89 | 31.77 | 22.7 | 1000 | 1538.18 | -| gemini-1.5-pro | greedy | 19.4 | 55.71 | 5.28 | 44.59 | 0.8 | 1000 | 1336.17 | -| yi-large-preview | greedy | 18.9 | 58.93 | 3.33 | 42.61 | 1.4 | 1000 | 833.36 | -| yi-large | greedy | 18.8 | 58.21 | 3.47 | 39.83 | 1.8 | 1000 | 757.01 | -| claude-3-sonnet-20240229 | greedy | 18.7 | 58.93 | 3.06 | 43.66 | 0 | 1000 | 1095.37 | -| Qwen2-72B-Instruct | sampling | 18.7 | 57.5 | 3.61 | 40.57 | 3.2 | 1000 | 1894.72 | -| gemini-1.5-flash | sampling | 18.4 | 57.86 | 3.06 | 36.03 | 12.8 | 1000 | 1713.03 | -| Meta-Llama-3-70B-Instruct | greedy | 16.8 | 52.86 | 2.78 | 42.31 | 0.2 | 1000 | 809.95 | -| Athene-70B | greedy | 16.7 | 52.5 | 2.78 | 32.98 | 21.1 | 1000 | 391.19 | -| gemma-2-27b-it | greedy | 16.3 | 50.71 | 2.92 | 41.18 | 
1.1 | 1000 | 1014.56 | -| claude-3-haiku-20240307 | greedy | 14.3 | 47.86 | 1.25 | 37.87 | 0.1 | 1000 | 1015.06 | -| command-r-plus | greedy | 13.9 | 44.64 | 1.94 | 39.01 | 0.2 | 1000 | 810.53 | -| reka-core-20240501 | greedy | 13 | 43.21 | 1.25 | 33.88 | 4 | 1000 | 1078.29 | -| gemma-2-9b-it | greedy | 12.8 | 41.79 | 1.53 | 36.79 | 0 | 1000 | 849.84 | -| Meta-Llama-3.1-8B-Instruct | greedy | 12.8 | 43.57 | 0.83 | 13.68 | 61.5 | 1000 | 1043.9 | -| Qwen2.5-7B-Instruct | greedy | 12 | 38.93 | 1.53 | 30.67 | 9.5 | 1000 | 850.93 | -| Meta-Llama-3-8B-Instruct | greedy | 11.9 | 40.71 | 0.69 | 23.7 | 29.2 | 1000 | 1216.4 | -| Mistral-Nemo-Instruct-2407 | greedy | 11.8 | 38.93 | 1.25 | 34.93 | 1.6 | 1000 | 925.88 | -| Phi-3-mini-4k-instruct | greedy | 11.6 | 38.21 | 1.25 | 13.5 | 59 | 1000 | 790.29 | -| Yi-1.5-34B-Chat | greedy | 11.5 | 37.5 | 1.39 | 32.73 | 4.4 | 1000 | 869.65 | -| Meta-Llama-3-8B-Instruct | sampling | 11 | 36.79 | 0.97 | 26.11 | 22.3 | 1000 | 1282.4 | -| gpt-3.5-turbo-0125 | greedy | 10.1 | 33.57 | 0.97 | 33.06 | 0.1 | 1000 | 820.66 | -| command-r | greedy | 9.9 | 32.14 | 1.25 | 32.66 | 1.5 | 1000 | 1005.17 | -| reka-flash-20240226 | greedy | 9.3 | 30.71 | 0.97 | 25.67 | 18.7 | 1000 | 1074.8 | -| mathstral-7B-v0.1 | greedy | 9 | 30 | 0.83 | 20.42 | 36 | 1000 | 1148.16 | -| Mixtral-8x7B-Instruct-v0.1 | greedy | 8.7 | 28.93 | 0.83 | 26.47 | 20.3 | 1000 | 1177.21 | -| Qwen2-7B-Instruct | greedy | 8.4 | 29.29 | 0.28 | 22.06 | 24.4 | 1000 | 1473.23 | -| Phi-3.5-mini-instruct | greedy | 6.4 | 21.79 | 0.42 | 5.98 | 80.6 | 1000 | 718.43 | -| Qwen2.5-3B-Instruct | greedy | 4.8 | 17.14 | 0 | 11.44 | 56.7 | 1000 | 906.58 | -| gemma-2-2b-it | greedy | 4.2 | 14.29 | 0.28 | 9.97 | 57.2 | 1000 | 1032.89 | -| Yi-1.5-9B-Chat | greedy | 2.3 | 8.21 | 0 | 7.53 | 11.3 | 1000 | 1592.6 | \ No newline at end of file +| Model | Mode | N_Mode | N_Size | Puzzle Acc | Easy Puzzle Acc | Hard Puzzle Acc | Cell Acc | No answer | Total Puzzles | Reason Lens | +|------------------------|-----------|-----------|----------|--------------|-------------------|-------------------|------------|-------------|-----------------|---------------| +| gpt-4o-mini-2024-07-18 | bon_64 | best_of_n | 64 | 47.9 | 97.14 | 28.75 | 73.42 | 0 | 1000 | 901.94 | +| gpt-4o-mini-2024-07-18 | bon_32 | best_of_n | 32 | 42.7 | 97.5 | 21.39 | 68.86 | 0 | 1000 | 980.51 | +| gpt-4o-mini-2024-07-18 | bon_32_v2 | best_of_n | 32 | 42.6 | 96.79 | 21.53 | 69.39 | 0 | 1000 | 978.38 | \ No newline at end of file diff --git a/src/evaluation/zebra_grid_eval.py b/src/evaluation/zebra_grid_eval.py index d20ca6f..924a10c 100644 --- a/src/evaluation/zebra_grid_eval.py +++ b/src/evaluation/zebra_grid_eval.py @@ -6,6 +6,9 @@ from eval_utils import load_model_results, extract_last_complete_json, model_name_replacement +from collections import Counter +from collections import defaultdict + private_solutions = {} def load_private_solutions(): @@ -17,7 +20,7 @@ def load_private_solutions(): -def eval_model(model, filepath): +def eval_model(model, filepath, mode="best_of_n", max_N=None): global private_solutions with open(filepath, "r") as f: print(f"Processing {filepath}") @@ -51,22 +54,139 @@ def eval_model(model, filepath): this_total_cells += len(columns) - 1 total_cells += this_total_cells - # Read and Parse the prediction from model output - prediction_str = item["output"][0] - prediction_json = extract_last_complete_json(prediction_str) - if prediction_json is None or "solution" not in prediction_json or prediction_json["solution"] is None: - # 
print("-"*100) - # prediction_str = prediction_str.replace("\n", "") - # print([prediction_str]) - # json.loads(prediction_str) + # Read and Parse the predictions from model output + predictions = [extract_last_complete_json(output) for output in item["output"]] + predictions = [p for p in predictions if p is not None and "solution" in p and p["solution"] is not None] + + # if all the predictions are empty, then skip the current puzzle, and add no answer count + if not predictions: no_asnwer += 1 - # print(item["id"]) - continue - reason = prediction_json.get("reasoning", "") - prediction_table = prediction_json["solution"] + continue + + # Limit the number of predictions to max_N if specified + if max_N is not None: + predictions = predictions[:max_N] + + + n_size = len(predictions) # Capture the number of predictions + + if n_size == 1: + mode = "single" + # Single output case + prediction_table = predictions[0]["solution"] + reason = predictions[0].get("reasoning", "") + elif mode == "best_of_n": + # Best of N: Choose the prediction with the maximum number of correct cells + max_correct_cells = 0 + best_prediction = None + for prediction in predictions: + current_correct_cells = 0 + prediction_table = prediction["solution"] + for house in solution_table: + for column in solution_table[house]: + if house in prediction_table and column in prediction_table[house]: + truth_cell = solution_table[house][column].lower().strip() + # Note that prediction_table[house][column] could be None + if prediction_table[house][column] is None: + continue + predicted_cell = prediction_table[house][column].lower().strip() + if truth_cell == predicted_cell: + current_correct_cells += 1 + if current_correct_cells > max_correct_cells or best_prediction is None: + max_correct_cells = current_correct_cells + best_prediction = prediction + prediction_table = best_prediction["solution"] + reason = best_prediction.get("reasoning", "") + + elif mode == "majority_of_n": + # Majority of N: Perform majority voting for each cell + prediction_table = {} + for house in solution_table: + prediction_table[house] = {} + for column in solution_table[house]: + votes = [] + for prediction in predictions: + if house in prediction["solution"] and column in prediction["solution"][house]: + predicted_cell = prediction["solution"][house][column] + if isinstance(predicted_cell, list): + predicted_cell = predicted_cell[0] + # Note that prediction_table[house][column] could be None + if predicted_cell is not None: + votes.append(predicted_cell.lower().strip()) + if votes: + most_common = Counter(votes).most_common(1)[0][0] + prediction_table[house][column] = most_common + else: + prediction_table[house][column] = None + # reason = "" # Reasoning is not applicable for majority voting + # use a random prediction to get the reasoning + reason = predictions[0].get("reasoning", "") + elif mode in ["most_common_of_n", "middle_common_of_n", "least_common_of_n"]: + # Choose the prediction where the cell's value is the most common among all predictions at the same positions + # Specifically, we give each value at each position a score based on its popularity, and the prediction with the highest sum of scores is chosen + # Initialize a dictionary to store scores for each prediction + prediction_scores = defaultdict(int) + + # Iterate over each house and column in the solution table + for house in solution_table: + for column in solution_table[house]: + # Count occurrences of each value at the current position across all predictions + 
value_counter = Counter() + for prediction in predictions: + if house in prediction["solution"] and column in prediction["solution"][house]: + predicted_cell = prediction["solution"][house][column] + if isinstance(predicted_cell, list): + predicted_cell = predicted_cell[0] + if predicted_cell is not None: + value_counter[predicted_cell.lower().strip()] += 1 + + # Assign scores to each prediction based on the popularity of its value at the current position + for idx, prediction in enumerate(predictions): + if house in prediction["solution"] and column in prediction["solution"][house]: + predicted_cell = prediction["solution"][house][column] + if isinstance(predicted_cell, list): + predicted_cell = predicted_cell[0] + if predicted_cell is not None: + prediction_scores[idx] += value_counter[predicted_cell.lower().strip()] + if mode == "most_common_of_n": + # Select the prediction with the highest score + best_index = max(range(len(predictions)), key=lambda idx: prediction_scores[idx]) + best_prediction = predictions[best_index] + prediction_table = best_prediction["solution"] + reason = best_prediction.get("reasoning", "") + elif mode == "middle_common_of_n": + # Select the prediction with the median score + best_index = sorted(range(len(predictions)), key=lambda idx: prediction_scores[idx])[len(predictions) // 2] + best_prediction = predictions[best_index] + prediction_table = best_prediction["solution"] + reason = best_prediction.get("reasoning", "") + elif mode == "least_common_of_n": + # Select the prediction with the lowest score + best_index = min(range(len(predictions)), key=lambda idx: prediction_scores[idx]) + best_prediction = predictions[best_index] + prediction_table = best_prediction["solution"] + reason = best_prediction.get("reasoning", "") - reason_lens.append(len(reason)) + elif mode in ["longest_of_n", "shortest_of_n", "median_of_n"]: + # Collect all predictions with their reasoning lengths + predictions_with_lengths = [(prediction, len(prediction.get("reasoning", ""))) for prediction in predictions] + + # Sort by reasoning length + predictions_with_lengths.sort(key=lambda x: x[1]) + + if mode == "longest_of_n": + best_prediction = predictions_with_lengths[-1][0] # Last element for longest + elif mode == "shortest_of_n": + best_prediction = predictions_with_lengths[0][0] # First element for shortest + elif mode == "median_of_n": + median_index = len(predictions_with_lengths) // 2 + best_prediction = predictions_with_lengths[median_index][0] # Middle element for median + prediction_table = best_prediction["solution"] + reason = best_prediction.get("reasoning", "") + + reason_lens.append(len(reason)) + this_correct_cells = 0 # number in the solution_table for house in solution_table: for column in solution_table[house]: @@ -95,6 +215,7 @@ def eval_model(model, filepath): parsed_item["correct_cells"] = this_correct_cells parsed_item["total_cells"] = this_total_cells parsed_item["solved"] = this_correct_cells == this_total_cells + parsed_results.append(parsed_item) # # print the success rate by size; order the dict by size first @@ -121,16 +242,30 @@ def eval_model(model, filepath): result["Total Puzzles"] = num_total_puzzles result["Reason Lens"] = f"{sum(reason_lens)/len(reason_lens):.2f}" result["Model"] = model_name_replacement(result["Model"]) + result["N_Mode"] = "single" if n_size == 1 else mode + result["N_Size"] = n_size return result, parsed_results # Return parsed_results along with the result def gen_results(run_name_folders): model_results = 
load_model_results(run_name_folders) - columns = ["Model", "Mode", "Puzzle Acc", "Easy Puzzle Acc", "Hard Puzzle Acc", "Cell Acc", "No answer", "Total Puzzles", "Reason Lens"] + columns = ["Model", "Mode", "N_Mode", "N_Size", "Puzzle Acc", "Easy Puzzle Acc", "Hard Puzzle Acc", "Cell Acc", "No answer", "Total Puzzles", "Reason Lens"] rows = [] for model_name, filepath in model_results.items(): - result, parsed_results = eval_model(model_name, filepath) + + # result, parsed_results = eval_model(model_name, filepath, mode="majority_of_n", max_N=32) + result, parsed_results = eval_model(model_name, filepath, mode="best_of_n", max_N=64) + # result, parsed_results = eval_model(model_name, filepath, mode="most_common_of_n", max_N=64) + + # result, parsed_results = eval_model(model_name, filepath, mode="longest_of_n", max_N=32) + # result, parsed_results = eval_model(model_name, filepath, mode="shortest_of_n", max_N=32) + # result, parsed_results = eval_model(model_name, filepath, mode="median_of_n", max_N=32) + + # result, parsed_results = eval_model(model_name, filepath, mode="least_common_of_n", max_N=32) + # result, parsed_results = eval_model(model_name, filepath, mode="middle_common_of_n", max_N=32) + + # Save the parsed_results to the same filepath with a new prefix parsed_results_filepath = filepath.replace("result_dirs", "result_dirs_parsed") # Create folders if they don't exist @@ -159,8 +294,12 @@ def gen_results(run_name_folders): if __name__ == "__main__": run_name_folders = { - "greedy": "result_dirs/zebra-grid", - "sampling": "result_dirs/zebra-grid/sampling", + # "greedy": "result_dirs/zebra-grid", + # "sampling": "result_dirs/zebra-grid/sampling", + "bon_32": "result_dirs/zebra-grid/bon_32", + "bon_32_v2": "result_dirs/zebra-grid/bon_32_v2", + "bon_64": "result_dirs/zebra-grid/bon_64", } load_private_solutions() gen_results(run_name_folders) +
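
Note (not part of the patch): the selection strategies added to eval_model above can be hard to compare in the abstract, so below is a minimal standalone sketch of the two main ones, best_of_n and majority_of_n, on a hypothetical two-house puzzle. The names toy_solution, toy_predictions, and correct_cells are invented for illustration; only Counter comes from the standard library, mirroring the import in the patch.

# Illustrative sketch only -- toy data, not the repository's evaluation code.
from collections import Counter

toy_solution = {"House 1": {"Name": "alice", "Drink": "tea"},
                "House 2": {"Name": "bob", "Drink": "coffee"}}

toy_predictions = [  # three sampled outputs, already parsed to {"solution": ...}
    {"solution": {"House 1": {"Name": "alice", "Drink": "milk"},
                  "House 2": {"Name": "bob", "Drink": "coffee"}}},
    {"solution": {"House 1": {"Name": "alice", "Drink": "tea"},
                  "House 2": {"Name": "bob", "Drink": "coffee"}}},
    {"solution": {"House 1": {"Name": "alice", "Drink": "milk"},
                  "House 2": {"Name": "bob", "Drink": "water"}}},
]

def correct_cells(pred):
    # Count cells matching the ground truth, mirroring the scoring loop in the patch.
    return sum(1 for h in toy_solution for c in toy_solution[h]
               if pred["solution"].get(h, {}).get(c, "").lower().strip()
               == toy_solution[h][c])

# best_of_n: oracle selection, keep the sample with the most correct cells.
best = max(toy_predictions, key=correct_cells)  # -> the second sample (4/4 cells correct)

# majority_of_n: per-cell vote across all samples, independent of the ground truth.
majority = {h: {c: Counter(p["solution"][h][c]
                           for p in toy_predictions).most_common(1)[0][0]
                for c in toy_solution[h]}
            for h in toy_solution}
# majority -> {"House 1": {"Name": "alice", "Drink": "milk"},
#              "House 2": {"Name": "bob", "Drink": "coffee"}}  (3/4 cells correct)

As the sketch suggests, best_of_n consults the ground-truth table when choosing a sample, so it is an oracle-style upper bound, whereas majority_of_n aggregates the samples without seeing the truth. The most_common_of_n / middle_common_of_n / least_common_of_n modes in the patch sit in between: each sample is scored by how popular its cell values are across all samples, and the highest-, median-, or lowest-scoring sample is kept.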