Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
568 changes: 411 additions & 157 deletions Extrapolate METR.ipynb

Large diffs are not rendered by default.

297 changes: 125 additions & 172 deletions Full Timelines Model.ipynb

Large diffs are not rendered by default.

104 changes: 50 additions & 54 deletions Simple METR models.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,9 @@
"source": [
"import squigglepy as sq\n",
"from datetime import datetime\n",
"from libs import run_model, calculate_doubling_time\n",
"\n",
"\n",
"O3_LAUNCH_DATE = datetime(2025, 4, 16)\n",
"CLAUDE_3P7_LAUNCH_DATE = datetime(2025, 2, 24)\n",
"from libs import run_model, calculate_doubling_time\n",
"from model_data import model_data\n",
"\n",
"print(\"Loaded libraries\")"
]
Expand All @@ -36,43 +34,43 @@
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████████████████████████████████████████████████████████████████████| 100000/100000 [00:05<00:00, 16989.81it/s]\n",
"100%|██████████████████████████████████████████████████████████████████████████| 100000/100000 [00:07<00:00, 12581.74it/s]\n"
"100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100000/100000 [00:06<00:00, 15637.69it/s]\n",
"100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100000/100000 [00:07<00:00, 13032.56it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"{1: 1102,\n",
" 5: 1252,\n",
" 10: 1343,\n",
"{1: 1098,\n",
" 5: 1254,\n",
" 10: 1344,\n",
" 20: 1460,\n",
" 30: 1552,\n",
" 30: 1551,\n",
" 40: 1634,\n",
" 50: 1716,\n",
" 50: 1714,\n",
" 60: 1800,\n",
" 70: 1897,\n",
" 70: 1895,\n",
" 80: 2013,\n",
" 90: 2189,\n",
" 95: 2348,\n",
" 99: 2666}\n",
" 90: 2193,\n",
" 95: 2347,\n",
" 99: 2671}\n",
"\n",
"-\n",
"\n",
"{1: '2028 Mar 02',\n",
" 5: '2028 Jul 31',\n",
" 10: '2028 Oct 29',\n",
"{1: '2028 Feb 28',\n",
" 5: '2028 Aug 02',\n",
" 10: '2028 Oct 30',\n",
" 20: '2029 Feb 23',\n",
" 30: '2029 May 26',\n",
" 40: '2029 Aug 17',\n",
" 50: '2029 Nov 06',\n",
" 40: '2029 Aug 16',\n",
" 50: '2029 Nov 05',\n",
" 60: '2030 Jan 29',\n",
" 70: '2030 May 07',\n",
" 70: '2030 May 05',\n",
" 80: '2030 Aug 30',\n",
" 90: '2031 Feb 23',\n",
" 95: '2031 Jul 31',\n",
" 99: '2032 Jun 14'}\n"
" 90: '2031 Feb 26',\n",
" 95: '2031 Jul 30',\n",
" 99: '2032 Jun 18'}\n"
]
}
],
Expand All @@ -87,9 +85,7 @@
" return days * measurement_error_variance\n",
"\n",
"\n",
"_ = run_model(\n",
" metr_model, index_date=CLAUDE_3P7_LAUNCH_DATE\n",
") # Results should look similar to Figure 12"
"_ = run_model(metr_model, index_date=model_data['claude_3p7_sonnet']['launch_date']) # Results should look similar to Figure 12"
]
},
{
Expand All @@ -102,58 +98,58 @@
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████████████████████████████████████████████████████████████████████| 100000/100000 [00:06<00:00, 16320.64it/s]\n",
"100%|██████████████████████████████████████████████████████████████████████████| 100000/100000 [00:07<00:00, 12565.94it/s]\n"
"100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100000/100000 [00:06<00:00, 15426.34it/s]\n",
"100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100000/100000 [00:07<00:00, 12857.84it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"{1: 546,\n",
" 5: 621,\n",
" 10: 666,\n",
" 20: 724,\n",
"{1: 547,\n",
" 5: 620,\n",
" 10: 665,\n",
" 20: 723,\n",
" 30: 769,\n",
" 40: 811,\n",
" 40: 810,\n",
" 50: 851,\n",
" 60: 893,\n",
" 70: 940,\n",
" 80: 998,\n",
" 90: 1086,\n",
" 95: 1164,\n",
" 99: 1324}\n",
" 90: 1085,\n",
" 95: 1163,\n",
" 99: 1323}\n",
"\n",
"-\n",
"\n",
"{1: '2026 Oct 14',\n",
" 5: '2026 Dec 28',\n",
" 10: '2027 Feb 11',\n",
" 20: '2027 Apr 11',\n",
"{1: '2026 Oct 16',\n",
" 5: '2026 Dec 27',\n",
" 10: '2027 Feb 10',\n",
" 20: '2027 Apr 10',\n",
" 30: '2027 May 26',\n",
" 40: '2027 Jul 06',\n",
" 50: '2027 Aug 15',\n",
" 60: '2027 Sep 27',\n",
" 70: '2027 Nov 12',\n",
" 60: '2027 Sep 26',\n",
" 70: '2027 Nov 13',\n",
" 80: '2028 Jan 10',\n",
" 90: '2028 Apr 07',\n",
" 95: '2028 Jun 24',\n",
" 90: '2028 Apr 06',\n",
" 95: '2028 Jun 23',\n",
" 99: '2028 Nov 30'}\n"
]
}
],
"source": [
"def metr_model_with_o3():\n",
" days = calculate_doubling_time(\n",
" start_task_length=1.75, agi_task_length=167, doubling_time=118, acceleration=1\n",
" start_task_length=model_data['o3']['performance_50p'],\n",
" agi_task_length=167,\n",
" doubling_time=118,\n",
" acceleration=1\n",
" ) # Use o3 task length, o3 launch date, and the 2024-2025 doubling time\n",
" measurement_error_variance = sq.invlognorm(\n",
" 0.8, 1.5\n",
" ) # Add measurement error on tasks: SD fit to trend variance from Figure 12\n",
" measurement_error_variance = sq.invlognorm(0.8, 1.5) # Add measurement error on tasks: SD fit to trend variance from Figure 12\n",
" return days * measurement_error_variance\n",
"\n",
"\n",
"_ = run_model(metr_model_with_o3, index_date=O3_LAUNCH_DATE)"
"_ = run_model(metr_model_with_o3, index_date=model_data['o3']['launch_date'])"
]
},
{
Expand All @@ -166,8 +162,8 @@
"name": "stderr",
"output_type": "stream",
"text": [
"100%|█████████████████████████████████████████████████████████████████████████| 100000/100000 [00:00<00:00, 102136.93it/s]\n",
"100%|██████████████████████████████████████████████████████████████████████████| 100000/100000 [00:04<00:00, 20427.87it/s]\n"
"100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100000/100000 [00:01<00:00, 92619.45it/s]\n",
"100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100000/100000 [00:05<00:00, 19037.96it/s]\n"
]
},
{
Expand Down Expand Up @@ -219,7 +215,7 @@
" return days - shift\n",
"\n",
"\n",
"_ = run_model(simple_model, index_date=O3_LAUNCH_DATE)"
"_ = run_model(simple_model, index_date=model_data['o3']['launch_date'])"
]
}
],
Expand Down
55 changes: 20 additions & 35 deletions Track Acceleration.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,14 @@
],
"source": [
"from datetime import datetime\n",
"from typing import List, Tuple\n",
"\n",
"from libs import (\n",
" test_acceleration,\n",
" print_estimation,\n",
" bootstrap_growth_parameters,\n",
" sliding_window_analysis,\n",
")\n",
"from model_data import model_data\n",
"\n",
"print(\"Loaded libraries\")"
]
Expand All @@ -42,46 +42,31 @@
"GPT‑2 to GPT5 (50%): (316, 0.908)\n",
"GPT-3 to GPT5 (50%): (296, 0.9)\n",
"GPT-4 to GPT5 (50%): (253, 0.9)\n",
"Claude 3 Opus to GPT5 (50%): (116, 1.0)\n",
"Claude 3 Opus to GPT5 (50%): (116, 1.0)\n",
"GPT-3 to Claude 4.1 Opus (50%): (296, 0.9)\n",
"GPT-4 to Claude 4.1 Opus (50%): (256, 0.9)\n",
"Claude 3 Opus to Claude 4.1 Opus (50%): (117, 1.0)\n",
"Claude 3 Opus to Claude 4.1 Opus (50%): (117, 1.0)\n",
"Claude 3 Opus to Claude 4.1 Opus (50%): (117, 1.0)\n",
"Claude 3 Opus to Claude 4.1 Opus (50%): (117, 1.0)\n",
"\n",
"=== 80% Reliability ===\n",
"GPT‑2 to GPT5 (80%): (204, 0.974)\n",
"GPT-3 to GPT5 (80%): (302, 0.9)\n",
"GPT-4 to GPT5 (80%): (239, 0.9)\n",
"Claude 3 Opus to GPT5 (80%): (109, 1.0)\n",
"Claude 3 Opus to GPT5 (80%): (109, 1.0)\n",
"GPT-3 to Claude 4.1 Opus (80%): (302, 0.9)\n",
"GPT-4 to Claude 4.1 Opus (80%): (241, 0.9)\n",
"Claude 3 Opus to Claude 4.1 Opus (80%): (108, 1.0)\n",
"Claude 3 Opus to Claude 4.1 Opus (80%): (108, 1.0)\n"
"Claude 3 Opus to Claude 4.1 Opus (80%): (108, 1.0)\n",
"Claude 3 Opus to Claude 4.1 Opus (80%): (108, 1.0)\n"
]
}
],
"source": [
"observed_models: List[Tuple[str, datetime, float, float]] = [\n",
" # model # release date # task length at 50% # task length at 80% (in hrs)\n",
" (\"GPT‑2\", datetime(2019, 2, 14), 2 / 3600, 0.1 / 3600),\n",
" (\"GPT-3\", datetime(2020, 5, 28), 9 / 3600, 2 / 3600),\n",
" (\"GPT‑3.5 Turbo\", datetime(2023, 3, 1), 36 / 3600, 10 / 3600),\n",
" (\"GPT-4\", datetime(2023, 3, 14), 6 / 60, 1 / 60),\n",
" (\"GPT-4-Nov23\", datetime(2023, 11, 6), 8 / 60, 1 / 60),\n",
" (\"Claude 3 Opus\", datetime(2024, 3, 4), 6 / 60, 1 / 60),\n",
" (\"GPT‑4o\", datetime(2024, 5, 13), 9 / 60, 2 / 60),\n",
" (\"Claude 3.5 Sonnet (old)\", datetime(2024, 6, 20), 18 / 60, 3 / 60),\n",
" (\"o1 preview\", datetime(2024, 9, 12), 22 / 60, 4 / 60),\n",
" (\"Claude 3.5 Sonnet (new)\", datetime(2024, 10, 22), 28 / 60, 5 / 60),\n",
" (\"o1\", datetime(2024, 12, 5), 39 / 60, 6 / 60),\n",
" (\"Claude 3.7 Sonnet\", datetime(2025, 2, 24), 59 / 60, 15 / 60),\n",
" (\"o3\", datetime(2025, 4, 16), 1 + 45 / 60, 20 / 60),\n",
" (\"Claude 4 Sonnet\", datetime(2025, 5, 22), 1 + 7 / 60, 16 / 60),\n",
" (\"Claude 4 Opus\", datetime(2025, 5, 22), 1 + 19 / 60, 20 / 60),\n",
" (\"Gemini 2.5 Pro\", datetime(2025, 6, 5), 39 / 60, 9 / 60),\n",
" (\"Grok 4\", datetime(2025, 7, 9), 1 + 50 / 60, 15 / 60),\n",
" (\"Claude 4.1 Opus\", datetime(2025, 8, 5), 1 + 45 / 60, 21 / 60),\n",
" (\"GPT5\", datetime(2025, 8, 7), 2 + 17 / 60, 25 / 60),\n",
"# Construct observed_models from model_data\n",
"# Format: (model_name, release_date, task_length_50%, task_length_80%) in hours\n",
"observed_models = [\n",
" (model['name'], model['launch_date'], model['performance_50p'], model['performance_80p'])\n",
" for model in model_data.values()\n",
" if model['performance_50p'] is not None # Exclude models without data\n",
"]\n",
"\n",
"print(\"=== 50% Reliability ===\")\n",
Expand Down Expand Up @@ -118,15 +103,15 @@
"output_type": "stream",
"text": [
"=== Bootstrap Analysis ===\n",
"Current date: 2025-09-09\n",
"Current date: 2025-10-07\n",
"\n",
"50% Reliability:\n",
"Full dataset: (294, 0.903) (95% CI: {'doubling_time': (110, 333), 'acceleration': (0.9, 1.0)})\n",
"2024+ models: (127, 0.942) (95% CI: {'doubling_time': (110, 171), 'acceleration': (0.9, 1.0)})\n",
"Full dataset: (297, 0.901) (95% CI: {'doubling_time': (110, 333), 'acceleration': (0.9, 1.0)})\n",
"2024+ models: (125, 0.956) (95% CI: {'doubling_time': (110, 172), 'acceleration': (0.9, 1.0)})\n",
"\n",
"80% Reliability:\n",
"Full dataset: (233, 0.941) (95% CI: {'doubling_time': (119, 306), 'acceleration': (0.9, 1.0)})\n",
"2024+ models: (126, 0.927) (95% CI: {'doubling_time': (104, 159), 'acceleration': (0.9, 1.0)})\n",
"Full dataset: (231, 0.947) (95% CI: {'doubling_time': (121, 306), 'acceleration': (0.9, 1.0)})\n",
"2024+ models: (127, 0.922) (95% CI: {'doubling_time': (105, 157), 'acceleration': (0.9, 1.0)})\n",
"\n",
"=== Parameter stability by time window ===\n",
" doubling_time acceleration \n",
Expand Down Expand Up @@ -200,11 +185,11 @@
],
"source": [
"test_acceleration(\n",
" start_task_length=2 / 60 / 60, # GPT2\n",
" start_task_length=model_data['gpt2']['performance_50p'],\n",
" agi_task_length=167,\n",
" initial_doubling_time=260,\n",
" acceleration=0.95,\n",
" start_date=\"2019-02-14\", # GPT2\n",
" start_date=model_data['gpt2']['launch_date'].strftime('%Y-%m-%d'),\n",
")"
]
}
Expand Down
33 changes: 0 additions & 33 deletions Untitled.ipynb

This file was deleted.

Loading