From 2627c5baaf0315c706e3a1add97369f03968feff Mon Sep 17 00:00:00 2001 From: Paul Gauthier <paul@aider.chat> Date: Mon, 16 Dec 2024 12:57:56 -0800 Subject: [PATCH] refac leaderboard --- MANIFEST.in | 2 +- .../docs/leaderboards/by-release-date.md | 10 ++++ aider/website/docs/leaderboards/index.md | 57 +------------------ aider/website/docs/leaderboards/refactor.md | 50 ++++++++++++++++ 4 files changed, 64 insertions(+), 55 deletions(-) create mode 100644 aider/website/docs/leaderboards/by-release-date.md create mode 100644 aider/website/docs/leaderboards/refactor.md diff --git a/MANIFEST.in b/MANIFEST.in index ba9b75c255b..d0d94cad790 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -7,7 +7,7 @@ exclude aider/website/HISTORY.md exclude aider/website/docs/benchmarks*.md exclude aider/website/docs/ctags.md exclude aider/website/docs/unified-diffs.md -exclude aider/website/docs/leaderboards/index.md +recursive-exclude aider/website/docs/leaderboards * recursive-exclude aider/website/assets * recursive-exclude aider/website *.js recursive-exclude aider/website *.html diff --git a/aider/website/docs/leaderboards/by-release-date.md b/aider/website/docs/leaderboards/by-release-date.md new file mode 100644 index 00000000000..78cac1ae66f --- /dev/null +++ b/aider/website/docs/leaderboards/by-release-date.md @@ -0,0 +1,10 @@ +--- +title: Scores by release date +parent: Aider LLM Leaderboards +nav_order: 200 +--- + +## LLM code editing skill by model release date + +[![LLM code editing skill by model release date](/assets/models-over-time.svg)](https://aider.chat/assets/models-over-time.svg) + diff --git a/aider/website/docs/leaderboards/index.md b/aider/website/docs/leaderboards/index.md index 39cf38fe380..1679c516b6c 100644 --- a/aider/website/docs/leaderboards/index.md +++ b/aider/website/docs/leaderboards/index.md @@ -2,27 +2,22 @@ highlight_image: /assets/leaderboard.jpg nav_order: 950 description: Quantitative benchmarks of LLM code editing skill. 
+has_children: true --- # Aider LLM Leaderboards -{: .no_toc } Aider works best with LLMs which are good at *editing* code, not just good at writing code. -To evaluate an LLM's editing skill, aider uses a pair of benchmarks that +To evaluate an LLM's editing skill, aider uses benchmarks that assess a model's ability to consistently follow the system prompt to successfully edit code. -The leaderboards below report the results from a number of popular LLMs. +The leaderboards report the results from a number of popular LLMs. While [aider can connect to almost any LLM](/docs/llms.html), it works best with models that score well on the benchmarks. -See the following sections for benchmark -results and additional information: -- TOC -{:toc} - ## Code editing leaderboard [Aider's code editing benchmark](/docs/benchmarks.html#the-benchmark) asks the LLM to edit python source files to complete 133 small coding exercises @@ -79,52 +74,6 @@ The model also has to successfully apply all its changes to the source file with } </style> -## Code refactoring leaderboard - -[Aider's refactoring benchmark](https://github.com/Aider-AI/refactor-benchmark) asks the LLM to refactor 89 large methods from large python classes. This is a more challenging benchmark, which tests the model's ability to output long chunks of code without skipping sections or making mistakes. It was developed to provoke and measure [GPT-4 Turbo's "lazy coding" habit](/2023/12/21/unified-diffs.html). - -The refactoring benchmark requires a large context window to -work with large source files. -Therefore, results are available for fewer models. - -<input type="text" id="refacSearchInput" placeholder="Search..." 
style="width: 100%; max-width: 800px; margin: 10px auto; padding: 8px; display: block; border: 1px solid #ddd; border-radius: 4px;"> - -<table style="width: 100%; max-width: 800px; margin: auto; border-collapse: collapse; box-shadow: 0 2px 4px rgba(0,0,0,0.1); font-size: 14px;"> - <thead style="background-color: #f2f2f2;"> - <tr> - <th style="padding: 8px; text-align: left;">Model</th> - <th style="padding: 8px; text-align: center;">Percent completed correctly</th> - <th style="padding: 8px; text-align: center;">Percent using correct edit format</th> - <th style="padding: 8px; text-align: left;">Command</th> - <th style="padding: 8px; text-align: center;">Edit format</th> - </tr> - </thead> - <tbody> - {% assign refac_sorted = site.data.refactor_leaderboard | sort: 'pass_rate_1' | reverse %} - {% for row in refac_sorted %} - <tr style="border-bottom: 1px solid #ddd;"> - <td style="padding: 8px;">{{ row.model }}</td> - <td style="padding: 8px; text-align: center;">{{ row.pass_rate_1 }}%</td> - <td style="padding: 8px; text-align: center;">{{ row.percent_cases_well_formed }}%</td> - <td style="padding: 8px;"><code>{{ row.command }}</code></td> - <td style="padding: 8px; text-align: center;">{{ row.edit_format }}</td> - </tr> - {% endfor %} - </tbody> -</table> - -<canvas id="refacChart" width="800" height="450" style="margin-top: 20px"></canvas> -<script src="https://unpkg.com/patternomaly/dist/patternomaly.js"></script> -<script src="https://cdn.jsdelivr.net/npm/chart.js"></script> -<script> -{% include refactor-leaderboard.js %} -</script> - - -## LLM code editing skill by model release date - -[![connecting to many LLMs](/assets/models-over-time.svg)](https://aider.chat/assets/models-over-time.svg) - ## Notes on benchmarking results diff --git a/aider/website/docs/leaderboards/refactor.md b/aider/website/docs/leaderboards/refactor.md new file mode 100644 index 00000000000..21e0d0ed83d --- /dev/null +++ b/aider/website/docs/leaderboards/refactor.md @@ -0,0 +1,50 @@ 
+--- +parent: Aider LLM Leaderboards +highlight_image: /assets/leaderboard.jpg +nav_order: 100 +description: Quantitative benchmark of LLM code refactoring skill. +--- + + +## Aider refactoring leaderboard + +[Aider's refactoring benchmark](https://github.com/Aider-AI/refactor-benchmark) asks the LLM to refactor 89 large methods from large Python classes. This is a more challenging benchmark, which tests the model's ability to output long chunks of code without skipping sections or making mistakes. It was developed to provoke and measure [GPT-4 Turbo's "lazy coding" habit](/2023/12/21/unified-diffs.html). + +The refactoring benchmark requires a large context window to +work with large source files. +Therefore, results are available for fewer models. + +<input type="text" id="refacSearchInput" placeholder="Search..." style="width: 100%; max-width: 800px; margin: 10px auto; padding: 8px; display: block; border: 1px solid #ddd; border-radius: 4px;"> + +<table style="width: 100%; max-width: 800px; margin: auto; border-collapse: collapse; box-shadow: 0 2px 4px rgba(0,0,0,0.1); font-size: 14px;"> + <thead style="background-color: #f2f2f2;"> + <tr> + <th style="padding: 8px; text-align: left;">Model</th> + <th style="padding: 8px; text-align: center;">Percent completed correctly</th> + <th style="padding: 8px; text-align: center;">Percent using correct edit format</th> + <th style="padding: 8px; text-align: left;">Command</th> + <th style="padding: 8px; text-align: center;">Edit format</th> + </tr> + </thead> + <tbody> + {% assign refac_sorted = site.data.refactor_leaderboard | sort: 'pass_rate_1' | reverse %} + {% for row in refac_sorted %} + <tr style="border-bottom: 1px solid #ddd;"> + <td style="padding: 8px;">{{ row.model }}</td> + <td style="padding: 8px; text-align: center;">{{ row.pass_rate_1 }}%</td> + <td style="padding: 8px; text-align: center;">{{ row.percent_cases_well_formed }}%</td> + <td style="padding: 8px;"><code>{{ row.command }}</code></td> + <td 
style="padding: 8px; text-align: center;">{{ row.edit_format }}</td> + </tr> + {% endfor %} + </tbody> +</table> + +<canvas id="refacChart" width="800" height="450" style="margin-top: 20px"></canvas> +<script src="https://unpkg.com/patternomaly/dist/patternomaly.js"></script> +<script src="https://cdn.jsdelivr.net/npm/chart.js"></script> +<script> +{% include refactor-leaderboard.js %} +</script> + +