From 4f9b134cef21cd0f3de7f0ee15bc1b547391fcb8 Mon Sep 17 00:00:00 2001 From: Kilian Lieret Date: Wed, 8 Oct 2025 16:35:22 -0400 Subject: [PATCH 01/15] Comparison plots part 1 --- css/components.css | 43 +++++++++ js/analysis.js | 141 ++++++++++++++++++++++++++++++ js/mainResults.js | 16 +++- templates/_leaderboard_table.html | 33 +++++++ templates/pages/bash-only.html | 3 +- 5 files changed, 232 insertions(+), 4 deletions(-) create mode 100644 js/analysis.js diff --git a/css/components.css b/css/components.css index da54c52..c66a8a9 100644 --- a/css/components.css +++ b/css/components.css @@ -367,6 +367,40 @@ button:focus, white-space: nowrap; } +/* Selection checkbox column (fixed slim width) */ +.data-table.has-select-col th.select-col, +.data-table.has-select-col td.select-col { + width: 36px; + min-width: 36px; + max-width: 36px; + text-align: center; + white-space: nowrap; +} + +/* Prevent first (model) column from expanding when selection column present */ +.data-table.has-select-col th.sortable[data-sort="name"], +.data-table.has-select-col td:first-of-type + td { + width: auto; + white-space: normal; +} + +/* Ensure model column remains flexible when select column exists */ +.data-table.has-select-col th:nth-child(2), +.data-table.has-select-col td:nth-child(2) { + width: 40%; + min-width: 180px; + max-width: 350px; + white-space: normal; + word-wrap: break-word; + text-align: left; /* override default % resolved right alignment */ +} + +/* Keep % Resolved right-aligned (now 3rd column when select column exists) */ +.data-table.has-select-col th:nth-child(3), +.data-table.has-select-col td:nth-child(3) { + text-align: right; +} + /* Cards */ .card { background-color: var(--color-background); @@ -616,6 +650,15 @@ button:focus, } } +/* Modal basic styles */ +.modal { display: none; position: fixed; inset: 0; z-index: var(--z-modal); } +.modal.show { display: block; } +.modal-backdrop { position: absolute; inset: 0; background: rgba(0,0,0,0.45); } +.modal-dialog { position: relative; background: var(--color-background); color: var(--color-text); width: min(720px, calc(100vw - 2rem)); margin: 5vh auto; border-radius: var(--radius-lg); box-shadow: var(--shadow-xl); border: 1.5px solid var(--color-border); resize: both; overflow: auto; min-width: 400px; min-height: 300px; max-width: 90vw; max-height: 90vh; } +.modal-header { display: flex; align-items: center; justify-content: space-between; padding: 0.75rem 1rem; border-bottom: 1.5px solid var(--color-border); } +.modal-body { padding: 1rem; overflow: auto; } +.modal-close { background: transparent; border: none; cursor: pointer; color: var(--color-text-secondary); } + @media (max-width: 992px) { /* On mobile and tablets */ .table-responsive { diff --git a/js/analysis.js b/js/analysis.js new file mode 100644 index 0000000..957c9b4 --- /dev/null +++ b/js/analysis.js @@ -0,0 +1,141 @@ +// Analysis and comparison features for leaderboard +(function() { + let compareChart = null; + + function getSelectedModels() { + const container = document.getElementById('leaderboard-container'); + const active = container ? container.querySelector('.tabcontent.active') : null; + if (!active) return []; + const checkboxes = active.querySelectorAll('input.row-select:checked'); + return Array.from(checkboxes).map(cb => ({ + name: cb.getAttribute('data-model'), + resolved: parseFloat(cb.getAttribute('data-resolved')) || 0 + })); + } + + function openModal() { + const selected = getSelectedModels(); + if (!selected.length) { + alert('Please select at least one model using the checkboxes before comparing results.'); + return; + } + const modal = document.getElementById('compare-modal'); + if (!modal) return; + modal.classList.add('show'); + modal.setAttribute('aria-hidden', 'false'); + renderChart(); + } + + function closeModal() { + const modal = document.getElementById('compare-modal'); + if (!modal) return; + modal.classList.remove('show'); + modal.setAttribute('aria-hidden', 'true'); + } + + function renderChart() { + const selected = getSelectedModels(); + const empty = document.getElementById('compare-empty'); + const canvas = document.getElementById('compare-chart'); + if (!canvas) return; + + if (compareChart) { + compareChart.destroy(); + compareChart = null; + } + + if (!selected.length) { + if (empty) empty.style.display = ''; + return; + } + if (empty) empty.style.display = 'none'; + + const ctx = canvas.getContext('2d'); + const labels = selected.map(s => s.name); + const values = selected.map(s => s.resolved); + + compareChart = new Chart(ctx, { + type: 'bar', + data: { + labels, + datasets: [{ + label: '% Resolved', + data: values, + backgroundColor: 'rgba(37, 99, 235, 0.6)', + borderColor: 'rgba(37, 99, 235, 1)', + borderWidth: 1, + }] + }, + options: { + responsive: true, + maintainAspectRatio: false, + scales: { + y: { + beginAtZero: true, + title: { display: true, text: '% Resolved' }, + ticks: { callback: (v) => v + '%' } + }, + x: { + title: { display: true, text: 'Model' } + } + }, + plugins: { + legend: { display: false }, + tooltip: { + callbacks: { + label: (ctx) => `${ctx.parsed.y.toFixed(2)}%` + } + } + } + } + }); + } + + function initEvents() { + // Open via delegated event to handle dynamic rendering + document.addEventListener('click', (e) => { + const trigger = e.target && typeof e.target.closest === 'function' ? e.target.closest('#compare-btn') : null; + if (trigger) { + e.preventDefault(); + e.stopPropagation(); + openModal(); + } + }); + + // Close via backdrop or close button/icon + const modal = document.getElementById('compare-modal'); + if (modal) { + modal.addEventListener('click', (e) => { + const closeEl = e.target && typeof e.target.closest === 'function' ? e.target.closest('[data-close="true"]') : null; + if (closeEl) { + e.preventDefault(); + closeModal(); + } + }); + } + + const chartType = document.getElementById('compare-chart-type'); + if (chartType) { + chartType.addEventListener('change', () => { + // Future chart types can be handled here; for now just re-render + renderChart(); + }); + } + + document.addEventListener('change', (e) => { + if (e.target && e.target.classList.contains('row-select')) { + if (document.getElementById('compare-modal')?.classList.contains('show')) { + renderChart(); + } + } + }); + } + + if (document.readyState === 'loading') { + document.addEventListener('DOMContentLoaded', initEvents); + } else { + initEvents(); + } +})(); + + diff --git a/js/mainResults.js b/js/mainResults.js index 53c5583..4d30cbd 100644 --- a/js/mainResults.js +++ b/js/mainResults.js @@ -95,9 +95,10 @@ function renderLeaderboardTable(leaderboard) { const tableHtml = `
- +
+ ${isBashOnly ? '' : ''} ${isBashOnly ? '' : ''} @@ -115,6 +116,7 @@ function renderLeaderboardTable(leaderboard) { data-checked="${item.checked ? 'true' : 'false'}" data-tags="${item.tags ? item.tags.join(',') : ''}" > + ${isBashOnly ? `` : ''} `).join('')} - @@ -362,6 +364,16 @@ function openLeaderboard(leaderboardName) { if (typeof updateTable === 'function') { setTimeout(updateTable, 0); } + + // Show/hide compare button based on leaderboard type + const compareBtn = document.getElementById('compare-btn'); + if (compareBtn) { + if (leaderboardName.toLowerCase() === 'bash-only') { + compareBtn.style.display = ''; + } else { + compareBtn.style.display = 'none'; + } + } } document.addEventListener('DOMContentLoaded', function() { diff --git a/templates/_leaderboard_table.html b/templates/_leaderboard_table.html index eaa1a1a..0f1c156 100644 --- a/templates/_leaderboard_table.html +++ b/templates/_leaderboard_table.html @@ -7,6 +7,10 @@
+
Filters:
@@ -55,6 +59,35 @@ {{ leaderboard_tags | tojson }} + + + + + + +

SWE-bench Bash Only uses the SWE-bench Verified dataset with the mini-SWE-agent environment for all models [Post].
diff --git a/templates/pages/bash-only.html b/templates/pages/bash-only.html index 8cae316..a74b5e8 100644 --- a/templates/pages/bash-only.html +++ b/templates/pages/bash-only.html @@ -95,5 +95,4 @@

Citation

{% endblock %} {% block scripts_extra %} - -{% endblock %} \ No newline at end of file +{% endblock %} \ No newline at end of file From 3cc4d5e7f1558e3c3d47bd6fe79b126ec2d704ea Mon Sep 17 00:00:00 2001 From: Kilian Lieret Date: Wed, 8 Oct 2025 16:46:45 -0400 Subject: [PATCH 02/15] Quick select for plots; light mode --- css/components.css | 7 +- js/analysis.js | 197 ++++++++++++++++++++++++++++-- js/mainResults.js | 4 +- templates/_leaderboard_table.html | 39 ++++-- 4 files changed, 229 insertions(+), 18 deletions(-) diff --git a/css/components.css b/css/components.css index c66a8a9..f97d8ee 100644 --- a/css/components.css +++ b/css/components.css @@ -654,10 +654,13 @@ button:focus, .modal { display: none; position: fixed; inset: 0; z-index: var(--z-modal); } .modal.show { display: block; } .modal-backdrop { position: absolute; inset: 0; background: rgba(0,0,0,0.45); } -.modal-dialog { position: relative; background: var(--color-background); color: var(--color-text); width: min(720px, calc(100vw - 2rem)); margin: 5vh auto; border-radius: var(--radius-lg); box-shadow: var(--shadow-xl); border: 1.5px solid var(--color-border); resize: both; overflow: auto; min-width: 400px; min-height: 300px; max-width: 90vw; max-height: 90vh; } +.modal-dialog { position: relative; background: var(--color-background); color: var(--color-text); width: min(720px, calc(100vw - 2rem)); margin: 5vh auto; border-radius: var(--radius-lg); box-shadow: var(--shadow-xl); border: 1.5px solid var(--color-border); resize: both; overflow: auto; min-width: 400px; min-height: 300px; max-width: 90vw; max-height: 90vh; display: flex; flex-direction: column; } +.modal-dialog-small { width: min(480px, calc(100vw - 2rem)); min-width: 320px; min-height: auto; resize: none; } .modal-header { display: flex; align-items: center; justify-content: space-between; padding: 0.75rem 1rem; border-bottom: 1.5px solid var(--color-border); } -.modal-body { padding: 1rem; overflow: auto; } +.modal-body { padding: 1rem; overflow: auto; flex: 1; display: flex; flex-direction: column; } .modal-close { background: transparent; border: none; cursor: pointer; color: var(--color-text-secondary); } +.chart-container { flex: 1; display: flex; flex-direction: column; min-height: 0; position: relative; } +.chart-container canvas { flex: 1; min-height: 260px; } @media (max-width: 992px) { /* On mobile and tablets */ diff --git a/js/analysis.js b/js/analysis.js index 957c9b4..69194d7 100644 --- a/js/analysis.js +++ b/js/analysis.js @@ -1,6 +1,8 @@ // Analysis and comparison features for leaderboard (function() { let compareChart = null; + let resizeObserver = null; + let chartTheme = 'dark'; // 'light' or 'dark' function getSelectedModels() { const container = document.getElementById('leaderboard-container'); @@ -13,10 +15,30 @@ })); } + function getThemeColors(theme) { + if (theme === 'light') { + return { + background: '#ffffff', + gridColor: 'rgba(0, 0, 0, 0.1)', + textColor: '#333333', + barBackground: 'rgba(37, 99, 235, 0.6)', + barBorder: 'rgba(37, 99, 235, 1)' + }; + } else { + return { + background: 'transparent', + gridColor: 'rgba(255, 255, 255, 0.1)', + textColor: '#ffffff', + barBackground: 'rgba(37, 99, 235, 0.6)', + barBorder: 'rgba(37, 99, 235, 1)' + }; + } + } + function openModal() { const selected = getSelectedModels(); if (!selected.length) { - alert('Please select at least one model using the checkboxes before comparing results.'); + openNoSelectionModal(); return; } const modal = document.getElementById('compare-modal'); @@ -24,6 +46,47 @@ modal.classList.add('show'); modal.setAttribute('aria-hidden', 'false'); renderChart(); + setupResizeObserver(); + } + + function openNoSelectionModal() { + const modal = document.getElementById('no-selection-modal'); + if (!modal) return; + modal.classList.add('show'); + modal.setAttribute('aria-hidden', 'false'); + } + + function closeNoSelectionModal() { + const modal = document.getElementById('no-selection-modal'); + if (!modal) return; + modal.classList.remove('show'); + modal.setAttribute('aria-hidden', 'true'); + } + + function selectTopN(n) { + const container = document.getElementById('leaderboard-container'); + const active = container ? container.querySelector('.tabcontent.active') : null; + if (!active) return; + + // First uncheck all + const allCheckboxes = active.querySelectorAll('input.row-select'); + allCheckboxes.forEach(cb => cb.checked = false); + + // Get visible rows (not filtered out) + const visibleRows = Array.from(active.querySelectorAll('tbody tr:not(.no-results)')) + .filter(row => row.style.display !== 'none'); + + // Select top N visible rows + const rowsToSelect = visibleRows.slice(0, n); + rowsToSelect.forEach(row => { + const checkbox = row.querySelector('input.row-select'); + if (checkbox) { + checkbox.checked = true; + } + }); + + closeNoSelectionModal(); + openModal(); } function closeModal() { @@ -31,6 +94,31 @@ if (!modal) return; modal.classList.remove('show'); modal.setAttribute('aria-hidden', 'true'); + teardownResizeObserver(); + } + + function setupResizeObserver() { + const modalDialog = document.querySelector('#compare-modal .modal-dialog'); + if (!modalDialog) return; + + if (resizeObserver) { + resizeObserver.disconnect(); + } + + resizeObserver = new ResizeObserver(() => { + if (compareChart) { + compareChart.resize(); + } + }); + + resizeObserver.observe(modalDialog); + } + + function teardownResizeObserver() { + if (resizeObserver) { + resizeObserver.disconnect(); + resizeObserver = null; + } } function renderChart() { @@ -53,6 +141,26 @@ const ctx = canvas.getContext('2d'); const labels = selected.map(s => s.name); const values = selected.map(s => s.resolved); + const colors = getThemeColors(chartTheme); + + // Set canvas background via container + const chartContainer = canvas.closest('.chart-container'); + if (chartContainer) { + chartContainer.style.backgroundColor = colors.background; + } + + // Plugin to draw background on the chart + const backgroundPlugin = { + id: 'customCanvasBackgroundColor', + beforeDraw: (chart, args, options) => { + const {ctx, chartArea} = chart; + if (!chartArea) return; + ctx.save(); + ctx.fillStyle = colors.background; + ctx.fillRect(0, 0, chart.width, chart.height); + ctx.restore(); + } + }; compareChart = new Chart(ctx, { type: 'bar', @@ -61,8 +169,8 @@ datasets: [{ label: '% Resolved', data: values, - backgroundColor: 'rgba(37, 99, 235, 0.6)', - borderColor: 'rgba(37, 99, 235, 1)', + backgroundColor: colors.barBackground, + borderColor: colors.barBorder, borderWidth: 1, }] }, @@ -72,11 +180,31 @@ scales: { y: { beginAtZero: true, - title: { display: true, text: '% Resolved' }, - ticks: { callback: (v) => v + '%' } + title: { + display: true, + text: '% Resolved', + color: colors.textColor + }, + ticks: { + callback: (v) => v + '%', + color: colors.textColor + }, + grid: { + color: colors.gridColor + } }, x: { - title: { display: true, text: 'Model' } + title: { + display: true, + text: 'Model', + color: colors.textColor + }, + ticks: { + color: colors.textColor + }, + grid: { + color: colors.gridColor + } } }, plugins: { @@ -87,10 +215,29 @@ } } } - } + }, + plugins: [backgroundPlugin] }); } + function toggleChartTheme() { + chartTheme = chartTheme === 'light' ? 'dark' : 'light'; + updateThemeButton(); + renderChart(); + } + + function updateThemeButton() { + const btn = document.getElementById('chart-theme-toggle'); + if (!btn) return; + if (chartTheme === 'light') { + btn.innerHTML = ' Dark mode'; + btn.title = 'Switch to dark mode'; + } else { + btn.innerHTML = ' Light mode'; + btn.title = 'Switch to light mode'; + } + } + function initEvents() { // Open via delegated event to handle dynamic rendering document.addEventListener('click', (e) => { @@ -122,6 +269,42 @@ }); } + const themeToggle = document.getElementById('chart-theme-toggle'); + if (themeToggle) { + themeToggle.addEventListener('click', (e) => { + e.preventDefault(); + toggleChartTheme(); + }); + } + + // No selection modal close handlers + const noSelectionModal = document.getElementById('no-selection-modal'); + if (noSelectionModal) { + noSelectionModal.addEventListener('click', (e) => { + const closeEl = e.target && typeof e.target.closest === 'function' ? e.target.closest('[data-close="true"]') : null; + if (closeEl) { + e.preventDefault(); + closeNoSelectionModal(); + } + }); + } + + // Quick select buttons + const selectTop5 = document.getElementById('select-top-5'); + if (selectTop5) { + selectTop5.addEventListener('click', () => selectTopN(5)); + } + + const selectTop10 = document.getElementById('select-top-10'); + if (selectTop10) { + selectTop10.addEventListener('click', () => selectTopN(10)); + } + + const selectTop20 = document.getElementById('select-top-20'); + if (selectTop20) { + selectTop20.addEventListener('click', () => selectTopN(20)); + } + document.addEventListener('change', (e) => { if (e.target && e.target.classList.contains('row-select')) { if (document.getElementById('compare-modal')?.classList.contains('show')) { diff --git a/js/mainResults.js b/js/mainResults.js index 4d30cbd..0930e86 100644 --- a/js/mainResults.js +++ b/js/mainResults.js @@ -120,9 +120,9 @@ function renderLeaderboardTable(leaderboard) {
Model % ResolvedAvg. $
@@ -142,7 +144,7 @@ function renderLeaderboardTable(leaderboard) {
- ${item.date >= "2025-06-25" ? '🆕' : ''} + ${!isBashOnly && item.date >= "2025-06-25" ? '🆕' : ''} ${item.oss ? '🤠' : ''} - ${item.checked ? '✅' : ''} + ${!isBashOnly && item.checked ? '✅' : ''}
${item.name}
diff --git a/templates/_leaderboard_table.html b/templates/_leaderboard_table.html index 0f1c156..1525c68 100644 --- a/templates/_leaderboard_table.html +++ b/templates/_leaderboard_table.html @@ -72,14 +72,19 @@

Compare results