diff --git a/devops/scripts/benchmarks/CONTRIB.md b/devops/scripts/benchmarks/CONTRIB.md index f52db986d576d..574ca4a883946 100644 --- a/devops/scripts/benchmarks/CONTRIB.md +++ b/devops/scripts/benchmarks/CONTRIB.md @@ -42,11 +42,9 @@ The suite is structured around three main components: Suites, Benchmarks, and Re * **Fields (set by Benchmark):** * `label`: Unique identifier for this *specific result type* within the benchmark instance (e.g., "Submit In Order Time"). Ideally contains `benchmark.name()`. * `value`: The measured numerical result (float). - * `unit`: The unit of the value (string, e.g., "μs", "GB/s", "token/s"). * `command`: The command list used to run the benchmark (`list[str]`). * `env`: Environment variables used (`dict[str, str]`). - * `stdout`: Full standard output of the benchmark run (string). - * `passed`: Boolean indicating if verification passed (default: `True`). + * `unit`: The unit of the value (string, e.g., "μs", "GB/s", "token/s"). * `stddev`: Standard deviation, if calculated by the benchmark itself (float, default: 0.0). * `git_url`, `git_hash`: Git info for the benchmark's source code (string). * **Fields (set by Framework):** diff --git a/devops/scripts/benchmarks/history.py b/devops/scripts/benchmarks/history.py index 468020b2d45d4..b5df83a61d1a5 100644 --- a/devops/scripts/benchmarks/history.py +++ b/devops/scripts/benchmarks/history.py @@ -31,7 +31,12 @@ def load_result(self, file_path: Path) -> BenchmarkRun: else: return None - def load(self, n: int): + def load(self): + """ + Load benchmark runs from the results directory. + This method loads files after applying the specified archiving criteria, + sorts them by timestamp, and stores the results in self.runs. 
+ """ results_dir = Path(self.dir) / "results" if not results_dir.exists() or not results_dir.is_dir(): log.warning( @@ -42,7 +47,7 @@ def load(self, n: int): # Get all JSON files in the results directory benchmark_files = list(results_dir.glob("*.json")) - # Extract timestamp and sort files by it + # Extract timestamp def extract_timestamp(file_path: Path) -> str: try: # Assumes results are stored as _YYYYMMDD_HHMMSS.json @@ -51,11 +56,45 @@ def extract_timestamp(file_path: Path) -> str: except IndexError: return "" + baseline_drop_after = options.archive_baseline_days * 3 + pr_drop_after = options.archive_pr_days * 3 + baseline_cutoff_date = datetime.now(timezone.utc) - timedelta( + days=baseline_drop_after + ) + log.debug(f"Baseline cutoff date: {baseline_cutoff_date}") + pr_cutoff_date = datetime.now(timezone.utc) - timedelta(days=pr_drop_after) + log.debug(f"PR cutoff date: {pr_cutoff_date}") + + # Filter out files older than three times the specified archiving days + def is_file_too_old(file_path: Path) -> bool: + try: + if file_path.stem.startswith("Baseline_"): + cutoff_date = baseline_cutoff_date + else: + cutoff_date = pr_cutoff_date + + timestamp_str = extract_timestamp(file_path) + if not timestamp_str: + return False + + file_timestamp = datetime.strptime(timestamp_str, "%Y%m%d_%H%M%S") + # Add timezone info for proper comparison + file_timestamp = file_timestamp.replace(tzinfo=timezone.utc) + return file_timestamp < cutoff_date + except Exception as e: + log.warning(f"Error processing timestamp for {file_path.name}: {e}") + return False + + benchmark_files = [ + file for file in benchmark_files if not is_file_too_old(file) + ] + + # Sort files by timestamp benchmark_files.sort(key=extract_timestamp, reverse=True) - # Load the first n benchmark files + # Load benchmark files benchmark_runs = [] - for file_path in benchmark_files[:n]: + for file_path in benchmark_files: benchmark_run = self.load_result(file_path) if benchmark_run: 
benchmark_runs.append(benchmark_run) diff --git a/devops/scripts/benchmarks/main.py b/devops/scripts/benchmarks/main.py index fdd8ff772b0c4..93629a9af8f0e 100755 --- a/devops/scripts/benchmarks/main.py +++ b/devops/scripts/benchmarks/main.py @@ -293,7 +293,7 @@ def main(directory, additional_env_vars, save_name, compare_names, filter): # limit how many files we load. # should this be configurable? log.info(f"Loading benchmark history from {results_dir}...") - history.load(1000) + history.load() log.info(f"Loaded {len(history.runs)} benchmark runs.") if compare_names: diff --git a/devops/scripts/benchmarks/options.py b/devops/scripts/benchmarks/options.py index 9c47efdf342c5..c8ea2488cddf8 100644 --- a/devops/scripts/benchmarks/options.py +++ b/devops/scripts/benchmarks/options.py @@ -90,7 +90,9 @@ class Options: git_commit_override: str = None # Archiving settings # Archived runs are stored separately from the main dataset but are still accessible - # via the HTML UI when "Include archived runs" is enabled + # via the HTML UI when "Include archived runs" is enabled. + # Archived runs older than 3 times the specified days are not included in the dashboard, + # i.e. when archiving data older than 7 days, runs older than 21 days are not included. archive_baseline_days: int = 30 # Archive Baseline_* runs after 30 days archive_pr_days: int = 7 # Archive other (PR/dev) runs after 7 days