From adb59ced80bdad8c83645a1ed765a8ebcb6257cd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Patryk=20Kami=C5=84ski?= Date: Wed, 25 Jun 2025 20:05:11 +0000 Subject: [PATCH] [Benchmarks] Add benchmarks logger --- devops/scripts/benchmarks/README.md | 21 ++++ devops/scripts/benchmarks/benches/benchdnn.py | 6 +- devops/scripts/benchmarks/benches/gromacs.py | 11 +- devops/scripts/benchmarks/benches/umf.py | 15 ++- devops/scripts/benchmarks/compare.py | 62 ++++++----- devops/scripts/benchmarks/history.py | 17 +-- devops/scripts/benchmarks/main.py | 105 +++++++++++------- devops/scripts/benchmarks/options.py | 1 - devops/scripts/benchmarks/output_html.py | 5 +- devops/scripts/benchmarks/output_markdown.py | 16 ++- devops/scripts/benchmarks/presets.py | 8 +- .../benchmarks/utils/compute_runtime.py | 9 +- .../benchmarks/utils/detect_versions.py | 24 ++-- devops/scripts/benchmarks/utils/logger.py | 95 ++++++++++++++++ devops/scripts/benchmarks/utils/oneapi.py | 16 ++- devops/scripts/benchmarks/utils/utils.py | 41 +++---- 16 files changed, 310 insertions(+), 142 deletions(-) create mode 100644 devops/scripts/benchmarks/utils/logger.py diff --git a/devops/scripts/benchmarks/README.md b/devops/scripts/benchmarks/README.md index b750a5857c025..0e0d7d80a2142 100644 --- a/devops/scripts/benchmarks/README.md +++ b/devops/scripts/benchmarks/README.md @@ -45,6 +45,27 @@ are stored [here](https://oneapi-src.github.io/unified-runtime/performance/). ## Output formats You can display the results in the form of a HTML file by using `--ouptut-html` and a markdown file by using `--output-markdown`. Due to character limits for posting PR comments, the final content of the markdown file might be reduced. In order to obtain the full markdown output, use `--output-markdown full`. +## Logging + +The benchmark runner uses a configurable logging system with different log levels that can be set using the `--log-level` command-line option. 
+ +Available log levels: +- `debug` +- `info` (default) +- `warning` +- `error` +- `critical` + +To set the log level, use the `--log-level` option: +```bash +./main.py ~/benchmarks_workdir/ --sycl ~/llvm/build/ --log-level debug +``` + +You can also use the `--verbose` flag, which sets the log level to `debug` and overrides any `--log-level` setting: +```bash +./main.py ~/benchmarks_workdir/ --sycl ~/llvm/build/ --verbose +``` + ## Requirements ### Python diff --git a/devops/scripts/benchmarks/benches/benchdnn.py b/devops/scripts/benchmarks/benches/benchdnn.py index 61d1f0316244c..1bf593d3d19a0 100644 --- a/devops/scripts/benchmarks/benches/benchdnn.py +++ b/devops/scripts/benchmarks/benches/benchdnn.py @@ -3,12 +3,15 @@ # See LICENSE.TXT # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + from pathlib import Path + from .base import Suite, Benchmark from options import options from utils.utils import git_clone, run, create_build_path from utils.result import Result from utils.oneapi import get_oneapi +from utils.logger import log from .benchdnn_list import get_bench_dnn_list @@ -150,8 +153,7 @@ def run(self, env_vars): ) result_value = self._extract_time(output) - if options.verbose: - print(f"[{self.name()}] Output: {output}") + log.debug(f"[{self.name()}] Output: {output}") return [ Result( diff --git a/devops/scripts/benchmarks/benches/gromacs.py b/devops/scripts/benchmarks/benches/gromacs.py index 4d95c538df09c..3a035ace78b12 100644 --- a/devops/scripts/benchmarks/benches/gromacs.py +++ b/devops/scripts/benchmarks/benches/gromacs.py @@ -4,18 +4,18 @@ # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception import os -import subprocess from pathlib import Path +import re + from .base import Suite, Benchmark from options import options from utils.utils import git_clone, download, run, create_build_path from utils.result import Result from utils.oneapi import get_oneapi -import re +from utils.logger import log class GromacsBench(Suite): - def git_url(self): return "https://gitlab.com/gromacs/gromacs.git" @@ -209,8 +209,7 @@ def run(self, env_vars): time = self._extract_execution_time(mdrun_output) - if options.verbose: - print(f"[{self.name()}] Time: {time:.3f} seconds") + log.debug(f"[{self.name()}] Time: {time:.3f} seconds") return [ Result( @@ -259,7 +258,7 @@ def _validate_correctness(self, log_file): drift_value = float(match.group(1)) return abs(drift_value) <= threshold except ValueError: - print( + log.warning( f"Parsed drift value: {drift_value} exceeds threshold" ) return False diff --git a/devops/scripts/benchmarks/benches/umf.py b/devops/scripts/benchmarks/benches/umf.py index 6af745e7de3cb..d84ffa10f5079 100644 --- a/devops/scripts/benchmarks/benches/umf.py +++ b/devops/scripts/benchmarks/benches/umf.py @@ -3,18 +3,17 @@ # See LICENSE.TXT # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -import random -from utils.utils import git_clone -from .base import Benchmark, Suite -from utils.result import Result -from utils.utils import run, create_build_path -from options import options -from utils.oneapi import get_oneapi import os import csv import io import re +from .base import Benchmark, Suite +from utils.result import Result +from options import options +from utils.oneapi import get_oneapi +from utils.logger import log + def isUMFAvailable(): return options.umf is not None @@ -93,7 +92,7 @@ def extra_env_vars(self) -> dict: def setup(self): if not isUMFAvailable(): - print("UMF prefix path not provided") + log.warning("UMF prefix path not provided") return 
self.oneapi = get_oneapi() diff --git a/devops/scripts/benchmarks/compare.py b/devops/scripts/benchmarks/compare.py index 31b31f05f368d..9306e06b4351a 100644 --- a/devops/scripts/benchmarks/compare.py +++ b/devops/scripts/benchmarks/compare.py @@ -1,17 +1,21 @@ -from utils.aggregate import Aggregator, SimpleMedian -from utils.validate import Validate -from utils.result import Result, BenchmarkRun -from options import options +# Copyright (C) 2024-2025 Intel Corporation +# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. +# See LICENSE.TXT +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -import os import re -import sys import json import argparse from datetime import datetime, timezone from pathlib import Path from dataclasses import dataclass, asdict +from utils.aggregate import Aggregator, SimpleMedian +from utils.validate import Validate +from utils.result import BenchmarkRun +from utils.logger import log +from options import options + @dataclass class BenchmarkHistoricAverage: @@ -113,8 +117,8 @@ def validate_benchmark_result(result: BenchmarkRun) -> bool: if result.hostname != hostname: return False if result.name != result_name: - print( - f"Warning: Result file {result_path} does not match specified result name {result.name}." + log.warning( + f"Result file {result_path} does not match specified result name {result.name}." ) return False if result.date < datetime.strptime(cutoff, "%Y%m%d_%H%M%S").replace( @@ -256,24 +260,24 @@ def to_hist( """ if avg_type != "median": - print("Only median is currently supported: Refusing to continue.") + log.error("Only median is currently supported: Refusing to continue.") exit(1) try: with open(compare_file, "r") as compare_f: compare_result = BenchmarkRun.from_json(json.load(compare_f)) except: - print(f"Unable to open {compare_file}.") + log.error(f"Unable to open {compare_file}.") exit(1) # Sanity checks: if compare_result.hostname == "Unknown": - print( + log.error( "Hostname for results in {compare_file} unknown, unable to build a historic average: Refusing to continue." 
) exit(1) if not Validate.timestamp(cutoff): - print("Invalid timestamp provided, please follow YYYYMMDD_HHMMSS.") + log.error("Invalid timestamp provided, please follow YYYYMMDD_HHMMSS.") exit(1) # Build historic average and compare results against historic average: @@ -331,7 +335,7 @@ def to_hist( if args.operation == "to_hist": if args.avg_type != "median": - print("Only median is currently supported: exiting.") + log.error("Only median is currently supported: exiting.") exit(1) if not Validate.timestamp(args.cutoff): raise ValueError("Timestamp must be provided as YYYYMMDD_HHMMSS.") @@ -352,28 +356,34 @@ def to_hist( else: regressions_ignored.append(test) - def print_regression(entry: dict): - """Print an entry outputted from Compare.to_hist""" - print(f"Test: {entry['name']}") - print(f"-- Historic {entry['avg_type']}: {entry['hist_avg']}") - print(f"-- Run result: {entry['value']}") - print(f"-- Delta: {entry['delta']}") - print("") + def print_regression(entry: dict, is_warning: bool = False): + """Print an entry outputted from Compare.to_hist + + Args: + entry (dict): The entry to print + is_warning (bool): If True, use log.warning instead of log.info + """ + log_func = log.warning if is_warning else log.info + log_func(f"Test: {entry['name']}") + log_func(f"-- Historic {entry['avg_type']}: {entry['hist_avg']}") + log_func(f"-- Run result: {entry['value']}") + log_func(f"-- Delta: {entry['delta']}") + log_func("") if improvements: - print("#\n# Improvements:\n#\n") + log.info("#\n# Improvements:\n#\n") for test in improvements: print_regression(test) if regressions_ignored: - print("#\n# Regressions (filtered out by regression-filter):\n#\n") + log.info("#\n# Regressions (filtered out by regression-filter):\n#\n") for test in regressions_ignored: print_regression(test) if regressions_of_concern: - print("#\n# Regressions:\n#\n") + log.warning("#\n# Regressions:\n#\n") for test in regressions_of_concern: - print_regression(test) + print_regression(test, is_warning=True) exit(1) # Exit 1 to trigger github test failure - print("\nNo unexpected regressions found!") + log.info("\nNo unexpected regressions found!") else: - print("Unsupported operation: exiting.") + log.error("Unsupported operation: exiting.") exit(1) diff --git a/devops/scripts/benchmarks/history.py b/devops/scripts/benchmarks/history.py index 843ee8b63dff1..468020b2d45d4 100644 --- a/devops/scripts/benchmarks/history.py +++ b/devops/scripts/benchmarks/history.py @@ -7,12 +7,13 @@ import json from pathlib import Path import socket + from utils.result import Result, BenchmarkRun from options import Compare, options from datetime import datetime, timezone, timedelta from utils.utils import run from utils.validate import Validate - +from utils.logger import log from utils.detect_versions import DetectVersion @@ -33,8 +34,8 @@ def load_result(self, file_path: Path) -> BenchmarkRun: def load(self, n: int): results_dir = Path(self.dir) / "results" if not results_dir.exists() or not results_dir.is_dir(): - print( - f"Warning: {results_dir} is not a valid directory: no historic results loaded." + log.warning( + f"{results_dir} is not a valid directory: no historic results loaded." 
) return @@ -97,7 +98,7 @@ def git_info_from_path(path: Path) -> (str, str): if options.git_commit_override is None or options.github_repo_override is None: if options.detect_versions.sycl: - print(f"Auto-detecting sycl version...") + log.info(f"Auto-detecting sycl version...") github_repo, git_hash = DetectVersion.instance().get_dpcpp_git_info() else: git_hash, github_repo = git_info_from_path( @@ -129,12 +130,12 @@ def git_info_from_path(path: Path) -> (str, str): if options.build_compute_runtime: compute_runtime = options.compute_runtime_tag elif options.detect_versions.compute_runtime: - print(f"Auto-detecting compute_runtime version...") + log.info(f"Auto-detecting compute_runtime version...") detect_res = DetectVersion.instance() compute_runtime = detect_res.get_compute_runtime_ver() if detect_res.get_compute_runtime_ver_cached() is None: - print( - "Warning: Could not find compute_runtime version via github tags API." + log.warning( + "Could not find compute_runtime version via github tags API." ) else: compute_runtime = "unknown" @@ -169,7 +170,7 @@ def save(self, save_name, results: list[Result], to_file=True): file_path = Path(os.path.join(results_dir, f"{save_name}_{timestamp}.json")) with file_path.open("w") as file: json.dump(serialized, file, indent=4) - print(f"Benchmark results saved to {file_path}") + log.info(f"Benchmark results saved to {file_path}") def find_first(self, name: str) -> BenchmarkRun: for r in self.runs: diff --git a/devops/scripts/benchmarks/main.py b/devops/scripts/benchmarks/main.py index 7449148ba2efb..5669e7d5a4e63 100755 --- a/devops/scripts/benchmarks/main.py +++ b/devops/scripts/benchmarks/main.py @@ -5,6 +5,11 @@ # See LICENSE.TXT # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +import argparse +import re +import statistics +import os + from benches.compute import * from benches.gromacs import GromacsBench from benches.velocity import VelocityBench @@ -21,12 +26,9 @@ from utils.compute_runtime import * from utils.validate import Validate from utils.detect_versions import DetectVersion +from utils.logger import log from presets import enabled_suites, presets -import argparse -import re -import statistics -import os # Update this if you are changing the layout of the results files INTERNAL_WORKDIR_VERSION = "2.0" @@ -40,7 +42,7 @@ def run_iterations( failures: dict[str, str], ): for iter in range(iters): - print(f"running {benchmark.name()}, iteration {iter}... ", flush=True) + log.info(f"running {benchmark.name()}, iteration {iter}... ") bench_results = benchmark.run(env_vars) if bench_results is None: failures[benchmark.name()] = "benchmark produced no results!" @@ -49,10 +51,10 @@ def run_iterations( for bench_result in bench_results: if not bench_result.passed: failures[bench_result.label] = "verification failed" - print(f"complete ({bench_result.label}: verification failed).") + log.warning(f"complete ({bench_result.label}: verification failed).") continue - print( + log.info( f"{benchmark.name()} complete ({bench_result.label}: {bench_result.value:.3f} {bench_result.unit})." 
) @@ -126,7 +128,7 @@ def process_results( ) * mean_value if stddev > threshold: - print(f"stddev {stddev} above the threshold {threshold} for {label}") + log.warning(f"stddev {stddev} above the threshold {threshold} for {label}") valid_results = False rlist.sort(key=lambda res: res.value) @@ -158,10 +160,13 @@ def collect_metadata(suites): def main(directory, additional_env_vars, save_name, compare_names, filter): prepare_workdir(directory, INTERNAL_WORKDIR_VERSION) + if options.dry_run: + log.info("Dry run mode enabled. No benchmarks will be executed.") + if options.build_compute_runtime: - print(f"Setting up Compute Runtime {options.compute_runtime_tag}") + log.info(f"Setting up Compute Runtime {options.compute_runtime_tag}") cr = get_compute_runtime() - print("Compute Runtime setup complete.") + log.info("Compute Runtime setup complete.") options.extra_ld_libraries.extend(cr.ld_libraries()) options.extra_env_vars.update(cr.env_vars()) @@ -202,33 +207,37 @@ def main(directory, additional_env_vars, save_name, compare_names, filter): ] if suite_benchmarks: - print(f"Setting up {type(s).__name__}") + log.info(f"Setting up {type(s).__name__}") try: s.setup() except Exception as e: failures[s.name()] = f"Suite setup failure: {e}" - print(f"{type(s).__name__} setup failed. Benchmarks won't be added.") - print(f"failed: {e}") + log.error( + f"{type(s).__name__} setup failed. Benchmarks won't be added." + ) + log.error(f"failed: {e}") else: - print(f"{type(s).__name__} setup complete.") + log.info(f"{type(s).__name__} setup complete.") benchmarks += suite_benchmarks for benchmark in benchmarks: try: - if options.verbose: - print(f"Setting up {benchmark.name()}... ") + log.debug(f"Setting up {benchmark.name()}... ") benchmark.setup() - if options.verbose: - print(f"{benchmark.name()} setup complete.") + log.debug(f"{benchmark.name()} setup complete.") except Exception as e: if options.exit_on_failure: raise e else: failures[benchmark.name()] = f"Benchmark setup failure: {e}" - print(f"failed: {e}") + log.error(f"failed: {e}") results = [] + if benchmarks: + log.info(f"Running {len(benchmarks)} benchmarks...") + elif not options.dry_run: + log.warning("No benchmarks to run.") for benchmark in benchmarks: try: merged_env_vars = {**additional_env_vars} @@ -253,15 +262,13 @@ def main(directory, additional_env_vars, save_name, compare_names, filter): raise e else: failures[benchmark.name()] = f"Benchmark run failure: {e}" - print(f"failed: {e}") + log.error(f"failed: {e}") for benchmark in benchmarks: # this never has any useful information anyway, so hide it behind verbose - if options.verbose: - print(f"tearing down {benchmark.name()}... ", flush=True) + log.debug(f"tearing down {benchmark.name()}... ") benchmark.teardown() - if options.verbose: - print(f"{benchmark.name()} teardown complete.") + log.debug(f"{benchmark.name()} teardown complete.") this_name = options.current_run_name chart_data = {} @@ -275,19 +282,22 @@ def main(directory, additional_env_vars, save_name, compare_names, filter): history = BenchmarkHistory(results_dir) # limit how many files we load. # should this be configurable? + log.info(f"Loading benchmark history from {results_dir}...") history.load(1000) - - # remove duplicates. this can happen if e.g., --compare baseline is specified manually. 
- compare_names = ( - list(dict.fromkeys(compare_names)) if compare_names is not None else [] - ) - - for name in compare_names: - compare_result = history.get_compare(name) - if compare_result: - chart_data[name] = compare_result.results + log.info(f"Loaded {len(history.runs)} benchmark runs.") + + if compare_names: + log.info(f"Comparing against {len(compare_names)} previous runs...") + # remove duplicates. this can happen if e.g., --compare baseline is specified manually. + compare_names = list(dict.fromkeys(compare_names)) + for name in compare_names: + compare_result = history.get_compare(name) + if compare_result: + chart_data[name] = compare_result.results + log.info(f"Comparison complete.") if options.output_markdown: + log.info("Generating markdown with benchmark results...") markdown_content = generate_markdown( this_name, chart_data, failures, options.output_markdown, metadata ) @@ -299,7 +309,7 @@ def main(directory, additional_env_vars, save_name, compare_names, filter): with open(os.path.join(md_path, "benchmark_results.md"), "w") as file: file.write(markdown_content) - print( + log.info( f"Markdown with benchmark results has been written to {md_path}/benchmark_results.md" ) @@ -309,16 +319,21 @@ def main(directory, additional_env_vars, save_name, compare_names, filter): # we calculate historical averages or get latest results for compare. # Otherwise we might be comparing the results to themselves. if not options.dry_run: + log.info(f"Saving benchmark results...") history.save(saved_name, results, save_name is not None) if saved_name not in compare_names: compare_names.append(saved_name) + log.info(f"Benchmark results saved.") if options.output_html: html_path = options.output_directory if options.output_directory is None: - html_path = os.path.join(os.path.dirname(__file__), "html") - + html_path = os.path.normpath( + os.path.join(os.path.dirname(__file__), "html") + ) + log.info(f"Generating HTML with benchmark results in {html_path}...") generate_html(history, compare_names, html_path, metadata) + log.info(f"HTML with benchmark results has been generated") def validate_and_parse_env_args(env_args): @@ -376,6 +391,7 @@ def validate_and_parse_env_args(env_args): type=str, help="Compare results against previously saved data.", action="append", + default=[], ) parser.add_argument( "--iterations", @@ -402,7 +418,9 @@ def validate_and_parse_env_args(env_args): default=None, ) parser.add_argument( - "--verbose", help="Print output of all the commands.", action="store_true" + "--verbose", + help="Set logging level to DEBUG. Overrides --log-level.", + action="store_true", ) parser.add_argument( "--exit-on-failure", help="Exit on first failure.", action="store_true" @@ -576,12 +594,18 @@ def validate_and_parse_env_args(env_args): "'Include archived runs'. 
PR runs typically have a shorter retention period than baselines.", default=options.archive_pr_days, ) + parser.add_argument( + "--log-level", + type=str, + choices=["debug", "info", "warning", "error", "critical"], + help="Set the logging level", + default="info", + ) args = parser.parse_args() additional_env_vars = validate_and_parse_env_args(args.env) options.workdir = args.benchmark_directory - options.verbose = args.verbose options.rebuild = not args.no_rebuild options.redownload = args.redownload options.sycl = args.sycl @@ -606,6 +630,9 @@ def validate_and_parse_env_args(env_args): options.build_jobs = args.build_jobs options.hip_arch = args.hip_arch + # Initialize logger with command line arguments + log.initialize(args.verbose, args.log_level) + if args.build_igc and args.compute_runtime is None: parser.error("--build-igc requires --compute-runtime to be set") if args.compute_runtime is not None: diff --git a/devops/scripts/benchmarks/options.py b/devops/scripts/benchmarks/options.py index ff94274d6647e..9522d7b0b8783 100644 --- a/devops/scripts/benchmarks/options.py +++ b/devops/scripts/benchmarks/options.py @@ -52,7 +52,6 @@ class Options: benchmark_cwd: str = "INVALID" timeout: float = 600 iterations: int = 3 - verbose: bool = False compare: Compare = Compare.LATEST compare_max: int = 10 # average/median over how many results output_markdown: MarkdownSize = MarkdownSize.SHORT diff --git a/devops/scripts/benchmarks/output_html.py b/devops/scripts/benchmarks/output_html.py index d527bdd7576b7..3b507f639ac75 100644 --- a/devops/scripts/benchmarks/output_html.py +++ b/devops/scripts/benchmarks/output_html.py @@ -8,6 +8,7 @@ from options import options from utils.result import BenchmarkMetadata, BenchmarkOutput +from utils.logger import log from history import BenchmarkHistory from benches.base import benchmark_tags_dict @@ -42,13 +43,13 @@ def _write_output_to_file( f.write(";\n") if not archive: - print(f"See {os.getcwd()}/html/index.html for the results.") + log.info(f"See {html_path}/index.html for the results.") else: # For remote format, we write a single JSON file data_path = os.path.join(html_path, f"{filename}.json") with open(data_path, "w") as f: json.dump(json.loads(output.to_json()), f, indent=2) - print( + log.info( f"Upload {data_path} to a location set in config.js remoteDataUrl argument." 
) diff --git a/devops/scripts/benchmarks/output_markdown.py b/devops/scripts/benchmarks/output_markdown.py index 2a72e687c5a0d..bc881112d4c70 100644 --- a/devops/scripts/benchmarks/output_markdown.py +++ b/devops/scripts/benchmarks/output_markdown.py @@ -5,9 +5,10 @@ # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception import collections + from utils.result import Result, BenchmarkMetadata +from utils.logger import log from options import options, MarkdownSize -import ast class OutputLine: @@ -118,8 +119,8 @@ def get_explicit_group_name(result: Result, metadata: dict[str, BenchmarkMetadat try: explicit_group_name = metadata[result.label].explicit_group except Exception as e: - print( - f"Warning: Unexpected error when getting explicit_group for '{result.label}': {e}" + log.warning( + f"Unexpected error when getting explicit_group for '{result.label}': {e}" ) return "Other" @@ -226,8 +227,7 @@ def generate_summary_table( # Generate the row with all the results from saved runs specified by # --compare, # Highlight the best value in the row with data - if options.verbose: - print(f"Results: {results}") + log.debug(f"Results: {results}") for key in chart_data.keys(): if key in results: intv = results[key].value @@ -278,8 +278,7 @@ def generate_summary_table( else: regressed_rows.append(oln.row + " | \n") - if options.verbose: - print(oln.row) + log.debug(oln.row) summary_table += oln.row + "\n" else: @@ -323,8 +322,7 @@ def generate_summary_table( if not is_at_least_one_diff: summary_line = f"No diffs to calculate performance change" - if options.verbose: - print(summary_line) + log.debug(summary_line) summary_table = "\n## Performance change in benchmark groups\n" diff --git a/devops/scripts/benchmarks/presets.py b/devops/scripts/benchmarks/presets.py index 7388d6a3cb104..b1ee73227b351 100644 --- a/devops/scripts/benchmarks/presets.py +++ b/devops/scripts/benchmarks/presets.py @@ -5,6 +5,8 @@ import argparse +from utils.logger import log + presets: dict[str, list[str]] = { "Full": [ "BenchDNN", @@ -72,13 +74,13 @@ def main(): if args.command == "query": if args.preset_to_query in presets: if not args.quiet: - print(f"Benchmark suites to be ran in {args.preset_to_query}:") + log.info(f"Benchmark suites to be ran in {args.preset_to_query}:") for suite in presets[args.preset_to_query]: - print(suite) + log.info(suite) exit(0) else: if not args.quiet: - print(f"Error: No preset named '{args.preset_to_query}'.") + log.error(f"Error: No preset named '{args.preset_to_query}'.") exit(1) diff --git a/devops/scripts/benchmarks/utils/compute_runtime.py b/devops/scripts/benchmarks/utils/compute_runtime.py index 380f5127b73fe..24d78e86587dd 100644 --- a/devops/scripts/benchmarks/utils/compute_runtime.py +++ b/devops/scripts/benchmarks/utils/compute_runtime.py @@ -7,7 +7,6 @@ import re import yaml -from pathlib import Path from .utils import * from options import options @@ -51,6 +50,7 @@ def env_vars(self) -> dict: } def build_gmmlib(self, repo, commit): + log.info("Building GMMLib...") self.gmmlib_repo = git_clone(options.workdir, "gmmlib-repo", repo, commit) self.gmmlib_build = os.path.join(options.workdir, "gmmlib-build") self.gmmlib_install = os.path.join(options.workdir, "gmmlib-install") @@ -64,9 +64,11 @@ def build_gmmlib(self, repo, commit): run(configure_command) run(f"cmake --build {self.gmmlib_build} -j {options.build_jobs}") run(f"cmake --install {self.gmmlib_build}") + log.info("GMMLib build complete.") return self.gmmlib_install def build_level_zero(self, repo, commit): + 
log.info("Building Level Zero...") self.level_zero_repo = git_clone( options.workdir, "level-zero-repo", repo, commit ) @@ -89,9 +91,11 @@ def build_level_zero(self, repo, commit): run(configure_command) run(f"cmake --build {self.level_zero_build} -j {options.build_jobs}") run(f"cmake --install {self.level_zero_build}") + log.info("Level Zero build complete.") return self.level_zero_install def build_igc(self, repo, commit): + log.info("Building IGC...") self.igc_repo = git_clone(options.workdir, "igc", repo, commit) self.vc_intr = git_clone( options.workdir, @@ -151,6 +155,7 @@ def build_igc(self, repo, commit): ) # cmake --install doesn't work... run("make install", cwd=self.igc_build) + log.info("IGC build complete.") return self.igc_install def read_manifest(self, manifest_path): @@ -204,6 +209,7 @@ def build_compute_runtime(self): # Remove -Werror... replace_in_file(cmakelists_path, r"\s-Werror(?:=[a-zA-Z]*)?", "") + log.info("Building Compute Runtime...") configure_command = [ "cmake", f"-B {self.compute_runtime_build}", @@ -220,6 +226,7 @@ def build_compute_runtime(self): run(configure_command) run(f"cmake --build {self.compute_runtime_build} -j {options.build_jobs}") + log.info("Compute Runtime build complete.") return self.compute_runtime_build diff --git a/devops/scripts/benchmarks/utils/detect_versions.py b/devops/scripts/benchmarks/utils/detect_versions.py index 91e263536564d..ebb27c948247f 100644 --- a/devops/scripts/benchmarks/utils/detect_versions.py +++ b/devops/scripts/benchmarks/utils/detect_versions.py @@ -9,6 +9,8 @@ from pathlib import Path import argparse +from utils.logger import log + if __name__ == "__main__": sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) from options import options @@ -88,12 +90,12 @@ def get_var(var_name: str) -> str: # matches up with the prefix of the l0 version patch, the cache is # indeed referring to the same version. if env_cache_patch == l0_ver_patch[: len(env_cache_patch)]: - print( + log.info( f"Using compute_runtime tag from COMPUTE_RUNTIME_TAG_CACHE: {env_cache_ver}" ) cls._instance.compute_runtime_ver_cache = env_cache_ver else: - print( + log.warning( f"Mismatch between COMPUTE_RUNTIME_TAG_CACHE {env_cache_ver} and patch reported by level_zero {get_var('L0_VER')}" ) @@ -119,7 +121,7 @@ def get_dpcpp_ver(self) -> str: """ return self.dpcpp_ver - def get_dpcpp_git_info(self) -> [str, str]: + def get_dpcpp_git_info(self) -> list[str]: """ Returns: (git_repo, commit_hash) """ @@ -153,7 +155,7 @@ def get_compute_runtime_ver(self) -> str: Returns the compute-runtime version by deriving from l0 version. """ if self.compute_runtime_ver_cache is not None: - print( + log.info( f"Using cached compute-runtime tag {self.compute_runtime_ver_cache}..." ) return self.compute_runtime_ver_cache @@ -164,7 +166,7 @@ def get_compute_runtime_ver(self) -> str: # not work if we enable benchmark CI in precommit. 
url = options.detect_versions.compute_runtime_tag_api - print(f"Fetching compute-runtime tag from {url}...") + log.info(f"Fetching compute-runtime tag from {url}...") try: for _ in range(options.detect_versions.max_api_calls): res = request.urlopen(url) @@ -202,16 +204,16 @@ def get_link_url(link: str) -> str: break except urllib.error.HTTPError as e: - print(f"HTTP error {e.code}: {e.read().decode('utf-8')}") + log.error(f"HTTP error {e.code}: {e.read().decode('utf-8')}") except urllib.error.URLError as e: - print(f"URL error: {e.reason}") + log.error(f"URL error: {e.reason}") - print(f"WARNING: unable to find compute-runtime version") + log.warning(f"unable to find compute-runtime version") return options.detect_versions.not_found_placeholder -def main(components: [str]): +def main(components: list[str]): detect_res = DetectVersion.init(f"{os.path.dirname(__file__)}/detect_versions.cpp") str2fn = { @@ -223,14 +225,14 @@ def main(components: [str]): def remove_undefined_components(component: str) -> bool: if component not in str2fn: - print(f"# Warn: unknown component: {component}", file=sys.stderr) + log.warning(f"unknown component: {component}") return False return True components_clean = filter(remove_undefined_components, components) for s in map(lambda c: f"{c.upper()}={str2fn[c]()}", components_clean): - print(s) + log.info(s) if __name__ == "__main__": diff --git a/devops/scripts/benchmarks/utils/logger.py b/devops/scripts/benchmarks/utils/logger.py new file mode 100644 index 0000000000000..3323bfdef27b9 --- /dev/null +++ b/devops/scripts/benchmarks/utils/logger.py @@ -0,0 +1,95 @@ +# Copyright (C) 2025 Intel Corporation +# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. +# See LICENSE.TXT +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +import logging +import sys +from typing import Optional, Type, Any + + +# Define log level mapping as a module-level function +def _get_log_level(level_str: str) -> int: + """Convert a string log level to a logging module level constant.""" + level_map = { + "debug": logging.DEBUG, + "info": logging.INFO, + "warning": logging.WARNING, + "error": logging.ERROR, + "critical": logging.CRITICAL, + } + return level_map.get(level_str.lower(), logging.INFO) + + +class BenchmarkLogger: + """Logger for the Benchmark Runner. + + This logger provides different log levels (DEBUG, INFO, WARNING, ERROR, CRITICAL) + that can be controlled via command-line arguments. Call initialize() with the + appropriate parameters after parsing command line arguments. + """ + + _instance: Optional["BenchmarkLogger"] = None + + def __new__(cls: Type["BenchmarkLogger"]) -> "BenchmarkLogger": + if cls._instance is None: + cls._instance = super(BenchmarkLogger, cls).__new__(cls) + return cls._instance + + def __init__(self) -> None: + """Create logger but don't configure it until initialize() is called.""" + self._logger: logging.Logger = logging.getLogger("ur_benchmarks") + + def initialize(self, verbose: bool = False, log_level: str = "info") -> None: + """Configure the logger with the appropriate log level. + + Args: + verbose: If True, sets the log level to DEBUG regardless of log_level + log_level: One of "debug", "info", "warning", "error", "critical" + + Note: + This method will only initialize the logger once. Subsequent calls will be ignored. 
+ """ + # Return early if logger is already initialized (has handlers) + if self._logger.handlers: + return + + console_handler = logging.StreamHandler(sys.stdout) + + level = logging.DEBUG if verbose else _get_log_level(log_level) + self._logger.setLevel(level) + console_handler.setLevel(level) + + formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s") + console_handler.setFormatter(formatter) + + self._logger.addHandler(console_handler) + + def debug(self, message: Any) -> None: + """Log a debug message.""" + if self._logger.handlers: + self._logger.debug(message) + + def info(self, message: Any) -> None: + """Log an info message.""" + if self._logger.handlers: + self._logger.info(message) + + def warning(self, message: Any) -> None: + """Log a warning message.""" + if self._logger.handlers: + self._logger.warning(message) + + def error(self, message: Any) -> None: + """Log an error message.""" + if self._logger.handlers: + self._logger.error(message) + + def critical(self, message: Any) -> None: + """Log a critical message.""" + if self._logger.handlers: + self._logger.critical(message) + + +# Global logger instance +log = BenchmarkLogger() diff --git a/devops/scripts/benchmarks/utils/oneapi.py b/devops/scripts/benchmarks/utils/oneapi.py index 0a477f6e246b1..bb2931b41e84c 100644 --- a/devops/scripts/benchmarks/utils/oneapi.py +++ b/devops/scripts/benchmarks/utils/oneapi.py @@ -4,12 +4,14 @@ # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception from pathlib import Path -from utils.utils import download, run -from options import options import os import hashlib import glob +from utils.utils import download, run +from utils.logger import log +from options import options + class OneAPI: def __init__(self): @@ -46,12 +48,12 @@ def check_install(self, version): def install_package(self, name, version, url, checksum): if self.check_install(version): - print(f"OneAPI {name} version {version} already installed, skipping.") + log.info(f"OneAPI {name} version {version} already installed, skipping.") return package_name = f"package_{name}_{version}.sh" package_path = os.path.join(self.oneapi_dir, f"{package_name}") if Path(package_path).exists(): - print(f"{package_path} exists, skipping download of oneAPI package...") + log.info(f"{package_path} exists, skipping download of oneAPI package...") else: package = download( self.oneapi_dir, url, f"{package_name}", checksum=checksum @@ -61,9 +63,11 @@ def install_package(self, name, version, url, checksum): f"sh {package_path} -a -s --eula accept --install-dir {self.oneapi_dir} --instance {self.oneapi_instance_id}" ) except: - print(f"OneAPI {name} version {version} installation likely exists already") + log.warning( + f"OneAPI {name} version {version} installation likely exists already" + ) return - print(f"OneAPI {name} version {version} installation complete") + log.info(f"OneAPI {name} version {version} installation complete") def package_dir(self, package, dir): return os.path.join(self.oneapi_dir, package, "latest", dir) diff --git a/devops/scripts/benchmarks/utils/utils.py b/devops/scripts/benchmarks/utils/utils.py index ef2a1222a7183..99fee805a5f7a 100644 --- a/devops/scripts/benchmarks/utils/utils.py +++ b/devops/scripts/benchmarks/utils/utils.py @@ -7,14 +7,15 @@ import os import shutil import subprocess - import tarfile -from options import options -from pathlib import Path import hashlib +from pathlib import Path from urllib.request import urlopen # nosec B404 from shutil import copyfileobj +from options import 
options +from utils.logger import log + def run( command, @@ -39,7 +40,7 @@ def run( ldlib + os.pathsep + env.get("LD_LIBRARY_PATH", "") ) else: - print(f"Warning: LD_LIBRARY_PATH component does not exist: {ldlib}") + log.warning(f"LD_LIBRARY_PATH component does not exist: {ldlib}") # order is important, we want provided sycl rt libraries to be first if add_sycl: @@ -52,35 +53,33 @@ def run( env.update(env_vars) - if options.verbose: - command_str = " ".join(command) - env_str = " ".join(f"{key}={value}" for key, value in env_vars.items()) - full_command_str = f"{env_str} {command_str}".strip() - print(f"Running: {full_command_str}") + command_str = " ".join(command) + env_str = " ".join(f"{key}={value}" for key, value in env_vars.items()) + full_command_str = f"{env_str} {command_str}".strip() + log.debug(f"Running: {full_command_str}") result = subprocess.run( command, cwd=cwd, check=True, stdout=subprocess.PIPE, - stderr=subprocess.PIPE, + stderr=subprocess.STDOUT, env=env, timeout=timeout, ) # nosec B603 - if options.verbose: - print(result.stdout.decode()) - print(result.stderr.decode()) + if result.stdout: + log.debug(result.stdout.decode()) return result except subprocess.CalledProcessError as e: - print(e.stdout.decode()) - print(e.stderr.decode()) + log.error(e.stdout.decode()) raise def git_clone(dir, name, repo, commit): repo_path = os.path.join(dir, name) + log.debug(f"Cloning {repo} into {repo_path} at commit {commit}") if os.path.isdir(repo_path) and os.path.isdir(os.path.join(repo_path, ".git")): run("git fetch", cwd=repo_path) @@ -93,6 +92,7 @@ def git_clone(dir, name, repo, commit): raise Exception( f"The directory {repo_path} exists but is not a git repository." ) + log.debug(f"Cloned {repo} into {repo_path} at commit {commit}") return repo_path @@ -116,11 +116,12 @@ def prepare_workdir(dir, version): prepare_bench_cwd(dir) return else: - print(f"Version mismatch, cleaning up benchmark directory {dir}") + log.warning(f"Version mismatch, cleaning up benchmark directory {dir}") shutil.rmtree(dir) else: raise Exception( - f"The directory {dir} exists but is not a benchmark work directory." + f"The directory {dir} exists but is not a benchmark work directory. " + f"A BENCH_WORKDIR_VERSION file is expected with version {version} but not found at {version_file_path}." ) os.makedirs(dir) @@ -152,13 +153,13 @@ def calculate_checksum(file_path): def download(dir, url, file, untar=False, unzip=False, checksum=""): data_file = os.path.join(dir, file) if not Path(data_file).exists(): - print(f"{data_file} does not exist, downloading") + log.info(f"{data_file} does not exist, downloading") with urlopen(url) as in_stream, open(data_file, "wb") as out_file: copyfileobj(in_stream, out_file) calculated_checksum = calculate_checksum(data_file) if calculated_checksum != checksum: - print( + log.critical( f"Checksum mismatch: expected {checksum}, got {calculated_checksum}. Refusing to continue." ) exit(1) @@ -172,5 +173,5 @@ def download(dir, url, file, untar=False, unzip=False, checksum=""): with gzip.open(data_file, "rb") as f_in, open(stripped_gz, "wb") as f_out: shutil.copyfileobj(f_in, f_out) else: - print(f"{data_file} exists, skipping...") + log.debug(f"{data_file} exists, skipping...") return data_file
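
---

Reviewer note: below is a short usage sketch of the new `utils/logger.py` singleton as wired up by this patch. The messages and flag values are made up for illustration only; the behaviour shown (silent no-ops before `initialize()`, `--verbose` overriding `--log-level`, initialize-once semantics) follows the `BenchmarkLogger` implementation added in the diff.

```python
# Illustrative sketch only; values below are hypothetical.
from utils.logger import log

# Before initialize() runs, the level methods are silent no-ops:
# BenchmarkLogger checks for attached handlers before emitting anything.
log.info("not printed yet")

# main.py calls this exactly once after argument parsing.
# verbose=True would force DEBUG and take precedence over log_level.
log.initialize(verbose=False, log_level="debug")

# A second call is ignored because a handler is already attached.
log.initialize(verbose=False, log_level="error")

# Every other module simply imports `log` and uses the level methods.
log.debug("Setting up benchmark...")   # emitted: effective level is DEBUG
log.warning("verification failed")     # emitted at WARNING level
```

This mirrors the pattern used throughout the patch: modules import the shared `log` object, while only `main.py` (and standalone entry points) configure it via `log.initialize(args.verbose, args.log_level)`.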