diff --git a/benchmark/tools/compare.py b/benchmark/tools/compare.py
new file mode 100755
index 00000000000..f6ac5ae321a
--- /dev/null
+++ b/benchmark/tools/compare.py
@@ -0,0 +1,252 @@
+#!/usr/bin/env python3
+# SPDX-FileCopyrightText: 2017-2023 The Ginkgo authors
+# SPDX-License-Identifier: BSD-3-Clause
+import sys
+import json
+import argparse
+import math
+import pandas as pd
+import tabulate  # for pandas markdown output
+from frozendict import frozendict
+
+
+keys = {"stencil", "size", "filename", "n", "r", "k", "m"}
+comparison_keys = {"time", "storage", "iterations"}
+suffix = ".ratio"
+
+
+def sorted_key_intersection(a: dict, b: dict) -> list:
+    return sorted(set(a.keys()).intersection(b.keys()), key=str)
+
+
+def parse_json_matrix(filename: str) -> dict:
+    """Parse a JSON file into a key -> test_case dict"""
+    with open(filename) as file:
+        parsed = json.load(file)
+    result = {}
+    assert isinstance(parsed, list)
+    for case in parsed:
+        assert isinstance(case, dict)
+        assert not keys.isdisjoint(case.keys())
+        dict_key = frozendict(
+            {key: case[key] for key in keys.intersection(case.keys())}
+        )
+        if dict_key in result.keys():
+            print(
+                f"WARNING: Duplicate key {json.dumps(dict_key)}",
+                file=sys.stderr,
+            )
+        result[dict_key] = case
+    return result
+
+
+def warn_on_inconsistent_keys(baseline: dict, comparison: dict, context: str):
+    """Print a warning message for non-matching keys between baseline/comparison using the given context string"""
+    baseline_only = sorted(set(baseline.keys()).difference(comparison.keys()))
+    comparison_only = sorted(set(comparison.keys()).difference(baseline.keys()))
+    for key in baseline_only:
+        print(
+            f"WARNING: Key {json.dumps(key) if isinstance(key, dict) else key} found in baseline only in context {context}",
+            file=sys.stderr,
+        )
+    for key in comparison_only:
+        print(
+            f"WARNING: Key {json.dumps(key) if isinstance(key, dict) else key} found in comparison only in context {context}",
+            file=sys.stderr,
+        )
+    for key in sorted_key_intersection(baseline, comparison):
+        if isinstance(baseline[key], dict):
+            assert isinstance(comparison[key], dict)
+            warn_on_inconsistent_keys(
+                baseline[key], comparison[key], f"{context}/{key}"
+            )
+
+
+def ratio(baseline: int | float, comparison: int | float) -> float:
+    """Computes the ratio between baseline and comparison.
+    For runtimes, this is the speedup."""
+    return baseline / comparison
+
+
+def compare_benchmark(baseline: dict, comparison: dict) -> dict:
+    """Compares a handful of keys and component breakdowns recursively, writing them with a suffix to the output"""
+    result = {}
+    for key in sorted_key_intersection(baseline, comparison):
+        if key == "components":
+            assert isinstance(baseline[key], dict)
+            assert isinstance(comparison[key], dict)
+            result[key + suffix] = {
+                sub_key: ratio(baseline[key][sub_key], comparison[key][sub_key])
+                for sub_key in baseline[key]
+            }
+        elif isinstance(baseline[key], dict):
+            result[key] = compare_benchmark(baseline[key], comparison[key])
+        elif key in comparison_keys:
+            result[key + suffix] = ratio(baseline[key], comparison[key])
+    return result
+
+
+def compare(baseline: dict, comparison: dict) -> dict:
+    """Compares a test case, keeping root-level values and recursing into benchmarks"""
+    result = {}
+    for key in sorted_key_intersection(baseline, comparison):
+        # we don't have lists on the test case root level
+        assert not isinstance(baseline[key], list)
+        if isinstance(baseline[key], dict):
+            benchmark_result = {}
+            for benchmark_name in baseline[key].keys():
+                if isinstance(baseline[key][benchmark_name], dict):
+                    comparison_result = compare_benchmark(
+                        baseline[key][benchmark_name], comparison[key][benchmark_name]
+                    )
+                    if len(comparison_result) > 0:
+                        benchmark_result[benchmark_name] = comparison_result
+            if len(benchmark_result) > 0:
+                result[key] = benchmark_result
+        else:
+            # everything that's not a dict should only depend on the key in the root level
+            if baseline[key] != comparison[key]:
+                print(
+                    f"WARNING: Inconsistent value for {key}: {baseline[key]} != {comparison[key]}",
+                    file=sys.stderr,
+                )
+            result[key] = baseline[key]
+    return result
+
+
+def extract_benchmark_results(
+    input: dict, benchmarks: dict, case_key: tuple, context: str | None
+) -> None:
+    for key, value in input.items():
+        benchmark_name = key if context is None else f"{context}/{key}"
+        if key in map(lambda x: x + suffix, comparison_keys):
+            benchmark_name = benchmark_name[: -len(suffix)]
+            if benchmark_name not in benchmarks.keys():
+                benchmarks[benchmark_name] = []
+            benchmarks[benchmark_name].append((case_key, value))
+        elif isinstance(value, dict):
+            extract_benchmark_results(value, benchmarks, case_key, benchmark_name)
+
+
+def is_outlier(value: float, args) -> bool:
+    """returns true iff the value is more than the outlier threshold away from 1.0"""
+    return math.fabs(math.log(value)) > math.log(1.0 + args.outlier_threshold / 100)
+
+
+def compare_main(args: list):
+    """Runs the comparison script"""
+    parser = argparse.ArgumentParser(description="Compare two Ginkgo benchmark outputs")
+    parser.add_argument(
+        "--outliers", action="store_true", help="List outliers from the results"
+    )
+    parser.add_argument(
+        "--outlier-threshold",
+        type=float,
+        default=10,
+        help="At what percentage of deviation (above or below) should outliers be reported",
+    )
+    parser.add_argument(
+        "--outlier-count",
+        type=int,
+        default=1000,
+        help="How many outliers should be reported per benchmark",
+    )
+    parser.add_argument("--output", choices=["json", "csv", "markdown"], default="json")
+    parser.add_argument("baseline")
+    parser.add_argument("comparison")
+    args = parser.parse_args(args)
+    baseline_json = parse_json_matrix(args.baseline)
+    comparison_json = parse_json_matrix(args.comparison)
+    warn_on_inconsistent_keys(baseline_json, comparison_json, "root")
+
+    results = {}
+
+    for key in 
sorted_key_intersection(baseline_json, comparison_json): + results[key] = compare(baseline_json[key], comparison_json[key]) + + outliers = {} + benchmarks = {} + for key, value in results.items(): + extract_benchmark_results(value, benchmarks, key, None) + if args.outliers: + for benchmark_name, benchmark_results in benchmarks.items(): + outlier = sorted( + [ + (case_key, value) + for case_key, value in benchmark_results + if is_outlier(value, args) + ], + key=lambda x: math.fabs(math.log(x[1])), + reverse=True, + ) + outliers[benchmark_name] = outlier[: min(len(outlier), args.outlier_count)] + + if args.output == "json": + print( + json.dumps( + { + "results": [value for _, value in results.items()], + "outliers": { + key: [ + {"value": ratio_value, **case_key} + for (case_key, ratio_value) in value + ] + for key, value in outliers.items() + if len(value) > 0 + }, + }, + indent=4, + ) + ) + else: + columns = ["benchmark", "testcase", "ratio"] + only_first = args.output == "markdown" + table = pd.DataFrame( + sum( + [ + [ + ( + key if i == 0 or not only_first else "", + json.dumps(value[0]), + value[1], + ) + for i, value in enumerate(values) + ] + for key, values in benchmarks.items() + ], + [], + ), + columns=columns, + ) + if args.output == "csv": + table.to_csv(sys.stdout, index=False) + else: + table.to_markdown(sys.stdout, index=False) + if args.outliers: + outlier_table = pd.DataFrame( + sum( + [ + [ + ( + key if i == 0 or not only_first else "", + json.dumps(value[0]), + value[1], + ) + for i, value in enumerate(values) + ] + for key, values in outliers.items() + ], + [], + ), + columns=columns, + ) + if len(outlier_table) > 0: + print("\n\nOutliers") + if args.output == "csv": + outlier_table.to_csv(sys.stdout, index=False) + else: + outlier_table.to_markdown(sys.stdout, index=False) + print() + + +if __name__ == "__main__": + compare_main(sys.argv) diff --git a/benchmark/tools/compare_test.py b/benchmark/tools/compare_test.py new file mode 100644 index 00000000000..83e2ee5dbda --- /dev/null +++ b/benchmark/tools/compare_test.py @@ -0,0 +1,226 @@ +import json +import compare +import os + +dir_path = os.path.dirname(os.path.realpath(__file__)) + + +def test_mismatch(capsys): + compare.compare_main( + [ + dir_path + "/../test/reference/blas.simple.stdout", + dir_path + "/../test/reference/spmv.matrix.stdout", + ] + ) + captured = capsys.readouterr() + ref_out = {"results": [], "outliers": {}} + + ref_err = """WARNING: Key {"n": 100} found in baseline only in context root +WARNING: Key {"filename": ""} found in comparison only in context root +""" + assert json.loads(captured.out) == ref_out + assert captured.err == ref_err + + +def test_simple(capsys): + compare.compare_main( + [ + dir_path + "/../test/reference/spmv.matrix.stdout", + dir_path + "/../test/reference/spmv.matrix.stdout", + ] + ) + captured = capsys.readouterr() + ref_out = { + "results": [ + { + "cols": 36, + "filename": "", + "nonzeros": 208, + "rows": 36, + "spmv": {"coo": {"storage.ratio": 1.0, "time.ratio": 1.0}}, + } + ], + "outliers": {}, + } + + assert json.loads(captured.out) == ref_out + assert captured.err == "" + + +def test_outliers(capsys): + compare.compare_main( + [ + "--outliers", + dir_path + "/compare_test_input1.json", + dir_path + "/compare_test_input2.json", + ] + ) + captured = capsys.readouterr() + ref_out = { + "results": [ + { + "cols": 36, + "filename": "mtx", + "nonzeros": 208, + "rows": 36, + "spmv": { + "coo": {"storage.ratio": 1.0, "time.ratio": 1.2}, + "csr": {"storage.ratio": 2.0, 
"time.ratio": 0.8}, + "ell": {"storage.ratio": 0.5, "time.ratio": 1.0}, + "sellp": {"storage.ratio": 1.0, "time.ratio": 1.11}, + "hybrid": {"storage.ratio": 1.0, "time.ratio": 1.01}, + }, + } + ], + "outliers": { + "spmv/coo/time": [{"value": 1.2, "filename": "mtx"}], + "spmv/csr/storage": [{"value": 2.0, "filename": "mtx"}], + "spmv/csr/time": [{"value": 0.8, "filename": "mtx"}], + "spmv/ell/storage": [{"value": 0.5, "filename": "mtx"}], + "spmv/sellp/time": [{"value": 1.11, "filename": "mtx"}], + }, + } + + assert json.loads(captured.out) == ref_out + assert captured.err == "" + + +def test_outliers_imited(capsys): + compare.compare_main( + [ + "--outliers", + "--outlier-count", + "0", + dir_path + "/compare_test_input1.json", + dir_path + "/compare_test_input2.json", + ] + ) + captured = capsys.readouterr() + ref_out = { + "results": [ + { + "cols": 36, + "filename": "mtx", + "nonzeros": 208, + "rows": 36, + "spmv": { + "coo": {"storage.ratio": 1.0, "time.ratio": 1.2}, + "csr": {"storage.ratio": 2.0, "time.ratio": 0.8}, + "ell": {"storage.ratio": 0.5, "time.ratio": 1.0}, + "sellp": {"storage.ratio": 1.0, "time.ratio": 1.11}, + "hybrid": {"storage.ratio": 1.0, "time.ratio": 1.01}, + }, + } + ], + "outliers": {}, + } + + assert json.loads(captured.out) == ref_out + assert captured.err == "" + + +def test_csv(capsys): + compare.compare_main( + [ + "--outliers", + "--output", + "csv", + dir_path + "/compare_test_input1.json", + dir_path + "/compare_test_input2.json", + ] + ) + captured = capsys.readouterr() + ref_out = """benchmark,testcase,ratio +spmv/coo/storage,"{""filename"": ""mtx""}",1.0 +spmv/coo/time,"{""filename"": ""mtx""}",1.2 +spmv/csr/storage,"{""filename"": ""mtx""}",2.0 +spmv/csr/time,"{""filename"": ""mtx""}",0.8 +spmv/ell/storage,"{""filename"": ""mtx""}",0.5 +spmv/ell/time,"{""filename"": ""mtx""}",1.0 +spmv/sellp/storage,"{""filename"": ""mtx""}",1.0 +spmv/sellp/time,"{""filename"": ""mtx""}",1.11 +spmv/hybrid/storage,"{""filename"": ""mtx""}",1.0 +spmv/hybrid/time,"{""filename"": ""mtx""}",1.01 + + +Outliers +benchmark,testcase,ratio +spmv/coo/time,"{""filename"": ""mtx""}",1.2 +spmv/csr/storage,"{""filename"": ""mtx""}",2.0 +spmv/csr/time,"{""filename"": ""mtx""}",0.8 +spmv/ell/storage,"{""filename"": ""mtx""}",0.5 +spmv/sellp/time,"{""filename"": ""mtx""}",1.11 + +""" + assert captured.out == ref_out + assert captured.err == "" + + +def test_md(capsys): + compare.compare_main( + [ + "--outliers", + "--output", + "markdown", + dir_path + "/compare_test_input1.json", + dir_path + "/compare_test_input2.json", + ] + ) + captured = capsys.readouterr() + ref_out = """| benchmark | testcase | ratio | +|:--------------------|:--------------------|--------:| +| spmv/coo/storage | {"filename": "mtx"} | 1 | +| spmv/coo/time | {"filename": "mtx"} | 1.2 | +| spmv/csr/storage | {"filename": "mtx"} | 2 | +| spmv/csr/time | {"filename": "mtx"} | 0.8 | +| spmv/ell/storage | {"filename": "mtx"} | 0.5 | +| spmv/ell/time | {"filename": "mtx"} | 1 | +| spmv/sellp/storage | {"filename": "mtx"} | 1 | +| spmv/sellp/time | {"filename": "mtx"} | 1.11 | +| spmv/hybrid/storage | {"filename": "mtx"} | 1 | +| spmv/hybrid/time | {"filename": "mtx"} | 1.01 | + +Outliers +| benchmark | testcase | ratio | +|:-----------------|:--------------------|--------:| +| spmv/coo/time | {"filename": "mtx"} | 1.2 | +| spmv/csr/storage | {"filename": "mtx"} | 2 | +| spmv/csr/time | {"filename": "mtx"} | 0.8 | +| spmv/ell/storage | {"filename": "mtx"} | 0.5 | +| spmv/sellp/time | {"filename": "mtx"} | 1.11 | +""" + 
assert captured.out == ref_out + assert captured.err == "" + + +def test_complex(capsys): + compare.compare_main( + [ + dir_path + "/compare_test_input3.json", + dir_path + "/compare_test_input3.json", + ] + ) + captured = capsys.readouterr() + ref_out = { + "results": [ + { + "filename": "mtx", + "solver": { + "gmres": { + "apply": { + "components.ratio": {"foo": 1.0}, + "iterations.ratio": 1.0, + "time.ratio": 1.0, + }, + "generate": {"time.ratio": 1.0}, + } + }, + }, + {"blas": {"axpy": {"time.ratio": 1.0}}, "k": 2, "m": 3, "n": 1, "r": 4}, + {"size": 100, "spmv": {"csr": {"time.ratio": 1.0}}, "stencil": "7pt"}, + ], + "outliers": {}, + } + + assert json.loads(captured.out) == ref_out + assert captured.err == "" diff --git a/benchmark/tools/compare_test_input1.json b/benchmark/tools/compare_test_input1.json new file mode 100644 index 00000000000..da7b190c270 --- /dev/null +++ b/benchmark/tools/compare_test_input1.json @@ -0,0 +1,48 @@ +[ + { + "filename": "mtx", + "spmv": { + "coo": { + "storage": 1000, + "max_relative_norm2": 1.0, + "time": 1.2, + "repetitions": 10, + "completed": true + }, + "csr": { + "storage": 2000, + "max_relative_norm2": 1.0, + "time": 0.8, + "repetitions": 10, + "completed": true + }, + "ell": { + "storage": 500, + "max_relative_norm2": 1.0, + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "sellp": { + "storage": 1000, + "max_relative_norm2": 1.0, + "time": 1.11, + "repetitions": 10, + "completed": true + }, + "hybrid": { + "storage": 1000, + "max_relative_norm2": 1.0, + "time": 1.01, + "repetitions": 10, + "completed": true + } + }, + "rows": 36, + "cols": 36, + "nonzeros": 208, + "optimal": { + "spmv": "coo" + } + } +] \ No newline at end of file diff --git a/benchmark/tools/compare_test_input2.json b/benchmark/tools/compare_test_input2.json new file mode 100644 index 00000000000..29a8d348618 --- /dev/null +++ b/benchmark/tools/compare_test_input2.json @@ -0,0 +1,48 @@ +[ + { + "filename": "mtx", + "spmv": { + "coo": { + "storage": 1000, + "max_relative_norm2": 1.0, + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "csr": { + "storage": 1000, + "max_relative_norm2": 1.0, + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "ell": { + "storage": 1000, + "max_relative_norm2": 1.0, + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "sellp": { + "storage": 1000, + "max_relative_norm2": 1.0, + "time": 1.0, + "repetitions": 10, + "completed": true + }, + "hybrid": { + "storage": 1000, + "max_relative_norm2": 1.0, + "time": 1.0, + "repetitions": 10, + "completed": true + } + }, + "rows": 36, + "cols": 36, + "nonzeros": 208, + "optimal": { + "spmv": "coo" + } + } +] \ No newline at end of file diff --git a/benchmark/tools/compare_test_input3.json b/benchmark/tools/compare_test_input3.json new file mode 100644 index 00000000000..f317073d12d --- /dev/null +++ b/benchmark/tools/compare_test_input3.json @@ -0,0 +1,39 @@ +[ + { + "stencil": "7pt", + "size": 100, + "spmv": { + "csr": { + "time": 0.5 + } + } + }, + { + "n": 1, + "k": 2, + "m": 3, + "r": 4, + "blas": { + "axpy": { + "time": 100 + } + } + }, + { + "filename": "mtx", + "solver": { + "gmres": { + "apply": { + "time": 1.0, + "components": { + "foo": 2.0 + }, + "iterations": 10 + }, + "generate": { + "time": 2.0 + } + } + } + } +] \ No newline at end of file diff --git a/benchmark/utils/general.hpp b/benchmark/utils/general.hpp index 241d2225938..e0045d8f417 100644 --- a/benchmark/utils/general.hpp +++ b/benchmark/utils/general.hpp @@ -289,7 +289,7 @@ void 
backup_results(json& results)
         return;
     }
     std::ofstream ofs(filenames[next]);
-    ofs << results;
+    ofs << std::setw(4) << results;
     next = 1 - next;
 }
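
For reviewers who want to sanity-check the --outlier-threshold semantics, here is a minimal, self-contained sketch (not part of the patch) that mirrors the is_outlier check in compare.py and shows how the tool is driven programmatically, the same way the tests drive it; the JSON file names in the trailing comment are hypothetical.

# Minimal standalone sketch (not part of the patch) of the outlier criterion
# used by compare.py: a ratio r is an outlier when |log(r)| > log(1 + t/100),
# so the check is symmetric for speedups (r > 1) and slowdowns (r < 1).
import math


def is_outlier(ratio: float, threshold_percent: float = 10.0) -> bool:
    return math.fabs(math.log(ratio)) > math.log(1.0 + threshold_percent / 100)


if __name__ == "__main__":
    # With the default 10% threshold this prints False, True, True, False:
    # 1.05 and 0.95 stay within +/-10% in log space, 1.11 and 0.8 do not.
    for r in (1.05, 1.11, 0.8, 0.95):
        print(f"ratio={r}: outlier={is_outlier(r)}")
    # The comparison itself can be invoked the same way the tests do, e.g.
    # (baseline.json / comparison.json are hypothetical file names):
    #   import compare
    #   compare.compare_main(["--outliers", "baseline.json", "comparison.json"])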