From 8f537984a6cee1d929fc29f21893093248979aec Mon Sep 17 00:00:00 2001
From: Chris Vasiladiotis
Date: Wed, 6 Nov 2024 16:53:10 +0000
Subject: [PATCH 01/20] Add scripts from paper repo

---
 plots-cgo2025-ae/__init__.py | 0
 .../config/cycles/all_barchart.json | 36 +++
 .../config/cycles/all_barchart.mplstyle | 74 +++++
 .../config/cycles/xdsl_barchart.json | 36 +++
 .../config/cycles/xdsl_barchart.mplstyle | 74 +++++
 plots-cgo2025-ae/config/gridplot.mplstyle | 70 +++++
 plots-cgo2025-ae/cycles.py | 47 ++++
 plots-cgo2025-ae/data.py | 258 ++++++++++++++++++
 plots-cgo2025-ae/fp_throughput.py | 68 +++++
 plots-cgo2025-ae/fpu.py | 74 +++++
 plots-cgo2025-ae/heatmap.py | 141 ++++++++++
 plots-cgo2025-ae/low_level_representation.py | 83 ++++++
 plots-cgo2025-ae/max_util.py | 23 ++
 plots-cgo2025-ae/opt_pipeline.py | 115 ++++++++
 plots-cgo2025-ae/pass_improvements.py | 70 +++++
 plots-cgo2025-ae/pass_improvements_stacked.py | 133 +++++++++
 plots-cgo2025-ae/plot.py | 50 ++++
 plots-cgo2025-ae/plot_utils.py | 241 ++++++++++++++++
 plots-cgo2025-ae/regalloc.py | 63 +++++
 plots-cgo2025-ae/throughput.py | 57 ++++
 20 files changed, 1713 insertions(+)
 create mode 100644 plots-cgo2025-ae/__init__.py
 create mode 100644 plots-cgo2025-ae/config/cycles/all_barchart.json
 create mode 100644 plots-cgo2025-ae/config/cycles/all_barchart.mplstyle
 create mode 100644 plots-cgo2025-ae/config/cycles/xdsl_barchart.json
 create mode 100644 plots-cgo2025-ae/config/cycles/xdsl_barchart.mplstyle
 create mode 100644 plots-cgo2025-ae/config/gridplot.mplstyle
 create mode 100644 plots-cgo2025-ae/cycles.py
 create mode 100644 plots-cgo2025-ae/data.py
 create mode 100644 plots-cgo2025-ae/fp_throughput.py
 create mode 100644 plots-cgo2025-ae/fpu.py
 create mode 100644 plots-cgo2025-ae/heatmap.py
 create mode 100644 plots-cgo2025-ae/low_level_representation.py
 create mode 100644 plots-cgo2025-ae/max_util.py
 create mode 100644 plots-cgo2025-ae/opt_pipeline.py
 create mode 100644 plots-cgo2025-ae/pass_improvements.py
 create mode 100644 plots-cgo2025-ae/pass_improvements_stacked.py
 create mode 100644 plots-cgo2025-ae/plot.py
 create mode 100644 plots-cgo2025-ae/plot_utils.py
 create mode 100644 plots-cgo2025-ae/regalloc.py
 create mode 100644 plots-cgo2025-ae/throughput.py

diff --git a/plots-cgo2025-ae/__init__.py b/plots-cgo2025-ae/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/plots-cgo2025-ae/config/cycles/all_barchart.json b/plots-cgo2025-ae/config/cycles/all_barchart.json
new file mode 100644
index 00000000..44762471
--- /dev/null
+++ b/plots-cgo2025-ae/config/cycles/all_barchart.json
@@ -0,0 +1,36 @@
+{
+    "axis": {
+        "xlabel": {
+            "label": ""
+        },
+        "ylabel": {
+            "label": "Cycles",
+            "rotation": "horizontal",
+            "position": [1.0, 1.05],
+            "horizontalalignment": "left",
+            "verticalalignment": "bottom"
+        },
+        "ylim": [
+            0,
+            5000
+        ],
+        "xticks": {
+            "labelrotation": 45
+        }
+    },
+    "legend": {
+        "ncol": 50,
+        "bbox_to_anchor": [0, 1, 1, 0],
+        "loc": "lower right"
+    },
+    "spines" : {
+        "left": {
+            "color": "black",
+            "linewidth": 0.4
+        },
+        "bottom": {
+            "color": "black",
+            "linewidth": 0.4
+        }
+    }
+}
diff --git a/plots-cgo2025-ae/config/cycles/all_barchart.mplstyle b/plots-cgo2025-ae/config/cycles/all_barchart.mplstyle
new file mode 100644
index 00000000..980b425c
--- /dev/null
+++ b/plots-cgo2025-ae/config/cycles/all_barchart.mplstyle
@@ -0,0 +1,74 @@
+# vim: ft=config
+
+## Use TrueType fonts instead of Type 3 fonts
+#
+# Type 3 fonts embed bitmaps and are not allowed in camera-ready submissions
+# for many conferences.
TrueType fonts look better and are accepted. +# This follows: https://www.conference-publishing.com/Help.php +pdf.fonttype: 42 +ps.fonttype: 42 + +font.size: 12 +#font.family: serif +font.family: sans-serif +#font.sans-serif: ["Helvetica"] + +text.usetex: True + +## Enable tight_layout by default +# +# This ensures the plot has always sufficient space for legends, ... +# Without this sometimes parts of the figure would be cut off. +figure.autolayout: True + +#figure.figsize: 3, 2.5 +figure.figsize: 4, 2.5 + +legend.frameon: False +legend.fontsize: 8 +legend.loc: upper right +legend.fancybox: False +legend.framealpha: 1.0 +legend.shadow: False +legend.borderaxespad: 0 +legend.edgecolor: gray +legend.handleheight: 1 +legend.labelspacing: 0.2 +legend.columnspacing: 0.4 +legend.handletextpad: 0.2 + +patch.edgecolor: black +patch.force_edgecolor: False +patch.linewidth: 0.4 + +xtick.top: False +xtick.bottom: True +xtick.major.size: 3 +xtick.major.width: 0.4 + +ytick.left: True +ytick.right: False +ytick.direction: out +ytick.major.size: 3 +ytick.major.width: 0.4 + +axes.grid.axis: y +axes.grid: True + +# Hide the right and top spines +# +# This reduces the number of lines in the plot. Lines typically catch +# a readers attention and distract the reader from the actual content. +# By removing unnecessary spines, we help the reader to focus on +# the figures in the graph. +axes.spines.right: False +axes.spines.top: False + +grid.color: black +grid.alpha: 0.2 +grid.linewidth: 0.4 +grid.linestyle: dotted + + +savefig.bbox: tight +savefig.pad_inches: 0.05 diff --git a/plots-cgo2025-ae/config/cycles/xdsl_barchart.json b/plots-cgo2025-ae/config/cycles/xdsl_barchart.json new file mode 100644 index 00000000..918e08fe --- /dev/null +++ b/plots-cgo2025-ae/config/cycles/xdsl_barchart.json @@ -0,0 +1,36 @@ +{ + "axis": { + "xlabel": { + "label": "" + }, + "ylabel": { + "label": "Cycles", + "rotation": "horizontal", + "position": [1.0, 1.05], + "horizontalalignment": "left", + "verticalalignment": "bottom" + }, + "ylim": [ + 0, + 5000 + ], + "xticks": { + "labelrotation": 0 + } + }, + "legend": { + "ncol": 50, + "bbox_to_anchor": [0, 1, 1, 0], + "loc": "lower right" + }, + "spines" : { + "left": { + "color": "black", + "linewidth": 0.4 + }, + "bottom": { + "color": "black", + "linewidth": 0.4 + } + } +} diff --git a/plots-cgo2025-ae/config/cycles/xdsl_barchart.mplstyle b/plots-cgo2025-ae/config/cycles/xdsl_barchart.mplstyle new file mode 100644 index 00000000..163c89d5 --- /dev/null +++ b/plots-cgo2025-ae/config/cycles/xdsl_barchart.mplstyle @@ -0,0 +1,74 @@ +# vim: ft=config + +## Use TrueType fonts instead of Type 3 fonts +# +# Type 3 fonts embed bitmaps and are not allowed in camera-ready submissions +# for many conferences. TrueType fonts look better and are accepted. +# This follows: https://www.conference-publishing.com/Help.php +pdf.fonttype: 42 +ps.fonttype: 42 + +font.size: 12 +#font.family: serif +font.family: sans-serif +#font.sans-serif: ["Helvetica"] + +text.usetex: True + +## Enable tight_layout by default +# +# This ensures the plot has always sufficient space for legends, ... +# Without this sometimes parts of the figure would be cut off. 
+figure.autolayout: True + +#figure.figsize: 3, 2.5 +figure.figsize: 3, 2 + +legend.frameon: False +legend.fontsize: 8 +legend.loc: upper right +legend.fancybox: False +legend.framealpha: 1.0 +legend.shadow: False +legend.borderaxespad: 0 +legend.edgecolor: gray +legend.handleheight: 1 +legend.labelspacing: 0.2 +legend.columnspacing: 0.4 +legend.handletextpad: 0.2 + +patch.edgecolor: black +patch.force_edgecolor: False +patch.linewidth: 0.4 + +xtick.top: False +xtick.bottom: True +xtick.major.size: 3 +xtick.major.width: 0.4 + +ytick.left: True +ytick.right: False +ytick.direction: out +ytick.major.size: 3 +ytick.major.width: 0.4 + +axes.grid.axis: y +axes.grid: True + +# Hide the right and top spines +# +# This reduces the number of lines in the plot. Lines typically catch +# a readers attention and distract the reader from the actual content. +# By removing unnecessary spines, we help the reader to focus on +# the figures in the graph. +axes.spines.right: False +axes.spines.top: False + +grid.color: black +grid.alpha: 0.2 +grid.linewidth: 0.4 +grid.linestyle: dotted + + +savefig.bbox: tight +savefig.pad_inches: 0.05 diff --git a/plots-cgo2025-ae/config/gridplot.mplstyle b/plots-cgo2025-ae/config/gridplot.mplstyle new file mode 100644 index 00000000..1ace4cb8 --- /dev/null +++ b/plots-cgo2025-ae/config/gridplot.mplstyle @@ -0,0 +1,70 @@ +# vim: ft=config + +## Use TrueType fonts instead of Type 3 fonts +# +# Type 3 fonts embed bitmaps and are not allowed in camera-ready submissions +# for many conferences. TrueType fonts look better and are accepted. +# This follows: https://www.conference-publishing.com/Help.php +pdf.fonttype: 42 +ps.fonttype: 42 + +font.size: 12 +#font.family: serif +font.family: sans-serif +#font.sans-serif: ["Helvetica"] + +text.usetex: True + +## Enable tight_layout by default +# +# This ensures the plot has always sufficient space for legends, ... +# Without this sometimes parts of the figure would be cut off. +figure.autolayout: True + +#figure.figsize: 3, 2.5 +#figure.figsize: 4, 2.5 + +legend.frameon: False +legend.fontsize: 14 +legend.loc: upper center +legend.fancybox: False +legend.framealpha: 1.0 +legend.shadow: False +legend.borderaxespad: 0 +legend.edgecolor: gray +legend.handleheight: 1 +legend.labelspacing: 0.1 +legend.columnspacing: 0.8 +legend.handletextpad: 0.1 + +xtick.top: False +xtick.bottom: True +xtick.major.size: 3 +xtick.major.width: 0.4 + +ytick.left: True +ytick.right: False +ytick.direction: out +ytick.major.size: 3 +ytick.major.width: 0.4 + +axes.grid.axis: y +axes.grid: True + +# Hide the right and top spines +# +# This reduces the number of lines in the plot. Lines typically catch +# a readers attention and distract the reader from the actual content. +# By removing unnecessary spines, we help the reader to focus on +# the figures in the graph. 
+axes.spines.right: False +axes.spines.top: False + +grid.color: black +grid.alpha: 0.2 +grid.linewidth: 0.4 +grid.linestyle: dashed + + +savefig.bbox: tight +savefig.pad_inches: 0.02 diff --git a/plots-cgo2025-ae/cycles.py b/plots-cgo2025-ae/cycles.py new file mode 100644 index 00000000..36899dcd --- /dev/null +++ b/plots-cgo2025-ae/cycles.py @@ -0,0 +1,47 @@ +from typing import Iterable, Sequence, cast +import pandas as pd +import numpy as np +import numpy.typing as npt +from plot_utils import IMPL_COLORS, IMPL_MARKERS, plot_combined, GridPlotRow +from matplotlib.axes import Axes + +from math import log10, ceil, floor + + +class CyclesGridPlotRow(GridPlotRow): + ylabel = "Cycles" + + @classmethod + def yrange(cls, dfs: Sequence[pd.DataFrame]) -> npt.NDArray[np.float64]: + max_value = cast(float, max(_df.max().iloc[0] for _df in dfs)) + magnitude: float = 10 ** floor(log10(max_value)) + greater_round_number = ceil(max_value / magnitude) * magnitude + yrange = np.arange(0, greater_round_number + 1, greater_round_number // 10) + return yrange + + @classmethod + def plot_grid_cell( + cls, + ax: Axes, + df: pd.DataFrame, + *, + hide_xlabel: bool, + ) -> None: + for col in df: + ax.scatter( + x=df.index, + y=df[col], + color=IMPL_COLORS[col], + marker=IMPL_MARKERS[col], + ) + ax.set_xticks(df.index) + if not hide_xlabel: + ax.set_xlabel(df.index.name, fontsize=12) + + +def plot_cycles(cycles_dfs: tuple[pd.DataFrame, ...]): + return plot_combined( + CyclesGridPlotRow.get_rows(cycles_dfs, 4), + legend_cols=3, + rcparams_cfg_file="config/gridplot.mplstyle", + ) diff --git a/plots-cgo2025-ae/data.py b/plots-cgo2025-ae/data.py new file mode 100644 index 00000000..58104850 --- /dev/null +++ b/plots-cgo2025-ae/data.py @@ -0,0 +1,258 @@ +from collections.abc import Iterable +import pandas as pd +import numpy as np + +from enum import StrEnum + + +class Impl(StrEnum): + OURS = "Ours" + CLANG = "Clang" + MLIR = "MLIR" + + +class Operator(StrEnum): + CONV = "Conv 3x3" + FILL = "Fill" + MATMUL = "MatMul" + MATMUL_TRANSB = "MatMulT" + MAX_POOL = "Max Pool 3x3" + RELU = "ReLU" + SUM = "Sum" + SUM_POOL = "Sum Pool 3x3" + + +OPERATOR_BY_TEST = { + "conv2d_d1_s1_3x3": Operator.CONV, + "fill": Operator.FILL, + "matmul": Operator.MATMUL, + "matmul_transb": Operator.MATMUL_TRANSB, + "pooling_nchw_max_d1_s2_3x3": Operator.MAX_POOL, + "relu": Operator.RELU, + "dsum": Operator.SUM, + "sum": Operator.SUM, + "pooling_nchw_sum_d1_s2_3x3": Operator.SUM_POOL, +} + +PARAMS_BY_OPERATOR = { + Operator.CONV: ("M", "N"), + Operator.FILL: ("M", "N"), + Operator.MATMUL: ("M", "K", "N"), + Operator.MATMUL_TRANSB: ("M", "K", "N"), + Operator.MAX_POOL: ("M", "N"), + Operator.RELU: ("M", "N"), + Operator.SUM: ("M", "N"), + Operator.SUM_POOL: ("M", "N"), +} + +FLOPS_BY_OPERATOR = { + Operator.CONV: lambda m, n: 2 * 9 * n * m, + Operator.FILL: lambda m, n: n * m, + Operator.MATMUL: lambda m, k, n: 2 * n * m * k, + Operator.MATMUL_TRANSB: lambda m, k, n: 2 * n * m * k, + Operator.MAX_POOL: lambda m, n: 9 * n * m, + Operator.RELU: lambda m, n: n * m, + Operator.SUM: lambda m, n: n * m, + Operator.SUM_POOL: lambda m, n: 9 * n * m, +} +""" +FLOPS adjusted for whether the operation can benefit from the fmadd instruction. 
+""" + +OPERAND_SHAPES_BY_OPERATOR = { + Operator.CONV: lambda m, n: ((m, n),), + Operator.FILL: lambda m, n: ((m, n),), + Operator.MATMUL: lambda m, k, n: ((m, k), (k, n)), + Operator.MATMUL_TRANSB: lambda m, k, n: ((m, k), (n, k)), + Operator.MAX_POOL: lambda m, n: ((m, n),), + Operator.RELU: lambda m, n: ((m, n),), + Operator.SUM: lambda m, n: ((m, n), (m, n)), + Operator.SUM_POOL: lambda m, n: ((m, n),), +} + +FMA_OPERATORS = {Operator.CONV, Operator.MATMUL, Operator.MATMUL_TRANSB} + + +def _get_kernels(filename: str) -> pd.DataFrame: + df = pd.read_csv(filename) + df.replace( + { + "linalg_xdsl": Impl.OURS, + "snitch_stream": Impl.OURS, + "baseline": Impl.CLANG, + "linalg": Impl.MLIR, + **OPERATOR_BY_TEST, + }, + inplace=True, + ) + df = df[df.impl.isin(set(Impl))] + df.set_index(["test", "params"], inplace=True) + # Get the result of adding_overhead for each operator and concatenate the dataframes + df_with_overhead = pd.concat([ + adding_overhead(df[df.index.get_level_values(0) == operator], operator) + for operator in Operator + if operator in df.index.get_level_values(0) + ]) + return df_with_overhead + + +def get_kernels(cleaned: bool = True) -> pd.DataFrame: + df = _get_kernels("results/kernels.csv") + # Drop unknown operators + df = df[df.index.get_level_values(0).isin(tuple(Operator))] + if cleaned: + # exclude K=400 matmul entries + df = df[df.index.get_level_values(1) != "1x400x25xf64"] + return df + + +def get_low_level_representation() -> pd.DataFrame: + return _get_kernels("results/kernels.low_level_representation.csv") + + +def get_pivoted_all(kernels_df: pd.DataFrame) -> pd.DataFrame: + return kernels_df.pivot(columns="impl") + + +def get_pivoted_fpu(pivoted_all_df: pd.DataFrame) -> pd.DataFrame: + return pivoted_all_df["fpss_fpu_occupancy"] + + +def get_pivoted_cycles(pivoted_all_df: pd.DataFrame) -> pd.DataFrame: + return pivoted_all_df[["cycles", "Min Cycles", "Overhead", "FLOPs", "Throughput", "Max Throughput"]] + + +def get_flops(operator_df: pd.DataFrame, operator: Operator) -> pd.Series: + operator_series: list[pd.Series] = [ + operator_df[param] for param in PARAMS_BY_OPERATOR[operator] + ] + return FLOPS_BY_OPERATOR[operator](*operator_series) + + +def get_overhead( + kernels_operator_df: pd.DataFrame, + operator: Operator, +) -> pd.DataFrame: + cols = PARAMS_BY_OPERATOR[operator] + col_vals: pd.DataFrame = kernels_operator_df.index.get_level_values(1).str.extract( + "x".join((r"(\d+)" for _ in range(len(cols)))) + r"xf(\d+)" + ) + col_vals.columns = cols + ("bitwidth",) + col_vals.index = kernels_operator_df.index + col_vals = col_vals.apply(pd.to_numeric) + flops = get_flops(col_vals, operator) + assert (64 % col_vals.bitwidth == 0).all() + throughput = flops / kernels_operator_df["cycles"] + max_throughput = (2 if operator in FMA_OPERATORS else 1) * 64 // col_vals["bitwidth"] + rel_throughput = throughput / max_throughput + min_cycles = np.ceil((flops / max_throughput)) + overhead = kernels_operator_df["cycles"] - min_cycles + res = pd.DataFrame({ + "Min Cycles": min_cycles, + "Overhead": overhead, + "FLOPs": flops, + "Throughput": throughput, + "Max Throughput": max_throughput, + "Rel Throughput": rel_throughput, + "bitwidth": col_vals["bitwidth"] + }) + return res + +def adding_overhead( + operator_df: pd.DataFrame, + operator: Operator, +) -> pd.DataFrame: + return pd.concat( + ( + operator_df, + get_overhead( + operator_df, + operator + ), + ), + axis=1, + ) + +def get_operator_df( + pivoted_df: pd.DataFrame, operator: Operator, *, bitwidth: int +) -> 
pd.DataFrame: + cols = PARAMS_BY_OPERATOR[operator] + operator_df = pivoted_df.loc[operator.value] + col_vals = operator_df.index.str.extract( + "x".join((r"(\d+)" for _ in range(len(cols)))) + r"xf(\d+)" + ) + col_vals.columns = cols + ("bitwidth",) + col_vals.index = operator_df.index + df = pd.concat( + ( + operator_df, + col_vals.apply(pd.to_numeric), + ), + axis=1, + ) + df = df[df.bitwidth == bitwidth] + df.index.name = operator.value + return df + + +def get_params_dfs(operator_df: pd.DataFrame) -> Iterable[pd.DataFrame]: + name = operator_df.index.name + cols = PARAMS_BY_OPERATOR[name] + maxs = tuple(operator_df[col].max() for col in cols) + operand_shapes_map = OPERAND_SHAPES_BY_OPERATOR[name] + for i, col in enumerate(cols): + my_df = operator_df + name_components: list[str] = [] + for j, other_col in enumerate(cols): + if i != j: + name_components.append(str(int(maxs[j]))) + my_df = my_df[my_df[other_col] == maxs[j]] + else: + name_components.append(col) + other_cols = list(cols) + ["bitwidth"] + del other_cols[i] + shape_string = " ".join("x".join(t) for t in operand_shapes_map(*name_components)) + new_name = f"{name} {shape_string}" + my_df = my_df.rename(columns={col: new_name}).set_index(new_name).sort_index() + my_df.drop(other_cols, axis=1, inplace=True) + yield my_df + + +def get_regalloc() -> pd.DataFrame: + regalloc_df = pd.read_csv("results/regalloc.csv") + regalloc_df = regalloc_df[regalloc_df["impl"].isin(OPERATOR_BY_TEST)] + regalloc_df.replace(OPERATOR_BY_TEST, inplace=True) + regalloc_df = regalloc_df[~regalloc_df["params"].str.contains("f16")] + regalloc_df.reset_index(drop=True, inplace=True) + param_components = tuple(param.split("x") for param in regalloc_df["params"]) + bitwidths = tuple({"Bits": param[-1][1:]} for param in param_components) + params = tuple( + {p: v for p, v in zip("MNK", param[:-1])} for param in param_components + ) + params_df = pd.DataFrame(params).fillna("{--}") + regalloc_df = pd.concat((regalloc_df, pd.DataFrame(bitwidths), params_df), axis=1) + del regalloc_df["params"] + + # Reorder columns to move Cycles and Occupancy to the end + cols = regalloc_df.columns.tolist() + cols = [col for col in cols if col not in ['allocated_float', 'allocated_int']] + [ + 'allocated_float', + 'allocated_int', + ] + regalloc_df = regalloc_df[cols] + + return regalloc_df + + +def get_opt_pipeline() -> pd.DataFrame: + opt_pipeline_df = pd.read_csv("results/pipeline.csv") + opt_pipeline_df = opt_pipeline_df.rename( + columns={ + "FPU Occupancy [%]": "Occupancy", + "FMAdd Issues": "FMAdd", + "FRep Count": "FRep", + "variant": "Optimizations", + } + ) + + return opt_pipeline_df diff --git a/plots-cgo2025-ae/fp_throughput.py b/plots-cgo2025-ae/fp_throughput.py new file mode 100644 index 00000000..db341377 --- /dev/null +++ b/plots-cgo2025-ae/fp_throughput.py @@ -0,0 +1,68 @@ +#!/usr/bin/env python3 + +import pandas as pd +import seaborn as sns +import matplotlib.pyplot as plt +import argparse +import numpy as np + +LABELS = { + "conv2d_d1_s1_3x3": "2D Convolution", + "ddot": "Inner Product", + "dense": "Dense ReLU Layer", + "dsum": "Vector\nElement-wise Add", + "matmul": "Matrix Multiplication", + "pooling_nchw_max_d1_s2_3x3": "Max Pooling Layer", + "pooling_nchw_sum_d1_s2_3x3": "Sum Pooling Layer", +} + +def add_metrics(data: pd.DataFrame) -> pd.DataFrame: + if "fp_inst_throughput" not in data: + data["fp_inst_throughput"] = data["fpss_fpu_issues"] / data["cycles"] + if "fp_flop_throughput" not in data: + data["fp_flop_throughput"] = ( + 
data["fpss_fpu_issues"] + data["fpss_fpu_fmadd_issues"] + ) / data["cycles"] + return data + +def generate_throughput(data): + df = data[data["impl"].isin(["snitch_stream", "linalg_xdsl"])] + # If multiple rows (experiments) are present for a single 'test', + # just pick the best one: + df = df.loc[df.groupby("test")["fp_flop_throughput"].idxmax()] + # Use meaningful labels: + df["test"] = df["test"].map(LABELS) + fig, ax = plt.subplots(figsize=(8, 9)) + sns.histplot( + ax=ax, + data=df, + x="test", + weights="fp_flop_throughput", + legend=False, + edgecolor=None, + shrink=0.9, + ) + plt.axhline(y=2, color="grey", linestyle="--", linewidth=1) + plt.axhline(y=1, color="grey", linestyle="--", linewidth=0.5) + plt.xticks(rotation=45, fontsize=8) + ax.set_title("Kernels FP sustained throughput @ f64") + ax.set_xlabel("") + ax.set_ylabel("FLOP/cycle") + ax.set_yticks(np.arange(0, 2.1, 0.1)) + return fig + + +def main(): + parser = argparse.ArgumentParser( + description="Generate kernels FP throughput from CSV data." + ) + parser.add_argument("csv_file", help="Path to the CSV file") + args = parser.parse_args() + data = pd.read_csv(args.csv_file) + data = add_metrics(data) + fig = generate_throughput(data) + fig.savefig(f"fp_throughput.pdf", format="pdf") + + +if __name__ == "__main__": + main() diff --git a/plots-cgo2025-ae/fpu.py b/plots-cgo2025-ae/fpu.py new file mode 100644 index 00000000..911222be --- /dev/null +++ b/plots-cgo2025-ae/fpu.py @@ -0,0 +1,74 @@ +from typing import Sequence +from matplotlib.axes import Axes +import pandas as pd +import numpy as np +import numpy.typing as npt +from data import Impl, Operator, get_operator_df, get_params_dfs +from plot_utils import IMPL_COLORS, IMPL_MARKERS, GridPlotRow, plot_combined + + +def all_plot_dfs( + pivoted_df: pd.DataFrame, operators: tuple[Operator, ...] 
+) -> tuple[pd.DataFrame, ...]: + return tuple( + param_df + for operator in operators + for param_df in get_params_dfs( + get_operator_df(pivoted_df, operator, bitwidth=64) + ) + ) + + +def get_fpu(pivoted_fpu_df: pd.DataFrame) -> tuple[pd.DataFrame, ...]: + return all_plot_dfs( + pivoted_fpu_df.filter([Impl.OURS, Impl.CLANG, Impl.MLIR]), + ( + Operator.SUM, + Operator.FILL, + Operator.RELU, + Operator.CONV, + Operator.MAX_POOL, + Operator.SUM_POOL, + # Operator.MATMUL, # Matmul included in other plot + ), + ) + + +class FPUGridPlotRow(GridPlotRow): + ylabel = "FPU Utilization" + + @classmethod + def yrange(cls, dfs: Sequence[pd.DataFrame]) -> npt.NDArray[np.float64]: + return np.arange(0.0, 1.1, 0.1) + + @classmethod + def plot_grid_cell( + cls, + ax: Axes, + df: pd.DataFrame, + *, + hide_xlabel: bool, + ) -> None: + for col in df: + ax.scatter( + x=df.index, + y=df[col], + color=IMPL_COLORS[col], + marker=IMPL_MARKERS[col], + ) + ax.set_xticks(df.index) + if not hide_xlabel: + ax.set_xlabel(df.index.name) + + @classmethod + def get_roofline(cls, df: pd.DataFrame) -> float | None: + df['Performance Roofline'] = 1.0 + return 1.0 + + +def plot_fpu(fpu_dfs: tuple[pd.DataFrame, ...]): + return plot_combined( + FPUGridPlotRow.get_rows(fpu_dfs, 6, hide_xtick_labels=[True, False]), + legend_cols=4, + rcparams_cfg_file="config/gridplot.mplstyle", + ) diff --git a/plots-cgo2025-ae/heatmap.py b/plots-cgo2025-ae/heatmap.py new file mode 100644 index 00000000..1b120432 --- /dev/null +++ b/plots-cgo2025-ae/heatmap.py @@ -0,0 +1,141 @@ +#!/usr/bin/env python3 + +import numpy as np +import pandas as pd +import seaborn as sns +import matplotlib.pyplot as plt +import argparse + + +def add_metrics(data: pd.DataFrame) -> pd.DataFrame: + if "fp_inst_throughput" not in data: + data["fp_inst_throughput"] = data["fpss_fpu_issues"] / data["cycles"] + if "fp_flop_throughput" not in data: + data["fp_flop_throughput"] = ( + data["fpss_fpu_issues"] + data["fpss_fpu_fmadd_issues"] + ) / data["cycles"] + return data + + +def highlight_1_8_cells(ax: plt.axes, df: pd.DataFrame): + Ks = df["K"].unique() + Ns = df["N"].unique() + threshold = 1.8 + + # Find the first index in every row with a value above 90 + for i, k in enumerate(Ks): + row = df[df["K"] == k] + left_index = row[row["fp_flop_throughput"] >= threshold]["N"].min() + if pd.notna(left_index): + ax.axvline( + x=(left_index - 4) / 4, + ymin=(k - 4) / 64, + ymax=k / 64, + color='white', + linewidth=1, + alpha=1, + ) + + # Find the last index in every column with a value above 90 + for j, n in enumerate(Ns): + col = df[df["N"] == n] + bottom_index = col[col["fp_flop_throughput"] >= threshold]["K"].min() + if pd.notna(bottom_index): + ax.axhline( + xmin=(n - 4) / 64, + xmax=n / 64, + y=(bottom_index - 4) / 4, + color='white', + linewidth=1, + alpha=1, + ) + + +def generate_heatmaps(data: pd.DataFrame): + data[["M", "K", "N", "bitwidth"]] = data["params"].str.extract( + r"(\d+)x(\d+)x(\d+)xf(\d+)" + ) + data[["M", "K", "N", "bitwidth"]] = data[["M", "K", "N", "bitwidth"]].astype(int) + + # filter out K values unrelated to the heatmap experimental runs + data = data[(data["K"] <= 65)] + + sns.set(rc={'text.usetex': True}) + + for m_value, m_group in data.groupby("M"): + selection = m_group[["K", "N", "fp_flop_throughput"]] + pivot = ( + selection.pivot(index="K", columns="N", values="fp_flop_throughput") + / 2 + * 100 + ).apply(np.floor) + + # find the min value to use in colorbar + min_val = (pivot.agg('min').agg('min') // 10) * 10 + + fig, ax = 
plt.subplots(figsize=(5, 4)) + sns.heatmap( + pivot, + ax=ax, + annot=True, + fmt=".0f", + cmap="YlGnBu", + vmin=min_val, + vmax=100, + annot_kws={"fontsize": 9}, + cbar_kws={ + "orientation": "horizontal", + "aspect": 35, + "shrink": 0.85, + "pad": 0.07, + }, + ) + # ax.set_title( + # "Matrix multiplication, $C_{{M \\times N}} = A_{{M \\times K}} B_{{K \\times N}}$ with $M={}$ @ f64".format( + # m_value + # ) + # ) + ax.tick_params( + axis="both", + which="major", + labelbottom=True, + bottom=False, + top=False, + labeltop=False, + pad=0.0, + length=4, + ) + ax.invert_yaxis() # make sure bottom-left corner is origin for both dimensions + ax.set_xlabel("$N$") + ax.set_ylabel("$K$", rotation=0) + plt.yticks(rotation=0) + ax.yaxis.set_label_coords(-0.05, 0.95, transform=None) + ax.xaxis.set_label_coords(1, -0.05, transform=None) + + # Skip every second x-axis label + for label in ax.xaxis.get_ticklabels()[1::2]: + label.set_visible(False) + + cbar = ax.collections[0].colorbar + cbar.set_label("\% of FLOP/cycle Roofline", labelpad=2, fontsize=10) + cbar.ax.tick_params(size=0) + + plt.tight_layout() + highlight_1_8_cells(ax, selection) + yield m_value, fig + + +def main(): + parser = argparse.ArgumentParser(description="Generate heatmaps from CSV data.") + parser.add_argument("csv_file", help="Path to the CSV file") + args = parser.parse_args() + data = pd.read_csv(args.csv_file) + data = add_metrics(data) + # FIXME we are able to generate snitch_stream matmul only at the moment + data = data.loc[(data["test"] == "matmul") & (data["impl"] == "linalg_xdsl")] + for m, fig in generate_heatmaps(data): + fig.savefig(f"matmul_heatmap_M_{m}.pdf", format="pdf", bbox_inches="tight") + + +if __name__ == "__main__": + main() diff --git a/plots-cgo2025-ae/low_level_representation.py b/plots-cgo2025-ae/low_level_representation.py new file mode 100644 index 00000000..a986e082 --- /dev/null +++ b/plots-cgo2025-ae/low_level_representation.py @@ -0,0 +1,83 @@ +from matplotlib.figure import Figure + +import pandas as pd +from typing import Sequence, NamedTuple +from plot_utils import plot_combined + +from fpu import FPUGridPlotRow +from data import ( + get_pivoted_all, + get_pivoted_fpu, + get_pivoted_cycles, + Operator, + get_params_dfs, + get_operator_df, + Impl, +) +from throughput import ThroughputGridPlotRow +from cycles import CyclesGridPlotRow + + +class LLRDataFrames(NamedTuple): + fpu_dfs: Sequence[pd.DataFrame] + throughput_dfs: Sequence[pd.DataFrame] + cycles_dfs: Sequence[pd.DataFrame] + + +def get_llr_dfs(llr_kernels_df: pd.DataFrame) -> LLRDataFrames: + llr_pivoted_all_df = ( + get_pivoted_all(llr_kernels_df) + .loc[:, (slice(None), 'Ours')] + .droplevel('impl', axis=1) + ) + llr_pivoted_fpu_df = pd.DataFrame( + get_pivoted_fpu(llr_pivoted_all_df).rename(Impl.OURS) + ) + llr_pivoted_cycles_df = get_pivoted_cycles(llr_pivoted_all_df).rename( + columns={"cycles": Impl.OURS} + ) + operators = (Operator.SUM, Operator.RELU, Operator.MATMUL_TRANSB) + llr_fpu_dfs = tuple( + param_df + for operator in operators + for param_df in get_params_dfs( + get_operator_df(llr_pivoted_fpu_df, operator, bitwidth=32) + ) + ) + # Remove the matmul_t 1 fpu df + llr_fpu_dfs = llr_fpu_dfs[:4] + llr_fpu_dfs[5:] + llr_throughput_dfs: list[pd.DataFrame] = [] + llr_cycles_dfs: list[pd.DataFrame] = [] + for operator in operators: + llr_operator_df = get_operator_df(llr_pivoted_cycles_df, operator, bitwidth=32) + llr_operator_params_dfs = tuple(get_params_dfs(llr_operator_df)) + # Remove the matmul_t 1 params df + if operator 
== Operator.MATMUL_TRANSB: + llr_operator_params_dfs = llr_operator_params_dfs[1:] + llr_operator_throughput_dfs = tuple( + pd.DataFrame( + {"Ours": df["Throughput"], "Performance Roofline": df["Max Throughput"]} + ) + for df in llr_operator_params_dfs + ) + llr_operator_cycles_dfs = tuple( + df[[Impl.OURS, "Min Cycles", "Overhead"]] for df in llr_operator_params_dfs + ) + llr_throughput_dfs.extend(llr_operator_throughput_dfs) + llr_cycles_dfs.extend(llr_operator_cycles_dfs) + + return LLRDataFrames(llr_fpu_dfs, llr_throughput_dfs, llr_cycles_dfs) + + +def plot_llr(llr_dfs: LLRDataFrames) -> Figure: + return plot_combined( + FPUGridPlotRow.get_rows( + llr_dfs.fpu_dfs, 6, hide_xlabel=True, hide_xtick_labels=[True] + ) + + ThroughputGridPlotRow.get_rows( + llr_dfs.throughput_dfs, 6, hide_xlabel=True, hide_xtick_labels=[True] + ) + + CyclesGridPlotRow.get_rows(llr_dfs.cycles_dfs, 6), + legend_cols=4, + rcparams_cfg_file="config/gridplot.mplstyle", + ) diff --git a/plots-cgo2025-ae/max_util.py b/plots-cgo2025-ae/max_util.py new file mode 100644 index 00000000..188706ea --- /dev/null +++ b/plots-cgo2025-ae/max_util.py @@ -0,0 +1,23 @@ + +from typing import Sequence +import pandas as pd + +from data import Impl, Operator + +def get_max_util(llr_kernels_df: pd.DataFrame, fpu_dfs: Sequence[pd.DataFrame]) -> str: + llr_max_occupancy = llr_kernels_df["fpss_fpu_occupancy"].max() + llr_max_throughput = llr_kernels_df["Rel Throughput"].max() + llr_matmult_throughput = llr_kernels_df["Throughput"][Operator.MATMUL_TRANSB, :].max() + llr_matmult_max_occupancy = llr_kernels_df["fpss_fpu_occupancy"][Operator.MATMUL_TRANSB, :].max() + proto_comp_max_occupancy = max(fpu_df[Impl.OURS].max() for fpu_df in fpu_dfs) + proto_comp_min_max_occupancy = min_max = min(_df[Impl.OURS].max() for _df in fpu_dfs) + clang_max_occupancy = max(fpu_df[Impl.CLANG].max() for fpu_df in fpu_dfs) + return f"""\ +\\newdelimitedcommand{{maxutilprotocomp}}{{{proto_comp_max_occupancy*100:.0f}\\%}} +\\newdelimitedcommand{{minmaxutilprotocomp}}{{{proto_comp_min_max_occupancy*100:.0f}\\%}} +\\newdelimitedcommand{{maxutilclang}}{{{clang_max_occupancy*100:.0f}\\%}} +\\newdelimitedcommand{{maxutilllr}}{{{llr_max_occupancy*100:.0f}\\%}} +\\newdelimitedcommand{{maxutilllrmatmult}}{{{llr_matmult_max_occupancy*100:.0f}\\%}} +\\newdelimitedcommand{{maxrelthroughputllr}}{{{llr_max_throughput*100:.0f}\\%}} +\\newdelimitedcommand{{maxabsthroughputllrmatmult}}{{{llr_matmult_throughput:.2f}}} +""" diff --git a/plots-cgo2025-ae/opt_pipeline.py b/plots-cgo2025-ae/opt_pipeline.py new file mode 100644 index 00000000..53f41c67 --- /dev/null +++ b/plots-cgo2025-ae/opt_pipeline.py @@ -0,0 +1,115 @@ +import pandas as pd + +col_names = { + 'Optimizations': '', + 'F Registers': 'FP', + 'X Registers': 'Integer', + 'F Loads': 'Loads', + 'F Stores': 'Stores', + 'FMAdd': 'FMAdd', + 'FRep': 'FRep', + 'Cycles': 'Cycles (\#)', + 'Occupancy': 'Occupancy (\%)', +} + +col_alignment = { + 'Optimizations': 'l', + 'F Registers': 'S[table-format=2.0]', + 'X Registers': 'S[table-format=2.0]', + 'F Loads': 'S[table-format=4.0]', + 'F Stores': 'S[table-format=4.0]', + 'FMAdd': 'S[table-format=4.0]', + 'FRep': 'S[table-format=1.0]', + 'Cycles': 'S[table-format=5.0]', + 'Occupancy': 'S[table-format=2.2]', +} + + +def get_opt_pipeline_table(opt_pipeline_df: pd.DataFrame) -> str: + # Reorder columns to move Cycles and Occupancy to the end + cols = opt_pipeline_df.columns.tolist() + cols = [col for col in cols if col not in ['Cycles', 'Occupancy']] + [ + 'Cycles', + 'Occupancy', + ] 
+ opt_pipeline_df = opt_pipeline_df[cols] + del opt_pipeline_df["params"] + latex_table = "\\begin{table*}[h]\n" + + latex_table += "\\sisetup{group-separator = {\ },group-minimum-digits=3}\n" + + latex_table += ( + "\\setlength\\tabcolsep{0pt} % let LaTeX compute intercolumn whitespace\n" + ) + + latex_table += "\\caption{" + latex_table += ( + "Our compilation pipeline leverages custom " + "\\ac{isa} extensions and knowledge of \\ac{fpu} design in order to achieve " + "over 90\\% \\ac{fpu} occupancy for the MatMul kernel, " + "operating on 1$\\times$200 and 200$\\times$5 64-bit inputs. " + "Incrementally adding each optimization minimizes and, " + "eventually eliminates, explicit memory operations, while reducing " + "execution time (cycles) and maximizing \\ac{fpu} utilization." + ) + latex_table += "}\n\\label{tab:opt_pipeline}\n" + + latex_table += ( + "\\centering\n\\begin{tabular*}{\\textwidth}{@{\\extracolsep{\\fill}}" + + " ".join(f"{col_alignment[col]}" for col in opt_pipeline_df.columns) + + "}\n\\toprule\n" + ) + + latex_table += "\\textbf{Optimizations} & \\multicolumn{2}{r}{\\textbf{Allocated Registers (\\#)}} & \\multicolumn{4}{c}{\\textbf{Assembly Operations (\\#)}} & \\multicolumn{2}{c}{\\textbf{Performance}}\\\\\n" + + latex_table += "\\cmidrule{2-3}\\cmidrule{4-7}\\cmidrule{8-9}\n" + + latex_table += ( + " & ".join(f"\\textbf{{{col_names[col]}}}" for col in opt_pipeline_df.columns) + + " \\\\\n\\midrule\n" + ) + + string_table = [] + + # Add the rest of the rows + for _, row in opt_pipeline_df.iterrows(): + string_table.append([str(x) for x in row]) + + # replace text for last row in the first column + string_table[-1][0] = "+ Unroll-and-Jam" + + # change text style for first column + for row in string_table: + row[0] = f"\\texttt{{{row[0]}}}" + + # replace text for baseline which should be at the first row and column + string_table[0][0] = "Baseline (for MatMul)" + + # add max register count for fp registers + for row in string_table: + row[1] = row[1] + "\\textcolor{lightgray}{/20}" + + # add max register count for int registers + for row in string_table: + row[2] = row[2] + "\\textcolor{lightgray}{/15}" + + # replace text for baseline which should be at the first row and column + string_table[0][0] = "Baseline (for MatMul)" + + # gray out baseline which should be the first line + for idx, val in enumerate(string_table[0]): + string_table[0][idx] = "\\color{gray} " + val + + # highlight rightmost entry which should be the max FPU util achieved + string_table[-1][-1] = "\\textbf{" + string_table[-1][-1] + "}" + + for row in string_table: + latex_table += " & ".join(val for val in row) + " \\\\\n" + + latex_table += "\\bottomrule\n" + + latex_table += "\\end{tabular*}\n" + + latex_table += "\\end{table*}\n" + + return latex_table diff --git a/plots-cgo2025-ae/pass_improvements.py b/plots-cgo2025-ae/pass_improvements.py new file mode 100644 index 00000000..90b70f2f --- /dev/null +++ b/plots-cgo2025-ae/pass_improvements.py @@ -0,0 +1,70 @@ +#!/usr/bin/env python3 +import pandas as pd +import argparse + + +def extract(df): + mask = df["impl"].str.contains(r"linalg_\d_xdsl|linalg_full_xdsl", regex=True) + return df[mask] + + +def pass_order_mapping(n_passes=10): + custom_order = [f"linalg_{i}_xdsl" for i in range(n_passes)] + ["linalg_full_xdsl"] + return {name: index for index, name in enumerate(custom_order)} + + +def add_linalg_passes(data): + df = data.copy() + # Pass order + df["linalg_pass_order"] = df["impl"].map(pass_order_mapping()) + df = df.sort_values(by=["test", 
"params", "linalg_pass_order"]) + # fpss_fpu_occupancy relative improvement + df["linalg_pass_relative_improvement"] = ( + df.groupby(["test", "params"])["fpss_fpu_occupancy"].diff().fillna(0) + ) + # group min/max/delta + group_max = df.groupby(["test", "params"])["fpss_fpu_occupancy"].max().reset_index() + group_max = group_max.rename( + columns={"fpss_fpu_occupancy": "linalg_pass_group_max"} + ) + group_min = df.groupby(["test", "params"])["fpss_fpu_occupancy"].min().reset_index() + group_min = group_min.rename( + columns={"fpss_fpu_occupancy": "linalg_pass_group_min"} + ) + group_delta = pd.merge(group_min, group_max, on=["test", "params"]) + group_delta["linalg_pass_group_delta"] = ( + group_delta["linalg_pass_group_max"] - group_delta["linalg_pass_group_min"] + ) + # overall pass % contribution + df = pd.merge(df, group_delta, on=["test", "params"]) + df["linalg_pass_%_contribution"] = ( + df["linalg_pass_relative_improvement"].abs() + / df["linalg_pass_group_delta"].abs() + ) + return df + + +def get_pass_contributions_table(df): + pivot_df = df.pivot( + index=["test", "params"], columns="impl", values="linalg_pass_%_contribution" + ) + pivot_df.columns.name = None + pivot_df.reset_index(inplace=True) + return pivot_df + + +def main(): + parser = argparse.ArgumentParser( + description="Generate pass improvements CSV table." + ) + parser.add_argument("csv_file", help="Path to the input CSV file") + args = parser.parse_args() + data = pd.read_csv(args.csv_file) + df = extract(data) + df = add_linalg_passes(df) + df = get_pass_contributions_table(df) + print(df.to_csv(index=False)) + + +if __name__ == "__main__": + main() diff --git a/plots-cgo2025-ae/pass_improvements_stacked.py b/plots-cgo2025-ae/pass_improvements_stacked.py new file mode 100644 index 00000000..714d765c --- /dev/null +++ b/plots-cgo2025-ae/pass_improvements_stacked.py @@ -0,0 +1,133 @@ +#!/usr/bin/env python3 + +import matplotlib.pyplot as plt +import numpy as np +import argparse +import pandas as pd + + +def generate_stacked_bars(labels, passes, values): + + n_bars = values.shape[0] + n_values = values.shape[1] + + cmap = plt.get_cmap("viridis", n_values) + colors = [cmap(i) for i in range(n_values)] + + # Create the stacked bar plot + fig, ax = plt.subplots(figsize=(10, 6)) + + bar_width = 0.1 + bar_tick = 0.15 + x = np.array([bar_tick * v for v in range(n_bars)]) + bottom = np.zeros(n_bars) + + for i in range(n_values): + segments = ax.bar( + x, + values[:, i], + bottom=bottom, + width=bar_width, + color=colors[i], + label=passes[i], + ) + bottom += values[:, i] + + # Add segment values inside each segment + if i == 0 or i == n_values - 1: + continue + for bar in segments: + height = bar.get_height() + # Calculate vertical position for the text within each segment + text_y = bar.get_y() + height / 2.0 + ax.text( + bar.get_x() + bar.get_width() / 2, + text_y, + f"{height:.2f}", + ha="center", + va="center", + color="white", + fontsize=10, + fontweight="bold", + ) + + # Add much thicker white zigzag line with 3 less sloped segments across the full width of each bar + zigzag_height = ( + values[:, -1] * 0.1 + ) # Reduced to 30% of the top segment height for less slope + + for i in range(n_bars): + bar_top = np.sum(values[i]) + zigzag_center = bar_top - values[i, -1] / 2 # Center of the top segment + zigzag_top = zigzag_center + zigzag_height[i] / 2 + zigzag_bottom = zigzag_center - zigzag_height[i] / 2 + + # Create 4 points for 3 straight segments with less slope + zigzag_x = [ + x[i] - bar_width / 2, + x[i] - 
bar_width / 4, + x[i] + bar_width / 4, + x[i] + bar_width / 2, + ] + zigzag_y = [zigzag_bottom, zigzag_top, zigzag_bottom, zigzag_top] + + ax.plot( + zigzag_x, zigzag_y, color="white", linewidth=16, solid_capstyle="round" + ) # Doubled linewidth to 16 + + ax.axhline(y=100, color="black", linewidth=2, zorder=3) + + ax.set_xlim(-0.1, n_bars * bar_tick) + + # Add labels to the right of the last bar + fontdict = {"family": "Arial", "size": 12, "weight": "bold"} + for i in range(1, n_values - 1): + y_position = sum(values[0, :i]) + values[0, i] / 2 + ax.text( + x[-1] + bar_width / 2 + 0.01, + y_position, + passes[i], + va="center", + color=colors[i], + fontdict=fontdict, + ) + + ax.set_xticks(x) + ax.set_xticklabels(labels) + ax.get_yaxis().set_visible(False) + for key, spine in ax.spines.items(): + spine.set_visible(False) + + plt.tight_layout() + return fig + + +def main(): + parser = argparse.ArgumentParser( + description="Generate pass improvements stacked bars plot from pass improvements CSV table." + ) + parser.add_argument("csv_file", help="Path to the input pass improvements CSV file") + args = parser.parse_args() + df = pd.read_csv(args.csv_file) + df["label"] = df["test"] + " " + df["params"] + df.set_index("label", inplace=True) + df.drop(["test", "params"], axis=1, inplace=True) + df *= 100.0 + print(df) + column_to_pass = { + "linalg_0_xdsl": "baseline", + "linalg_1_xdsl": "memref-stream-tile-outer-loops", + "linalg_2_xdsl": "memref-stream-unnest-out-parameters", + "linalg_3_xdsl": "memref-stream-interleave", + "linalg_4_xdsl": "memref-streamify", + "linalg_full_xdsl": "convert-riscv-scf-for-to-frep", + } + passes = [column_to_pass[column] for column in df.columns] + df["remainder"] = 40.0 + passes.append("remainder") + fig = generate_stacked_bars(df.index, passes=passes, values=df.values) + fig.savefig(f"pass_improvements.pdf", format="pdf", bbox_inches="tight") + + +if __name__ == "__main__": + main() diff --git a/plots-cgo2025-ae/plot.py b/plots-cgo2025-ae/plot.py new file mode 100644 index 00000000..2e7fcae4 --- /dev/null +++ b/plots-cgo2025-ae/plot.py @@ -0,0 +1,50 @@ +#!/usr/bin/env python3 +from data import ( + get_low_level_representation, + get_opt_pipeline, + get_pivoted_all, + get_pivoted_fpu, + get_kernels, +) +from fpu import get_fpu, plot_fpu +from low_level_representation import get_llr_dfs, plot_llr +from max_util import get_max_util +from regalloc import get_regalloc, print_regalloc +from plot_utils import savefig +from opt_pipeline import get_opt_pipeline_table + + +def main(): + kernels_df = get_kernels() + pivoted_all_df = get_pivoted_all(kernels_df) + pivoted_fpu_df = get_pivoted_fpu(pivoted_all_df) + + # Plot FPU utilization + fpu_dfs = get_fpu(pivoted_fpu_df) + fpu_fig = plot_fpu(fpu_dfs) + savefig(fpu_fig, "fpu.pdf") + + # Print the regalloc stats + regalloc_df = get_regalloc() + print_regalloc(regalloc_df, filename="regalloc.tex") + + # Plot low-level representation + llr_kernels_df = get_low_level_representation() + llr_dfs = get_llr_dfs(llr_kernels_df) + llr_fig = plot_llr(llr_dfs) + savefig(llr_fig, "low_level_representation.pdf") + + # Print opt pipeline table + opt_pipeline_df = get_opt_pipeline() + opt_pipeline_table = get_opt_pipeline_table(opt_pipeline_df) + with open("opt_pipeline.tex", "w") as f: + f.write(opt_pipeline_table) + + # Print max utilization stats + max_util_macros = get_max_util(llr_kernels_df, fpu_dfs) + with open("max_util.tex", "w") as f: + f.write(max_util_macros) + + +if __name__ == "__main__": + main() diff --git 
a/plots-cgo2025-ae/plot_utils.py b/plots-cgo2025-ae/plot_utils.py new file mode 100644 index 00000000..e03e8d43 --- /dev/null +++ b/plots-cgo2025-ae/plot_utils.py @@ -0,0 +1,241 @@ +from collections.abc import Iterable +from typing import ClassVar, Sequence, cast +import os +import pandas as pd +import matplotlib.pyplot as plt +from matplotlib.axes import Axes +from matplotlib.figure import Figure +import numpy as np +import numpy.typing as npt +import seaborn as sns +from matplotlib.lines import Line2D +from data import Impl +from abc import ABC, abstractmethod + +# Color palette +light_gray = "#cacaca" +dark_gray = "#827b7b" +light_blue = "#a6cee3" +dark_blue = "#1f78b4" +light_green = "#b2df8a" +dark_green = "#33a02c" +light_red = "#fb9a99" +dark_red = "#e31a1c" +black = "#000000" +white = "#ffffff" + +COLORS = [ + light_gray, + dark_gray, + light_blue, + dark_blue, + light_green, + dark_green, + light_red, + dark_red, +] + + +IMPL_COLORS = { + Impl.OURS.value: dark_green, + Impl.CLANG.value: light_blue, + Impl.MLIR.value: dark_blue, + "Min Cycles": dark_gray, + "Overhead": dark_red, + "Performance Roofline": dark_gray, +} + +IMPL_MARKERS = { + Impl.OURS.value: 'o', + Impl.CLANG.value: 's', + Impl.MLIR.value: 'v', + "Min Cycles": '^', + "Overhead": 'x', + "Performance Roofline": "", +} + +IMPL_LINESTYLES = { + Impl.OURS.value: '', + Impl.CLANG.value: '', + Impl.MLIR.value: '', + "Min Cycles": '', + "Overhead": '', + "Performance Roofline": "--", +} + + +class GridPlotRow(ABC): + ylabel: ClassVar[str] + + dfs: Sequence[pd.DataFrame] + hide_xlabel: bool + hide_xtick_labels: bool + + def __init__( + self, + dfs: Sequence[pd.DataFrame], + *, + hide_xlabel: bool = False, + hide_xtick_labels: bool = False, + ) -> None: + self.dfs = dfs + self.hide_xlabel = hide_xlabel + self.hide_xtick_labels = hide_xtick_labels + + @classmethod + @abstractmethod + def yrange(cls, dfs: Sequence[pd.DataFrame]) -> npt.NDArray[np.float64]: + raise NotImplementedError + + @classmethod + @abstractmethod + def plot_grid_cell(cls, ax: Axes, df: pd.DataFrame, *, hide_xlabel: bool) -> None: + raise NotImplementedError + + @classmethod + def get_roofline(cls, df: pd.DataFrame) -> float | None: + return None + + def plot_grid_row(self, axs: Sequence[Axes]): + yrange = self.yrange(self.dfs) + + for i, (_ax, _d) in enumerate(zip(axs, self.dfs)): + self.plot_grid_cell( + _ax, + _d, + hide_xlabel=self.hide_xlabel, + ) + + if (roofline := self.get_roofline(_d)) is not None: + _ax.axhline( + y=roofline, + color=IMPL_COLORS["Performance Roofline"], + linestyle=IMPL_LINESTYLES["Performance Roofline"], + ) + + _ax.set_yticks(yrange) + ytick_distance = (yrange[-1] - yrange[0]) / len(yrange) + # add extra distance for the yaxis to avoid graph truncation and + # misaligned y-axis ticks in subplots + _ax.set_ylim(yrange[0], yrange[-1] + ytick_distance / 10) + + _ax.yaxis.grid(True) + _ax.tick_params(axis="both", which="both", left=True) + + yticks = _ax.yaxis.get_major_ticks() + for _j, ytick in enumerate(yticks): + if _j % 2: + ytick.label1.set_visible(False) + ytick.tick1line.set_visible(False) + if i: + ytick.label1.set_visible(False) + + # Improve readability when we have too many xticks + # (e.g.: matmul with lots of data points) + xtick_labels = _ax.get_xticklabels() + if len(xtick_labels) > 10: + for label in xtick_labels: + label.set_rotation(90) + + sns.despine(top=True, right=True) + + if self.hide_xtick_labels: + _ax.set_xticklabels([]) + + # y axis label on first column only + axs[0].set_ylabel(self.ylabel) + + 
@classmethod + def get_rows( + cls, + dfs: Sequence[pd.DataFrame], + ncols: int, + *, + hide_xlabel: bool = False, + hide_xtick_labels: list[bool] = [], + ) -> tuple["GridPlotRow", ...]: + num_plots = len(dfs) + rng = range(0, num_plots, ncols) + + if hide_xtick_labels == []: + hide_xtick_labels = [False] * len(rng) + + assert len(rng) == len(hide_xtick_labels) + + return tuple( + cls( + dfs[offset : offset + ncols], + hide_xlabel=hide_xlabel, + hide_xtick_labels=hide_xtick_labels[i], + ) + for i, offset in enumerate(rng) + ) + + +def get_legend_entries(dfs: Sequence[pd.DataFrame]) -> dict[str, tuple[str, str, str]]: + return { + col: (IMPL_COLORS[col], IMPL_MARKERS[col], IMPL_LINESTYLES[col]) + for _d in dfs + for col in cast(Iterable[str], _d) + } + + +def savefig(fig: Figure, filename: str): + fig.savefig(filename) + + +def subplots(nrows: int, ncols: int, figsize: tuple[float, float]): + fig, axs = plt.subplots(nrows=nrows, ncols=ncols, figsize=figsize, sharey=False) + + # plt.subplots returns different types depending on nrows and ncols + # These normalise the type to always be doubly-nested sequence + if nrows == 1: + axs = [axs] + if ncols == 1: + axs = [[ax] for ax in axs] + + return fig, cast(Sequence[Sequence[Axes]], axs) + + +def plot_combined( + rows: Sequence["GridPlotRow"], rcparams_cfg_file: str = "", *, legend_cols: int +): + nrows = len(rows) + ncols = max(len(row.dfs) for row in rows) + if os.path.exists(rcparams_cfg_file): + plt.style.use(rcparams_cfg_file) + + fig, axs = subplots(nrows, ncols, (ncols * 2.5, nrows * 1.8)) + + fig.align_labels() + fig.subplots_adjust(hspace=0.5, wspace=0.3) + + for plot_row, axs_row in zip(rows, axs): + plot_row.plot_grid_row(axs_row) + + # Remove empty subplots + empty_axs = ncols - len(plot_row.dfs) + if empty_axs: + for ax in axs_row[-empty_axs:]: + fig.delaxes(ax) + + # Shared legend + legend_entries = get_legend_entries(tuple(df for row in rows for df in row.dfs)) + lines = [ + Line2D( + [], + [], + color=color, + marker=marker, + label=entry, + linestyle=linestyle, + markersize=6, + ) + for entry, (color, marker, linestyle) in legend_entries.items() + ] + labels = list(legend_entries.keys()) + fig.legend(lines, labels, ncols=legend_cols, bbox_to_anchor=(0.5, 1.03)) + + fig.tight_layout() + + return fig diff --git a/plots-cgo2025-ae/regalloc.py b/plots-cgo2025-ae/regalloc.py new file mode 100644 index 00000000..06d2ecc8 --- /dev/null +++ b/plots-cgo2025-ae/regalloc.py @@ -0,0 +1,63 @@ +import re +import pandas as pd +from data import get_regalloc as _get_regalloc + + +def get_regalloc() -> pd.DataFrame: + regalloc_df = _get_regalloc() + + return regalloc_df + + +def color(color: str, text: str) -> str: + return r"\textcolor{" + color + "}{" + text + "}" + + +def print_regalloc(regalloc_df: pd.DataFrame, *, filename: str | None = None): + stream = None if filename is None else open(filename, "w") + colors = (color("lightgray", "/20"), color("lightgray", "/15")) + + # Sort the DataFrame + regalloc_df = regalloc_df.sort_values( + [ + "Bits", + "allocated_float", + "allocated_int", + ], + ascending=[False, True, True], + ) + + string_table = [] + + for row in regalloc_df.iterrows(): + items = tuple(row[1]) + params = items[:5] + regs = items[5:] + + reg_cells = tuple(f"{reg}{col}" for reg, col in zip(regs, colors)) + + string_table.append([str(p) for p in params + reg_cells]) + + current_precision = None + + for row in string_table: + line = "" + + # replace NxM where N and M are integers with N$\times$M in kernel names + pattern = 
r"(\d+)x(\d+)" + row[0] = re.sub(pattern, r"\1$\\times$\2", row[0]) + + # add short row space to separate precision groups + if current_precision is None: + current_precision = row[1] + + if current_precision != row[1]: + current_precision = row[1] + line = "\\addlinespace[0.5em]\n" + + line += " & ".join(val for val in row) + print(line, end=" \\\\\n", file=stream) + + print(r"\bottomrule", file=stream) + if stream is not None: + stream.close() diff --git a/plots-cgo2025-ae/throughput.py b/plots-cgo2025-ae/throughput.py new file mode 100644 index 00000000..2a6f7a7a --- /dev/null +++ b/plots-cgo2025-ae/throughput.py @@ -0,0 +1,57 @@ +from typing import Sequence, cast +import pandas as pd +import numpy as np +import numpy.typing as npt +from plot_utils import IMPL_COLORS, IMPL_MARKERS, plot_combined, GridPlotRow +from matplotlib.axes import Axes + +from math import log2, ceil, floor + + +class ThroughputGridPlotRow(GridPlotRow): + ylabel = "Throughput" + + @classmethod + def yrange(cls, dfs: Sequence[pd.DataFrame]) -> npt.NDArray[np.float64]: + max_value = cast(float, max(_df.max().iloc[0] for _df in dfs)) + magnitude: float = 2 ** floor(log2(max_value)) + greater_round_number = ceil(max_value / magnitude) * magnitude + yrange = np.arange(0, greater_round_number + 1) + return yrange + + @classmethod + def plot_grid_cell( + cls, + ax: Axes, + df: pd.DataFrame, + *, + hide_xlabel: bool, + ) -> None: + for col in df: + if col == "Performance Roofline": + continue + ax.scatter( + x=df.index, + y=df[col], + color=IMPL_COLORS[col], + marker=IMPL_MARKERS[col], + ) + ax.set_xticks(df.index) + if not hide_xlabel: + ax.set_xlabel(df.index.name, fontsize=12) + + @classmethod + def get_roofline(cls, df: pd.DataFrame) -> float | None: + max_throughputs = df["Performance Roofline"] + max_throughput = max_throughputs.max() + min_throughput = max_throughputs.min() + assert max_throughput == min_throughput + return max_throughput + + +def plot_throughput(throughput_dfs: tuple[pd.DataFrame, ...]): + return plot_combined( + ThroughputGridPlotRow.get_rows(throughput_dfs, 4), + legend_cols=3, + rcparams_cfg_file="config/gridplot.mplstyle", + ) From 3046490344d5639034d48a54af8f6ab47f3f1087 Mon Sep 17 00:00:00 2001 From: Chris Vasiladiotis Date: Wed, 6 Nov 2024 16:53:35 +0000 Subject: [PATCH 02/20] Update requirements.txt --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 7ad2a3a0..a9add958 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,6 @@ pip pandas==2.1.1 +seaborn numpy==1.26.4 snakemake==8.14.0 -e /src/xdsl From 7ed06fe1454078186baf5ab83aceeea84ff6beb1 Mon Sep 17 00:00:00 2001 From: Chris Vasiladiotis Date: Wed, 6 Nov 2024 17:12:59 +0000 Subject: [PATCH 03/20] Update requirements.txt --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index a9add958..760a4e59 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ pip pandas==2.1.1 -seaborn +seaborn==0.13.2 numpy==1.26.4 snakemake==8.14.0 -e /src/xdsl From 99c0ad7e1afee8557db2ae39fac2897b3882bb6e Mon Sep 17 00:00:00 2001 From: Chris Vasiladiotis Date: Wed, 6 Nov 2024 17:33:56 +0000 Subject: [PATCH 04/20] Disable TeX matplotlib backend --- plots-cgo2025-ae/config/gridplot.mplstyle | 2 +- plots-cgo2025-ae/heatmap.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/plots-cgo2025-ae/config/gridplot.mplstyle b/plots-cgo2025-ae/config/gridplot.mplstyle index 1ace4cb8..0114f979 100644 
--- a/plots-cgo2025-ae/config/gridplot.mplstyle
+++ b/plots-cgo2025-ae/config/gridplot.mplstyle
@@ -13,7 +13,7 @@ font.size: 12
 font.family: sans-serif
 #font.sans-serif: ["Helvetica"]

-text.usetex: True
+text.usetex: False

 ## Enable tight_layout by default
 #
diff --git a/plots-cgo2025-ae/heatmap.py b/plots-cgo2025-ae/heatmap.py
index 1b120432..20a8a8d1 100644
--- a/plots-cgo2025-ae/heatmap.py
+++ b/plots-cgo2025-ae/heatmap.py
@@ -60,7 +60,7 @@ def generate_heatmaps(data: pd.DataFrame):
     # filter out K values unrelated to the heatmap experimental runs
     data = data[(data["K"] <= 65)]

-    sns.set(rc={'text.usetex': True})
+    # sns.set(rc={'text.usetex': True})

     for m_value, m_group in data.groupby("M"):
         selection = m_group[["K", "N", "fp_flop_throughput"]]

From 541d7a4f4cd7ede7e97151569acbef7ec027a23c Mon Sep 17 00:00:00 2001
From: Chris Vasiladiotis
Date: Thu, 7 Nov 2024 10:04:13 +0000
Subject: [PATCH 05/20] Add symlink to results dir

---
 plots-cgo2025-ae/results | 1 +
 1 file changed, 1 insertion(+)
 create mode 120000 plots-cgo2025-ae/results

diff --git a/plots-cgo2025-ae/results b/plots-cgo2025-ae/results
new file mode 120000
index 00000000..f42d2767
--- /dev/null
+++ b/plots-cgo2025-ae/results
@@ -0,0 +1 @@
+../results/
\ No newline at end of file

From 874633842d3059d07706d84db8d66ade9f7ab2fa Mon Sep 17 00:00:00 2001
From: Chris Vasiladiotis
Date: Thu, 7 Nov 2024 10:12:16 +0000
Subject: [PATCH 06/20] Disable TeX backend in matplotlib config

---
 plots-cgo2025-ae/config/cycles/all_barchart.mplstyle  | 2 +-
 plots-cgo2025-ae/config/cycles/xdsl_barchart.mplstyle | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/plots-cgo2025-ae/config/cycles/all_barchart.mplstyle b/plots-cgo2025-ae/config/cycles/all_barchart.mplstyle
index 980b425c..647d6061 100644
--- a/plots-cgo2025-ae/config/cycles/all_barchart.mplstyle
+++ b/plots-cgo2025-ae/config/cycles/all_barchart.mplstyle
@@ -13,7 +13,7 @@ font.size: 12
 font.family: sans-serif
 #font.sans-serif: ["Helvetica"]

-text.usetex: True
+text.usetex: False

 ## Enable tight_layout by default
 #
diff --git a/plots-cgo2025-ae/config/cycles/xdsl_barchart.mplstyle b/plots-cgo2025-ae/config/cycles/xdsl_barchart.mplstyle
index 163c89d5..0ef75540 100644
--- a/plots-cgo2025-ae/config/cycles/xdsl_barchart.mplstyle
+++ b/plots-cgo2025-ae/config/cycles/xdsl_barchart.mplstyle
@@ -13,7 +13,7 @@ font.size: 12
 font.family: sans-serif
 #font.sans-serif: ["Helvetica"]

-text.usetex: True
+text.usetex: False

 ## Enable tight_layout by default
 #

From 2a308be1a22f1cf324b42595e470a7de96813d7c Mon Sep 17 00:00:00 2001
From: Chris Vasiladiotis
Date: Thu, 7 Nov 2024 10:12:40 +0000
Subject: [PATCH 07/20] Use specific CSV file for regalloc

---
 plots-cgo2025-ae/data.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/plots-cgo2025-ae/data.py b/plots-cgo2025-ae/data.py
index 58104850..d3a4a443 100644
--- a/plots-cgo2025-ae/data.py
+++ b/plots-cgo2025-ae/data.py
@@ -219,7 +219,7 @@ def get_params_dfs(operator_df: pd.DataFrame) -> Iterable[pd.DataFrame]:


 def get_regalloc() -> pd.DataFrame:
-    regalloc_df = pd.read_csv("results/regalloc.csv")
+    regalloc_df = pd.read_csv("results/regalloc.fast.csv")
     regalloc_df = regalloc_df[regalloc_df["impl"].isin(OPERATOR_BY_TEST)]
     regalloc_df.replace(OPERATOR_BY_TEST, inplace=True)
     regalloc_df = regalloc_df[~regalloc_df["params"].str.contains("f16")]

From f63720e71076fb4d02f367abd5a1939c7a0547ca Mon Sep 17 00:00:00 2001
From: Chris Vasiladiotis
Date: Thu, 7 Nov 2024 10:41:56 +0000
Subject: [PATCH 08/20] Allow relative input and output paths based on script
 location

---
 plots-cgo2025-ae/data.py     | 27 +++++++++++++++++----------
 plots-cgo2025-ae/max_util.py |  2 +-
 plots-cgo2025-ae/plot.py     | 28 +++++++++++++++++++--------
 3 files changed, 37 insertions(+), 20 deletions(-)

diff --git a/plots-cgo2025-ae/data.py b/plots-cgo2025-ae/data.py
index d3a4a443..26abe7cf 100644
--- a/plots-cgo2025-ae/data.py
+++ b/plots-cgo2025-ae/data.py
@@ -3,6 +3,7 @@
 import numpy as np

 from enum import StrEnum
+from pathlib import Path


 class Impl(StrEnum):
@@ -96,8 +97,8 @@ def _get_kernels(filename: str) -> pd.DataFrame:
     return df_with_overhead


-def get_kernels(cleaned: bool = True) -> pd.DataFrame:
-    df = _get_kernels("results/kernels.csv")
+def get_kernels(dir: Path = Path("."), cleaned: bool = True) -> pd.DataFrame:
+    df = _get_kernels(f"{dir}/kernels.csv")
     # Drop unknown operators
     df = df[df.index.get_level_values(0).isin(tuple(Operator))]
     if cleaned:
@@ -106,8 +107,8 @@ def get_kernels(cleaned: bool = True) -> pd.DataFrame:
     return df


-def get_low_level_representation() -> pd.DataFrame:
-    return _get_kernels("results/kernels.low_level_representation.csv")
+def get_low_level_representation(dir: Path = Path(".")) -> pd.DataFrame:
+    return _get_kernels(f"{dir}/kernels.low_level_representation.csv")


 def get_pivoted_all(kernels_df: pd.DataFrame) -> pd.DataFrame:
@@ -119,7 +120,9 @@ def get_pivoted_fpu(pivoted_all_df: pd.DataFrame) -> pd.DataFrame:


 def get_pivoted_cycles(pivoted_all_df: pd.DataFrame) -> pd.DataFrame:
-    return pivoted_all_df[["cycles", "Min Cycles", "Overhead", "FLOPs", "Throughput", "Max Throughput"]]
+    return pivoted_all_df[
+        ["cycles", "Min Cycles", "Overhead", "FLOPs", "Throughput", "Max Throughput"]
+    ]


 def get_flops(operator_df: pd.DataFrame, operator: Operator) -> pd.Series:
@@ -158,6 +161,7 @@ def get_overhead(
     })
     return res

+
 def adding_overhead(
     operator_df: pd.DataFrame,
     operator: Operator,
@@ -173,6 +177,7 @@ def adding_overhead(
         axis=1,
     )

+
 def get_operator_df(
     pivoted_df: pd.DataFrame, operator: Operator, *, bitwidth: int
 ) -> pd.DataFrame:
@@ -211,15 +216,17 @@ def get_params_dfs(operator_df: pd.DataFrame) -> Iterable[pd.DataFrame]:
             name_components.append(col)
         other_cols = list(cols) + ["bitwidth"]
         del other_cols[i]
-        shape_string = " ".join("x".join(t) for t in operand_shapes_map(*name_components))
+        shape_string = " ".join(
+            "x".join(t) for t in operand_shapes_map(*name_components)
+        )
         new_name = f"{name} {shape_string}"
         my_df = my_df.rename(columns={col: new_name}).set_index(new_name).sort_index()
         my_df.drop(other_cols, axis=1, inplace=True)
         yield my_df


-def get_regalloc() -> pd.DataFrame:
-    regalloc_df = pd.read_csv("results/regalloc.fast.csv")
+def get_regalloc(dir: Path = Path(".")) -> pd.DataFrame:
+    regalloc_df = pd.read_csv(f"{dir}/regalloc.fast.csv")
     regalloc_df = regalloc_df[regalloc_df["impl"].isin(OPERATOR_BY_TEST)]
     regalloc_df.replace(OPERATOR_BY_TEST, inplace=True)
     regalloc_df = regalloc_df[~regalloc_df["params"].str.contains("f16")]
@@ -244,8 +251,8 @@ def get_regalloc() -> pd.DataFrame:
     return regalloc_df


-def get_opt_pipeline() -> pd.DataFrame:
-    opt_pipeline_df = pd.read_csv("results/pipeline.csv")
+def get_opt_pipeline(dir: Path = Path(".")) -> pd.DataFrame:
+    opt_pipeline_df = pd.read_csv(f"{dir}/pipeline.csv")
     opt_pipeline_df = opt_pipeline_df.rename(
         columns={
             "FPU Occupancy [%]": "Occupancy",
diff --git a/plots-cgo2025-ae/max_util.py b/plots-cgo2025-ae/max_util.py
index 188706ea..c397ba50 100644
--- a/plots-cgo2025-ae/max_util.py
+++ b/plots-cgo2025-ae/max_util.py
@@ -1,9 +1,9 @@
-
 from typing import Sequence

 import pandas as pd

 from data import Impl, Operator
+
 def get_max_util(llr_kernels_df: pd.DataFrame, fpu_dfs: Sequence[pd.DataFrame]) -> str:
     llr_max_occupancy = llr_kernels_df["fpss_fpu_occupancy"].max()
     llr_max_throughput = llr_kernels_df["Rel Throughput"].max()
diff --git a/plots-cgo2025-ae/plot.py b/plots-cgo2025-ae/plot.py
index 2e7fcae4..4e3425a9 100644
--- a/plots-cgo2025-ae/plot.py
+++ b/plots-cgo2025-ae/plot.py
@@ -13,36 +13,46 @@ from plot_utils import savefig
 from opt_pipeline import get_opt_pipeline_table

+from pathlib import Path
+
+import os
+
+SCRIPT_DIR = Path(os.path.dirname(os.path.abspath(__file__)))
+RESULTS_DIR = SCRIPT_DIR / "results"
+

 def main():
-    kernels_df = get_kernels()
+    output_dir = SCRIPT_DIR / "output"
+    output_dir.mkdir(exist_ok=True)
+
+    kernels_df = get_kernels(RESULTS_DIR)
     pivoted_all_df = get_pivoted_all(kernels_df)
     pivoted_fpu_df = get_pivoted_fpu(pivoted_all_df)

     # Plot FPU utilization
     fpu_dfs = get_fpu(pivoted_fpu_df)
     fpu_fig = plot_fpu(fpu_dfs)
-    savefig(fpu_fig, "fpu.pdf")
+    savefig(fpu_fig, output_dir / "fpu.pdf")

     # Print the regalloc stats
-    regalloc_df = get_regalloc()
-    print_regalloc(regalloc_df, filename="regalloc.tex")
+    regalloc_df = get_regalloc(RESULTS_DIR)
+    print_regalloc(regalloc_df, filename=output_dir / "regalloc.tex")

     # Plot low-level representation
-    llr_kernels_df = get_low_level_representation()
+    llr_kernels_df = get_low_level_representation(RESULTS_DIR)
     llr_dfs = get_llr_dfs(llr_kernels_df)
     llr_fig = plot_llr(llr_dfs)
-    savefig(llr_fig, "low_level_representation.pdf")
+    savefig(llr_fig, output_dir / "low_level_representation.pdf")

     # Print opt pipeline table
-    opt_pipeline_df = get_opt_pipeline()
+    opt_pipeline_df = get_opt_pipeline(RESULTS_DIR)
     opt_pipeline_table = get_opt_pipeline_table(opt_pipeline_df)
-    with open("opt_pipeline.tex", "w") as f:
+    with open(output_dir / "opt_pipeline.tex", "w") as f:
         f.write(opt_pipeline_table)

     # Print max utilization stats
     max_util_macros = get_max_util(llr_kernels_df, fpu_dfs)
-    with open("max_util.tex", "w") as f:
+    with open(output_dir / "max_util.tex", "w") as f:
         f.write(max_util_macros)

From e1adf6da868b72dcc18c102f4a7e3353ecbca1b4 Mon Sep 17 00:00:00 2001
From: Chris Vasiladiotis
Date: Thu, 7 Nov 2024 10:46:44 +0000
Subject: [PATCH 09/20] Fix data fetching method

---
 plots-cgo2025-ae/regalloc.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/plots-cgo2025-ae/regalloc.py b/plots-cgo2025-ae/regalloc.py
index 06d2ecc8..f7b1366f 100644
--- a/plots-cgo2025-ae/regalloc.py
+++ b/plots-cgo2025-ae/regalloc.py
@@ -2,9 +2,11 @@
 import pandas as pd

 from data import get_regalloc as _get_regalloc
+from pathlib import Path


-def get_regalloc() -> pd.DataFrame:
-    regalloc_df = _get_regalloc()
+
+def get_regalloc(dir: Path = Path(".")) -> pd.DataFrame:
+    regalloc_df = _get_regalloc(dir)
     return regalloc_df
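With patches 08 and 09 (and patch 10 below for heatmap.py), inputs are resolved against a results/ directory next to the scripts (the symlink from patch 05) and outputs land in an output/ directory created beside them, so the entry point no longer depends on the caller's working directory. A minimal sketch of the intended invocation, assuming the repository is checked out at /src as hard-coded in plot.sh later in this series:

    # Sketch: run the plotting entry point from an arbitrary working directory.
    cd /tmp
    python3 /src/plots-cgo2025-ae/plot.py    # reads  <script dir>/results/*.csv
    ls /src/plots-cgo2025-ae/output/         # figures and tables are written here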
From fcb04e1f7f33e0570827b81683d4780557e3a13b Mon Sep 17 00:00:00 2001
From: Chris Vasiladiotis
Date: Thu, 7 Nov 2024 11:49:12 +0000
Subject: [PATCH 10/20] Allow relative input and output paths based on script
 location

---
 plots-cgo2025-ae/heatmap.py | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/plots-cgo2025-ae/heatmap.py b/plots-cgo2025-ae/heatmap.py
index 20a8a8d1..33d148dd 100644
--- a/plots-cgo2025-ae/heatmap.py
+++ b/plots-cgo2025-ae/heatmap.py
@@ -1,10 +1,16 @@
 #!/usr/bin/env python3
+from pathlib import Path
+
 import numpy as np
 import pandas as pd
 import seaborn as sns
 import matplotlib.pyplot as plt
 import argparse
+import os
+
+SCRIPT_DIR = Path(os.path.dirname(os.path.abspath(__file__)))
+RESULTS_DIR = SCRIPT_DIR / "results"


 def add_metrics(data: pd.DataFrame) -> pd.DataFrame:
@@ -126,15 +132,26 @@ def generate_heatmaps(data: pd.DataFrame):


 def main():
+    output_dir = SCRIPT_DIR / "output"
+    output_dir.mkdir(exist_ok=True)
+
     parser = argparse.ArgumentParser(description="Generate heatmaps from CSV data.")
     parser.add_argument("csv_file", help="Path to the CSV file")
     args = parser.parse_args()
-    data = pd.read_csv(args.csv_file)
+
+    csv_path = Path(args.csv_file)
+
+    if not csv_path.is_absolute():
+        csv_path = SCRIPT_DIR / csv_path
+
+    data = pd.read_csv(csv_path)
     data = add_metrics(data)
     # FIXME we are able to generate snitch_stream matmul only at the moment
     data = data.loc[(data["test"] == "matmul") & (data["impl"] == "linalg_xdsl")]
     for m, fig in generate_heatmaps(data):
-        fig.savefig(f"matmul_heatmap_M_{m}.pdf", format="pdf", bbox_inches="tight")
+        fig.savefig(
+            output_dir / f"matmul_heatmap_M_{m}.pdf", format="pdf", bbox_inches="tight"
+        )


 if __name__ == "__main__":

From cb813eba68f7a26f164a025bb57e0084d6d7fd64 Mon Sep 17 00:00:00 2001
From: Chris Vasiladiotis
Date: Thu, 7 Nov 2024 12:12:32 +0000
Subject: [PATCH 11/20] Delatexify

---
 plots-cgo2025-ae/heatmap.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/plots-cgo2025-ae/heatmap.py b/plots-cgo2025-ae/heatmap.py
index 33d148dd..ab0426c3 100644
--- a/plots-cgo2025-ae/heatmap.py
+++ b/plots-cgo2025-ae/heatmap.py
@@ -123,7 +123,7 @@ def generate_heatmaps(data: pd.DataFrame):
             label.set_visible(False)

         cbar = ax.collections[0].colorbar
-        cbar.set_label("\% of FLOP/cycle Roofline", labelpad=2, fontsize=10)
+        cbar.set_label("% of FLOP/cycle Roofline", labelpad=2, fontsize=10)
         cbar.ax.tick_params(size=0)

         plt.tight_layout()

From b96c78737c88bfd4ea387966b52e0e6f001bce6f Mon Sep 17 00:00:00 2001
From: Chris Vasiladiotis
Date: Thu, 7 Nov 2024 12:15:39 +0000
Subject: [PATCH 12/20] Add script for all plot commands

---
 plots-cgo2025-ae/plot.sh | 5 +++++
 1 file changed, 5 insertions(+)
 create mode 100644 plots-cgo2025-ae/plot.sh

diff --git a/plots-cgo2025-ae/plot.sh b/plots-cgo2025-ae/plot.sh
new file mode 100644
index 00000000..8a17b282
--- /dev/null
+++ b/plots-cgo2025-ae/plot.sh
@@ -0,0 +1,5 @@
+#!/bin/bash
+
+python3 /src/plots-cgo2025-ae/plot.py
+python3 /src/plots-cgo2025-ae/heatmap.py results/kernels.all.csv
+

From c1312a4dba5c747e5b9877757affdc988fb697a2 Mon Sep 17 00:00:00 2001
From: Chris Vasiladiotis
Date: Thu, 7 Nov 2024 12:54:13 +0000
Subject: [PATCH 13/20] Make scripts executable

---
 plots-cgo2025-ae/plot.py | 0
 plots-cgo2025-ae/plot.sh | 0
 2 files changed, 0 insertions(+), 0 deletions(-)
 mode change 100644 => 100755 plots-cgo2025-ae/plot.py
 mode change 100644 => 100755 plots-cgo2025-ae/plot.sh

diff --git a/plots-cgo2025-ae/plot.py b/plots-cgo2025-ae/plot.py
old mode 100644
new mode 100755
diff --git a/plots-cgo2025-ae/plot.sh b/plots-cgo2025-ae/plot.sh
old mode 100644
new mode 100755

From 94ff97fa4c084cce20a76f57b253ec802e036e0f Mon Sep 17 00:00:00 2001
From: Chris Vasiladiotis
Date: Thu, 7 Nov 2024 13:42:02 +0000
Subject: [PATCH 14/20] Produce csv instead of tex files

---
 plots-cgo2025-ae/opt_pipeline.py | 65 +++++---------------------------
 plots-cgo2025-ae/plot.py         |  4 +-
 plots-cgo2025-ae/regalloc.py     | 21 ++---------
 3 files changed, 14 insertions(+), 76 deletions(-)

diff --git a/plots-cgo2025-ae/opt_pipeline.py b/plots-cgo2025-ae/opt_pipeline.py
index 53f41c67..4fc53e68 100644
--- a/plots-cgo2025-ae/opt_pipeline.py
+++ b/plots-cgo2025-ae/opt_pipeline.py
@@ -8,8 +8,8 @@
     'F Stores': 'Stores',
     'FMAdd': 'FMAdd',
     'FRep': 'FRep',
-    'Cycles': 'Cycles (\#)',
-    'Occupancy': 'Occupancy (\%)',
+    'Cycles': 'Cycles (#)',
+    'Occupancy': 'Occupancy (%)',
 }

 col_alignment = {
@@ -34,40 +34,11 @@ def get_opt_pipeline_table(opt_pipeline_df: pd.DataFrame) -> str:
     ]
     opt_pipeline_df = opt_pipeline_df[cols]
     del opt_pipeline_df["params"]
-    latex_table = "\\begin{table*}[h]\n"
+    csv_table = ""

-    latex_table += "\\sisetup{group-separator = {\ },group-minimum-digits=3}\n"
+    csv_table += "Optimizations, Allocated Registers (#), Assembly Operations (#) , Performance\\n"

-    latex_table += (
-        "\\setlength\\tabcolsep{0pt} % let LaTeX compute intercolumn whitespace\n"
-    )
-
-    latex_table += "\\caption{"
-    latex_table += (
-        "Our compilation pipeline leverages custom "
-        "\\ac{isa} extensions and knowledge of \\ac{fpu} design in order to achieve "
-        "over 90\\% \\ac{fpu} occupancy for the MatMul kernel, "
-        "operating on 1$\\times$200 and 200$\\times$5 64-bit inputs. "
-        "Incrementally adding each optimization minimizes and, "
-        "eventually eliminates, explicit memory operations, while reducing "
-        "execution time (cycles) and maximizing \\ac{fpu} utilization."
-    )
-    latex_table += "}\n\\label{tab:opt_pipeline}\n"
-
-    latex_table += (
-        "\\centering\n\\begin{tabular*}{\\textwidth}{@{\\extracolsep{\\fill}}"
-        + " ".join(f"{col_alignment[col]}" for col in opt_pipeline_df.columns)
-        + "}\n\\toprule\n"
-    )
-
-    latex_table += "\\textbf{Optimizations} & \\multicolumn{2}{r}{\\textbf{Allocated Registers (\\#)}} & \\multicolumn{4}{c}{\\textbf{Assembly Operations (\\#)}} & \\multicolumn{2}{c}{\\textbf{Performance}}\\\\\n"
-
-    latex_table += "\\cmidrule{2-3}\\cmidrule{4-7}\\cmidrule{8-9}\n"
-
-    latex_table += (
-        " & ".join(f"\\textbf{{{col_names[col]}}}" for col in opt_pipeline_df.columns)
-        + " \\\\\n\\midrule\n"
-    )
+    csv_table += " , ".join(f"{col_names[col]}" for col in opt_pipeline_df.columns)

     string_table = []

@@ -80,36 +51,18 @@ def get_opt_pipeline_table(opt_pipeline_df: pd.DataFrame) -> str:
     # change text style for first column
     for row in string_table:
-        row[0] = f"\\texttt{{{row[0]}}}"
-
     # replace text for baseline which should be at the first row and column
     string_table[0][0] = "Baseline (for MatMul)"

     # add max register count for fp registers
     for row in string_table:
-        row[1] = row[1] + "\\textcolor{lightgray}{/20}"
+        row[1] = row[1] + "/20"

     # add max register count for int registers
     for row in string_table:
-        row[2] = row[2] + "\\textcolor{lightgray}{/15}"
-
-    # replace text for baseline which should be at the first row and column
-    string_table[0][0] = "Baseline (for MatMul)"
-
-    # gray out baseline which should be the first line
-    for idx, val in enumerate(string_table[0]):
-        string_table[0][idx] = "\\color{gray} " + val
-
-    # highlight rightmost entry which should be the max FPU util achieved
-    string_table[-1][-1] = "\\textbf{" + string_table[-1][-1] + "}"
+        row[2] = row[2] + "/15"

     for row in string_table:
-        latex_table += " & ".join(val for val in row) + " \\\\\n"
-
-    latex_table += "\\bottomrule\n"
-
-    latex_table += "\\end{tabular*}\n"
-
-    latex_table += "\\end{table*}\n"
+        csv_table += " , ".join(val for val in row) + " \\n"

-    return latex_table
+    return csv_table
diff --git a/plots-cgo2025-ae/plot.py b/plots-cgo2025-ae/plot.py
index 4e3425a9..c7e5a9a9 100755
--- a/plots-cgo2025-ae/plot.py
+++ b/plots-cgo2025-ae/plot.py
@@ -36,7 +36,7 @@ def main():
     # Print the regalloc stats
     regalloc_df = get_regalloc(RESULTS_DIR)
-    print_regalloc(regalloc_df, filename=output_dir / "regalloc.tex")
+    print_regalloc(regalloc_df, filename=output_dir / "regalloc.csv")

     # Plot low-level representation
     llr_kernels_df = get_low_level_representation(RESULTS_DIR)
     llr_dfs = get_llr_dfs(llr_kernels_df)
@@ -47,7 +47,7 @@ def main():
     # Print opt pipeline table
     opt_pipeline_df = get_opt_pipeline(RESULTS_DIR)
     opt_pipeline_table = get_opt_pipeline_table(opt_pipeline_df)
-    with open(output_dir / "opt_pipeline.tex", "w") as f:
+    with open(output_dir / "opt_pipeline.csv", "w") as f:
         f.write(opt_pipeline_table)

     # Print max utilization stats
diff --git a/plots-cgo2025-ae/regalloc.py b/plots-cgo2025-ae/regalloc.py
index f7b1366f..3b2e5793 100644
--- a/plots-cgo2025-ae/regalloc.py
+++ b/plots-cgo2025-ae/regalloc.py
@@ -1,4 +1,3 @@
-import re
 import pandas as pd

 from data import get_regalloc as _get_regalloc
@@ -11,13 +10,8 @@ def get_regalloc(dir: Path = Path(".")) -> pd.DataFrame:
     return regalloc_df


-def color(color: str, text: str) -> str:
-    return r"\textcolor{" + color + "}{" + text + "}"
-
-
 def print_regalloc(regalloc_df: pd.DataFrame, *, filename: str | None = None):
     stream = None if filename is None else open(filename, "w")
-    colors = (color("lightgray", "/20"), color("lightgray", "/15"))

     # Sort the DataFrame
     regalloc_df = regalloc_df.sort_values(
@@ -36,7 +30,7 @@ def print_regalloc(regalloc_df: pd.DataFrame, *, filename: str | None = None):
         params = items[:5]
         regs = items[5:]

-        reg_cells = tuple(f"{reg}{col}" for reg, col in zip(regs, colors))
+        reg_cells = tuple(f"{reg}" for reg in regs)

         string_table.append([str(p) for p in params + reg_cells])

@@ -45,21 +39,12 @@ def print_regalloc(regalloc_df: pd.DataFrame, *, filename: str | None = None):
     for row in string_table:
         line = ""

-        # replace NxM where N and M are integers with N$\times$M in kernel names
-        pattern = r"(\d+)x(\d+)"
-        row[0] = re.sub(pattern, r"\1$\\times$\2", row[0])
-
         # add short row space to separate precision groups
         if current_precision is None:
             current_precision = row[1]
-        if current_precision != row[1]:
-            current_precision = row[1]
-            line = "\\addlinespace[0.5em]\n"
-
-        line += " & ".join(val for val in row)
-        print(line, end=" \\\\\n", file=stream)
+        line += " , ".join(val for val in row)
+        print(line, end="\\n", file=stream)

-    print(r"\bottomrule", file=stream)
     if stream is not None:
         stream.close()

From e4ee515ef9538d01feea128fa0d851e011e00eea Mon Sep 17 00:00:00 2001
From: Chris Vasiladiotis
Date: Thu, 7 Nov 2024 14:31:47 +0000
Subject: [PATCH 15/20] Fix csv output|

---
 plots-cgo2025-ae/data.py         |  2 +-
 plots-cgo2025-ae/max_util.py     | 27 +++++++++++++++++----------
 plots-cgo2025-ae/opt_pipeline.py |  8 +++++---
 plots-cgo2025-ae/plot.py         |  2 +-
 plots-cgo2025-ae/regalloc.py     |  7 ++++---
 5 files changed, 28 insertions(+), 18 deletions(-)

diff --git a/plots-cgo2025-ae/data.py b/plots-cgo2025-ae/data.py
index 26abe7cf..a41f0b9e 100644
--- a/plots-cgo2025-ae/data.py
+++ b/plots-cgo2025-ae/data.py
@@ -236,7 +236,7 @@ def get_regalloc(dir: Path = Path(".")) -> pd.DataFrame:
     params = tuple(
         {p: v for p, v in zip("MNK", param[:-1])} for param in param_components
     )
-    params_df = pd.DataFrame(params).fillna("{--}")
+    params_df = pd.DataFrame(params).fillna("-")
     regalloc_df = pd.concat((regalloc_df, pd.DataFrame(bitwidths), params_df), axis=1)

     del regalloc_df["params"]
diff --git a/plots-cgo2025-ae/max_util.py b/plots-cgo2025-ae/max_util.py
index c397ba50..1998fbf7 100644
--- a/plots-cgo2025-ae/max_util.py
+++ b/plots-cgo2025-ae/max_util.py
@@ -7,17 +7,24 @@ def get_max_util(llr_kernels_df: pd.DataFrame, fpu_dfs: Sequence[pd.DataFrame]) -> str:
     llr_max_occupancy = llr_kernels_df["fpss_fpu_occupancy"].max()
     llr_max_throughput = llr_kernels_df["Rel Throughput"].max()
-    llr_matmult_throughput = llr_kernels_df["Throughput"][Operator.MATMUL_TRANSB, :].max()
-    llr_matmult_max_occupancy = llr_kernels_df["fpss_fpu_occupancy"][Operator.MATMUL_TRANSB, :].max()
+    llr_matmult_throughput = llr_kernels_df["Throughput"][
+        Operator.MATMUL_TRANSB, :
+    ].max()
+    llr_matmult_max_occupancy = llr_kernels_df["fpss_fpu_occupancy"][
+        Operator.MATMUL_TRANSB, :
+    ].max()
     proto_comp_max_occupancy = max(fpu_df[Impl.OURS].max() for fpu_df in fpu_dfs)
-    proto_comp_min_max_occupancy = min_max = min(_df[Impl.OURS].max() for _df in fpu_dfs)
+    proto_comp_min_max_occupancy = min_max = min(
+        _df[Impl.OURS].max() for _df in fpu_dfs
+    )
     clang_max_occupancy = max(fpu_df[Impl.CLANG].max() for fpu_df in fpu_dfs)
+
     return f"""\
-\\newdelimitedcommand{{maxutilprotocomp}}{{{proto_comp_max_occupancy*100:.0f}\\%}}
-\\newdelimitedcommand{{minmaxutilprotocomp}}{{{proto_comp_min_max_occupancy*100:.0f}\\%}}
-\\newdelimitedcommand{{maxutilclang}}{{{clang_max_occupancy*100:.0f}\\%}}
-\\newdelimitedcommand{{maxutilllr}}{{{llr_max_occupancy*100:.0f}\\%}}
-\\newdelimitedcommand{{maxutilllrmatmult}}{{{llr_matmult_max_occupancy*100:.0f}\\%}}
-\\newdelimitedcommand{{maxrelthroughputllr}}{{{llr_max_throughput*100:.0f}\\%}}
-\\newdelimitedcommand{{maxabsthroughputllrmatmult}}{{{llr_matmult_throughput:.2f}}}
+Section 4.2 Maximum FPU utilization for low-level representations, {llr_max_occupancy*100:.0f}%
+Section 4.2 Maximum FPU utilization for low-level representation MatMulT, {llr_matmult_max_occupancy*100:.0f}%
+Section 4.2 Percent of maximum theoretical throughput achieved for low-level representations, {llr_max_throughput*100:.0f}%
+Section 4.2 Maximum throughput achieved for low-level representation MatMulT, {llr_matmult_throughput:.2f}
+Section 4.4 Maximum FPU utilization for prototype micro-kernel compiler, {proto_comp_max_occupancy*100:.0f}%
+Section 4.4 Minimum FPU utilization for prototype micro-kernel compiler, {proto_comp_min_max_occupancy*100:.0f}%
+Section 4.4 Maximum FPU utilization for Clang, {clang_max_occupancy*100:.0f}%
 """
diff --git a/plots-cgo2025-ae/opt_pipeline.py b/plots-cgo2025-ae/opt_pipeline.py
index 4fc53e68..67352d50 100644
--- a/plots-cgo2025-ae/opt_pipeline.py
+++ b/plots-cgo2025-ae/opt_pipeline.py
@@ -36,9 +36,11 @@ def get_opt_pipeline_table(opt_pipeline_df: pd.DataFrame) -> str:
     del opt_pipeline_df["params"]
     csv_table = ""

-    csv_table += "Optimizations, Allocated Registers (#), Assembly Operations (#) , Performance\\n"
+    csv_table += "Optimizations, Allocated Registers (#), , Assembly Operations (#), , , , Performance\n"

-    csv_table += " , ".join(f"{col_names[col]}" for col in opt_pipeline_df.columns)
+    csv_table += ", ".join(f"{col_names[col]}" for col in opt_pipeline_df.columns)
+
+    csv_table += "\n"

     string_table = []
diff --git a/plots-cgo2025-ae/plot.py b/plots-cgo2025-ae/plot.py
index c7e5a9a9..feb2c1f6 100755
--- a/plots-cgo2025-ae/plot.py
+++ b/plots-cgo2025-ae/plot.py
@@ -52,7 +52,7 @@ def main():
     # Print max utilization stats
     max_util_macros = get_max_util(llr_kernels_df, fpu_dfs)
-    with open(output_dir / "max_util.tex", "w") as f:
+    with open(output_dir / "max_util.csv", "w") as f:
         f.write(max_util_macros)
diff --git a/plots-cgo2025-ae/regalloc.py b/plots-cgo2025-ae/regalloc.py
index 3b2e5793..2eaa93e3 100644
--- a/plots-cgo2025-ae/regalloc.py
+++ b/plots-cgo2025-ae/regalloc.py
@@ -29,8 +29,9 @@ def print_regalloc(regalloc_df: pd.DataFrame, *, filename: str | None = None):
         items = tuple(row[1])
         params = items[:5]
         regs = items[5:]
+        max_regs = ("/20", "/15")

-        reg_cells = tuple(f"{reg}" for reg in regs)
+        reg_cells = tuple(f"{reg}{max_reg}" for reg, max_reg in zip(regs, max_regs))

         string_table.append([str(p) for p in params + reg_cells])

@@ -43,8 +44,8 @@ def print_regalloc(regalloc_df: pd.DataFrame, *, filename: str | None = None):
         if current_precision is None:
             current_precision = row[1]

-        line += " , ".join(val for val in row)
-        print(line, end="\\n", file=stream)
+        line += ", ".join(val for val in row)
+        print(line, file=stream)

     if stream is not None:
         stream.close()

From b433e2597f9d632f54a88c607886f1811b2063b0 Mon Sep 17 00:00:00 2001
From: Chris Vasiladiotis
Date: Thu, 7 Nov 2024 14:35:48 +0000
Subject: [PATCH 16/20] Add CSV column titles

---
 plots-cgo2025-ae/regalloc.py | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/plots-cgo2025-ae/regalloc.py b/plots-cgo2025-ae/regalloc.py
index 2eaa93e3..54b3c8a7 100644
--- a/plots-cgo2025-ae/regalloc.py
+++ b/plots-cgo2025-ae/regalloc.py
@@ -23,7 +23,17 @@ def print_regalloc(regalloc_df: pd.DataFrame, *, filename: str | None = None):
         ascending=[False, True, True],
     )

-    string_table = []
+    string_table = [
+        [
+            "Kernel",
+            "Precision bits",
+            "N",
+            "M",
+            "K",
+            "Allocated FP registers",
+            "Allocated Integer registers",
+        ]
+    ]

     for row in regalloc_df.iterrows():
         items = tuple(row[1])

From 058358b23db625da810e455bf872401f68064f31 Mon Sep 17 00:00:00 2001
From: Chris Vasiladiotis
Date: Thu, 7 Nov 2024 14:41:39 +0000
Subject: [PATCH 17/20] Make scripts executable

---
 plots-cgo2025-ae/heatmap.py | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 mode change 100644 => 100755 plots-cgo2025-ae/heatmap.py

diff --git a/plots-cgo2025-ae/heatmap.py b/plots-cgo2025-ae/heatmap.py
old mode 100644
new mode 100755

From 83e464cc87f3d1a0f7fab5a644957bd24fc3e852 Mon Sep 17 00:00:00 2001
From: Chris Vasiladiotis
Date: Thu, 7 Nov 2024 15:06:16 +0000
Subject: [PATCH 18/20] Fix kernels csv source

---
 plots-cgo2025-ae/data.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/plots-cgo2025-ae/data.py b/plots-cgo2025-ae/data.py
index a41f0b9e..88536c13 100644
--- a/plots-cgo2025-ae/data.py
+++ b/plots-cgo2025-ae/data.py
@@ -98,7 +98,7 @@ def _get_kernels(filename: str) -> pd.DataFrame:


 def get_kernels(dir: Path = Path("."), cleaned: bool = True) -> pd.DataFrame:
-    df = _get_kernels(f"{dir}/kernels.csv")
+    df = _get_kernels(f"{dir}/kernels.all.csv")
     # Drop unknown operators
     df = df[df.index.get_level_values(0).isin(tuple(Operator))]
     if cleaned:
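After patches 14 through 16, the tables are emitted as plain CSV instead of LaTeX fragments. A quick way to eyeball the regenerated files is sketched below, using the file names plot.py writes at this point in the series (patch 20 below renames them to table2.csv and table3.csv and drops max_util.csv); the /src prefix is assumed from plot.sh:

    # Sketch: peek at the regenerated CSV tables after running plot.py.
    column -s, -t /src/plots-cgo2025-ae/output/regalloc.csv | head
    column -s, -t /src/plots-cgo2025-ae/output/opt_pipeline.csv
    cat /src/plots-cgo2025-ae/output/max_util.csv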
From 11f09f4d95d54a4e88b47c25b0d82e065ca39f09 Mon Sep 17 00:00:00 2001
From: Chris Vasiladiotis
Date: Thu, 7 Nov 2024 15:10:08 +0000
Subject: [PATCH 19/20] Add artifact Make target

---
 Makefile | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index d4e88280..1264800d 100644
--- a/Makefile
+++ b/Makefile
@@ -2,10 +2,12 @@ JOBS ?= all

 THIS := $(dir $(realpath $(lastword $(MAKEFILE_LIST))))

-.PHONY: default fast all clean
+.PHONY: default fast all clean artifact

 default: fast

+artifact: fast all low_level_representation pipeline
+
 fast: maybe_update_xdsl_commit
 	snakemake --cores $(JOBS) --rerun-incomplete fast

From 16fe0a7660b2c51436e5a2a8c61138588685f0e4 Mon Sep 17 00:00:00 2001
From: Chris Vasiladiotis
Date: Thu, 7 Nov 2024 16:02:15 +0000
Subject: [PATCH 20/20] Name outputs as referenced in text

---
 plots-cgo2025-ae/heatmap.py |  9 ++++++---
 plots-cgo2025-ae/plot.py    | 14 ++++----------
 2 files changed, 10 insertions(+), 13 deletions(-)

diff --git a/plots-cgo2025-ae/heatmap.py b/plots-cgo2025-ae/heatmap.py
index ab0426c3..d4858a9c 100755
--- a/plots-cgo2025-ae/heatmap.py
+++ b/plots-cgo2025-ae/heatmap.py
@@ -149,9 +149,12 @@ def main():
     # FIXME we are able to generate snitch_stream matmul only at the moment
     data = data.loc[(data["test"] == "matmul") & (data["impl"] == "linalg_xdsl")]
     for m, fig in generate_heatmaps(data):
-        fig.savefig(
-            output_dir / f"matmul_heatmap_M_{m}.pdf", format="pdf", bbox_inches="tight"
-        )
+        if m == 1:
+            fig.savefig(
+                output_dir / f"figure9_matmul_heatmap_M_{m}.pdf",
+                format="pdf",
+                bbox_inches="tight",
+            )


 if __name__ == "__main__":
diff --git a/plots-cgo2025-ae/plot.py b/plots-cgo2025-ae/plot.py
index feb2c1f6..5cc2435d 100755
--- a/plots-cgo2025-ae/plot.py
+++ b/plots-cgo2025-ae/plot.py
@@ -8,7 +8,6 @@
 )
 from fpu import get_fpu, plot_fpu
 from low_level_representation import get_llr_dfs, plot_llr
-from max_util import get_max_util
 from regalloc import get_regalloc, print_regalloc
 from plot_utils import savefig
 from opt_pipeline import get_opt_pipeline_table
@@ -32,29 +31,24 @@ def main():
     # Plot FPU utilization
     fpu_dfs = get_fpu(pivoted_fpu_df)
     fpu_fig = plot_fpu(fpu_dfs)
-    savefig(fpu_fig, output_dir / "fpu.pdf")
+    savefig(fpu_fig, output_dir / "figure10.pdf")

     # Print the regalloc stats
     regalloc_df = get_regalloc(RESULTS_DIR)
-    print_regalloc(regalloc_df, filename=output_dir / "regalloc.csv")
+    print_regalloc(regalloc_df, filename=output_dir / "table2.csv")

     # Plot low-level representation
     llr_kernels_df = get_low_level_representation(RESULTS_DIR)
     llr_dfs = get_llr_dfs(llr_kernels_df)
     llr_fig = plot_llr(llr_dfs)
-    savefig(llr_fig, output_dir / "low_level_representation.pdf")
+    savefig(llr_fig, output_dir / "figure8.pdf")

     # Print opt pipeline table
     opt_pipeline_df = get_opt_pipeline(RESULTS_DIR)
     opt_pipeline_table = get_opt_pipeline_table(opt_pipeline_df)
-    with open(output_dir / "opt_pipeline.csv", "w") as f:
+    with open(output_dir / "table3.csv", "w") as f:
         f.write(opt_pipeline_table)

-    # Print max utilization stats
-    max_util_macros = get_max_util(llr_kernels_df, fpu_dfs)
-    with open(output_dir / "max_util.csv", "w") as f:
-        f.write(max_util_macros)
-

 if __name__ == "__main__":
     main()
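With the full series applied, the end-to-end artifact flow is roughly the following (a sketch based on the artifact target from patch 19, plot.sh from patch 12, and the output names from patch 20; the /src prefix is the path hard-coded in plot.sh and may differ outside the artifact container):

    # Sketch: regenerate the measurement data, then every figure and table.
    make artifact                    # fast, all, low_level_representation, pipeline
    /src/plots-cgo2025-ae/plot.sh    # plot.py + heatmap.py results/kernels.all.csv
    ls /src/plots-cgo2025-ae/output/
    # expected: figure8.pdf  figure9_matmul_heatmap_M_1.pdf  figure10.pdf  table2.csv  table3.csv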