From 745b7cc04d299db459500e41cc640d26b2b63916 Mon Sep 17 00:00:00 2001 From: KeenanRileyFaulkner Date: Wed, 30 Oct 2024 16:40:33 -0600 Subject: [PATCH 01/22] added dataset processing on per-graph basis to bfasst --- bfasst/flows/analyze_dataset.py | 35 +++++ bfasst/flows/flow_descriptions.yaml | 8 +- bfasst/paths.py | 2 + bfasst/tools/dataset_metrics/graph_metrics.py | 44 ++++++ .../process_graph_build.ninja.mustache | 2 + .../process_graph_rules.ninja.mustache | 4 + bfasst/utils/process_graph.py | 148 ++++++++++++++++++ 7 files changed, 242 insertions(+), 1 deletion(-) create mode 100644 bfasst/flows/analyze_dataset.py create mode 100644 bfasst/tools/dataset_metrics/graph_metrics.py create mode 100644 bfasst/tools/dataset_metrics/process_graph_build.ninja.mustache create mode 100644 bfasst/tools/dataset_metrics/process_graph_rules.ninja.mustache create mode 100644 bfasst/utils/process_graph.py diff --git a/bfasst/flows/analyze_dataset.py b/bfasst/flows/analyze_dataset.py new file mode 100644 index 00000000..a07e5ccd --- /dev/null +++ b/bfasst/flows/analyze_dataset.py @@ -0,0 +1,35 @@ +"""Analyze dataset metrics.""" + +from pathlib import Path +from bfasst.flows.flow import Flow +from bfasst.paths import FLOWS_PATH +from bfasst.tools.dataset_metrics.graph_metrics import GraphMetrics + + +class AnalyzeDataset(Flow): + """Analyze dataset metrics.""" + + def __init__(self, design, dataset): + # pylint: disable=duplicate-code + super().__init__(design) + self.design = design + self.dataset = Path(dataset) + + self.graph_metrics_default_tool = GraphMetrics( + self, design, None, None + ) # only used for configuring ninja + # pylint: enable=duplicate-code + + def create_build_snippets(self): + # get the size of the dataset + directories = [x for x in self.dataset.iterdir() if x.is_dir()] + iterations = len(directories) + + for i in range(1, iterations + 1): + graph_metrics_tool = GraphMetrics( + self, self.design, directories[i - 1] / f"{directories[i-1].name}.dump", i + ) + graph_metrics_tool.create_build_snippets() + + def get_top_level_flow_path(self) -> str: + return FLOWS_PATH / "analyze_dataset.py" diff --git a/bfasst/flows/flow_descriptions.yaml b/bfasst/flows/flow_descriptions.yaml index 53282896..a10cb3ad 100644 --- a/bfasst/flows/flow_descriptions.yaml +++ b/bfasst/flows/flow_descriptions.yaml @@ -156,4 +156,10 @@ flows: class: OpenTitan external_tools: - vivado - - opentitan \ No newline at end of file + - opentitan + +- name: AnalyzeDataset + description: Compute Metrics on an FPGA Circuit dataset for GNNs. 
+ module: analyze_dataset + class: AnalyzeDataset + \ No newline at end of file diff --git a/bfasst/paths.py b/bfasst/paths.py index 9dbeb27f..db5fe500 100644 --- a/bfasst/paths.py +++ b/bfasst/paths.py @@ -20,6 +20,8 @@ COMMON_TOOLS_PATH = TOOLS_PATH / "common" +DATASET_METRICS_TOOLS_PATH = TOOLS_PATH / "dataset_metrics" + REV_BIT_TOOLS_PATH = TOOLS_PATH / "rev_bit" NINJA_TRANSFORM_TOOLS_PATH = TOOLS_PATH / "transform" diff --git a/bfasst/tools/dataset_metrics/graph_metrics.py b/bfasst/tools/dataset_metrics/graph_metrics.py new file mode 100644 index 00000000..8768fa3c --- /dev/null +++ b/bfasst/tools/dataset_metrics/graph_metrics.py @@ -0,0 +1,44 @@ +"""Create the rule and build snippets for computing gnn dataset metrics.""" + +import chevron + +from bfasst.tools.tool import Tool +from bfasst.paths import NINJA_BUILD_PATH, DATASET_METRICS_TOOLS_PATH, BFASST_UTILS_PATH + + +class GraphMetrics(Tool): + """Create the rule and build snippets for computing gnn dataset metrics + .""" + + def __init__( + self, + flow, + design, + graph, + num, + ): + super().__init__(flow, design) + self.graph = graph + self.num = num + self.build_path = self.design_build_path / "dataset_metrics" + self.metrics_path = self.build_path / f"metrics_{num}.log" + + self._init_outputs() + self.rule_snippet_path = DATASET_METRICS_TOOLS_PATH / "process_graph_rules.ninja.mustache" + + def create_build_snippets(self): + with open(DATASET_METRICS_TOOLS_PATH / "process_graph_build.ninja.mustache", "r") as f: + build = chevron.render( + f, + {"output": self.metrics_path, "graph": self.graph}, + ) + + with open(NINJA_BUILD_PATH, "a") as f: + f.write(build) + + def _init_outputs(self): + self.outputs["metrics_path"] = self.metrics_path + + def add_ninja_deps(self, deps): + self._add_ninja_deps_default(deps, __file__) + deps.append(BFASST_UTILS_PATH / "process_graph.py") diff --git a/bfasst/tools/dataset_metrics/process_graph_build.ninja.mustache b/bfasst/tools/dataset_metrics/process_graph_build.ninja.mustache new file mode 100644 index 00000000..92cf4887 --- /dev/null +++ b/bfasst/tools/dataset_metrics/process_graph_build.ninja.mustache @@ -0,0 +1,2 @@ +build {{ output }}: process_graph {{ graph }} + diff --git a/bfasst/tools/dataset_metrics/process_graph_rules.ninja.mustache b/bfasst/tools/dataset_metrics/process_graph_rules.ninja.mustache new file mode 100644 index 00000000..7bde2576 --- /dev/null +++ b/bfasst/tools/dataset_metrics/process_graph_rules.ninja.mustache @@ -0,0 +1,4 @@ +rule process_graph + command = python {{ bfasst_path }}/bfasst/utils/process_graph.py $in -o $out + description = compute metrics on $in and save them to $out + diff --git a/bfasst/utils/process_graph.py b/bfasst/utils/process_graph.py new file mode 100644 index 00000000..79453634 --- /dev/null +++ b/bfasst/utils/process_graph.py @@ -0,0 +1,148 @@ +"""Compute metrics on a single graph in a dataset.""" + +import argparse +from collections import defaultdict +import logging +import os +import json + +logger = logging.getLogger(__name__) + + +def main(): + """Load the graph, convert to adj_list, and compute metrics.""" + # ArgParse + parser = argparse.ArgumentParser(description="Compute metrics on a graph.") + parser.add_argument("graph", help="The graph to compute metrics on.") + parser.add_argument( + "-v", "--verbose", action="store_true", help="Enable debug logging." 
+ ) + parser.add_argument("-o", help="The name of the output file to create") + args = parser.parse_args() + + # Logging (for debug, don't use in parallel) + logging.basicConfig( + level=logging.DEBUG if args.verbose else logging.INFO, + format="%(asctime)s - %(levelname)s - %(message)s", + ) + + component_nodes, component_edges = load_graph(args.graph) + + adj_lists = convert_to_adj_list(component_nodes, component_edges) + + # Compute metrics for each component + metrics_per_ip = compute_metrics_per_ip(adj_lists, args) + + # write metrics to a file + output = args.o if args.o else "metrics.log" + with open(os.path.abspath(output), "w") as f: + f.write(json.dumps(metrics_per_ip)) + + +def load_graph(graph): + """Load a graph from a file.""" + graph_path = os.path.abspath(graph) + + component_nodes = defaultdict(list) # {ip_inst: [node1, node2, ...]} + component_edges = defaultdict(list) # {ip_inst: [(node1, node2), ...]} + section = None # track the section: nodes or edges + + with open(graph_path, "r") as f: + for line in f: + line = line.strip() + + # Detect the beginning of a section + if line.startswith("(("): + if section is None: + section = "nodes" + else: + section = "edges" + line = line[1:].strip() # Remove the opening '(' + + # Detect the end of a section + if line == ")": + continue + + if not line: + continue # Skip empty lines + + if section == "nodes": + parts = line.replace('"', "").split() + node_id, label = parts[0], parts[2] + node_id = node_id.replace("(", "") + if "ip" not in label: + label = "fabric" + component_nodes[label].append(node_id) + + elif section == "edges": + node1, node2 = line.replace('"', "").replace("(", "").replace(")", "").split() + # get the label for both nodes + node1_label = find_label(node1, component_nodes) + node2_label = find_label(node2, component_nodes) + if node1_label == node2_label: + component_edges[node1_label].append((node1, node2)) + + return component_nodes, component_edges + + +def find_label(node, component_nodes): + """Find the label for a node.""" + for label, nodes in component_nodes.items(): + if node in nodes: + return label + return None + + +def convert_to_adj_list(component_nodes, component_edges): + """Convert the graph to adjacency lists.""" + adj_lists = {} + for label, nodes in component_nodes.items(): + adj_lists[label] = {} + for node in nodes: + adj_lists[label][node] = [] + + for label, edges in component_edges.items(): + for node1, node2 in edges: + adj_lists[label][node1].append(node2) + adj_lists[label][node2].append(node1) + + return adj_lists + + +def compute_metrics_per_ip(adj_lists, args): + metrics_per_ip = {} + for label, adj_list in adj_lists.items(): + + # set up default entries + ip = get_ip_name_from_label(label) + if ip not in metrics_per_ip: + metrics_per_ip[ip] = {"order": [], "size": []} + + # Order + metrics_per_ip[ip]["order"].append(len(adj_list)) + + # Size + edge_count = 0 + for node in adj_list: + for neighbor in adj_list[node]: + edge_count += 1 + edge_count = edge_count // 2 + metrics_per_ip[ip]["size"].append(edge_count) + + # Debug (verbose flag only) + logger.debug(f"IP: {ip}") + logger.debug(f"Component: {label}") + logger.debug(f"Nodes: {len(adj_list)}") + logger.debug(f"Edges: {edge_count}") + logger.debug("") + + return metrics_per_ip + + +def get_ip_name_from_label(label): + ip_name = ("_").join(label.split("_")[2:]) + return ip_name if ip_name else label + + +if __name__ == "__main__": + main() From c7876bb0165d120d2d9181fb540bb5e82d456873 Mon Sep 17 00:00:00 2001 From: 
KeenanRileyFaulkner Date: Wed, 30 Oct 2024 16:47:49 -0600 Subject: [PATCH 02/22] minor format fix --- bfasst/flows/analyze_dataset.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bfasst/flows/analyze_dataset.py b/bfasst/flows/analyze_dataset.py index a07e5ccd..5a78696f 100644 --- a/bfasst/flows/analyze_dataset.py +++ b/bfasst/flows/analyze_dataset.py @@ -26,8 +26,9 @@ def create_build_snippets(self): iterations = len(directories) for i in range(1, iterations + 1): + num = int(directories[i - 1].name.split("_")[-1]) graph_metrics_tool = GraphMetrics( - self, self.design, directories[i - 1] / f"{directories[i-1].name}.dump", i + self, self.design, directories[i - 1] / f"{directories[i - 1].name}.dump", num ) graph_metrics_tool.create_build_snippets() From fc704b116d03d3379d51d644dd7d28ba7f79809c Mon Sep 17 00:00:00 2001 From: KeenanRileyFaulkner Date: Wed, 30 Oct 2024 17:20:11 -0600 Subject: [PATCH 03/22] Added basics for accumulation of graph metrics --- bfasst/utils/accumulate_metrics.py | 100 +++++++++++++++++++++++++++++ 1 file changed, 100 insertions(+) create mode 100644 bfasst/utils/accumulate_metrics.py diff --git a/bfasst/utils/accumulate_metrics.py b/bfasst/utils/accumulate_metrics.py new file mode 100644 index 00000000..2613623b --- /dev/null +++ b/bfasst/utils/accumulate_metrics.py @@ -0,0 +1,100 @@ +"""Accumulate metrics from graphs in a dataset after computing them for all graphs""" + +import argparse +import logging +import json +from pathlib import Path +import statistics + +logger = logging.getLogger(__name__) + + +def main(): + """Load the graph, convert to adj_list, and compute metrics.""" + # ArgParse + parser = argparse.ArgumentParser(description="Compute metrics on a graph.") + parser.add_argument( + "analysis_dir", help="The path to the folder containing all analysis files for all graphs." 
+ ) + parser.add_argument("-v", "--verbose", action="store_true", help="Enable debug logging.") + parser.add_argument("-o", help="The name of the output file to create") + args = parser.parse_args() + + # Logging (for debug, don't use in parallel) + logging.basicConfig( + level=logging.DEBUG if args.verbose else logging.INFO, + format="%(asctime)s - %(levelname)s - %(message)s", + ) + + # Initialize the master dictionary + master_metrics = {} + + # Iterate through the files in the analysis directory + for file in Path(args.analysis_dir).iterdir(): + if file.is_dir(): + continue + + with open(file, "r") as f: + graph_metrics = json.loads(f.readline()) + + for ip, metrics in graph_metrics.items(): + # Initialize the IP entry in the master dictionary if it doesn't exist + if ip not in master_metrics: + master_metrics[ip] = {} + + for metric, values in metrics.items(): + # Initialize the metric entry if it doesn't exist + if metric not in master_metrics[ip]: + master_metrics[ip][metric] = [] + + # Concatenate the lists + master_metrics[ip][metric].extend(values) + + # sort the values for each metric after merging + for ip in master_metrics: + for metric in master_metrics[ip]: + master_metrics[ip][metric] = sorted(master_metrics[ip][metric]) + + # Compute the stats for each metric + stats_summary = {} + for ip, metrics in master_metrics.items(): + for metric, values in metrics.items(): + # Calculate statistics + if values: # Check if the list is not empty + min_val, Q1, median, Q3, max_val = five_number_summary(values) + mean = sum(values) / len(values) + stddev = statistics.stdev(values) if len(values) > 1 else 0.0 + + # Prepare the summary dictionary + if ip not in stats_summary: + stats_summary[ip] = {} + + stats_summary[ip][metric] = { + "min": min_val, + "Q1": Q1, + "median": median, + "Q3": Q3, + "max": max_val, + "mean": mean, + "stddev": stddev, + } + + for k, v in master_metrics.items(): + logger.debug(k + ": " + str(v)) + + for k, v in stats_summary.items(): + logger.debug(k + ": " + str(v)) + + +def five_number_summary(data): + n = len(data) + min_val = data[0] + max_val = data[-1] + Q1 = data[n // 4] + median = data[n // 2] + Q3 = data[(3 * n) // 4] + return min_val, Q1, median, Q3, max_val + + +if __name__ == "__main__": + main() From 75883bd727dd134548368e323f5eeff93443cb6d Mon Sep 17 00:00:00 2001 From: KeenanRileyFaulkner Date: Wed, 30 Oct 2024 17:27:27 -0600 Subject: [PATCH 04/22] updated accumulation script to write to file --- bfasst/utils/accumulate_metrics.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/bfasst/utils/accumulate_metrics.py b/bfasst/utils/accumulate_metrics.py index 2613623b..c1f63c53 100644 --- a/bfasst/utils/accumulate_metrics.py +++ b/bfasst/utils/accumulate_metrics.py @@ -17,7 +17,10 @@ def main(): "analysis_dir", help="The path to the folder containing all analysis files for all graphs." 
) parser.add_argument("-v", "--verbose", action="store_true", help="Enable debug logging.") - parser.add_argument("-o", help="The name of the output file to create") + parser.add_argument("-m", help="The name of the metrics file to create") + parser.add_argument( + "-s", help="The name of the stats (5-num summary, mean, stddev) file to create" + ) args = parser.parse_args() # Logging (for debug, don't use in parallel) @@ -79,11 +82,14 @@ def main(): "stddev": stddev, } - for k, v in master_metrics.items(): - logger.debug(k + ": " + str(v)) + # write master_metrics to a file + output = args.m if args.m else "master_metrics.log" + with open(output, "w") as f: + f.write(json.dumps(master_metrics, indent=4)) - for k, v in stats_summary.items(): - logger.debug(k + ": " + str(v)) + output = args.s if args.s else "summary_statistics.log" + with open(output, "w") as f: + f.write(json.dumps(stats_summary, indent=4)) def five_number_summary(data): From aff18972878f8c8b3cb832369d90e998e5bb0ff7 Mon Sep 17 00:00:00 2001 From: KeenanRileyFaulkner Date: Thu, 31 Oct 2024 13:43:51 -0600 Subject: [PATCH 05/22] Added accumulation of metrics --- bfasst/flows/analyze_dataset.py | 11 +++-- .../dataset_metrics/accumulate_metrics.py | 45 +++++++++++++++++++ .../accumulate_metrics_build.ninja.mustache | 4 ++ .../accumulate_metrics_rules.ninja.mustache | 4 ++ bfasst/tools/dataset_metrics/graph_metrics.py | 3 +- 5 files changed, 62 insertions(+), 5 deletions(-) create mode 100644 bfasst/tools/dataset_metrics/accumulate_metrics.py create mode 100644 bfasst/tools/dataset_metrics/accumulate_metrics_build.ninja.mustache create mode 100644 bfasst/tools/dataset_metrics/accumulate_metrics_rules.ninja.mustache diff --git a/bfasst/flows/analyze_dataset.py b/bfasst/flows/analyze_dataset.py index 5a78696f..a4402695 100644 --- a/bfasst/flows/analyze_dataset.py +++ b/bfasst/flows/analyze_dataset.py @@ -3,6 +3,7 @@ from pathlib import Path from bfasst.flows.flow import Flow from bfasst.paths import FLOWS_PATH +from bfasst.tools.dataset_metrics.accumulate_metrics import AccumulateMetrics from bfasst.tools.dataset_metrics.graph_metrics import GraphMetrics @@ -15,22 +16,26 @@ def __init__(self, design, dataset): self.design = design self.dataset = Path(dataset) - self.graph_metrics_default_tool = GraphMetrics( - self, design, None, None - ) # only used for configuring ninja + # only used for configuring ninja rule snippets + self.graph_metrics_default_tool = GraphMetrics(self, design, None, None) + self.accumulate_metrics_tool = AccumulateMetrics(self, design, None) # pylint: enable=duplicate-code def create_build_snippets(self): # get the size of the dataset directories = [x for x in self.dataset.iterdir() if x.is_dir()] iterations = len(directories) + pieces = [] for i in range(1, iterations + 1): num = int(directories[i - 1].name.split("_")[-1]) graph_metrics_tool = GraphMetrics( self, self.design, directories[i - 1] / f"{directories[i - 1].name}.dump", num ) + pieces.append(graph_metrics_tool.metrics_path) graph_metrics_tool.create_build_snippets() + AccumulateMetrics(self, self.design, pieces).create_build_snippets() + def get_top_level_flow_path(self) -> str: return FLOWS_PATH / "analyze_dataset.py" diff --git a/bfasst/tools/dataset_metrics/accumulate_metrics.py b/bfasst/tools/dataset_metrics/accumulate_metrics.py new file mode 100644 index 00000000..713e8424 --- /dev/null +++ b/bfasst/tools/dataset_metrics/accumulate_metrics.py @@ -0,0 +1,45 @@ +"""Accumulate metrics from the graph_metrics tool.""" + +import chevron + +from 
bfasst.tools.tool import Tool +from bfasst.paths import NINJA_BUILD_PATH, DATASET_METRICS_TOOLS_PATH, BFASST_UTILS_PATH + + +class AccumulateMetrics(Tool): + """Accumulate metrics from the graph_metrics tool.""" + + def __init__(self, flow, design, pieces): + super().__init__(flow, design) + self.pieces = pieces + self.build_path = self.design_build_path / "dataset_metrics" + self.metrics_path = self.build_path / "master_metrics.log" + self.summary_stats = self.build_path / "summary_stats.log" + + self._init_outputs() + self.rule_snippet_path = ( + DATASET_METRICS_TOOLS_PATH / "accumulate_metrics_rules.ninja.mustache" + ) + + def create_build_snippets(self): + with open(DATASET_METRICS_TOOLS_PATH / "accumulate_metrics_build.ninja.mustache", "r") as f: + build = chevron.render( + f, + { + "metrics_file": self.metrics_path, + "summary_stats": self.summary_stats, + "aggregation_dir": self.build_path, + "pieces": self.pieces, + }, + ) + + with open(NINJA_BUILD_PATH, "a") as f: + f.write(build) + + def _init_outputs(self): + self.outputs["metrics_path"] = self.metrics_path + self.outputs["summary_stats"] = self.summary_stats + + def add_ninja_deps(self, deps): + self._add_ninja_deps_default(deps, __file__) + deps.append(BFASST_UTILS_PATH / "accumulate_metrics.py") diff --git a/bfasst/tools/dataset_metrics/accumulate_metrics_build.ninja.mustache b/bfasst/tools/dataset_metrics/accumulate_metrics_build.ninja.mustache new file mode 100644 index 00000000..2299f729 --- /dev/null +++ b/bfasst/tools/dataset_metrics/accumulate_metrics_build.ninja.mustache @@ -0,0 +1,4 @@ +build {{ metrics_file }} {{ summary_stats }}: accumulate_metrics {{ aggregation_dir }} | {{#pieces}}{{.}} {{/pieces}} + metrics_file = {{ metrics_file }} + summary_stats = {{ summary_stats }} + diff --git a/bfasst/tools/dataset_metrics/accumulate_metrics_rules.ninja.mustache b/bfasst/tools/dataset_metrics/accumulate_metrics_rules.ninja.mustache new file mode 100644 index 00000000..2454a7eb --- /dev/null +++ b/bfasst/tools/dataset_metrics/accumulate_metrics_rules.ninja.mustache @@ -0,0 +1,4 @@ +rule accumulate_metrics + command = python {{ bfasst_path }}/bfasst/utils/accumulate_metrics.py $in -m $metrics_file -s $summary_stats + description = accumulate metrics from $in to produce master_metrics and summary_stats files + diff --git a/bfasst/tools/dataset_metrics/graph_metrics.py b/bfasst/tools/dataset_metrics/graph_metrics.py index 8768fa3c..5a8311f7 100644 --- a/bfasst/tools/dataset_metrics/graph_metrics.py +++ b/bfasst/tools/dataset_metrics/graph_metrics.py @@ -7,8 +7,7 @@ class GraphMetrics(Tool): - """Create the rule and build snippets for computing gnn dataset metrics - .""" + """Create the rule and build snippets for computing gnn dataset metrics.""" def __init__( self, From 8155cfcfbaee1eac09abe8ddd102a416657e747e Mon Sep 17 00:00:00 2001 From: KeenanRileyFaulkner Date: Thu, 31 Oct 2024 13:52:57 -0600 Subject: [PATCH 06/22] refactored to use FlowNoDesign --- bfasst/flows/analyze_dataset.py | 30 ++++++++++++------- .../dataset_metrics/accumulate_metrics.py | 12 ++++---- bfasst/tools/dataset_metrics/graph_metrics.py | 11 ++++--- 3 files changed, 30 insertions(+), 23 deletions(-) diff --git a/bfasst/flows/analyze_dataset.py b/bfasst/flows/analyze_dataset.py index a4402695..399630c9 100644 --- a/bfasst/flows/analyze_dataset.py +++ b/bfasst/flows/analyze_dataset.py @@ -1,24 +1,24 @@ """Analyze dataset metrics.""" from pathlib import Path -from bfasst.flows.flow import Flow +import pathlib +from bfasst.flows.flow import FlowNoDesign 
from bfasst.paths import FLOWS_PATH from bfasst.tools.dataset_metrics.accumulate_metrics import AccumulateMetrics from bfasst.tools.dataset_metrics.graph_metrics import GraphMetrics -class AnalyzeDataset(Flow): +class AnalyzeDataset(FlowNoDesign): """Analyze dataset metrics.""" - def __init__(self, design, dataset): + def __init__(self, dataset): # pylint: disable=duplicate-code - super().__init__(design) - self.design = design + super().__init__() self.dataset = Path(dataset) # only used for configuring ninja rule snippets - self.graph_metrics_default_tool = GraphMetrics(self, design, None, None) - self.accumulate_metrics_tool = AccumulateMetrics(self, design, None) + self.graph_metrics_default_tool = GraphMetrics(self, None, None) + self.accumulate_metrics_tool = AccumulateMetrics(self, None) # pylint: enable=duplicate-code def create_build_snippets(self): @@ -30,12 +30,20 @@ def create_build_snippets(self): for i in range(1, iterations + 1): num = int(directories[i - 1].name.split("_")[-1]) graph_metrics_tool = GraphMetrics( - self, self.design, directories[i - 1] / f"{directories[i - 1].name}.dump", num + self, directories[i - 1] / f"{directories[i - 1].name}.dump", num ) pieces.append(graph_metrics_tool.metrics_path) graph_metrics_tool.create_build_snippets() - AccumulateMetrics(self, self.design, pieces).create_build_snippets() + AccumulateMetrics(self, pieces).create_build_snippets() - def get_top_level_flow_path(self) -> str: - return FLOWS_PATH / "analyze_dataset.py" + @classmethod + def flow_build_dir_name(cls) -> str: + """Get the name of the build directory for this flow""" + return "dataset_metrics" + + def add_ninja_deps(self, deps): + super().add_ninja_deps(deps) + + def get_top_level_flow_path(self): + return pathlib.Path(__file__).resolve() diff --git a/bfasst/tools/dataset_metrics/accumulate_metrics.py b/bfasst/tools/dataset_metrics/accumulate_metrics.py index 713e8424..74473b62 100644 --- a/bfasst/tools/dataset_metrics/accumulate_metrics.py +++ b/bfasst/tools/dataset_metrics/accumulate_metrics.py @@ -2,17 +2,17 @@ import chevron -from bfasst.tools.tool import Tool -from bfasst.paths import NINJA_BUILD_PATH, DATASET_METRICS_TOOLS_PATH, BFASST_UTILS_PATH +from bfasst.tools.tool import ToolBase +from bfasst.paths import BUILD_PATH, NINJA_BUILD_PATH, DATASET_METRICS_TOOLS_PATH, BFASST_UTILS_PATH -class AccumulateMetrics(Tool): +class AccumulateMetrics(ToolBase): """Accumulate metrics from the graph_metrics tool.""" - def __init__(self, flow, design, pieces): - super().__init__(flow, design) + def __init__(self, flow, pieces): + super().__init__(flow) self.pieces = pieces - self.build_path = self.design_build_path / "dataset_metrics" + self.build_path = BUILD_PATH / "dataset_metrics" self.metrics_path = self.build_path / "master_metrics.log" self.summary_stats = self.build_path / "summary_stats.log" diff --git a/bfasst/tools/dataset_metrics/graph_metrics.py b/bfasst/tools/dataset_metrics/graph_metrics.py index 5a8311f7..8e4917c4 100644 --- a/bfasst/tools/dataset_metrics/graph_metrics.py +++ b/bfasst/tools/dataset_metrics/graph_metrics.py @@ -2,24 +2,23 @@ import chevron -from bfasst.tools.tool import Tool -from bfasst.paths import NINJA_BUILD_PATH, DATASET_METRICS_TOOLS_PATH, BFASST_UTILS_PATH +from bfasst.tools.tool import ToolBase +from bfasst.paths import BUILD_PATH, NINJA_BUILD_PATH, DATASET_METRICS_TOOLS_PATH, BFASST_UTILS_PATH -class GraphMetrics(Tool): +class GraphMetrics(ToolBase): """Create the rule and build snippets for computing gnn dataset metrics.""" def 
__init__( self, flow, - design, graph, num, ): - super().__init__(flow, design) + super().__init__(flow) self.graph = graph self.num = num - self.build_path = self.design_build_path / "dataset_metrics" + self.build_path = BUILD_PATH / "dataset_metrics" self.metrics_path = self.build_path / f"metrics_{num}.log" self._init_outputs() From 604ac18dc7388c68333bf088db8f9048c1b7b121 Mon Sep 17 00:00:00 2001 From: KeenanRileyFaulkner Date: Thu, 31 Oct 2024 16:51:51 -0600 Subject: [PATCH 07/22] added diameter --- bfasst/utils/process_graph.py | 117 ++++++++++++++++++++++++++++++---- 1 file changed, 106 insertions(+), 11 deletions(-) diff --git a/bfasst/utils/process_graph.py b/bfasst/utils/process_graph.py index 79453634..818189a8 100644 --- a/bfasst/utils/process_graph.py +++ b/bfasst/utils/process_graph.py @@ -14,9 +14,7 @@ def main(): # ArgParse parser = argparse.ArgumentParser(description="Compute metrics on a graph.") parser.add_argument("graph", help="The graph to compute metrics on.") - parser.add_argument( - "-v", "--verbose", action="store_true", help="Enable debug logging." - ) + parser.add_argument("-v", "--verbose", action="store_true", help="Enable debug logging.") parser.add_argument("-o", help="The name of the output file to create") args = parser.parse_args() @@ -112,23 +110,26 @@ def convert_to_adj_list(component_nodes, component_edges): def compute_metrics_per_ip(adj_lists, args): metrics_per_ip = {} for label, adj_list in adj_lists.items(): - + # set up default entries ip = get_ip_name_from_label(label) if ip not in metrics_per_ip: - metrics_per_ip[ip] = {"order": [], "size": []} - + metrics_per_ip[ip] = {"order": [], "size": [], "degree": [], "diameter": []} + # Order metrics_per_ip[ip]["order"].append(len(adj_list)) # Size - edge_count = 0 - for node in adj_list: - for neighbor in adj_list[node]: - edge_count += 1 - edge_count = edge_count // 2 + edge_count = compute_size(adj_list) metrics_per_ip[ip]["size"].append(edge_count) + # Degree + avg_desgree = compute_average_degree(adj_list) + + # Diameter + avg_diameter = compute_average_diameter(adj_list) + metrics_per_ip[ip]["diameter"].append(avg_diameter) + # Debug (verbose flag only) logger.debug(f"IP: {ip}") logger.debug(f"Component: {label}") @@ -139,6 +140,100 @@ def compute_metrics_per_ip(adj_lists, args): return metrics_per_ip +def compute_size(adj_list): + edge_count = 0 + for node in adj_list: + for neighbor in adj_list[node]: + edge_count += 1 + return edge_count // 2 + + +def compute_average_diameter(adj_list): + uf = UnionFind() + + for u in adj_list: + for v in adj_list[u]: + uf.union(u, v) + + components = {} + for node in adj_list: + root = uf.find(node) + if root not in components: + components[root] = set() + components[root].add(node) + + diameters = [] + + for component in components.values(): + node = next(iter(component)) + u, _ = bfs_farthest(adj_list, node) + _, diameter = bfs_farthest(adj_list, u) + diameters.append(diameter) + + return sum(diameters) / len(diameters) if diameters else 0 + + +def compute_average_degree(adj_list): + degrees = [] + for node in adj_list: + degrees.append(len(adj_list[node])) + return sum(degrees) / len(degrees) if degrees else 0 + + +class UnionFind: + def __init__(self): + self.parent = {} + self.rank = {} + + def add(self, u): + if u not in self.parent: + self.parent[u] = u + self.rank[u] = 0 + + def find(self, u): + # Ensure u is in the union find + self.add(u) + + # Path compression + if self.parent[u] != u: + self.parent[u] = self.find(self.parent[u]) + return 
self.parent[u] + + def union(self, u, v): + self.add(u) + self.add(v) + pu, pv = self.find(u), self.find(v) + + if pv != pu: + if self.rank[pu] > self.rank[pv]: + self.parent[pv] = pu + elif self.rank[pv] > self.rank[pu]: + self.parent[pu] = pv + else: + self.parent[pv] = pu + self.rank[pu] += 1 + + +def bfs_farthest(adj_list, start_node): + queue = [(start_node, 0)] + visited = {start_node} + farthest_node = start_node + max_distance = 0 + + while queue: + node, distance = queue.pop(0) + if distance > max_distance: + max_distance = distance + farthest_node = node + + for neighbor in adj_list[node]: + if neighbor not in visited: + queue.append((neighbor, distance + 1)) + visited.add(neighbor) + + return farthest_node, max_distance + + def get_ip_name_from_label(label): ip_name = ("_").join(label.split("_")[2:]) return ip_name if ip_name else label From 1657e9c3be52b1c61b9207d27a12fa987dac9f93 Mon Sep 17 00:00:00 2001 From: KeenanRileyFaulkner Date: Thu, 31 Oct 2024 16:56:55 -0600 Subject: [PATCH 08/22] added degree --- bfasst/utils/process_graph.py | 1 + 1 file changed, 1 insertion(+) diff --git a/bfasst/utils/process_graph.py b/bfasst/utils/process_graph.py index 818189a8..50268c9e 100644 --- a/bfasst/utils/process_graph.py +++ b/bfasst/utils/process_graph.py @@ -125,6 +125,7 @@ def compute_metrics_per_ip(adj_lists, args): # Degree avg_desgree = compute_average_degree(adj_list) + metrics_per_ip[ip]["degree"].append(avg_desgree) # Diameter avg_diameter = compute_average_diameter(adj_list) From f28c449ef33141414a8e98eca7c007c8ab0ac374 Mon Sep 17 00:00:00 2001 From: KeenanRileyFaulkner Date: Thu, 31 Oct 2024 17:25:05 -0600 Subject: [PATCH 09/22] added kcore and global/local clustering coefficients --- bfasst/utils/process_graph.py | 109 +++++++++++++++++++++++++++++++++- 1 file changed, 107 insertions(+), 2 deletions(-) diff --git a/bfasst/utils/process_graph.py b/bfasst/utils/process_graph.py index 50268c9e..165a14c3 100644 --- a/bfasst/utils/process_graph.py +++ b/bfasst/utils/process_graph.py @@ -1,7 +1,7 @@ """Compute metrics on a single graph in a dataset.""" import argparse -from collections import defaultdict +from collections import defaultdict, deque import logging import os import json @@ -114,7 +114,15 @@ def compute_metrics_per_ip(adj_lists, args): # set up default entries ip = get_ip_name_from_label(label) if ip not in metrics_per_ip: - metrics_per_ip[ip] = {"order": [], "size": [], "degree": [], "diameter": []} + metrics_per_ip[ip] = { + "order": [], + "size": [], + "degree": [], + "diameter": [], + "kcore": [], + "clustering": [], + "local_clustering": [], + } # Order metrics_per_ip[ip]["order"].append(len(adj_list)) @@ -131,6 +139,18 @@ def compute_metrics_per_ip(adj_lists, args): avg_diameter = compute_average_diameter(adj_list) metrics_per_ip[ip]["diameter"].append(avg_diameter) + # K-core + max_k, _ = compute_k_core(adj_list) + metrics_per_ip[ip]["kcore"].append(max_k) + + # Global Clustering Coefficient + global_clustering = compute_global_clustering(adj_list) + metrics_per_ip[ip]["clustering"].append(global_clustering) + + # Local Clustering Coefficient + local_clustering = compute_local_clustering(adj_list) + metrics_per_ip[ip]["local_clustering"].append(local_clustering) + # Debug (verbose flag only) logger.debug(f"IP: {ip}") logger.debug(f"Component: {label}") @@ -235,6 +255,91 @@ def bfs_farthest(adj_list, start_node): return farthest_node, max_distance +def compute_k_core(adj_list): + degree = {node: len(neighbors) for node, neighbors in adj_list.items()} + max_k = 
0 + k_core_subgraph = {} + + k = 1 + while True: + queue = deque(node for node, d in degree.items() if d <= k) + + while queue: + node = queue.popleft() + for neighbor in adj_list[node]: + if degree[neighbor] >= k: + degree[neighbor] -= 1 + if degree[neighbor] < k: + queue.append(neighbor) + degree[node] = 0 + + k_core = { + node: {neighbor for neighbor in neighbors if degree[neighbor] >= k} + for node, neighbors in adj_list.items() + if degree[node] >= k + } + + if k_core: + k_core_subgraph = k_core + max_k = k + else: + break + + k += 1 + + return max_k, k_core_subgraph + + +def compute_global_clustering(adj_list): + closed_triplets = 0 + total_triplets = 0 + visited_pairs = set() + + for node in adj_list: + neighbors = set(adj_list[node]) + degree = len(neighbors) + + total_triplets += degree * (degree - 1) // 2 + + for neighbor in neighbors: + if (node, neighbor) in visited_pairs or (neighbor, node) in visited_pairs: + continue + + common_neighbors = neighbors.intersection(set(adj_list[neighbor])) + closed_triplets += len(common_neighbors) + visited_pairs.add((node, neighbor)) + + return (3 * closed_triplets) / total_triplets if total_triplets else 0 + + +def compute_local_clustering(adj_list): + local_clustering_coefficients = [] + + for node in adj_list: + neighbors = set(adj_list[node]) + degree = len(neighbors) + + if degree < 2: + local_clustering_coefficients.append(0) + continue + + closed_triplets = 0 + + for neighbor in neighbors: + common_neighbors = neighbors.intersection(set(adj_list[neighbor])) + closed_triplets += len(common_neighbors) + + local_clustering_coefficients.append( + (closed_triplets) / (degree * (degree - 1)) if degree > 1 else 0 + ) + + return ( + sum(local_clustering_coefficients) / len(local_clustering_coefficients) + if local_clustering_coefficients + else 0 + ) + + def get_ip_name_from_label(label): ip_name = ("_").join(label.split("_")[2:]) return ip_name if ip_name else label From d01031fbe2feb5b1772ce2aa457287f847d6c00d Mon Sep 17 00:00:00 2001 From: KeenanRileyFaulkner Date: Thu, 31 Oct 2024 17:26:37 -0600 Subject: [PATCH 10/22] updated names for clustering coefficients --- bfasst/utils/process_graph.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/bfasst/utils/process_graph.py b/bfasst/utils/process_graph.py index 165a14c3..4a4df071 100644 --- a/bfasst/utils/process_graph.py +++ b/bfasst/utils/process_graph.py @@ -120,8 +120,8 @@ def compute_metrics_per_ip(adj_lists, args): "degree": [], "diameter": [], "kcore": [], - "clustering": [], - "local_clustering": [], + "global_clustering_coeff": [], + "local_clustering_coeff": [], } # Order @@ -145,11 +145,11 @@ def compute_metrics_per_ip(adj_lists, args): # Global Clustering Coefficient global_clustering = compute_global_clustering(adj_list) - metrics_per_ip[ip]["clustering"].append(global_clustering) + metrics_per_ip[ip]["global_clustering_coeff"].append(global_clustering) # Local Clustering Coefficient local_clustering = compute_local_clustering(adj_list) - metrics_per_ip[ip]["local_clustering"].append(local_clustering) + metrics_per_ip[ip]["local_clustering_coeff"].append(local_clustering) # Debug (verbose flag only) logger.debug(f"IP: {ip}") From 0c4a6657bb4d2fc2e3fed853f5381a81ead28456 Mon Sep 17 00:00:00 2001 From: KeenanRileyFaulkner Date: Thu, 31 Oct 2024 17:35:06 -0600 Subject: [PATCH 11/22] added options on each metric so they can be turned off/on --- bfasst/utils/process_graph.py | 55 ++++++++++++++++++++++++++--------- 1 file changed, 42 insertions(+), 13 
deletions(-) diff --git a/bfasst/utils/process_graph.py b/bfasst/utils/process_graph.py index 4a4df071..5afea97d 100644 --- a/bfasst/utils/process_graph.py +++ b/bfasst/utils/process_graph.py @@ -16,6 +16,28 @@ def main(): parser.add_argument("graph", help="The graph to compute metrics on.") parser.add_argument("-v", "--verbose", action="store_true", help="Enable debug logging.") parser.add_argument("-o", help="The name of the output file to create") + + parser.add_argument("--order", action="store_true", help="Compute the order of the graph.") + parser.add_argument("--size", action="store_true", help="Compute the size of the graph.") + parser.add_argument( + "--degree", action="store_true", help="Compute the average degree of the graph." + ) + parser.add_argument( + "--diameter", action="store_true", help="Compute the average diameter of the graph." + ) + parser.add_argument("--kcore", action="store_true", help="Compute the k-core of the graph.") + parser.add_argument( + "--global_clustering_coeff", + action="store_true", + help="Compute the global clustering coefficient of the graph.", + ) + parser.add_argument( + "--local_clustering_coeff", + action="store_true", + help="Compute the local clustering coefficient of the graph.", + ) + parser.add_argument("--all", action="store_true", help="Compute all metrics.", default=True) + args = parser.parse_args() # Logging (for debug, don't use in parallel) @@ -125,31 +147,38 @@ def compute_metrics_per_ip(adj_lists, args): } # Order - metrics_per_ip[ip]["order"].append(len(adj_list)) + if args.all or args.order: + metrics_per_ip[ip]["order"].append(len(adj_list)) # Size - edge_count = compute_size(adj_list) - metrics_per_ip[ip]["size"].append(edge_count) + if args.all or args.size: + edge_count = compute_size(adj_list) + metrics_per_ip[ip]["size"].append(edge_count) # Degree - avg_desgree = compute_average_degree(adj_list) - metrics_per_ip[ip]["degree"].append(avg_desgree) + if args.all or args.degree: + avg_desgree = compute_average_degree(adj_list) + metrics_per_ip[ip]["degree"].append(avg_desgree) # Diameter - avg_diameter = compute_average_diameter(adj_list) - metrics_per_ip[ip]["diameter"].append(avg_diameter) + if args.all or args.diameter: + avg_diameter = compute_average_diameter(adj_list) + metrics_per_ip[ip]["diameter"].append(avg_diameter) # K-core - max_k, _ = compute_k_core(adj_list) - metrics_per_ip[ip]["kcore"].append(max_k) + if args.all or args.kcore: + max_k, _ = compute_k_core(adj_list) + metrics_per_ip[ip]["kcore"].append(max_k) # Global Clustering Coefficient - global_clustering = compute_global_clustering(adj_list) - metrics_per_ip[ip]["global_clustering_coeff"].append(global_clustering) + if args.all or args.global_clustering_coeff: + global_clustering = compute_global_clustering(adj_list) + metrics_per_ip[ip]["global_clustering_coeff"].append(global_clustering) # Local Clustering Coefficient - local_clustering = compute_local_clustering(adj_list) - metrics_per_ip[ip]["local_clustering_coeff"].append(local_clustering) + if args.all or args.local_clustering_coeff: + local_clustering = compute_local_clustering(adj_list) + metrics_per_ip[ip]["local_clustering_coeff"].append(local_clustering) # Debug (verbose flag only) logger.debug(f"IP: {ip}") From 80e3edaa6a186b134e4755f72d85d8a3142619ad Mon Sep 17 00:00:00 2001 From: KeenanRileyFaulkner Date: Thu, 31 Oct 2024 17:45:18 -0600 Subject: [PATCH 12/22] do not iterate over the summary or master metrics logs --- bfasst/utils/accumulate_metrics.py | 17 +++++++++++++---- 1 file 
changed, 13 insertions(+), 4 deletions(-) diff --git a/bfasst/utils/accumulate_metrics.py b/bfasst/utils/accumulate_metrics.py index c1f63c53..055fa13e 100644 --- a/bfasst/utils/accumulate_metrics.py +++ b/bfasst/utils/accumulate_metrics.py @@ -31,12 +31,23 @@ def main(): # Initialize the master dictionary master_metrics = {} + master_metrics_output = args.m if args.m else "master_metrics.log" + stats_summary_output = args.s if args.s else "summary_statistics.log" # Iterate through the files in the analysis directory for file in Path(args.analysis_dir).iterdir(): if file.is_dir(): continue + if ( + file.name == master_metrics_output + or file.name == stats_summary_output + # if these exist, don't read them even if master_metrics_output and stats_summary_output are different + or file.name == "master_metrics.log" + or file.name == "summary_statistics.log" + ): + continue + with open(file, "r") as f: graph_metrics = json.loads(f.readline()) @@ -83,12 +94,10 @@ def main(): } # write master_metrics to a file - output = args.m if args.m else "master_metrics.log" - with open(output, "w") as f: + with open(master_metrics_output, "w") as f: f.write(json.dumps(master_metrics, indent=4)) - output = args.s if args.s else "summary_statistics.log" - with open(output, "w") as f: + with open(stats_summary_output, "w") as f: f.write(json.dumps(stats_summary, indent=4)) From 705feda77d15152639b0d3841bab98c6f15cc141 Mon Sep 17 00:00:00 2001 From: KeenanRileyFaulkner Date: Thu, 31 Oct 2024 17:59:38 -0600 Subject: [PATCH 13/22] pylint --- bfasst/flows/analyze_dataset.py | 1 - bfasst/utils/accumulate_metrics.py | 109 ++++++++++++++++++----------- bfasst/utils/process_graph.py | 11 ++- 3 files changed, 78 insertions(+), 43 deletions(-) diff --git a/bfasst/flows/analyze_dataset.py b/bfasst/flows/analyze_dataset.py index 399630c9..b92bc258 100644 --- a/bfasst/flows/analyze_dataset.py +++ b/bfasst/flows/analyze_dataset.py @@ -3,7 +3,6 @@ from pathlib import Path import pathlib from bfasst.flows.flow import FlowNoDesign -from bfasst.paths import FLOWS_PATH from bfasst.tools.dataset_metrics.accumulate_metrics import AccumulateMetrics from bfasst.tools.dataset_metrics.graph_metrics import GraphMetrics diff --git a/bfasst/utils/accumulate_metrics.py b/bfasst/utils/accumulate_metrics.py index 055fa13e..0d457652 100644 --- a/bfasst/utils/accumulate_metrics.py +++ b/bfasst/utils/accumulate_metrics.py @@ -12,16 +12,7 @@ def main(): """Load the graph, convert to adj_list, and compute metrics.""" # ArgParse - parser = argparse.ArgumentParser(description="Compute metrics on a graph.") - parser.add_argument( - "analysis_dir", help="The path to the folder containing all analysis files for all graphs." 
- ) - parser.add_argument("-v", "--verbose", action="store_true", help="Enable debug logging.") - parser.add_argument("-m", help="The name of the metrics file to create") - parser.add_argument( - "-s", help="The name of the stats (5-num summary, mean, stddev) file to create" - ) - args = parser.parse_args() + args = get_args() # Logging (for debug, don't use in parallel) logging.basicConfig( @@ -30,21 +21,55 @@ def main(): ) # Initialize the master dictionary - master_metrics = {} master_metrics_output = args.m if args.m else "master_metrics.log" stats_summary_output = args.s if args.s else "summary_statistics.log" # Iterate through the files in the analysis directory - for file in Path(args.analysis_dir).iterdir(): + master_metrics = compute_master_metrics( + args.analysis_dir, master_metrics_output, stats_summary_output + ) + + # sort the values for each metric after merging + master_metrics = sort_metrics(master_metrics) + + # Compute the stats for each metric + stats_summary = get_stats_summary(master_metrics) + + # write master_metrics to a file + with open(master_metrics_output, "w") as f: + f.write(json.dumps(master_metrics, indent=4)) + + with open(stats_summary_output, "w") as f: + f.write(json.dumps(stats_summary, indent=4)) + + +def get_args(): + parser = argparse.ArgumentParser(description="Compute metrics on a graph.") + parser.add_argument( + "analysis_dir", help="The path to the folder containing all analysis files for all graphs." + ) + parser.add_argument("-v", "--verbose", action="store_true", help="Enable debug logging.") + parser.add_argument("-m", help="The name of the metrics file to create") + parser.add_argument( + "-s", help="The name of the stats (5-num summary, mean, stddev) file to create" + ) + return parser.parse_args() + + +def compute_master_metrics(analysis_dir, master_metrics_output, stats_summary_output): + master_metrics = {} + for file in Path(analysis_dir).iterdir(): if file.is_dir(): continue - if ( - file.name == master_metrics_output - or file.name == stats_summary_output - # if these exist, don't read them even if master_metrics_output and stats_summary_output are different - or file.name == "master_metrics.log" - or file.name == "summary_statistics.log" + if file.name in ( + master_metrics_output, + stats_summary_output, + # Skip the master_metrics and stats_summary files + # Even if the user has specified different names + # for this run + "master_metrics.log", + "summary_statistics.log", ): continue @@ -64,51 +89,53 @@ def main(): # Concatenate the lists master_metrics[ip][metric].extend(values) - # sort the values for each metric after merging - for ip in master_metrics: - for metric in master_metrics[ip]: - master_metrics[ip][metric] = sorted(master_metrics[ip][metric]) + return master_metrics - # Compute the stats for each metric - stats_summary = {} - for ip, metrics in master_metrics.items(): + +def sort_metrics(metrics): + """Sort the values for each metric in the dictionary.""" + for ip, _ in metrics.items(): + for metric in metrics[ip]: + metrics[ip][metric] = sorted(metrics[ip][metric]) + return metrics + + +def get_stats_summary(metrics): + summary = {} + for ip, metrics in metrics.items(): for metric, values in metrics.items(): # Calculate statistics if values: # Check if the list is not empty - min_val, Q1, median, Q3, max_val = five_number_summary(values) + min_val, first_quartile, median, third_quartile, max_val = five_number_summary( + values + ) mean = sum(values) / len(values) stddev = statistics.stdev(values) if len(values) > 1 
else 0.0 # Prepare the summary dictionary - if ip not in stats_summary: - stats_summary[ip] = {} + if ip not in summary: + summary[ip] = {} - stats_summary[ip][metric] = { + summary[ip][metric] = { "min": min_val, - "Q1": Q1, + "Q1": first_quartile, "median": median, - "Q3": Q3, + "Q3": third_quartile, "max": max_val, "mean": mean, "stddev": stddev, } - - # write master_metrics to a file - with open(master_metrics_output, "w") as f: - f.write(json.dumps(master_metrics, indent=4)) - - with open(stats_summary_output, "w") as f: - f.write(json.dumps(stats_summary, indent=4)) + return summary def five_number_summary(data): n = len(data) min_val = data[0] max_val = data[-1] - Q1 = data[n // 4] + first_quartile = data[n // 4] median = data[n // 2] - Q3 = data[(3 * n) // 4] - return min_val, Q1, median, Q3, max_val + third_quartile = data[(3 * n) // 4] + return min_val, first_quartile, median, third_quartile, max_val if __name__ == "__main__": diff --git a/bfasst/utils/process_graph.py b/bfasst/utils/process_graph.py index 5afea97d..4b55df46 100644 --- a/bfasst/utils/process_graph.py +++ b/bfasst/utils/process_graph.py @@ -130,6 +130,7 @@ def convert_to_adj_list(component_nodes, component_edges): def compute_metrics_per_ip(adj_lists, args): + """Compute metrics for each IP in the graph.""" metrics_per_ip = {} for label, adj_list in adj_lists.items(): @@ -193,7 +194,7 @@ def compute_metrics_per_ip(adj_lists, args): def compute_size(adj_list): edge_count = 0 for node in adj_list: - for neighbor in adj_list[node]: + for _ in adj_list[node]: edge_count += 1 return edge_count // 2 @@ -231,6 +232,8 @@ def compute_average_degree(adj_list): class UnionFind: + """Union-find data structure.""" + def __init__(self): self.parent = {} self.rank = {} @@ -241,6 +244,7 @@ def add(self, u): self.rank[u] = 0 def find(self, u): + """Find the parent of a node.""" # Ensure u is in the union find self.add(u) @@ -250,6 +254,7 @@ def find(self, u): return self.parent[u] def union(self, u, v): + """Union two nodes.""" self.add(u) self.add(v) pu, pv = self.find(u), self.find(v) @@ -265,6 +270,7 @@ def union(self, u, v): def bfs_farthest(adj_list, start_node): + """Breadth-first search to find the farthest node from a starting node.""" queue = [(start_node, 0)] visited = {start_node} farthest_node = start_node @@ -285,6 +291,7 @@ def bfs_farthest(adj_list, start_node): def compute_k_core(adj_list): + """Compute the k-core of a graph.""" degree = {node: len(neighbors) for node, neighbors in adj_list.items()} max_k = 0 k_core_subgraph = {} @@ -320,6 +327,7 @@ def compute_k_core(adj_list): def compute_global_clustering(adj_list): + """Compute the global clustering coefficient of a graph.""" closed_triplets = 0 total_triplets = 0 visited_pairs = set() @@ -342,6 +350,7 @@ def compute_global_clustering(adj_list): def compute_local_clustering(adj_list): + """Compute the local clustering coefficient of a graph.""" local_clustering_coefficients = [] for node in adj_list: From 2f77374ca485581c490991b5ea22f14486ad8103 Mon Sep 17 00:00:00 2001 From: KeenanRileyFaulkner Date: Thu, 31 Oct 2024 18:06:29 -0600 Subject: [PATCH 14/22] pylint --- bfasst/utils/accumulate_metrics.py | 8 ++++++-- bfasst/utils/process_graph.py | 9 +++++---- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/bfasst/utils/accumulate_metrics.py b/bfasst/utils/accumulate_metrics.py index 0d457652..3b163f00 100644 --- a/bfasst/utils/accumulate_metrics.py +++ b/bfasst/utils/accumulate_metrics.py @@ -44,6 +44,7 @@ def main(): def get_args(): + """Get the 
command line arguments.""" parser = argparse.ArgumentParser(description="Compute metrics on a graph.") parser.add_argument( "analysis_dir", help="The path to the folder containing all analysis files for all graphs." @@ -57,6 +58,7 @@ def get_args(): def compute_master_metrics(analysis_dir, master_metrics_output, stats_summary_output): + """Compute the master metrics from the analysis directory.""" master_metrics = {} for file in Path(analysis_dir).iterdir(): if file.is_dir(): @@ -100,9 +102,10 @@ def sort_metrics(metrics): return metrics -def get_stats_summary(metrics): +def get_stats_summary(master_metrics): + """Compute the 5-number summary, mean, and standard deviation for each metric.""" summary = {} - for ip, metrics in metrics.items(): + for ip, metrics in master_metrics.items(): for metric, values in metrics.items(): # Calculate statistics if values: # Check if the list is not empty @@ -129,6 +132,7 @@ def get_stats_summary(metrics): def five_number_summary(data): + """Compute the 5-number summary for the given data.""" n = len(data) min_val = data[0] max_val = data[-1] diff --git a/bfasst/utils/process_graph.py b/bfasst/utils/process_graph.py index 4b55df46..1170788d 100644 --- a/bfasst/utils/process_graph.py +++ b/bfasst/utils/process_graph.py @@ -182,10 +182,10 @@ def compute_metrics_per_ip(adj_lists, args): metrics_per_ip[ip]["local_clustering_coeff"].append(local_clustering) # Debug (verbose flag only) - logger.debug(f"IP: {ip}") - logger.debug(f"Component: {label}") - logger.debug(f"Nodes: {len(adj_list)}") - logger.debug(f"Edges: {edge_count}") + logger.debug("IP: %s", ip) + logger.debug("Component: %s", label) + logger.debug("Nodes: %s", len(adj_list)) + logger.debug("Edges: %s", edge_count) logger.debug("") return metrics_per_ip @@ -200,6 +200,7 @@ def compute_size(adj_list): def compute_average_diameter(adj_list): + """Compute the average diameter of a graph.""" uf = UnionFind() for u in adj_list: From 76dbad651dfb3e8e4291aae58893956145646807 Mon Sep 17 00:00:00 2001 From: KeenanRileyFaulkner Date: Mon, 4 Nov 2024 14:35:53 -0700 Subject: [PATCH 15/22] removed kcore and local clustering --- bfasst/utils/process_graph.py | 83 ----------------------------------- 1 file changed, 83 deletions(-) diff --git a/bfasst/utils/process_graph.py b/bfasst/utils/process_graph.py index 1170788d..ee1853ee 100644 --- a/bfasst/utils/process_graph.py +++ b/bfasst/utils/process_graph.py @@ -25,17 +25,11 @@ def main(): parser.add_argument( "--diameter", action="store_true", help="Compute the average diameter of the graph." 
) - parser.add_argument("--kcore", action="store_true", help="Compute the k-core of the graph.") parser.add_argument( "--global_clustering_coeff", action="store_true", help="Compute the global clustering coefficient of the graph.", ) - parser.add_argument( - "--local_clustering_coeff", - action="store_true", - help="Compute the local clustering coefficient of the graph.", - ) parser.add_argument("--all", action="store_true", help="Compute all metrics.", default=True) args = parser.parse_args() @@ -142,9 +136,7 @@ def compute_metrics_per_ip(adj_lists, args): "size": [], "degree": [], "diameter": [], - "kcore": [], "global_clustering_coeff": [], - "local_clustering_coeff": [], } # Order @@ -166,21 +158,11 @@ def compute_metrics_per_ip(adj_lists, args): avg_diameter = compute_average_diameter(adj_list) metrics_per_ip[ip]["diameter"].append(avg_diameter) - # K-core - if args.all or args.kcore: - max_k, _ = compute_k_core(adj_list) - metrics_per_ip[ip]["kcore"].append(max_k) - # Global Clustering Coefficient if args.all or args.global_clustering_coeff: global_clustering = compute_global_clustering(adj_list) metrics_per_ip[ip]["global_clustering_coeff"].append(global_clustering) - # Local Clustering Coefficient - if args.all or args.local_clustering_coeff: - local_clustering = compute_local_clustering(adj_list) - metrics_per_ip[ip]["local_clustering_coeff"].append(local_clustering) - # Debug (verbose flag only) logger.debug("IP: %s", ip) logger.debug("Component: %s", label) @@ -291,42 +273,6 @@ def bfs_farthest(adj_list, start_node): return farthest_node, max_distance -def compute_k_core(adj_list): - """Compute the k-core of a graph.""" - degree = {node: len(neighbors) for node, neighbors in adj_list.items()} - max_k = 0 - k_core_subgraph = {} - - k = 1 - while True: - queue = deque(node for node, d in degree.items() if d <= k) - - while queue: - node = queue.popleft() - for neighbor in adj_list[node]: - if degree[neighbor] >= k: - degree[neighbor] -= 1 - if degree[neighbor] < k: - queue.append(neighbor) - degree[node] = 0 - - k_core = { - node: {neighbor for neighbor in neighbors if degree[neighbor] >= k} - for node, neighbors in adj_list.items() - if degree[node] >= k - } - - if k_core: - k_core_subgraph = k_core - max_k = k - else: - break - - k += 1 - - return max_k, k_core_subgraph - - def compute_global_clustering(adj_list): """Compute the global clustering coefficient of a graph.""" closed_triplets = 0 @@ -350,35 +296,6 @@ def compute_global_clustering(adj_list): return (3 * closed_triplets) / total_triplets if total_triplets else 0 -def compute_local_clustering(adj_list): - """Compute the local clustering coefficient of a graph.""" - local_clustering_coefficients = [] - - for node in adj_list: - neighbors = set(adj_list[node]) - degree = len(neighbors) - - if degree < 2: - local_clustering_coefficients.append(0) - continue - - closed_triplets = 0 - - for neighbor in neighbors: - common_neighbors = neighbors.intersection(set(adj_list[neighbor])) - closed_triplets += len(common_neighbors) - - local_clustering_coefficients.append( - (closed_triplets) / (degree * (degree - 1)) if degree > 1 else 0 - ) - - return ( - sum(local_clustering_coefficients) / len(local_clustering_coefficients) - if local_clustering_coefficients - else 0 - ) - - def get_ip_name_from_label(label): ip_name = ("_").join(label.split("_")[2:]) return ip_name if ip_name else label From 5961870d4bd91b3496841ccb56d69170b21c5410 Mon Sep 17 00:00:00 2001 From: KeenanRileyFaulkner Date: Mon, 18 Nov 2024 12:31:30 -0700 Subject: 
[PATCH 16/22] added utility scripts as deps to dataset_metrics tools --- bfasst/tools/dataset_metrics/accumulate_metrics.py | 1 + .../dataset_metrics/accumulate_metrics_build.ninja.mustache | 2 +- bfasst/tools/dataset_metrics/graph_metrics.py | 6 +++++- .../dataset_metrics/process_graph_build.ninja.mustache | 2 +- 4 files changed, 8 insertions(+), 3 deletions(-) diff --git a/bfasst/tools/dataset_metrics/accumulate_metrics.py b/bfasst/tools/dataset_metrics/accumulate_metrics.py index 74473b62..7869749c 100644 --- a/bfasst/tools/dataset_metrics/accumulate_metrics.py +++ b/bfasst/tools/dataset_metrics/accumulate_metrics.py @@ -30,6 +30,7 @@ def create_build_snippets(self): "summary_stats": self.summary_stats, "aggregation_dir": self.build_path, "pieces": self.pieces, + "accumulate_metrics_util": BFASST_UTILS_PATH / "accumulate_metrics.py", }, ) diff --git a/bfasst/tools/dataset_metrics/accumulate_metrics_build.ninja.mustache b/bfasst/tools/dataset_metrics/accumulate_metrics_build.ninja.mustache index 2299f729..2b0b775c 100644 --- a/bfasst/tools/dataset_metrics/accumulate_metrics_build.ninja.mustache +++ b/bfasst/tools/dataset_metrics/accumulate_metrics_build.ninja.mustache @@ -1,4 +1,4 @@ -build {{ metrics_file }} {{ summary_stats }}: accumulate_metrics {{ aggregation_dir }} | {{#pieces}}{{.}} {{/pieces}} +build {{ metrics_file }} {{ summary_stats }}: accumulate_metrics {{ aggregation_dir }} | {{#pieces}}{{.}} {{/pieces}} {{ accumulate_metrics_util }} metrics_file = {{ metrics_file }} summary_stats = {{ summary_stats }} diff --git a/bfasst/tools/dataset_metrics/graph_metrics.py b/bfasst/tools/dataset_metrics/graph_metrics.py index 8e4917c4..f00ebf9c 100644 --- a/bfasst/tools/dataset_metrics/graph_metrics.py +++ b/bfasst/tools/dataset_metrics/graph_metrics.py @@ -28,7 +28,11 @@ def create_build_snippets(self): with open(DATASET_METRICS_TOOLS_PATH / "process_graph_build.ninja.mustache", "r") as f: build = chevron.render( f, - {"output": self.metrics_path, "graph": self.graph}, + { + "output": self.metrics_path, + "graph": self.graph, + "process_graph_util": BFASST_UTILS_PATH / "process_graph.py", + }, ) with open(NINJA_BUILD_PATH, "a") as f: diff --git a/bfasst/tools/dataset_metrics/process_graph_build.ninja.mustache b/bfasst/tools/dataset_metrics/process_graph_build.ninja.mustache index 92cf4887..954d06ab 100644 --- a/bfasst/tools/dataset_metrics/process_graph_build.ninja.mustache +++ b/bfasst/tools/dataset_metrics/process_graph_build.ninja.mustache @@ -1,2 +1,2 @@ -build {{ output }}: process_graph {{ graph }} +build {{ output }}: process_graph {{ graph }} | {{ process_graph_util }} From e9e09c65c5037a97796ecd754965e4a4048a7057 Mon Sep 17 00:00:00 2001 From: KeenanRileyFaulkner Date: Mon, 18 Nov 2024 12:46:48 -0700 Subject: [PATCH 17/22] updated scripts to work per-component and per-instance, updated summary stats file name --- bfasst/utils/accumulate_metrics.py | 4 +- bfasst/utils/process_graph.py | 118 ++++++++++++++++++++++++++--- 2 files changed, 109 insertions(+), 13 deletions(-) diff --git a/bfasst/utils/accumulate_metrics.py b/bfasst/utils/accumulate_metrics.py index 3b163f00..e2d8f45d 100644 --- a/bfasst/utils/accumulate_metrics.py +++ b/bfasst/utils/accumulate_metrics.py @@ -71,10 +71,12 @@ def compute_master_metrics(analysis_dir, master_metrics_output, stats_summary_ou # Even if the user has specified different names # for this run "master_metrics.log", - "summary_statistics.log", + "summary_stats.log", ): continue + logger.debug(f"Processing {file}") + with open(file, "r") as f: 
graph_metrics = json.loads(f.readline()) diff --git a/bfasst/utils/process_graph.py b/bfasst/utils/process_graph.py index ee1853ee..f916acf6 100644 --- a/bfasst/utils/process_graph.py +++ b/bfasst/utils/process_graph.py @@ -25,6 +25,9 @@ def main(): parser.add_argument( "--diameter", action="store_true", help="Compute the average diameter of the graph." ) + parser.add_argument( + "--component_count", action="store_true", help="Compute the number of components." + ) parser.add_argument( "--global_clustering_coeff", action="store_true", @@ -128,35 +131,61 @@ def compute_metrics_per_ip(adj_lists, args): metrics_per_ip = {} for label, adj_list in adj_lists.items(): + # Compute components + components = compute_components(adj_list) + # set up default entries ip = get_ip_name_from_label(label) if ip not in metrics_per_ip: metrics_per_ip[ip] = { - "order": [], - "size": [], - "degree": [], - "diameter": [], + "instance_order": [], + "component_orders": [], + "instance_size": [], + "component_sizes": [], + "avg_degree": [], + "avg_diameter": [], + "component_diameters": [], + "component_count": [], "global_clustering_coeff": [], } # Order if args.all or args.order: - metrics_per_ip[ip]["order"].append(len(adj_list)) + metrics_per_ip[ip]["instance_order"].append(len(adj_list)) + + # Component-wise order + if args.all or args.order: + component_orders = compute_component_orders(components) + metrics_per_ip[ip]["component_orders"].extend(component_orders) # Size if args.all or args.size: edge_count = compute_size(adj_list) - metrics_per_ip[ip]["size"].append(edge_count) + metrics_per_ip[ip]["instance_size"].append(edge_count) - # Degree + # Component-wise size + if args.all or args.size: + component_sizes = compute_component_sizes(components, adj_list) + metrics_per_ip[ip]["component_sizes"].extend(component_sizes) + + # Avg Degree if args.all or args.degree: avg_desgree = compute_average_degree(adj_list) - metrics_per_ip[ip]["degree"].append(avg_desgree) + metrics_per_ip[ip]["avg_degree"].append(avg_desgree) + + # Avg Diameter + if args.all or args.diameter: + avg_diameter = compute_average_diameter(components, adj_list) + metrics_per_ip[ip]["avg_diameter"].append(avg_diameter) - # Diameter + # Component Diameters if args.all or args.diameter: - avg_diameter = compute_average_diameter(adj_list) - metrics_per_ip[ip]["diameter"].append(avg_diameter) + component_diameters = compute_component_diameters(components, adj_list) + metrics_per_ip[ip]["component_diameters"].extend(component_diameters) + + # Component Count + if args.all or args.component_count: + metrics_per_ip[ip]["component_count"].append(len(components)) # Global Clustering Coefficient if args.all or args.global_clustering_coeff: @@ -173,6 +202,32 @@ def compute_metrics_per_ip(adj_lists, args): return metrics_per_ip +def compute_components(adj_list): + """Compute the components of a graph.""" + uf = UnionFind() + + for u in adj_list: + for v in adj_list[u]: + uf.union(u, v) + + components = {} + for node in adj_list: + root = uf.find(node) + if root not in components: + components[root] = set() + components[root].add(node) + + return components + + +def compute_component_orders(components): + """Compute the order of each component in a graph.""" + orders = [] + for component in components.values(): + orders.append(len(component)) + return orders + + def compute_size(adj_list): edge_count = 0 for node in adj_list: @@ -181,7 +236,20 @@ def compute_size(adj_list): return edge_count // 2 -def compute_average_diameter(adj_list): +def 
compute_component_sizes(components, adj_list): + """Compute the size of each component in a graph.""" + sizes = [] + for component in components.values(): + edge_count = 0 + for node in component: + for neighbor in adj_list[node]: + if neighbor in component: + edge_count += 1 + sizes.append(edge_count // 2) + return sizes + + +def compute_average_diameter(components, adj_list): """Compute the average diameter of a graph.""" uf = UnionFind() @@ -207,6 +275,17 @@ def compute_average_diameter(adj_list): return sum(diameters) / len(diameters) if diameters else 0 +def compute_component_diameters(components, adj_list): + """Compute the diameter of each component in a graph.""" + diameters = [] + for component in components.values(): + node = next(iter(component)) + u, _ = bfs_farthest(adj_list, node) + _, diameter = bfs_farthest(adj_list, u) + diameters.append(diameter) + return diameters + + def compute_average_degree(adj_list): degrees = [] for node in adj_list: @@ -301,5 +380,20 @@ def get_ip_name_from_label(label): return ip_name if ip_name else label +def run_test(): + adj_list = { + "A": ["B", "C"], + "B": ["A", "C"], + "C": ["A", "B"], + "D": ["E"], + "E": ["D"], + } + + components = compute_components(adj_list) + assert len(components) == 2 + logger.debug(components) + + if __name__ == "__main__": main() + run_test() From a55111d887bd436eb298c92c34d2eacc09d4c5f3 Mon Sep 17 00:00:00 2001 From: KeenanRileyFaulkner Date: Mon, 18 Nov 2024 12:50:54 -0700 Subject: [PATCH 18/22] pylint --- bfasst/utils/accumulate_metrics.py | 2 +- bfasst/utils/process_graph.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/bfasst/utils/accumulate_metrics.py b/bfasst/utils/accumulate_metrics.py index e2d8f45d..8461ffa7 100644 --- a/bfasst/utils/accumulate_metrics.py +++ b/bfasst/utils/accumulate_metrics.py @@ -75,7 +75,7 @@ def compute_master_metrics(analysis_dir, master_metrics_output, stats_summary_ou ): continue - logger.debug(f"Processing {file}") + logger.debug("Processing %s", file) with open(file, "r") as f: graph_metrics = json.loads(f.readline()) diff --git a/bfasst/utils/process_graph.py b/bfasst/utils/process_graph.py index f916acf6..1332b3be 100644 --- a/bfasst/utils/process_graph.py +++ b/bfasst/utils/process_graph.py @@ -1,7 +1,7 @@ """Compute metrics on a single graph in a dataset.""" import argparse -from collections import defaultdict, deque +from collections import defaultdict import logging import os import json @@ -381,6 +381,7 @@ def get_ip_name_from_label(label): def run_test(): + """Ensure union find works.""" adj_list = { "A": ["B", "C"], "B": ["A", "C"], From a1c3dd0ee7bf9aefe0b0b7986b66f2d1e259401a Mon Sep 17 00:00:00 2001 From: KeenanRileyFaulkner Date: Mon, 18 Nov 2024 14:53:49 -0700 Subject: [PATCH 19/22] added k core --- bfasst/utils/process_graph.py | 76 +++++++++++++++++++++++++++++++++-- 1 file changed, 73 insertions(+), 3 deletions(-) diff --git a/bfasst/utils/process_graph.py b/bfasst/utils/process_graph.py index 1332b3be..d53898af 100644 --- a/bfasst/utils/process_graph.py +++ b/bfasst/utils/process_graph.py @@ -1,7 +1,7 @@ """Compute metrics on a single graph in a dataset.""" import argparse -from collections import defaultdict +from collections import defaultdict, deque import logging import os import json @@ -33,6 +33,9 @@ def main(): action="store_true", help="Compute the global clustering coefficient of the graph.", ) + parser.add_argument( + "--k_core", action="store_true", help="Compute the maximal k-core of the graph." 
+ ) parser.add_argument("--all", action="store_true", help="Compute all metrics.", default=True) args = parser.parse_args() @@ -147,6 +150,7 @@ def compute_metrics_per_ip(adj_lists, args): "component_diameters": [], "component_count": [], "global_clustering_coeff": [], + "max_k_core": [], } # Order @@ -192,6 +196,11 @@ def compute_metrics_per_ip(adj_lists, args): global_clustering = compute_global_clustering(adj_list) metrics_per_ip[ip]["global_clustering_coeff"].append(global_clustering) + # K-Core + if args.all or args.k_core: + max_k, _ = compute_k_core(adj_list) + metrics_per_ip[ip]["max_k_core"].append(max_k) + # Debug (verbose flag only) logger.debug("IP: %s", ip) logger.debug("Component: %s", label) @@ -375,12 +384,50 @@ def compute_global_clustering(adj_list): return (3 * closed_triplets) / total_triplets if total_triplets else 0 +def compute_k_core(adj_list): + """Compute the k-core of a graph.""" + degree = {node: len(neighbors) for node, neighbors in adj_list.items()} + max_k = 0 + k_core_subgraph = {} + + k = 1 + while True: + queue = deque(node for node, d in degree.items() if d <= k) + + while queue: + node = queue.popleft() + for neighbor in adj_list[node]: + if degree[neighbor] >= k: + degree[neighbor] -= 1 + if degree[neighbor] < k: + queue.append(neighbor) + degree[node] = 0 + + k_core = { + node: {neighbor for neighbor in neighbors if degree[neighbor] >= k} + for node, neighbors in adj_list.items() + if degree[node] >= k + } + + if k_core: + k_core_subgraph = k_core + max_k = k + else: + if max_k != 0: + max_k += 1 + break + + k += 1 + + return max_k, k_core_subgraph + + def get_ip_name_from_label(label): ip_name = ("_").join(label.split("_")[2:]) return ip_name if ip_name else label -def run_test(): +def test_uf_components(): """Ensure union find works.""" adj_list = { "A": ["B", "C"], @@ -395,6 +442,29 @@ def run_test(): logger.debug(components) +def test_k_core(): + """Ensure k-core works.""" + adj_list = { + "A": ["B", "C", "D", "E"], + "B": ["A", "C", "D", "E"], + "C": ["A", "B", "D", "F"], + "D": ["A", "B", "C", "J"], + "E": ["A", "B", "F", "I"], + "F": ["C", "E", "G", "H"], + "G": ["F"], + "H": ["F"], + "I": ["E"], + "J": ["D", "K", "L"], + "K": ["J"], + "L": ["J"], + } + + max_k, k_core = compute_k_core(adj_list) + assert max_k == 3 # A, B, C, D is a 3-core + logger.debug(k_core) + + if __name__ == "__main__": main() - run_test() + test_uf_components() + test_k_core() From 36ebf50979789f22d62d99de6a01fa241ab0448f Mon Sep 17 00:00:00 2001 From: KeenanRileyFaulkner Date: Mon, 18 Nov 2024 15:05:41 -0700 Subject: [PATCH 20/22] make sure k core increments correctly --- bfasst/utils/process_graph.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/bfasst/utils/process_graph.py b/bfasst/utils/process_graph.py index d53898af..9ee12b9d 100644 --- a/bfasst/utils/process_graph.py +++ b/bfasst/utils/process_graph.py @@ -409,15 +409,13 @@ def compute_k_core(adj_list): if degree[node] >= k } + k += 1 if k_core: k_core_subgraph = k_core max_k = k else: - if max_k != 0: - max_k += 1 break - k += 1 return max_k, k_core_subgraph From 1bccb274b643c1b7b7d3768cc3e42ab5dc96ad1a Mon Sep 17 00:00:00 2001 From: KeenanRileyFaulkner Date: Mon, 9 Dec 2024 10:11:40 -0700 Subject: [PATCH 21/22] format --- bfasst/utils/process_graph.py | 1 - bfasst/utils/structural.py | 12 ++++++++---- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/bfasst/utils/process_graph.py b/bfasst/utils/process_graph.py index 9ee12b9d..b46375e2 100644 --- a/bfasst/utils/process_graph.py 
+++ b/bfasst/utils/process_graph.py @@ -416,7 +416,6 @@ def compute_k_core(adj_list): else: break - return max_k, k_core_subgraph diff --git a/bfasst/utils/structural.py b/bfasst/utils/structural.py index 65c1364f..6e6933bb 100644 --- a/bfasst/utils/structural.py +++ b/bfasst/utils/structural.py @@ -727,10 +727,14 @@ def check_for_potential_bram_mapping(self, instance_name: str) -> set[str]: if bram_do: assert named_instance.properties["DOB_REG"] == "0" - bram_a_only = named_instance.properties["RAM_MODE"] == '"TDP"' and { - None, - SdnInstanceWrapper.GND_PIN.net, - } >= {named_instance.get_pin("DOBDO", i).net for i in range(32)} + bram_a_only = ( + named_instance.properties["RAM_MODE"] == '"TDP"' + and { + None, + SdnInstanceWrapper.GND_PIN.net, + } + >= {named_instance.get_pin("DOBDO", i).net for i in range(32)} + ) if named_instance.cell_type.startswith("RAMB36E1"): # A15 is only connected to a non-const net when cascade is enabled From 726b49859e24b5ce4c7c5cc009a4f193ff8cf712 Mon Sep 17 00:00:00 2001 From: KeenanRileyFaulkner Date: Mon, 9 Dec 2024 10:14:43 -0700 Subject: [PATCH 22/22] revert changes --- bfasst/utils/structural.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/bfasst/utils/structural.py b/bfasst/utils/structural.py index 6e6933bb..65c1364f 100644 --- a/bfasst/utils/structural.py +++ b/bfasst/utils/structural.py @@ -727,14 +727,10 @@ def check_for_potential_bram_mapping(self, instance_name: str) -> set[str]: if bram_do: assert named_instance.properties["DOB_REG"] == "0" - bram_a_only = ( - named_instance.properties["RAM_MODE"] == '"TDP"' - and { - None, - SdnInstanceWrapper.GND_PIN.net, - } - >= {named_instance.get_pin("DOBDO", i).net for i in range(32)} - ) + bram_a_only = named_instance.properties["RAM_MODE"] == '"TDP"' and { + None, + SdnInstanceWrapper.GND_PIN.net, + } >= {named_instance.get_pin("DOBDO", i).net for i in range(32)} if named_instance.cell_type.startswith("RAMB36E1"): # A15 is only connected to a non-const net when cascade is enabled
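
For reference, the "maximal k-core" that patches 19 and 20 above compute and then fix the increment for is the largest k for which repeatedly peeling nodes of degree < k still leaves a non-empty subgraph. The following is a minimal standalone sketch of that textbook peeling loop over a plain undirected adjacency-list dict; the function name max_k_core is illustrative only and this is not the compute_k_core implementation from the patches.

    from collections import deque

    def max_k_core(adj):
        """Largest k whose k-core (subgraph in which every remaining node
        keeps degree >= k) is non-empty, for an undirected adjacency-list dict.
        Standalone sketch, independent of the bfasst compute_k_core code."""
        degree = {v: len(neighbors) for v, neighbors in adj.items()}
        alive = set(adj)
        k = 0
        while alive:
            k += 1
            # Peel every node whose remaining degree falls below k.
            queue = deque(v for v in alive if degree[v] < k)
            while queue:
                v = queue.popleft()
                if v not in alive:
                    continue  # already peeled via another neighbor
                alive.remove(v)
                for u in adj[v]:
                    if u in alive:
                        degree[u] -= 1
                        if degree[u] < k:
                            queue.append(u)
            if not alive:
                return k - 1  # the (k-1)-core was the last non-empty one
        return 0  # empty input graph

Run on the adjacency list used by test_k_core in patch 19, this sketch returns 3 (nodes A, B, C, and D form the 3-core), which is consistent with the assert max_k == 3 added there.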