-
Notifications
You must be signed in to change notification settings - Fork 5
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #514 from byuccl/dataset_metrics
Dataset metrics
- Loading branch information
Showing
11 changed files
with
778 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
"""Analyze dataset metrics.""" | ||
|
||
from pathlib import Path | ||
import pathlib | ||
from bfasst.flows.flow import FlowNoDesign | ||
from bfasst.tools.dataset_metrics.accumulate_metrics import AccumulateMetrics | ||
from bfasst.tools.dataset_metrics.graph_metrics import GraphMetrics | ||
|
||
|
||
class AnalyzeDataset(FlowNoDesign):
    """Flow that computes metrics for every graph in a dataset and aggregates them.

    Expects ``dataset`` to be a directory whose subdirectories each hold one
    graph dump named ``<dirname>.dump``, with the directory name ending in
    ``_<num>``.
    """

    def __init__(self, dataset):
        # pylint: disable=duplicate-code
        super().__init__()
        self.dataset = Path(dataset)

        # only used for configuring ninja rule snippets
        self.graph_metrics_default_tool = GraphMetrics(self, None, None)
        self.accumulate_metrics_tool = AccumulateMetrics(self, None)
        # pylint: enable=duplicate-code

    def create_build_snippets(self):
        """Emit one graph-metrics snippet per dataset entry, then one aggregation snippet."""
        directories = [x for x in self.dataset.iterdir() if x.is_dir()]
        pieces = []

        # Iterate the directories directly rather than indexing by position.
        for directory in directories:
            # Directory names end in "_<num>"; <num> keys the per-graph metrics file.
            num = int(directory.name.split("_")[-1])
            graph_metrics_tool = GraphMetrics(self, directory / f"{directory.name}.dump", num)
            pieces.append(graph_metrics_tool.metrics_path)
            graph_metrics_tool.create_build_snippets()

        AccumulateMetrics(self, pieces).create_build_snippets()

    @classmethod
    def flow_build_dir_name(cls) -> str:
        """Get the name of the build directory for this flow"""
        return "dataset_metrics"

    def add_ninja_deps(self, deps):
        """No deps beyond the defaults added by the base class."""
        super().add_ninja_deps(deps)

    def get_top_level_flow_path(self):
        """Return the absolute path of this flow's module file."""
        return pathlib.Path(__file__).resolve()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
"""Accumulate metrics from the graph_metrics tool.""" | ||
|
||
import chevron | ||
|
||
from bfasst.tools.tool import ToolBase | ||
from bfasst.paths import BUILD_PATH, NINJA_BUILD_PATH, DATASET_METRICS_TOOLS_PATH, BFASST_UTILS_PATH | ||
|
||
|
||
class AccumulateMetrics(ToolBase):
    """Accumulate metrics from the graph_metrics tool.

    Renders the ninja build snippet that merges the per-graph metrics files
    (``pieces``) into a master metrics file and a summary statistics file.
    """

    def __init__(self, flow, pieces):
        """flow: parent flow; pieces: per-graph metrics paths to aggregate
        (None when the tool is created only to configure rule snippets)."""
        super().__init__(flow)
        self.pieces = pieces
        self.build_path = BUILD_PATH / "dataset_metrics"
        self.metrics_path = self.build_path / "master_metrics.log"
        self.summary_stats = self.build_path / "summary_stats.log"

        self._init_outputs()
        self.rule_snippet_path = (
            DATASET_METRICS_TOOLS_PATH / "accumulate_metrics_rules.ninja.mustache"
        )

    def create_build_snippets(self):
        """Render the aggregation build snippet and append it to the ninja build file."""
        template = DATASET_METRICS_TOOLS_PATH / "accumulate_metrics_build.ninja.mustache"
        # Specify the encoding explicitly (pylint: unspecified-encoding).
        with open(template, "r", encoding="utf-8") as f:
            build = chevron.render(
                f,
                {
                    "metrics_file": self.metrics_path,
                    "summary_stats": self.summary_stats,
                    "aggregation_dir": self.build_path,
                    "pieces": self.pieces,
                    "accumulate_metrics_util": BFASST_UTILS_PATH / "accumulate_metrics.py",
                },
            )

        with open(NINJA_BUILD_PATH, "a", encoding="utf-8") as f:
            f.write(build)

    def _init_outputs(self):
        """Register this tool's outputs with the base class."""
        self.outputs["metrics_path"] = self.metrics_path
        self.outputs["summary_stats"] = self.summary_stats

    def add_ninja_deps(self, deps):
        """Add the default deps plus the accumulate_metrics utility script."""
        self._add_ninja_deps_default(deps, __file__)
        deps.append(BFASST_UTILS_PATH / "accumulate_metrics.py")
4 changes: 4 additions & 0 deletions
4
bfasst/tools/dataset_metrics/accumulate_metrics_build.ninja.mustache
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
build {{ metrics_file }} {{ summary_stats }}: accumulate_metrics {{ aggregation_dir }} | {{#pieces}}{{.}} {{/pieces}} {{ accumulate_metrics_util }} | ||
metrics_file = {{ metrics_file }} | ||
summary_stats = {{ summary_stats }} | ||
|
4 changes: 4 additions & 0 deletions
4
bfasst/tools/dataset_metrics/accumulate_metrics_rules.ninja.mustache
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
rule accumulate_metrics | ||
command = python {{ bfasst_path }}/bfasst/utils/accumulate_metrics.py $in -m $metrics_file -s $summary_stats | ||
description = accumulate metrics from $in to produce master_metrics and summary_stats files | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
"""Create the rule and build snippets for computing gnn dataset metrics.""" | ||
|
||
import chevron | ||
|
||
from bfasst.tools.tool import ToolBase | ||
from bfasst.paths import BUILD_PATH, NINJA_BUILD_PATH, DATASET_METRICS_TOOLS_PATH, BFASST_UTILS_PATH | ||
|
||
|
||
class GraphMetrics(ToolBase):
    """Create the rule and build snippets for computing gnn dataset metrics.

    Renders the ninja build snippet that runs process_graph.py on one graph
    dump and writes its metrics to ``metrics_<num>.log``.
    """

    def __init__(
        self,
        flow,
        graph,
        num,
    ):
        """flow: parent flow; graph: path to the graph dump file; num: index
        used to name this graph's metrics output file (None for rule-only use)."""
        super().__init__(flow)
        self.graph = graph
        self.num = num
        self.build_path = BUILD_PATH / "dataset_metrics"
        self.metrics_path = self.build_path / f"metrics_{num}.log"

        self._init_outputs()
        self.rule_snippet_path = DATASET_METRICS_TOOLS_PATH / "process_graph_rules.ninja.mustache"

    def create_build_snippets(self):
        """Render this graph's build snippet and append it to the ninja build file."""
        template = DATASET_METRICS_TOOLS_PATH / "process_graph_build.ninja.mustache"
        # Specify the encoding explicitly (pylint: unspecified-encoding).
        with open(template, "r", encoding="utf-8") as f:
            build = chevron.render(
                f,
                {
                    "output": self.metrics_path,
                    "graph": self.graph,
                    "process_graph_util": BFASST_UTILS_PATH / "process_graph.py",
                },
            )

        with open(NINJA_BUILD_PATH, "a", encoding="utf-8") as f:
            f.write(build)

    def _init_outputs(self):
        """Register this tool's outputs with the base class."""
        self.outputs["metrics_path"] = self.metrics_path

    def add_ninja_deps(self, deps):
        """Add the default deps plus the process_graph utility script."""
        self._add_ninja_deps_default(deps, __file__)
        deps.append(BFASST_UTILS_PATH / "process_graph.py")
2 changes: 2 additions & 0 deletions
2
bfasst/tools/dataset_metrics/process_graph_build.ninja.mustache
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
build {{ output }}: process_graph {{ graph }} | {{ process_graph_util }} | ||
|
4 changes: 4 additions & 0 deletions
4
bfasst/tools/dataset_metrics/process_graph_rules.ninja.mustache
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
rule process_graph | ||
command = python {{ bfasst_path }}/bfasst/utils/process_graph.py $in -o $out | ||
description = compute metrics on $in and save them to $out | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,148 @@ | ||
"""Accumulate metrics from graphs in a dataset after computing them for all graphs""" | ||
|
||
import argparse | ||
import logging | ||
import json | ||
from pathlib import Path | ||
import statistics | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
def main():
    """Aggregate per-graph metrics files into master-metrics and summary-stats files."""
    args = get_args()

    # Logging (for debug, don't use in parallel)
    logging.basicConfig(
        level=logging.DEBUG if args.verbose else logging.INFO,
        format="%(asctime)s - %(levelname)s - %(message)s",
    )

    # Fall back to default output names when not given on the command line.
    master_metrics_output = args.m or "master_metrics.log"
    stats_summary_output = args.s or "summary_statistics.log"

    # Merge every per-graph metrics file, then sort each metric's value list.
    master_metrics = sort_metrics(
        compute_master_metrics(args.analysis_dir, master_metrics_output, stats_summary_output)
    )

    # Five-number summary, mean, and stddev for every metric.
    stats_summary = get_stats_summary(master_metrics)

    with open(master_metrics_output, "w") as out:
        json.dump(master_metrics, out, indent=4)

    with open(stats_summary_output, "w") as out:
        json.dump(stats_summary, out, indent=4)
|
||
|
||
def get_args():
    """Build the argument parser and return the parsed command line arguments."""
    arg_parser = argparse.ArgumentParser(description="Compute metrics on a graph.")
    # Required positional: where the per-graph analysis files live.
    arg_parser.add_argument(
        "analysis_dir", help="The path to the folder containing all analysis files for all graphs."
    )
    # Optional flags: debug logging and custom output file names.
    arg_parser.add_argument("-v", "--verbose", action="store_true", help="Enable debug logging.")
    arg_parser.add_argument("-m", help="The name of the metrics file to create")
    arg_parser.add_argument(
        "-s", help="The name of the stats (5-num summary, mean, stddev) file to create"
    )
    return arg_parser.parse_args()
|
||
|
||
def compute_master_metrics(analysis_dir, master_metrics_output, stats_summary_output):
    """Merge the per-graph metrics files in analysis_dir into one dictionary.

    Each file is expected to hold one JSON line mapping ip -> {metric -> [values]};
    value lists for the same ip/metric are concatenated across files.
    """
    # Compare by basename so the output files are skipped even when the caller
    # passes them as paths (as the ninja rule does) rather than bare names.
    skip_names = {
        Path(master_metrics_output).name,
        Path(stats_summary_output).name,
        # Skip the master_metrics and stats_summary files
        # Even if the user has specified different names
        # for this run
        "master_metrics.log",
        "summary_stats.log",
    }

    master_metrics = {}
    for file in Path(analysis_dir).iterdir():
        if file.is_dir() or file.name in skip_names:
            continue

        # Same logger instance as the module-level "logger" (keyed on __name__).
        logging.getLogger(__name__).debug("Processing %s", file)

        # Each metrics file holds a single JSON line.
        with open(file, "r", encoding="utf-8") as f:
            graph_metrics = json.loads(f.readline())

        for ip, metrics in graph_metrics.items():
            # Initialize the IP entry in the master dictionary if it doesn't exist
            ip_entry = master_metrics.setdefault(ip, {})
            for metric, values in metrics.items():
                # Initialize the metric entry if needed, then concatenate the lists
                ip_entry.setdefault(metric, []).extend(values)

    return master_metrics
|
||
|
||
def sort_metrics(metrics):
    """Sort every metric's value list ascending, in place; return the dict."""
    for ip_metrics in metrics.values():
        for name in ip_metrics:
            ip_metrics[name] = sorted(ip_metrics[name])
    return metrics
|
||
|
||
def get_stats_summary(master_metrics):
    """Compute the 5-number summary, mean, and standard deviation for each metric."""
    summary = {}
    for ip, metrics in master_metrics.items():
        for metric, values in metrics.items():
            # Skip empty lists — there is nothing to summarize.
            if not values:
                continue

            low, q1, med, q3, high = five_number_summary(values)
            entry = summary.setdefault(ip, {})
            entry[metric] = {
                "min": low,
                "Q1": q1,
                "median": med,
                "Q3": q3,
                "max": high,
                "mean": sum(values) / len(values),
                # stdev needs at least two samples; report 0.0 otherwise.
                "stddev": statistics.stdev(values) if len(values) > 1 else 0.0,
            }
    return summary
|
||
|
||
def five_number_summary(data):
    """Return (min, Q1, median, Q3, max) of *data*, assumed sorted ascending."""
    n = len(data)
    # Index-based quartiles: elements at positions n//4, n//2, and 3n//4.
    return data[0], data[n // 4], data[n // 2], data[(3 * n) // 4], data[-1]
|
||
|
||
# Script entry point: python accumulate_metrics.py <analysis_dir> [-v] [-m FILE] [-s FILE]
if __name__ == "__main__":
    main()
Oops, something went wrong.