diff --git a/bfasst/flows/analyze_dataset.py b/bfasst/flows/analyze_dataset.py
index 399630c9..b92bc258 100644
--- a/bfasst/flows/analyze_dataset.py
+++ b/bfasst/flows/analyze_dataset.py
@@ -3,7 +3,6 @@
 from pathlib import Path
 import pathlib

 from bfasst.flows.flow import FlowNoDesign
-from bfasst.paths import FLOWS_PATH
 from bfasst.tools.dataset_metrics.accumulate_metrics import AccumulateMetrics
 from bfasst.tools.dataset_metrics.graph_metrics import GraphMetrics
diff --git a/bfasst/utils/accumulate_metrics.py b/bfasst/utils/accumulate_metrics.py
index 055fa13e..0d457652 100644
--- a/bfasst/utils/accumulate_metrics.py
+++ b/bfasst/utils/accumulate_metrics.py
@@ -12,16 +12,7 @@ def main():
     """Load the graph, convert to adj_list, and compute metrics."""

     # ArgParse
-    parser = argparse.ArgumentParser(description="Compute metrics on a graph.")
-    parser.add_argument(
-        "analysis_dir", help="The path to the folder containing all analysis files for all graphs."
-    )
-    parser.add_argument("-v", "--verbose", action="store_true", help="Enable debug logging.")
-    parser.add_argument("-m", help="The name of the metrics file to create")
-    parser.add_argument(
-        "-s", help="The name of the stats (5-num summary, mean, stddev) file to create"
-    )
-    args = parser.parse_args()
+    args = get_args()

     # Logging (for debug, don't use in parallel)
     logging.basicConfig(
@@ -30,21 +21,55 @@ def main():
     )

     # Initialize the master dictionary
-    master_metrics = {}
     master_metrics_output = args.m if args.m else "master_metrics.log"
     stats_summary_output = args.s if args.s else "summary_statistics.log"

     # Iterate through the files in the analysis directory
-    for file in Path(args.analysis_dir).iterdir():
+    master_metrics = compute_master_metrics(
+        args.analysis_dir, master_metrics_output, stats_summary_output
+    )
+
+    # sort the values for each metric after merging
+    master_metrics = sort_metrics(master_metrics)
+
+    # Compute the stats for each metric
+    stats_summary = get_stats_summary(master_metrics)
+
+    # write master_metrics to a file
+    with open(master_metrics_output, "w") as f:
+        f.write(json.dumps(master_metrics, indent=4))
+
+    with open(stats_summary_output, "w") as f:
+        f.write(json.dumps(stats_summary, indent=4))
+
+
+def get_args():
+    parser = argparse.ArgumentParser(description="Compute metrics on a graph.")
+    parser.add_argument(
+        "analysis_dir", help="The path to the folder containing all analysis files for all graphs."
+    )
+    parser.add_argument("-v", "--verbose", action="store_true", help="Enable debug logging.")
+    parser.add_argument("-m", help="The name of the metrics file to create")
+    parser.add_argument(
+        "-s", help="The name of the stats (5-num summary, mean, stddev) file to create"
+    )
+    return parser.parse_args()
+
+
+def compute_master_metrics(analysis_dir, master_metrics_output, stats_summary_output):
+    master_metrics = {}
+    for file in Path(analysis_dir).iterdir():
         if file.is_dir():
             continue

-        if (
-            file.name == master_metrics_output
-            or file.name == stats_summary_output
-            # if these exist, don't read them even if master_metrics_output and stats_summary_output are different
-            or file.name == "master_metrics.log"
-            or file.name == "summary_statistics.log"
+        if file.name in (
+            master_metrics_output,
+            stats_summary_output,
+            # Skip the master_metrics and stats_summary files
+            # Even if the user has specified different names
+            # for this run
+            "master_metrics.log",
+            "summary_statistics.log",
         ):
             continue

@@ -64,51 +89,53 @@ def main():
             # Concatenate the lists
             master_metrics[ip][metric].extend(values)

-    # sort the values for each metric after merging
-    for ip in master_metrics:
-        for metric in master_metrics[ip]:
-            master_metrics[ip][metric] = sorted(master_metrics[ip][metric])
+    return master_metrics

-    # Compute the stats for each metric
-    stats_summary = {}
-    for ip, metrics in master_metrics.items():
+
+def sort_metrics(metrics):
+    """Sort the values for each metric in the dictionary."""
+    for ip, _ in metrics.items():
+        for metric in metrics[ip]:
+            metrics[ip][metric] = sorted(metrics[ip][metric])
+    return metrics
+
+
+def get_stats_summary(metrics):
+    summary = {}
+    for ip, metrics in metrics.items():
         for metric, values in metrics.items():
             # Calculate statistics
             if values:  # Check if the list is not empty
-                min_val, Q1, median, Q3, max_val = five_number_summary(values)
+                min_val, first_quartile, median, third_quartile, max_val = five_number_summary(
+                    values
+                )
                 mean = sum(values) / len(values)
                 stddev = statistics.stdev(values) if len(values) > 1 else 0.0

                 # Prepare the summary dictionary
-                if ip not in stats_summary:
-                    stats_summary[ip] = {}
+                if ip not in summary:
+                    summary[ip] = {}

-                stats_summary[ip][metric] = {
+                summary[ip][metric] = {
                     "min": min_val,
-                    "Q1": Q1,
+                    "Q1": first_quartile,
                     "median": median,
-                    "Q3": Q3,
+                    "Q3": third_quartile,
                     "max": max_val,
                     "mean": mean,
                     "stddev": stddev,
                 }
-
-    # write master_metrics to a file
-    with open(master_metrics_output, "w") as f:
-        f.write(json.dumps(master_metrics, indent=4))
-
-    with open(stats_summary_output, "w") as f:
-        f.write(json.dumps(stats_summary, indent=4))
+    return summary


 def five_number_summary(data):
     n = len(data)
     min_val = data[0]
     max_val = data[-1]
-    Q1 = data[n // 4]
+    first_quartile = data[n // 4]
     median = data[n // 2]
-    Q3 = data[(3 * n) // 4]
-    return min_val, Q1, median, Q3, max_val
+    third_quartile = data[(3 * n) // 4]
+    return min_val, first_quartile, median, third_quartile, max_val


 if __name__ == "__main__":
diff --git a/bfasst/utils/process_graph.py b/bfasst/utils/process_graph.py
index 5afea97d..4b55df46 100644
--- a/bfasst/utils/process_graph.py
+++ b/bfasst/utils/process_graph.py
@@ -130,6 +130,7 @@ def convert_to_adj_list(component_nodes, component_edges):


 def compute_metrics_per_ip(adj_lists, args):
+    """Compute metrics for each IP in the graph."""
     metrics_per_ip = {}

     for label, adj_list in adj_lists.items():
@@ -193,7 +194,7 @@ def compute_metrics_per_ip(adj_lists, args):
 def compute_size(adj_list):
     edge_count = 0
     for node in adj_list:
-        for neighbor in adj_list[node]:
+        for _ in adj_list[node]:
             edge_count += 1
     return edge_count // 2

@@ -231,6 +232,8 @@ def compute_average_degree(adj_list):


 class UnionFind:
+    """Union-find data structure."""
+
     def __init__(self):
         self.parent = {}
         self.rank = {}
@@ -241,6 +244,7 @@ def add(self, u):
             self.rank[u] = 0

     def find(self, u):
+        """Find the parent of a node."""
         # Ensure u is in the union find
         self.add(u)

@@ -250,6 +254,7 @@ def find(self, u):
         return self.parent[u]

     def union(self, u, v):
+        """Union two nodes."""
         self.add(u)
         self.add(v)
         pu, pv = self.find(u), self.find(v)
@@ -265,6 +270,7 @@ def union(self, u, v):


 def bfs_farthest(adj_list, start_node):
+    """Breadth-first search to find the farthest node from a starting node."""
     queue = [(start_node, 0)]
     visited = {start_node}
     farthest_node = start_node
@@ -285,6 +291,7 @@ def bfs_farthest(adj_list, start_node):


 def compute_k_core(adj_list):
+    """Compute the k-core of a graph."""
     degree = {node: len(neighbors) for node, neighbors in adj_list.items()}
     max_k = 0
     k_core_subgraph = {}
@@ -320,6 +327,7 @@ def compute_k_core(adj_list):


 def compute_global_clustering(adj_list):
+    """Compute the global clustering coefficient of a graph."""
     closed_triplets = 0
     total_triplets = 0
     visited_pairs = set()
@@ -342,6 +350,7 @@ def compute_global_clustering(adj_list):


 def compute_local_clustering(adj_list):
+    """Compute the local clustering coefficient of a graph."""
     local_clustering_coefficients = []

     for node in adj_list:
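Note: five_number_summary (with the renamed first_quartile/third_quartile variables above) indexes directly into the list at n // 4, n // 2, and (3 * n) // 4, so it only gives meaningful quartiles when the values are already sorted, which is presumably why sort_metrics runs before get_stats_summary in the refactored main. A minimal standalone sketch of that behavior, using a made-up list of eight values purely for illustration:

def five_number_summary(data):
    # Index-based five-number summary; data must already be sorted.
    n = len(data)
    min_val = data[0]
    max_val = data[-1]
    first_quartile = data[n // 4]
    median = data[n // 2]
    third_quartile = data[(3 * n) // 4]
    return min_val, first_quartile, median, third_quartile, max_val


# Made-up example: eight sorted values, so n // 4 = 2, n // 2 = 4, (3 * n) // 4 = 6.
print(five_number_summary([1, 2, 3, 4, 5, 6, 7, 8]))  # (1, 3, 5, 7, 8)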