From a0ee952eb5b9722627e633c5dcaa73fddcd595c7 Mon Sep 17 00:00:00 2001 From: zslade Date: Thu, 25 Jan 2024 14:08:28 +0000 Subject: [PATCH 01/10] return data class instead of dictionary --- splink/cluster_metrics.py | 8 ++++++++ splink/linker.py | 24 ++++++++++++------------ 2 files changed, 20 insertions(+), 12 deletions(-) diff --git a/splink/cluster_metrics.py b/splink/cluster_metrics.py index 21ef060900..4d6ec6bb41 100644 --- a/splink/cluster_metrics.py +++ b/splink/cluster_metrics.py @@ -1,3 +1,4 @@ +from dataclasses import dataclass from typing import Dict, List from splink.splink_dataframe import SplinkDataFrame @@ -137,3 +138,10 @@ def _size_density_centralisation_sql( sqls.append(sql) return sqls + + +@dataclass +class GraphMetricsResults: + nodes: SplinkDataFrame + edges: SplinkDataFrame + clusters: SplinkDataFrame diff --git a/splink/linker.py b/splink/linker.py index fd4ee3348c..cec1ef5dc3 100644 --- a/splink/linker.py +++ b/splink/linker.py @@ -59,6 +59,7 @@ from .cluster_metrics import ( _node_degree_sql, _size_density_centralisation_sql, + GraphMetricsResults, ) from .cluster_studio import render_splink_cluster_studio_html from .comparison import Comparison @@ -2224,10 +2225,10 @@ def _compute_graph_metrics( df_predict: SplinkDataFrame, df_clustered: SplinkDataFrame, threshold_match_probability: float, - ) -> Dict[str, SplinkDataFrame]: + ) -> GraphMetricsResults: """ - Generates tables containing graph metrics (for nodes, edges, and clusters), - and returns a dictionary of Splink dataframes + Generates tables containing graph metrics (for nodes, edges and clusters), + and returns a data class of Splink dataframes Args: df_predict (SplinkDataFrame): The results of `linker.predict()` @@ -2238,11 +2239,11 @@ def _compute_graph_metrics( above this threshold. 
Returns: - dict[str, SplinkDataFrame]: A dictionary of SplinkDataFrames - containing cluster IDs and selected cluster, node, or edge metrics - key "nodes" for nodes metrics table - key "edges" for edge metrics table - key "clusters" for cluster metrics table + GraphMetricsResults: A data class containing SplinkDataFrames + of cluster IDs and selected node, edge or cluster metrics. + attribute "nodes" for nodes metrics table + attribute "edges" for edge metrics table + attribute "clusters" for cluster metrics table """ df_node_metrics = self._compute_metrics_nodes( @@ -2251,10 +2252,9 @@ def _compute_graph_metrics( # don't need edges as information is baked into node metrics df_cluster_metrics = self._compute_metrics_clusters(df_node_metrics) - return { - "nodes": df_node_metrics, - "clusters": df_cluster_metrics, - } + return GraphMetricsResults( + nodes=df_node_metrics, edges=None, clusters=df_cluster_metrics + ) def profile_columns( self, column_expressions: str | list[str] = None, top_n=10, bottom_n=10 From 0bee649fd7505a6be6c6e8d562aba6c12c76afc5 Mon Sep 17 00:00:00 2001 From: zslade Date: Thu, 25 Jan 2024 14:37:27 +0000 Subject: [PATCH 02/10] add repr --- splink/cluster_metrics.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/splink/cluster_metrics.py b/splink/cluster_metrics.py index 4d6ec6bb41..074f17e2c4 100644 --- a/splink/cluster_metrics.py +++ b/splink/cluster_metrics.py @@ -145,3 +145,13 @@ class GraphMetricsResults: nodes: SplinkDataFrame edges: SplinkDataFrame clusters: SplinkDataFrame + + def __repr__(self): + return f""" +A data class of Splink dataframes containing metrics for nodes, edges and clusters. 
+ +Access dataframes via attributes: +`compute_graph_metrics.nodes` for node metrics, +`compute_graph_metrics.edges` for edge metrics and +`compute_graph_metrics.clusters` for cluster metrics +""" From 3bdffeda8001c9bcae598bcf0c4bef3bda3fbc3c Mon Sep 17 00:00:00 2001 From: zslade Date: Thu, 25 Jan 2024 14:47:00 +0000 Subject: [PATCH 03/10] impose keyword args --- splink/linker.py | 1 + 1 file changed, 1 insertion(+) diff --git a/splink/linker.py b/splink/linker.py index cec1ef5dc3..c2f91fce1a 100644 --- a/splink/linker.py +++ b/splink/linker.py @@ -2224,6 +2224,7 @@ def _compute_graph_metrics( self, df_predict: SplinkDataFrame, df_clustered: SplinkDataFrame, + *, threshold_match_probability: float, ) -> GraphMetricsResults: """ From fbdeb6f56836e5b4089c2d50bc78fe72c87ea7dc Mon Sep 17 00:00:00 2001 From: zslade Date: Thu, 25 Jan 2024 14:49:14 +0000 Subject: [PATCH 04/10] lint --- splink/cluster_metrics.py | 2 +- splink/linker.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/splink/cluster_metrics.py b/splink/cluster_metrics.py index 074f17e2c4..d984c7fbae 100644 --- a/splink/cluster_metrics.py +++ b/splink/cluster_metrics.py @@ -147,7 +147,7 @@ class GraphMetricsResults: clusters: SplinkDataFrame def __repr__(self): - return f""" + return """ A data class of Splink dataframes containing metrics for nodes, edges and clusters. 
Access dataframes via attributes: diff --git a/splink/linker.py b/splink/linker.py index c2f91fce1a..e481f44271 100644 --- a/splink/linker.py +++ b/splink/linker.py @@ -10,7 +10,6 @@ from copy import copy, deepcopy from pathlib import Path from statistics import median -from typing import Dict import sqlglot @@ -57,9 +56,9 @@ waterfall_chart, ) from .cluster_metrics import ( + GraphMetricsResults, _node_degree_sql, _size_density_centralisation_sql, - GraphMetricsResults, ) from .cluster_studio import render_splink_cluster_studio_html from .comparison import Comparison From ac5f417543f6cb43082121c7c1521a527cf47456 Mon Sep 17 00:00:00 2001 From: zslade Date: Thu, 25 Jan 2024 14:57:36 +0000 Subject: [PATCH 05/10] add description for unpacking --- splink/cluster_metrics.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/splink/cluster_metrics.py b/splink/cluster_metrics.py index d984c7fbae..925c5cae86 100644 --- a/splink/cluster_metrics.py +++ b/splink/cluster_metrics.py @@ -154,4 +154,11 @@ def __repr__(self): `compute_graph_metrics.nodes` for node metrics, `compute_graph_metrics.edges` for edge metrics and `compute_graph_metrics.clusters` for cluster metrics + +or equivalently unpack like so: +```node_metrics, edge_metrics, cluster_metrics = ( + df_graph_metrics.nodes, + df_graph_metrics.edges, + df_graph_metrics.clusters, +)``` """ From e043d3e44465ec93040f780d042ba66255b72fa4 Mon Sep 17 00:00:00 2001 From: zslade Date: Thu, 25 Jan 2024 15:14:53 +0000 Subject: [PATCH 06/10] update tests --- tests/test_cluster_metrics.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test_cluster_metrics.py b/tests/test_cluster_metrics.py index b210ea4c23..4d5372c4bb 100644 --- a/tests/test_cluster_metrics.py +++ b/tests/test_cluster_metrics.py @@ -41,7 +41,7 @@ def test_size_density_dedupe(): df_result = linker._compute_graph_metrics( df_predict, df_clustered, threshold_match_probability=0.9 - )["clusters"].as_pandas_dataframe() + 
).clusters.as_pandas_dataframe() # not testing this here - it's not relevant for small clusters anyhow del df_result["cluster_centralisation"] @@ -74,8 +74,8 @@ def test_size_density_link(): df_result = ( linker._compute_graph_metrics( df_predict, df_clustered, threshold_match_probability=0.99 - )["clusters"] - .as_pandas_dataframe() + ) + .clusters.as_pandas_dataframe() .sort_values(by="cluster_id") .reset_index(drop=True) ) @@ -226,7 +226,7 @@ def test_metrics(dialect, test_helpers): df_clustered = linker.register_table(helper.convert_frame(df_c), "clusters") cm = linker._compute_graph_metrics(df_predict, df_clustered, 0.95) - df_cm = cm["clusters"].as_pandas_dataframe() + df_cm = cm.clusters.as_pandas_dataframe() expected = [ {"cluster_id": 1, "n_nodes": 4, "n_edges": 4, "cluster_centralisation": 4 / 6}, From 11b22deb2edbebd98a7a97b242dbab4a75f385aa Mon Sep 17 00:00:00 2001 From: zslade Date: Thu, 25 Jan 2024 15:23:51 +0000 Subject: [PATCH 07/10] fix keyword arg in tests --- tests/test_cluster_metrics.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/test_cluster_metrics.py b/tests/test_cluster_metrics.py index 4d5372c4bb..0cb8a1b0a5 100644 --- a/tests/test_cluster_metrics.py +++ b/tests/test_cluster_metrics.py @@ -225,7 +225,9 @@ def test_metrics(dialect, test_helpers): df_predict = linker.register_table(helper.convert_frame(df_e), "predict") df_clustered = linker.register_table(helper.convert_frame(df_c), "clusters") - cm = linker._compute_graph_metrics(df_predict, df_clustered, 0.95) + cm = linker._compute_graph_metrics( + df_predict, df_clustered, threshold_match_probability=0.95 + ) df_cm = cm.clusters.as_pandas_dataframe() expected = [ From 979723a37f130c7977e507145e786d421f310d1d Mon Sep 17 00:00:00 2001 From: zslade Date: Thu, 25 Jan 2024 15:47:30 +0000 Subject: [PATCH 08/10] fix test --- tests/test_cluster_metrics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_cluster_metrics.py 
b/tests/test_cluster_metrics.py index 0cb8a1b0a5..b52e20b5bc 100644 --- a/tests/test_cluster_metrics.py +++ b/tests/test_cluster_metrics.py @@ -262,7 +262,7 @@ def test_metrics(dialect, test_helpers): expected_row_details["cluster_centralisation"] ) - df_nm = cm["nodes"].as_pandas_dataframe() + df_nm = cm.nodes.as_pandas_dataframe() for unique_id, expected_node_degree in expected_node_degrees: relevant_row = df_nm[df_nm["composite_unique_id"] == unique_id] From bb5e581dbf20047a244df95eb494c2834fc7c5e6 Mon Sep 17 00:00:00 2001 From: zslade Date: Mon, 29 Jan 2024 15:02:48 +0000 Subject: [PATCH 09/10] improve repr --- splink/cluster_metrics.py | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/splink/cluster_metrics.py b/splink/cluster_metrics.py index 925c5cae86..97504cf363 100644 --- a/splink/cluster_metrics.py +++ b/splink/cluster_metrics.py @@ -147,18 +147,11 @@ class GraphMetricsResults: clusters: SplinkDataFrame def __repr__(self): - return """ -A data class of Splink dataframes containing metrics for nodes, edges and clusters. 
- -Access dataframes via attributes: -`compute_graph_metrics.nodes` for node metrics, -`compute_graph_metrics.edges` for edge metrics and -`compute_graph_metrics.clusters` for cluster metrics - -or equivalently unpack like so: -```node_metrics, edge_metrics, cluster_metrics = ( - df_graph_metrics.nodes, - df_graph_metrics.edges, - df_graph_metrics.clusters, -)``` -""" + msg = ( + "A data class of Splink dataframes containing metrics for nodes, edges and clusters.\n" + "\nAccess dataframes via attributes:\n" + "`compute_graph_metrics.nodes` for node metrics,\n" + "`compute_graph_metrics.edges` for edge metrics, and\n" + "`compute_graph_metrics.clusters` for cluster metrics\n" + ) + return msg From a0d91cd28aa7a754ea0cbebb75c123a2bec10d72 Mon Sep 17 00:00:00 2001 From: zslade Date: Mon, 29 Jan 2024 15:11:06 +0000 Subject: [PATCH 10/10] lint --- splink/cluster_metrics.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/splink/cluster_metrics.py b/splink/cluster_metrics.py index 97504cf363..c30b34246b 100644 --- a/splink/cluster_metrics.py +++ b/splink/cluster_metrics.py @@ -148,7 +148,8 @@ class GraphMetricsResults: def __repr__(self): msg = ( - "A data class of Splink dataframes containing metrics for nodes, edges and clusters.\n" + "A data class of Splink dataframes containing metrics for nodes, edges " + "and clusters.\n" "\nAccess dataframes via attributes:\n" "`compute_graph_metrics.nodes` for node metrics,\n" "`compute_graph_metrics.edges` for edge metrics, and\n"