From 05e3ee3ed6a26a387bc52389c4ba88986c30ed4f Mon Sep 17 00:00:00 2001 From: Alberto Cattaneo Date: Wed, 16 Oct 2024 10:47:12 +0000 Subject: [PATCH] add metapath unit test --- src/kg_topology_toolbox/topology_toolbox.py | 17 ++++++++++------- tests/test_edge_topology_toolbox.py | 21 ++++++++++++++++----- tests/test_node_topology_toolbox.py | 5 +---- tests/test_relation_topology_toolbox.py | 5 +---- 4 files changed, 28 insertions(+), 20 deletions(-) diff --git a/src/kg_topology_toolbox/topology_toolbox.py b/src/kg_topology_toolbox/topology_toolbox.py index 5def422..faec2a9 100644 --- a/src/kg_topology_toolbox/topology_toolbox.py +++ b/src/kg_topology_toolbox/topology_toolbox.py @@ -278,9 +278,9 @@ def edge_metapath_count( composition_workers: int = min(32, mp.cpu_count() - 1 or 1), ) -> pd.DataFrame: """ - For each edge in the KG, compute the number of triangles of different - metapaths (i.e., the unique tuples (r1, r2) of relation types - of the two additional edges of the triangle). + For each edge in the KG, compute the number of triangles supported on it + distinguishing between different metapaths (i.e., the unique tuples (r1, r2) + of relation types of the two additional edges of the triangle). :param filter_relations: If not empty, compute the output only for the edges with relation @@ -293,10 +293,11 @@ def edge_metapath_count( on number of available threads (max: 32). :return: - The output dataframe has one row for each (h, t, r1, r2) such that - there exists at least one triangle of metapath (r1, r2) over (any) edge - connecting h, t. + The output dataframe has one row for each (h, r, t, r1, r2) such that + there exists at least one triangle of metapath (r1, r2) over (h, r, t). The number of metapath triangles is given in the column **n_triangles**. + The column **index** provides the index of the edge (h, r, t) in the + original Knowledge Graph dataframe. """ # discard loops as edges of a triangle df_wo_loops = self.df[self.df.h != self.df.t] @@ -313,7 +314,7 @@ def edge_metapath_count( rel_df = self.df df_triangles = df_wo_loops - return composition_count( + counts = composition_count( df_triangles, chunk_size=composition_chunk_size, workers=composition_workers, @@ -321,6 +322,8 @@ def edge_metapath_count( directed=True, ) + return rel_df.reset_index().merge(counts, on=["h", "t"], how="inner") + def edge_degree_cardinality_summary( self, filter_relations: list[int] = [], aggregate_by_r: bool = False ) -> pd.DataFrame: diff --git a/tests/test_edge_topology_toolbox.py b/tests/test_edge_topology_toolbox.py index 6e0b6be..49bfa5e 100644 --- a/tests/test_edge_topology_toolbox.py +++ b/tests/test_edge_topology_toolbox.py @@ -22,12 +22,19 @@ ) -@pytest.mark.parametrize("return_metapath_list", [True, False]) -def test_small_graph_metrics(return_metapath_list: bool) -> None: - # Define a small graph with all the features tested by - # the edge_topology_toolbox +def test_edge_metapath_count() -> None: + res = kgtt.edge_metapath_count() + assert np.allclose(res["index"], [2, 2]) + assert np.allclose(res["h"], [0, 0]) + assert np.allclose(res["r"], [0, 0]) + assert np.allclose(res["t"], [2, 2]) + assert np.allclose(res["r1"], [0, 1]) + assert np.allclose(res["r2"], [1, 1]) + assert np.allclose(res["n_triangles"], [1, 1]) + - # entity degrees statistics +def test_edge_degree_cardinality_summary() -> None: + # edge degrees statistics res = kgtt.edge_degree_cardinality_summary() assert np.allclose(res["h_unique_rel"], [2, 2, 2, 1, 2, 2, 1, 2]) assert np.allclose(res["h_degree"], [3, 3, 3, 2, 3, 3, 2, 3]) @@ -60,6 +67,9 @@ def test_small_graph_metrics(return_metapath_list: bool) -> None: "M:M", ] + +@pytest.mark.parametrize("return_metapath_list", [True, False]) +def test_edge_pattern_summary(return_metapath_list: bool) -> None: # relation pattern symmetry res = kgtt.edge_pattern_summary(return_metapath_list=return_metapath_list) assert np.allclose( @@ -92,6 +102,7 @@ def test_small_graph_metrics(return_metapath_list: bool) -> None: def test_filter_relations() -> None: for rels in [[0], [1], [0, 1]]: for method in [ + kgtt.edge_metapath_count, kgtt.edge_degree_cardinality_summary, partial(kgtt.edge_pattern_summary, return_metapath_list=True), ]: diff --git a/tests/test_node_topology_toolbox.py b/tests/test_node_topology_toolbox.py index 18d87ed..d002b41 100644 --- a/tests/test_node_topology_toolbox.py +++ b/tests/test_node_topology_toolbox.py @@ -19,10 +19,7 @@ @pytest.mark.parametrize("return_relation_list", [True, False]) -def test_small_graph_metrics(return_relation_list: bool) -> None: - # Define a small graph with all the features tested by - # the node_topology_toolbox - +def test_node_degree_summary(return_relation_list: bool) -> None: # entity degrees statistics res = kgtt.node_degree_summary(return_relation_list=return_relation_list) assert np.allclose(res["h_degree"], [3, 1, 3]) diff --git a/tests/test_relation_topology_toolbox.py b/tests/test_relation_topology_toolbox.py index 3f0c05c..e527a5f 100644 --- a/tests/test_relation_topology_toolbox.py +++ b/tests/test_relation_topology_toolbox.py @@ -20,10 +20,7 @@ kgtt = KGTopologyToolbox(df, head_column="H", relation_column="R", tail_column="T") -def test_small_graph_metrics() -> None: - # Define a small graph on five nodes with all the features tested by - # the relation_topology_toolbox - +def test_aggregate_by_r() -> None: dcs = kgtt.edge_degree_cardinality_summary(aggregate_by_r=True) eps = kgtt.edge_pattern_summary(return_metapath_list=True, aggregate_by_r=True)