From acdcf42eeab8975454557dfdf4e986046685b3e6 Mon Sep 17 00:00:00 2001 From: Alberto Cattaneo <84471416+AlCatt91@users.noreply.github.com> Date: Fri, 8 Nov 2024 14:54:17 +0000 Subject: [PATCH] Relation filtering for edge methods + sparse metapath counting (#12) * filter relations for edge card; cap mp workers based on cores * add relation filter to edge methods * ci fix * metapath tweaks * refactor metapath counting with sparse matmuls * docstring update * use np.divmod * avoid repeated work * add metapath unit test * reduce memory usage of metapath counting * improve docstrings * document new functionalities in the doc notebook * tidy up redundant code * fix typo --------- Co-authored-by: Daniel Justus --- docs/source/notebooks/ogb_biokg_demo.ipynb | 316 +++++++++++++++++++- src/kg_topology_toolbox/topology_toolbox.py | 228 ++++++++++---- src/kg_topology_toolbox/utils.py | 152 ++++++++-- tests/test_edge_topology_toolbox.py | 49 ++- tests/test_node_topology_toolbox.py | 5 +- tests/test_relation_topology_toolbox.py | 5 +- 6 files changed, 647 insertions(+), 108 deletions(-) diff --git a/docs/source/notebooks/ogb_biokg_demo.ipynb b/docs/source/notebooks/ogb_biokg_demo.ipynb index bb73448..dc13ff5 100644 --- a/docs/source/notebooks/ogb_biokg_demo.ipynb +++ b/docs/source/notebooks/ogb_biokg_demo.ipynb @@ -22,9 +22,9 @@ "name": "stdout", "output_type": "stream", "text": [ - "Found existing installation: kg-topology-toolbox 0.1.0\n", - "Uninstalling kg-topology-toolbox-0.1.0:\n", - " Successfully uninstalled kg-topology-toolbox-0.1.0\n" + "Found existing installation: kg-topology-toolbox 1.0.0\n", + "Uninstalling kg-topology-toolbox-1.0.0:\n", + " Successfully uninstalled kg-topology-toolbox-1.0.0\n" ] } ], @@ -37,7 +37,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -63,7 +63,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "metadata": {}, "outputs": [ { @@ -181,7 +181,7 @@ "[5088434 rows x 3 columns]" ] }, - "execution_count": 3, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } @@ -209,14 +209,14 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "/nethome/albertoc/research/knowledge_graphs/kg-topology-toolbox/.venv/lib/python3.10/site-packages/kg_topology_toolbox/topology_toolbox.py:64: UserWarning: The Knowledge Graph contains duplicated edges -- some functionalities may produce incorrect results\n", + "/nethome/albertoc/research/knowledge_graphs/kg-topology-toolbox/.venv/lib/python3.10/site-packages/kg_topology_toolbox/utils.py:42: UserWarning: The Knowledge Graph contains duplicated edges -- some functionalities may produce incorrect results\n", " warnings.warn(\n" ] } @@ -232,13 +232,77 @@ "Notice the warning raised by the constructor, which detects duplicated edges in the `biokg_df` DataFrame: to ensure optimal functionalities, duplicated edges should be removed before instantiating the `KGTopologyToolbox` class." ] }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
hrt
38544071972451972
40005341972451972
\n", + "
" + ], + "text/plain": [ + " h r t\n", + "3854407 1972 45 1972\n", + "4000534 1972 45 1972" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# find duplicated edges\n", + "biokg_df.loc[biokg_df.duplicated(keep=False)]" + ] + }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Node-level analysis\n", "\n", - "The method `node_degree_summary` provides a summary of the degrees of each individual node in the knowledge graph. The returned dataframe is indexed on the node ID.\n", + "The method `node_degree_summary` provides a summary of the degrees of each individual node in the knowledge graph. The returned DataFrame is indexed on the node ID.\n", "\n", "- `h_degree` is the number of edges coming out from the node;\n", "- `t_degree` is the number of edges going into the node;\n", @@ -894,7 +958,7 @@ "\n", "![image info](../images/edge_patterns.png)\n", "\n", - "For inverse/inference, the method also provides the number and types of unique relations `r'` realizing the counterpart edges; for composition, the number of triangles supported by the edge is provided (the unique metapaths `[r_1, r_2]` can also be listed by setting `return_metapath_list=True` when calling the method)." + "For inverse/inference, the method also provides the number and types of unique relations `r'` realizing the counterpart edges; for composition, the number of triangles supported by the edge is provided." ] }, { @@ -1210,6 +1274,15 @@ "edge_eps" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If we need to identify the different metapaths `[r_1, r_2]` that give triangles `(h,r1,x) - (x,r2,t)` over an edge `(h,r,t)`, we can do so by setting `return_metapath_list=True` in the call of `edge_pattern_summary`. In order to disaggregate the total number of triangles over an edge into separate counts for each existing metapath, the `edge_metapath_count` method should be used instead. \n", + "\n", + "We can now easily produce a global view of the distribution of topological properties." + ] + }, { "cell_type": "code", "execution_count": 12, @@ -1277,6 +1350,225 @@ "plt.tight_layout()" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Filtering relation types\n", + "\n", + "The edge-level methods presented in the previous section simultaneously compute statistics for all edges in the KG, and this can be expensive on larger graphs. Moreover, in many practical cases the user might be interested in looking only at the properties of edges of one or few specific relation types.\n", + "\n", + "The methods `edge_degree_cardinality_summary`, `edge_pattern_summary` and `edge_metapath_count` can be passed a list of relation type IDs to restrict computations of their outputs to edges of those specific relation types." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
indexhrtr1r2n_triangles
03343827327122541210
133438273271225392123
233438273271225382200
33343827327122537227
4334382732712253626
........................
7321494953327152924249213412
7321504953327152924249211412
732151495332715292424926412
732152495332715292424924411
732153495332715292424922412
\n", + "

732154 rows × 7 columns

\n", + "
" + ], + "text/plain": [ + " index h r t r1 r2 n_triangles\n", + "0 334382 732 7 1225 41 2 10\n", + "1 334382 732 7 1225 39 2 123\n", + "2 334382 732 7 1225 38 2 200\n", + "3 334382 732 7 1225 37 2 27\n", + "4 334382 732 7 1225 36 2 6\n", + "... ... ... .. ... .. .. ...\n", + "732149 4953327 1529 24 2492 13 41 2\n", + "732150 4953327 1529 24 2492 11 41 2\n", + "732151 4953327 1529 24 2492 6 41 2\n", + "732152 4953327 1529 24 2492 4 41 1\n", + "732153 4953327 1529 24 2492 2 41 2\n", + "\n", + "[732154 rows x 7 columns]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "filtered_metapath_counts = kgtt.edge_metapath_count(filter_relations=[7, 24])\n", + "filtered_metapath_counts" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The previous cell computes the number of triangles of each existing `(r1, r2)` metapath, but only over `(h,r,t)` edges of the two relation types with ID 7 and 24 (the column `index` gives the index of the edge in the `biokkg_df` DataFrame). This is the same as calling `kgtt.edge_metapath_count().query('r==7 or r==24')`, but the computation is much cheaper and faster." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "r\n", + "24 413366\n", + "7 318788\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "filtered_metapath_counts.r.value_counts()" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -2267,7 +2559,7 @@ ], "metadata": { "kernelspec": { - "display_name": ".venv38", + "display_name": ".venv", "language": "python", "name": "python3" }, @@ -2281,7 +2573,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.14" + "version": "3.10.12" } }, "nbformat": 4, diff --git a/src/kg_topology_toolbox/topology_toolbox.py b/src/kg_topology_toolbox/topology_toolbox.py index c3d6f5b..5ae642e 100644 --- a/src/kg_topology_toolbox/topology_toolbox.py +++ b/src/kg_topology_toolbox/topology_toolbox.py @@ -5,6 +5,7 @@ Topology toolbox main functionalities """ +import multiprocessing as mp from functools import cache import numpy as np @@ -257,8 +258,6 @@ def edge_cardinality(self) -> pd.DataFrame: # check if the values in the pair (h_degree, t_degree) are =1 or >1 # to determine the edge cardinality for suffix in ["", "_same_rel"]: - # check if the values in the pair (h_degree, t_degree) are =1 or >1 - # to determine the edge cardinality edge_type = 2 * (head_degree["h_degree" + suffix] == 1) + ( tail_degree["t_degree" + suffix] == 1 ) @@ -270,8 +269,65 @@ def edge_cardinality(self) -> pd.DataFrame: ).astype(str) return df_res + def edge_metapath_count( + self, + filter_relations: list[int] = [], + composition_chunk_size: int = 2**8, + composition_workers: int = min(32, mp.cpu_count() - 1 or 1), + ) -> pd.DataFrame: + """ + For each edge in the KG, compute the number of triangles supported on it + distinguishing between different metapaths (i.e., the unique ordered tuples + (r1, r2) of relation types of the two additional edges of the triangle). + + :param filter_relations: + If not empty, compute the output only for the edges with relation + in this list of relation IDs. + :param composition_chunk_size: + Size of column chunks of sparse adjacency matrix + to compute the triangle count. Reduce the parameter if running OOM. + Default: 2**8. 
+ :param composition_workers: + Number of workers to compute the triangle count. By default, assigned based + on number of available threads (max: 32). + + :return: + The output dataframe has one row for each (h, r, t, r1, r2) such that + there exists at least one triangle of metapath (r1, r2) over (h, r, t). + The number of metapath triangles is given in the column **n_triangles**. + The column **index** provides the index of the edge (h, r, t) in the + original Knowledge Graph dataframe. + """ + # discard loops as edges of a triangle + df_wo_loops = self.df[self.df.h != self.df.t] + if len(filter_relations) > 0: + rel_df = self.df[self.df.r.isin(filter_relations)] + # unique heads and tails used by filtered edges + filter_heads = rel_df.h.unique() + filter_tails = rel_df.t.unique() + # the only relevant edges for triangles are the ones with head in the + # set of filtered heads, or tail in the set of filtered tails + df_triangles = df_wo_loops[ + np.logical_or( + df_wo_loops.h.isin(filter_heads), df_wo_loops.t.isin(filter_tails) + ) + ] + else: + rel_df = self.df + df_triangles = df_wo_loops + + counts = composition_count( + df_triangles, + chunk_size=composition_chunk_size, + workers=composition_workers, + metapaths=True, + directed=True, + ) + + return rel_df.reset_index().merge(counts, on=["h", "t"], how="inner") + def edge_degree_cardinality_summary( - self, aggregate_by_r: bool = False + self, filter_relations: list[int] = [], aggregate_by_r: bool = False ) -> pd.DataFrame: """ For each edge in the KG, compute the number of edges with the same head @@ -285,6 +341,9 @@ def edge_degree_cardinality_summary( The output dataframe maintains the same indexing and ordering of triples as the original Knowledge Graph dataframe. + :param filter_relations: + If not empty, compute the output only for the edges with relation + in this list of relation IDs. :param aggregate_by_r: If True, return metrics aggregated by relation type (the output DataFrame will be indexed over relation IDs). @@ -318,6 +377,8 @@ def edge_degree_cardinality_summary( ], axis=1, ) + if len(filter_relations) > 0: + df_res = df_res[df_res.r.isin(filter_relations)] # compute number of parallel edges to avoid double-counting them # in total degree num_parallel = df_res.merge( @@ -326,9 +387,9 @@ def edge_degree_cardinality_summary( how="left", ) df_res["tot_degree"] = ( - df_res.h_degree + df_res.t_degree - num_parallel.n_parallel + df_res.h_degree + df_res.t_degree - num_parallel.n_parallel.values ) - # when restricting to the relation type, there is only one edge + # when restricting to the same relation type, there is only one edge # (the edge itself) that is double-counted df_res["tot_degree_same_rel"] = ( df_res.h_degree_same_rel + df_res.t_degree_same_rel - 1 @@ -344,9 +405,10 @@ def edge_degree_cardinality_summary( def edge_pattern_summary( self, return_metapath_list: bool = False, - composition_chunk_size: int = 2**8, - composition_workers: int = 32, + filter_relations: list[int] = [], aggregate_by_r: bool = False, + composition_chunk_size: int = 2**8, + composition_workers: int = min(32, mp.cpu_count() - 1 or 1), ) -> pd.DataFrame: """ Analyse structural properties of each edge in the KG: @@ -358,15 +420,20 @@ def edge_pattern_summary( :param return_metapath_list: If True, return the list of unique metapaths for all - triangles supported over one edge. WARNING: very expensive for large graphs. - :param composition_chunk_size: - Size of column chunks of sparse adjacency matrix - to compute the triangle count. 
- :param composition_workers: - Number of workers to compute the triangle count. + triangles supported over each edge. WARNING: very expensive for large graphs. + :param filter_relations: + If not empty, compute the output only for the edges with relation + in this list of relation IDs. :param aggregate_by_r: If True, return metrics aggregated by relation type (the output DataFrame will be indexed over relation IDs). + :param composition_chunk_size: + Size of column chunks of sparse adjacency matrix + to compute the triangle count. Reduce the parameter if running OOM. + Default: 2**8. + :param composition_workers: + Number of workers to compute the triangle count. By default, assigned based + on number of available threads (max: 32). :return: The results dataframe. Contains the following columns @@ -395,29 +462,67 @@ def edge_pattern_summary( - **metapath_list** (list): The list of unique metapaths "r1-r2" for the directed triangles. """ + + # discard loops as edges of a triangle + df_wo_loops = self.df[self.df.h != self.df.t] + if len(filter_relations) > 0: + rel_df = self.df[self.df.r.isin(filter_relations)] + # unique heads and tails used by filtered edges + filter_heads = rel_df.h.unique() + filter_tails = rel_df.t.unique() + filter_entities = np.union1d(filter_heads, filter_tails) + # restrict relevant edges to count inference/inverse patterns + inference_df = self.df[ + np.logical_and( + self.df.h.isin(filter_heads), self.df.t.isin(filter_tails) + ) + ] + inverse_df = self.df[ + np.logical_and( + self.df.h.isin(filter_tails), self.df.t.isin(filter_heads) + ) + ] + # the only relevant edges for triangles are the ones with head in the + # set of filtered heads, or tail in the set of filtered tails + df_triangles = df_wo_loops[ + np.logical_or( + df_wo_loops.h.isin(filter_heads), df_wo_loops.t.isin(filter_tails) + ) + ] + # for undirected triangles, heads and tails can be any of the + # filtered entities + df_triangles_und = df_wo_loops[ + np.logical_or( + df_wo_loops.h.isin(filter_entities), + df_wo_loops.t.isin(filter_entities), + ) + ] + else: + rel_df = inference_df = inverse_df = self.df + df_triangles = df_triangles_und = df_wo_loops + df_res = pd.DataFrame( + {"h": rel_df.h, "r": rel_df.r, "t": rel_df.t, "is_symmetric": False} + ) # symmetry-asymmetry # edges with h/t switched - df_inv = self.df.reindex(columns=["t", "r", "h"]).rename( + df_inv = inverse_df.reindex(columns=["t", "r", "h"]).rename( columns={"t": "h", "r": "r", "h": "t"} ) - df_res = pd.DataFrame( - {"h": self.df.h, "r": self.df.r, "t": self.df.t, "is_symmetric": False} - ) df_res.loc[ - self.df.reset_index().merge(df_inv)["index"], + df_res.reset_index().merge(df_inv)["index"], "is_symmetric", ] = True # loops are treated separately df_res["is_loop"] = df_res.h == df_res.t df_res.loc[df_res.h == df_res.t, "is_symmetric"] = False + df_res = df_res.reset_index() + # inverse unique_inv_r_by_ht = df_inv.groupby(["h", "t"], as_index=False).agg( inverse_edge_types=("r", list), ) - df_res = df_res.merge( - unique_inv_r_by_ht, left_on=["h", "t"], right_on=["h", "t"], how="left" - ) + df_res = df_res.merge(unique_inv_r_by_ht, on=["h", "t"], how="left") df_res["inverse_edge_types"] = df_res["inverse_edge_types"].apply( lambda agg: agg if isinstance(agg, list) else [] ) @@ -432,65 +537,64 @@ def edge_pattern_summary( df_res["has_inverse"] = df_res["n_inverse_relations"] > 0 # inference - edges_between_ht = unique_inv_r_by_ht.reindex( - columns=["t", "h", "inverse_edge_types"] - ).rename( - columns={"t": "h", "h": "t", 
"inverse_edge_types": "inference_edge_types"} - ) - df_res = df_res.merge( - edges_between_ht, left_on=["h", "t"], right_on=["h", "t"], how="left" - ) + if len(filter_relations) > 0: + edges_between_ht = inference_df.groupby(["h", "t"], as_index=False).agg( + inference_edge_types=("r", list), + ) + else: + edges_between_ht = unique_inv_r_by_ht.reindex( + columns=["t", "h", "inverse_edge_types"] + ).rename( + columns={ + "t": "h", + "h": "t", + "inverse_edge_types": "inference_edge_types", + } + ) + df_res = df_res.merge(edges_between_ht, on=["h", "t"], how="left") # inference_edge_types always contains the edge itself, which we need to drop df_res["n_inference_relations"] = df_res.inference_edge_types.str.len() - 1 df_res["has_inference"] = df_res["n_inference_relations"] > 0 # composition & metapaths - # discard loops as edges of a triangle - df_wo_loops = self.df[self.df.h != self.df.t] + counts = composition_count( + df_triangles, + chunk_size=composition_chunk_size, + workers=composition_workers, + metapaths=return_metapath_list, + directed=True, + ) if return_metapath_list: - # 2-hop paths - df_bridges = df_wo_loops.merge( - df_wo_loops, left_on="t", right_on="h", how="inner" - ) - df_triangles = df_wo_loops.merge( - df_bridges, left_on=["h", "t"], right_on=["h_x", "t_y"], how="inner" - ) - df_triangles["metapath"] = ( - df_triangles["r_x"].astype(str) + "-" + df_triangles["r_y"].astype(str) + # turn (r1, r2) into "r1-r2" string for metapaths + counts["metapath"] = ( + counts["r1"].astype(str) + "-" + counts["r2"].astype(str) ) - grouped_triangles = df_triangles.groupby( - ["h", "r", "t"], as_index=False - ).agg( - n_triangles=("metapath", "count"), metapath_list=("metapath", "unique") + # count triangles (summing over all metapaths between two nodes) + # and list unique metapaths for each head and tail node pair + grouped_triangles = counts.groupby(["h", "t"], as_index=False).agg( + n_triangles=("n_triangles", "sum"), metapath_list=("metapath", list) ) df_res = df_res.merge( grouped_triangles, - left_on=["h", "r", "t"], - right_on=["h", "r", "t"], + on=["h", "t"], how="left", ) + # if no triangles are present over an edge, set metapath list to [] df_res["metapath_list"] = df_res["metapath_list"].apply( - lambda agg: agg.tolist() if isinstance(agg, np.ndarray) else [] + lambda agg: agg if isinstance(agg, list) else [] ) - df_res["n_triangles"] = df_res["n_triangles"].fillna(0).astype(int) else: - counts = composition_count( - df_wo_loops, - chunk_size=composition_chunk_size, - workers=composition_workers, - directed=True, - ) df_res = df_res.merge( counts, on=["h", "t"], how="left", ) - df_res["n_triangles"] = df_res["n_triangles"].fillna(0).astype(int) - + df_res["n_triangles"] = df_res["n_triangles"].fillna(0).astype(int) df_res["has_composition"] = df_res["n_triangles"] > 0 + # undirected composition counts = composition_count( - df_wo_loops, + df_triangles_und, chunk_size=composition_chunk_size, workers=composition_workers, directed=False, @@ -505,7 +609,7 @@ def edge_pattern_summary( ) df_res["has_undirected_composition"] = df_res["n_undirected_triangles"] > 0 - df_res = df_res[ + df_res = df_res.set_index("index")[ [ "h", "r", @@ -525,6 +629,7 @@ def edge_pattern_summary( ] + (["metapath_list"] if return_metapath_list else []) ] + df_res.index.name = None return aggregate_by_relation(df_res) if aggregate_by_r else df_res @@ -558,6 +663,7 @@ def jaccard_similarity_relation_sets(self) -> pd.DataFrame: - **jaccard_both** (float): Jaccard similarity between the full entity set of r1 
and r2. """ + # set of unique heads/tails/any for each relation ent_unique = self.df.groupby("r", as_index=False).agg( num_triples=("r", "count"), head=("h", "unique"), tail=("t", "unique") ) @@ -574,6 +680,7 @@ def jaccard_similarity_relation_sets(self) -> pd.DataFrame: suffixes=["_r1", "_r2"], how="cross", ) + # order doesn't matter df_res = df_res[df_res.r1 < df_res.r2] df_res["num_triples_both"] = df_res["num_triples_r1"] + df_res["num_triples_r2"] @@ -631,7 +738,7 @@ def relational_affinity_ingram(self, min_max_norm: bool = False) -> pd.DataFrame returned dataframe. :param min_max_norm: - min-max normalization of edge weights. Defaults to False. + min-max normalization of edge weights. Default: False. :return: The results dataframe. Contains the following columns: @@ -648,15 +755,18 @@ def relational_affinity_ingram(self, min_max_norm: bool = False) -> pd.DataFrame # normalize by global t frequency rt_freqs["h"] = rt_freqs["h"] / rt_freqs.groupby("t")["h"].transform("sum") + # sparse matrix of of (h,r) pair frequency E_h = coo_array( (hr_freqs.t, (hr_freqs.h, hr_freqs.r)), shape=[self.n_entity, self.n_rel], ) + # sparse matrix of of (t,r) pair frequency E_t = coo_array( (rt_freqs.h, (rt_freqs.t, rt_freqs.r)), shape=[self.n_entity, self.n_rel], ) + # adjacency matrix of relation graph A = (E_h.T @ E_h).toarray() + (E_t.T @ E_t).toarray() A[np.diag_indices_from(A)] = 0 diff --git a/src/kg_topology_toolbox/utils.py b/src/kg_topology_toolbox/utils.py index d3a3d55..5bc7ac7 100644 --- a/src/kg_topology_toolbox/utils.py +++ b/src/kg_topology_toolbox/utils.py @@ -133,6 +133,7 @@ def aggregate_by_relation(edge_topology_df: pd.DataFrame) -> pd.DataFrame: elif col_dtype == object: if isinstance(edge_topology_df[col].iloc[0], str): for label in np.unique(edge_topology_df[col]): + # fraction of rows for each label df_res[f"{col}_{label}_frac"] = ( edge_topology_df[edge_topology_df[col] == label] .groupby("r")[col] @@ -188,24 +189,63 @@ def jaccard_similarity( def _composition_count_worker( - adj_csr: csr_array, adj_csc: csc_array, tail_shift: int = 0 + adj_csr: csr_array, + adj_csc_slice: csc_array, + adj_mask_slice: csc_array, + slice_tail_shift: int, ) -> pd.DataFrame: - adj_2hop = adj_csr @ adj_csc - adj_composition = (adj_2hop.tocsc() * (adj_csc > 0)).tocoo() - df_composition = pd.DataFrame( - dict( - h=adj_composition.row, - t=adj_composition.col + tail_shift, - n_triangles=adj_composition.data, + """ + Masked sparse matmul to count triangles over graph edges. + + :param adj_csr: shape (n_nodes * n_rels, n_nodes) if distinguishing between + metapaths, (n_nodes, n_nodes) otherwise + :param adj_csc_slice: shape (n_nodes, chunk_size) + :param adj_mask_slice: shape (n_nodes, chunk_size) + :param slice_tail_shift: column shift of the vertical slice + + :return: + Pandas dataframe of triangle counts. 
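        Columns: **h**, **t**, **n_triangles** — plus **r1** and **r2** when the
        relation-aware (flattened) adjacency matrix is passed (n_rels > 1).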
+ """ + n_nodes = adj_csr.shape[1] + n_rels = adj_csr.shape[0] // n_nodes + # 2-hop count + adj_2hop = adj_csr @ adj_csc_slice + # mask out (h,t) pairs not connected by edges + adj_composition = (adj_2hop.tocsc() * adj_mask_slice).tocoo() + if n_rels > 1: + # distinguish between metapaths + # unflatten results + h, r1 = np.divmod(adj_composition.row, n_rels) + r2, t = np.divmod(adj_composition.col + slice_tail_shift, n_nodes) + df_composition = pd.DataFrame( + dict( + h=h, + t=t, + r1=r1, + r2=r2, + n_triangles=adj_composition.data, + ) + ) + else: + # don't distinguish between metapaths + df_composition = pd.DataFrame( + dict( + h=adj_composition.row, + t=adj_composition.col + slice_tail_shift, + n_triangles=adj_composition.data, + ) ) - ) return df_composition def composition_count( - df: pd.DataFrame, chunk_size: int, workers: int, directed: bool = True + df: pd.DataFrame, + chunk_size: int, + workers: int, + metapaths: bool = False, + directed: bool = True, ) -> pd.DataFrame: - """A helper function to compute the composition count of a graph. + """Compute composition count of a graph. :param df: A graph represented as a pd.DataFrame. Must contain the columns @@ -215,44 +255,112 @@ def composition_count( processed together. :param workers: Number of workers processing chunks concurrently + :param metapaths: + If True, the number of compositions is computed separately for each + unique metapath. :param directed: - Boolean flag. If false, bidirectional edges are considered for - triangles by adding the adjacency matrix and its transposed. Default: True. + If False, bidirectional edges are considered for + triangles, by adding the adjacency matrix and its transposed. Default: True. :return: The results dataframe. Contains the following columns: - **h** (int): Index of the head entity. - **t** (int): Index of the tail entity. - - **n_triangles** (int): Number of compositions for the (h, t) edge. + - **n_triangles** (int): Number of compositions for any edge between (h, t). 
""" n_nodes = df[["h", "t"]].max().max() + 1 + n_rels = df["r"].max() + 1 + # sparse graph adjacency matrix, counting number of edges between each pair of nodes adj = coo_array( (np.ones(len(df)), (df.h, df.t)), shape=[n_nodes, n_nodes], ).astype(np.uint16) - if not directed: - adj = adj + adj.T - n_cols = adj.shape[1] - adj_csr = adj.tocsr() - adj_csc = adj.tocsc() + + if metapaths: + if not directed: + raise NotImplementedError( + "Metapath counting only implemented for directed triangles" + ) + # relation-aware adjacency matrix, flattened to 2D for sparse implementation + # (adj_csr @ adj_csc).reshape(n_nodes, n_rels, n_rels, n_nodes)[h,r1,r2,t] counts + # the number of 2-hop paths of metapath (r1, r2) between h and t + adj_csr = csr_array( + (np.ones(len(df)), (df.h * n_rels + df.r, df.t)), + shape=[n_nodes * n_rels, n_nodes], + ).astype(np.uint16) + adj_csc = csc_array( + (np.ones(len(df)), (df.h, df.r * n_nodes + df.t)), + shape=[n_nodes, n_nodes * n_rels], + ).astype(np.uint16) + # boolean mask to filter results, keep only triangles over (h,t) pairs connected + # by at least one edge (equivalent to flattened adj[:,None,None,:] > 0) + msk = csc_array( + ( + [True] * (len(adj.data) * n_rels), + ( + (n_rels * adj.row + np.arange(n_rels)[:, None]).flatten(), + np.tile(adj.col, n_rels), + ), + ), + shape=[n_nodes * n_rels, n_nodes], + ) + else: + if not directed: + # add inverse edges for undirected compositions + adj = adj + adj.T + # (adj_csr @ adj_csc)[h,t] counts the number of 2-hop paths between h and t; + # the boolean mask here is simply adj_csc > 0 + adj_csr = adj.tocsr() + adj_csc = adj.tocsc() + + # to compute (adj_csr @ adj_csc) * msk, serialize over vertical slices of adj_csc + n_cols = adj_csc.shape[1] adj_csc_slices = { i: adj_csc[:, i * chunk_size : min((i + 1) * chunk_size, n_cols)] for i in range(int(np.ceil(n_cols / chunk_size))) } - if len(adj_csc_slices) > 1 and workers > 1: with Pool(workers) as pool: + # workers are assigned different adj_csc slices df_composition_list = pool.starmap( _composition_count_worker, ( - (adj_csr, adj_csc_slice, i * chunk_size) + ( + adj_csr, + adj_csc_slice, + ( + # relevant slice of boolean mask (with wraparound) + msk[ + :, + (i * chunk_size + np.arange(adj_csc_slice.shape[1])) + % msk.shape[1], + ] + if metapaths + else adj_csc_slice > 0 + ), + i * chunk_size, + ) for i, adj_csc_slice in adj_csc_slices.items() ), ) else: df_composition_list = [ - _composition_count_worker(adj_csr, adj_csc_slice, i * chunk_size) + _composition_count_worker( + adj_csr, + adj_csc_slice, + ( + # relevant slice of boolean mask (with wraparound) + msk[ + :, + (i * chunk_size + np.arange(adj_csc_slice.shape[1])) + % msk.shape[1], + ] + if metapaths + else adj_csc_slice > 0 + ), + i * chunk_size, + ) for i, adj_csc_slice in adj_csc_slices.items() ] diff --git a/tests/test_edge_topology_toolbox.py b/tests/test_edge_topology_toolbox.py index eaba81a..849bd5c 100644 --- a/tests/test_edge_topology_toolbox.py +++ b/tests/test_edge_topology_toolbox.py @@ -1,5 +1,7 @@ # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 
+from functools import partial + import numpy as np import pandas as pd import pytest @@ -20,12 +22,20 @@ ) -@pytest.mark.parametrize("return_metapath_list", [True, False]) -def test_small_graph_metrics(return_metapath_list: bool) -> None: - # Define a small graph with all the features tested by - # the edge_topology_toolbox +def test_edge_metapath_count() -> None: + res = kgtt.edge_metapath_count(composition_chunk_size=3) + assert np.allclose(res["index"], [2, 2]) + assert np.allclose(res["h"], [0, 0]) + assert np.allclose(res["r"], [0, 0]) + assert np.allclose(res["t"], [2, 2]) + assert set(zip(res["r1"].values.tolist(), res["r2"].values.tolist())) == set( + [(0, 1), (1, 1)] + ) + assert np.allclose(res["n_triangles"], [1, 1]) - # entity degrees statistics + +def test_edge_degree_cardinality_summary() -> None: + # edge degrees statistics res = kgtt.edge_degree_cardinality_summary() assert np.allclose(res["h_unique_rel"], [2, 2, 2, 1, 2, 2, 1, 2]) assert np.allclose(res["h_degree"], [3, 3, 3, 2, 3, 3, 2, 3]) @@ -58,8 +68,13 @@ def test_small_graph_metrics(return_metapath_list: bool) -> None: "M:M", ] + +@pytest.mark.parametrize("return_metapath_list", [True, False]) +def test_edge_pattern_summary(return_metapath_list: bool) -> None: # relation pattern symmetry - res = kgtt.edge_pattern_summary(return_metapath_list=return_metapath_list) + res = kgtt.edge_pattern_summary( + return_metapath_list=return_metapath_list, composition_chunk_size=3 + ) assert np.allclose( res["is_loop"], [False, False, False, False, False, False, True, True] ) @@ -84,4 +99,24 @@ def test_small_graph_metrics(return_metapath_list: bool) -> None: assert np.allclose(res["n_triangles"], [0, 0, 2, 0, 0, 0, 0, 0]) assert np.allclose(res["n_undirected_triangles"], [3, 3, 2, 6, 2, 2, 0, 0]) if return_metapath_list: - assert res["metapath_list"][2] == ["0-1", "1-1"] + assert set(res["metapath_list"][2]) == set(["0-1", "1-1"]) + + +def test_filter_relations() -> None: + for rels in [[0], [1], [0, 1]]: + for method in [ + kgtt.edge_metapath_count, + kgtt.edge_degree_cardinality_summary, + partial(kgtt.edge_pattern_summary, return_metapath_list=True), + ]: + # compare outputs of standard method call and filtered call + res_all = method() # type: ignore + res_all = res_all[res_all.r.isin(rels)] + res_filtered = method(filter_relations=rels) # type: ignore + assert np.all(res_all.index.values == res_filtered.index.values) + for c in res_all.columns: + if c == "metapath_list": + for a, b in zip(res_all[c].values, res_filtered[c].values): + assert a == b + else: + assert np.all(res_all[c].values == res_filtered[c].values) diff --git a/tests/test_node_topology_toolbox.py b/tests/test_node_topology_toolbox.py index 18d87ed..d002b41 100644 --- a/tests/test_node_topology_toolbox.py +++ b/tests/test_node_topology_toolbox.py @@ -19,10 +19,7 @@ @pytest.mark.parametrize("return_relation_list", [True, False]) -def test_small_graph_metrics(return_relation_list: bool) -> None: - # Define a small graph with all the features tested by - # the node_topology_toolbox - +def test_node_degree_summary(return_relation_list: bool) -> None: # entity degrees statistics res = kgtt.node_degree_summary(return_relation_list=return_relation_list) assert np.allclose(res["h_degree"], [3, 1, 3]) diff --git a/tests/test_relation_topology_toolbox.py b/tests/test_relation_topology_toolbox.py index 3f0c05c..e527a5f 100644 --- a/tests/test_relation_topology_toolbox.py +++ b/tests/test_relation_topology_toolbox.py @@ -20,10 +20,7 @@ kgtt = KGTopologyToolbox(df, 
head_column="H", relation_column="R", tail_column="T") -def test_small_graph_metrics() -> None: - # Define a small graph on five nodes with all the features tested by - # the relation_topology_toolbox - +def test_aggregate_by_r() -> None: dcs = kgtt.edge_degree_cardinality_summary(aggregate_by_r=True) eps = kgtt.edge_pattern_summary(return_metapath_list=True, aggregate_by_r=True)
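
A minimal usage sketch of the functionality added in this PR, on a toy graph. This is illustrative only, not part of the patch: the top-level import path is assumed, and the default column names ("h", "r", "t") are taken from the notebook and tests above.

# toy KG: a directed triangle 0 -(r0)-> 1 -(r1)-> 2, closed by 0 -(r2)-> 2
import pandas as pd
from kg_topology_toolbox import KGTopologyToolbox

df = pd.DataFrame({"h": [0, 1, 0], "r": [0, 1, 2], "t": [1, 2, 2]})
kgtt = KGTopologyToolbox(df)

# one row per (h, r, t, r1, r2) with at least one triangle:
# here the single (r1=0, r2=1) triangle over the edge 0 -> 2
print(kgtt.edge_metapath_count())

# restrict the expensive edge-level computations to relation 2 only;
# equivalent to computing on all edges and keeping r == 2, but cheaper
dcs = kgtt.edge_degree_cardinality_summary(filter_relations=[2])
eps = kgtt.edge_pattern_summary(filter_relations=[2], return_metapath_list=True)
print(dcs[["h_degree", "t_degree", "triple_cardinality"]])
print(eps[["has_composition", "n_triangles", "metapath_list"]])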