diff --git a/docs/source/notebooks/ogb_biokg_demo.ipynb b/docs/source/notebooks/ogb_biokg_demo.ipynb index aa6fac9..bb73448 100644 --- a/docs/source/notebooks/ogb_biokg_demo.ipynb +++ b/docs/source/notebooks/ogb_biokg_demo.ipynb @@ -15,7 +15,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 1, "metadata": {}, "outputs": [ { @@ -31,13 +31,13 @@ "source": [ "import sys\n", "!{sys.executable} -m pip uninstall -y kg_topology_toolbox\n", - "!pip install -q git+https://github.com/graphcore-research/kg-topology-toolbox.git\n", + "!pip install -q git+https://github.com/graphcore-research/kg-topology-toolbox.git --no-cache-dir\n", "!pip install -q jupyter ipywidgets ogb seaborn" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -63,7 +63,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -181,7 +181,7 @@ "[5088434 rows x 3 columns]" ] }, - "execution_count": 5, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -194,7 +194,9 @@ "all_triples = []\n", "for split in dataset.get_edge_split().values():\n", " all_triples.append(np.stack([split[\"head\"], split[\"relation\"], split[\"tail\"]]).T)\n", - "biokg_df = pd.DataFrame(np.concatenate(all_triples), columns=[\"h\", \"r\", \"t\"])\n", + "biokg_df = pd.DataFrame(\n", + " np.concatenate(all_triples).astype(np.int32), columns=[\"h\", \"r\", \"t\"]\n", + ")\n", "biokg_df" ] }, @@ -202,16 +204,32 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Based on this representation of the knowledge graph, we can proceed to compute its topological properties using the `KGTopologyToolbox` class." + "Based on this representation of the knowledge graph, we can proceed to instantiate the `KGTopologyToolbox` class to compute topological properties." 
] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/nethome/albertoc/research/knowledge_graphs/kg-topology-toolbox/.venv/lib/python3.10/site-packages/kg_topology_toolbox/topology_toolbox.py:64: UserWarning: The Knowledge Graph contains duplicated edges -- some functionalities may produce incorrect results\n", + " warnings.warn(\n" + ] + } + ], + "source": [ + "kgtt = KGTopologyToolbox(biokg_df)" + ] + }, + { + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "kgtt = KGTopologyToolbox()" + "Notice the warning raised by the constructor, which detects duplicated edges in the `biokg_df` DataFrame: to ensure optimal functionalities, duplicated edges should be removed before instantiating the `KGTopologyToolbox` class." ] }, { @@ -231,7 +249,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -385,19 +403,19 @@ "[45085 rows x 6 columns]" ] }, - "execution_count": 7, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "node_ds = kgtt.node_degree_summary(biokg_df)\n", + "node_ds = kgtt.node_degree_summary()\n", "node_ds" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -437,7 +455,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -491,7 +509,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -518,11 +536,11 @@ " h\n", " r\n", " t\n", - " h_unique_rel\n", " h_degree\n", + " h_unique_rel\n", " h_degree_same_rel\n", - " t_unique_rel\n", " t_degree\n", + " t_unique_rel\n", " t_degree_same_rel\n", " tot_degree\n", " tot_degree_same_rel\n", @@ -536,11 +554,11 @@ " 1718\n", " 0\n", " 3207\n", - " 5\n", " 191\n", + " 5\n", " 116\n", - " 6\n", " 46\n", + " 
6\n", " 14\n", " 236\n", " 129\n", @@ -552,11 +570,11 @@ " 4903\n", " 0\n", " 13662\n", - " 8\n", " 544\n", + " 8\n", " 33\n", - " 9\n", " 1975\n", + " 9\n", " 50\n", " 2518\n", " 82\n", @@ -568,11 +586,11 @@ " 5480\n", " 0\n", " 15999\n", - " 3\n", " 108\n", + " 3\n", " 5\n", - " 4\n", " 72\n", + " 4\n", " 22\n", " 179\n", " 26\n", @@ -584,11 +602,11 @@ " 3148\n", " 0\n", " 7247\n", - " 4\n", " 110\n", + " 4\n", " 99\n", - " 11\n", " 673\n", + " 11\n", " 271\n", " 782\n", " 369\n", @@ -600,11 +618,11 @@ " 10300\n", " 0\n", " 16202\n", - " 4\n", " 414\n", + " 4\n", " 315\n", - " 6\n", " 148\n", + " 6\n", " 31\n", " 561\n", " 345\n", @@ -632,11 +650,11 @@ " 2451\n", " 50\n", " 5097\n", - " 5\n", " 636\n", + " 5\n", " 272\n", - " 10\n", " 803\n", + " 10\n", " 272\n", " 1437\n", " 543\n", @@ -648,11 +666,11 @@ " 6456\n", " 50\n", " 8833\n", - " 10\n", " 743\n", - " 259\n", " 10\n", + " 259\n", " 371\n", + " 10\n", " 100\n", " 1111\n", " 358\n", @@ -664,11 +682,11 @@ " 9484\n", " 50\n", " 15873\n", - " 8\n", " 652\n", + " 8\n", " 213\n", - " 6\n", " 486\n", + " 6\n", " 163\n", " 1135\n", " 375\n", @@ -680,11 +698,11 @@ " 6365\n", " 50\n", " 496\n", - " 9\n", " 922\n", + " 9\n", " 277\n", - " 19\n", " 618\n", + " 19\n", " 173\n", " 1537\n", " 449\n", @@ -696,11 +714,11 @@ " 13860\n", " 50\n", " 6368\n", - " 7\n", " 485\n", + " 7\n", " 175\n", - " 8\n", " 455\n", + " 8\n", " 147\n", " 939\n", " 321\n", @@ -713,31 +731,31 @@ "" ], "text/plain": [ - " h r t h_unique_rel h_degree h_degree_same_rel \\\n", - "0 1718 0 3207 5 191 116 \n", - "1 4903 0 13662 8 544 33 \n", - "2 5480 0 15999 3 108 5 \n", - "3 3148 0 7247 4 110 99 \n", - "4 10300 0 16202 4 414 315 \n", - "... ... .. ... ... ... ... 
\n", - "5088429 2451 50 5097 5 636 272 \n", - "5088430 6456 50 8833 10 743 259 \n", - "5088431 9484 50 15873 8 652 213 \n", - "5088432 6365 50 496 9 922 277 \n", - "5088433 13860 50 6368 7 485 175 \n", + " h r t h_degree h_unique_rel h_degree_same_rel \\\n", + "0 1718 0 3207 191 5 116 \n", + "1 4903 0 13662 544 8 33 \n", + "2 5480 0 15999 108 3 5 \n", + "3 3148 0 7247 110 4 99 \n", + "4 10300 0 16202 414 4 315 \n", + "... ... .. ... ... ... ... \n", + "5088429 2451 50 5097 636 5 272 \n", + "5088430 6456 50 8833 743 10 259 \n", + "5088431 9484 50 15873 652 8 213 \n", + "5088432 6365 50 496 922 9 277 \n", + "5088433 13860 50 6368 485 7 175 \n", "\n", - " t_unique_rel t_degree t_degree_same_rel tot_degree \\\n", - "0 6 46 14 236 \n", - "1 9 1975 50 2518 \n", - "2 4 72 22 179 \n", - "3 11 673 271 782 \n", - "4 6 148 31 561 \n", - "... ... ... ... ... \n", - "5088429 10 803 272 1437 \n", - "5088430 10 371 100 1111 \n", - "5088431 6 486 163 1135 \n", - "5088432 19 618 173 1537 \n", - "5088433 8 455 147 939 \n", + " t_degree t_unique_rel t_degree_same_rel tot_degree \\\n", + "0 46 6 14 236 \n", + "1 1975 9 50 2518 \n", + "2 72 4 22 179 \n", + "3 673 11 271 782 \n", + "4 148 6 31 561 \n", + "... ... ... ... ... 
\n", + "5088429 803 10 272 1437 \n", + "5088430 371 10 100 1111 \n", + "5088431 486 6 163 1135 \n", + "5088432 618 19 173 1537 \n", + "5088433 455 8 147 939 \n", "\n", " tot_degree_same_rel triple_cardinality triple_cardinality_same_rel \n", "0 129 M:M M:M \n", @@ -755,13 +773,13 @@ "[5088434 rows x 13 columns]" ] }, - "execution_count": 10, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "edge_dcs = kgtt.edge_degree_cardinality_summary(biokg_df)\n", + "edge_dcs = kgtt.edge_degree_cardinality_summary()\n", "edge_dcs" ] }, @@ -774,7 +792,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -837,7 +855,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -872,7 +890,7 @@ "source": [ "### Edge topological patterns\n", "\n", - "The second method provided by `KGTopologyToolbox` for topological analysis at the edge level is `edge_pattern_summary`, which extracts information on several significant edge topological patterns. In particular, it detects whether the edge (h,r,t) is a loop, is symmetric or has inverse, inference, composition (directed and undirected):\n", + "`KGTopologyToolbox` also allows us to perform a topological analysis at the edge level, using the method `edge_pattern_summary`, which extracts information on several significant edge topological patterns. 
In particular, it detects whether the edge (h,r,t) is a loop, is symmetric or has inverse, inference, composition (directed and undirected):\n", "\n", "![image info](../images/edge_patterns.png)\n", "\n", @@ -881,7 +899,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -1182,19 +1200,19 @@ "[5088434 rows x 15 columns]" ] }, - "execution_count": 13, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "edge_eps = kgtt.edge_pattern_summary(biokg_df)\n", + "edge_eps = kgtt.edge_pattern_summary()\n", "edge_eps" ] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -1215,7 +1233,7 @@ "dtype: float64" ] }, - "execution_count": 14, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -1229,7 +1247,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 13, "metadata": {}, "outputs": [ { @@ -1265,12 +1283,12 @@ "source": [ "## Relation-level analysis\n", "\n", - "The method `aggregate_by_relation` allows the user to aggregate at the relation-level the statistics outputted by the edge-level methods `edge_degree_cardinality_summary` and `edge_pattern_summary`. This converts DataFrames indexed on the KG edges to DataFrames indexed on the IDs of the unique relation types." + "All edge topological properties seen in the previous section can be aggregated over triples of the same relation type, to produce relation-level statistics. To do so, we can either set the option `aggregate_by_r = True` when calling the methods `edge_degree_cardinality_summary`, `edge_pattern_summary`, or - if edge topological metrics have already been precomputed - use the utility function `aggregate_by_relation`, which converts DataFrames indexed on the KG edges to DataFrames indexed on the IDs of the unique relation types." 
] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 15, "metadata": {}, "outputs": [ { @@ -1298,12 +1316,12 @@ " frac_triples\n", " unique_h\n", " unique_t\n", - " h_unique_rel_mean\n", - " h_unique_rel_std\n", - " h_unique_rel_quartile1\n", - " h_unique_rel_quartile2\n", - " h_unique_rel_quartile3\n", " h_degree_mean\n", + " h_degree_std\n", + " h_degree_quartile1\n", + " h_degree_quartile2\n", + " h_degree_quartile3\n", + " h_unique_rel_mean\n", " ...\n", " tot_degree_same_rel_quartile1\n", " tot_degree_same_rel_quartile2\n", @@ -1348,12 +1366,12 @@ " 0.015931\n", " 9742\n", " 9337\n", - " 8.110293\n", - " 8.247277\n", - " 4.0\n", - " 5.0\n", - " 8.0\n", " 569.252202\n", + " 1083.315332\n", + " 111.0\n", + " 222.0\n", + " 521.0\n", + " 8.110293\n", " ...\n", " 45.0\n", " 112.0\n", @@ -1372,12 +1390,12 @@ " 0.001114\n", " 698\n", " 1536\n", - " 27.048157\n", - " 12.936410\n", - " 17.0\n", - " 31.0\n", - " 36.0\n", " 2518.765391\n", + " 2186.452620\n", + " 435.0\n", + " 2087.0\n", + " 4028.0\n", + " 27.048157\n", " ...\n", " 14.0\n", " 32.0\n", @@ -1396,12 +1414,12 @@ " 0.013158\n", " 612\n", " 612\n", - " 36.404307\n", - " 5.600706\n", - " 33.0\n", - " 36.0\n", - " 41.0\n", " 4129.511919\n", + " 1935.630599\n", + " 2548.0\n", + " 3968.0\n", + " 5649.0\n", + " 36.404307\n", " ...\n", " 332.0\n", " 404.0\n", @@ -1420,12 +1438,12 @@ " 0.003849\n", " 491\n", " 491\n", - " 37.095941\n", - " 5.547389\n", - " 33.0\n", - " 37.0\n", - " 41.0\n", " 4527.399592\n", + " 1943.714179\n", + " 2925.0\n", + " 4507.0\n", + " 6161.0\n", + " 37.095941\n", " ...\n", " 114.0\n", " 157.0\n", @@ -1444,12 +1462,12 @@ " 0.006295\n", " 526\n", " 525\n", - " 37.319567\n", - " 5.384523\n", - " 34.0\n", - " 38.0\n", - " 41.0\n", " 4511.067834\n", + " 1905.395180\n", + " 2931.0\n", + " 4507.0\n", + " 6148.0\n", + " 37.319567\n", " ...\n", " 188.0\n", " 243.0\n", @@ -1468,29 +1486,29 @@ "" ], "text/plain": [ - " num_triples frac_triples unique_h unique_t h_unique_rel_mean 
\\\n", - "r \n", - "0 81066 0.015931 9742 9337 8.110293 \n", - "1 5669 0.001114 698 1536 27.048157 \n", - "2 66954 0.013158 612 612 36.404307 \n", - "3 19585 0.003849 491 491 37.095941 \n", - "4 32034 0.006295 526 525 37.319567 \n", + " num_triples frac_triples unique_h unique_t h_degree_mean h_degree_std \\\n", + "r \n", + "0 81066 0.015931 9742 9337 569.252202 1083.315332 \n", + "1 5669 0.001114 698 1536 2518.765391 2186.452620 \n", + "2 66954 0.013158 612 612 4129.511919 1935.630599 \n", + "3 19585 0.003849 491 491 4527.399592 1943.714179 \n", + "4 32034 0.006295 526 525 4511.067834 1905.395180 \n", "\n", - " h_unique_rel_std h_unique_rel_quartile1 h_unique_rel_quartile2 \\\n", - "r \n", - "0 8.247277 4.0 5.0 \n", - "1 12.936410 17.0 31.0 \n", - "2 5.600706 33.0 36.0 \n", - "3 5.547389 33.0 37.0 \n", - "4 5.384523 34.0 38.0 \n", + " h_degree_quartile1 h_degree_quartile2 h_degree_quartile3 \\\n", + "r \n", + "0 111.0 222.0 521.0 \n", + "1 435.0 2087.0 4028.0 \n", + "2 2548.0 3968.0 5649.0 \n", + "3 2925.0 4507.0 6161.0 \n", + "4 2931.0 4507.0 6148.0 \n", "\n", - " h_unique_rel_quartile3 h_degree_mean ... tot_degree_same_rel_quartile1 \\\n", - "r ... \n", - "0 8.0 569.252202 ... 45.0 \n", - "1 36.0 2518.765391 ... 14.0 \n", - "2 41.0 4129.511919 ... 332.0 \n", - "3 41.0 4527.399592 ... 114.0 \n", - "4 41.0 4511.067834 ... 188.0 \n", + " h_unique_rel_mean ... tot_degree_same_rel_quartile1 \\\n", + "r ... \n", + "0 8.110293 ... 45.0 \n", + "1 27.048157 ... 14.0 \n", + "2 36.404307 ... 332.0 \n", + "3 37.095941 ... 114.0 \n", + "4 37.319567 ... 
188.0 \n", "\n", " tot_degree_same_rel_quartile2 tot_degree_same_rel_quartile3 \\\n", "r \n", @@ -1535,27 +1553,29 @@ "[5 rows x 51 columns]" ] }, - "execution_count": 16, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "kgtt.aggregate_by_relation(edge_dcs).head()" + "from kg_topology_toolbox.utils import aggregate_by_relation\n", + "\n", + "aggregate_by_relation(edge_dcs).head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Notice on the left the columns `num_triples`, `frac_triples`, `unique_h`, `unique_t` giving additional statistics for relation types (number of edges and relative frequency, number of unique entities used as heads/tails by triples of the relation type).\n", + "Notice on the extra columns `num_triples`, `frac_triples`, `unique_h`, `unique_t` giving additional statistics for relation types (number of edges and relative frequency, number of unique entities used as heads/tails by triples of the relation type).\n", "\n", "Similarly, by aggregating the `edge_eps` DataFrame we can look at the distribution of edge topological patterns within each relation type." ] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 16, "metadata": {}, "outputs": [ { @@ -1812,25 +1832,25 @@ "[5 rows x 32 columns]" ] }, - "execution_count": 17, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "kgtt.aggregate_by_relation(edge_eps).head()" + "aggregate_by_relation(edge_eps).head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Additional methods are provided for the analysis at the relation level: `jaccard_similarity_relation_sets` to compute the Jaccard similarity of the sets of head/tail entities used by each relation; `relational_affinity_ingram` to compute the InGram pairwise relation similarity (see [paper](https://arxiv.org/abs/2305.19987)). 
" + "Additional methods are provided in the `KGTopologyToolbox` class for analysis at the relation level: `jaccard_similarity_relation_sets` to compute the Jaccard similarity of the sets of head/tail entities used by each relation; `relational_affinity_ingram` to compute the InGram pairwise relation similarity (see [paper](https://arxiv.org/abs/2305.19987)). " ] }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 17, "metadata": {}, "outputs": [ { @@ -2106,18 +2126,18 @@ "[1275 rows x 14 columns]" ] }, - "execution_count": 18, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "kgtt.jaccard_similarity_relation_sets(biokg_df)" + "kgtt.jaccard_similarity_relation_sets()" ] }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 18, "metadata": {}, "outputs": [ { @@ -2235,13 +2255,13 @@ "[2550 rows x 3 columns]" ] }, - "execution_count": 19, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "kgtt.relational_affinity_ingram(biokg_df)" + "kgtt.relational_affinity_ingram()" ] } ], diff --git a/pyproject.toml b/pyproject.toml index 6628c79..edc0115 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "kg-topology-toolbox" -version = "0.1.0" +version = "1.0.0" authors = [ {name = "Alberto Cattaneo"}, {name = "Daniel Justus"}, diff --git a/src/kg_topology_toolbox/topology_toolbox.py b/src/kg_topology_toolbox/topology_toolbox.py index 4fffe64..8d29e96 100644 --- a/src/kg_topology_toolbox/topology_toolbox.py +++ b/src/kg_topology_toolbox/topology_toolbox.py @@ -5,37 +5,143 @@ Topology toolbox main functionalities """ -from collections.abc import Iterable +from functools import cache import numpy as np import pandas as pd from scipy.sparse import coo_array -from kg_topology_toolbox.utils import composition_count, jaccard_similarity +from kg_topology_toolbox.utils import ( + 
aggregate_by_relation, + check_kg_df_structure, + composition_count, + jaccard_similarity, + node_degrees_and_rels, +) class KGTopologyToolbox: """ - Toolbox class to compute various Knowledge Graph topology statistics. + Toolbox class to compute Knowledge Graph topology statistics. """ - def node_degree_summary( - self, df: pd.DataFrame, return_relation_list: bool = False - ) -> pd.DataFrame: + def __init__( + self, + kg_df: pd.DataFrame, + head_column: str = "h", + relation_column: str = "r", + tail_column: str = "t", + ): + """ + Instantiate the Topology Toolbox for a Knowledge Graph defined + by the list of its edges (h,r,t). + + :param kg_df: + A Knowledge Graph represented as a pd.DataFrame. + Must contain at least three columns, which specify the IDs of + head entity, relation type and tail entity for each edge. + :param head_column: + The name of the column with the IDs of head entities. Default: "h". + :param relation_column: + The name of the column with the IDs of relation types. Default: "r". + :param tail_column: + The name of the column with the IDs of tail entities. Default: "t". + + """ + check_kg_df_structure(kg_df, head_column, relation_column, tail_column) + + self.df = kg_df[[head_column, relation_column, tail_column]].rename( + columns={head_column: "h", relation_column: "r", tail_column: "t"} + ) + self.n_entity = self.df[["h", "t"]].max().max() + 1 + self.n_rel = self.df.r.max() + 1 + + def loop_count(self) -> pd.DataFrame: + """ + For each entity in the KG, compute the number of loops around the entity + (i.e., the number of edges having the entity as both head and tail). + + :return: + Loop count DataFrame, indexed on the IDs of the graph entities. 
+ """ + n_loops = ( + self.df[self.df.h == self.df.t].groupby("h").agg(n_loops=("r", "count")) + ) + return ( + pd.DataFrame(n_loops, index=np.arange(self.n_entity)).fillna(0).astype(int) + ) + + @cache + def node_head_degree(self, return_relation_list: bool = False) -> pd.DataFrame: + """ + For each entity in the KG, compute the number of edges having it as head + (head-degree, or out-degree of the head node). + The relation types going out of the head node are also identified. + + :param return_relation_list: + If True, return the list of unique relations going + out of the head node. WARNING: expensive for large graphs. + Default: False. + + :return: + The result DataFrame, indexed on the IDs `e` of the graph entities, + with columns: + + - **h_degree** (int): Number of triples with head entity `e`. + - **h_unique_rel** (int): Number of distinct relation types + among edges with head entity `e`. + - **h_rel_list** (Optional[list]): List of unique relation types + among edges with head entity `e`. + Only returned if `return_relation_list = True`. + """ + node_df = node_degrees_and_rels( + self.df, "h", self.n_entity, return_relation_list + ) + return node_df.rename(columns={n: "h_" + n for n in node_df.columns}) + + @cache + def node_tail_degree(self, return_relation_list: bool = False) -> pd.DataFrame: + """ + For each entity in the KG, compute the number of edges having it as tail + (tail-degree, or in-degree of the tail node). + The relation types going into the tail node are also identified. + + :param return_relation_list: + If True, return the list of unique relation types going + into the tail node. WARNING: expensive for large graphs. + Default: False. + + :return: + The result DataFrame, indexed on the IDs `e` of the graph entities, + with columns: + + - **t_degree** (int): Number of triples with tail entity `e`. + - **t_unique_rel** (int): Number of distinct relation types + among edges with tail entity `e`. 
+ - **t_rel_list** (Optional[list]): List of unique relation types + among edges with tail entity `e`. + Only returned if `return_relation_list = True`. + """ + node_df = node_degrees_and_rels( + self.df, "t", self.n_entity, return_relation_list + ) + return node_df.rename(columns={n: "t_" + n for n in node_df.columns}) + + def node_degree_summary(self, return_relation_list: bool = False) -> pd.DataFrame: """ - For each entity, this function computes the number of edges having it as a head + For each entity in the KG, compute the number of edges having it as a head (head-degree, or out-degree), as a tail (tail-degree, or in-degree) - or one of the two (total-degree) in the Knowledge Graph. + or one of the two (total-degree). The in-going and out-going relation types are also identified. The output dataframe is indexed on the IDs of the graph entities. - :param df: A graph represented as a pd.DataFrame. - Must contain at least three columns `h`, `r`, `t`. - :param return_relation_list: If True, return the list of unique relations going + :param return_relation_list: + If True, return the list of unique relations going in/out of an entity. WARNING: expensive for large graphs. - :return: The results dataframe, indexed over the same entity ID `e` used in df, + :return: + The results dataframe, indexed on the IDs `e` of the graph entities, with columns: - **h_degree** (int): Number of triples with head entity `e`. @@ -43,45 +149,33 @@ def node_degree_summary( - **tot_degree** (int): Number of triples with head entity `e` or tail entity `e`. - **h_unique_rel** (int): Number of distinct relation types among edges with head entity `e`. - - **h_rel_list** (list): List of unique relation types among edges + - **h_rel_list** (Optional[list]): List of unique relation types among edges with head entity `e`. + Only returned if `return_relation_list = True`. - **t_unique_rel** (int): Number of distinct relation types among edges with tail entity `e`. 
- - **t_rel_list** (list): List of unique relation types among edges + - **t_rel_list** (Optional[list]): List of unique relation types among edges with tail entity `e`. + Only returned if `return_relation_list = True`. - **n_loops** (int): number of loops around entity `e`. """ - n_entity = df[["h", "t"]].max().max() + 1 - h_rel_list = {"h_rel_list": ("r", "unique")} if return_relation_list else {} - t_rel_list = {"t_rel_list": ("r", "unique")} if return_relation_list else {} - nodes = pd.DataFrame( - df.groupby("h").agg( - h_degree=("r", "count"), h_unique_rel=("r", "nunique"), **h_rel_list # type: ignore - ), - index=np.arange(n_entity), - ) - nodes = nodes.merge( - df.groupby("t").agg( - t_degree=("r", "count"), t_unique_rel=("r", "nunique"), **t_rel_list # type: ignore - ), + nodes_df = pd.merge( + self.node_head_degree(return_relation_list), + self.node_tail_degree(return_relation_list), left_index=True, right_index=True, - how="left", ) - nodes = nodes.merge( - df[df.h == df.t].groupby("h").agg(n_loops=("r", "count")), + nodes_df = pd.merge( + nodes_df, + self.loop_count(), left_index=True, right_index=True, - how="left", ) - nodes[["h_degree", "h_unique_rel", "t_degree", "t_unique_rel", "n_loops"]] = ( - nodes[["h_degree", "h_unique_rel", "t_degree", "t_unique_rel", "n_loops"]] - .fillna(0) - .astype(int) + nodes_df["tot_degree"] = ( + nodes_df["h_degree"] + nodes_df["t_degree"] - nodes_df["n_loops"] ) - nodes["tot_degree"] = nodes["h_degree"] + nodes["t_degree"] - nodes["n_loops"] - return nodes[ + return nodes_df[ ["h_degree", "t_degree", "tot_degree", "h_unique_rel"] + (["h_rel_list"] if return_relation_list else []) + ["t_unique_rel"] @@ -89,24 +183,115 @@ def node_degree_summary( + ["n_loops"] ] - def edge_degree_cardinality_summary(self, df: pd.DataFrame) -> pd.DataFrame: + @cache + def edge_head_degree(self) -> pd.DataFrame: """ - For each triple, this function computes the number of edges with the same head + For each edge in the KG, compute the 
number of edges + (in total or of the same relation type) with the same head node. + + :return: + The result DataFrame, with the same indexing and ordering of + triples as the original KG DataFrame, with columns + (in addition to `h`, `r`, `t`): + + - **h_unique_rel** (int): Number of distinct relation types + among edges with head entity `h`. + - **h_degree** (int): Number of triples with head entity `h`. + - **h_degree_same_rel** (int): Number of triples with head entity `h` + and relation type `r`. + """ + edge_by_hr_count = self.df.groupby(["h", "r"], as_index=False).agg( + h_degree_same_rel=("t", "count") + ) + df_res = self.df.merge( + self.node_head_degree(), left_on=["h"], right_index=True, how="left" + ) + return df_res.merge(edge_by_hr_count, on=["h", "r"], how="left") + + @cache + def edge_tail_degree(self) -> pd.DataFrame: + """ + For each edge in the KG, compute the number of edges + (in total or of the same relation type) with the same tail node. + + :return: + The result DataFrame, with the same indexing and ordering of + triples as the original KG DataFrame, with columns + (in addition to `h`, `r`, `t`): + + - **t_unique_rel** (int): Number of distinct relation types + among edges with tail entity `t`. + - **t_degree** (int): Number of triples with tail entity `t`. + - **t_degree_same_rel** (int): Number of triples with tail entity `t` + and relation type `r`. + """ + edge_by_rt_count = self.df.groupby(["r", "t"], as_index=False).agg( + t_degree_same_rel=("h", "count") + ) + df_res = self.df.merge( + self.node_tail_degree(), left_on=["t"], right_index=True, how="left" + ) + return df_res.merge(edge_by_rt_count, on=["r", "t"], how="left") + + def edge_cardinality(self) -> pd.DataFrame: + """ + Classify the cardinality of each edge in the KG: one-to-one + (out-degree=in-degree=1), one-to-many (out-degree>1, in-degree=1), + many-to-one(out-degree=1, in-degree>1) or many-to-many + (in-degree>1, out-degree>1). 
+ + :return: + The result DataFrame, with the same indexing and ordering of + triples as the original KG DataFrame, with columns + (in addition to `h`, `r`, `t`): + + - **triple_cardinality** (int): cardinality type of the edge. + - **triple_cardinality_same_rel** (int): cardinality type of the edge in + the subgraph of edges with relation type `r`. + """ + head_degree = self.edge_head_degree() + tail_degree = self.edge_tail_degree() + df_res = pd.DataFrame( + {"h": head_degree.h, "r": head_degree.r, "t": head_degree.t} + ) + # check if the values in the pair (h_degree, t_degree) are =1 or >1 + # to determine the edge cardinality + for suffix in ["", "_same_rel"]: + # check if the values in the pair (h_degree, t_degree) are =1 or >1 + # to determine the edge cardinality + edge_type = 2 * (head_degree["h_degree" + suffix] == 1) + ( + tail_degree["t_degree" + suffix] == 1 + ) + df_res["triple_cardinality" + suffix] = pd.cut( + edge_type, + bins=[0, 1, 2, 3, 4], + right=False, + labels=["M:M", "1:M", "M:1", "1:1"], + ).astype(str) + return df_res + + def edge_degree_cardinality_summary( + self, aggregate_by_r: bool = False + ) -> pd.DataFrame: + """ + For each edge in the KG, compute the number of edges with the same head (head-degree, or out-degree), the same tail (tail-degree, or in-degree) - or one of the two (total-degree) in the Knowledge Graph. + or one of the two (total-degree). Based on entity degrees, each triple is classified as either one-to-one (out-degree=in-degree=1), one-to-many (out-degree>1, in-degree=1), many-to-one(out-degree=1, in-degree>1) or many-to-many (in-degree>1, out-degree>1). The output dataframe maintains the same indexing and ordering of triples - as the input one. + as the original Knowledge Graph dataframe. - :param df: A graph represented as a pd.DataFrame. - Must contain at least three columns `h`, `r`, `t`. 
+ :param aggregate_by_r: + If True, return metrics aggregated by relation type + (the output DataFrame will be indexed over relation IDs). - :return: The results dataframe. Contains the following columns - (in addition to `h`, `r`, `t` in ``df``): + :return: + The results dataframe. Contains the following columns + (in addition to `h`, `r`, `t`): - **h_unique_rel** (int): Number of distinct relation types among edges with head entity h. @@ -126,33 +311,18 @@ def edge_degree_cardinality_summary(self, df: pd.DataFrame) -> pd.DataFrame: - **triple_cardinality_same_rel** (int): cardinality type of the edge in the subgraph of edges with relation type r. """ - gr_by_h_count = df.groupby("h", as_index=False).agg( - h_unique_rel=("r", "nunique"), h_degree=("t", "count") - ) - gr_by_hr_count = df.groupby(["h", "r"], as_index=False).agg( - h_degree_same_rel=("t", "count") - ) - gr_by_t_count = df.groupby("t", as_index=False).agg( - t_unique_rel=("r", "nunique"), t_degree=("h", "count") - ) - gr_by_rt_count = df.groupby(["r", "t"], as_index=False).agg( - t_degree_same_rel=("h", "count") - ) - - df_res = df.merge(gr_by_h_count, left_on=["h"], right_on=["h"], how="left") - df_res = df_res.merge( - gr_by_hr_count, left_on=["h", "r"], right_on=["h", "r"], how="left" - ) - df_res = df_res.merge(gr_by_t_count, left_on=["t"], right_on=["t"], how="left") - df_res = df_res.merge( - gr_by_rt_count, left_on=["t", "r"], right_on=["t", "r"], how="left" + df_res = pd.concat( + [ + self.edge_head_degree(), + self.edge_tail_degree().drop(columns=["h", "r", "t"]), + ], + axis=1, ) # compute number of parallel edges to avoid double-counting them # in total degree num_parallel = df_res.merge( - df.groupby(["h", "t"], as_index=False).agg(n_parallel=("r", "count")), - left_on=["h", "t"], - right_on=["h", "t"], + self.df.groupby(["h", "t"], as_index=False).agg(n_parallel=("r", "count")), + on=["h", "t"], how="left", ) df_res["tot_degree"] = ( @@ -164,46 +334,43 @@ def 
edge_degree_cardinality_summary(self, df: pd.DataFrame) -> pd.DataFrame: df_res.h_degree_same_rel + df_res.t_degree_same_rel - 1 ) - # check if the values in the pair (h_degree, t_degree) are =1 or >1 - # to determine the edge cardinality - legend = { - 0: "M:M", - 1: "1:M", - 2: "M:1", - 3: "1:1", - } - for suffix in ["", "_same_rel"]: - edge_type = 2 * (df_res["h_degree" + suffix] == 1) + ( - df_res["t_degree" + suffix] == 1 - ) - df_res["triple_cardinality" + suffix] = edge_type.apply(lambda x: legend[x]) - return df_res + edge_cardinality = self.edge_cardinality() + df_res["triple_cardinality"] = edge_cardinality["triple_cardinality"] + df_res["triple_cardinality_same_rel"] = edge_cardinality[ + "triple_cardinality_same_rel" + ] + return aggregate_by_relation(df_res) if aggregate_by_r else df_res def edge_pattern_summary( self, - df: pd.DataFrame, return_metapath_list: bool = False, composition_chunk_size: int = 2**8, composition_workers: int = 32, + aggregate_by_r: bool = False, ) -> pd.DataFrame: """ - This function analyses the structural properties of each edge in the graph: + Analyse structural properties of each edge in the KG: symmetry, presence of inverse/inference(=parallel) edges and triangles supported on the edge. The output dataframe maintains the same indexing and ordering of triples - as the input one. + as the original Knowledge Graph dataframe. - :param df: A graph represented as a pd.DataFrame. - Must contain at least three columns `h`, `r`, `t`. - :param return_metapath_list: If True, return the list of unique metapaths for all + :param return_metapath_list: + If True, return the list of unique metapaths for all triangles supported over one edge. WARNING: very expensive for large graphs. - :param composition_chunk_size: Size of column chunks of sparse adjacency matrix + :param composition_chunk_size: + Size of column chunks of sparse adjacency matrix to compute the triangle count. 
- :param composition_workers: Number of workers to compute the triangle count. + :param composition_workers: + Number of workers to compute the triangle count. + :param aggregate_by_r: + If True, return metrics aggregated by relation type + (the output DataFrame will be indexed over relation IDs). - :return: The results dataframe. Contains the following columns - (in addition to `h`, `r`, `t` in ``df``): + :return: + The results dataframe. Contains the following columns + (in addition to `h`, `r`, `t`): - **is_loop** (bool): True if the triple is a loop (``h == t``). - **is_symmetric** (bool): True if the triple (t, r, h) is also contained @@ -230,12 +397,14 @@ def edge_pattern_summary( """ # symmetry-asymmetry # edges with h/t switched - df_inv = df.reindex(columns=["t", "r", "h"]).rename( + df_inv = self.df.reindex(columns=["t", "r", "h"]).rename( columns={"t": "h", "r": "r", "h": "t"} ) - df_res = pd.DataFrame({"h": df.h, "r": df.r, "t": df.t, "is_symmetric": False}) + df_res = pd.DataFrame( + {"h": self.df.h, "r": self.df.r, "t": self.df.t, "is_symmetric": False} + ) df_res.loc[ - df.reset_index().merge(df_inv)["index"], + self.df.reset_index().merge(df_inv)["index"], "is_symmetric", ] = True # loops are treated separately @@ -277,7 +446,7 @@ def edge_pattern_summary( # composition & metapaths # discard loops as edges of a triangle - df_wo_loops = df[df.h != df.t] + df_wo_loops = self.df[self.df.h != self.df.t] if return_metapath_list: # 2-hop paths df_bridges = df_wo_loops.merge( @@ -336,7 +505,7 @@ def edge_pattern_summary( ) df_res["has_undirected_composition"] = df_res["n_undirected_triangles"] > 0 - return df_res[ + df_res = df_res[ [ "h", "r", @@ -357,95 +526,16 @@ def edge_pattern_summary( + (["metapath_list"] if return_metapath_list else []) ] - def aggregate_by_relation(self, edge_topology_df: pd.DataFrame) -> pd.DataFrame: - """ - Aggregate topology metrics of all triples of the same relation type. 
- To be applied to the output dataframe of either - :meth:`KGTopologyToolbox.edge_degree_cardinality_summary` or - :meth:`KGTopologyToolbox.edge_pattern_summary`. - - The returned dataframe is indexed over relation type IDs, with columns - giving the aggregated statistics of triples of the correspondig relation. - The name of the columns is of the form ``column_name_in_input_df + suffix``. - The aggregation is perfomed by returning: - - - for numerical metrics: mean, standard deviation and quartiles - (``suffix`` = "_mean", "_std", "_quartile1", "_quartile2", "_quartile3"); - - for boolean metrics: the fraction of triples of the relation type - with metric = True (``suffix`` = "_frac"); - - for string metrics: for each possible label, the fraction of triples - of the relation type with that metric value (``suffix`` = "_{label}_frac") - - for list metrics: the unique metric values across triples of the relation - type (``suffix`` = "_unique"). - - :param edge_topology_df: pd.DataFrame of edge topology metrics. - Must contain at least three columns `h`, `r`, `t`. - - :return: The results dataframe. In addition to the columns with the aggregated - metrics by relation type, it also contains columns: - - - **num_triples** (int): Number of triples for each relation type. - - **frac_triples** (float): Fraction of overall triples represented by each - relation type. - - **unique_h** (int): Number of unique head entities used by triples of each - relation type. - - **unique_t** (int): Number of unique tail entities used by triples of each - relation type. 
- """ - df_by_r = edge_topology_df.groupby("r") - df_res = df_by_r.agg(num_triples=("r", "count")) - df_res["frac_triples"] = df_res["num_triples"] / edge_topology_df.shape[0] - col: str - for col, col_dtype in edge_topology_df.drop(columns=["r"]).dtypes.items(): # type: ignore - if col in ["h", "t"]: - df_res[f"unique_{col}"] = df_by_r[col].nunique() - elif col_dtype == object: - if isinstance(edge_topology_df[col].iloc[0], str): - for label in np.unique(edge_topology_df[col]): - df_res[f"{col}_{label}_frac"] = ( - edge_topology_df[edge_topology_df[col] == label] - .groupby("r")[col] - .count() - / df_res["num_triples"] - ).fillna(0) - elif isinstance(edge_topology_df[col].iloc[0], Iterable): - df_res[f"{col}_unique"] = ( - df_by_r[col] - .agg(np.unique) - .apply( - lambda x: ( - np.unique( - np.concatenate( - [lst for lst in x if len(lst) > 0] or [[]] - ) - ).tolist() - ) - ) - ) - else: - print(f"Skipping column {col}: no known aggregation mode") - continue - elif col_dtype == int or col_dtype == float: - df_res[f"{col}_mean"] = df_by_r[col].mean() - df_res[f"{col}_std"] = df_by_r[col].std() - for q in range(1, 4): - df_res[f"{col}_quartile{q}"] = df_by_r[col].agg( - lambda x: np.quantile(x, 0.25 * q) - ) - elif col_dtype == bool: - df_res[f"{col}_frac"] = df_by_r[col].mean() - return df_res + return aggregate_by_relation(df_res) if aggregate_by_r else df_res - def jaccard_similarity_relation_sets(self, df: pd.DataFrame) -> pd.DataFrame: + def jaccard_similarity_relation_sets(self) -> pd.DataFrame: """ Compute the similarity between relations defined as the Jaccard Similarity between sets of entities (heads and tails) for all pairs of relations in the graph. - :param df: A graph represented as a pd.DataFrame. - Must contain at least three columns `h`, `r`, `t`. - - :return: The results dataframe. Contains the following columns: + :return: + The results dataframe. Contains the following columns: - **r1** (int): Index of the first relation. 
- **r2** (int): Index of the second relation. @@ -468,7 +558,7 @@ def jaccard_similarity_relation_sets(self, df: pd.DataFrame) -> pd.DataFrame: - **jaccard_both** (float): Jaccard similarity between the full entity set of r1 and r2. """ - ent_unique = df.groupby("r", as_index=False).agg( + ent_unique = self.df.groupby("r", as_index=False).agg( num_triples=("r", "count"), head=("h", "unique"), tail=("t", "unique") ) ent_unique["both"] = ent_unique.apply( @@ -487,7 +577,7 @@ def jaccard_similarity_relation_sets(self, df: pd.DataFrame) -> pd.DataFrame: df_res = df_res[df_res.r1 < df_res.r2] df_res["num_triples_both"] = df_res["num_triples_r1"] + df_res["num_triples_r2"] - df_res["frac_triples_both"] = df_res["num_triples_both"] / df.shape[0] + df_res["frac_triples_both"] = df_res["num_triples_both"] / self.df.shape[0] df_res["num_entities_both"] = df_res.apply( lambda x: len( np.unique( @@ -531,9 +621,7 @@ def jaccard_similarity_relation_sets(self, df: pd.DataFrame) -> pd.DataFrame: ] return df_res - def relational_affinity_ingram( - self, df: pd.DataFrame, min_max_norm: bool = False - ) -> pd.DataFrame: + def relational_affinity_ingram(self, min_max_norm: bool = False) -> pd.DataFrame: """ Compute the similarity between relations based on the approach proposed in InGram: Inductive Knowledge Graph Embedding via Relation Graphs, @@ -542,34 +630,31 @@ def relational_affinity_ingram( Only the pairs of relations witn ``affinity > 0`` are shown in the returned dataframe. - :param df: A graph represented as a pd.DataFrame. - Must contain at least three columns `h`, `r`, `t`. - :param min_max_norm: min-max normalization of edge weights. Defaults to False. + :param min_max_norm: + min-max normalization of edge weights. Defaults to False. - :return: The results dataframe. Contains the following columns: + :return: + The results dataframe. Contains the following columns: - **h_relation** (int): Index of the head relation. - **t_relation** (int): Index of the tail relation. 
- **edge_weight** (float): Weight for the affinity between the head and the tail relation. """ - n_entities = df[["h", "t"]].max().max() + 1 - n_rels = df.r.max() + 1 - - hr_freqs = df.groupby(["h", "r"], as_index=False).count() + hr_freqs = self.df.groupby(["h", "r"], as_index=False).count() # normalize by global h frequency hr_freqs["t"] = hr_freqs["t"] / hr_freqs.groupby("h")["t"].transform("sum") - rt_freqs = df.groupby(["t", "r"], as_index=False).count() + rt_freqs = self.df.groupby(["t", "r"], as_index=False).count() # normalize by global t frequency rt_freqs["h"] = rt_freqs["h"] / rt_freqs.groupby("t")["h"].transform("sum") E_h = coo_array( (hr_freqs.t, (hr_freqs.h, hr_freqs.r)), - shape=[n_entities, n_rels], + shape=[self.n_entity, self.n_rel], ) E_t = coo_array( (rt_freqs.h, (rt_freqs.t, rt_freqs.r)), - shape=[n_entities, n_rels], + shape=[self.n_entity, self.n_rel], ) A = (E_h.T @ E_h).toarray() + (E_t.T @ E_t).toarray() diff --git a/src/kg_topology_toolbox/utils.py b/src/kg_topology_toolbox/utils.py index c35b0f0..d3a3d55 100644 --- a/src/kg_topology_toolbox/utils.py +++ b/src/kg_topology_toolbox/utils.py @@ -4,24 +4,183 @@ Utility functions """ +import warnings +from collections.abc import Iterable from multiprocessing import Pool import numpy as np import pandas as pd from numpy.typing import NDArray +from pandas.api.types import is_integer_dtype from scipy.sparse import coo_array, csc_array, csr_array +def check_kg_df_structure(kg_df: pd.DataFrame, h: str, r: str, t: str) -> None: + """ + Utility to perform sanity checks on the structure of the provided DataFrame, + to ensure that it encodes a Knowledge Graph in a compatible way. + + :param kg_df: + The Knowledge Graph DataFrame. + :param h: + The name of the column with the IDs of head entities. + :param r: + The name of the column with the IDs of relation types. + :param t: + The name of the column with the IDs of tail entities. 
+ + """ + # check h,r,t columns are present and of an integer type + for col_name in [h, r, t]: + if col_name in kg_df.columns: + if not is_integer_dtype(kg_df[col_name]): + raise TypeError(f"Column {col_name} needs to be of an integer dtype") + else: + raise ValueError(f"DataFrame {kg_df} has no column named {col_name}") + # check there are no duplicated (h,r,t) triples + if kg_df[[h, r, t]].duplicated().any(): + warnings.warn( + "The Knowledge Graph contains duplicated edges" + " -- some functionalities may produce incorrect results" + ) + + +def node_degrees_and_rels( + df: pd.DataFrame, column: str, n_entity: int, return_relation_list: bool +) -> pd.DataFrame: + """ + Aggregate edges by head/tail node and compute associated statistics. + + :param df: + Dataframe of (h,r,t) triples. + :param column: + Name of the column used to aggregate edges. + :param n_entity: + Total number of entities in the graph. + :param return_relation_list: + If True, return the list of unique relations types + in the set of aggregated edges. + + :return: + The result DataFrame, indexed on the IDs of the graph entities, + with columns: + + - **degree** (int): Number of triples in the aggregation. + - **unique_rel** (int): Number of distinct relation types + in the set of aggregated edges. + - **rel_list** (Optional[list]): List of unique relation types + in the set of aggregated edges. + Only returned if `return_relation_list = True`. + """ + rel_list = {"rel_list": ("r", "unique")} if return_relation_list else {} + deg_df = pd.DataFrame( + df.groupby(column).agg( + degree=("r", "count"), unique_rel=("r", "nunique"), **rel_list # type: ignore + ), + index=np.arange(n_entity), + ) + deg_df[["degree", "unique_rel"]] = ( + deg_df[["degree", "unique_rel"]].fillna(0).astype(int) + ) + return deg_df + + +def aggregate_by_relation(edge_topology_df: pd.DataFrame) -> pd.DataFrame: + """ + Aggregate topology metrics of all triples of the same relation type. 
+ To be applied to a DataFrame of metrics having at least columns + `h`, `r`, `t` (e.g., the output of + :meth:`KGTopologyToolbox.edge_degree_cardinality_summary` or + :meth:`KGTopologyToolbox.edge_pattern_summary`). + + The returned dataframe is indexed over relation type IDs, with columns + giving the aggregated statistics of triples of the corresponding relation. + The name of the columns is of the form ``column_name_in_input_df + suffix``. + The aggregation is performed by returning: + + - for numerical metrics: mean, standard deviation and quartiles + (``suffix`` = "_mean", "_std", "_quartile1", "_quartile2", "_quartile3"); + - for boolean metrics: the fraction of triples of the relation type + with metric = True (``suffix`` = "_frac"); + - for string metrics: for each possible label, the fraction of triples + of the relation type with that metric value (``suffix`` = "_{label}_frac") + - for list metrics: the unique metric values across triples of the relation + type (``suffix`` = "_unique"). + + :param edge_topology_df: + pd.DataFrame of edge topology metrics. + Must contain at least three columns `h`, `r`, `t`. + + :return: + The results dataframe. In addition to the columns with the aggregated + metrics by relation type, it also contains columns: + + - **num_triples** (int): Number of triples for each relation type. + - **frac_triples** (float): Fraction of overall triples represented by each + relation type. + - **unique_h** (int): Number of unique head entities used by triples of each + relation type. + - **unique_t** (int): Number of unique tail entities used by triples of each + relation type. 
+ """ + df_by_r = edge_topology_df.groupby("r") + df_res = df_by_r.agg(num_triples=("r", "count")) + df_res["frac_triples"] = df_res["num_triples"] / edge_topology_df.shape[0] + col: str + for col, col_dtype in edge_topology_df.drop(columns=["r"]).dtypes.items(): # type: ignore + if col in ["h", "t"]: + df_res[f"unique_{col}"] = df_by_r[col].nunique() + elif col_dtype == object: + if isinstance(edge_topology_df[col].iloc[0], str): + for label in np.unique(edge_topology_df[col]): + df_res[f"{col}_{label}_frac"] = ( + edge_topology_df[edge_topology_df[col] == label] + .groupby("r")[col] + .count() + / df_res["num_triples"] + ).fillna(0) + elif isinstance(edge_topology_df[col].iloc[0], Iterable): + df_res[f"{col}_unique"] = ( + df_by_r[col] + .agg(np.unique) + .apply( + lambda x: ( + np.unique( + np.concatenate( + [lst for lst in x if len(lst) > 0] or [[]] + ) + ).tolist() + ) + ) + ) + else: + print(f"Skipping column {col}: no known aggregation mode") + continue + elif col_dtype == int or col_dtype == float: + df_res[f"{col}_mean"] = df_by_r[col].mean() + df_res[f"{col}_std"] = df_by_r[col].std() + for q in range(1, 4): + df_res[f"{col}_quartile{q}"] = df_by_r[col].agg( + lambda x: np.quantile(x, 0.25 * q) + ) + elif col_dtype == bool: + df_res[f"{col}_frac"] = df_by_r[col].mean() + return df_res + + def jaccard_similarity( entities_1: NDArray[np.int32], entities_2: NDArray[np.int32] ) -> float: """ Jaccard Similarity function for two sets of entities. - :param entities_1: the array of IDs for the first set of entities. - :param entities_2: the array of IDs for the second set of entities. + :param entities_1: + Array of IDs for the first set of entities. + :param entities_2: + Array of IDs for the second set of entities. - :return: Jaccard Similarity score for two sets of entities. + :return: + Jaccard Similarity score for two sets of entities. 
""" intersection = len(np.intersect1d(entities_1, entities_2)) union = len(entities_1) + len(entities_2) - intersection @@ -48,22 +207,26 @@ def composition_count( ) -> pd.DataFrame: """A helper function to compute the composition count of a graph. - :param df: A graph represented as a pd.DataFrame. Must contain the columns + :param df: + A graph represented as a pd.DataFrame. Must contain the columns `h` and `t`. No self-loops should be present in the graph. - :param chunk_size: Size of chunks of columns of the adjacency matrix to be + :param chunk_size: + Size of chunks of columns of the adjacency matrix to be processed together. - :param workers: Number of workers processing chunks concurrently - :param directed: Boolean flag. If false, bidirectional edges are considered for - triangles by adding the adjacency matrix and its transposed. Defaults to True. - - :return: The results dataframe. Contains the following columns: + :param workers: + Number of workers processing chunks concurrently + :param directed: + Boolean flag. If false, bidirectional edges are considered for + triangles by adding the adjacency matrix and its transposed. Default: True. + :return: + The results dataframe. Contains the following columns: - **h** (int): Index of the head entity. - **t** (int): Index of the tail entity. - **n_triangles** (int): Number of compositions for the (h, t) edge. 
""" - n_nodes = max(df[["h", "t"]].max()) + 1 + n_nodes = df[["h", "t"]].max().max() + 1 adj = coo_array( (np.ones(len(df)), (df.h, df.t)), shape=[n_nodes, n_nodes], diff --git a/tests/test_edge_topology_toolbox.py b/tests/test_edge_topology_toolbox.py index b1846b5..eaba81a 100644 --- a/tests/test_edge_topology_toolbox.py +++ b/tests/test_edge_topology_toolbox.py @@ -8,14 +8,16 @@ df = pd.DataFrame( dict( - h=[0, 0, 0, 1, 2, 2, 1, 2], - t=[1, 1, 2, 2, 0, 0, 1, 2], - r=[0, 1, 0, 1, 0, 1, 1, 0], + H=[0, 0, 0, 1, 2, 2, 1, 2], + T=[1, 1, 2, 2, 0, 0, 1, 2], + R=[0, 1, 0, 1, 0, 1, 1, 0], n=["a", "b", "c", "d", "e", "f", "g", "h"], ) ) -tools = KGTopologyToolbox() +kgtt = KGTopologyToolbox( + kg_df=df, head_column="H", relation_column="R", tail_column="T" +) @pytest.mark.parametrize("return_metapath_list", [True, False]) @@ -24,7 +26,7 @@ def test_small_graph_metrics(return_metapath_list: bool) -> None: # the edge_topology_toolbox # entity degrees statistics - res = tools.edge_degree_cardinality_summary(df) + res = kgtt.edge_degree_cardinality_summary() assert np.allclose(res["h_unique_rel"], [2, 2, 2, 1, 2, 2, 1, 2]) assert np.allclose(res["h_degree"], [3, 3, 3, 2, 3, 3, 2, 3]) assert np.allclose(res["h_degree_same_rel"], [2, 1, 2, 2, 2, 1, 2, 2]) @@ -57,7 +59,7 @@ def test_small_graph_metrics(return_metapath_list: bool) -> None: ] # relation pattern symmetry - res = tools.edge_pattern_summary(df, return_metapath_list=return_metapath_list) + res = kgtt.edge_pattern_summary(return_metapath_list=return_metapath_list) assert np.allclose( res["is_loop"], [False, False, False, False, False, False, True, True] ) diff --git a/tests/test_node_topology_toolbox.py b/tests/test_node_topology_toolbox.py index d78ce2d..18d87ed 100644 --- a/tests/test_node_topology_toolbox.py +++ b/tests/test_node_topology_toolbox.py @@ -8,14 +8,14 @@ df = pd.DataFrame( dict( - h=[0, 0, 0, 1, 2, 2, 2], - t=[1, 1, 2, 2, 0, 0, 2], - r=[0, 1, 0, 1, 0, 1, 1], + H=[0, 0, 0, 1, 2, 2, 2], + T=[1, 1, 2, 2, 
0, 0, 2], + R=[0, 1, 0, 1, 0, 1, 1], n=["a", "b", "c", "d", "e", "f", "g"], ) ) -tools = KGTopologyToolbox() +kgtt = KGTopologyToolbox(df, head_column="H", relation_column="R", tail_column="T") @pytest.mark.parametrize("return_relation_list", [True, False]) @@ -24,7 +24,7 @@ def test_small_graph_metrics(return_relation_list: bool) -> None: # the node_topology_toolbox # entity degrees statistics - res = tools.node_degree_summary(df, return_relation_list=return_relation_list) + res = kgtt.node_degree_summary(return_relation_list=return_relation_list) assert np.allclose(res["h_degree"], [3, 1, 3]) assert np.allclose(res["t_degree"], [2, 2, 3]) assert np.allclose(res["tot_degree"], [5, 3, 5]) diff --git a/tests/test_relation_topology_toolbox.py b/tests/test_relation_topology_toolbox.py index a41c60d..3f0c05c 100644 --- a/tests/test_relation_topology_toolbox.py +++ b/tests/test_relation_topology_toolbox.py @@ -10,24 +10,22 @@ df = pd.DataFrame( dict( - h=[0, 0, 0, 1, 2, 2, 2, 3, 3, 4], - t=[1, 1, 2, 2, 0, 3, 4, 2, 4, 3], - r=[0, 1, 0, 1, 0, 1, 1, 0, 0, 1], + H=[0, 0, 0, 1, 2, 2, 2, 3, 3, 4], + T=[1, 1, 2, 2, 0, 3, 4, 2, 4, 3], + R=[0, 1, 0, 1, 0, 1, 1, 0, 0, 1], n=["a", "b", "c", "d", "e", "f", "g", "h", "i", "j"], ) ) -tools = KGTopologyToolbox() +kgtt = KGTopologyToolbox(df, head_column="H", relation_column="R", tail_column="T") def test_small_graph_metrics() -> None: # Define a small graph on five nodes with all the features tested by # the relation_topology_toolbox - dcs = tools.aggregate_by_relation(tools.edge_degree_cardinality_summary(df)) - eps = tools.aggregate_by_relation( - tools.edge_pattern_summary(df, return_metapath_list=True) - ) + dcs = kgtt.edge_degree_cardinality_summary(aggregate_by_r=True) + eps = kgtt.edge_pattern_summary(return_metapath_list=True, aggregate_by_r=True) assert np.allclose(dcs["num_triples"], [5, 5]) assert np.allclose(dcs["frac_triples"], [0.5, 0.5]) @@ -73,7 +71,7 @@ def test_small_graph_metrics() -> None: def 
test_jaccard_similarity() -> None: # jaccard_similarity_relation_sets - res = tools.jaccard_similarity_relation_sets(df) + res = kgtt.jaccard_similarity_relation_sets() assert np.allclose(res["jaccard_head_head"], [2 / 5]) assert np.allclose(res["jaccard_tail_tail"], [3 / 5]) assert np.allclose(res["jaccard_head_tail"], [2 / 5]) @@ -86,5 +84,5 @@ def test_jaccard_similarity() -> None: ) def test_ingram_affinity(min_max_norm: bool, expected: List[float]) -> None: # relational_affinity_ingram - res = tools.relational_affinity_ingram(df, min_max_norm) + res = kgtt.relational_affinity_ingram(min_max_norm) assert np.allclose(res["edge_weight"], expected)