diff --git a/docs/source/notebooks/ogb_biokg_demo.ipynb b/docs/source/notebooks/ogb_biokg_demo.ipynb
index aa6fac9..bb73448 100644
--- a/docs/source/notebooks/ogb_biokg_demo.ipynb
+++ b/docs/source/notebooks/ogb_biokg_demo.ipynb
@@ -15,7 +15,7 @@
},
{
"cell_type": "code",
- "execution_count": 3,
+ "execution_count": 1,
"metadata": {},
"outputs": [
{
@@ -31,13 +31,13 @@
"source": [
"import sys\n",
"!{sys.executable} -m pip uninstall -y kg_topology_toolbox\n",
- "!pip install -q git+https://github.com/graphcore-research/kg-topology-toolbox.git\n",
+ "!pip install -q git+https://github.com/graphcore-research/kg-topology-toolbox.git --no-cache-dir\n",
"!pip install -q jupyter ipywidgets ogb seaborn"
]
},
{
"cell_type": "code",
- "execution_count": 4,
+ "execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
@@ -63,7 +63,7 @@
},
{
"cell_type": "code",
- "execution_count": 5,
+ "execution_count": 3,
"metadata": {},
"outputs": [
{
@@ -181,7 +181,7 @@
"[5088434 rows x 3 columns]"
]
},
- "execution_count": 5,
+ "execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
@@ -194,7 +194,9 @@
"all_triples = []\n",
"for split in dataset.get_edge_split().values():\n",
" all_triples.append(np.stack([split[\"head\"], split[\"relation\"], split[\"tail\"]]).T)\n",
- "biokg_df = pd.DataFrame(np.concatenate(all_triples), columns=[\"h\", \"r\", \"t\"])\n",
+ "biokg_df = pd.DataFrame(\n",
+ " np.concatenate(all_triples).astype(np.int32), columns=[\"h\", \"r\", \"t\"]\n",
+ ")\n",
"biokg_df"
]
},
@@ -202,16 +204,32 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "Based on this representation of the knowledge graph, we can proceed to compute its topological properties using the `KGTopologyToolbox` class."
+ "Based on this representation of the knowledge graph, we can proceed to instantiate the `KGTopologyToolbox` class to compute topological properties."
]
},
{
"cell_type": "code",
- "execution_count": 6,
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/nethome/albertoc/research/knowledge_graphs/kg-topology-toolbox/.venv/lib/python3.10/site-packages/kg_topology_toolbox/topology_toolbox.py:64: UserWarning: The Knowledge Graph contains duplicated edges -- some functionalities may produce incorrect results\n",
+ " warnings.warn(\n"
+ ]
+ }
+ ],
+ "source": [
+ "kgtt = KGTopologyToolbox(biokg_df)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
"metadata": {},
- "outputs": [],
"source": [
- "kgtt = KGTopologyToolbox()"
+    "Notice the warning raised by the constructor, which detects duplicated edges in the `biokg_df` DataFrame: to ensure that all functionalities work correctly, duplicated edges should be removed before instantiating the `KGTopologyToolbox` class."
]
},
{
@@ -231,7 +249,7 @@
},
{
"cell_type": "code",
- "execution_count": 7,
+ "execution_count": 5,
"metadata": {},
"outputs": [
{
@@ -385,19 +403,19 @@
"[45085 rows x 6 columns]"
]
},
- "execution_count": 7,
+ "execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "node_ds = kgtt.node_degree_summary(biokg_df)\n",
+ "node_ds = kgtt.node_degree_summary()\n",
"node_ds"
]
},
{
"cell_type": "code",
- "execution_count": 8,
+ "execution_count": 6,
"metadata": {},
"outputs": [
{
@@ -437,7 +455,7 @@
},
{
"cell_type": "code",
- "execution_count": 9,
+ "execution_count": 7,
"metadata": {},
"outputs": [
{
@@ -491,7 +509,7 @@
},
{
"cell_type": "code",
- "execution_count": 10,
+ "execution_count": 8,
"metadata": {},
"outputs": [
{
@@ -518,11 +536,11 @@
"
h | \n",
" r | \n",
" t | \n",
- " h_unique_rel | \n",
" h_degree | \n",
+ " h_unique_rel | \n",
" h_degree_same_rel | \n",
- " t_unique_rel | \n",
" t_degree | \n",
+ " t_unique_rel | \n",
" t_degree_same_rel | \n",
" tot_degree | \n",
" tot_degree_same_rel | \n",
@@ -536,11 +554,11 @@
" 1718 | \n",
" 0 | \n",
" 3207 | \n",
- " 5 | \n",
" 191 | \n",
+ " 5 | \n",
" 116 | \n",
- " 6 | \n",
" 46 | \n",
+ " 6 | \n",
" 14 | \n",
" 236 | \n",
" 129 | \n",
@@ -552,11 +570,11 @@
" 4903 | \n",
" 0 | \n",
" 13662 | \n",
- " 8 | \n",
" 544 | \n",
+ " 8 | \n",
" 33 | \n",
- " 9 | \n",
" 1975 | \n",
+ " 9 | \n",
" 50 | \n",
" 2518 | \n",
" 82 | \n",
@@ -568,11 +586,11 @@
" 5480 | \n",
" 0 | \n",
" 15999 | \n",
- " 3 | \n",
" 108 | \n",
+ " 3 | \n",
" 5 | \n",
- " 4 | \n",
" 72 | \n",
+ " 4 | \n",
" 22 | \n",
" 179 | \n",
" 26 | \n",
@@ -584,11 +602,11 @@
" 3148 | \n",
" 0 | \n",
" 7247 | \n",
- " 4 | \n",
" 110 | \n",
+ " 4 | \n",
" 99 | \n",
- " 11 | \n",
" 673 | \n",
+ " 11 | \n",
" 271 | \n",
" 782 | \n",
" 369 | \n",
@@ -600,11 +618,11 @@
" 10300 | \n",
" 0 | \n",
" 16202 | \n",
- " 4 | \n",
" 414 | \n",
+ " 4 | \n",
" 315 | \n",
- " 6 | \n",
" 148 | \n",
+ " 6 | \n",
" 31 | \n",
" 561 | \n",
" 345 | \n",
@@ -632,11 +650,11 @@
" 2451 | \n",
" 50 | \n",
" 5097 | \n",
- " 5 | \n",
" 636 | \n",
+ " 5 | \n",
" 272 | \n",
- " 10 | \n",
" 803 | \n",
+ " 10 | \n",
" 272 | \n",
" 1437 | \n",
" 543 | \n",
@@ -648,11 +666,11 @@
" 6456 | \n",
" 50 | \n",
" 8833 | \n",
- " 10 | \n",
" 743 | \n",
- " 259 | \n",
" 10 | \n",
+ " 259 | \n",
" 371 | \n",
+ " 10 | \n",
" 100 | \n",
" 1111 | \n",
" 358 | \n",
@@ -664,11 +682,11 @@
" 9484 | \n",
" 50 | \n",
" 15873 | \n",
- " 8 | \n",
" 652 | \n",
+ " 8 | \n",
" 213 | \n",
- " 6 | \n",
" 486 | \n",
+ " 6 | \n",
" 163 | \n",
" 1135 | \n",
" 375 | \n",
@@ -680,11 +698,11 @@
" 6365 | \n",
" 50 | \n",
" 496 | \n",
- " 9 | \n",
" 922 | \n",
+ " 9 | \n",
" 277 | \n",
- " 19 | \n",
" 618 | \n",
+ " 19 | \n",
" 173 | \n",
" 1537 | \n",
" 449 | \n",
@@ -696,11 +714,11 @@
" 13860 | \n",
" 50 | \n",
" 6368 | \n",
- " 7 | \n",
" 485 | \n",
+ " 7 | \n",
" 175 | \n",
- " 8 | \n",
" 455 | \n",
+ " 8 | \n",
" 147 | \n",
" 939 | \n",
" 321 | \n",
@@ -713,31 +731,31 @@
""
],
"text/plain": [
- " h r t h_unique_rel h_degree h_degree_same_rel \\\n",
- "0 1718 0 3207 5 191 116 \n",
- "1 4903 0 13662 8 544 33 \n",
- "2 5480 0 15999 3 108 5 \n",
- "3 3148 0 7247 4 110 99 \n",
- "4 10300 0 16202 4 414 315 \n",
- "... ... .. ... ... ... ... \n",
- "5088429 2451 50 5097 5 636 272 \n",
- "5088430 6456 50 8833 10 743 259 \n",
- "5088431 9484 50 15873 8 652 213 \n",
- "5088432 6365 50 496 9 922 277 \n",
- "5088433 13860 50 6368 7 485 175 \n",
+ " h r t h_degree h_unique_rel h_degree_same_rel \\\n",
+ "0 1718 0 3207 191 5 116 \n",
+ "1 4903 0 13662 544 8 33 \n",
+ "2 5480 0 15999 108 3 5 \n",
+ "3 3148 0 7247 110 4 99 \n",
+ "4 10300 0 16202 414 4 315 \n",
+ "... ... .. ... ... ... ... \n",
+ "5088429 2451 50 5097 636 5 272 \n",
+ "5088430 6456 50 8833 743 10 259 \n",
+ "5088431 9484 50 15873 652 8 213 \n",
+ "5088432 6365 50 496 922 9 277 \n",
+ "5088433 13860 50 6368 485 7 175 \n",
"\n",
- " t_unique_rel t_degree t_degree_same_rel tot_degree \\\n",
- "0 6 46 14 236 \n",
- "1 9 1975 50 2518 \n",
- "2 4 72 22 179 \n",
- "3 11 673 271 782 \n",
- "4 6 148 31 561 \n",
- "... ... ... ... ... \n",
- "5088429 10 803 272 1437 \n",
- "5088430 10 371 100 1111 \n",
- "5088431 6 486 163 1135 \n",
- "5088432 19 618 173 1537 \n",
- "5088433 8 455 147 939 \n",
+ " t_degree t_unique_rel t_degree_same_rel tot_degree \\\n",
+ "0 46 6 14 236 \n",
+ "1 1975 9 50 2518 \n",
+ "2 72 4 22 179 \n",
+ "3 673 11 271 782 \n",
+ "4 148 6 31 561 \n",
+ "... ... ... ... ... \n",
+ "5088429 803 10 272 1437 \n",
+ "5088430 371 10 100 1111 \n",
+ "5088431 486 6 163 1135 \n",
+ "5088432 618 19 173 1537 \n",
+ "5088433 455 8 147 939 \n",
"\n",
" tot_degree_same_rel triple_cardinality triple_cardinality_same_rel \n",
"0 129 M:M M:M \n",
@@ -755,13 +773,13 @@
"[5088434 rows x 13 columns]"
]
},
- "execution_count": 10,
+ "execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "edge_dcs = kgtt.edge_degree_cardinality_summary(biokg_df)\n",
+ "edge_dcs = kgtt.edge_degree_cardinality_summary()\n",
"edge_dcs"
]
},
@@ -774,7 +792,7 @@
},
{
"cell_type": "code",
- "execution_count": 11,
+ "execution_count": 9,
"metadata": {},
"outputs": [
{
@@ -837,7 +855,7 @@
},
{
"cell_type": "code",
- "execution_count": 12,
+ "execution_count": 10,
"metadata": {},
"outputs": [
{
@@ -872,7 +890,7 @@
"source": [
"### Edge topological patterns\n",
"\n",
- "The second method provided by `KGTopologyToolbox` for topological analysis at the edge level is `edge_pattern_summary`, which extracts information on several significant edge topological patterns. In particular, it detects whether the edge (h,r,t) is a loop, is symmetric or has inverse, inference, composition (directed and undirected):\n",
+ "`KGTopologyToolbox` also allows us to perform a topological analysis at the edge level, using the method `edge_pattern_summary`, which extracts information on several significant edge topological patterns. In particular, it detects whether the edge (h,r,t) is a loop, is symmetric or has inverse, inference, composition (directed and undirected):\n",
"\n",
"![image info](../images/edge_patterns.png)\n",
"\n",
@@ -881,7 +899,7 @@
},
{
"cell_type": "code",
- "execution_count": 13,
+ "execution_count": 11,
"metadata": {},
"outputs": [
{
@@ -1182,19 +1200,19 @@
"[5088434 rows x 15 columns]"
]
},
- "execution_count": 13,
+ "execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "edge_eps = kgtt.edge_pattern_summary(biokg_df)\n",
+ "edge_eps = kgtt.edge_pattern_summary()\n",
"edge_eps"
]
},
{
"cell_type": "code",
- "execution_count": 14,
+ "execution_count": 12,
"metadata": {},
"outputs": [
{
@@ -1215,7 +1233,7 @@
"dtype: float64"
]
},
- "execution_count": 14,
+ "execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
@@ -1229,7 +1247,7 @@
},
{
"cell_type": "code",
- "execution_count": 15,
+ "execution_count": 13,
"metadata": {},
"outputs": [
{
@@ -1265,12 +1283,12 @@
"source": [
"## Relation-level analysis\n",
"\n",
- "The method `aggregate_by_relation` allows the user to aggregate at the relation-level the statistics outputted by the edge-level methods `edge_degree_cardinality_summary` and `edge_pattern_summary`. This converts DataFrames indexed on the KG edges to DataFrames indexed on the IDs of the unique relation types."
+ "All edge topological properties seen in the previous section can be aggregated over triples of the same relation type, to produce relation-level statistics. To do so, we can either set the option `aggregate_by_r = True` when calling the methods `edge_degree_cardinality_summary`, `edge_pattern_summary`, or - if edge topological metrics have already been precomputed - use the utility function `aggregate_by_relation`, which converts DataFrames indexed on the KG edges to DataFrames indexed on the IDs of the unique relation types."
]
},
{
"cell_type": "code",
- "execution_count": 16,
+ "execution_count": 15,
"metadata": {},
"outputs": [
{
@@ -1298,12 +1316,12 @@
" frac_triples | \n",
" unique_h | \n",
" unique_t | \n",
- " h_unique_rel_mean | \n",
- " h_unique_rel_std | \n",
- " h_unique_rel_quartile1 | \n",
- " h_unique_rel_quartile2 | \n",
- " h_unique_rel_quartile3 | \n",
" h_degree_mean | \n",
+ " h_degree_std | \n",
+ " h_degree_quartile1 | \n",
+ " h_degree_quartile2 | \n",
+ " h_degree_quartile3 | \n",
+ " h_unique_rel_mean | \n",
" ... | \n",
" tot_degree_same_rel_quartile1 | \n",
" tot_degree_same_rel_quartile2 | \n",
@@ -1348,12 +1366,12 @@
" 0.015931 | \n",
" 9742 | \n",
" 9337 | \n",
- " 8.110293 | \n",
- " 8.247277 | \n",
- " 4.0 | \n",
- " 5.0 | \n",
- " 8.0 | \n",
" 569.252202 | \n",
+ " 1083.315332 | \n",
+ " 111.0 | \n",
+ " 222.0 | \n",
+ " 521.0 | \n",
+ " 8.110293 | \n",
" ... | \n",
" 45.0 | \n",
" 112.0 | \n",
@@ -1372,12 +1390,12 @@
" 0.001114 | \n",
" 698 | \n",
" 1536 | \n",
- " 27.048157 | \n",
- " 12.936410 | \n",
- " 17.0 | \n",
- " 31.0 | \n",
- " 36.0 | \n",
" 2518.765391 | \n",
+ " 2186.452620 | \n",
+ " 435.0 | \n",
+ " 2087.0 | \n",
+ " 4028.0 | \n",
+ " 27.048157 | \n",
" ... | \n",
" 14.0 | \n",
" 32.0 | \n",
@@ -1396,12 +1414,12 @@
" 0.013158 | \n",
" 612 | \n",
" 612 | \n",
- " 36.404307 | \n",
- " 5.600706 | \n",
- " 33.0 | \n",
- " 36.0 | \n",
- " 41.0 | \n",
" 4129.511919 | \n",
+ " 1935.630599 | \n",
+ " 2548.0 | \n",
+ " 3968.0 | \n",
+ " 5649.0 | \n",
+ " 36.404307 | \n",
" ... | \n",
" 332.0 | \n",
" 404.0 | \n",
@@ -1420,12 +1438,12 @@
" 0.003849 | \n",
" 491 | \n",
" 491 | \n",
- " 37.095941 | \n",
- " 5.547389 | \n",
- " 33.0 | \n",
- " 37.0 | \n",
- " 41.0 | \n",
" 4527.399592 | \n",
+ " 1943.714179 | \n",
+ " 2925.0 | \n",
+ " 4507.0 | \n",
+ " 6161.0 | \n",
+ " 37.095941 | \n",
" ... | \n",
" 114.0 | \n",
" 157.0 | \n",
@@ -1444,12 +1462,12 @@
" 0.006295 | \n",
" 526 | \n",
" 525 | \n",
- " 37.319567 | \n",
- " 5.384523 | \n",
- " 34.0 | \n",
- " 38.0 | \n",
- " 41.0 | \n",
" 4511.067834 | \n",
+ " 1905.395180 | \n",
+ " 2931.0 | \n",
+ " 4507.0 | \n",
+ " 6148.0 | \n",
+ " 37.319567 | \n",
" ... | \n",
" 188.0 | \n",
" 243.0 | \n",
@@ -1468,29 +1486,29 @@
""
],
"text/plain": [
- " num_triples frac_triples unique_h unique_t h_unique_rel_mean \\\n",
- "r \n",
- "0 81066 0.015931 9742 9337 8.110293 \n",
- "1 5669 0.001114 698 1536 27.048157 \n",
- "2 66954 0.013158 612 612 36.404307 \n",
- "3 19585 0.003849 491 491 37.095941 \n",
- "4 32034 0.006295 526 525 37.319567 \n",
+ " num_triples frac_triples unique_h unique_t h_degree_mean h_degree_std \\\n",
+ "r \n",
+ "0 81066 0.015931 9742 9337 569.252202 1083.315332 \n",
+ "1 5669 0.001114 698 1536 2518.765391 2186.452620 \n",
+ "2 66954 0.013158 612 612 4129.511919 1935.630599 \n",
+ "3 19585 0.003849 491 491 4527.399592 1943.714179 \n",
+ "4 32034 0.006295 526 525 4511.067834 1905.395180 \n",
"\n",
- " h_unique_rel_std h_unique_rel_quartile1 h_unique_rel_quartile2 \\\n",
- "r \n",
- "0 8.247277 4.0 5.0 \n",
- "1 12.936410 17.0 31.0 \n",
- "2 5.600706 33.0 36.0 \n",
- "3 5.547389 33.0 37.0 \n",
- "4 5.384523 34.0 38.0 \n",
+ " h_degree_quartile1 h_degree_quartile2 h_degree_quartile3 \\\n",
+ "r \n",
+ "0 111.0 222.0 521.0 \n",
+ "1 435.0 2087.0 4028.0 \n",
+ "2 2548.0 3968.0 5649.0 \n",
+ "3 2925.0 4507.0 6161.0 \n",
+ "4 2931.0 4507.0 6148.0 \n",
"\n",
- " h_unique_rel_quartile3 h_degree_mean ... tot_degree_same_rel_quartile1 \\\n",
- "r ... \n",
- "0 8.0 569.252202 ... 45.0 \n",
- "1 36.0 2518.765391 ... 14.0 \n",
- "2 41.0 4129.511919 ... 332.0 \n",
- "3 41.0 4527.399592 ... 114.0 \n",
- "4 41.0 4511.067834 ... 188.0 \n",
+ " h_unique_rel_mean ... tot_degree_same_rel_quartile1 \\\n",
+ "r ... \n",
+ "0 8.110293 ... 45.0 \n",
+ "1 27.048157 ... 14.0 \n",
+ "2 36.404307 ... 332.0 \n",
+ "3 37.095941 ... 114.0 \n",
+ "4 37.319567 ... 188.0 \n",
"\n",
" tot_degree_same_rel_quartile2 tot_degree_same_rel_quartile3 \\\n",
"r \n",
@@ -1535,27 +1553,29 @@
"[5 rows x 51 columns]"
]
},
- "execution_count": 16,
+ "execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "kgtt.aggregate_by_relation(edge_dcs).head()"
+ "from kg_topology_toolbox.utils import aggregate_by_relation\n",
+ "\n",
+ "aggregate_by_relation(edge_dcs).head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "Notice on the left the columns `num_triples`, `frac_triples`, `unique_h`, `unique_t` giving additional statistics for relation types (number of edges and relative frequency, number of unique entities used as heads/tails by triples of the relation type).\n",
+    "Notice the extra columns `num_triples`, `frac_triples`, `unique_h`, `unique_t` giving additional statistics for relation types (number of edges and relative frequency, number of unique entities used as heads/tails by triples of the relation type).\n",
"\n",
"Similarly, by aggregating the `edge_eps` DataFrame we can look at the distribution of edge topological patterns within each relation type."
]
},
{
"cell_type": "code",
- "execution_count": 17,
+ "execution_count": 16,
"metadata": {},
"outputs": [
{
@@ -1812,25 +1832,25 @@
"[5 rows x 32 columns]"
]
},
- "execution_count": 17,
+ "execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "kgtt.aggregate_by_relation(edge_eps).head()"
+ "aggregate_by_relation(edge_eps).head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "Additional methods are provided for the analysis at the relation level: `jaccard_similarity_relation_sets` to compute the Jaccard similarity of the sets of head/tail entities used by each relation; `relational_affinity_ingram` to compute the InGram pairwise relation similarity (see [paper](https://arxiv.org/abs/2305.19987)). "
+ "Additional methods are provided in the `KGTopologyToolbox` class for analysis at the relation level: `jaccard_similarity_relation_sets` to compute the Jaccard similarity of the sets of head/tail entities used by each relation; `relational_affinity_ingram` to compute the InGram pairwise relation similarity (see [paper](https://arxiv.org/abs/2305.19987)). "
]
},
{
"cell_type": "code",
- "execution_count": 18,
+ "execution_count": 17,
"metadata": {},
"outputs": [
{
@@ -2106,18 +2126,18 @@
"[1275 rows x 14 columns]"
]
},
- "execution_count": 18,
+ "execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "kgtt.jaccard_similarity_relation_sets(biokg_df)"
+ "kgtt.jaccard_similarity_relation_sets()"
]
},
{
"cell_type": "code",
- "execution_count": 19,
+ "execution_count": 18,
"metadata": {},
"outputs": [
{
@@ -2235,13 +2255,13 @@
"[2550 rows x 3 columns]"
]
},
- "execution_count": 19,
+ "execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "kgtt.relational_affinity_ingram(biokg_df)"
+ "kgtt.relational_affinity_ingram()"
]
}
],
diff --git a/pyproject.toml b/pyproject.toml
index 6628c79..edc0115 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
[project]
name = "kg-topology-toolbox"
-version = "0.1.0"
+version = "1.0.0"
authors = [
{name = "Alberto Cattaneo"},
{name = "Daniel Justus"},
diff --git a/src/kg_topology_toolbox/topology_toolbox.py b/src/kg_topology_toolbox/topology_toolbox.py
index 4fffe64..8d29e96 100644
--- a/src/kg_topology_toolbox/topology_toolbox.py
+++ b/src/kg_topology_toolbox/topology_toolbox.py
@@ -5,37 +5,143 @@
Topology toolbox main functionalities
"""
-from collections.abc import Iterable
+from functools import cache
import numpy as np
import pandas as pd
from scipy.sparse import coo_array
-from kg_topology_toolbox.utils import composition_count, jaccard_similarity
+from kg_topology_toolbox.utils import (
+ aggregate_by_relation,
+ check_kg_df_structure,
+ composition_count,
+ jaccard_similarity,
+ node_degrees_and_rels,
+)
class KGTopologyToolbox:
"""
- Toolbox class to compute various Knowledge Graph topology statistics.
+ Toolbox class to compute Knowledge Graph topology statistics.
"""
- def node_degree_summary(
- self, df: pd.DataFrame, return_relation_list: bool = False
- ) -> pd.DataFrame:
+ def __init__(
+ self,
+ kg_df: pd.DataFrame,
+ head_column: str = "h",
+ relation_column: str = "r",
+ tail_column: str = "t",
+ ):
+ """
+ Instantiate the Topology Toolbox for a Knowledge Graph defined
+ by the list of its edges (h,r,t).
+
+ :param kg_df:
+ A Knowledge Graph represented as a pd.DataFrame.
+ Must contain at least three columns, which specify the IDs of
+ head entity, relation type and tail entity for each edge.
+ :param head_column:
+ The name of the column with the IDs of head entities. Default: "h".
+    :param relation_column:
+        The name of the column with the IDs of relation types. Default: "r".
+    :param tail_column:
+        The name of the column with the IDs of tail entities. Default: "t".
+
+ """
+ check_kg_df_structure(kg_df, head_column, relation_column, tail_column)
+
+ self.df = kg_df[[head_column, relation_column, tail_column]].rename(
+ columns={head_column: "h", relation_column: "r", tail_column: "t"}
+ )
+ self.n_entity = self.df[["h", "t"]].max().max() + 1
+ self.n_rel = self.df.r.max() + 1
+
+ def loop_count(self) -> pd.DataFrame:
+ """
+ For each entity in the KG, compute the number of loops around the entity
+ (i.e., the number of edges having the entity as both head and tail).
+
+ :return:
+ Loop count DataFrame, indexed on the IDs of the graph entities.
+ """
+ n_loops = (
+ self.df[self.df.h == self.df.t].groupby("h").agg(n_loops=("r", "count"))
+ )
+ return (
+ pd.DataFrame(n_loops, index=np.arange(self.n_entity)).fillna(0).astype(int)
+ )
+
+ @cache
+ def node_head_degree(self, return_relation_list: bool = False) -> pd.DataFrame:
+ """
+ For each entity in the KG, compute the number of edges having it as head
+ (head-degree, or out-degree of the head node).
+ The relation types going out of the head node are also identified.
+
+ :param return_relation_list:
+ If True, return the list of unique relations going
+ out of the head node. WARNING: expensive for large graphs.
+ Default: False.
+
+ :return:
+ The result DataFrame, indexed on the IDs `e` of the graph entities,
+ with columns:
+
+ - **h_degree** (int): Number of triples with head entity `e`.
+ - **h_unique_rel** (int): Number of distinct relation types
+ among edges with head entity `e`.
+ - **h_rel_list** (Optional[list]): List of unique relation types
+ among edges with head entity `e`.
+ Only returned if `return_relation_list = True`.
+ """
+ node_df = node_degrees_and_rels(
+ self.df, "h", self.n_entity, return_relation_list
+ )
+ return node_df.rename(columns={n: "h_" + n for n in node_df.columns})
+
+ @cache
+ def node_tail_degree(self, return_relation_list: bool = False) -> pd.DataFrame:
+ """
+ For each entity in the KG, compute the number of edges having it as tail
+ (tail-degree, or in-degree of the tail node).
+ The relation types going into the tail node are also identified.
+
+ :param return_relation_list:
+ If True, return the list of unique relation types going
+ into the tail node. WARNING: expensive for large graphs.
+ Default: False.
+
+ :return:
+ The result DataFrame, indexed on the IDs `e` of the graph entities,
+ with columns:
+
+ - **t_degree** (int): Number of triples with tail entity `e`.
+ - **t_unique_rel** (int): Number of distinct relation types
+ among edges with tail entity `e`.
+ - **t_rel_list** (Optional[list]): List of unique relation types
+ among edges with tail entity `e`.
+ Only returned if `return_relation_list = True`.
+ """
+ node_df = node_degrees_and_rels(
+ self.df, "t", self.n_entity, return_relation_list
+ )
+ return node_df.rename(columns={n: "t_" + n for n in node_df.columns})
+
+ def node_degree_summary(self, return_relation_list: bool = False) -> pd.DataFrame:
"""
- For each entity, this function computes the number of edges having it as a head
+ For each entity in the KG, compute the number of edges having it as a head
(head-degree, or out-degree), as a tail (tail-degree, or in-degree)
- or one of the two (total-degree) in the Knowledge Graph.
+ or one of the two (total-degree).
The in-going and out-going relation types are also identified.
The output dataframe is indexed on the IDs of the graph entities.
- :param df: A graph represented as a pd.DataFrame.
- Must contain at least three columns `h`, `r`, `t`.
- :param return_relation_list: If True, return the list of unique relations going
+ :param return_relation_list:
+ If True, return the list of unique relations going
in/out of an entity. WARNING: expensive for large graphs.
- :return: The results dataframe, indexed over the same entity ID `e` used in df,
+ :return:
+ The results dataframe, indexed on the IDs `e` of the graph entities,
with columns:
- **h_degree** (int): Number of triples with head entity `e`.
@@ -43,45 +149,33 @@ def node_degree_summary(
- **tot_degree** (int): Number of triples with head entity `e` or tail entity `e`.
- **h_unique_rel** (int): Number of distinct relation types
among edges with head entity `e`.
- - **h_rel_list** (list): List of unique relation types among edges
+ - **h_rel_list** (Optional[list]): List of unique relation types among edges
with head entity `e`.
+ Only returned if `return_relation_list = True`.
- **t_unique_rel** (int): Number of distinct relation types
among edges with tail entity `e`.
- - **t_rel_list** (list): List of unique relation types among edges
+ - **t_rel_list** (Optional[list]): List of unique relation types among edges
with tail entity `e`.
+ Only returned if `return_relation_list = True`.
- **n_loops** (int): number of loops around entity `e`.
"""
- n_entity = df[["h", "t"]].max().max() + 1
- h_rel_list = {"h_rel_list": ("r", "unique")} if return_relation_list else {}
- t_rel_list = {"t_rel_list": ("r", "unique")} if return_relation_list else {}
- nodes = pd.DataFrame(
- df.groupby("h").agg(
- h_degree=("r", "count"), h_unique_rel=("r", "nunique"), **h_rel_list # type: ignore
- ),
- index=np.arange(n_entity),
- )
- nodes = nodes.merge(
- df.groupby("t").agg(
- t_degree=("r", "count"), t_unique_rel=("r", "nunique"), **t_rel_list # type: ignore
- ),
+ nodes_df = pd.merge(
+ self.node_head_degree(return_relation_list),
+ self.node_tail_degree(return_relation_list),
left_index=True,
right_index=True,
- how="left",
)
- nodes = nodes.merge(
- df[df.h == df.t].groupby("h").agg(n_loops=("r", "count")),
+ nodes_df = pd.merge(
+ nodes_df,
+ self.loop_count(),
left_index=True,
right_index=True,
- how="left",
)
- nodes[["h_degree", "h_unique_rel", "t_degree", "t_unique_rel", "n_loops"]] = (
- nodes[["h_degree", "h_unique_rel", "t_degree", "t_unique_rel", "n_loops"]]
- .fillna(0)
- .astype(int)
+ nodes_df["tot_degree"] = (
+ nodes_df["h_degree"] + nodes_df["t_degree"] - nodes_df["n_loops"]
)
- nodes["tot_degree"] = nodes["h_degree"] + nodes["t_degree"] - nodes["n_loops"]
- return nodes[
+ return nodes_df[
["h_degree", "t_degree", "tot_degree", "h_unique_rel"]
+ (["h_rel_list"] if return_relation_list else [])
+ ["t_unique_rel"]
@@ -89,24 +183,115 @@ def node_degree_summary(
+ ["n_loops"]
]
- def edge_degree_cardinality_summary(self, df: pd.DataFrame) -> pd.DataFrame:
+ @cache
+ def edge_head_degree(self) -> pd.DataFrame:
"""
- For each triple, this function computes the number of edges with the same head
+ For each edge in the KG, compute the number of edges
+ (in total or of the same relation type) with the same head node.
+
+ :return:
+ The result DataFrame, with the same indexing and ordering of
+ triples as the original KG DataFrame, with columns
+ (in addition to `h`, `r`, `t`):
+
+ - **h_unique_rel** (int): Number of distinct relation types
+ among edges with head entity `h`.
+ - **h_degree** (int): Number of triples with head entity `h`.
+ - **h_degree_same_rel** (int): Number of triples with head entity `h`
+ and relation type `r`.
+ """
+ edge_by_hr_count = self.df.groupby(["h", "r"], as_index=False).agg(
+ h_degree_same_rel=("t", "count")
+ )
+ df_res = self.df.merge(
+ self.node_head_degree(), left_on=["h"], right_index=True, how="left"
+ )
+ return df_res.merge(edge_by_hr_count, on=["h", "r"], how="left")
+
+ @cache
+ def edge_tail_degree(self) -> pd.DataFrame:
+ """
+ For each edge in the KG, compute the number of edges
+ (in total or of the same relation type) with the same tail node.
+
+ :return:
+ The result DataFrame, with the same indexing and ordering of
+ triples as the original KG DataFrame, with columns
+ (in addition to `h`, `r`, `t`):
+
+ - **t_unique_rel** (int): Number of distinct relation types
+ among edges with tail entity `t`.
+ - **t_degree** (int): Number of triples with tail entity `t`.
+ - **t_degree_same_rel** (int): Number of triples with tail entity `t`
+ and relation type `r`.
+ """
+ edge_by_rt_count = self.df.groupby(["r", "t"], as_index=False).agg(
+ t_degree_same_rel=("h", "count")
+ )
+ df_res = self.df.merge(
+ self.node_tail_degree(), left_on=["t"], right_index=True, how="left"
+ )
+ return df_res.merge(edge_by_rt_count, on=["r", "t"], how="left")
+
+ def edge_cardinality(self) -> pd.DataFrame:
+ """
+ Classify the cardinality of each edge in the KG: one-to-one
+ (out-degree=in-degree=1), one-to-many (out-degree>1, in-degree=1),
+ many-to-one(out-degree=1, in-degree>1) or many-to-many
+ (in-degree>1, out-degree>1).
+
+ :return:
+ The result DataFrame, with the same indexing and ordering of
+ triples as the original KG DataFrame, with columns
+ (in addition to `h`, `r`, `t`):
+
+ - **triple_cardinality** (int): cardinality type of the edge.
+ - **triple_cardinality_same_rel** (int): cardinality type of the edge in
+ the subgraph of edges with relation type `r`.
+ """
+ head_degree = self.edge_head_degree()
+ tail_degree = self.edge_tail_degree()
+ df_res = pd.DataFrame(
+ {"h": head_degree.h, "r": head_degree.r, "t": head_degree.t}
+ )
+        # classify cardinality twice: over all edges ("" suffix) and within
+        # the subgraph of edges sharing the relation type ("_same_rel" suffix)
+ for suffix in ["", "_same_rel"]:
+ # check if the values in the pair (h_degree, t_degree) are =1 or >1
+ # to determine the edge cardinality
+ edge_type = 2 * (head_degree["h_degree" + suffix] == 1) + (
+ tail_degree["t_degree" + suffix] == 1
+ )
+ df_res["triple_cardinality" + suffix] = pd.cut(
+ edge_type,
+ bins=[0, 1, 2, 3, 4],
+ right=False,
+ labels=["M:M", "1:M", "M:1", "1:1"],
+ ).astype(str)
+ return df_res
+
+ def edge_degree_cardinality_summary(
+ self, aggregate_by_r: bool = False
+ ) -> pd.DataFrame:
+ """
+ For each edge in the KG, compute the number of edges with the same head
(head-degree, or out-degree), the same tail (tail-degree, or in-degree)
- or one of the two (total-degree) in the Knowledge Graph.
+ or one of the two (total-degree).
Based on entity degrees, each triple is classified as either one-to-one
(out-degree=in-degree=1), one-to-many (out-degree>1, in-degree=1),
many-to-one(out-degree=1, in-degree>1) or many-to-many
(in-degree>1, out-degree>1).
The output dataframe maintains the same indexing and ordering of triples
- as the input one.
+ as the original Knowledge Graph dataframe.
- :param df: A graph represented as a pd.DataFrame.
- Must contain at least three columns `h`, `r`, `t`.
+ :param aggregate_by_r:
+ If True, return metrics aggregated by relation type
+ (the output DataFrame will be indexed over relation IDs).
- :return: The results dataframe. Contains the following columns
- (in addition to `h`, `r`, `t` in ``df``):
+ :return:
+ The results dataframe. Contains the following columns
+ (in addition to `h`, `r`, `t`):
- **h_unique_rel** (int): Number of distinct relation types
among edges with head entity h.
@@ -126,33 +311,18 @@ def edge_degree_cardinality_summary(self, df: pd.DataFrame) -> pd.DataFrame:
- **triple_cardinality_same_rel** (int): cardinality type of the edge in
the subgraph of edges with relation type r.
"""
- gr_by_h_count = df.groupby("h", as_index=False).agg(
- h_unique_rel=("r", "nunique"), h_degree=("t", "count")
- )
- gr_by_hr_count = df.groupby(["h", "r"], as_index=False).agg(
- h_degree_same_rel=("t", "count")
- )
- gr_by_t_count = df.groupby("t", as_index=False).agg(
- t_unique_rel=("r", "nunique"), t_degree=("h", "count")
- )
- gr_by_rt_count = df.groupby(["r", "t"], as_index=False).agg(
- t_degree_same_rel=("h", "count")
- )
-
- df_res = df.merge(gr_by_h_count, left_on=["h"], right_on=["h"], how="left")
- df_res = df_res.merge(
- gr_by_hr_count, left_on=["h", "r"], right_on=["h", "r"], how="left"
- )
- df_res = df_res.merge(gr_by_t_count, left_on=["t"], right_on=["t"], how="left")
- df_res = df_res.merge(
- gr_by_rt_count, left_on=["t", "r"], right_on=["t", "r"], how="left"
+ df_res = pd.concat(
+ [
+ self.edge_head_degree(),
+ self.edge_tail_degree().drop(columns=["h", "r", "t"]),
+ ],
+ axis=1,
)
# compute number of parallel edges to avoid double-counting them
# in total degree
num_parallel = df_res.merge(
- df.groupby(["h", "t"], as_index=False).agg(n_parallel=("r", "count")),
- left_on=["h", "t"],
- right_on=["h", "t"],
+ self.df.groupby(["h", "t"], as_index=False).agg(n_parallel=("r", "count")),
+ on=["h", "t"],
how="left",
)
df_res["tot_degree"] = (
@@ -164,46 +334,43 @@ def edge_degree_cardinality_summary(self, df: pd.DataFrame) -> pd.DataFrame:
df_res.h_degree_same_rel + df_res.t_degree_same_rel - 1
)
- # check if the values in the pair (h_degree, t_degree) are =1 or >1
- # to determine the edge cardinality
- legend = {
- 0: "M:M",
- 1: "1:M",
- 2: "M:1",
- 3: "1:1",
- }
- for suffix in ["", "_same_rel"]:
- edge_type = 2 * (df_res["h_degree" + suffix] == 1) + (
- df_res["t_degree" + suffix] == 1
- )
- df_res["triple_cardinality" + suffix] = edge_type.apply(lambda x: legend[x])
- return df_res
+ edge_cardinality = self.edge_cardinality()
+ df_res["triple_cardinality"] = edge_cardinality["triple_cardinality"]
+ df_res["triple_cardinality_same_rel"] = edge_cardinality[
+ "triple_cardinality_same_rel"
+ ]
+ return aggregate_by_relation(df_res) if aggregate_by_r else df_res
def edge_pattern_summary(
self,
- df: pd.DataFrame,
return_metapath_list: bool = False,
composition_chunk_size: int = 2**8,
composition_workers: int = 32,
+ aggregate_by_r: bool = False,
) -> pd.DataFrame:
"""
- This function analyses the structural properties of each edge in the graph:
+ Analyse structural properties of each edge in the KG:
symmetry, presence of inverse/inference(=parallel) edges and
triangles supported on the edge.
The output dataframe maintains the same indexing and ordering of triples
- as the input one.
+ as the original Knowledge Graph dataframe.
- :param df: A graph represented as a pd.DataFrame.
- Must contain at least three columns `h`, `r`, `t`.
- :param return_metapath_list: If True, return the list of unique metapaths for all
+ :param return_metapath_list:
+ If True, return the list of unique metapaths for all
triangles supported over one edge. WARNING: very expensive for large graphs.
- :param composition_chunk_size: Size of column chunks of sparse adjacency matrix
+ :param composition_chunk_size:
+ Size of column chunks of sparse adjacency matrix
to compute the triangle count.
- :param composition_workers: Number of workers to compute the triangle count.
+ :param composition_workers:
+ Number of workers to compute the triangle count.
+ :param aggregate_by_r:
+ If True, return metrics aggregated by relation type
+ (the output DataFrame will be indexed over relation IDs).
- :return: The results dataframe. Contains the following columns
- (in addition to `h`, `r`, `t` in ``df``):
+ :return:
+ The results dataframe. Contains the following columns
+ (in addition to `h`, `r`, `t`):
- **is_loop** (bool): True if the triple is a loop (``h == t``).
- **is_symmetric** (bool): True if the triple (t, r, h) is also contained
@@ -230,12 +397,14 @@ def edge_pattern_summary(
"""
# symmetry-asymmetry
# edges with h/t switched
- df_inv = df.reindex(columns=["t", "r", "h"]).rename(
+ df_inv = self.df.reindex(columns=["t", "r", "h"]).rename(
columns={"t": "h", "r": "r", "h": "t"}
)
- df_res = pd.DataFrame({"h": df.h, "r": df.r, "t": df.t, "is_symmetric": False})
+ df_res = pd.DataFrame(
+ {"h": self.df.h, "r": self.df.r, "t": self.df.t, "is_symmetric": False}
+ )
df_res.loc[
- df.reset_index().merge(df_inv)["index"],
+ self.df.reset_index().merge(df_inv)["index"],
"is_symmetric",
] = True
# loops are treated separately
@@ -277,7 +446,7 @@ def edge_pattern_summary(
# composition & metapaths
# discard loops as edges of a triangle
- df_wo_loops = df[df.h != df.t]
+ df_wo_loops = self.df[self.df.h != self.df.t]
if return_metapath_list:
# 2-hop paths
df_bridges = df_wo_loops.merge(
@@ -336,7 +505,7 @@ def edge_pattern_summary(
)
df_res["has_undirected_composition"] = df_res["n_undirected_triangles"] > 0
- return df_res[
+ df_res = df_res[
[
"h",
"r",
@@ -357,95 +526,16 @@ def edge_pattern_summary(
+ (["metapath_list"] if return_metapath_list else [])
]
- def aggregate_by_relation(self, edge_topology_df: pd.DataFrame) -> pd.DataFrame:
- """
- Aggregate topology metrics of all triples of the same relation type.
- To be applied to the output dataframe of either
- :meth:`KGTopologyToolbox.edge_degree_cardinality_summary` or
- :meth:`KGTopologyToolbox.edge_pattern_summary`.
-
- The returned dataframe is indexed over relation type IDs, with columns
- giving the aggregated statistics of triples of the correspondig relation.
- The name of the columns is of the form ``column_name_in_input_df + suffix``.
- The aggregation is perfomed by returning:
-
- - for numerical metrics: mean, standard deviation and quartiles
- (``suffix`` = "_mean", "_std", "_quartile1", "_quartile2", "_quartile3");
- - for boolean metrics: the fraction of triples of the relation type
- with metric = True (``suffix`` = "_frac");
- - for string metrics: for each possible label, the fraction of triples
- of the relation type with that metric value (``suffix`` = "_{label}_frac")
- - for list metrics: the unique metric values across triples of the relation
- type (``suffix`` = "_unique").
-
- :param edge_topology_df: pd.DataFrame of edge topology metrics.
- Must contain at least three columns `h`, `r`, `t`.
-
- :return: The results dataframe. In addition to the columns with the aggregated
- metrics by relation type, it also contains columns:
-
- - **num_triples** (int): Number of triples for each relation type.
- - **frac_triples** (float): Fraction of overall triples represented by each
- relation type.
- - **unique_h** (int): Number of unique head entities used by triples of each
- relation type.
- - **unique_t** (int): Number of unique tail entities used by triples of each
- relation type.
- """
- df_by_r = edge_topology_df.groupby("r")
- df_res = df_by_r.agg(num_triples=("r", "count"))
- df_res["frac_triples"] = df_res["num_triples"] / edge_topology_df.shape[0]
- col: str
- for col, col_dtype in edge_topology_df.drop(columns=["r"]).dtypes.items(): # type: ignore
- if col in ["h", "t"]:
- df_res[f"unique_{col}"] = df_by_r[col].nunique()
- elif col_dtype == object:
- if isinstance(edge_topology_df[col].iloc[0], str):
- for label in np.unique(edge_topology_df[col]):
- df_res[f"{col}_{label}_frac"] = (
- edge_topology_df[edge_topology_df[col] == label]
- .groupby("r")[col]
- .count()
- / df_res["num_triples"]
- ).fillna(0)
- elif isinstance(edge_topology_df[col].iloc[0], Iterable):
- df_res[f"{col}_unique"] = (
- df_by_r[col]
- .agg(np.unique)
- .apply(
- lambda x: (
- np.unique(
- np.concatenate(
- [lst for lst in x if len(lst) > 0] or [[]]
- )
- ).tolist()
- )
- )
- )
- else:
- print(f"Skipping column {col}: no known aggregation mode")
- continue
- elif col_dtype == int or col_dtype == float:
- df_res[f"{col}_mean"] = df_by_r[col].mean()
- df_res[f"{col}_std"] = df_by_r[col].std()
- for q in range(1, 4):
- df_res[f"{col}_quartile{q}"] = df_by_r[col].agg(
- lambda x: np.quantile(x, 0.25 * q)
- )
- elif col_dtype == bool:
- df_res[f"{col}_frac"] = df_by_r[col].mean()
- return df_res
+ return aggregate_by_relation(df_res) if aggregate_by_r else df_res
- def jaccard_similarity_relation_sets(self, df: pd.DataFrame) -> pd.DataFrame:
+ def jaccard_similarity_relation_sets(self) -> pd.DataFrame:
"""
Compute the similarity between relations defined as the Jaccard Similarity
between sets of entities (heads and tails) for all pairs
of relations in the graph.
- :param df: A graph represented as a pd.DataFrame.
- Must contain at least three columns `h`, `r`, `t`.
-
- :return: The results dataframe. Contains the following columns:
+ :return:
+ The results dataframe. Contains the following columns:
- **r1** (int): Index of the first relation.
- **r2** (int): Index of the second relation.
@@ -468,7 +558,7 @@ def jaccard_similarity_relation_sets(self, df: pd.DataFrame) -> pd.DataFrame:
- **jaccard_both** (float): Jaccard similarity between the full entity set
of r1 and r2.
"""
- ent_unique = df.groupby("r", as_index=False).agg(
+ ent_unique = self.df.groupby("r", as_index=False).agg(
num_triples=("r", "count"), head=("h", "unique"), tail=("t", "unique")
)
ent_unique["both"] = ent_unique.apply(
@@ -487,7 +577,7 @@ def jaccard_similarity_relation_sets(self, df: pd.DataFrame) -> pd.DataFrame:
df_res = df_res[df_res.r1 < df_res.r2]
df_res["num_triples_both"] = df_res["num_triples_r1"] + df_res["num_triples_r2"]
- df_res["frac_triples_both"] = df_res["num_triples_both"] / df.shape[0]
+ df_res["frac_triples_both"] = df_res["num_triples_both"] / self.df.shape[0]
df_res["num_entities_both"] = df_res.apply(
lambda x: len(
np.unique(
@@ -531,9 +621,7 @@ def jaccard_similarity_relation_sets(self, df: pd.DataFrame) -> pd.DataFrame:
]
return df_res
- def relational_affinity_ingram(
- self, df: pd.DataFrame, min_max_norm: bool = False
- ) -> pd.DataFrame:
+ def relational_affinity_ingram(self, min_max_norm: bool = False) -> pd.DataFrame:
"""
Compute the similarity between relations based on the approach proposed in
InGram: Inductive Knowledge Graph Embedding via Relation Graphs,
@@ -542,34 +630,31 @@ def relational_affinity_ingram(
Only the pairs of relations witn ``affinity > 0`` are shown in the
returned dataframe.
- :param df: A graph represented as a pd.DataFrame.
- Must contain at least three columns `h`, `r`, `t`.
- :param min_max_norm: min-max normalization of edge weights. Defaults to False.
+ :param min_max_norm:
+ min-max normalization of edge weights. Defaults to False.
- :return: The results dataframe. Contains the following columns:
+ :return:
+ The results dataframe. Contains the following columns:
- **h_relation** (int): Index of the head relation.
- **t_relation** (int): Index of the tail relation.
- **edge_weight** (float): Weight for the affinity between
the head and the tail relation.
"""
- n_entities = df[["h", "t"]].max().max() + 1
- n_rels = df.r.max() + 1
-
- hr_freqs = df.groupby(["h", "r"], as_index=False).count()
+ hr_freqs = self.df.groupby(["h", "r"], as_index=False).count()
# normalize by global h frequency
hr_freqs["t"] = hr_freqs["t"] / hr_freqs.groupby("h")["t"].transform("sum")
- rt_freqs = df.groupby(["t", "r"], as_index=False).count()
+ rt_freqs = self.df.groupby(["t", "r"], as_index=False).count()
# normalize by global t frequency
rt_freqs["h"] = rt_freqs["h"] / rt_freqs.groupby("t")["h"].transform("sum")
E_h = coo_array(
(hr_freqs.t, (hr_freqs.h, hr_freqs.r)),
- shape=[n_entities, n_rels],
+ shape=[self.n_entity, self.n_rel],
)
E_t = coo_array(
(rt_freqs.h, (rt_freqs.t, rt_freqs.r)),
- shape=[n_entities, n_rels],
+ shape=[self.n_entity, self.n_rel],
)
A = (E_h.T @ E_h).toarray() + (E_t.T @ E_t).toarray()
diff --git a/src/kg_topology_toolbox/utils.py b/src/kg_topology_toolbox/utils.py
index c35b0f0..d3a3d55 100644
--- a/src/kg_topology_toolbox/utils.py
+++ b/src/kg_topology_toolbox/utils.py
@@ -4,24 +4,183 @@
Utility functions
"""
+import warnings
+from collections.abc import Iterable
from multiprocessing import Pool
import numpy as np
import pandas as pd
from numpy.typing import NDArray
+from pandas.api.types import is_integer_dtype
from scipy.sparse import coo_array, csc_array, csr_array
+def check_kg_df_structure(kg_df: pd.DataFrame, h: str, r: str, t: str) -> None:
+ """
+ Utility to perform sanity checks on the structure of the provided DataFrame,
+ to ensure that it encodes a Knowledge Graph in a compatible way.
+
+ :param kg_df:
+ The Knowledge Graph DataFrame.
+ :param h:
+ The name of the column with the IDs of head entities.
+ :param r:
+ The name of the column with the IDs of relation types.
+ :param t:
+ The name of the column with the IDs of tail entities.
+
+ """
+ # check h,r,t columns are present and of an integer type
+ for col_name in [h, r, t]:
+ if col_name in kg_df.columns:
+ if not is_integer_dtype(kg_df[col_name]):
+ raise TypeError(f"Column {col_name} needs to be of an integer dtype")
+ else:
+ raise ValueError(f"DataFrame {kg_df} has no column named {col_name}")
+ # check there are no duplicated (h,r,t) triples
+ if kg_df[[h, r, t]].duplicated().any():
+ warnings.warn(
+ "The Knowledge Graph contains duplicated edges"
+ " -- some functionalities may produce incorrect results"
+ )
+
+
+def node_degrees_and_rels(
+ df: pd.DataFrame, column: str, n_entity: int, return_relation_list: bool
+) -> pd.DataFrame:
+ """
+ Aggregate edges by head/tail node and compute associated statistics.
+
+ :param df:
+ Dataframe of (h,r,t) triples.
+ :param column:
+ Name of the column used to aggregate edges.
+ :param n_entity:
+ Total number of entities in the graph.
+ :param return_relation_list:
+ If True, return the list of unique relation types
+ in the set of aggregated edges.
+
+ :return:
+ The result DataFrame, indexed on the IDs of the graph entities,
+ with columns:
+
+ - **degree** (int): Number of triples in the aggregation.
+ - **unique_rel** (int): Number of distinct relation types
+ in the set of aggregated edges.
+ - **rel_list** (Optional[list]): List of unique relation types
+ in the set of aggregated edges.
+ Only returned if `return_relation_list = True`.
+ """
+ rel_list = {"rel_list": ("r", "unique")} if return_relation_list else {}
+ deg_df = pd.DataFrame(
+ df.groupby(column).agg(
+ degree=("r", "count"), unique_rel=("r", "nunique"), **rel_list # type: ignore
+ ),
+ index=np.arange(n_entity),
+ )
+ deg_df[["degree", "unique_rel"]] = (
+ deg_df[["degree", "unique_rel"]].fillna(0).astype(int)
+ )
+ return deg_df
+
+
+def aggregate_by_relation(edge_topology_df: pd.DataFrame) -> pd.DataFrame:
+ """
+ Aggregate topology metrics of all triples of the same relation type.
+ To be applied to a DataFrame of metrics having at least columns
+ `h`, `r`, `t` (e.g., the output of
+ :meth:`KGTopologyToolbox.edge_degree_cardinality_summary` or
+ :meth:`KGTopologyToolbox.edge_pattern_summary`).
+
+ The returned dataframe is indexed over relation type IDs, with columns
+ giving the aggregated statistics of triples of the corresponding relation.
+ The name of the columns is of the form ``column_name_in_input_df + suffix``.
+ The aggregation is performed by returning:
+
+ - for numerical metrics: mean, standard deviation and quartiles
+ (``suffix`` = "_mean", "_std", "_quartile1", "_quartile2", "_quartile3");
+ - for boolean metrics: the fraction of triples of the relation type
+ with metric = True (``suffix`` = "_frac");
+ - for string metrics: for each possible label, the fraction of triples
+ of the relation type with that metric value (``suffix`` = "_{label}_frac")
+ - for list metrics: the unique metric values across triples of the relation
+ type (``suffix`` = "_unique").
+
+ :param edge_topology_df:
+ pd.DataFrame of edge topology metrics.
+ Must contain at least three columns `h`, `r`, `t`.
+
+ :return:
+ The results dataframe. In addition to the columns with the aggregated
+ metrics by relation type, it also contains columns:
+
+ - **num_triples** (int): Number of triples for each relation type.
+ - **frac_triples** (float): Fraction of overall triples represented by each
+ relation type.
+ - **unique_h** (int): Number of unique head entities used by triples of each
+ relation type.
+ - **unique_t** (int): Number of unique tail entities used by triples of each
+ relation type.
+ """
+ df_by_r = edge_topology_df.groupby("r")
+ df_res = df_by_r.agg(num_triples=("r", "count"))
+ df_res["frac_triples"] = df_res["num_triples"] / edge_topology_df.shape[0]
+ col: str
+ for col, col_dtype in edge_topology_df.drop(columns=["r"]).dtypes.items(): # type: ignore
+ if col in ["h", "t"]:
+ df_res[f"unique_{col}"] = df_by_r[col].nunique()
+ elif col_dtype == object:
+ if isinstance(edge_topology_df[col].iloc[0], str):
+ for label in np.unique(edge_topology_df[col]):
+ df_res[f"{col}_{label}_frac"] = (
+ edge_topology_df[edge_topology_df[col] == label]
+ .groupby("r")[col]
+ .count()
+ / df_res["num_triples"]
+ ).fillna(0)
+ elif isinstance(edge_topology_df[col].iloc[0], Iterable):
+ df_res[f"{col}_unique"] = (
+ df_by_r[col]
+ .agg(np.unique)
+ .apply(
+ lambda x: (
+ np.unique(
+ np.concatenate(
+ [lst for lst in x if len(lst) > 0] or [[]]
+ )
+ ).tolist()
+ )
+ )
+ )
+ else:
+ print(f"Skipping column {col}: no known aggregation mode")
+ continue
+ elif col_dtype == int or col_dtype == float:
+ df_res[f"{col}_mean"] = df_by_r[col].mean()
+ df_res[f"{col}_std"] = df_by_r[col].std()
+ for q in range(1, 4):
+ df_res[f"{col}_quartile{q}"] = df_by_r[col].agg(
+ lambda x: np.quantile(x, 0.25 * q)
+ )
+ elif col_dtype == bool:
+ df_res[f"{col}_frac"] = df_by_r[col].mean()
+ return df_res
+
+
def jaccard_similarity(
entities_1: NDArray[np.int32], entities_2: NDArray[np.int32]
) -> float:
"""
Jaccard Similarity function for two sets of entities.
- :param entities_1: the array of IDs for the first set of entities.
- :param entities_2: the array of IDs for the second set of entities.
+ :param entities_1:
+ Array of IDs for the first set of entities.
+ :param entities_2:
+ Array of IDs for the second set of entities.
- :return: Jaccard Similarity score for two sets of entities.
+ :return:
+ Jaccard Similarity score for two sets of entities.
"""
intersection = len(np.intersect1d(entities_1, entities_2))
union = len(entities_1) + len(entities_2) - intersection
@@ -48,22 +207,26 @@ def composition_count(
) -> pd.DataFrame:
"""A helper function to compute the composition count of a graph.
- :param df: A graph represented as a pd.DataFrame. Must contain the columns
+ :param df:
+ A graph represented as a pd.DataFrame. Must contain the columns
`h` and `t`. No self-loops should be present in the graph.
- :param chunk_size: Size of chunks of columns of the adjacency matrix to be
+ :param chunk_size:
+ Size of chunks of columns of the adjacency matrix to be
processed together.
- :param workers: Number of workers processing chunks concurrently
- :param directed: Boolean flag. If false, bidirectional edges are considered for
- triangles by adding the adjacency matrix and its transposed. Defaults to True.
-
- :return: The results dataframe. Contains the following columns:
+ :param workers:
+ Number of workers processing chunks concurrently.
+ :param directed:
+ Boolean flag. If false, bidirectional edges are considered for
+ triangles by adding the adjacency matrix and its transpose. Default: True.
+ :return:
+ The results dataframe. Contains the following columns:
- **h** (int): Index of the head entity.
- **t** (int): Index of the tail entity.
- **n_triangles** (int): Number of compositions for the (h, t) edge.
"""
- n_nodes = max(df[["h", "t"]].max()) + 1
+ n_nodes = df[["h", "t"]].max().max() + 1
adj = coo_array(
(np.ones(len(df)), (df.h, df.t)),
shape=[n_nodes, n_nodes],
diff --git a/tests/test_edge_topology_toolbox.py b/tests/test_edge_topology_toolbox.py
index b1846b5..eaba81a 100644
--- a/tests/test_edge_topology_toolbox.py
+++ b/tests/test_edge_topology_toolbox.py
@@ -8,14 +8,16 @@
df = pd.DataFrame(
dict(
- h=[0, 0, 0, 1, 2, 2, 1, 2],
- t=[1, 1, 2, 2, 0, 0, 1, 2],
- r=[0, 1, 0, 1, 0, 1, 1, 0],
+ H=[0, 0, 0, 1, 2, 2, 1, 2],
+ T=[1, 1, 2, 2, 0, 0, 1, 2],
+ R=[0, 1, 0, 1, 0, 1, 1, 0],
n=["a", "b", "c", "d", "e", "f", "g", "h"],
)
)
-tools = KGTopologyToolbox()
+kgtt = KGTopologyToolbox(
+ kg_df=df, head_column="H", relation_column="R", tail_column="T"
+)
@pytest.mark.parametrize("return_metapath_list", [True, False])
@@ -24,7 +26,7 @@ def test_small_graph_metrics(return_metapath_list: bool) -> None:
# the edge_topology_toolbox
# entity degrees statistics
- res = tools.edge_degree_cardinality_summary(df)
+ res = kgtt.edge_degree_cardinality_summary()
assert np.allclose(res["h_unique_rel"], [2, 2, 2, 1, 2, 2, 1, 2])
assert np.allclose(res["h_degree"], [3, 3, 3, 2, 3, 3, 2, 3])
assert np.allclose(res["h_degree_same_rel"], [2, 1, 2, 2, 2, 1, 2, 2])
@@ -57,7 +59,7 @@ def test_small_graph_metrics(return_metapath_list: bool) -> None:
]
# relation pattern symmetry
- res = tools.edge_pattern_summary(df, return_metapath_list=return_metapath_list)
+ res = kgtt.edge_pattern_summary(return_metapath_list=return_metapath_list)
assert np.allclose(
res["is_loop"], [False, False, False, False, False, False, True, True]
)
diff --git a/tests/test_node_topology_toolbox.py b/tests/test_node_topology_toolbox.py
index d78ce2d..18d87ed 100644
--- a/tests/test_node_topology_toolbox.py
+++ b/tests/test_node_topology_toolbox.py
@@ -8,14 +8,14 @@
df = pd.DataFrame(
dict(
- h=[0, 0, 0, 1, 2, 2, 2],
- t=[1, 1, 2, 2, 0, 0, 2],
- r=[0, 1, 0, 1, 0, 1, 1],
+ H=[0, 0, 0, 1, 2, 2, 2],
+ T=[1, 1, 2, 2, 0, 0, 2],
+ R=[0, 1, 0, 1, 0, 1, 1],
n=["a", "b", "c", "d", "e", "f", "g"],
)
)
-tools = KGTopologyToolbox()
+kgtt = KGTopologyToolbox(df, head_column="H", relation_column="R", tail_column="T")
@pytest.mark.parametrize("return_relation_list", [True, False])
@@ -24,7 +24,7 @@ def test_small_graph_metrics(return_relation_list: bool) -> None:
# the node_topology_toolbox
# entity degrees statistics
- res = tools.node_degree_summary(df, return_relation_list=return_relation_list)
+ res = kgtt.node_degree_summary(return_relation_list=return_relation_list)
assert np.allclose(res["h_degree"], [3, 1, 3])
assert np.allclose(res["t_degree"], [2, 2, 3])
assert np.allclose(res["tot_degree"], [5, 3, 5])
diff --git a/tests/test_relation_topology_toolbox.py b/tests/test_relation_topology_toolbox.py
index a41c60d..3f0c05c 100644
--- a/tests/test_relation_topology_toolbox.py
+++ b/tests/test_relation_topology_toolbox.py
@@ -10,24 +10,22 @@
df = pd.DataFrame(
dict(
- h=[0, 0, 0, 1, 2, 2, 2, 3, 3, 4],
- t=[1, 1, 2, 2, 0, 3, 4, 2, 4, 3],
- r=[0, 1, 0, 1, 0, 1, 1, 0, 0, 1],
+ H=[0, 0, 0, 1, 2, 2, 2, 3, 3, 4],
+ T=[1, 1, 2, 2, 0, 3, 4, 2, 4, 3],
+ R=[0, 1, 0, 1, 0, 1, 1, 0, 0, 1],
n=["a", "b", "c", "d", "e", "f", "g", "h", "i", "j"],
)
)
-tools = KGTopologyToolbox()
+kgtt = KGTopologyToolbox(df, head_column="H", relation_column="R", tail_column="T")
def test_small_graph_metrics() -> None:
# Define a small graph on five nodes with all the features tested by
# the relation_topology_toolbox
- dcs = tools.aggregate_by_relation(tools.edge_degree_cardinality_summary(df))
- eps = tools.aggregate_by_relation(
- tools.edge_pattern_summary(df, return_metapath_list=True)
- )
+ dcs = kgtt.edge_degree_cardinality_summary(aggregate_by_r=True)
+ eps = kgtt.edge_pattern_summary(return_metapath_list=True, aggregate_by_r=True)
assert np.allclose(dcs["num_triples"], [5, 5])
assert np.allclose(dcs["frac_triples"], [0.5, 0.5])
@@ -73,7 +71,7 @@ def test_small_graph_metrics() -> None:
def test_jaccard_similarity() -> None:
# jaccard_similarity_relation_sets
- res = tools.jaccard_similarity_relation_sets(df)
+ res = kgtt.jaccard_similarity_relation_sets()
assert np.allclose(res["jaccard_head_head"], [2 / 5])
assert np.allclose(res["jaccard_tail_tail"], [3 / 5])
assert np.allclose(res["jaccard_head_tail"], [2 / 5])
@@ -86,5 +84,5 @@ def test_jaccard_similarity() -> None:
)
def test_ingram_affinity(min_max_norm: bool, expected: List[float]) -> None:
# relational_affinity_ingram
- res = tools.relational_affinity_ingram(df, min_max_norm)
+ res = kgtt.relational_affinity_ingram(min_max_norm)
assert np.allclose(res["edge_weight"], expected)