From acdcf42eeab8975454557dfdf4e986046685b3e6 Mon Sep 17 00:00:00 2001 From: Alberto Cattaneo <84471416+AlCatt91@users.noreply.github.com> Date: Fri, 8 Nov 2024 14:54:17 +0000 Subject: [PATCH] Relation filtering for edge methods + sparse metapath counting (#12) * filter relations for edge card; cap mp workers based on cores * add relation filter to edge methods * ci fix * metapath tweaks * refactor metapath counting with sparse matmuls * docstring update * use np.divmod * avoid repeated work * add metapath unit test * reduce memory usage of metapath counting * improve docstrings * document new functionalities in the doc notebook * tidy up redundant code * fix typo --------- Co-authored-by: Daniel Justus --- docs/source/notebooks/ogb_biokg_demo.ipynb | 316 +++++++++++++++++++- src/kg_topology_toolbox/topology_toolbox.py | 228 ++++++++++---- src/kg_topology_toolbox/utils.py | 152 ++++++++-- tests/test_edge_topology_toolbox.py | 49 ++- tests/test_node_topology_toolbox.py | 5 +- tests/test_relation_topology_toolbox.py | 5 +- 6 files changed, 647 insertions(+), 108 deletions(-) diff --git a/docs/source/notebooks/ogb_biokg_demo.ipynb b/docs/source/notebooks/ogb_biokg_demo.ipynb index bb73448..dc13ff5 100644 --- a/docs/source/notebooks/ogb_biokg_demo.ipynb +++ b/docs/source/notebooks/ogb_biokg_demo.ipynb @@ -22,9 +22,9 @@ "name": "stdout", "output_type": "stream", "text": [ - "Found existing installation: kg-topology-toolbox 0.1.0\n", - "Uninstalling kg-topology-toolbox-0.1.0:\n", - " Successfully uninstalled kg-topology-toolbox-0.1.0\n" + "Found existing installation: kg-topology-toolbox 1.0.0\n", + "Uninstalling kg-topology-toolbox-1.0.0:\n", + " Successfully uninstalled kg-topology-toolbox-1.0.0\n" ] } ], @@ -37,7 +37,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -63,7 +63,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "metadata": {}, "outputs": [ { @@ -181,7 +181,7 @@ "[5088434 rows x 3 columns]" ] }, - "execution_count": 3, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } @@ -209,14 +209,14 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "/nethome/albertoc/research/knowledge_graphs/kg-topology-toolbox/.venv/lib/python3.10/site-packages/kg_topology_toolbox/topology_toolbox.py:64: UserWarning: The Knowledge Graph contains duplicated edges -- some functionalities may produce incorrect results\n", + "/nethome/albertoc/research/knowledge_graphs/kg-topology-toolbox/.venv/lib/python3.10/site-packages/kg_topology_toolbox/utils.py:42: UserWarning: The Knowledge Graph contains duplicated edges -- some functionalities may produce incorrect results\n", " warnings.warn(\n" ] } @@ -232,13 +232,77 @@ "Notice the warning raised by the constructor, which detects duplicated edges in the `biokg_df` DataFrame: to ensure optimal functionalities, duplicated edges should be removed before instantiating the `KGTopologyToolbox` class." ] }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
hrt
38544071972451972
40005341972451972
\n", + "
" + ], + "text/plain": [ + " h r t\n", + "3854407 1972 45 1972\n", + "4000534 1972 45 1972" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# find duplicated edges\n", + "biokg_df.loc[biokg_df.duplicated(keep=False)]" + ] + }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Node-level analysis\n", "\n", - "The method `node_degree_summary` provides a summary of the degrees of each individual node in the knowledge graph. The returned dataframe is indexed on the node ID.\n", + "The method `node_degree_summary` provides a summary of the degrees of each individual node in the knowledge graph. The returned DataFrame is indexed on the node ID.\n", "\n", "- `h_degree` is the number of edges coming out from the node;\n", "- `t_degree` is the number of edges going into the node;\n", @@ -894,7 +958,7 @@ "\n", "![image info](../images/edge_patterns.png)\n", "\n", - "For inverse/inference, the method also provides the number and types of unique relations `r'` realizing the counterpart edges; for composition, the number of triangles supported by the edge is provided (the unique metapaths `[r_1, r_2]` can also be listed by setting `return_metapath_list=True` when calling the method)." + "For inverse/inference, the method also provides the number and types of unique relations `r'` realizing the counterpart edges; for composition, the number of triangles supported by the edge is provided." ] }, { @@ -1210,6 +1274,15 @@ "edge_eps" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If we need to identify the different metapaths `[r_1, r_2]` that give triangles `(h,r1,x) - (x,r2,t)` over an edge `(h,r,t)`, we can do so by setting `return_metapath_list=True` in the call of `edge_pattern_summary`. In order to disaggregate the total number of triangles over an edge into separate counts for each existing metapath, the `edge_metapath_count` method should be used instead. \n", + "\n", + "We can now easily produce a global view of the distribution of topological properties." + ] + }, { "cell_type": "code", "execution_count": 12, @@ -1277,6 +1350,225 @@ "plt.tight_layout()" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Filtering relation types\n", + "\n", + "The edge-level methods presented in the previous section simultaneously compute statistics for all edges in the KG, and this can be expensive on larger graphs. Moreover, in many practical cases the user might be interested in looking only at the properties of edges of one or few specific relation types.\n", + "\n", + "The methods `edge_degree_cardinality_summary`, `edge_pattern_summary` and `edge_metapath_count` can be passed a list of relation type IDs to restrict computations of their outputs to edges of those specific relation types." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
indexhrtr1r2n_triangles
03343827327122541210
133438273271225392123
233438273271225382200
33343827327122537227
4334382732712253626
........................
7321494953327152924249213412
7321504953327152924249211412
732151495332715292424926412
732152495332715292424924411
732153495332715292424922412
\n", + "

732154 rows × 7 columns

\n", + "
" + ], + "text/plain": [ + " index h r t r1 r2 n_triangles\n", + "0 334382 732 7 1225 41 2 10\n", + "1 334382 732 7 1225 39 2 123\n", + "2 334382 732 7 1225 38 2 200\n", + "3 334382 732 7 1225 37 2 27\n", + "4 334382 732 7 1225 36 2 6\n", + "... ... ... .. ... .. .. ...\n", + "732149 4953327 1529 24 2492 13 41 2\n", + "732150 4953327 1529 24 2492 11 41 2\n", + "732151 4953327 1529 24 2492 6 41 2\n", + "732152 4953327 1529 24 2492 4 41 1\n", + "732153 4953327 1529 24 2492 2 41 2\n", + "\n", + "[732154 rows x 7 columns]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "filtered_metapath_counts = kgtt.edge_metapath_count(filter_relations=[7, 24])\n", + "filtered_metapath_counts" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The previous cell computes the number of triangles of each existing `(r1, r2)` metapath, but only over `(h,r,t)` edges of the two relation types with ID 7 and 24 (the column `index` gives the index of the edge in the `biokkg_df` DataFrame). This is the same as calling `kgtt.edge_metapath_count().query('r==7 or r==24')`, but the computation is much cheaper and faster." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "r\n", + "24 413366\n", + "7 318788\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "filtered_metapath_counts.r.value_counts()" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -2267,7 +2559,7 @@ ], "metadata": { "kernelspec": { - "display_name": ".venv38", + "display_name": ".venv", "language": "python", "name": "python3" }, @@ -2281,7 +2573,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.14" + "version": "3.10.12" } }, "nbformat": 4, diff --git a/src/kg_topology_toolbox/topology_toolbox.py b/src/kg_topology_toolbox/topology_toolbox.py index c3d6f5b..5ae642e 100644 --- a/src/kg_topology_toolbox/topology_toolbox.py +++ b/src/kg_topology_toolbox/topology_toolbox.py @@ -5,6 +5,7 @@ Topology toolbox main functionalities """ +import multiprocessing as mp from functools import cache import numpy as np @@ -257,8 +258,6 @@ def edge_cardinality(self) -> pd.DataFrame: # check if the values in the pair (h_degree, t_degree) are =1 or >1 # to determine the edge cardinality for suffix in ["", "_same_rel"]: - # check if the values in the pair (h_degree, t_degree) are =1 or >1 - # to determine the edge cardinality edge_type = 2 * (head_degree["h_degree" + suffix] == 1) + ( tail_degree["t_degree" + suffix] == 1 ) @@ -270,8 +269,65 @@ def edge_cardinality(self) -> pd.DataFrame: ).astype(str) return df_res + def edge_metapath_count( + self, + filter_relations: list[int] = [], + composition_chunk_size: int = 2**8, + composition_workers: int = min(32, mp.cpu_count() - 1 or 1), + ) -> pd.DataFrame: + """ + For each edge in the KG, compute the number of triangles supported on it + distinguishing between different metapaths (i.e., the unique ordered tuples + (r1, r2) of relation types of the two additional edges of the triangle). + + :param filter_relations: + If not empty, compute the output only for the edges with relation + in this list of relation IDs. + :param composition_chunk_size: + Size of column chunks of sparse adjacency matrix + to compute the triangle count. Reduce the parameter if running OOM. + Default: 2**8. 
+ :param composition_workers: + Number of workers to compute the triangle count. By default, assigned based + on number of available threads (max: 32). + + :return: + The output dataframe has one row for each (h, r, t, r1, r2) such that + there exists at least one triangle of metapath (r1, r2) over (h, r, t). + The number of metapath triangles is given in the column **n_triangles**. + The column **index** provides the index of the edge (h, r, t) in the + original Knowledge Graph dataframe. + """ + # discard loops as edges of a triangle + df_wo_loops = self.df[self.df.h != self.df.t] + if len(filter_relations) > 0: + rel_df = self.df[self.df.r.isin(filter_relations)] + # unique heads and tails used by filtered edges + filter_heads = rel_df.h.unique() + filter_tails = rel_df.t.unique() + # the only relevant edges for triangles are the ones with head in the + # set of filtered heads, or tail in the set of filtered tails + df_triangles = df_wo_loops[ + np.logical_or( + df_wo_loops.h.isin(filter_heads), df_wo_loops.t.isin(filter_tails) + ) + ] + else: + rel_df = self.df + df_triangles = df_wo_loops + + counts = composition_count( + df_triangles, + chunk_size=composition_chunk_size, + workers=composition_workers, + metapaths=True, + directed=True, + ) + + return rel_df.reset_index().merge(counts, on=["h", "t"], how="inner") + def edge_degree_cardinality_summary( - self, aggregate_by_r: bool = False + self, filter_relations: list[int] = [], aggregate_by_r: bool = False ) -> pd.DataFrame: """ For each edge in the KG, compute the number of edges with the same head @@ -285,6 +341,9 @@ def edge_degree_cardinality_summary( The output dataframe maintains the same indexing and ordering of triples as the original Knowledge Graph dataframe. + :param filter_relations: + If not empty, compute the output only for the edges with relation + in this list of relation IDs. :param aggregate_by_r: If True, return metrics aggregated by relation type (the output DataFrame will be indexed over relation IDs). @@ -318,6 +377,8 @@ def edge_degree_cardinality_summary( ], axis=1, ) + if len(filter_relations) > 0: + df_res = df_res[df_res.r.isin(filter_relations)] # compute number of parallel edges to avoid double-counting them # in total degree num_parallel = df_res.merge( @@ -326,9 +387,9 @@ def edge_degree_cardinality_summary( how="left", ) df_res["tot_degree"] = ( - df_res.h_degree + df_res.t_degree - num_parallel.n_parallel + df_res.h_degree + df_res.t_degree - num_parallel.n_parallel.values ) - # when restricting to the relation type, there is only one edge + # when restricting to the same relation type, there is only one edge # (the edge itself) that is double-counted df_res["tot_degree_same_rel"] = ( df_res.h_degree_same_rel + df_res.t_degree_same_rel - 1 @@ -344,9 +405,10 @@ def edge_degree_cardinality_summary( def edge_pattern_summary( self, return_metapath_list: bool = False, - composition_chunk_size: int = 2**8, - composition_workers: int = 32, + filter_relations: list[int] = [], aggregate_by_r: bool = False, + composition_chunk_size: int = 2**8, + composition_workers: int = min(32, mp.cpu_count() - 1 or 1), ) -> pd.DataFrame: """ Analyse structural properties of each edge in the KG: @@ -358,15 +420,20 @@ def edge_pattern_summary( :param return_metapath_list: If True, return the list of unique metapaths for all - triangles supported over one edge. WARNING: very expensive for large graphs. - :param composition_chunk_size: - Size of column chunks of sparse adjacency matrix - to compute the triangle count. 
- :param composition_workers: - Number of workers to compute the triangle count. + triangles supported over each edge. WARNING: very expensive for large graphs. + :param filter_relations: + If not empty, compute the output only for the edges with relation + in this list of relation IDs. :param aggregate_by_r: If True, return metrics aggregated by relation type (the output DataFrame will be indexed over relation IDs). + :param composition_chunk_size: + Size of column chunks of sparse adjacency matrix + to compute the triangle count. Reduce the parameter if running OOM. + Default: 2**8. + :param composition_workers: + Number of workers to compute the triangle count. By default, assigned based + on number of available threads (max: 32). :return: The results dataframe. Contains the following columns @@ -395,29 +462,67 @@ def edge_pattern_summary( - **metapath_list** (list): The list of unique metapaths "r1-r2" for the directed triangles. """ + + # discard loops as edges of a triangle + df_wo_loops = self.df[self.df.h != self.df.t] + if len(filter_relations) > 0: + rel_df = self.df[self.df.r.isin(filter_relations)] + # unique heads and tails used by filtered edges + filter_heads = rel_df.h.unique() + filter_tails = rel_df.t.unique() + filter_entities = np.union1d(filter_heads, filter_tails) + # restrict relevant edges to count inference/inverse patterns + inference_df = self.df[ + np.logical_and( + self.df.h.isin(filter_heads), self.df.t.isin(filter_tails) + ) + ] + inverse_df = self.df[ + np.logical_and( + self.df.h.isin(filter_tails), self.df.t.isin(filter_heads) + ) + ] + # the only relevant edges for triangles are the ones with head in the + # set of filtered heads, or tail in the set of filtered tails + df_triangles = df_wo_loops[ + np.logical_or( + df_wo_loops.h.isin(filter_heads), df_wo_loops.t.isin(filter_tails) + ) + ] + # for undirected triangles, heads and tails can be any of the + # filtered entities + df_triangles_und = df_wo_loops[ + np.logical_or( + df_wo_loops.h.isin(filter_entities), + df_wo_loops.t.isin(filter_entities), + ) + ] + else: + rel_df = inference_df = inverse_df = self.df + df_triangles = df_triangles_und = df_wo_loops + df_res = pd.DataFrame( + {"h": rel_df.h, "r": rel_df.r, "t": rel_df.t, "is_symmetric": False} + ) # symmetry-asymmetry # edges with h/t switched - df_inv = self.df.reindex(columns=["t", "r", "h"]).rename( + df_inv = inverse_df.reindex(columns=["t", "r", "h"]).rename( columns={"t": "h", "r": "r", "h": "t"} ) - df_res = pd.DataFrame( - {"h": self.df.h, "r": self.df.r, "t": self.df.t, "is_symmetric": False} - ) df_res.loc[ - self.df.reset_index().merge(df_inv)["index"], + df_res.reset_index().merge(df_inv)["index"], "is_symmetric", ] = True # loops are treated separately df_res["is_loop"] = df_res.h == df_res.t df_res.loc[df_res.h == df_res.t, "is_symmetric"] = False + df_res = df_res.reset_index() + # inverse unique_inv_r_by_ht = df_inv.groupby(["h", "t"], as_index=False).agg( inverse_edge_types=("r", list), ) - df_res = df_res.merge( - unique_inv_r_by_ht, left_on=["h", "t"], right_on=["h", "t"], how="left" - ) + df_res = df_res.merge(unique_inv_r_by_ht, on=["h", "t"], how="left") df_res["inverse_edge_types"] = df_res["inverse_edge_types"].apply( lambda agg: agg if isinstance(agg, list) else [] ) @@ -432,65 +537,64 @@ def edge_pattern_summary( df_res["has_inverse"] = df_res["n_inverse_relations"] > 0 # inference - edges_between_ht = unique_inv_r_by_ht.reindex( - columns=["t", "h", "inverse_edge_types"] - ).rename( - columns={"t": "h", "h": "t", 
"inverse_edge_types": "inference_edge_types"} - ) - df_res = df_res.merge( - edges_between_ht, left_on=["h", "t"], right_on=["h", "t"], how="left" - ) + if len(filter_relations) > 0: + edges_between_ht = inference_df.groupby(["h", "t"], as_index=False).agg( + inference_edge_types=("r", list), + ) + else: + edges_between_ht = unique_inv_r_by_ht.reindex( + columns=["t", "h", "inverse_edge_types"] + ).rename( + columns={ + "t": "h", + "h": "t", + "inverse_edge_types": "inference_edge_types", + } + ) + df_res = df_res.merge(edges_between_ht, on=["h", "t"], how="left") # inference_edge_types always contains the edge itself, which we need to drop df_res["n_inference_relations"] = df_res.inference_edge_types.str.len() - 1 df_res["has_inference"] = df_res["n_inference_relations"] > 0 # composition & metapaths - # discard loops as edges of a triangle - df_wo_loops = self.df[self.df.h != self.df.t] + counts = composition_count( + df_triangles, + chunk_size=composition_chunk_size, + workers=composition_workers, + metapaths=return_metapath_list, + directed=True, + ) if return_metapath_list: - # 2-hop paths - df_bridges = df_wo_loops.merge( - df_wo_loops, left_on="t", right_on="h", how="inner" - ) - df_triangles = df_wo_loops.merge( - df_bridges, left_on=["h", "t"], right_on=["h_x", "t_y"], how="inner" - ) - df_triangles["metapath"] = ( - df_triangles["r_x"].astype(str) + "-" + df_triangles["r_y"].astype(str) + # turn (r1, r2) into "r1-r2" string for metapaths + counts["metapath"] = ( + counts["r1"].astype(str) + "-" + counts["r2"].astype(str) ) - grouped_triangles = df_triangles.groupby( - ["h", "r", "t"], as_index=False - ).agg( - n_triangles=("metapath", "count"), metapath_list=("metapath", "unique") + # count triangles (summing over all metapaths between two nodes) + # and list unique metapaths for each head and tail node pair + grouped_triangles = counts.groupby(["h", "t"], as_index=False).agg( + n_triangles=("n_triangles", "sum"), metapath_list=("metapath", list) ) df_res = df_res.merge( grouped_triangles, - left_on=["h", "r", "t"], - right_on=["h", "r", "t"], + on=["h", "t"], how="left", ) + # if no triangles are present over an edge, set metapath list to [] df_res["metapath_list"] = df_res["metapath_list"].apply( - lambda agg: agg.tolist() if isinstance(agg, np.ndarray) else [] + lambda agg: agg if isinstance(agg, list) else [] ) - df_res["n_triangles"] = df_res["n_triangles"].fillna(0).astype(int) else: - counts = composition_count( - df_wo_loops, - chunk_size=composition_chunk_size, - workers=composition_workers, - directed=True, - ) df_res = df_res.merge( counts, on=["h", "t"], how="left", ) - df_res["n_triangles"] = df_res["n_triangles"].fillna(0).astype(int) - + df_res["n_triangles"] = df_res["n_triangles"].fillna(0).astype(int) df_res["has_composition"] = df_res["n_triangles"] > 0 + # undirected composition counts = composition_count( - df_wo_loops, + df_triangles_und, chunk_size=composition_chunk_size, workers=composition_workers, directed=False, @@ -505,7 +609,7 @@ def edge_pattern_summary( ) df_res["has_undirected_composition"] = df_res["n_undirected_triangles"] > 0 - df_res = df_res[ + df_res = df_res.set_index("index")[ [ "h", "r", @@ -525,6 +629,7 @@ def edge_pattern_summary( ] + (["metapath_list"] if return_metapath_list else []) ] + df_res.index.name = None return aggregate_by_relation(df_res) if aggregate_by_r else df_res @@ -558,6 +663,7 @@ def jaccard_similarity_relation_sets(self) -> pd.DataFrame: - **jaccard_both** (float): Jaccard similarity between the full entity set of r1 
and r2. """ + # set of unique heads/tails/any for each relation ent_unique = self.df.groupby("r", as_index=False).agg( num_triples=("r", "count"), head=("h", "unique"), tail=("t", "unique") ) @@ -574,6 +680,7 @@ def jaccard_similarity_relation_sets(self) -> pd.DataFrame: suffixes=["_r1", "_r2"], how="cross", ) + # order doesn't matter df_res = df_res[df_res.r1 < df_res.r2] df_res["num_triples_both"] = df_res["num_triples_r1"] + df_res["num_triples_r2"] @@ -631,7 +738,7 @@ def relational_affinity_ingram(self, min_max_norm: bool = False) -> pd.DataFrame returned dataframe. :param min_max_norm: - min-max normalization of edge weights. Defaults to False. + min-max normalization of edge weights. Default: False. :return: The results dataframe. Contains the following columns: @@ -648,15 +755,18 @@ def relational_affinity_ingram(self, min_max_norm: bool = False) -> pd.DataFrame # normalize by global t frequency rt_freqs["h"] = rt_freqs["h"] / rt_freqs.groupby("t")["h"].transform("sum") + # sparse matrix of of (h,r) pair frequency E_h = coo_array( (hr_freqs.t, (hr_freqs.h, hr_freqs.r)), shape=[self.n_entity, self.n_rel], ) + # sparse matrix of of (t,r) pair frequency E_t = coo_array( (rt_freqs.h, (rt_freqs.t, rt_freqs.r)), shape=[self.n_entity, self.n_rel], ) + # adjacency matrix of relation graph A = (E_h.T @ E_h).toarray() + (E_t.T @ E_t).toarray() A[np.diag_indices_from(A)] = 0 diff --git a/src/kg_topology_toolbox/utils.py b/src/kg_topology_toolbox/utils.py index d3a3d55..5bc7ac7 100644 --- a/src/kg_topology_toolbox/utils.py +++ b/src/kg_topology_toolbox/utils.py @@ -133,6 +133,7 @@ def aggregate_by_relation(edge_topology_df: pd.DataFrame) -> pd.DataFrame: elif col_dtype == object: if isinstance(edge_topology_df[col].iloc[0], str): for label in np.unique(edge_topology_df[col]): + # fraction of rows for each label df_res[f"{col}_{label}_frac"] = ( edge_topology_df[edge_topology_df[col] == label] .groupby("r")[col] @@ -188,24 +189,63 @@ def jaccard_similarity( def _composition_count_worker( - adj_csr: csr_array, adj_csc: csc_array, tail_shift: int = 0 + adj_csr: csr_array, + adj_csc_slice: csc_array, + adj_mask_slice: csc_array, + slice_tail_shift: int, ) -> pd.DataFrame: - adj_2hop = adj_csr @ adj_csc - adj_composition = (adj_2hop.tocsc() * (adj_csc > 0)).tocoo() - df_composition = pd.DataFrame( - dict( - h=adj_composition.row, - t=adj_composition.col + tail_shift, - n_triangles=adj_composition.data, + """ + Masked sparse matmul to count triangles over graph edges. + + :param adj_csr: shape (n_nodes * n_rels, n_nodes) if distinguishing between + metapaths, (n_nodes, n_nodes) otherwise + :param adj_csc_slice: shape (n_nodes, chunk_size) + :param adj_mask_slice: shape (n_nodes, chunk_size) + :param slice_tail_shift: column shift of the vertical slice + + :return: + Pandas dataframe of triangle counts. 
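        Columns: **h**, **t**, **n_triangles** — plus **r1** and **r2** when the
        relation-aware (flattened) adjacency matrix is passed (n_rels > 1).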
+ """ + n_nodes = adj_csr.shape[1] + n_rels = adj_csr.shape[0] // n_nodes + # 2-hop count + adj_2hop = adj_csr @ adj_csc_slice + # mask out (h,t) pairs not connected by edges + adj_composition = (adj_2hop.tocsc() * adj_mask_slice).tocoo() + if n_rels > 1: + # distinguish between metapaths + # unflatten results + h, r1 = np.divmod(adj_composition.row, n_rels) + r2, t = np.divmod(adj_composition.col + slice_tail_shift, n_nodes) + df_composition = pd.DataFrame( + dict( + h=h, + t=t, + r1=r1, + r2=r2, + n_triangles=adj_composition.data, + ) + ) + else: + # don't distinguish between metapaths + df_composition = pd.DataFrame( + dict( + h=adj_composition.row, + t=adj_composition.col + slice_tail_shift, + n_triangles=adj_composition.data, + ) ) - ) return df_composition def composition_count( - df: pd.DataFrame, chunk_size: int, workers: int, directed: bool = True + df: pd.DataFrame, + chunk_size: int, + workers: int, + metapaths: bool = False, + directed: bool = True, ) -> pd.DataFrame: - """A helper function to compute the composition count of a graph. + """Compute composition count of a graph. :param df: A graph represented as a pd.DataFrame. Must contain the columns @@ -215,44 +255,112 @@ def composition_count( processed together. :param workers: Number of workers processing chunks concurrently + :param metapaths: + If True, the number of compositions is computed separately for each + unique metapath. :param directed: - Boolean flag. If false, bidirectional edges are considered for - triangles by adding the adjacency matrix and its transposed. Default: True. + If False, bidirectional edges are considered for + triangles, by adding the adjacency matrix and its transposed. Default: True. :return: The results dataframe. Contains the following columns: - **h** (int): Index of the head entity. - **t** (int): Index of the tail entity. - - **n_triangles** (int): Number of compositions for the (h, t) edge. + - **n_triangles** (int): Number of compositions for any edge between (h, t). 
""" n_nodes = df[["h", "t"]].max().max() + 1 + n_rels = df["r"].max() + 1 + # sparse graph adjacency matrix, counting number of edges between each pair of nodes adj = coo_array( (np.ones(len(df)), (df.h, df.t)), shape=[n_nodes, n_nodes], ).astype(np.uint16) - if not directed: - adj = adj + adj.T - n_cols = adj.shape[1] - adj_csr = adj.tocsr() - adj_csc = adj.tocsc() + + if metapaths: + if not directed: + raise NotImplementedError( + "Metapath counting only implemented for directed triangles" + ) + # relation-aware adjacency matrix, flattened to 2D for sparse implementation + # (adj_csr @ adj_csc).reshape(n_nodes, n_rels, n_rels, n_nodes)[h,r1,r2,t] counts + # the number of 2-hop paths of metapath (r1, r2) between h and t + adj_csr = csr_array( + (np.ones(len(df)), (df.h * n_rels + df.r, df.t)), + shape=[n_nodes * n_rels, n_nodes], + ).astype(np.uint16) + adj_csc = csc_array( + (np.ones(len(df)), (df.h, df.r * n_nodes + df.t)), + shape=[n_nodes, n_nodes * n_rels], + ).astype(np.uint16) + # boolean mask to filter results, keep only triangles over (h,t) pairs connected + # by at least one edge (equivalent to flattened adj[:,None,None,:] > 0) + msk = csc_array( + ( + [True] * (len(adj.data) * n_rels), + ( + (n_rels * adj.row + np.arange(n_rels)[:, None]).flatten(), + np.tile(adj.col, n_rels), + ), + ), + shape=[n_nodes * n_rels, n_nodes], + ) + else: + if not directed: + # add inverse edges for undirected compositions + adj = adj + adj.T + # (adj_csr @ adj_csc)[h,t] counts the number of 2-hop paths between h and t; + # the boolean mask here is simply adj_csc > 0 + adj_csr = adj.tocsr() + adj_csc = adj.tocsc() + + # to compute (adj_csr @ adj_csc) * msk, serialize over vertical slices of adj_csc + n_cols = adj_csc.shape[1] adj_csc_slices = { i: adj_csc[:, i * chunk_size : min((i + 1) * chunk_size, n_cols)] for i in range(int(np.ceil(n_cols / chunk_size))) } - if len(adj_csc_slices) > 1 and workers > 1: with Pool(workers) as pool: + # workers are assigned different adj_csc slices df_composition_list = pool.starmap( _composition_count_worker, ( - (adj_csr, adj_csc_slice, i * chunk_size) + ( + adj_csr, + adj_csc_slice, + ( + # relevant slice of boolean mask (with wraparound) + msk[ + :, + (i * chunk_size + np.arange(adj_csc_slice.shape[1])) + % msk.shape[1], + ] + if metapaths + else adj_csc_slice > 0 + ), + i * chunk_size, + ) for i, adj_csc_slice in adj_csc_slices.items() ), ) else: df_composition_list = [ - _composition_count_worker(adj_csr, adj_csc_slice, i * chunk_size) + _composition_count_worker( + adj_csr, + adj_csc_slice, + ( + # relevant slice of boolean mask (with wraparound) + msk[ + :, + (i * chunk_size + np.arange(adj_csc_slice.shape[1])) + % msk.shape[1], + ] + if metapaths + else adj_csc_slice > 0 + ), + i * chunk_size, + ) for i, adj_csc_slice in adj_csc_slices.items() ] diff --git a/tests/test_edge_topology_toolbox.py b/tests/test_edge_topology_toolbox.py index eaba81a..849bd5c 100644 --- a/tests/test_edge_topology_toolbox.py +++ b/tests/test_edge_topology_toolbox.py @@ -1,5 +1,7 @@ # Copyright (c) 2023 Graphcore Ltd. All rights reserved. 
+from functools import partial + import numpy as np import pandas as pd import pytest @@ -20,12 +22,20 @@ ) -@pytest.mark.parametrize("return_metapath_list", [True, False]) -def test_small_graph_metrics(return_metapath_list: bool) -> None: - # Define a small graph with all the features tested by - # the edge_topology_toolbox +def test_edge_metapath_count() -> None: + res = kgtt.edge_metapath_count(composition_chunk_size=3) + assert np.allclose(res["index"], [2, 2]) + assert np.allclose(res["h"], [0, 0]) + assert np.allclose(res["r"], [0, 0]) + assert np.allclose(res["t"], [2, 2]) + assert set(zip(res["r1"].values.tolist(), res["r2"].values.tolist())) == set( + [(0, 1), (1, 1)] + ) + assert np.allclose(res["n_triangles"], [1, 1]) - # entity degrees statistics + +def test_edge_degree_cardinality_summary() -> None: + # edge degrees statistics res = kgtt.edge_degree_cardinality_summary() assert np.allclose(res["h_unique_rel"], [2, 2, 2, 1, 2, 2, 1, 2]) assert np.allclose(res["h_degree"], [3, 3, 3, 2, 3, 3, 2, 3]) @@ -58,8 +68,13 @@ def test_small_graph_metrics(return_metapath_list: bool) -> None: "M:M", ] + +@pytest.mark.parametrize("return_metapath_list", [True, False]) +def test_edge_pattern_summary(return_metapath_list: bool) -> None: # relation pattern symmetry - res = kgtt.edge_pattern_summary(return_metapath_list=return_metapath_list) + res = kgtt.edge_pattern_summary( + return_metapath_list=return_metapath_list, composition_chunk_size=3 + ) assert np.allclose( res["is_loop"], [False, False, False, False, False, False, True, True] ) @@ -84,4 +99,24 @@ def test_small_graph_metrics(return_metapath_list: bool) -> None: assert np.allclose(res["n_triangles"], [0, 0, 2, 0, 0, 0, 0, 0]) assert np.allclose(res["n_undirected_triangles"], [3, 3, 2, 6, 2, 2, 0, 0]) if return_metapath_list: - assert res["metapath_list"][2] == ["0-1", "1-1"] + assert set(res["metapath_list"][2]) == set(["0-1", "1-1"]) + + +def test_filter_relations() -> None: + for rels in [[0], [1], [0, 1]]: + for method in [ + kgtt.edge_metapath_count, + kgtt.edge_degree_cardinality_summary, + partial(kgtt.edge_pattern_summary, return_metapath_list=True), + ]: + # compare outputs of standard method call and filtered call + res_all = method() # type: ignore + res_all = res_all[res_all.r.isin(rels)] + res_filtered = method(filter_relations=rels) # type: ignore + assert np.all(res_all.index.values == res_filtered.index.values) + for c in res_all.columns: + if c == "metapath_list": + for a, b in zip(res_all[c].values, res_filtered[c].values): + assert a == b + else: + assert np.all(res_all[c].values == res_filtered[c].values) diff --git a/tests/test_node_topology_toolbox.py b/tests/test_node_topology_toolbox.py index 18d87ed..d002b41 100644 --- a/tests/test_node_topology_toolbox.py +++ b/tests/test_node_topology_toolbox.py @@ -19,10 +19,7 @@ @pytest.mark.parametrize("return_relation_list", [True, False]) -def test_small_graph_metrics(return_relation_list: bool) -> None: - # Define a small graph with all the features tested by - # the node_topology_toolbox - +def test_node_degree_summary(return_relation_list: bool) -> None: # entity degrees statistics res = kgtt.node_degree_summary(return_relation_list=return_relation_list) assert np.allclose(res["h_degree"], [3, 1, 3]) diff --git a/tests/test_relation_topology_toolbox.py b/tests/test_relation_topology_toolbox.py index 3f0c05c..e527a5f 100644 --- a/tests/test_relation_topology_toolbox.py +++ b/tests/test_relation_topology_toolbox.py @@ -20,10 +20,7 @@ kgtt = KGTopologyToolbox(df, 
head_column="H", relation_column="R", tail_column="T") -def test_small_graph_metrics() -> None: - # Define a small graph on five nodes with all the features tested by - # the relation_topology_toolbox - +def test_aggregate_by_r() -> None: dcs = kgtt.edge_degree_cardinality_summary(aggregate_by_r=True) eps = kgtt.edge_pattern_summary(return_metapath_list=True, aggregate_by_r=True)
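
A minimal usage sketch of the functionality added in this PR, on a toy graph. This is illustrative only, not part of the patch: the top-level import path is assumed, and the default column names ("h", "r", "t") are taken from the notebook and tests above.

# toy KG: a directed triangle 0 -(r0)-> 1 -(r1)-> 2, closed by 0 -(r2)-> 2
import pandas as pd
from kg_topology_toolbox import KGTopologyToolbox

df = pd.DataFrame({"h": [0, 1, 0], "r": [0, 1, 2], "t": [1, 2, 2]})
kgtt = KGTopologyToolbox(df)

# one row per (h, r, t, r1, r2) with at least one triangle:
# here the single (r1=0, r2=1) triangle over the edge 0 -> 2
print(kgtt.edge_metapath_count())

# restrict the expensive edge-level computations to relation 2 only;
# equivalent to computing on all edges and keeping r == 2, but cheaper
dcs = kgtt.edge_degree_cardinality_summary(filter_relations=[2])
eps = kgtt.edge_pattern_summary(filter_relations=[2], return_metapath_list=True)
print(dcs[["h_degree", "t_degree", "triple_cardinality"]])
print(eps[["has_composition", "n_triangles", "metapath_list"]])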