Skip to content

Commit

Permalink
address PR comments
Browse files Browse the repository at this point in the history
  • Loading branch information
AlCatt91 committed Aug 19, 2024
1 parent 1644c21 commit 0c08b8d
Show file tree
Hide file tree
Showing 3 changed files with 36 additions and 16 deletions.
2 changes: 1 addition & 1 deletion docs/source/notebooks/ogb_biokg_demo.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
"source": [
"import sys\n",
"!{sys.executable} -m pip uninstall -y kg_topology_toolbox\n",
"!pip install -q git+https://github.com/graphcore-research/kg-topology-toolbox.git@refactor_kgtt --no-cache-dir\n",
"!pip install -q git+https://github.com/graphcore-research/kg-topology-toolbox.git --no-cache-dir\n",
"!pip install -q jupyter ipywidgets ogb seaborn"
]
},
Expand Down
18 changes: 3 additions & 15 deletions src/kg_topology_toolbox/topology_toolbox.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,15 @@
Topology toolbox main functionalities
"""

import warnings
from functools import cache

import numpy as np
import pandas as pd
from pandas.api.types import is_integer_dtype
from scipy.sparse import coo_array

from kg_topology_toolbox.utils import (
aggregate_by_relation,
check_kg_df_structure,
composition_count,
jaccard_similarity,
node_degrees_and_rels,
Expand Down Expand Up @@ -49,22 +48,11 @@ def __init__(
The name of the column with the IDs of tail entities. Default: "t".
"""
for col_name in [head_column, relation_column, tail_column]:
if col_name in kg_df.columns:
if not is_integer_dtype(kg_df[col_name]):
raise TypeError(
f"Column {col_name} needs to be of an integer dtype"
)
else:
raise ValueError(f"DataFrame {kg_df} has no column named {col_name}")
check_kg_df_structure(kg_df, head_column, relation_column, tail_column)

self.df = kg_df[[head_column, relation_column, tail_column]].rename(
columns={head_column: "h", relation_column: "r", tail_column: "t"}
)
if self.df.duplicated().any():
warnings.warn(
"The Knowledge Graph contains duplicated edges"
" -- some functionalities may produce incorrect results"
)
self.n_entity = self.df[["h", "t"]].max().max() + 1
self.n_rel = self.df.r.max() + 1

Expand Down
32 changes: 32 additions & 0 deletions src/kg_topology_toolbox/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,47 @@
Utility functions
"""

import warnings
from collections.abc import Iterable
from multiprocessing import Pool

import numpy as np
import pandas as pd
from numpy.typing import NDArray
from pandas.api.types import is_integer_dtype
from scipy.sparse import coo_array, csc_array, csr_array


def check_kg_df_structure(kg_df: pd.DataFrame, h: str, r: str, t: str) -> None:
    """
    Utility to perform sanity checks on the structure of the provided DataFrame,
    to ensure that it encodes a Knowledge Graph in a compatible way.

    The columns are checked in the order h, r, t and the first failing check
    raises. If all columns are valid, a warning (not an error) is emitted
    when the same (h, r, t) triple appears more than once.

    :param kg_df:
        The Knowledge Graph DataFrame.
    :param h:
        The name of the column with the IDs of head entities.
    :param r:
        The name of the column with the IDs of relation types.
    :param t:
        The name of the column with the IDs of tail entities.

    :raises ValueError:
        If one of the columns h, r, t is not present in `kg_df`.
    :raises TypeError:
        If one of the columns h, r, t is not of an integer dtype.
    """
    # check h,r,t columns are present and of an integer type
    for col_name in (h, r, t):
        if col_name not in kg_df.columns:
            # Report the available column names rather than interpolating the
            # whole DataFrame, whose repr is a multi-line dump of its rows.
            raise ValueError(
                f"DataFrame has no column named {col_name}"
                f" (available columns: {list(kg_df.columns)})"
            )
        if not is_integer_dtype(kg_df[col_name]):
            raise TypeError(f"Column {col_name} needs to be of an integer dtype")

    # check there are no duplicated (h,r,t) triples
    if kg_df[[h, r, t]].duplicated().any():
        warnings.warn(
            "The Knowledge Graph contains duplicated edges"
            " -- some functionalities may produce incorrect results",
            # attribute the warning to the caller of this check, not to this
            # utility's own source line
            stacklevel=2,
        )


def node_degrees_and_rels(
df: pd.DataFrame, column: str, n_entity: int, return_relation_list: bool
) -> pd.DataFrame:
Expand Down

0 comments on commit 0c08b8d

Please sign in to comment.