diff --git a/.flake8 b/.flake8 new file mode 100644 index 0000000..b7d7cb4 --- /dev/null +++ b/.flake8 @@ -0,0 +1,13 @@ +######################### +# Flake8 Configuration # +# (.flake8) # +######################### +[flake8] +ignore = + # line too long + E501 + # line break before binary operator + W503 + # whitespace before ':' + E203 +max-line-length = 90 diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml new file mode 100644 index 0000000..487b761 --- /dev/null +++ b/.github/workflows/ci.yaml @@ -0,0 +1,39 @@ +name: CI + +on: + push: + branches: [ "main" ] + pull_request: + branches: [ "main" ] + workflow_dispatch: + +concurrency: + # Run everything on main, most-recent on PR builds + group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + +jobs: + ci: + runs-on: ubuntu-latest + timeout-minutes: 20 + steps: + - uses: actions/checkout@v4 + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.10' + - name: Install dev-requirements + run: | + sudo apt install pandoc + python -m pip install --upgrade pip + pip install -r requirements-dev.txt --no-cache-dir + shell: bash + - name: Run CI + run: ./dev ci + shell: bash + - name: Publish docs + if: ${{github.ref == 'refs/heads/main'}} + uses: Cecilapp/GitHub-Pages-deploy@3.2.1 + env: { GITHUB_TOKEN: "${{ github.token }}" } + with: + build_dir: docs/build/html/ diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..59d7a62 --- /dev/null +++ b/.gitignore @@ -0,0 +1,177 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# ONNX checkpoints +*.onnx + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# Data +data/ +datasets/ +*.pkl +*.pt + +# PyInstaller +# Usually these files are written by a python script 
from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/build/ +docs/source/generated +docs/source/api + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# VSCode +.vscode/ + +#WandB +wandb/ + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv* +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..51ce057 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2023 Graphcore Ltd. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/NOTICE.md b/NOTICE.md new file mode 100644 index 0000000..24bc565 --- /dev/null +++ b/NOTICE.md @@ -0,0 +1,90 @@ +Copyright (c) 2023 Graphcore Ltd. Licensed under the MIT License. + +The included code is released under an MIT license (see [LICENSE](LICENSE)). + +## Dependencies + +Our dependencies are (see [requirements.txt](requirements.txt)): + +| Component | About | License | +| --- | --- | --- | +| numpy | Array processing library | BSD 3-Clause | +| pandas | Structured data analysis library | BSD 3-Clause | +| scipy | Mathematical routines library | BSD 3-Clause | + +We also use additional Python dependencies for development/testing/documentation (see [requirements-dev.txt](requirements-dev.txt)). + +The [tutorial notebook](docs/source/notebooks/ogb_biokg_demo.ipynb) makes use of the [ogbl-biokg](https://ogb.stanford.edu/docs/linkprop/#ogbl-biokg) dataset, licensed under CC-0. + +## Derived work + +This directory includes derived work from the following: + +--- + +Sphinx: https://github.com/sphinx-doc/sphinx, licensed under: + +> Unless otherwise indicated, all code in the Sphinx project is licenced under the +> two clause BSD licence below. +> +> Copyright (c) 2007-2023 by the Sphinx team (see AUTHORS file). +> All rights reserved. +> +> Redistribution and use in source and binary forms, with or without +> modification, are permitted provided that the following conditions are +> met: +> +> * Redistributions of source code must retain the above copyright +> notice, this list of conditions and the following disclaimer. 
+> +> * Redistributions in binary form must reproduce the above copyright +> notice, this list of conditions and the following disclaimer in the +> documentation and/or other materials provided with the distribution. +> +> THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +> "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +> LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +> A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +> HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +> SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +> LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +> DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +> THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +> (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +> OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +this applies to: +- `docs/source/_templates/module.rst` (modified) +- `docs/source/_templates/package.rst` (modified) +- `docs/source/_templates/toc.rst` (modified) + +--- + +The Example: Basic Sphinx project for Read the Docs: https://github.com/readthedocs-examples/example-sphinx-basic, licensed under: + +> MIT License +> +> Copyright (c) 2022 Read the Docs Inc +> +> Permission is hereby granted, free of charge, to any person obtaining a copy +> of this software and associated documentation files (the "Software"), to deal +> in the Software without restriction, including without limitation the rights +> to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +> copies of the Software, and to permit persons to whom the Software is +> furnished to do so, subject to the following conditions: +> +> The above copyright notice and this permission notice shall be included in all +> copies or substantial portions of the Software. 
+> +> THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +> IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +> FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +> AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +> LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +> OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +> SOFTWARE. + +this applies to: +- `docs/source/conf.py` (modified) +- `docs/make.bat` +- `docs/Makefile` \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..babb423 --- /dev/null +++ b/README.md @@ -0,0 +1,32 @@ +# KG Topology Toolbox +![Continuous integration](https://github.com/graphcore-research/kg-topology-toolbox/actions/workflows/ci.yaml/badge.svg) + +A Python toolbox to compute topological metrics and statistics for Knowledge Graphs. + +Documentation can be found at https://curly-barnacle-lnejye6.pages.github.io/ + +For a walkthrough of the main functionalities, we provide an introductory [Jupyter notebook](docs/source/notebooks/ogb_biokg_demo.ipynb). + +## Usage + +Tested on Ubuntu 20.04, Python >=3.8 + +To install the `kg-topology-toolbox` library, run + +``` +pip install wheel +pip install git+ssh://git@github.com/graphcore-research/kg-topology-toolbox +``` + +Then import and use: +```python +from kg_topology_toolbox import KGTopologyToolbox +``` + +## License + +Copyright (c) 2023 Graphcore Ltd. Licensed under the MIT License. + +The included code is released under the MIT license (see [details of the license](LICENSE)). + +See [notices](NOTICE.md) for dependencies, credits, derived work and further details. diff --git a/dev b/dev new file mode 100755 index 0000000..763e4b9 --- /dev/null +++ b/dev @@ -0,0 +1,171 @@ +#!/usr/bin/env python3 +# Copyright (c) 2023 Graphcore Ltd. All rights reserved. 
+ +# Code derived from +# https://github.com/graphcore-research/poptorch-experimental-addons/blob/main/dev +# Copyright (c) 2023 Graphcore Ltd +# Licensed under the MIT License (credits @DouglasOrr) + +"""Dev task launcher.""" + +import argparse +import datetime +import os +import subprocess +import sys +from pathlib import Path +from typing import Any, Callable, Iterable, List, Optional, TypeVar + +# Utilities + + +def run(command: Iterable[Any], gdb: bool = False) -> None: + """Run a command, terminating on failure.""" + cmd = [str(arg) for arg in command if arg is not None] + if gdb: + cmd = ["gdb", "-ex", "catch throw", "-ex", "run", "--args"] + cmd + print("$ " + " ".join(cmd), file=sys.stderr) + environ = os.environ.copy() + environ["PYTHONPATH"] = f"{os.getcwd()}:{environ.get('PYTHONPATH', '')}" + exit_code = subprocess.call(cmd, env=environ) + if exit_code: + sys.exit(exit_code) + + +T = TypeVar("T") + + +def cli(*args: Any, **kwargs: Any) -> Callable[[T], T]: + """Declare a CLI command / arguments for that command.""" + + def wrap(func: T) -> T: + if not hasattr(func, "cli_args"): + setattr(func, "cli_args", []) + if args or kwargs: + getattr(func, "cli_args").append((args, kwargs)) + return func + + return wrap + + +# Commands + +PYTHON_ROOTS = ["src/kg_topology_toolbox", "tests", "dev"] + + +@cli("-k", "--filter") +@cli("--gdb", action="store_true") +def tests(filter: Optional[str], gdb: bool) -> None: + """run Python tests""" + run( + [ + "python", + "-m", + "pytest", + "tests", + None if filter else "--cov=kg_topology_toolbox", + *(["-k", filter] if filter else []), + ], + gdb=gdb, + ) + + +@cli() +def lint() -> None: + """run static analysis""" + run(["python", "-m", "flake8", *PYTHON_ROOTS]) + run(["python", "-m", "mypy", *PYTHON_ROOTS]) + + +@cli("--check", action="store_true") +def format(check: bool) -> None: + """autoformat all sources""" + run(["python", "-m", "black", "--check" if check else None, *PYTHON_ROOTS]) + run(["python", "-m", "isort", 
"--check" if check else None, *PYTHON_ROOTS]) + + +@cli() +def copyright() -> None: + """check for Graphcore copyright headers on relevant files""" + command = ( + "find " + " ".join(PYTHON_ROOTS) + " -type f -not -name *.pyc" + " | xargs grep -L 'Copyright (c) 202. Graphcore Ltd[.] All rights reserved[.]'" + ) + print(f"$ {command}", file=sys.stderr) + # Note: grep exit codes are not consistent between versions, so we don't use + # check=True + output = ( + subprocess.run( + command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT + ) + .stdout.decode() + .strip() + ) + if output: + print( + "Error - failed copyright header check in:\n " + + output.replace("\n", "\n "), + file=sys.stderr, + ) + print("Template(s):") + comment_prefixes = { + {".cpp": "//"}.get(Path(f).suffix, "#") for f in output.split("\n") + } + for prefix in comment_prefixes: + print( + f"{prefix} Copyright (c) {datetime.datetime.now().year}" + " Graphcore Ltd. All rights reserved.", + file=sys.stderr, + ) + sys.exit(1) + + +@cli() +def doc() -> None: + """generate Sphinx documentation""" + subprocess.call(["rm", "-r", "docs/build"]) + subprocess.call(["rm", "-r", "docs/source/api"]) + subprocess.call(["rm", "-r", "docs/source/generated"]) + run(["make", "clean", "-C", "docs/"]) + run(["make", "html", "-C", "docs/"]) + + +@cli("--skip", nargs="*", default=[], help="commands to skip") +def ci(skip: List[str] = []) -> None: + """run continuous integration tests & checks + doc build""" + if "lint" not in skip: + lint() + if "format" not in skip: + format(check=True) + if "copyright" not in skip: + copyright() + if "tests" not in skip: + tests(filter=None, gdb=False) + if "doc" not in skip: + doc() + + +# Script + + +def _main() -> None: + parser = argparse.ArgumentParser( + description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter + ) + parser.set_defaults(action=ci) + + subs = parser.add_subparsers() + for key, value in globals().items(): + if hasattr(value, 
"cli_args"): + sub = subs.add_parser(key.replace("_", "-"), help=value.__doc__) + for args, kwargs in value.cli_args: + sub.add_argument(*args, **kwargs) + sub.set_defaults(action=value) + + cli_args = vars(parser.parse_args()) + action = cli_args.pop("action") + action(**cli_args) + + +if __name__ == "__main__": + _main() diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 0000000..43eab3f --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,22 @@ +# Copyright (c) 2022 Read the Docs Inc. All rights reserved. + +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = source +BUILDDIR = build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 0000000..c370eac --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,36 @@ +:: Copyright (c) 2022 Read the Docs Inc. All rights reserved. +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=source +set BUILDDIR=build + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. 
+ echo.If you don't have Sphinx installed, grab it from + echo.https://www.sphinx-doc.org/ + exit /b 1 +) + +if "%1" == "" goto help + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/docs/source/API_reference.rst b/docs/source/API_reference.rst new file mode 100644 index 0000000..eb5d47d --- /dev/null +++ b/docs/source/API_reference.rst @@ -0,0 +1,11 @@ +API Reference +====================================== + +.. autosummary:: + :toctree: generated + :template: module.rst + :recursive: + + kg_topology_toolbox.topology_toolbox + kg_topology_toolbox.utils + \ No newline at end of file diff --git a/docs/source/_templates/class.rst b/docs/source/_templates/class.rst new file mode 100644 index 0000000..bcfcb17 --- /dev/null +++ b/docs/source/_templates/class.rst @@ -0,0 +1,11 @@ +.. + # Copyright (c) 2023 Graphcore Ltd. All rights reserved. + # Copyright (c) 2007-2023 by the Sphinx team. All rights reserved. + +{{ fullname | escape | underline}} + +.. currentmodule:: {{ module }} + +.. autoclass:: {{ objname }} + :members: + :inherited-members: Module \ No newline at end of file diff --git a/docs/source/_templates/module.rst b/docs/source/_templates/module.rst new file mode 100644 index 0000000..b3fda69 --- /dev/null +++ b/docs/source/_templates/module.rst @@ -0,0 +1,60 @@ +.. + # Copyright (c) 2023 Graphcore Ltd. All rights reserved. + # Copyright (c) 2007-2023 by the Sphinx team. All rights reserved. + +{{ fullname | escape | underline}} + +.. automodule:: {{ fullname }} + + {% block attributes %} + {% if attributes %} + .. rubric:: {{ _('Module Attributes') }} + + .. autosummary:: + :toctree: + {% for item in attributes %} + {{ item }} + {%- endfor %} + {% endif %} + {% endblock %} + + {% block functions %} + {% if functions %} + .. rubric:: {{ _('Functions') }} + + .. 
autosummary:: + :toctree: + {% for item in functions %} + {{ item }} + {%- endfor %} + {% endif %} + {% endblock %} + + {% block classes %} + {% if classes %} + .. rubric:: {{ _('Classes') }} + + .. autosummary:: + :toctree: + :template: class.rst + {% for item in classes %} + {{ item }} + {%- endfor %} + {% endif %} + {% endblock %} + +{% block modules %} +{% if modules %} +.. rubric:: Modules + +.. autosummary:: + :toctree: + :template: class.rst + :recursive: +{% for item in modules %} + {% if "test" not in item and "docs" not in item %} + {{ item }} + {% endif %} +{%- endfor %} +{% endif %} +{% endblock %} diff --git a/docs/source/_templates/package.rst b/docs/source/_templates/package.rst new file mode 100644 index 0000000..325848d --- /dev/null +++ b/docs/source/_templates/package.rst @@ -0,0 +1,61 @@ +.. + # Copyright (c) 2023 Graphcore Ltd. All rights reserved. + # Copyright (c) 2007-2023 by the Sphinx team. All rights reserved. + +{%- macro automodule(modname, options) -%} +.. automodule:: {{ modname }} +{%- for option in options %} + :{{ option }}: +{%- endfor %} +{%- endmacro %} + +{%- macro toctree(docnames) -%} +.. toctree:: + :maxdepth: {{ maxdepth }} +{% for docname in docnames %} + {{ docname }} +{%- endfor %} +{%- endmacro %} + +{%- if is_namespace %} +{{- [pkgname, "namespace"] | join(" ") | e | heading }} +{% else %} +{{- [pkgname, "package"] | join(" ") | e | heading }} +{% endif %} + +{%- if is_namespace %} +.. 
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.
# Copyright (c) 2022 Read the Docs Inc. All rights reserved.

import os
import sys

# Make both the repository root and the `src/` layout importable so autodoc
# can find `kg_topology_toolbox`. Both paths are derived from this file's
# location, so they do not depend on the directory the build is started from.
_REPO_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))
sys.path.insert(0, _REPO_ROOT)
sys.path.insert(0, os.path.join(_REPO_ROOT, "src"))

# Configuration file for the Sphinx documentation builder.
#
# For the full list of built-in configuration values, see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html

# -- Project information -----------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information

project = "KG Topology Toolbox"
copyright = "(c) 2023 Graphcore Ltd. All rights reserved"
author = "Alberto Cattaneo, Daniel Justus"

# -- General configuration ---------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration

# Each extension is listed exactly once.
extensions = [
    "sphinx.ext.todo",
    "sphinx.ext.viewcode",
    "sphinx.ext.autodoc",
    "sphinx.ext.napoleon",
    "sphinx.ext.mathjax",
    "sphinx.ext.autosummary",
    "sphinx_autodoc_typehints",
    "sphinx_automodapi.automodapi",
    "sphinx_automodapi.smart_resolver",
    "sphinx.ext.intersphinx",
    "sphinx.ext.autosectionlabel",
    "myst_parser",
    "nbsphinx",
]
# NOTE(review): this looks like a numpydoc option, but numpydoc is not in
# `extensions` above - confirm whether it is still needed.
numpydoc_show_class_members = False
todo_include_todos = True
autosummary_generate = True
autoclass_content = "both"
autodoc_typehints = "both"
# Docstrings are parsed as Google style only.
napoleon_google_docstring = True
napoleon_numpy_docstring = False

templates_path = ["_templates"]
exclude_patterns = []


# -- Options for HTML output -------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output

html_theme = "sphinx_rtd_theme"
html_static_path = ["_static"]


# External inventories used to resolve cross-references to other projects.
intersphinx_mapping = {
    "python": ("https://docs.python.org/3", None),
    "numpy": ("https://numpy.org/doc/stable", None),
    "pandas": ("https://pandas.pydata.org/pandas-docs/dev", None),
    "scipy": ("https://docs.scipy.org/doc/scipy/", None),
    "rtd": ("https://docs.readthedocs.io/en/stable/", None),
    "sphinx": ("https://www.sphinx-doc.org/en/master/", None),
}
100644 index 0000000..85b9dd9 Binary files /dev/null and b/docs/source/images/edge_patterns.png differ diff --git a/docs/source/index.rst b/docs/source/index.rst new file mode 100644 index 0000000..480c493 --- /dev/null +++ b/docs/source/index.rst @@ -0,0 +1,12 @@ +KG Topology Toolbox +====================================== + +.. automodule:: kg_topology_toolbox + +.. toctree:: + :maxdepth: 3 + :caption: Contents + + User guide + notebooks/ogb_biokg_demo + API reference diff --git a/docs/source/notebooks/ogb_biokg_demo.ipynb b/docs/source/notebooks/ogb_biokg_demo.ipynb new file mode 100644 index 0000000..83729b7 --- /dev/null +++ b/docs/source/notebooks/ogb_biokg_demo.ipynb @@ -0,0 +1,2269 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# KGTopologyToolbox walk-through\n", + "\n", + "Copyright (c) 2024 Graphcore Ltd. All rights reserved.\n", + "\n", + "In this notebook we give a general overview of the classes and methods included in the `kg-topology-toolbox` library and explain how to use them to extract topological data from any knowledge graph. 
As an example, we use the open-source biomedical dataset [ogbl-biokg](https://ogb.stanford.edu/docs/linkprop/#ogbl-biokg).\n", + "\n", + "## Dependencies" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Found existing installation: kg-topology-toolbox 0.1.0\n", + "Uninstalling kg-topology-toolbox-0.1.0:\n", + " Successfully uninstalled kg-topology-toolbox-0.1.0\n" + ] + } + ], + "source": [ + "import sys\n", + "!{sys.executable} -m pip uninstall -y kg_topology_toolbox\n", + "!pip install -q git+ssh://git@github.com/graphcore-research/kg-topology-toolbox\n", + "!pip install -q jupyter ipywidgets ogb seaborn" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "\n", + "import ogb.linkproppred\n", + "from kg_topology_toolbox import KGTopologyToolbox\n", + "\n", + "dataset_directory = \"../../../data/ogb-biokg/\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Data preparation\n", + "\n", + "We load the OGBL-BioKG dataset using the `ogb.linkproppred.LinkPropPredDataset` class and store all (h, r, t) triples in a `pandas` DataFrame." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
hrt
0171803207
14903013662
25480015999
3314807247
410300016202
............
50884292451505097
50884306456508833
508843194845015873
5088432636550496
508843313860506368
\n", + "

5088434 rows × 3 columns

\n", + "
" + ], + "text/plain": [ + " h r t\n", + "0 1718 0 3207\n", + "1 4903 0 13662\n", + "2 5480 0 15999\n", + "3 3148 0 7247\n", + "4 10300 0 16202\n", + "... ... .. ...\n", + "5088429 2451 50 5097\n", + "5088430 6456 50 8833\n", + "5088431 9484 50 15873\n", + "5088432 6365 50 496\n", + "5088433 13860 50 6368\n", + "\n", + "[5088434 rows x 3 columns]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset = ogb.linkproppred.LinkPropPredDataset(\n", + " name=\"ogbl-biokg\", root=dataset_directory\n", + ")\n", + "\n", + "all_triples = []\n", + "for split in dataset.get_edge_split().values():\n", + " all_triples.append(np.stack([split[\"head\"], split[\"relation\"], split[\"tail\"]]).T)\n", + "biokg_df = pd.DataFrame(np.concatenate(all_triples), columns=[\"h\", \"r\", \"t\"])\n", + "biokg_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Based on this representation of the knowledge graph, we can proceed to compute its topological properties using the `KGTopologyToolbox` class." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "kgtt = KGTopologyToolbox()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Node-level analysis\n", + "\n", + "The method `node_degree_summary` provides a summary of the degrees of each individual node in the knowledge graph. The returned dataframe is indexed on the node ID.\n", + "\n", + "- `h_degree` is the number of edges coming out from the node;\n", + "- `t_degree` is the number of edges going into the node;\n", + "- `tot_degree` is the number of edges that use the node as either head or tail;\n", + "- `h_unique_rel` (resp. `t_unique_rel`) is the number of unique relation types coming out of (resp. going into) the node;\n", + "- `n_loops` is the number of loop edges around the node." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
h_degreet_degreetot_degreeh_unique_relt_unique_reln_loops
0277299440
11494108360
220895303570
328999261545515310110
436230266411120
.....................
45080212243110
45081293261110
45082283058110
45083171936110
45084283159110
\n", + "

45085 rows × 6 columns

\n", + "
" + ], + "text/plain": [ + " h_degree t_degree tot_degree h_unique_rel t_unique_rel n_loops\n", + "0 27 72 99 4 4 0\n", + "1 14 94 108 3 6 0\n", + "2 208 95 303 5 7 0\n", + "3 28999 26154 55153 10 11 0\n", + "4 362 302 664 11 12 0\n", + "... ... ... ... ... ... ...\n", + "45080 21 22 43 1 1 0\n", + "45081 29 32 61 1 1 0\n", + "45082 28 30 58 1 1 0\n", + "45083 17 19 36 1 1 0\n", + "45084 28 31 59 1 1 0\n", + "\n", + "[45085 rows x 6 columns]" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "node_ds = kgtt.node_degree_summary(biokg_df)\n", + "node_ds" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/nethome/albertoc/research/knowledge_graphs/kg-topology-toolbox/.venv38/lib/python3.8/site-packages/pandas/core/arraylike.py:396: RuntimeWarning: divide by zero encountered in log2\n", + " result = getattr(ufunc, method)(*inputs, **kwargs)\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "metrics = [\n", + " \"h_degree\",\n", + " \"t_degree\",\n", + "]\n", + "fig, ax = plt.subplots(1, len(metrics), figsize=(4.5 * len(metrics), 4))\n", + "\n", + "for i, metric in enumerate(metrics):\n", + " x = np.log2(node_ds[metric])\n", + " sns.histplot(\n", + " x=x, stat=\"probability\", binwidth=1, binrange=[0, x.max() + 1], ax=ax[i]\n", + " )\n", + " ax[i].set_xlabel(f\"log2({metric})\")\n", + "plt.tight_layout()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "metrics = [\n", + " \"h_unique_rel\",\n", + " \"t_unique_rel\",\n", + "]\n", + "fig, ax = plt.subplots(1, len(metrics), figsize=(4.5 * len(metrics), 4))\n", + "\n", + "for i, metric in enumerate(metrics):\n", + " x = node_ds[metric]\n", + " sns.histplot(\n", + " x=x, stat=\"probability\", binwidth=1, binrange=[0, x.max() + 1], ax=ax[i]\n", + " )\n", + " ax[i].set_xlabel(f\"{metric}\")\n", + "plt.tight_layout()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Edge-level analysis\n", + "\n", + "### Edge degrees and cardinality\n", + "\n", + "The method `edge_degree_cardinality_summary` provides, for each edge (h, r, t) in the KG, detailed information on the connectivity patterns of the head and tail nodes:\n", + "\n", + "- `h_unique_rel` (resp. `t_unique_rel`) is the number of unique relation types coming out of the head node (resp. going into the tail node);\n", + "- `h_degree` is the out-degree of the head node and `h_degree_same_rel` is the degree when only considering edges of the same relation type `r`;\n", + "- `t_degree` is the in-degree of the tail node and `t_degree_same_rel` is the degree when only considering edges of the same relation type `r`;\n", + "- `tot_degree` is the total number of edges with either head entity `h` or tail entity `t` (in particular, `tot_degree <= h_degree + t_degree`); `tot_degree_same_rel` is computed only considering edges of the same relation type `r`;\n", + "- `triple_cardinality` is the cardinality type of the edge:\n", + " - _one-to-one_ (1:1) if `h_degree = 1`, `t_degree = 1`;\n", + " - _one-to-many_ (1:M) if `h_degree > 1`, `t_degree = 1`;\n", + " - _many-to-one_ (M:1) if `h_degree = 1`, `t_degree > 1`;\n", + " - _many-to-many_ (M:M) if `h_degree > 1`, `t_degree > 1`.\n", + "- `triple_cardinality_same_rel` is defined as `triple_cardinality` but using `h_degree_same_rel`, `t_degree_same_rel`.\n" + ] + }, + { 
+ "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
hrth_unique_relh_degreeh_degree_same_relt_unique_relt_degreet_degree_same_reltot_degreetot_degree_same_reltriple_cardinalitytriple_cardinality_same_rel
0171803207519111664614236129M:MM:M
149030136628544339197550251882M:MM:M
25480015999310854722217926M:MM:M
331480724741109911673271782369M:MM:M
4103000162024414315614831561345M:MM:M
..........................................
508842924515050975636272108032721437543M:MM:M
5088430645650883310743259103711001111358M:MM:M
508843194845015873865221364861631135375M:MM:M
50884326365504969922277196181731537449M:MM:M
50884331386050636874851758455147939321M:MM:M
\n", + "

5088434 rows × 13 columns

\n", + "
" + ], + "text/plain": [ + " h r t h_unique_rel h_degree h_degree_same_rel \\\n", + "0 1718 0 3207 5 191 116 \n", + "1 4903 0 13662 8 544 33 \n", + "2 5480 0 15999 3 108 5 \n", + "3 3148 0 7247 4 110 99 \n", + "4 10300 0 16202 4 414 315 \n", + "... ... .. ... ... ... ... \n", + "5088429 2451 50 5097 5 636 272 \n", + "5088430 6456 50 8833 10 743 259 \n", + "5088431 9484 50 15873 8 652 213 \n", + "5088432 6365 50 496 9 922 277 \n", + "5088433 13860 50 6368 7 485 175 \n", + "\n", + " t_unique_rel t_degree t_degree_same_rel tot_degree \\\n", + "0 6 46 14 236 \n", + "1 9 1975 50 2518 \n", + "2 4 72 22 179 \n", + "3 11 673 271 782 \n", + "4 6 148 31 561 \n", + "... ... ... ... ... \n", + "5088429 10 803 272 1437 \n", + "5088430 10 371 100 1111 \n", + "5088431 6 486 163 1135 \n", + "5088432 19 618 173 1537 \n", + "5088433 8 455 147 939 \n", + "\n", + " tot_degree_same_rel triple_cardinality triple_cardinality_same_rel \n", + "0 129 M:M M:M \n", + "1 82 M:M M:M \n", + "2 26 M:M M:M \n", + "3 369 M:M M:M \n", + "4 345 M:M M:M \n", + "... ... ... ... \n", + "5088429 543 M:M M:M \n", + "5088430 358 M:M M:M \n", + "5088431 375 M:M M:M \n", + "5088432 449 M:M M:M \n", + "5088433 321 M:M M:M \n", + "\n", + "[5088434 rows x 13 columns]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "edge_dcs = kgtt.edge_degree_cardinality_summary(biokg_df)\n", + "edge_dcs" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The data on the distribution of degrees and cardinalities can then be easily visualized." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Edge frequency when binning by head and tail degree\n", + "\n", + "metrics = [(\"h_degree\", \"t_degree\"), (\"h_degree_same_rel\", \"t_degree_same_rel\")]\n", + "fig, ax = plt.subplots(1, len(metrics), figsize=[5 * len(metrics), 4.5])\n", + "\n", + "for i, (group_metric_1, group_metric_2) in enumerate(metrics):\n", + " df_empty = pd.DataFrame(\n", + " columns=np.int32(2 ** np.arange(15)), index=np.int32(2 ** np.arange(15))\n", + " )\n", + " df_tmp = edge_dcs[[group_metric_1, group_metric_2]]\n", + " df_tmp.insert(\n", + " 0,\n", + " f\"log_{group_metric_1}\",\n", + " np.int32(2 ** np.floor(np.log2(df_tmp[group_metric_1]))),\n", + " )\n", + " df_tmp.insert(\n", + " 0,\n", + " f\"log_{group_metric_2}\",\n", + " np.int32(2 ** np.floor(np.log2(df_tmp[group_metric_2]))),\n", + " )\n", + " df_tmp = (\n", + " df_tmp.groupby([f\"log_{group_metric_1}\", f\"log_{group_metric_2}\"])\n", + " .count()\n", + " .reset_index()\n", + " )\n", + " df_tmp[group_metric_1] /= df_tmp[group_metric_1].sum()\n", + " sns.heatmap(\n", + " df_tmp.reset_index()\n", + " .pivot(\n", + " columns=f\"log_{group_metric_2}\",\n", + " index=f\"log_{group_metric_1}\",\n", + " values=group_metric_1,\n", + " )\n", + " .combine_first(df_empty),\n", + " annot=False,\n", + " vmin=0,\n", + " vmax=0.05,\n", + " ax=ax[i],\n", + " )\n", + " ax[i].set_xlabel(group_metric_2)\n", + " ax[i].set_ylabel(group_metric_1)\n", + " ax[i].invert_yaxis()\n", + "fig.suptitle(\"Edge frequency\")\n", + "plt.tight_layout()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAxYAAAGMCAYAAABH3DSrAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuNSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/xnp5ZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAA+1klEQVR4nO3de1iUdf7/8dcAMh4QPKAohuAxxXOYhGa6m0UeMN3NXMtQUNOSMtkO2kG0tqjMQ6uupqVkm2mWWa6tqaS5HjqI2eZmlopiHvCUoKhgcP/+8Od8nTg4cA8M4zwf18V1OZ+5D++5Ge83r7kPYzEMwxAAAAAAmODl6gIAAAAAuD+CBQAAAADTCBYAAAAATCNYAAAAADCNYAEAAADANIIFAAAAANMIFgAAAABMI1gAAAAAMI1gAQAAAMA0ggUAlMHGjRtlsVi0ceNG29jw4cMVFhZWrutNSUmRxWLRgQMHbGM9e/ZUz549y3W9JSnqdVssFk2ePLnc113U76Fnz55q27Ztua9bkg4cOCCLxaKUlJQKWR8AVGYECwBuad++fRo9erSaNm2qqlWryt/fX926ddPrr7+uCxcuuLo8lzpy5IgmT56snTt3urqUUlmyZIlmzpzp6jKKVJlrA4DKwsfVBQBAaa1evVqDBg2S1WpVbGys2rZtq7y8PG3evFlPPPGE/ve//2n+/PkVXteCBQtUUFBQ4etdu3at3eMjR45oypQpCgsLU8eOHSu8Hkm6cOGCfHxK12KWLFmiXbt26bHHHnN4nttuu00XLlyQr69vKSssneJqCw0N1YULF1SlSpVyXT8AuAOCBQC3kp6err/85S8KDQ3V559/roYNG9qeGzt2rPbu3avVq1ebXo9hGLp48aKqVavm8Dyu+uOyvP+oLouqVauW6/IvXrwoX19feXl5lfu6SmKxWFy6fgCoTDgVCoBbefXVV3Xu3Dm99dZbdqHiiubNm2vcuHG2x4sWLdIf//hH1a9fX1arVeHh4Zo7d26h+cLCwtSvXz999tln6ty5s6pVq6Y33nhDkvTLL79owIABqlGjhurXr6/x48crNze30DJ+f63BlfPvX3vtNc2fP1/NmjWT1WrVzTffrG+++cZu3v/+978aPny47dSuBg0aKD4+XqdOnbrmNrn6GouNGzfq5ptvliTFxcXJYrHYrgFISkpSlSpVdOLEiULLePDBB1WrVi1dvHixxHWtXLlSbdu2VdWqVdW2bVt99NFHRU73+2sszp49q8cee0xhYWGyWq2qX7++7rjjDu3YscP2GlavXq2DBw/aar6yLa9cR7F06VI9++yzatSokapXr67s7Owir7G4Ii0tTV27dlW1atXUpEkTzZs3z+75oq5XuXp9V5ZZUm3FXWPx+eefq3v37qpRo4Zq1aqlu+++W7t377abZvLkybJYLNq7d6+GDx+uWrVqKSAgQHFxcTp//nzxvwQAqKQ4YgHAraxatUpNmzZV165dHZp+7ty5atOmjfr37y8fHx+tWrVKDz/8sAoKCjR27Fi7affs2aMhQ4Zo9OjRGjVqlG688UZduHBBt99+uzIyMvToo48qODhY77zzjj7//HOHa16yZInOnj2r0aNHy2Kx6NVXX9Wf/vQn7d+/33aUY926ddq/f7/i4uLUoEED2+lc//vf//Tll1/KYrE4tK7WrVvr+eef16RJk/Tggw+qe/fukqSuXbvq1ltv1fPPP69ly5YpISHBNk9eXp4++OAD/fnPfy7x0/e1a9fqz3/+s8LDw5WcnKxTp04pLi5ON9xwwzXrGjNmjD744AMlJCQoPDxcp06d0ubNm7V7927ddNNNeuaZZ5SVlaVffvlFM2bMkCT5+fnZLeOFF16Qr6+vHn/8ceXm5pZ4pObXX39Vnz59dO+992rIkCF6//339dBDD8nX11fx8fHXrPdqjtR2tfXr16t3795q2rSpJk+erAsXLmjWrFnq1q2bduzYUehC93vvvVdNmjRRcnKyduz
YoTfffFP169fXK6+8Uqo6AcDlDABwE1lZWYYk4+6773Z4nvPnzxcai46ONpo2bWo3Fhoaakgy1qxZYzc+c+ZMQ5Lx/vvv28ZycnKM5s2bG5KMDRs22MaHDRtmhIaG2h6np6cbkoy6desap0+fto1//PHHhiRj1apVJdb53nvvGZKMTZs22cYWLVpkSDLS09NtYz169DB69Ohhe/zNN98YkoxFixYVWmZUVJQRGRlpN7ZixYpCr6UoHTt2NBo2bGicOXPGNrZ27VpDkt3rNgzDkGQkJSXZHgcEBBhjx44tcfl9+/YttBzDMIwNGzYYkoymTZsW2k5Xnru69h49ehiSjGnTptnGcnNzjY4dOxr169c38vLyDMMoelsWt8ziarvyO756W19Zz6lTp2xj3333neHl5WXExsbaxpKSkgxJRnx8vN0yBw4caNStW7fQugCgsuNUKABuIzs7W5JUs2ZNh+e5+hqJrKwsnTx5Uj169ND+/fuVlZVlN22TJk0UHR1tN/bpp5+qYcOGuueee2xj1atX14MPPuhwDYMHD1bt2rVtj68cRdi/f3+RdV68eFEnT57ULbfcIkm204WcITY2Vl999ZX27dtnG3v33XcVEhKiHj16FDvf0aNHtXPnTg0bNkwBAQG28TvuuEPh4eHXXG+tWrX01Vdf6ciRI2WufdiwYQ5f8+Lj46PRo0fbHvv6+mr06NE6fvy40tLSylzDtVzZTsOHD1edOnVs4+3bt9cdd9yhTz/9tNA8Y8aMsXvcvXt3nTp1yvZ+BwB3QbAA4Db8/f0lXT5f31FbtmxRr169bOe616tXT08//bQkFRksfu/gwYNq3rx5oVORbrzxRodraNy4sd3jKyHj119/tY2dPn1a48aNU1BQkKpVq6Z69erZ6vl9nWYMHjxYVqtV7777rm3Z//rXv3T//feXeLrVwYMHJUktWrQo9Jwj2+LVV1/Vrl27FBISoi5dumjy5Ml2wcoRRf1+ihMcHKwaNWrYjbVs2VKSCl1T4UxXtlNR26R169Y6efKkcnJy7MYdeX8AgDsgWABwG/7+/goODtauXbscmn7fvn26/fbbdfLkSU2fPl2rV6/WunXrNH78eEkqdGvY0twBqjS8vb2LHDcMw/bve++9VwsWLNCYMWO0YsUKrV27VmvWrCmyTjNq166tfv362YLFBx98oNzcXA0dOtRp6yjKvffeq/3792vWrFkKDg7W1KlT1aZNG/373/92eBnO/v0UF6Ty8/Odup5rceT9AQDugGABwK3069dP+/bt07Zt26457apVq5Sbm6tPPvlEo0ePVp8+fdSrV69S/YEaGhqqffv2Ffojb8+ePaWuvTi//vqrUlNTNWHCBE2ZMkUDBw7UHXfcoaZNm5Zpede60Ds2NlY//fSTvvnmG7377rvq1KmT2rRpU+I8oaGhkqSff/650HOObouGDRvq4Ycf1sqVK5Wenq66devqxRdfdLju0jhy5EihIwM//fSTJNkunr5yZODMmTN201056nA1R2u7sp2K2iY//vijAgMDCx1JAYDrBcECgFt58sknVaNGDY0cOVKZmZmFnt+3b59ef/11Sf/3SfDVoSArK0uLFi1yeH19+vTRkSNH9MEHH9jGzp8/79Qv4CuqTkll/qbnK3+4/v4P5it69+6twMBAvfLKK/riiy8cOlrRsGFDdezYUW+//bbdqVnr1q3TDz/8UOK8+fn5hU7nql+/voKDg+1u21ujRg2nnfb122+/2W4XLF2+89Ubb7yhevXqKSIiQpLUrFkzSdKmTZvsai3qd+tobVdvp6u3/65du7R27Vr16dOnrC8JACo9bjcLwK00a9ZMS5Ys0eDBg9W6dWu7b97eunWrli9fruHDh0uS7rzzTvn6+iomJkajR4/WuXPntGDBAtWvX19Hjx51aH2jRo3S7NmzFRsbq7S0NDVs2FDvvPOOqlev7rTX5O/vr9tuu02vvvqqLl26pEaNGmnt2rVKT08v0/K
aNWumWrVqad68eapZs6Zq1KihyMhI2zUKVapU0V/+8hfNnj1b3t7eGjJkiEPLTU5OVt++fXXrrbcqPj5ep0+f1qxZs9SmTRudO3eu2PnOnj2rG264Qffcc486dOggPz8/rV+/Xt98842mTZtmmy4iIkLLli1TYmKibr75Zvn5+SkmJqZM2yA4OFivvPKKDhw4oJYtW2rZsmXauXOn5s+fb7vFb5s2bXTLLbdo4sSJOn36tOrUqaOlS5fqt99+K7S80tQ2depU9e7dW1FRURoxYoTtdrMBAQF23+0BANcdl96TCgDK6KeffjJGjRplhIWFGb6+vkbNmjWNbt26GbNmzTIuXrxom+6TTz4x2rdvb1StWtUICwszXnnlFWPhwoWFbjMaGhpq9O3bt8h1HTx40Ojfv79RvXp1IzAw0Bg3bpyxZs0ah283O3Xq1ELL1O9ux/rLL78YAwcONGrVqmUEBAQYgwYNMo4cOVJoOkduN2sYl29pGx4ebvj4+BR569mvv/7akGTceeedRb7m4nz44YdG69atDavVaoSHhxsrVqwo9Lp///pyc3ONJ554wujQoYNRs2ZNo0aNGkaHDh2Mf/zjH3bznDt3zrjvvvuMWrVq2d3C9srtX5cvX16onuJuN9umTRtj+/btRlRUlFG1alUjNDTUmD17dqH59+3bZ/Tq1cuwWq1GUFCQ8fTTTxvr1q0rtMziaivqdrOGYRjr1683unXrZlSrVs3w9/c3YmJijB9++MFumiu3mz1x4oTdeHG3wQWAys5iGFwdBgCe5rvvvlPHjh21ePFiPfDAA64uBwBwHeAaCwDwQAsWLJCfn5/+9Kc/uboUAMB1gmssAMCDrFq1Sj/88IPmz5+vhIQE7lAEAHAaToUCAA8SFhamzMxMRUdH65133inVt5gDAFASggUAAAAA07jGAgAAAIBpBAsAAAAAphEsAAAAAJhGsAAAAABgGsECAAAAgGkECwAAAACmESwAAAAAmEawAAAAAGAawQIAAACAaQQLAAAAAKYRLAAAAACYRrAAAAAAYBrBAgAAAIBpBAsAAAAAphEsAAAAAJhGsAAAAABgGsECAAAAgGkECwAAAACmESwAAAAAmObj6gIqWkFBgY4cOaKaNWvKYrG4uhwAqBQMw9DZs2cVHBwsLy/P/cyJHgEA9krTHzwuWBw5ckQhISGuLgMAKqVDhw7phhtucHUZLkOPAICiOdIfPC5Y1KxZU9LljePv7+/iagCgcsjOzlZISIhtH+mp6BEAYK80/cHjgsWVQ9v+/v40DQD4HU8//YceAQBFc6Q/eO6JtAAAAACchmABAAAAwDSCBQAAAADTCBYAAAAATCNYAAAAADCNYAEAqJQ2bdqkmJgYBQcHy2KxaOXKldecZ+PGjbrppptktVrVvHlzpaSklHudAIDLCBYAgEopJydHHTp00Jw5cxyaPj09XX379tUf/vAH7dy5U4899phGjhypzz77rJwrBQBILg4WfBoFAChO79699be//U0DBw50aPp58+apSZMmmjZtmlq3bq2EhATdc889mjFjRjlXCgCQXBws+DQKAOAs27ZtU69evezGoqOjtW3btmLnyc3NVXZ2tt0PAKBsXPrN271791bv3r0dnv7qT6MkqXXr1tq8ebNmzJih6Ojo8ioTAOAGjh07pqCgILuxoKAgZWdn68KFC6pWrVqheZKTkzVlypSKKhEArmtudY0Fn0YBAJxp4sSJysrKsv0cOnTI1SUBgNty6RGL0uLTKABlEfHEYleX4HJpU2NdXUK5a9CggTIzM+3GMjMz5e/vX2R/kCSr1Sqr1VoR5QGopOgRzusRbnXEoiz4NAoAPENUVJRSU1PtxtatW6eoqCgXVQQAnsWtjljwaRQAeI5z585p7969tsfp6enauXOn6tSpo8aNG2vixIk6fPiwFi++/GnjmDFjNHv2bD355JOKj4/X559/rvfff1+rV6921UsAAI/iVkcs+DQKADzH9u3
b1alTJ3Xq1EmSlJiYqE6dOmnSpEmSpKNHjyojI8M2fZMmTbR69WqtW7dOHTp00LRp0/Tmm29ycw8AqCAuPWLBp1EAgOL07NlThmEU+3xR32PUs2dPffvtt+VYFQCgOC49YsGnUQAAAMD1waVHLPg0CgAAALg+uNU1FgAAAAAqJ4IFAAAAANMIFgAAAABMI1gAAAAAMI1gAQAAAMA0ggUAAAAA0wgWAAAAAEwjWAAAAAAwjWABAAAAwDSCBQAAAADTCBYAAAAATCNYAAAAADCNYAEAAADANIIFAAAAANMIFgAAAABMI1gAAAAAMI1gAQAAAMA0ggUAAAAA0wgWAAAAAEwjWAAAAAAwjWABAAAAwDSCBQAAAADTCBYAAAAATCNYAAAAADCNYAEAAADANIIFAAAAANMIFgAAAABMI1gAAAAAMI1gAQAAAMA0ggUAAAAA0wgWAAAAAEwjWAAAAAAwjWABAAAAwDSCBQAAAADTCBYAAAAATCNYAAAAADCNYAEAAADANIIFAAAAANMIFgAAAABMI1gAAAAAMI1gAQAAAMA0ggUAAAAA0wgWAAAAAEwjWAAAAAAwjWABAKi05syZo7CwMFWtWlWRkZH6+uuvS5x+5syZuvHGG1WtWjWFhIRo/PjxunjxYgVVCwCezeXBgqYBACjKsmXLlJiYqKSkJO3YsUMdOnRQdHS0jh8/XuT0S5Ys0YQJE5SUlKTdu3frrbfe0rJly/T0009XcOUA4JlcGixoGgCA4kyfPl2jRo1SXFycwsPDNW/ePFWvXl0LFy4scvqtW7eqW7duuu+++xQWFqY777xTQ4YMKfEDq9zcXGVnZ9v9AADKxqXBoiKaBgDA/eTl5SktLU29evWyjXl5ealXr17atm1bkfN07dpVaWlptp6wf/9+ffrpp+rTp0+x60lOTlZAQIDtJyQkxLkvBAA8iMuCRUU1DT6NAgD3c/LkSeXn5ysoKMhuPCgoSMeOHStynvvuu0/PP/+8br31VlWpUkXNmjVTz549SzyqPXHiRGVlZdl+Dh065NTXAQCexGXBoqKaBp9GAYBn2Lhxo1566SX94x//0I4dO7RixQqtXr1aL7zwQrHzWK1W+fv72/0AAMrG5Rdvl0ZZmgafRgGA+wkMDJS3t7cyMzPtxjMzM9WgQYMi53nuuef0wAMPaOTIkWrXrp0GDhyol156ScnJySooKKiIsgHAo/m4asVmm4YktWvXTjk5OXrwwQf1zDPPyMurcE6yWq2yWq3OfwEAgHLj6+uriIgIpaamasCAAZKkgoICpaamKiEhoch5zp8/X6gPeHt7S5IMwyjXegEALjxicXXTuOJK04iKiipyHpoGAHiOxMRELViwQG+//bZ2796thx56SDk5OYqLi5MkxcbGauLEibbpY2JiNHfuXC1dulTp6elat26dnnvuOcXExNh6BQCg/LjsiIV0uWkMGzZMnTt3VpcuXTRz5sxCTaNRo0ZKTk6WdLlpTJ8+XZ06dVJkZKT27t1L0wCA69TgwYN14sQJTZo0SceOHVPHjh21Zs0a27V5GRkZdh82Pfvss7JYLHr22Wd1+PBh1atXTzExMXrxxRdd9RIAwKO4NFjQNAAAJUlISCj21KeNGzfaPfbx8VFSUpKSkpIqoDIAwO+5NFhINA0AAADgeuBWd4UCAAAAUDkRLAAAAACYRrAAAAAAYBrBAgAAAIBpBAsAAAAAphEsAAAAAJhGsAAAAABgGsECAAAAgGkECwAAAACmESwAAAAAmEawAAAAAGAawQIAAACAaQQLAAAAAKYRLAAAAACYRrAAAAAAYBrBAgAAAIBpBAsAAAAAphEsAAAAAJhGsAAAAABgGsECAAAAgGkECwAAAACmESwAAAAAmEawAAAAAGAawQIAAACAaQQLAAAAAKYRLAAAAACYRrAAAAAAYBrBAgAAAIBpBAsAAAAAphEsAAAAAJhGsAAAAABgGsECAAAAgGkECwAAAACmESwAAAAAmEa
wAAAAAGAawQIAAACAaWUKFjk5Oc6uAwBwnaBHAIBnKlOwCAoKUnx8vDZv3uzsegAAbo4eAQCeqUzB4p///KdOnz6tP/7xj2rZsqVefvllHTlyxNm1AQDcED0CADxTmYLFgAEDtHLlSh0+fFhjxozRkiVLFBoaqn79+mnFihX67bffnF0nAMBN0CMAwDOZuni7Xr16SkxM1H//+19Nnz5d69ev1z333KPg4GBNmjRJ58+fd1adAAA3Q48AAM/iY2bmzMxMvf3220pJSdHBgwd1zz33aMSIEfrll1/0yiuv6Msvv9TatWudVSsAwI3QIwDAs5QpWKxYsUKLFi3SZ599pvDwcD388MMaOnSoatWqZZuma9euat26tbPqBAC4CXoEAHimMgWLuLg4/eUvf9GWLVt08803FzlNcHCwnnnmGVPFAQDcDz0CADxTma6xOHr0qN54441iG4YkVatWTUlJSWUuDADgnpzZI+bMmaOwsDBVrVpVkZGR+vrrr0uc/syZMxo7dqwaNmwoq9Wqli1b6tNPPy31awAAlF6ZgkXNmjV1/PjxQuOnTp2St7d3qZZF0wCA64uzesSyZcuUmJiopKQk7dixQx06dFB0dHSRy5akvLw83XHHHTpw4IA++OAD7dmzRwsWLFCjRo3K/FoAAI4r06lQhmEUOZ6bmytfX1+Hl3OlacybN0+RkZGaOXOmoqOjtWfPHtWvX7/Q9FeaRv369fXBBx+oUaNGOnjwoN15uwAA13JWj5g+fbpGjRqluLg4SdK8efO0evVqLVy4UBMmTCg0/cKFC3X69Glt3bpVVapUkSSFhYWV/gUAAMqkVMHi73//uyTJYrHozTfflJ+fn+25/Px8bdq0Sa1atXJ4eTQNALh+OLNH5OXlKS0tTRMnTrSNeXl5qVevXtq2bVuR83zyySeKiorS2LFj9fHHH6tevXq677779NRTTxV7pCQ3N1e5ubm2x9nZ2Q7VBwAorFTBYsaMGZIufxo1b948ux21r6+vwsLCNG/ePIeWRdMAgOuLM3vEyZMnlZ+fr6CgILvxoKAg/fjjj0XOs3//fn3++ee6//779emnn2rv3r16+OGHdenSpWKv50hOTtaUKVMcqgkAULJSBYv09HRJ0h/+8AetWLFCtWvXLvOKaRoAcH1xZo8oi4KCAtWvX1/z58+Xt7e3IiIidPjwYU2dOrXYHjFx4kQlJibaHmdnZyskJKSiSgaA60qZrrHYsGGDs+twCE0DACo/Z/SIwMBAeXt7KzMz0248MzNTDRo0KHKehg0bqkqVKnZHSlq3bq1jx44pLy+vyOs7rFarrFar6XoBAKUIFomJiXrhhRdUo0YNuz/UizJ9+vRrLo+mAQDXD2f3CF9fX0VERCg1NVUDBgyQdPnDpdTUVCUkJBQ5T7du3bRkyRIVFBTIy+vyTQ9/+uknNWzYsFQXjQMAysbhYPHtt9/q0qVLtn8Xx2KxOLQ8mgYAXD+c3SOky2Fl2LBh6ty5s7p06aKZM2cqJyfHdsOP2NhYNWrUSMnJyZKkhx56SLNnz9a4ceP0yCOP6Oeff9ZLL72kRx991MQrAwA4yuFgcfWhbWedCkXTAIDrQ3n0iMGDB+vEiROaNGmSjh07po4dO2rNmjW2a/MyMjJsHzJJUkhIiD777DONHz9e7du3V6NGjTRu3Dg99dRTTqkHAFCyMl1j4Sw0DQBASRISEoo9ir1x48ZCY1FRUfryyy/LuSoAQFEcDhZ/+tOfHF7oihUrHJ6WpgEA7q+8egQAwH04HCwCAgLKsw4AgBujRwAAHA4WixYtKs86AABujB4BAPC69iQAAAAAUDKHj1jcdNNNSk1NVe3atdWpU6cSbxm4Y8cOpxQHAHAP9AgAgMPB4u6777Z90dyV750AAECiRwAAShEskpKSivw3AAD0CACAqe+x2L59u3bv3i1JCg8PV0REhFOKAgC4P3oEAHiWMgWLX375RUOGDNGWLVtUq1YtSdKZM2fUtWt
XLV26VDfccIMzawQAuBF6BAB4pjLdFWrkyJG6dOmSdu/erdOnT+v06dPavXu3CgoKNHLkSGfXCABwI/QIAPBMZTpi8cUXX2jr1q268cYbbWM33nijZs2ape7duzutOACA+6FHAIBnKtMRi5CQEF26dKnQeH5+voKDg00XBQBwX/QIAPBMZQoWU6dO1SOPPKLt27fbxrZv365x48bptddec1pxAAD3Q48AAM/k8KlQtWvXtvvCo5ycHEVGRsrH5/IifvvtN/n4+Cg+Pp57mAOAh6FHAAAcDhYzZ84sxzIAAO6MHgEAcDhYDBs2rDzrAAC4MXoEAMDUF+RJ0sWLF5WXl2c35u/vb3axAIDrAD0CADxHmS7ezsnJUUJCgurXr68aNWqodu3adj8AAM9FjwAAz1SmYPHkk0/q888/19y5c2W1WvXmm29qypQpCg4O1uLFi51dIwDAjdAjAMAzlelUqFWrVmnx4sXq2bOn4uLi1L17dzVv3lyhoaF69913df/99zu7TgCAm6BHAIBnKtMRi9OnT6tp06aSLp8re/r0aUnSrbfeqk2bNjmvOgCA26FHAIBnKlOwaNq0qdLT0yVJrVq10vvvvy/p8qdUtWrVclpxAAD3Q48AAM9UpmARFxen7777TpI0YcIEzZkzR1WrVtX48eP1xBNPOLVAAIB7oUcAgGcq0zUW48ePt/27V69e2r17t3bs2KHmzZurffv2TisOAOB+6BEA4JlMf4+FJIWFhSksLMwZiwIAXGfoEQDgGcp0KpQkpaamql+/fmrWrJmaNWumfv36af369c6sDQDgpugRAOB5yhQs/vGPf+iuu+5SzZo1NW7cOI0bN07+/v7q06eP5syZ4+waAQBuhB4BAJ6pTKdCvfTSS5oxY4YSEhJsY48++qi6deuml156SWPHjnVagQAA90KPAADPVKYjFmfOnNFdd91VaPzOO+9UVlaW6aIAAO6LHgEAnqlMwaJ///766KOPCo1//PHH6tevn+miAADuix4BAJ7J4VOh/v73v9v+HR4erhdffFEbN25UVFSUJOnLL7/Uli1b9Ne//tX5VQIAKjV6BADAYhiG4ciETZo0cWyBFov2799vqqjylJ2drYCAAGVlZcnf39/V5QCoABFPLHZ1CS6XNjW2xOfN7hvpEQDcFT2i5B5Rmv2iw0cs0tPTHa8OAOBR6BEAgDJ/j8UVhmHIwYMeAAAPQ48AAM9R5mCxePFitWvXTtWqVVO1atXUvn17vfPOO86sDQDgpugRAOB5yvQ9FtOnT9dzzz2nhIQEdevWTZK0efNmjRkzRidPntT48eOdWiQAwH3QIwDAM5UpWMyaNUtz585VbOz/XejRv39/tWnTRpMnT6ZpAIAHo0cAgGcq06lQR48eVdeuXQuNd+3aVUePHjVdFADAfdEjAMAzlSlYNG/eXO+//36h8WXLlqlFixamiwIAuC96BAB4pjKdCjVlyhQNHjxYmzZtsp0/u2XLFqWmphbZTAAAnoMeAQCeqUxHLP785z/r66+/VmBgoFauXKmVK1cqMDBQX3/9tQYOHOjsGgEAboQeAQCeqdRHLC5duqTRo0frueee0z//+c/yqAkA4KboEQDguUp9xKJKlSr68MMPy6MWAICbo0cAgOcq06lQAwYM0MqVK51cCgDgekCPAADPVKaLt1u0aKHnn39eW7ZsUUREhGrUqGH3/KOPPuqU4gAA7oceAQCeqUzB4q233lKtWrWUlpamtLQ0u+csFgtNAwA8mDN7xJw5czR16lQdO3ZMHTp00KxZs9SlS5drzrd06VINGTJEd999N0dPAKCClClYpKen2/5tGIaky82irGgcAHD9cFaPWLZsmRITEzVv3jxFRkZq5syZio6O1p49e1S/fv1i5ztw4IAef/xxde/evfTFAwDKrEzXWEiXP5Fq27atqlatqqpVq6pt27Z68803S72cK40jKSlJO3bsUIcOHRQdHa3jx4+XOB+NAwAqL2f0iOn
Tp2vUqFGKi4tTeHi45s2bp+rVq2vhwoXFzpOfn6/7779fU6ZMUdOmTc2+DABAKZQpWEyaNEnjxo1TTEyMli9fruXLlysmJkbjx4/XpEmTSrUsGgcAXF+c0SPy8vKUlpamXr162ca8vLzUq1cvbdu2rdj5nn/+edWvX18jRoxwaD25ubnKzs62+wEAlE2ZToWaO3euFixYoCFDhtjG+vfvr/bt2+uRRx7R888/79ByrjSOiRMn2sZK2zj+85//lLiO3Nxc5ebm2h7TNACgfDmjR5w8eVL5+fkKCgqyGw8KCtKPP/5Y5DybN2/WW2+9pZ07dzpca3JysqZMmeLw9ACA4pXpiMWlS5fUuXPnQuMRERH67bffHF5OSY3j2LFjRc5zpXEsWLDAoXUkJycrICDA9hMSEuJwfQCA0nNWjyiNs2fP6oEHHtCCBQsUGBjo8HwTJ05UVlaW7efQoUPlUh8AeIIyBYsHHnhAc+fOLTQ+f/583X///aaLKk5ZGgdNAwAqljN6RGBgoLy9vZWZmWk3npmZqQYNGhSaft++fTpw4IBiYmLk4+MjHx8fLV68WJ988ol8fHy0b9++ItdjtVrl7+9v9wMAKJsynQolXb4wb+3atbrlllskSV999ZUyMjIUGxurxMRE23TTp08vdhlmGscVBQUFl1+Ij4/27NmjZs2a2c1jtVpltVpL/wIBAGVmtkf4+voqIiJCqampGjBggKTL+/vU1FQlJCQUmr5Vq1b6/vvv7caeffZZnT17Vq+//jpHqwGgApQpWOzatUs33XSTJNk+BQoMDFRgYKB27dplm+5atxekcQDA9cdZPSIxMVHDhg1T586d1aVLF82cOVM5OTmKi4uTJMXGxqpRo0ZKTk623XnqarVq1ZKkQuMAgPJRpmCxYcMGpxVA4wCA64uzesTgwYN14sQJTZo0SceOHVPHjh21Zs0a23V5GRkZ8vIq813TAQBOVuZToZyFxgEAKE5CQkKRR7AlaePGjSXOm5KS4vyCAADFcnmwkGgcAAAAgLvjUAAAAAAA0wgWAAAAAEwjWAAAAAAwjWABAAAAwDSCBQAAAADTCBYAAAAATCNYAAAAADCNYAEAAADANIIFAAAAANMIFgAAAABMI1gAAAAAMI1gAQAAAMA0ggUAAAAA0wgWAAAAAEwjWAAAAAAwjWABAAAAwDSCBQAAAADTCBYAAAAATCNYAAAAADCNYAEAAADANIIFAAAAANMIFgAAAABMI1gAAAAAMI1gAQAAAMA0ggUAAAAA0wgWAAAAAEwjWAAAAAAwjWABAAAAwDSCBQAAAADTCBYAAAAATCNYAAAAADCNYAEAAADANIIFAAAAANMIFgAAAABMI1gAAAAAMI1gAQAAAMA0ggUAAAAA0wgWAAAAAEwjWAAAAAAwjWABAAAAwDSCBQAAAADTCBYAAAAATCNYAAAAADCNYAEAAADANIIFAAAAANMqRbCYM2eOwsLCVLVqVUVGRurrr78udtoFCxaoe/fuql27tmrXrq1evXqVOD0AwH3RHwDAfbg8WCxbtkyJiYlKSkrSjh071KFDB0VHR+v48eNFTr9x40YNGTJEGzZs0LZt2xQSEqI777xThw8fruDKAQDlif4AAO7FYhiG4coCIiMjdfPNN2v27NmSpIKCAoWEhOiRRx7RhAkTrjl/fn6+ateurdmzZys2Nvaa02dnZysgIEBZWVny9/c3XT+Ayi/iicWuLsHl0qaWvH+sjPvGiugPubm5ys3NtT3Ozs5WSEhIpdoOAMoXPaLkHlGa/uDSIxZ5eXlKS0tTr169bGNeXl7q1auXtm3b5tAyzp8/r0uXLqlOnTpFPp+bm6vs7Gy7HwBA5VYR/UGSkpOTFRAQYPsJCQkxXTsAeCqXBouTJ08qPz9fQUFBduNBQUE6duyYQ8t46qmnFBwcbNd8rkbTAAD3UxH9QZImTpyorKws28+hQ4dM1Q0Anszl11iY8fLLL2vp0qX66KOPVLVq1SK
noWkAgOdxpD9IktVqlb+/v90PAKBsfFy58sDAQHl7eyszM9NuPDMzUw0aNChx3tdee00vv/yy1q9fr/bt2xc7ndVqldVqdUq9AICKURH9AQDgXC49YuHr66uIiAilpqbaxgoKCpSamqqoqKhi53v11Vf1wgsvaM2aNercuXNFlAoAqED0BwBwPy49YiFJiYmJGjZsmDp37qwuXbpo5syZysnJUVxcnCQpNjZWjRo1UnJysiTplVde0aRJk7RkyRKFhYXZzrX18/OTn5+fy14HAMC56A8A4F5cHiwGDx6sEydOaNKkSTp27Jg6duyoNWvW2C7Yy8jIkJfX/x1YmTt3rvLy8nTPPffYLScpKUmTJ0+uyNIBAOWI/gAA7sXl32NR0SrjvdoBlC/uUe6e32PhCmwHwPPQI66T77EAAAAAcH0gWAAAAAAwjWABAAAAwDSCBQAAAADTCBYAAAAATCNYAAAAADCNYAEAAADANIIFAAAAANMIFgAAAABMI1gAAAAAMI1gAQAAAMA0ggUAAAAA0wgWAAAAAEwjWAAAAAAwjWABAAAAwDSCBQAAAADTCBYAAAAATCNYAAAAADCNYAEAAADANIIFAAAAANMIFgAAAABMI1gAAAAAMI1gAQAAAMA0ggUAAAAA0wgWAAAAAEwjWAAAAAAwjWABAAAAwDSCBQAAAADTCBYAAAAATCNYAAAAADCNYAEAAADANIIFAAAAANMIFgAAAABMI1gAAAAAMI1gAQAAAMA0ggUAAAAA0wgWAAAAAEwjWAAAAAAwjWABAAAAwDSCBQAAAADTCBYAAAAATCNYAAAAADCNYAEAAADANIIFAAAAANMIFgAAAABMqxTBYs6cOQoLC1PVqlUVGRmpr7/+usTply9frlatWqlq1apq166dPv300wqqFABQkegPAOA+XB4sli1bpsTERCUlJWnHjh3q0KGDoqOjdfz48SKn37p1q4YMGaIRI0bo22+/1YABAzRgwADt2rWrgisHAJQn+gMAuBeLYRiGKwuIjIzUzTffrNmzZ0uSCgoKFBISokceeUQTJkwoNP3gwYOVk5Ojf/3rX7axW265RR07dtS8efOuub7s7GwFBAQoKytL/v7+znshACqtiCcWu7oEl0ubGlvi85Vx31jR/UGqnNvhesT/ycuu9f8SFYP3Y8nvxdLsF32cXVhp5OXlKS0tTRMnTrSNeXl5qVevXtq2bVuR82zbtk2JiYl2Y9HR0Vq5cmWR0+fm5io3N9f2OCsrS9LljQTAM+TnXnB1CS53rX3eledd/FmTTUX0B4ke4Sr8n7zM7Pvstmffc1Il7mvT34aYXgbvx5Lfi6XpDy4NFidPnlR+fr6CgoLsxoOCgvTjjz8WOc+xY8eKnP7YsWNFTp+cnKwpU6YUGg8JCSlj1QDgfgJmjXFourNnzyogIKCcq7m2iugPEj0CruXo/0sUj23oHI5sR0f6g0uDRUWYOHGi3SdYBQUFOn36tOrWrSuLxeLCyoqXnZ2tkJAQHTp0iEPxJrAdzWMbOoc7bEfDMHT27FkFBwe7upQKRY/wTGxD52A7mucO27A0/cGlwSIwMFDe3t7KzMy0G8/MzFSDBg2KnKdBgwalmt5qtcpqtdqN1apVq+xFVyB/f/9K+yZzJ2xH89iGzlHZt2NlOFJxRUX0B4ke4enYhs7BdjSvsm9DR/uDS+8K5evrq4iICKWmptrGCgoKlJqaqqioqCLniYqKsptektatW1fs9AAA90N/AAD34/JToRITEzVs2DB17txZXbp00cyZM5WTk6O4uDhJUmxsrBo1aqTk5GRJ0rhx49SjRw9NmzZNffv21dKlS7V9+3bNnz/flS8DAOBk9AcAcC8uDxaDBw/WiRMnNGnSJB07dkwdO3bUmjVrbBfgZWRkyMvr/w6sdO3aVUuWLNGzzz6rp59+Wi1atNDKlSvVtm1bV70Ep7NarUpKSip0eB6lw3Y0j23oHGzHsqE/FI33k3lsQ+dgO5p
3vW1Dl3+PBQAAAAD35/Jv3gYAAADg/ggWAAAAAEwjWAAAAAAwjWABAAAAwDSCRQXbtGmTYmJiFBwcLIvFopUrV15znkcffVQRERGyWq3q2LFjuddYmZV2+x04cEAWi0Xe3t46fPiw3XNHjx6Vj4+PLBaLDhw4UH5FV0LDhw+XxWLRmDFjCj03duxYWSwWDR8+vNj5X3zxRXXt2lXVq1d3my8Tcyaz289ischisejLL7+0G8/NzbV94/PGjRudXDXcAT3CHHqEefQHczy9PxAsKlhOTo46dOigOXPmlGq++Ph4DR48uJyqch9l3X6NGjXS4sWL7cbefvttNWrUyJnluZWQkBAtXbpUFy5csI1dvHhRS5YsUePGjUucNy8vT4MGDdJDDz1U3mVWWma235X5Fy1aZDf20Ucfyc/Pz+m1wn3QI8yhRzgH/cEcT+4PBIsK1rt3b/3tb3/TwIEDHZ7n73//u8aOHaumTZuWY2XuoSzbT5KGDRtW6D/pokWLNGzYMGeW51ZuuukmhYSEaMWKFbaxFStWqHHjxurUqVOJ806ZMkXjx49Xu3btyrvMSsvM9pMuvyd/33gWLlzo0e9J0CPMokc4B/3BHE/uDwSLSmby5MkKCwtzdRluq7jt179/f/3666/avHmzJGnz5s369ddfFRMTU8EVVi7x8fF2zXThwoW2bzW+IiUlRRaLpaJLcwtmtl9ERITCwsL04YcfSrr8ZW+bNm3SAw88UL5Fw63RI8yhRziO/mCOp/YHgkUlExgYqGbNmrm6DLdV3ParUqWKhg4dqoULF0q6/B986NChqlKlSkWXWKkMHTpUmzdv1sGDB3Xw4EFt2bJFQ4cOtZsmICBAN954o4sqrNzMbr/4+HjbezIlJUV9+vRRvXr1yr1uuC96hDn0CMfRH8zx1P5AsKhkEhISlJqa6uoy3FZJ2y8+Pl7Lly/XsWPHtHz5csXHx1dwdZVPvXr11LdvX6WkpGjRokXq27evAgMD7aYZOHCgfvzxRxdVWLmZ3X5Dhw7Vtm3btH//fqWkpPCexDXRI8yhRziO/mCOp/YHH1cXAFSUdu3aqVWrVhoyZIhat26ttm3baufOna4uy+Xi4+OVkJAgSaW+4BHmtl/dunXVr18/jRgxQhcvXlTv3r119uzZ8igTwDXQIwqjP5jjif2BIxbwKPHx8dq4caPbJP+KcNdddykvL0+XLl1SdHS0q8txO2a335X3ZGxsrLy9vcuhQgCOokfYoz+Y44n9gWBRwc6dO6edO3faPgVJT0/Xzp07lZGRIUmaPXu2br/9drt59u7dq507d+rYsWO6cOGCbf68vLyKLt/lyrL9rjZq1CidOHFCI0eOrIhy3YK3t7d2796tH374ocgd10cffaRWrVrZjWVkZNi2e35+vu13cu7cuYoqu9Ioy/a72l133aUTJ07o+eefL88y4SboEebQI5yL/mCOJ/YHToWqYNu3b9cf/vAH2+PExERJl28tlpKSopMnT2rfvn1284wcOVJffPGF7fGVW5Wlp6d73N1ByrL9rubj41PoHEdI/v7+xT6XlZWlPXv22I1NmjRJb7/9tu3xlffkhg0b1LNnz3KpsTIr7fa7msVi4T0JG3qEOfQI56M/mONp/cFiGIbh6iIAAAAAuDdOhQIAAABgGsECAAAAgGkECwAAAACmESwAAAAAmEawAAAAAGAawQIAAACAaQQLAAAAAKYRLAAAAACYRrCA25s8ebI6duxYqnl69uypxx57rFzqKU/Dhw/XgAEDbI/L43X8fnv+fp0AcC3slx9z6jrYL5efjRs3ymKx6MyZM64u5brg4+oCgOL07NlTHTt21MyZM0uc7vHHH9cjjzxSMUVVMitWrFCVKlXKdR2vv/66DMOwPXb09wLg+sN++drYL8OTESzgtgzDUH5+vvz8/OTn5+fqcpzq0qVLDjWmOnXqlHstAQEB5b4OANcH9svslyuLK+9FHx/+1K1InAqFSmn48OH64os
v9Prrr8tischisSglJUUWi0X//ve/FRERIavVqs2bNxd7iHjKlCmqV6+e/P39NWbMGOXl5RW7vtzcXD3++ONq1KiRatSoocjISG3cuNHherds2aKePXuqevXqql27tqKjo/Xrr79KktasWaNbb71VtWrVUt26ddWvXz/t27fPNu+BAwdksVi0bNky9ejRQ1WrVtW7776r/Px8JSYm2uZ78skn7T6hkgofcg8LC9NLL72k+Ph41axZU40bN9b8+fPt5nnqqafUsmVLVa9eXU2bNtVzzz2nS5cuFfvarj7kXtTvJT09Xc2bN9drr71mN9/OnTtlsVi0d+9eh7cjgMqL/bLn7JcNw9DkyZPVuHFjWa1WBQcH69FHH7U9/84776hz586qWbOmGjRooPvuu0/Hjx+3PX/l9KLPPvtMnTp1UrVq1fTHP/5Rx48f17///W+1bt1a/v7+uu+++3T+/HnbfAUFBUpOTlaTJk1UrVo1dejQQR988EGJtf5+nb9/L5pZJkqPYIFK6fXXX1dUVJRGjRqlo0eP6ujRowoJCZEkTZgwQS+//LJ2796t9u3bFzl/amqqdu/erY0bN+q9997TihUrNGXKlGLXl5CQoG3btmnp0qX673//q0GDBumuu+7Szz//fM1ad+7cqdtvv13h4eHatm2bNm/erJiYGOXn50uScnJylJiYqO3btys1NVVeXl4aOHCgCgoK7JYzYcIEjRs3Trt371Z0dLSmTZumlJQULVy4UJs3b9bp06f10UcfXbOeadOmqXPnzvr222/18MMP66GHHtKePXtsz9esWVMpKSn64Ycf9Prrr2vBggWaMWPGNZcrFf17ady4seLj47Vo0SK7aRctWqTbbrtNzZs3d2jZACo39sues1/+8MMPNWPGDL3xxhv6+eeftXLlSrVr1872/KVLl/TCCy/ou+++08qVK3XgwAENHz680HImT56s2bNna+vWrTp06JDuvfdezZw5U0uWLNHq1au1du1azZo1yzZ9cnKyFi9erHnz5ul///ufxo8fr6FDh+qLL75waFtIhd+LzlgmSsEAKqkePXoY48aNsz3esGGDIclYuXKl3XRJSUlGhw4dbI+HDRtm1KlTx8jJybGNzZ071/Dz8zPy8/MLLfvgwYOGt7e3cfjwYbvl3n777cbEiROvWeeQIUOMbt26Ofy6Tpw4YUgyvv/+e8MwDCM9Pd2QZMycOdNuuoYNGxqvvvqq7fGlS5eMG264wbj77rttY7/fRqGhocbQoUNtjwsKCoz69esbc+fOLbaeqVOnGhEREbbHRW3PktZpGIZx+PBhw9vb2/jqq68MwzCMvLw8IzAw0EhJSSl2vQDcD/tlz9gvT5s2zWjZsqWRl5d3zWkNwzC++eYbQ5Jx9uxZwzD+732xfv162zTJycmGJGPfvn22sdGjRxvR0dGGYRjGxYsXjerVqxtbt261W/aIESOMIUOGXLOGot6Ljizzyny//vqrQ68VJeOIBdxO586drzlNhw4dVL16ddvjqKgonTt3TocOHSo07ffff6/8/Hy1bNnSdl6wn5+fvvjiC7tD48W58slYcX7++WcNGTJETZs2lb+/v8LCwiRJGRkZxb6urKwsHT16VJGRkbYxHx8fh1771Z8WWiwWNWjQwO4Q9bJly9StWzc1aNBAfn5+evbZZwvVUlrBwcHq27evFi5cKElatWqVcnNzNWjQIFPLBeAe2C+XzN32y4MGDdKFCxfUtGlTjRo1Sh999JF+++032/NpaWmKiYlR48aNVbNmTfXo0UNS4e139esOCgqynep19diV7bB3716dP39ed9xxh93vfPHixQ79zq+4+vfhrGXCcVzRArdTo0YNpy7v3Llz8vb2Vlpamry9ve2ec+Tiw2rVqpX4fExMjEJDQ7VgwQIFBweroKBAbdu2LXRusbNe1+8vLrRYLLbD+9u2bdP999+vKVOmKDo6WgEBAVq6dKmmTZtmer0jR47UAw88oBkzZmj
RokUaPHiw3R8RAK5f7JdL5m775ZCQEO3Zs0fr16/XunXr9PDDD2vq1Kn64osvlJeXp+joaEVHR+vdd99VvXr1lJGRoejo6ELb7+rXbbFYStwO586dkyStXr1ajRo1spvOarU6/Jqv/p05a5lwHMEClZavr6/tfNjS+u6773ThwgVbc/nyyy/l5+dnOx/4ap06dVJ+fr6OHz+u7t27l3pd7du3V2pqapHnCp86dUp79uzRggULbMvevHnzNZcZEBCghg0b6quvvtJtt90mSfrtt9+Ulpamm266qdQ1XrF161aFhobqmWeesY0dPHiwVMso7vfSp08f1ahRQ3PnztWaNWu0adOmMtcJoHJiv+w5++Vq1aopJiZGMTExGjt2rFq1aqXvv/9ehmHo1KlTevnll22/u+3bt5eq3qKEh4fLarUqIyPDdgSkMi4TJSNYoNIKCwvTV199pQMHDsjPz6/QRXUlycvL04gRI/Tss8/qwIEDSkpKUkJCgry8Cp/917JlS91///2KjY3VtGnT1KlTJ504cUKpqalq3769+vbtW+K6Jk6cqHbt2unhhx/WmDFj5Ovrqw0bNmjQoEGqU6eO6tatq/nz56thw4bKyMjQhAkTHHoN48aN08svv6wWLVqoVatWmj59uukv8GnRooUyMjK0dOlS3XzzzVq9erVDFx5e7fe/lzp16sjLy0ve3t4aPny4Jk6cqBYtWigqKspUrQAqH/bLnrFfTklJUX5+viIjI1W9enX985//VLVq1RQaGqqCggL5+vpq1qxZGjNmjHbt2qUXXnihLC/dTs2aNfX4449r/PjxKigo0K233qqsrCxt2bJF/v7+GjZsWKVYJkrGNRaotB5//HF5e3srPDzcdqjVUbfffrtatGih2267TYMHD1b//v01efLkYqdftGiRYmNj9de//lU33nijBgwYoG+++UaNGze+5rpatmyptWvX6rvvvlOXLl0UFRWljz/+WD4+PvLy8tLSpUuVlpamtm3bavz48Zo6dapDr+Gvf/2rHnjgAQ0bNkxRUVGqWbOmBg4c6OgmKFL//v01fvx4JSQkqGPHjtq6dauee+65Ui2jpN/LiBEjlJeXp7i4OFN1Aqic2C97xn65Vq1aWrBggbp166b27dtr/fr1WrVqlerWrat69eopJSVFy5cvV3h4uF5++eVCt7UtqxdeeEHPPfeckpOT1bp1a911111avXq1mjRpUqmWieJZDON3N2AG3Nzw4cN15swZrVy50tWleJz//Oc/uv3223Xo0CEFBQW5uhwAlQT7Zddhv4yKxKlQAEzLzc3ViRMnNHnyZA0aNIjmBQAuxn4ZrsCpUMA19O7d2+42dVf/vPTSS64ur1J47733FBoaqjNnzujVV191dTkArnPsl6+tpP3yu+++W+z2a9OmjYsqLtmYMWOKrXnMmDGuLg//H6dCAddw+PBhXbhwocjn6tSpozp16lRwRQDg2dgvm3P27FllZmYW+VyVKlUUGhpawRVd2/Hjx5WdnV3kc/7+/qpfv34FV4SiECwAAAAAmMapUAAAAABMI1gAAAAAMI1gAQAAAMA0ggUAAAAA0wgWAAAAAEwjWAAAAAAwjWABAAAAwLT/B4B9o3cEomC7AAAAAElFTkSuQmCC", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "metrics = [\"triple_cardinality\", \"triple_cardinality_same_rel\"]\n", + "fig, ax = plt.subplots(1, len(metrics), figsize=[4 * len(metrics), 4])\n", + "\n", + "for i, metric in enumerate(metrics):\n", + " sns.countplot(\n", + " x=edge_dcs[metric],\n", + " order=[\"1:1\", \"1:M\", \"M:1\", \"M:M\"],\n", + " stat=\"probability\",\n", + " ax=ax[i],\n", + " )\n", + "fig.suptitle(\"Cardinality distribution\")\n", + "plt.tight_layout()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Edge topological patterns\n", + "\n", + "The second method provided by `KGTopologyToolbox` for topological analysis at the edge level is `edge_pattern_summary`, which extracts information on several significant edge topological patterns. In particular, it detects whether the edge (h,r,t) is a loop, is symmetric or has inverse, inference, composition (directed and undirected):\n", + "\n", + "![image info](../images/edge_patterns.png)\n", + "\n", + "For inverse/inference, the method also provides the number and types of unique relations `r'` realizing the counterpart edges; for composition, the number of triangles supported by the edge is provided (the unique metapaths `[r_1, r_2]` can also be listed by setting `return_metapath_list=True` when calling the method)." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + "
hrtis_loopis_symmetrichas_inversen_inverse_relationsinverse_edge_typeshas_inferencen_inference_relationsinference_edge_typeshas_compositionhas_undirected_compositionn_trianglesn_undirected_triangles
0171803207FalseFalseFalse0[]False0[0]FalseTrue015
14903013662FalseFalseFalse0[]False0[0]TrueTrue44153
25480015999FalseFalseFalse0[]False0[0]FalseTrue01
3314807247FalseFalseFalse0[]False0[0]TrueTrue1029
410300016202FalseFalseFalse0[]False0[0]TrueTrue379
................................................
50884292451505097FalseFalseTrue1[46]True1[46, 50]TrueTrue15325722
50884306456508833FalseFalseTrue2[45, 46]True2[45, 46, 50]TrueTrue234913
508843194845015873FalseFalseTrue1[46]True2[46, 45, 50]TrueTrue13265004
5088432636550496FalseFalseTrue2[45, 46]True2[45, 46, 50]TrueTrue14335554
508843313860506368FalseFalseFalse0[]False0[50]TrueTrue119489
\n", + "

5088434 rows × 15 columns

\n", + "
" + ], + "text/plain": [ + " h r t is_loop is_symmetric has_inverse \\\n", + "0 1718 0 3207 False False False \n", + "1 4903 0 13662 False False False \n", + "2 5480 0 15999 False False False \n", + "3 3148 0 7247 False False False \n", + "4 10300 0 16202 False False False \n", + "... ... .. ... ... ... ... \n", + "5088429 2451 50 5097 False False True \n", + "5088430 6456 50 8833 False False True \n", + "5088431 9484 50 15873 False False True \n", + "5088432 6365 50 496 False False True \n", + "5088433 13860 50 6368 False False False \n", + "\n", + " n_inverse_relations inverse_edge_types has_inference \\\n", + "0 0 [] False \n", + "1 0 [] False \n", + "2 0 [] False \n", + "3 0 [] False \n", + "4 0 [] False \n", + "... ... ... ... \n", + "5088429 1 [46] True \n", + "5088430 2 [45, 46] True \n", + "5088431 1 [46] True \n", + "5088432 2 [45, 46] True \n", + "5088433 0 [] False \n", + "\n", + " n_inference_relations inference_edge_types has_composition \\\n", + "0 0 [0] False \n", + "1 0 [0] True \n", + "2 0 [0] False \n", + "3 0 [0] True \n", + "4 0 [0] True \n", + "... ... ... ... \n", + "5088429 1 [46, 50] True \n", + "5088430 2 [45, 46, 50] True \n", + "5088431 2 [46, 45, 50] True \n", + "5088432 2 [45, 46, 50] True \n", + "5088433 0 [50] True \n", + "\n", + " has_undirected_composition n_triangles n_undirected_triangles \n", + "0 True 0 15 \n", + "1 True 44 153 \n", + "2 True 0 1 \n", + "3 True 10 29 \n", + "4 True 3 79 \n", + "... ... ... ... 
\n", + "5088429 True 1532 5722 \n", + "5088430 True 234 913 \n", + "5088431 True 1326 5004 \n", + "5088432 True 1433 5554 \n", + "5088433 True 119 489 \n", + "\n", + "[5088434 rows x 15 columns]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "edge_eps = kgtt.edge_pattern_summary(biokg_df)\n", + "edge_eps" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Fraction of triples with property:\n" + ] + }, + { + "data": { + "text/plain": [ + "is_loop 0.000011\n", + "is_symmetric 0.713743\n", + "has_inverse 0.409704\n", + "has_inference 0.410111\n", + "has_composition 0.997605\n", + "dtype: float64" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "print(\"Fraction of triples with property:\")\n", + "edge_eps[\n", + " [\"is_loop\", \"is_symmetric\", \"has_inverse\", \"has_inference\", \"has_composition\"]\n", + "].mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "metrics = [\n", + " \"n_inverse_relations\",\n", + " \"n_inference_relations\",\n", + " \"n_triangles\",\n", + " \"n_undirected_triangles\",\n", + "]\n", + "fig, ax = plt.subplots(2, 2, figsize=(9, 7))\n", + "\n", + "for axn, metric in zip(ax.flatten(), metrics):\n", + " x = np.sqrt(edge_eps[metric])\n", + " sns.histplot(x=x, stat=\"probability\", binwidth=1, binrange=[0, x.max() + 1], ax=axn)\n", + " axn.set_xlabel(f\"sqrt({metric})\")\n", + "plt.tight_layout()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Relation-level analysis\n", + "\n", + "The method `aggregate_by_relation` allows the user to aggregate at the relation-level the statistics outputted by the edge-level methods `edge_degree_cardinality_summary` and `edge_pattern_summary`. This converts DataFrames indexed on the KG edges to DataFrames indexed on the IDs of the unique relation types." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
num_triplesfrac_triplesunique_hunique_th_unique_rel_meanh_unique_rel_stdh_unique_rel_quartile1h_unique_rel_quartile2h_unique_rel_quartile3h_degree_mean...tot_degree_same_rel_quartile1tot_degree_same_rel_quartile2tot_degree_same_rel_quartile3triple_cardinality_1:M_fractriple_cardinality_M:1_fractriple_cardinality_M:M_fractriple_cardinality_same_rel_1:1_fractriple_cardinality_same_rel_1:M_fractriple_cardinality_same_rel_M:1_fractriple_cardinality_same_rel_M:M_frac
r
0810660.015931974293378.1102938.2472774.05.08.0569.252202...45.0112.0211.00.00.01.00.0016280.0235860.0649590.909827
156690.001114698153627.04815712.93641017.031.036.02518.765391...14.032.060.00.00.01.00.0028220.1042510.0275180.865408
2669540.01315861261236.4043075.60070633.036.041.04129.511919...332.0404.0482.00.00.01.00.0000000.0002540.0002390.999507
3195850.00384949149137.0959415.54738933.037.041.04527.399592...114.0157.0202.00.00.01.00.0000000.0008680.0009700.998162
4320340.00629552652537.3195675.38452334.038.041.04511.067834...188.0243.0299.00.00.01.00.0000620.0005310.0005930.998814
\n", + "

5 rows × 51 columns

\n", + "
" + ], + "text/plain": [ + " num_triples frac_triples unique_h unique_t h_unique_rel_mean \\\n", + "r \n", + "0 81066 0.015931 9742 9337 8.110293 \n", + "1 5669 0.001114 698 1536 27.048157 \n", + "2 66954 0.013158 612 612 36.404307 \n", + "3 19585 0.003849 491 491 37.095941 \n", + "4 32034 0.006295 526 525 37.319567 \n", + "\n", + " h_unique_rel_std h_unique_rel_quartile1 h_unique_rel_quartile2 \\\n", + "r \n", + "0 8.247277 4.0 5.0 \n", + "1 12.936410 17.0 31.0 \n", + "2 5.600706 33.0 36.0 \n", + "3 5.547389 33.0 37.0 \n", + "4 5.384523 34.0 38.0 \n", + "\n", + " h_unique_rel_quartile3 h_degree_mean ... tot_degree_same_rel_quartile1 \\\n", + "r ... \n", + "0 8.0 569.252202 ... 45.0 \n", + "1 36.0 2518.765391 ... 14.0 \n", + "2 41.0 4129.511919 ... 332.0 \n", + "3 41.0 4527.399592 ... 114.0 \n", + "4 41.0 4511.067834 ... 188.0 \n", + "\n", + " tot_degree_same_rel_quartile2 tot_degree_same_rel_quartile3 \\\n", + "r \n", + "0 112.0 211.0 \n", + "1 32.0 60.0 \n", + "2 404.0 482.0 \n", + "3 157.0 202.0 \n", + "4 243.0 299.0 \n", + "\n", + " triple_cardinality_1:M_frac triple_cardinality_M:1_frac \\\n", + "r \n", + "0 0.0 0.0 \n", + "1 0.0 0.0 \n", + "2 0.0 0.0 \n", + "3 0.0 0.0 \n", + "4 0.0 0.0 \n", + "\n", + " triple_cardinality_M:M_frac triple_cardinality_same_rel_1:1_frac \\\n", + "r \n", + "0 1.0 0.001628 \n", + "1 1.0 0.002822 \n", + "2 1.0 0.000000 \n", + "3 1.0 0.000000 \n", + "4 1.0 0.000062 \n", + "\n", + " triple_cardinality_same_rel_1:M_frac triple_cardinality_same_rel_M:1_frac \\\n", + "r \n", + "0 0.023586 0.064959 \n", + "1 0.104251 0.027518 \n", + "2 0.000254 0.000239 \n", + "3 0.000868 0.000970 \n", + "4 0.000531 0.000593 \n", + "\n", + " triple_cardinality_same_rel_M:M_frac \n", + "r \n", + "0 0.909827 \n", + "1 0.865408 \n", + "2 0.999507 \n", + "3 0.998162 \n", + "4 0.998814 \n", + "\n", + "[5 rows x 51 columns]" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + 
"kgtt.aggregate_by_relation(edge_dcs).head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Notice on the left the columns `num_triples`, `frac_triples`, `unique_h`, `unique_t` giving additional statistics for relation types (number of edges and relative frequency, number of unique entities used as heads/tails by triples of the relation type).\n", + "\n", + "Similarly, by aggregating the `edge_eps` DataFrame we can look at the distribution of edge topological patterns within each relation type." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
num_triplesfrac_triplesunique_hunique_tis_loop_fracis_symmetric_frachas_inverse_fracn_inverse_relations_meann_inverse_relations_stdn_inverse_relations_quartile1...n_triangles_meann_triangles_stdn_triangles_quartile1n_triangles_quartile2n_triangles_quartile3n_undirected_triangles_meann_undirected_triangles_stdn_undirected_triangles_quartile1n_undirected_triangles_quartile2n_undirected_triangles_quartile3
r
0810660.015931974293370.0000120.0002220.0094740.0187620.3361200.0...49.615572816.7767383.07.016.00136.4528411421.83000818.0036.068.0
156690.00111469815360.0000000.0003530.0615630.5277832.5023230.0...1630.9121546563.52273613.084.0234.002864.1044289520.11681254.00224.0586.0
2669540.0131586126120.0000000.9473670.99825311.0191184.7072468.0...27666.69492515797.64974614990.025934.038868.5032678.99356318619.01605616691.0032647.548637.0
3195850.0038494914910.0000000.9473580.99959213.4172584.58515010.0...30250.85897417053.92541016204.028873.043798.0032696.12535118685.28168616563.0032808.048653.0
4320340.0062955265250.0000000.9473680.99937613.2995884.42789810.0...30942.23119216888.95665617303.030137.544161.2532685.21046418685.26715416645.2532580.048767.0
\n", + "

5 rows × 32 columns

\n", + "
" + ], + "text/plain": [ + " num_triples frac_triples unique_h unique_t is_loop_frac \\\n", + "r \n", + "0 81066 0.015931 9742 9337 0.000012 \n", + "1 5669 0.001114 698 1536 0.000000 \n", + "2 66954 0.013158 612 612 0.000000 \n", + "3 19585 0.003849 491 491 0.000000 \n", + "4 32034 0.006295 526 525 0.000000 \n", + "\n", + " is_symmetric_frac has_inverse_frac n_inverse_relations_mean \\\n", + "r \n", + "0 0.000222 0.009474 0.018762 \n", + "1 0.000353 0.061563 0.527783 \n", + "2 0.947367 0.998253 11.019118 \n", + "3 0.947358 0.999592 13.417258 \n", + "4 0.947368 0.999376 13.299588 \n", + "\n", + " n_inverse_relations_std n_inverse_relations_quartile1 ... \\\n", + "r ... \n", + "0 0.336120 0.0 ... \n", + "1 2.502323 0.0 ... \n", + "2 4.707246 8.0 ... \n", + "3 4.585150 10.0 ... \n", + "4 4.427898 10.0 ... \n", + "\n", + " n_triangles_mean n_triangles_std n_triangles_quartile1 \\\n", + "r \n", + "0 49.615572 816.776738 3.0 \n", + "1 1630.912154 6563.522736 13.0 \n", + "2 27666.694925 15797.649746 14990.0 \n", + "3 30250.858974 17053.925410 16204.0 \n", + "4 30942.231192 16888.956656 17303.0 \n", + "\n", + " n_triangles_quartile2 n_triangles_quartile3 n_undirected_triangles_mean \\\n", + "r \n", + "0 7.0 16.00 136.452841 \n", + "1 84.0 234.00 2864.104428 \n", + "2 25934.0 38868.50 32678.993563 \n", + "3 28873.0 43798.00 32696.125351 \n", + "4 30137.5 44161.25 32685.210464 \n", + "\n", + " n_undirected_triangles_std n_undirected_triangles_quartile1 \\\n", + "r \n", + "0 1421.830008 18.00 \n", + "1 9520.116812 54.00 \n", + "2 18619.016056 16691.00 \n", + "3 18685.281686 16563.00 \n", + "4 18685.267154 16645.25 \n", + "\n", + " n_undirected_triangles_quartile2 n_undirected_triangles_quartile3 \n", + "r \n", + "0 36.0 68.0 \n", + "1 224.0 586.0 \n", + "2 32647.5 48637.0 \n", + "3 32808.0 48653.0 \n", + "4 32580.0 48767.0 \n", + "\n", + "[5 rows x 32 columns]" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + 
"kgtt.aggregate_by_relation(edge_eps).head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Additional methods are provided for the analysis at the relation level: `jaccard_similarity_relation_sets` to compute the Jaccard similarity of the sets of head/tail entities used by each relation; `relational_affinity_ingram` to compute the InGram pairwise relation similarity (see [paper](https://arxiv.org/abs/2305.19987)). " + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
r1r2num_triples_bothfrac_triples_bothnum_entities_bothnum_h_r1num_h_r2num_t_r1num_t_r2jaccard_head_headjaccard_head_tailjaccard_tail_headjaccard_tail_tailjaccard_both
101867350.017046143389742698933715360.0641120.0553010.0373170.0796350.112289
2021480200.02908913934974261293376120.0565310.0565310.0319470.0319470.041768
3031006510.01978013929974249193374910.0450370.0450370.0265300.0265300.033527
4041131000.02222713931974252693375250.0482900.0481880.0276100.0275060.035819
5051322760.02599513931974257693375780.0532870.0534910.0298150.0299160.039624
.............................................
24464749180210.0035422414806188580918860.1311480.1315680.1324090.1323530.135874
244747503741930.0735385592806522480952280.0823910.0825260.0833180.0834530.084764
24974849431220.008475340727281885272918860.3712840.3699520.3719890.3710640.370707
249848503992940.078471620127285224272952280.2873560.2873790.2860610.2860840.289147
254949503880920.076269616918855224188652280.1568760.1567730.1568500.1567480.158373
\n", + "

1275 rows × 14 columns

\n", + "
" + ], + "text/plain": [ + " r1 r2 num_triples_both frac_triples_both num_entities_both \\\n", + "1 0 1 86735 0.017046 14338 \n", + "2 0 2 148020 0.029089 13934 \n", + "3 0 3 100651 0.019780 13929 \n", + "4 0 4 113100 0.022227 13931 \n", + "5 0 5 132276 0.025995 13931 \n", + "... .. .. ... ... ... \n", + "2446 47 49 18021 0.003542 2414 \n", + "2447 47 50 374193 0.073538 5592 \n", + "2497 48 49 43122 0.008475 3407 \n", + "2498 48 50 399294 0.078471 6201 \n", + "2549 49 50 388092 0.076269 6169 \n", + "\n", + " num_h_r1 num_h_r2 num_t_r1 num_t_r2 jaccard_head_head \\\n", + "1 9742 698 9337 1536 0.064112 \n", + "2 9742 612 9337 612 0.056531 \n", + "3 9742 491 9337 491 0.045037 \n", + "4 9742 526 9337 525 0.048290 \n", + "5 9742 576 9337 578 0.053287 \n", + "... ... ... ... ... ... \n", + "2446 806 1885 809 1886 0.131148 \n", + "2447 806 5224 809 5228 0.082391 \n", + "2497 2728 1885 2729 1886 0.371284 \n", + "2498 2728 5224 2729 5228 0.287356 \n", + "2549 1885 5224 1886 5228 0.156876 \n", + "\n", + " jaccard_head_tail jaccard_tail_head jaccard_tail_tail jaccard_both \n", + "1 0.055301 0.037317 0.079635 0.112289 \n", + "2 0.056531 0.031947 0.031947 0.041768 \n", + "3 0.045037 0.026530 0.026530 0.033527 \n", + "4 0.048188 0.027610 0.027506 0.035819 \n", + "5 0.053491 0.029815 0.029916 0.039624 \n", + "... ... ... ... ... \n", + "2446 0.131568 0.132409 0.132353 0.135874 \n", + "2447 0.082526 0.083318 0.083453 0.084764 \n", + "2497 0.369952 0.371989 0.371064 0.370707 \n", + "2498 0.287379 0.286061 0.286084 0.289147 \n", + "2549 0.156773 0.156850 0.156748 0.158373 \n", + "\n", + "[1275 rows x 14 columns]" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "kgtt.jaccard_similarity_relation_sets(biokg_df)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
h_relationt_relationedge_weight
0015.565931
1020.244410
2030.049564
3040.079068
4050.159787
............
25455045393.082900
25465046421.818843
254750471.194898
2548504818.124874
254950495.420267
\n", + "

2550 rows × 3 columns

\n", + "
" + ], + "text/plain": [ + " h_relation t_relation edge_weight\n", + "0 0 1 5.565931\n", + "1 0 2 0.244410\n", + "2 0 3 0.049564\n", + "3 0 4 0.079068\n", + "4 0 5 0.159787\n", + "... ... ... ...\n", + "2545 50 45 393.082900\n", + "2546 50 46 421.818843\n", + "2547 50 47 1.194898\n", + "2548 50 48 18.124874\n", + "2549 50 49 5.420267\n", + "\n", + "[2550 rows x 3 columns]" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "kgtt.relational_affinity_ingram(biokg_df)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv38", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/source/user_guide.rst b/docs/source/user_guide.rst new file mode 100644 index 0000000..9e6714e --- /dev/null +++ b/docs/source/user_guide.rst @@ -0,0 +1,27 @@ +User guide +================ + +Installation and usage +------------------------ + +1. Pip install :code:`kg-topology-toolbox`: + +.. code-block:: + + pip install git+https://github.com/graphcore-research/kg-topology-toolbox.git + +2. Import and use: + +.. code-block:: + + from kg_topology_toolbox import KGTopologyToolbox + +.. Note:: The library has been tested on Ubuntu 20.04, Python >= 3.8. + + +Getting started +------------------------ + +For a walkthrough of the library functionalities, see the `Jupyter notebook `_. + +For more details, have a look at the `API reference `_ page. 
\ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..6628c79 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,84 @@ +[build-system] +requires = ["setuptools>=61.0.0", "wheel"] +build-backend = "setuptools.build_meta" + +[project] +name = "kg-topology-toolbox" +version = "0.1.0" +authors = [ + {name = "Alberto Cattaneo"}, + {name = "Daniel Justus"}, + {name = "Thomas Martynec"}, + {name = "Stephen Bonner"}, +] +description = "A Python toolbox for Knowledge Graph topology metrics." +readme = "README.md" +license = {text = "MIT License"} +requires-python = ">=3.8" +dependencies = [ + 'numpy >= 1.24.4', + 'pandas >= 2.0.3', + 'scipy >= 1.10.1', +] + +[project.optional-dependencies] +dev = [ + 'black', + 'flake8', + 'isort', + 'mypy', + 'pandas-stubs >= 2.0.3.230814', + 'pytest >= 8.1.1', + 'pytest-cov', + 'sphinx >= 7.1.2', + 'sphinx_rtd_theme', + 'sphinx_autodoc_typehints', + 'sphinx-automodapi', + 'myst-parser', +] + +[project.urls] +repository = "https://github.com/graphcore-research/kg-topology-toolbox" + +[tool.setuptools.packages.find] +where = ["src"] +exclude = ["tests"] +namespaces = true + +[tool.black] +target-version = ["py38", "py39", "py310", "py311"] + +[tool.isort] +profile = "black" + +[tool.mypy] +pretty = true +show_error_codes = true +strict = true +check_untyped_defs = true +plugins = ["numpy.typing.mypy_plugin"] + +[[tool.mypy.overrides]] +module = "scipy.*" +ignore_missing_imports = true + +[[tool.mypy.overrides]] +module = "setuptools.*" +ignore_missing_imports = true + +[tool.pytest] +addopts = ["--no-cov-on-fail"] + +[tool.pytest.ini_options] +pythonpath = [ + "src" +] + +[tool.coverage.report] +skip_covered = true +show_missing = true +exclude_lines = [ + "pragma: no cover", + "raise NotImplementedError", + "assert False", +] \ No newline at end of file diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 0000000..45a15b1 --- /dev/null +++ 
def node_degree_summary(
    self, df: pd.DataFrame, return_relation_list: bool = False
) -> pd.DataFrame:
    """
    For each entity, this function computes the number of edges having it as a head
    (head-degree, or out-degree), as a tail (tail-degree, or in-degree)
    or one of the two (total-degree) in the Knowledge Graph.
    The in-going and out-going relation types are also identified.

    The output dataframe is indexed on the IDs of the graph entities
    (all integers from 0 to the largest ID found in columns `h`, `t`).

    :param df: A graph represented as a pd.DataFrame.
        Must contain at least three columns `h`, `r`, `t`.
    :param return_relation_list: If True, return the list of unique relations going
        in/out of an entity. WARNING: expensive for large graphs.

    :return: The results dataframe, indexed over the same entity ID `e` used in df,
        with columns:

        - **h_degree** (int): Number of triples with head entity `e`.
        - **t_degree** (int): Number of triples with tail entity `e`.
        - **tot_degree** (int): Number of triples with head entity `e` or tail entity `e`.
        - **h_unique_rel** (int): Number of distinct relation types
          among edges with head entity `e`.
        - **h_rel_list** (array): Unique relation types among edges
          with head entity `e` (empty if `e` is never used as head).
          Only if ``return_relation_list=True``.
        - **t_unique_rel** (int): Number of distinct relation types
          among edges with tail entity `e`.
        - **t_rel_list** (array): Unique relation types among edges
          with tail entity `e` (empty if `e` is never used as tail).
          Only if ``return_relation_list=True``.
        - **n_loops** (int): number of loops around entity `e`.
    """
    n_entity = df[["h", "t"]].max().max() + 1
    h_rel_list = {"h_rel_list": ("r", "unique")} if return_relation_list else {}
    t_rel_list = {"t_rel_list": ("r", "unique")} if return_relation_list else {}
    # head statistics, expanded to one row per entity ID
    # (entities never used as head get NaN)
    nodes = pd.DataFrame(
        df.groupby("h").agg(
            h_degree=("r", "count"), h_unique_rel=("r", "nunique"), **h_rel_list  # type: ignore
        ),
        index=np.arange(n_entity),
    )
    # tail statistics
    nodes = nodes.merge(
        df.groupby("t").agg(
            t_degree=("r", "count"), t_unique_rel=("r", "nunique"), **t_rel_list  # type: ignore
        ),
        left_index=True,
        right_index=True,
        how="left",
    )
    # loops (h == t) are counted in both h_degree and t_degree
    nodes = nodes.merge(
        df[df.h == df.t].groupby("h").agg(n_loops=("r", "count")),
        left_index=True,
        right_index=True,
        how="left",
    )
    count_cols = ["h_degree", "h_unique_rel", "t_degree", "t_unique_rel", "n_loops"]
    nodes[count_cols] = nodes[count_cols].fillna(0).astype(int)
    # subtract loops to avoid double-counting them in the total degree
    nodes["tot_degree"] = nodes["h_degree"] + nodes["t_degree"] - nodes["n_loops"]

    if return_relation_list:
        # entities without out-going/in-going edges are left with NaN by the
        # reindexing/merges above: replace it with an empty collection of
        # relations, of the same type as the one produced by the aggregation
        no_rel = df["r"].iloc[:0].unique()

        def _fill_missing(rels):  # type: ignore
            return rels if isinstance(rels, type(no_rel)) else no_rel.copy()

        for col in ["h_rel_list", "t_rel_list"]:
            nodes[col] = nodes[col].apply(_fill_missing)

    return nodes[
        ["h_degree", "t_degree", "tot_degree", "h_unique_rel"]
        + (["h_rel_list"] if return_relation_list else [])
        + ["t_unique_rel"]
        + (["t_rel_list"] if return_relation_list else [])
        + ["n_loops"]
    ]


def edge_degree_cardinality_summary(self, df: pd.DataFrame) -> pd.DataFrame:
    """
    For each triple, this function computes the number of edges with the same head
    (head-degree, or out-degree), the same tail (tail-degree, or in-degree)
    or one of the two (total-degree) in the Knowledge Graph.
    Based on entity degrees, each triple is classified as either one-to-one
    (out-degree=in-degree=1), one-to-many (out-degree>1, in-degree=1),
    many-to-one (out-degree=1, in-degree>1) or many-to-many
    (in-degree>1, out-degree>1).

    The output dataframe maintains the same indexing and ordering of triples
    as the input one.

    :param df: A graph represented as a pd.DataFrame.
        Must contain at least three columns `h`, `r`, `t`.

    :return: The results dataframe. Contains the following columns
        (in addition to `h`, `r`, `t` in ``df``):

        - **h_unique_rel** (int): Number of distinct relation types
          among edges with head entity h.
        - **h_degree** (int): Number of triples with head entity h.
        - **h_degree_same_rel** (int): Number of triples with head entity h
          and relation type r.
        - **t_unique_rel** (int): Number of distinct relation types
          among edges with tail entity t.
        - **t_degree** (int): Number of triples with tail entity t.
        - **t_degree_same_rel** (int): Number of triples with tail entity t
          and relation type r.
        - **tot_degree** (int): Number of triples with head entity h or
          tail entity t.
        - **tot_degree_same_rel** (int): Number of triples with head entity h or
          tail entity t, and relation type r.
        - **triple_cardinality** (str): cardinality type of the edge
          (one of "1:1", "1:M", "M:1", "M:M").
        - **triple_cardinality_same_rel** (str): cardinality type of the edge in
          the subgraph of edges with relation type r.
    """
    gr_by_h_count = df.groupby("h", as_index=False).agg(
        h_unique_rel=("r", "nunique"), h_degree=("t", "count")
    )
    gr_by_hr_count = df.groupby(["h", "r"], as_index=False).agg(
        h_degree_same_rel=("t", "count")
    )
    gr_by_t_count = df.groupby("t", as_index=False).agg(
        t_unique_rel=("r", "nunique"), t_degree=("h", "count")
    )
    gr_by_rt_count = df.groupby(["r", "t"], as_index=False).agg(
        t_degree_same_rel=("h", "count")
    )

    # left merges on unique right keys: one output row per input triple,
    # in the same order as df (but on a fresh RangeIndex)
    df_res = df.merge(gr_by_h_count, on=["h"], how="left")
    df_res = df_res.merge(gr_by_hr_count, on=["h", "r"], how="left")
    df_res = df_res.merge(gr_by_t_count, on=["t"], how="left")
    df_res = df_res.merge(gr_by_rt_count, on=["t", "r"], how="left")
    # compute number of parallel edges to avoid double-counting them
    # in total degree
    num_parallel = df_res[["h", "t"]].merge(
        df.groupby(["h", "t"], as_index=False).agg(n_parallel=("r", "count")),
        on=["h", "t"],
        how="left",
    )
    df_res["tot_degree"] = (
        df_res.h_degree + df_res.t_degree - num_parallel.n_parallel
    )
    # when restricting to the relation type, there is only one edge
    # (the edge itself) that is double-counted
    df_res["tot_degree_same_rel"] = (
        df_res.h_degree_same_rel + df_res.t_degree_same_rel - 1
    )

    # check if the values in the pair (h_degree, t_degree) are =1 or >1
    # to determine the edge cardinality
    legend = {
        0: "M:M",
        1: "1:M",
        2: "M:1",
        3: "1:1",
    }
    for suffix in ["", "_same_rel"]:
        edge_type = 2 * (df_res["h_degree" + suffix] == 1) + (
            df_res["t_degree" + suffix] == 1
        )
        df_res["triple_cardinality" + suffix] = edge_type.map(legend)

    # DataFrame.merge discards the index of the input: restore it, so that
    # the output is indexed like df
    df_res.index = df.index
    return df_res
+ - **has_inverse** (bool): True if the graph contains one or more triples + (t, r', h) with ``r' != r``. + - **n_inverse_relations** (int): The number of inverse relations r'. + - **inverse_edge_types** (list): All relations r' (including r if the edge + is symmetric) such that (t, r', h) is in the graph. + - **has_inference** (bool): True if the graph contains one or more triples + (h, r', t) with ``r' != r``. + - **n_inference_relations** (int): The number of inference relations r'. + - **inference_edge_types** (list): All relations r' (including r) such that + (h, r', t) is in the graph. + - **has_composition** (bool): True if the graph contains one or more triangles + supported on the edge: (h, r1, x) + (x, r2, t). + - **n_triangles** (int): The number of triangles. + - **has_undirected_composition** (bool): True if the graph contains one or more + undirected triangles supported on the edge. + - **n_undirected_triangles** (int): The number of undirected triangles + (considering all edges as bidirectional). + - **metapath_list** (list): The list of unique metapaths "r1-r2" + for the directed triangles. 
+ """ + # symmetry-asymmetry + # edges with h/t switched + df_inv = df.reindex(columns=["t", "r", "h"]).rename( + columns={"t": "h", "r": "r", "h": "t"} + ) + df_res = pd.DataFrame({"h": df.h, "r": df.r, "t": df.t, "is_symmetric": False}) + df_res.loc[ + df.reset_index().merge(df_inv)["index"], + "is_symmetric", + ] = True + # loops are treated separately + df_res["is_loop"] = df_res.h == df_res.t + df_res.loc[df_res.h == df_res.t, "is_symmetric"] = False + + # inverse + unique_inv_r_by_ht = df_inv.groupby(["h", "t"], as_index=False).agg( + inverse_edge_types=("r", list), + ) + df_res = df_res.merge( + unique_inv_r_by_ht, left_on=["h", "t"], right_on=["h", "t"], how="left" + ) + df_res["inverse_edge_types"] = df_res["inverse_edge_types"].apply( + lambda agg: agg if isinstance(agg, list) else [] + ) + # if the edge (h,r,t) is symmetric or loop, we do not consider the relation + # r as a proper inverse + df_res["n_inverse_relations"] = ( + df_res.inverse_edge_types.str.len() - df_res.is_symmetric - df_res.is_loop + ) + df_res["n_inverse_relations"] = ( + df_res["n_inverse_relations"].fillna(0).astype(int) + ) + df_res["has_inverse"] = df_res["n_inverse_relations"] > 0 + + # inference + edges_between_ht = unique_inv_r_by_ht.reindex( + columns=["t", "h", "inverse_edge_types"] + ).rename( + columns={"t": "h", "h": "t", "inverse_edge_types": "inference_edge_types"} + ) + df_res = df_res.merge( + edges_between_ht, left_on=["h", "t"], right_on=["h", "t"], how="left" + ) + # inference_edge_types always contains the edge itself, which we need to drop + df_res["n_inference_relations"] = df_res.inference_edge_types.str.len() - 1 + df_res["has_inference"] = df_res["n_inference_relations"] > 0 + + # composition & metapaths + # discard loops as edges of a triangle + df_wo_loops = df[df.h != df.t] + if return_metapath_list: + # 2-hop paths + df_bridges = df_wo_loops.merge( + df_wo_loops, left_on="t", right_on="h", how="inner" + ) + df_triangles = df_wo_loops.merge( + df_bridges, 
left_on=["h", "t"], right_on=["h_x", "t_y"], how="inner" + ) + df_triangles["metapath"] = ( + df_triangles["r_x"].astype(str) + "-" + df_triangles["r_y"].astype(str) + ) + grouped_triangles = df_triangles.groupby( + ["h", "r", "t"], as_index=False + ).agg( + n_triangles=("metapath", "count"), metapath_list=("metapath", "unique") + ) + df_res = df_res.merge( + grouped_triangles, + left_on=["h", "r", "t"], + right_on=["h", "r", "t"], + how="left", + ) + df_res["metapath_list"] = df_res["metapath_list"].apply( + lambda agg: agg.tolist() if isinstance(agg, np.ndarray) else [] + ) + df_res["n_triangles"] = df_res["n_triangles"].fillna(0).astype(int) + else: + counts = composition_count( + df_wo_loops, + chunk_size=composition_chunk_size, + workers=composition_workers, + directed=True, + ) + df_res = df_res.merge( + counts, + on=["h", "t"], + how="left", + ) + df_res["n_triangles"] = df_res["n_triangles"].fillna(0).astype(int) + + df_res["has_composition"] = df_res["n_triangles"] > 0 + + counts = composition_count( + df_wo_loops, + chunk_size=composition_chunk_size, + workers=composition_workers, + directed=False, + ) + df_res = df_res.merge( + counts.rename(columns={"n_triangles": "n_undirected_triangles"}), + on=["h", "t"], + how="left", + ) + df_res["n_undirected_triangles"] = ( + df_res["n_undirected_triangles"].fillna(0).astype(int) + ) + df_res["has_undirected_composition"] = df_res["n_undirected_triangles"] > 0 + + return df_res[ + [ + "h", + "r", + "t", + "is_loop", + "is_symmetric", + "has_inverse", + "n_inverse_relations", + "inverse_edge_types", + "has_inference", + "n_inference_relations", + "inference_edge_types", + "has_composition", + "has_undirected_composition", + "n_triangles", + "n_undirected_triangles", + ] + + (["metapath_list"] if return_metapath_list else []) + ] + + def aggregate_by_relation(self, edge_topology_df: pd.DataFrame) -> pd.DataFrame: + """ + Aggregate topology metrics of all triples of the same relation type. 
def aggregate_by_relation(self, edge_topology_df: pd.DataFrame) -> pd.DataFrame:
    """
    Aggregate topology metrics of all triples of the same relation type.
    To be applied to the output dataframe of either
    :meth:`KGTopologyToolbox.edge_degree_cardinality_summary` or
    :meth:`KGTopologyToolbox.edge_pattern_summary`.

    The returned dataframe is indexed over relation type IDs, with columns
    giving the aggregated statistics of triples of the corresponding relation.
    The name of the columns is of the form ``column_name_in_input_df + suffix``.
    The aggregation is performed by returning:

    - for numerical metrics (any integer or floating point dtype): mean,
      standard deviation and quartiles
      (``suffix`` = "_mean", "_std", "_quartile1", "_quartile2", "_quartile3");
    - for boolean metrics: the fraction of triples of the relation type
      with metric = True (``suffix`` = "_frac");
    - for string metrics: for each possible label, the fraction of triples
      of the relation type with that metric value (``suffix`` = "_{label}_frac")
    - for list metrics: the unique metric values across triples of the relation
      type (``suffix`` = "_unique").

    Object columns whose first value is neither a string nor an iterable
    (or that are empty) are skipped with a printed notice.

    :param edge_topology_df: pd.DataFrame of edge topology metrics.
        Must contain at least three columns `h`, `r`, `t`.

    :return: The results dataframe. In addition to the columns with the aggregated
        metrics by relation type, it also contains columns:

        - **num_triples** (int): Number of triples for each relation type.
        - **frac_triples** (float): Fraction of overall triples represented by each
          relation type.
        - **unique_h** (int): Number of unique head entities used by triples of each
          relation type.
        - **unique_t** (int): Number of unique tail entities used by triples of each
          relation type.
    """
    df_by_r = edge_topology_df.groupby("r")
    df_res = df_by_r.agg(num_triples=("r", "count"))
    df_res["frac_triples"] = df_res["num_triples"] / edge_topology_df.shape[0]
    col: str
    for col, col_dtype in edge_topology_df.drop(columns=["r"]).dtypes.items():  # type: ignore
        if col in ("h", "t"):
            df_res[f"unique_{col}"] = df_by_r[col].nunique()
        elif pd.api.types.is_bool_dtype(col_dtype):
            # must be tested before the numeric case: bool is a numeric dtype
            df_res[f"{col}_frac"] = df_by_r[col].mean()
        elif pd.api.types.is_numeric_dtype(col_dtype):
            # covers every integer/float width (int32, uint8, float32, ...),
            # not only the platform default int/float
            grouped_col = df_by_r[col]
            df_res[f"{col}_mean"] = grouped_col.mean()
            df_res[f"{col}_std"] = grouped_col.std()
            for q in range(1, 4):
                df_res[f"{col}_quartile{q}"] = grouped_col.agg(
                    lambda x, q=q: np.quantile(x, 0.25 * q)
                )
        elif col_dtype == object:
            # the first value decides how the column is interpreted;
            # an empty dataframe has no value to inspect
            sample = edge_topology_df[col].iloc[0] if len(edge_topology_df) else None
            if isinstance(sample, str):
                for label in np.unique(edge_topology_df[col]):
                    label_count = (
                        edge_topology_df[edge_topology_df[col] == label]
                        .groupby("r")[col]
                        .count()
                    )
                    # relations where the label never occurs are missing from
                    # label_count -> NaN after alignment -> 0
                    df_res[f"{col}_{label}_frac"] = (
                        label_count / df_res["num_triples"]
                    ).fillna(0)
            elif isinstance(sample, Iterable):
                # union of all list entries of the relation, sorted and deduplicated
                unique_by_rel = {
                    rel: np.unique(
                        np.concatenate([lst for lst in values if len(lst) > 0] or [[]])
                    ).tolist()
                    for rel, values in df_by_r[col]
                }
                df_res[f"{col}_unique"] = pd.Series(unique_by_rel, dtype=object)
            else:
                print(f"Skipping column {col}: no known aggregation mode")
                continue
    return df_res
def jaccard_similarity_relation_sets(self, df: pd.DataFrame) -> pd.DataFrame:
    """
    Compute the similarity between relations defined as the Jaccard Similarity
    between sets of entities (heads and tails) for all pairs
    of relations in the graph.

    Relation IDs may be arbitrary (not necessarily contiguous or starting at 0):
    the per-relation entity sets are looked up by relation ID, not by position.

    :param df: A graph represented as a pd.DataFrame.
        Must contain at least three columns `h`, `r`, `t`.

    :return: The results dataframe, with one row per unordered pair of
        distinct relations (``r1 < r2``). Contains the following columns:

        - **r1** (int): Index of the first relation.
        - **r2** (int): Index of the second relation.
        - **num_triples_both** (int): Number of triples with relation r1/r2.
        - **frac_triples_both** (float): Fraction of triples with relation r1/r2.
        - **num_entities_both** (int): Number of unique entities (h or t) for triples
          with relation r1/r2.
        - **num_h_r1** (int): Number of unique head entities for relation r1.
        - **num_h_r2** (int): Number of unique head entities for relation r2.
        - **num_t_r1** (int): Number of unique tail entities for relation r1.
        - **num_t_r2** (int): Number of unique tail entities for relation r2.
        - **jaccard_head_head** (float): Jaccard similarity between the head set of r1
          and the head set of r2.
        - **jaccard_tail_tail** (float): Jaccard similarity between the tail set of r1
          and the tail set of r2.
        - **jaccard_head_tail** (float): Jaccard similarity between the head set of r1
          and the tail set of r2.
        - **jaccard_tail_head** (float): Jaccard similarity between the tail set of r1
          and the head set of r2.
        - **jaccard_both** (float): Jaccard similarity between the full entity set
          of r1 and r2.
    """
    # per-relation sets of unique heads / tails / all entities
    ent_unique = df.groupby("r", as_index=False).agg(
        num_triples=("r", "count"), head=("h", "unique"), tail=("t", "unique")
    )
    ent_unique["both"] = ent_unique.apply(
        lambda x: np.unique(np.concatenate([x["head"], x["tail"]])), axis=1
    )
    ent_unique["num_h"] = ent_unique["head"].str.len()
    ent_unique["num_t"] = ent_unique["tail"].str.len()
    # `ent_unique` has a positional RangeIndex; index by relation ID so that
    # `.loc[relation_id]` is correct for any set of relation IDs
    ent_by_rel = ent_unique.set_index("r")

    # all unordered pairs of distinct relations
    r_num = ent_unique[["r", "num_h", "num_t", "num_triples"]]
    df_res = pd.merge(
        r_num.rename(columns={"r": "r1"}),
        r_num.rename(columns={"r": "r2"}),
        suffixes=("_r1", "_r2"),
        how="cross",
    )
    # explicit copy: columns are added below, which must not target a view
    df_res = df_res[df_res.r1 < df_res.r2].copy()

    df_res["num_triples_both"] = df_res["num_triples_r1"] + df_res["num_triples_r2"]
    df_res["frac_triples_both"] = df_res["num_triples_both"] / df.shape[0]
    df_res["num_entities_both"] = [
        len(np.unique(np.concatenate([ents_1, ents_2])))
        for ents_1, ents_2 in zip(
            ent_by_rel.loc[df_res.r1, "both"], ent_by_rel.loc[df_res.r2, "both"]
        )
    ]
    df_res = df_res[
        [
            "r1",
            "r2",
            "num_triples_both",
            "frac_triples_both",
            "num_entities_both",
            "num_h_r1",
            "num_h_r2",
            "num_t_r1",
            "num_t_r2",
        ]
    ].copy()
    for r1_ent in ["head", "tail"]:
        for r2_ent in ["head", "tail"]:
            df_res[f"jaccard_{r1_ent}_{r2_ent}"] = [
                jaccard_similarity(a, b)
                for a, b in zip(
                    ent_by_rel.loc[df_res.r1, r1_ent],
                    ent_by_rel.loc[df_res.r2, r2_ent],
                )
            ]
    df_res["jaccard_both"] = [
        jaccard_similarity(a, b)
        for a, b in zip(
            ent_by_rel.loc[df_res.r1, "both"], ent_by_rel.loc[df_res.r2, "both"]
        )
    ]
    return df_res
def relational_affinity_ingram(
    self, df: pd.DataFrame, min_max_norm: bool = False
) -> pd.DataFrame:
    """
    Compute the similarity between relations based on the approach proposed in
    InGram: Inductive Knowledge Graph Embedding via Relation Graphs,
    https://arxiv.org/abs/2305.19987.

    Only the pairs of relations with ``affinity > 0`` are shown in the
    returned dataframe.

    :param df: A graph represented as a pd.DataFrame.
        Must contain at least three columns `h`, `r`, `t`.
    :param min_max_norm: min-max normalization of edge weights. Defaults to False.
        If all affinities are equal (e.g. no two relations share an entity)
        the weights are left unnormalized instead of dividing by zero.

    :return: The results dataframe. Contains the following columns:

        - **h_relation** (int): Index of the head relation.
        - **t_relation** (int): Index of the tail relation.
        - **edge_weight** (float): Weight for the affinity between
          the head and the tail relation.
    """
    triples = df[["h", "r", "t"]]
    n_entities = int(triples[["h", "t"]].max().max()) + 1
    n_rels = int(triples.r.max()) + 1

    # number of triples for each (head, relation) pair ...
    hr_freqs = triples.groupby(["h", "r"], as_index=False).count()
    # ... normalized by global h frequency
    hr_freqs["t"] = hr_freqs["t"] / hr_freqs.groupby("h")["t"].transform("sum")
    # number of triples for each (tail, relation) pair ...
    rt_freqs = triples.groupby(["t", "r"], as_index=False).count()
    # ... normalized by global t frequency
    rt_freqs["h"] = rt_freqs["h"] / rt_freqs.groupby("t")["h"].transform("sum")

    # entity x relation frequency matrices (as head / as tail)
    E_h = coo_array(
        (hr_freqs.t.to_numpy(), (hr_freqs.h.to_numpy(), hr_freqs.r.to_numpy())),
        shape=(n_entities, n_rels),
    )
    E_t = coo_array(
        (rt_freqs.h.to_numpy(), (rt_freqs.t.to_numpy(), rt_freqs.r.to_numpy())),
        shape=(n_entities, n_rels),
    )

    # relation x relation affinity; self-affinity is discarded
    A = (E_h.T @ E_h).toarray() + (E_t.T @ E_t).toarray()
    A[np.diag_indices_from(A)] = 0

    if min_max_norm:
        a_min, a_max = np.min(A), np.max(A)
        # a constant matrix would give 0/0 = NaN everywhere, and NaN entries
        # would then all be reported as "non-zero" affinities
        if a_max > a_min:
            A = (A - a_min) / (a_max - a_min)

    h_rels, t_rels = np.nonzero(A)
    return pd.DataFrame(
        {
            "h_relation": h_rels,
            "t_relation": t_rels,
            "edge_weight": A[h_rels, t_rels],
        }
    )
def jaccard_similarity(
    entities_1: NDArray[np.int32], entities_2: NDArray[np.int32]
) -> float:
    """
    Jaccard Similarity function for two sets of entities.

    Repeated IDs inside an input array are counted once. The similarity of
    two empty sets is defined as 0.

    :param entities_1: the array of IDs for the first set of entities.
    :param entities_2: the array of IDs for the second set of entities.

    :return: Jaccard Similarity score for two sets of entities.
    """
    # deduplicate first: the union size below is derived from the set sizes
    set_1 = np.unique(entities_1)
    set_2 = np.unique(entities_2)
    intersection = len(np.intersect1d(set_1, set_2, assume_unique=True))
    union = len(set_1) + len(set_2) - intersection
    if union == 0:
        # both sets are empty: avoid 0/0
        return 0.0
    return float(intersection / union)
def _composition_count_worker(
    adj_csr: csr_array, adj_csc: csc_array, tail_shift: int = 0
) -> pd.DataFrame:
    """Count the 2-hop paths closing each edge, for one chunk of tail columns.

    :param adj_csr: full adjacency matrix (rows = heads, columns = tails).
    :param adj_csc: chunk of consecutive columns of the adjacency matrix.
    :param tail_shift: index of the first column of the chunk in the full matrix.

    :return: Dataframe with columns `h`, `t`, `n_triangles` for the edges
        of the chunk supporting at least one 2-hop path.
    """
    # (adj @ adj_chunk)[h, t] = number of 2-hop paths from h to t
    adj_2hop = adj_csr @ adj_csc
    # keep only the pairs (h, t) that are also connected by a direct edge
    adj_composition = (adj_2hop.tocsc() * (adj_csc > 0)).tocoo()
    return pd.DataFrame(
        dict(
            h=adj_composition.row,
            t=adj_composition.col + tail_shift,
            n_triangles=adj_composition.data,
        )
    )


def composition_count(
    df: pd.DataFrame, chunk_size: int, workers: int, directed: bool = True
) -> pd.DataFrame:
    """A helper function to compute the composition count of a graph.

    :param df: A graph represented as a pd.DataFrame. Must contain the columns
        `h` and `t`; any other column is ignored.
        No self-loops should be present in the graph.
    :param chunk_size: Size of chunks of columns of the adjacency matrix to be
        processed together.
    :param workers: Number of workers processing chunks concurrently
    :param directed: Boolean flag. If false, bidirectional edges are considered for
        triangles by adding the adjacency matrix and its transposed. Defaults to True.

    :return: The results dataframe. Contains the following columns:

        - **h** (int): Index of the head entity.
        - **t** (int): Index of the tail entity.
        - **n_triangles** (int): Number of compositions for the (h, t) edge.
    """
    if len(df) == 0:
        # nothing to count (and no entity ID to size the adjacency matrix with)
        return pd.DataFrame(
            {
                "h": np.array([], dtype=np.int64),
                "t": np.array([], dtype=np.int64),
                "n_triangles": np.array([], dtype=np.uint16),
            }
        )

    heads = df["h"].to_numpy()
    tails = df["t"].to_numpy()
    # size the matrix from entity IDs only: other columns (e.g. `r`) must not
    # inflate it, and may not even be numeric
    n_nodes = int(max(heads.max(), tails.max())) + 1
    # NOTE(review): path counts are accumulated as uint16 and would wrap
    # around above 65535 - confirm this range is sufficient for the target graphs
    adj = coo_array(
        (np.ones(len(df)), (heads, tails)),
        shape=(n_nodes, n_nodes),
    ).astype(np.uint16)
    if not directed:
        adj = adj + adj.T
    n_cols = adj.shape[1]
    adj_csr = adj.tocsr()
    adj_csc = adj.tocsc()
    # (column offset, chunk of columns) for each chunk of the adjacency matrix
    chunks = [
        (start, adj_csc[:, start : min(start + chunk_size, n_cols)])
        for start in range(0, n_cols, chunk_size)
    ]

    if len(chunks) > 1 and workers > 1:
        with Pool(workers) as pool:
            df_composition_list = pool.starmap(
                _composition_count_worker,
                ((adj_csr, chunk, start) for start, chunk in chunks),
            )
    else:
        df_composition_list = [
            _composition_count_worker(adj_csr, chunk, start)
            for start, chunk in chunks
        ]

    return pd.concat(df_composition_list, ignore_index=True)
+ +import numpy as np +import pandas as pd +import pytest + +from kg_topology_toolbox import KGTopologyToolbox + +df = pd.DataFrame( + dict( + h=[0, 0, 0, 1, 2, 2, 1, 2], + t=[1, 1, 2, 2, 0, 0, 1, 2], + r=[0, 1, 0, 1, 0, 1, 1, 0], + ) +) + +tools = KGTopologyToolbox() + + +@pytest.mark.parametrize("return_metapath_list", [True, False]) +def test_small_graph_metrics(return_metapath_list: bool) -> None: + # Define a small graph with all the features tested by + # the edge_topology_toolbox + + # entity degrees statistics + res = tools.edge_degree_cardinality_summary(df) + assert np.allclose(res["h_unique_rel"], [2, 2, 2, 1, 2, 2, 1, 2]) + assert np.allclose(res["h_degree"], [3, 3, 3, 2, 3, 3, 2, 3]) + assert np.allclose(res["h_degree_same_rel"], [2, 1, 2, 2, 2, 1, 2, 2]) + assert np.allclose(res["t_unique_rel"], [2, 2, 2, 2, 2, 2, 2, 2]) + assert np.allclose(res["t_degree"], [3, 3, 3, 3, 2, 2, 3, 3]) + assert np.allclose(res["t_degree_same_rel"], [1, 2, 2, 1, 1, 1, 2, 2]) + assert np.allclose(res["tot_degree"], [4, 4, 5, 4, 3, 3, 4, 5]) + assert np.allclose(res["tot_degree_same_rel"], [2, 2, 3, 2, 2, 1, 3, 3]) + + # triple cardinality + assert res["triple_cardinality"].tolist() == [ + "M:M", + "M:M", + "M:M", + "M:M", + "M:M", + "M:M", + "M:M", + "M:M", + ] + assert res["triple_cardinality_same_rel"].tolist() == [ + "1:M", + "M:1", + "M:M", + "1:M", + "1:M", + "1:1", + "M:M", + "M:M", + ] + + # relation pattern symmetry + res = tools.edge_pattern_summary(df, return_metapath_list=return_metapath_list) + assert np.allclose( + res["is_loop"], [False, False, False, False, False, False, True, True] + ) + assert np.allclose( + res["is_symmetric"], [False, False, True, False, True, False, False, False] + ) + # relation pattern inverse + assert np.allclose( + res["has_inverse"], [False, False, True, False, False, True, False, False] + ) + assert np.allclose(res["n_inverse_relations"], [0, 0, 1, 0, 0, 1, 0, 0]) + # relation pattern inference + assert np.allclose( + 
res["has_inference"], [True, True, False, False, True, True, False, False] + ) + assert np.allclose(res["n_inference_relations"], [1, 1, 0, 0, 1, 1, 0, 0]) + + # relation_pattern_composition & metapaths + assert np.allclose( + res["has_composition"], [False, False, True, False, False, False, False, False] + ) + assert np.allclose(res["n_triangles"], [0, 0, 2, 0, 0, 0, 0, 0]) + assert np.allclose(res["n_undirected_triangles"], [3, 3, 2, 6, 2, 2, 0, 0]) + if return_metapath_list: + assert res["metapath_list"][2] == ["0-1", "1-1"] diff --git a/tests/test_node_topology_toolbox.py b/tests/test_node_topology_toolbox.py new file mode 100644 index 0000000..371180c --- /dev/null +++ b/tests/test_node_topology_toolbox.py @@ -0,0 +1,43 @@ +# Copyright (c) 2023 Graphcore Ltd. All rights reserved. + +import numpy as np +import pandas as pd +import pytest + +from kg_topology_toolbox import KGTopologyToolbox + +df = pd.DataFrame( + dict( + h=[0, 0, 0, 1, 2, 2, 2], + t=[1, 1, 2, 2, 0, 0, 2], + r=[0, 1, 0, 1, 0, 1, 1], + ) +) + +tools = KGTopologyToolbox() + + +@pytest.mark.parametrize("return_relation_list", [True, False]) +def test_small_graph_metrics(return_relation_list: bool) -> None: + # Define a small graph with all the features tested by + # the node_topology_toolbox + + # entity degrees statistics + res = tools.node_degree_summary(df, return_relation_list=return_relation_list) + assert np.allclose(res["h_degree"], [3, 1, 3]) + assert np.allclose(res["t_degree"], [2, 2, 3]) + assert np.allclose(res["tot_degree"], [5, 3, 5]) + assert np.allclose(res["h_unique_rel"], [2, 1, 2]) + assert np.allclose(res["t_unique_rel"], [2, 2, 2]) + assert np.allclose(res["n_loops"], [0, 0, 1]) + if return_relation_list: + assert [x.tolist() for x in res["h_rel_list"].to_list()] == [ + [0, 1], + [1], + [0, 1], + ] + assert [x.tolist() for x in res["t_rel_list"].to_list()] == [ + [0, 1], + [0, 1], + [0, 1], + ] diff --git a/tests/test_relation_topology_toolbox.py 
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.

from typing import List

import numpy as np
import pandas as pd
import pytest

from kg_topology_toolbox import KGTopologyToolbox

# Small graph on five nodes and two relation types, exercising all the
# features covered by the relation-level topology metrics.
df = pd.DataFrame(
    {
        "h": [0, 0, 0, 1, 2, 2, 2, 3, 3, 4],
        "t": [1, 1, 2, 2, 0, 3, 4, 2, 4, 3],
        "r": [0, 1, 0, 1, 0, 1, 1, 0, 0, 1],
    }
)

tools = KGTopologyToolbox()


def test_small_graph_metrics() -> None:
    """Check the per-relation aggregation of both edge-level summaries."""
    degree_stats = tools.aggregate_by_relation(
        tools.edge_degree_cardinality_summary(df)
    )
    pattern_stats = tools.aggregate_by_relation(
        tools.edge_pattern_summary(df, return_metapath_list=True)
    )

    # expected values, one entry per relation type (r=0, r=1)
    expected_degree_stats = {
        "num_triples": [5, 5],
        "frac_triples": [0.5, 0.5],
        "unique_h": [3, 4],
        "unique_t": [4, 4],
        # entity_degree_statistics
        "h_degree_mean": [2.6, 2.2],
        "t_degree_mean": [2.2, 2.2],
        "tot_degree_mean": [3.6, 3.2],
        # triple_relation_cardinality
        "triple_cardinality_1:M_frac": [1 / 5, 0],
        "triple_cardinality_M:1_frac": [0, 2 / 5],
        "triple_cardinality_M:M_frac": [4 / 5, 3 / 5],
        "triple_cardinality_same_rel_1:1_frac": [1 / 5, 2 / 5],
        "triple_cardinality_same_rel_1:M_frac": [2 / 5, 1 / 5],
        "triple_cardinality_same_rel_M:1_frac": [0, 1 / 5],
        "triple_cardinality_same_rel_M:M_frac": [2 / 5, 1 / 5],
    }
    for column, expected in expected_degree_stats.items():
        assert np.allclose(degree_stats[column], expected)

    expected_pattern_fractions = {
        # relation_pattern_loop
        "is_loop_frac": [0, 0],
        # relation_pattern_symmetric
        "is_symmetric_frac": [2 / 5, 0],
        # relation_pattern_inverse
        "has_inverse_frac": [2 / 5, 2 / 5],
        # relation_pattern_composition
        "has_composition_frac": [2 / 5, 2 / 5],
        "has_undirected_composition_frac": [1, 1],
        # relation_pattern_inference
        "has_inference_frac": [1 / 5, 1 / 5],
    }
    for column, expected in expected_pattern_fractions.items():
        assert np.allclose(pattern_stats[column], expected)

    # list-valued aggregations, one expected list per relation type
    expected_pattern_lists = {
        "inverse_edge_types_unique": ([0, 1], [0]),
        "metapath_list_unique": (["0-1", "1-1"], ["1-0", "1-1"]),
        "inference_edge_types_unique": ([0, 1], [0, 1]),
    }
    for column, per_relation in expected_pattern_lists.items():
        for relation, expected_list in enumerate(per_relation):
            assert pattern_stats[column][relation] == expected_list


def test_jaccard_similarity() -> None:
    """Check the Jaccard similarities for the only pair of relations (0, 1)."""
    # jaccard_similarity_relation_sets
    similarities = tools.jaccard_similarity_relation_sets(df)
    expected_similarities = {
        "jaccard_head_head": [2 / 5],
        "jaccard_tail_tail": [3 / 5],
        "jaccard_head_tail": [2 / 5],
        "jaccard_tail_head": [1],
        "jaccard_both": [1],
    }
    for column, expected in expected_similarities.items():
        assert np.allclose(similarities[column], expected)


@pytest.mark.parametrize(
    "min_max_norm,expected", [(True, [1, 1]), (False, [7 / 6, 7 / 6])]
)
def test_ingram_affinity(min_max_norm: bool, expected: List[float]) -> None:
    """Check the InGram affinity weights, with and without normalization."""
    # relational_affinity_ingram
    affinity = tools.relational_affinity_ingram(df, min_max_norm)
    assert np.allclose(affinity["edge_weight"], expected)