From aca60a858c76fc6a87d6164c112f3f77191759e9 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Thu, 29 Jun 2023 22:50:22 +0200 Subject: [PATCH] Update notebook --- notebooks/Process CLO Mappings.ipynb | 1101 ++- notebooks/clo.sssom.tsv | 10926 ------------------------- 2 files changed, 673 insertions(+), 11354 deletions(-) delete mode 100644 notebooks/clo.sssom.tsv diff --git a/notebooks/Process CLO Mappings.ipynb b/notebooks/Process CLO Mappings.ipynb index a0bb717f..a2d9cdfd 100644 --- a/notebooks/Process CLO Mappings.ipynb +++ b/notebooks/Process CLO Mappings.ipynb @@ -10,7 +10,12 @@ "\n", "The [Cell Line Ontology (CLO)](https://bioregistry.io/registry/clo) is a detailed resouce, however it does not follow standard OBO modeling pattern for cross-references that either uses `oboInOwl:hasDbXref` or a SKOS and pointing to a single CURIE encoded as a string. Instead, it uses `rdfs:seeAlso` with a combination of non-standard CURIEs that are either comma or semi-colon delimited.\n", "\n", - "This notebook attempts to unpack and operationalize these cross-references." 
+ "This notebook attempts to unpack and operationalize these cross-references.\n", + "\n", + "See also:\n", + "\n", + "- https://github.com/CLO-ontology/CLO/issues/103\n", + "- https://gist.github.com/cthoyt/a91ae12a94c7e1647e9d9d8fa61e80ce" ] }, { @@ -29,51 +34,34 @@ "import pyobo\n", "from tqdm.auto import tqdm\n", "\n", + "from semra.sources.clo import get_clo_mappings\n", + "from semra.api import summarize_prefixes, get_many_to_many, keep_prefixes, filter_mappings\n", + "from semra.io import get_sssom_df\n", + "\n", "from biomappings import PredictionTuple\n", "from biomappings.resources import PREDICTIONS_HEADER, append_prediction_tuples" ] }, { - "cell_type": "code", - "execution_count": 2, - "id": "3f16ace2", - "metadata": {}, - "outputs": [], - "source": [ - "graph = bioontologies.get_obograph_by_prefix(\"clo\", check=False).guess(\"clo\")" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "fe2debdb", + "cell_type": "markdown", + "id": "f089bee6", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'2.1.178'" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], "source": [ - "graph.version" + "## Extraction and Processing\n", + "\n", + "The following cell uses [this script](https://github.com/biopragmatics/semra/blob/main/src/semra/sources/clo.py) in [SeMRA](https://github.com/biopragmatics/semra) to extract cross-references from CLO." 
] }, { "cell_type": "code", - "execution_count": 4, - "id": "3c724323", + "execution_count": 2, + "id": "d0201639", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "09fc001c94044e6c9a59baae1426e39e", + "model_id": "ba63f03a8b5c45d5a2cf130ce6ce464b", "version_major": 2, "version_minor": 0 }, @@ -84,39 +72,6 @@ "metadata": {}, "output_type": "display_data" }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "ERROR: [2023-06-29 00:35:00] pyobo.api.names - [reo vNone] could not load\n", - "Traceback (most recent call last):\n", - " File \"/Users/cthoyt/dev/pyobo/src/pyobo/api/names.py\", line 132, in get_id_name_mapping\n", - " return _get_id_name_mapping()\n", - " File \"/Users/cthoyt/.virtualenvs/indra/lib/python3.10/site-packages/pystow/cache.py\", line 83, in _wrapped\n", - " rv = func()\n", - " File \"/Users/cthoyt/dev/pyobo/src/pyobo/api/names.py\", line 128, in _get_id_name_mapping\n", - " ontology = get_ontology(prefix, force=force, strict=strict, version=version)\n", - " File \"/Users/cthoyt/dev/pyobo/src/pyobo/identifier_utils.py\", line 115, in _wrapped\n", - " return f(norm_prefix, *args, **kwargs)\n", - " File \"/Users/cthoyt/dev/pyobo/src/pyobo/getters.py\", line 126, in get_ontology\n", - " robot.convert(path, _converted_obo_path, check=robot_check)\n", - " File \"/Users/cthoyt/dev/bioontologies/src/bioontologies/robot.py\", line 449, in convert\n", - " ret = check_output( # noqa:S603\n", - " File \"/usr/local/Cellar/python@3.10/3.10.11/Frameworks/Python.framework/Versions/3.10/lib/python3.10/subprocess.py\", line 421, in check_output\n", - " return run(*popenargs, stdout=PIPE, timeout=timeout, check=True,\n", - " File \"/usr/local/Cellar/python@3.10/3.10.11/Frameworks/Python.framework/Versions/3.10/lib/python3.10/subprocess.py\", line 526, in run\n", - " raise CalledProcessError(retcode, process.args,\n", - "subprocess.CalledProcessError: Command '['java', '-jar', 
'/Users/cthoyt/.data/robot/robot.jar', 'merge', '-i', '/Users/cthoyt/.data/pyobo/raw/reo/reo_reasoned_9-6-12.owl', 'convert', '-o', '/Users/cthoyt/.data/pyobo/raw/reo/reo_reasoned_9-6-12.obo']' returned non-zero exit status 1.\n", - "WARNING: [2023-06-29 00:35:00] pyobo.api.names - [reo] no results produced with \n", - "WARNING: [2023-06-29 00:35:00] pyobo.api.names - [cldb] unable to look up results with \n", - "WARNING: [2023-06-29 00:35:00] pyobo.api.names - [chembl.cell] unable to look up results with \n", - "WARNING: [2023-06-29 00:35:00] pyobo.api.names - [cosmic.cell] unable to look up results with \n", - "WARNING: [2023-06-29 00:35:00] pyobo.api.names - [atcc] unable to look up results with \n", - "WARNING: [2023-06-29 00:35:00] pyobo.api.names - [ecacc] unable to look up results with \n", - "WARNING: [2023-06-29 00:35:00] pyobo.api.names - [dsmz] unable to look up results with \n", - "WARNING: [2023-06-29 00:35:03] pyobo.api.names - [jcrb] unable to look up results with \n" - ] - }, { "name": "stdout", "output_type": "stream", @@ -126,20 +81,7 @@ "CLO:0002336 unparsed: \u001b[31mCCL-120\u001b[0m from line:\n", " CCL-120\n", "CLO:0002406 invalid: \u001b[33mdsmz:ACC360\u001b[0m from line:\n", - " DSMZ: ACC 360,COSMIC ID:910568; DSMZ ACC 360\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "WARNING: [2023-06-29 00:35:06] pyobo.api.names - [hms.lincs.cell] unable to look up results with \n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ + " DSMZ: ACC 360,COSMIC ID:910568; DSMZ ACC 360\n", "CLO:0002557 invalid: \u001b[33mcldb:Cl847\u001b[0m from line:\n", " HyperCLDB: Cl847\n", "CLO:0002593 unparsed: \u001b[31m92031916\u001b[0m from line:\n", @@ -195,20 +137,7 @@ "CLO:0008512 unparsed: \u001b[31m94060601\u001b[0m from line:\n", " ECACC: 94060601,94060601; COSMIC ID:910546\n", "CLO:0008870 unparsed: \u001b[31mCCL-113\u001b[0m from line:\n", - " CCL-113\n" - ] - }, - { - "name": "stderr", - "output_type": 
"stream", - "text": [ - "WARNING: [2023-06-29 00:35:20] pyobo.api.names - [pubchem.bioassay] unable to look up results with \n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ + " CCL-113\n", "CLO:0009038 unparsed: \u001b[31mATCCHTB-88\u001b[0m from line:\n", " ATCC: HTB-88,COSMIC ID:909720; ATCC HTB-88\n", "CLO:0009042 invalid: \u001b[33matcc:COSMICID:909725\u001b[0m from line:\n", @@ -242,20 +171,7 @@ "CLO:0009973 unparsed: \u001b[31mnotavailable\u001b[0m from line:\n", " BWH: 621-101,not available\n", "CLO:0009998 invalid: \u001b[33mjcrb:JCRB1080.0\u001b[0m from line:\n", - " JHSF: JCRB1080.0\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "WARNING: [2023-06-29 00:35:32] pyobo.api.names - [iclc] unable to look up results with \n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ + " JHSF: JCRB1080.0\n", "CLO:0037013 unparsed: \u001b[31mBroadInstitute:contactprovider\u001b[0m from line:\n", " Broad Institute: contact provider\n", "CLO:0037033 unparsed: \u001b[31mDFCI:n/a-1\u001b[0m from line:\n", @@ -321,42 +237,7 @@ "CLO:0037186 invalid: \u001b[33mjcrb:JCRB:JCRB0158.2\u001b[0m from line:\n", " JCRB: JCRB0158.2\n", "CLO:0037189 unparsed: \u001b[31mAsterandbio:\u001b[0m from line:\n", - " Asterandbio:\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "ERROR: [2023-06-29 00:35:54] pyobo.api.names - [bao v2.8.4] could not load\n", - "Traceback (most recent call last):\n", - " File \"/Users/cthoyt/dev/pyobo/src/pyobo/api/names.py\", line 132, in get_id_name_mapping\n", - " return _get_id_name_mapping()\n", - " File \"/Users/cthoyt/.virtualenvs/indra/lib/python3.10/site-packages/pystow/cache.py\", line 83, in _wrapped\n", - " rv = func()\n", - " File \"/Users/cthoyt/dev/pyobo/src/pyobo/api/names.py\", line 128, in _get_id_name_mapping\n", - " ontology = get_ontology(prefix, force=force, strict=strict, version=version)\n", - " File 
\"/Users/cthoyt/dev/pyobo/src/pyobo/identifier_utils.py\", line 115, in _wrapped\n", - " return f(norm_prefix, *args, **kwargs)\n", - " File \"/Users/cthoyt/dev/pyobo/src/pyobo/getters.py\", line 126, in get_ontology\n", - " robot.convert(path, _converted_obo_path, check=robot_check)\n", - " File \"/Users/cthoyt/dev/bioontologies/src/bioontologies/robot.py\", line 449, in convert\n", - " ret = check_output( # noqa:S603\n", - " File \"/usr/local/Cellar/python@3.10/3.10.11/Frameworks/Python.framework/Versions/3.10/lib/python3.10/subprocess.py\", line 421, in check_output\n", - " return run(*popenargs, stdout=PIPE, timeout=timeout, check=True,\n", - " File \"/usr/local/Cellar/python@3.10/3.10.11/Frameworks/Python.framework/Versions/3.10/lib/python3.10/subprocess.py\", line 526, in run\n", - " raise CalledProcessError(retcode, process.args,\n", - "subprocess.CalledProcessError: Command '['java', '-jar', '/Users/cthoyt/.data/robot/robot.jar', 'merge', '-i', '/Users/cthoyt/.data/pyobo/raw/bao/2.8.4/bao_complete.owl', 'convert', '-o', '/Users/cthoyt/.data/pyobo/raw/bao/2.8.4/bao_complete.obo']' returned non-zero exit status 1.\n", - "WARNING: [2023-06-29 00:35:54] pyobo.api.names - [bao] no results produced with \n", - "WARNING: [2023-06-29 00:35:54] pyobo.api.names - [thermofisher] unable to look up results with \n", - "WARNING: [2023-06-29 00:35:54] pyobo.api.names - [biosample] unable to look up results with \n", - "WARNING: [2023-06-29 00:35:54] pyobo.api.names - [ebisc] unable to look up results with \n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ + " Asterandbio:\n", "CLO:0037271 invalid: \u001b[33mbto:BAO_0030063\u001b[0m from line:\n", " BTO: BAO_0030063\n", "CLO:0050067 unparsed: \u001b[31m:MGH-889\u001b[0m from line:\n", @@ -388,150 +269,37 @@ "CLO:0051539 unparsed: \u001b[31mRCB0478\u001b[0m from line:\n", " RIKEN: COSMIC ID:910929; Riken RCB0478,RCB0478\n" ] + }, + { + "data": { + "text/plain": [ + "10925" + ] + }, + 
"execution_count": 2, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "counter = Counter()\n", - "xx1 = defaultdict(lambda: defaultdict(list))\n", - "xx2 = defaultdict(lambda: defaultdict(list))\n", - "\n", - "\n", - "def split(s):\n", - " return [\n", - " p2.replace(\" \", \"\").rstrip(\")\")\n", - " for p1 in s.strip().split(\";\")\n", - " for p2 in p1.strip().split(\",\")\n", - " ]\n", - "\n", - "\n", - "SKIP_PREFIXES = {\"omim\"}\n", - "Q_PREFIXES = {\"bao\", \"reo\", \"cellosaurus\"}\n", - "\n", - "# Accumulate CLO local unique identifiers to labels\n", - "names = {}\n", - "\n", - "fails = {}\n", - "rows = []\n", - "for node in tqdm(graph.nodes, unit_scale=True, unit=\"node\"):\n", - " if not node.id.startswith(\"http://purl.obolibrary.org/obo/CLO_\"):\n", - " continue\n", - " clo_id = node.id.removeprefix(\"http://purl.obolibrary.org/obo/CLO_\")\n", - " names[clo_id] = node.name\n", - " for p in node.properties or []:\n", - " if p.predicate_raw != \"http://www.w3.org/2000/01/rdf-schema#seeAlso\":\n", - " continue\n", - " for raw_curie in split(p.value_raw):\n", - " curie = raw_curie.removeprefix(\"rrid:\").removeprefix(\"RRID:\")\n", - " if curie.startswith(\"Sanger:COSMICID:\"):\n", - " prefix, identifier = \"cosmic.cell\", curie.removeprefix(\"Sanger:COSMICID:\")\n", - " elif curie.startswith(\"atcc:COSMICID:\"):\n", - " prefix, identifier = \"cosmic.cell\", curie.removeprefix(\"atcc:COSMICID:\")\n", - " elif curie.startswith(\"DSMZ:COSMICID:\"):\n", - " prefix, identifier = \"cosmic.cell\", curie.removeprefix(\"DSMZ:COSMICID:\")\n", - " elif curie.startswith(\"COSMIC: COSMIC ID:\"):\n", - " prefix, identifier = \"cosmic.cell\", curie.removeprefix(\"COSMIC: COSMIC ID:\")\n", - " elif curie.startswith(\"RIKEN:COSMICID:\"):\n", - " prefix, identifier = \"cosmic.cell\", curie.removeprefix(\"RIKEN:COSMICID:\")\n", - " elif curie.startswith(\"COSMICID:\"):\n", - " prefix, identifier = \"cosmic.cell\", curie.removeprefix(\"COSMICID:\")\n", 
- " elif curie.startswith(\"LINCS_HMS:\"):\n", - " prefix, identifier = \"hms.lincs.cell\", curie.removeprefix(\"LINCS_HMS:\")\n", - " elif curie.startswith(\"CHEMBL:\"):\n", - " prefix, identifier = \"chembl.cell\", curie.removeprefix(\"CHEMBL:\")\n", - " elif curie.startswith(\"ChEMBL:\"):\n", - " prefix, identifier = \"chembl.cell\", curie.removeprefix(\"ChEMBL:\")\n", - " elif curie.startswith(\"BTO_\"):\n", - " prefix, identifier = \"bto\", curie.removeprefix(\"BTO_\")\n", - " elif curie.startswith(\"CVCL_\"):\n", - " prefix, identifier = \"cellosaurus\", curie.removeprefix(\"CVCL_\")\n", - " elif curie.startswith(\"JHSF:\"):\n", - " prefix, identifier = \"jcrb\", curie.removeprefix(\"JHSF:\")\n", - " elif curie.startswith(\"CRL-\"):\n", - " prefix, identifier = \"atcc\", curie\n", - " elif curie.startswith(\"jcrb:JHSF:\"):\n", - " prefix, identifier = \"jcrb\", curie.removeprefix(\"jcrb:JHSF:\")\n", - " elif curie.startswith(\"JCRB\"):\n", - " prefix, identifier = \"jcrb\", curie\n", - " elif curie.startswith(\"JHSF:JCRB\"):\n", - " prefix, identifier = \"jcrb\", curie.removeprefix(\"JHSF:\")\n", - " elif curie.startswith(\"ATCCCRL\"):\n", - " prefix, identifier = \"atcc\", curie.removeprefix(\"ATCC\")\n", - " elif curie.startswith(\"bto:BAO_\"):\n", - " prefix, identifier = \"bao\", curie.removeprefix(\"bto:BAO_\")\n", - " elif curie.startswith(\"ACC\"):\n", - " prefix, identifier = \"dsmz\", curie\n", - " elif curie.startswith(\"DSMZACC\"):\n", - " prefix, identifier = \"dsmz\", curie.removeprefix(\"DSMZ\")\n", - " elif curie.startswith(\"dsmz:ACC\"):\n", - " prefix, identifier = \"dsmz\", \"ACC-\" + curie.removeprefix(\"dsmz:ACC\")\n", - " elif curie.startswith(\"DSMZ:ACC\"):\n", - " prefix, identifier = \"dsmz\", \"ACC-\" + curie.removeprefix(\"DSMZ:ACC\")\n", - " else:\n", - " prefix, identifier = bioregistry.parse_curie(curie)\n", - "\n", - " if prefix is None:\n", - " tqdm.write(\n", - " f\"CLO:{clo_id} unparsed: {click.style(curie, fg='red')} from 
line:\\n {p.value_raw}\"\n", - " )\n", - " continue\n", - " if prefix in SKIP_PREFIXES:\n", - " continue\n", - " if bioregistry.get_pattern(prefix) is None:\n", - " fails[prefix] = identifier\n", - " if not bioregistry.is_valid_identifier(prefix, identifier):\n", - " c = click.style(f\"{prefix}:{identifier}\", fg=\"yellow\")\n", - " tqdm.write(f\"CLO:{clo_id} invalid: {c} from line:\\n {p.value_raw}\")\n", - " continue\n", - "\n", - " rows.append(\n", - " (\n", - " bioregistry.curie_to_str(\"clo\", clo_id),\n", - " \"oboInOwl:hasDbXref\",\n", - " bioregistry.curie_to_str(prefix, identifier),\n", - " \"semapv:UnspecifiedMatching\",\n", - " node.name,\n", - " pyobo.get_name(prefix, identifier),\n", - " \"obo:clo\",\n", - " graph.version,\n", - " )\n", - " )\n", - "\n", - " counter[prefix, identifier] += 1\n", - " xx1[prefix][clo_id].append(identifier)\n", - " xx2[prefix][identifier].append(clo_id)\n", - "\n", - "\n", - "for p, i in sorted(fails.items()):\n", - " print(click.style(f\"missing pattern in {p}\", fg=\"red\"), \"example:\", i)\n", - "\n", - "mappings_df = pd.DataFrame(\n", - " rows,\n", - " columns=[\n", - " \"subject_id\",\n", - " \"predicate_id\",\n", - " \"object_id\",\n", - " \"mapping_justification\",\n", - " \"subject_label\",\n", - " \"object_label\",\n", - " \"mapping_set_id\",\n", - " \"mapping_set_version\",\n", - " ],\n", - ")\n", - "mappings_df.to_csv(\"clo.sssom.tsv\", sep=\"\\t\", index=False)" + "mappings = get_clo_mappings()\n", + "len(mappings)" ] }, { "cell_type": "markdown", - "id": "d15e8d48", + "id": "b442beab", "metadata": {}, "source": [ - "# Summary of Cross-Referenced Resources" + "## Prefix Summary\n", + "\n", + "The table below this cell summarizes all of the prefixes appearing in cross-references extracted from CLO." 
 ] }, { "cell_type": "code", - "execution_count": 5, - "id": "8a5f503c", + "execution_count": 3, + "id": "1ad34e9d", "metadata": {}, "outputs": [ { @@ -601,6 +369,11 @@ " The Cell Line Data Base (CLDB) is a reference ...\n", " \n", " \n", + " clo\n", + " Cell Line Ontology\n", + " The Cell Line Ontology is a community-based on...\n", + " \n", + " \n", " cosmic.cell\n", " COSMIC Cell Lines\n", " COSMIC, the Catalogue Of Somatic Mutations In ...\n", @@ -679,6 +452,7 @@ "cellosaurus Cellosaurus \n", "chembl.cell ChEMBL database of bioactive drug-like small m... \n", "cldb Cell Line Database \n", + "clo Cell Line Ontology \n", "cosmic.cell COSMIC Cell Lines \n", "dsmz Deutsche Sammlung von Mikroorganismen und Zell... \n", "ebisc European Bank for induced pluripotent Stem Cells \n", @@ -702,6 +476,7 @@ "cellosaurus The Cellosaurus is a knowledge resource on cel... \n", "chembl.cell Chemistry resources \n", "cldb The Cell Line Data Base (CLDB) is a reference ... \n", + "clo The Cell Line Ontology is a community-based on... \n", "cosmic.cell COSMIC, the Catalogue Of Somatic Mutations In ... \n", "dsmz The Leibniz Institute DSMZ is the most diverse... \n", "ebisc Cell line collections \n", @@ -717,24 +492,538 @@ "thermofisher ThermoFisher is a life sciences supply vendor. " ] }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "summarize_prefixes(mappings)" + ] + }, + { + "cell_type": "markdown", + "id": "61eb2d19", + "metadata": {}, + "source": [ + "Many of the resources cross-referenced by CLO aren't accessible in a structured format. Therefore, we can't programmatically look up names or synonyms. 
In some (but not all) cases, the resource has a site that can be used to manually examine information about a given record, but this ultimately leaves review very difficult.\n", + "\n", + "There might be an automated way to get the list of all resources that can be used with `pyobo.get_name`, but until that's figured out, the following is a shortlist of resources we can follow up on easily." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "d74b5151", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "812" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "DESIRED_PREFIXES = {\"bto\", \"efo\", \"mesh\", \"cellosaurus\", \"obi\", \"clo\"}\n", + "\n", + "mappings = keep_prefixes(mappings, prefixes=DESIRED_PREFIXES, progress=False)\n", + "len(mappings)" + ] + }, + { + "cell_type": "markdown", + "id": "c7f277dc", + "metadata": {}, + "source": [ + "## Identify Inconsistencies\n", + "\n", + "The following cell identifies many-to-many mappings, e.g., when a given CLO has multiple cross-references to entities in another semantic space, or vice versa." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "5125de73", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Preparing SSSOM: 0%| | 0.00/25.0 [00:00\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + 
" \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
subject_idsubject_labelpredicate_idobject_idobject_labelmapping_justificationmapping_setmapping_set_versionmapping_set_licensemapping_set_confidence
0clo:0001230HEK293oboInOwl:hasDbXrefcellosaurus:0045HEK293semapv:UnspecifiedMatchingclo2.1.178CC-BY-3.00.8
1clo:0037237293-derived celloboInOwl:hasDbXrefcellosaurus:0045HEK293semapv:UnspecifiedMatchingclo2.1.178CC-BY-3.00.8
2clo:0007050K 562 celloboInOwl:hasDbXrefcellosaurus:0004K-562semapv:UnspecifiedMatchingclo2.1.178CC-BY-3.00.8
3clo:0007059K-562 celloboInOwl:hasDbXrefcellosaurus:0004K-562semapv:UnspecifiedMatchingclo2.1.178CC-BY-3.00.8
4clo:0037163Ishikawa celloboInOwl:hasDbXrefcellosaurus:D199Ishikawa 3-H-12semapv:UnspecifiedMatchingclo2.1.178CC-BY-3.00.8
5clo:0037230Ishikawa 3-H-12 celloboInOwl:hasDbXrefcellosaurus:D199Ishikawa 3-H-12semapv:UnspecifiedMatchingclo2.1.178CC-BY-3.00.8
6clo:0037300BALL-1 celloboInOwl:hasDbXrefcellosaurus:1075BALL-1semapv:UnspecifiedMatchingclo2.1.178CC-BY-3.00.8
7clo:0051004RCB0256 celloboInOwl:hasDbXrefcellosaurus:1075BALL-1semapv:UnspecifiedMatchingclo2.1.178CC-BY-3.00.8
8clo:0051005RCB1882 celloboInOwl:hasDbXrefcellosaurus:1075BALL-1semapv:UnspecifiedMatchingclo2.1.178CC-BY-3.00.8
9clo:0037372HEK293T celloboInOwl:hasDbXrefcellosaurus:0063HEK293Tsemapv:UnspecifiedMatchingclo2.1.178CC-BY-3.00.8
10clo:0050894RCB2202 celloboInOwl:hasDbXrefcellosaurus:0063HEK293Tsemapv:UnspecifiedMatchingclo2.1.178CC-BY-3.00.8
11clo:0050405RCB2280 celloboInOwl:hasDbXrefcellosaurus:1272HCE-Tsemapv:UnspecifiedMatchingclo2.1.178CC-BY-3.00.8
12clo:0050406RCB1384 celloboInOwl:hasDbXrefcellosaurus:1272HCE-Tsemapv:UnspecifiedMatchingclo2.1.178CC-BY-3.00.8
13clo:0002585COR-L23 celloboInOwl:hasDbXrefefo:0002142CORL23semapv:UnspecifiedMatchingclo2.1.178CC-BY-3.00.8
14clo:0037287COR123 celloboInOwl:hasDbXrefefo:0002142CORL23semapv:UnspecifiedMatchingclo2.1.178CC-BY-3.00.8
15clo:0007634MDA-MB-231 celloboInOwl:hasDbXrefefo:0001209MDAMB231semapv:UnspecifiedMatchingclo2.1.178CC-BY-3.00.8
16clo:0037291MDAMB231 celloboInOwl:hasDbXrefefo:0001209MDAMB231semapv:UnspecifiedMatchingclo2.1.178CC-BY-3.00.8
17clo:0009034SK-BR-3 celloboInOwl:hasDbXrefefo:0001236SKBR3semapv:UnspecifiedMatchingclo2.1.178CC-BY-3.00.8
18clo:0037295SKBR3 celloboInOwl:hasDbXrefefo:0001236SKBR3semapv:UnspecifiedMatchingclo2.1.178CC-BY-3.00.8
19clo:0009040SK-MEL-1 celloboInOwl:hasDbXrefefo:0002332SKMEL1semapv:UnspecifiedMatchingclo2.1.178CC-BY-3.00.8
20clo:0037292SKMEL1oboInOwl:hasDbXrefefo:0002332SKMEL1semapv:UnspecifiedMatchingclo2.1.178CC-BY-3.00.8
21clo:00013453T3 celloboInOwl:hasDbXrefmesh:D0164753T3 Cellssemapv:UnspecifiedMatchingclo2.1.178CC-BY-3.00.8
22clo:00372613T3-derived celloboInOwl:hasDbXrefmesh:D0164753T3 Cellssemapv:UnspecifiedMatchingclo2.1.178CC-BY-3.00.8
23clo:0002596COS-1 celloboInOwl:hasDbXrefmesh:D019556COS Cellssemapv:UnspecifiedMatchingclo2.1.178CC-BY-3.00.8
24clo:0002597COS-7 celloboInOwl:hasDbXrefmesh:D019556COS Cellssemapv:UnspecifiedMatchingclo2.1.178CC-BY-3.00.8
\n", + "" + ], + "text/plain": [ + " subject_id subject_label predicate_id object_id \\\n", + "0 clo:0001230 HEK293 oboInOwl:hasDbXref cellosaurus:0045 \n", + "1 clo:0037237 293-derived cell oboInOwl:hasDbXref cellosaurus:0045 \n", + "2 clo:0007050 K 562 cell oboInOwl:hasDbXref cellosaurus:0004 \n", + "3 clo:0007059 K-562 cell oboInOwl:hasDbXref cellosaurus:0004 \n", + "4 clo:0037163 Ishikawa cell oboInOwl:hasDbXref cellosaurus:D199 \n", + "5 clo:0037230 Ishikawa 3-H-12 cell oboInOwl:hasDbXref cellosaurus:D199 \n", + "6 clo:0037300 BALL-1 cell oboInOwl:hasDbXref cellosaurus:1075 \n", + "7 clo:0051004 RCB0256 cell oboInOwl:hasDbXref cellosaurus:1075 \n", + "8 clo:0051005 RCB1882 cell oboInOwl:hasDbXref cellosaurus:1075 \n", + "9 clo:0037372 HEK293T cell oboInOwl:hasDbXref cellosaurus:0063 \n", + "10 clo:0050894 RCB2202 cell oboInOwl:hasDbXref cellosaurus:0063 \n", + "11 clo:0050405 RCB2280 cell oboInOwl:hasDbXref cellosaurus:1272 \n", + "12 clo:0050406 RCB1384 cell oboInOwl:hasDbXref cellosaurus:1272 \n", + "13 clo:0002585 COR-L23 cell oboInOwl:hasDbXref efo:0002142 \n", + "14 clo:0037287 COR123 cell oboInOwl:hasDbXref efo:0002142 \n", + "15 clo:0007634 MDA-MB-231 cell oboInOwl:hasDbXref efo:0001209 \n", + "16 clo:0037291 MDAMB231 cell oboInOwl:hasDbXref efo:0001209 \n", + "17 clo:0009034 SK-BR-3 cell oboInOwl:hasDbXref efo:0001236 \n", + "18 clo:0037295 SKBR3 cell oboInOwl:hasDbXref efo:0001236 \n", + "19 clo:0009040 SK-MEL-1 cell oboInOwl:hasDbXref efo:0002332 \n", + "20 clo:0037292 SKMEL1 oboInOwl:hasDbXref efo:0002332 \n", + "21 clo:0001345 3T3 cell oboInOwl:hasDbXref mesh:D016475 \n", + "22 clo:0037261 3T3-derived cell oboInOwl:hasDbXref mesh:D016475 \n", + "23 clo:0002596 COS-1 cell oboInOwl:hasDbXref mesh:D019556 \n", + "24 clo:0002597 COS-7 cell oboInOwl:hasDbXref mesh:D019556 \n", + "\n", + " object_label mapping_justification mapping_set \\\n", + "0 HEK293 semapv:UnspecifiedMatching clo \n", + "1 HEK293 semapv:UnspecifiedMatching clo \n", + "2 K-562 
semapv:UnspecifiedMatching clo \n", + "3 K-562 semapv:UnspecifiedMatching clo \n", + "4 Ishikawa 3-H-12 semapv:UnspecifiedMatching clo \n", + "5 Ishikawa 3-H-12 semapv:UnspecifiedMatching clo \n", + "6 BALL-1 semapv:UnspecifiedMatching clo \n", + "7 BALL-1 semapv:UnspecifiedMatching clo \n", + "8 BALL-1 semapv:UnspecifiedMatching clo \n", + "9 HEK293T semapv:UnspecifiedMatching clo \n", + "10 HEK293T semapv:UnspecifiedMatching clo \n", + "11 HCE-T semapv:UnspecifiedMatching clo \n", + "12 HCE-T semapv:UnspecifiedMatching clo \n", + "13 CORL23 semapv:UnspecifiedMatching clo \n", + "14 CORL23 semapv:UnspecifiedMatching clo \n", + "15 MDAMB231 semapv:UnspecifiedMatching clo \n", + "16 MDAMB231 semapv:UnspecifiedMatching clo \n", + "17 SKBR3 semapv:UnspecifiedMatching clo \n", + "18 SKBR3 semapv:UnspecifiedMatching clo \n", + "19 SKMEL1 semapv:UnspecifiedMatching clo \n", + "20 SKMEL1 semapv:UnspecifiedMatching clo \n", + "21 3T3 Cells semapv:UnspecifiedMatching clo \n", + "22 3T3 Cells semapv:UnspecifiedMatching clo \n", + "23 COS Cells semapv:UnspecifiedMatching clo \n", + "24 COS Cells semapv:UnspecifiedMatching clo \n", + "\n", + " mapping_set_version mapping_set_license mapping_set_confidence \n", + "0 2.1.178 CC-BY-3.0 0.8 \n", + "1 2.1.178 CC-BY-3.0 0.8 \n", + "2 2.1.178 CC-BY-3.0 0.8 \n", + "3 2.1.178 CC-BY-3.0 0.8 \n", + "4 2.1.178 CC-BY-3.0 0.8 \n", + "5 2.1.178 CC-BY-3.0 0.8 \n", + "6 2.1.178 CC-BY-3.0 0.8 \n", + "7 2.1.178 CC-BY-3.0 0.8 \n", + "8 2.1.178 CC-BY-3.0 0.8 \n", + "9 2.1.178 CC-BY-3.0 0.8 \n", + "10 2.1.178 CC-BY-3.0 0.8 \n", + "11 2.1.178 CC-BY-3.0 0.8 \n", + "12 2.1.178 CC-BY-3.0 0.8 \n", + "13 2.1.178 CC-BY-3.0 0.8 \n", + "14 2.1.178 CC-BY-3.0 0.8 \n", + "15 2.1.178 CC-BY-3.0 0.8 \n", + "16 2.1.178 CC-BY-3.0 0.8 \n", + "17 2.1.178 CC-BY-3.0 0.8 \n", + "18 2.1.178 CC-BY-3.0 0.8 \n", + "19 2.1.178 CC-BY-3.0 0.8 \n", + "20 2.1.178 CC-BY-3.0 0.8 \n", + "21 2.1.178 CC-BY-3.0 0.8 \n", + "22 2.1.178 CC-BY-3.0 0.8 \n", + "23 2.1.178 CC-BY-3.0 0.8 \n", 
+ "24 2.1.178 CC-BY-3.0 0.8 " + ] + }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "pd.DataFrame(\n", - " [\n", - " (p, bioregistry.get_name(p), bioregistry.get_description(p))\n", - " for p in sorted({prefix for prefix, _ in counter})\n", - " ],\n", - " columns=[\"prefix\", \"name\", \"description\"],\n", - ").set_index(\"prefix\")" + "m2m_mappings = get_many_to_many(mappings)\n", + "get_sssom_df(m2m_mappings, add_labels=True)" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "id": "cd09647e", "metadata": {}, "outputs": [ @@ -742,12 +1031,12 @@ "name": "stdout", "output_type": "stream", "text": [ - "can not look up name for efo 0002082\n", - "can not look up name for efo 0002823\n", - "can not look up name for efo 0002080\n", - "can not look up name for efo 0002336\n", - "can not look up name for efo 0001256\n", - "can not look up name for efo 0002387\n" + "can not look up name for efo:0002082\n", + "can not look up name for efo:0002823\n", + "can not look up name for efo:0002080\n", + "can not look up name for efo:0002336\n", + "can not look up name for efo:0001256\n", + "can not look up name for efo:0002387\n" ] }, { @@ -787,12 +1076,12 @@ " \n", " 0\n", " clo\n", - " 0001230\n", - " HEK293\n", + " 0001008\n", + " 697 cell\n", " skos:exactMatch\n", - " bto\n", - " 0000007\n", - " HEK-293 cell\n", + " cellosaurus\n", + " 0079\n", + " 697\n", " semapv:UnspecifiedMatching\n", " 0.8\n", " clo\n", @@ -800,12 +1089,12 @@ " \n", " 1\n", " clo\n", - " 0037238\n", - " HEK293-T-REx cell\n", + " 0001088\n", + " 143B cell\n", " skos:exactMatch\n", - " bto\n", - " 0005238\n", - " T-REx 293 cell\n", + " cellosaurus\n", + " 2270\n", + " 143B\n", " semapv:UnspecifiedMatching\n", " 0.8\n", " clo\n", @@ -813,12 +1102,12 @@ " \n", " 2\n", " clo\n", - " 0037242\n", - " 2B4 cell\n", + " 0001230\n", + " HEK293\n", " skos:exactMatch\n", " bto\n", - " 0005682\n", - " 2B4-T cell\n", + " 0000007\n", + " HEK-293 
cell\n", " semapv:UnspecifiedMatching\n", " 0.8\n", " clo\n", @@ -826,12 +1115,12 @@ " \n", " 3\n", " clo\n", - " 0037256\n", - " HBL-1 cell\n", + " 0001230\n", + " HEK293\n", " skos:exactMatch\n", - " bto\n", - " 0002522\n", - " HBL-1 cell\n", + " efo\n", + " 0001182\n", + " HEK293\n", " semapv:UnspecifiedMatching\n", " 0.8\n", " clo\n", @@ -839,12 +1128,12 @@ " \n", " 4\n", " clo\n", - " 0037269\n", - " S16 cell\n", + " 0001234\n", + " 293/CHE-Fc cell\n", " skos:exactMatch\n", - " bto\n", - " 0002937\n", - " S-16 cell\n", + " cellosaurus\n", + " 6352\n", + " 293/CHE-Fc\n", " semapv:UnspecifiedMatching\n", " 0.8\n", " clo\n", @@ -865,12 +1154,12 @@ " \n", " 776\n", " clo\n", - " 0008392\n", - " PC-12 cell\n", + " 0051547\n", + " RCB2084 cell\n", " skos:exactMatch\n", - " mesh\n", - " D016716\n", - " PC12 Cells\n", + " cellosaurus\n", + " 1736\n", + " TALL-1 [Human adult T-ALL]\n", " semapv:UnspecifiedMatching\n", " 0.8\n", " clo\n", @@ -878,12 +1167,12 @@ " \n", " 777\n", " clo\n", - " 0009227\n", - " Swiss-3T3 cell\n", + " 0051567\n", + " RCB1902 cell\n", " skos:exactMatch\n", - " mesh\n", - " D041701\n", - " Swiss 3T3 Cells\n", + " cellosaurus\n", + " 1289\n", + " HSC-4\n", " semapv:UnspecifiedMatching\n", " 0.8\n", " clo\n", @@ -891,12 +1180,12 @@ " \n", " 778\n", " clo\n", - " 0009465\n", - " U-937 cell\n", + " 0051568\n", + " RCB1974 cell\n", " skos:exactMatch\n", - " mesh\n", - " D020298\n", - " U937 Cells\n", + " cellosaurus\n", + " 1675\n", + " SAS\n", " semapv:UnspecifiedMatching\n", " 0.8\n", " clo\n", @@ -904,12 +1193,12 @@ " \n", " 779\n", " clo\n", - " 0009524\n", - " Vero cell\n", + " 0051569\n", + " RCB1975 cell\n", " skos:exactMatch\n", - " mesh\n", - " D014709\n", - " Vero Cells\n", + " cellosaurus\n", + " 1288\n", + " HSC-3\n", " semapv:UnspecifiedMatching\n", " 0.8\n", " clo\n", @@ -917,12 +1206,12 @@ " \n", " 780\n", " clo\n", - " 0037201\n", - " RMA cell\n", + " 0051609\n", + " RCB0881 cell\n", " skos:exactMatch\n", - " obi\n", - " 1110038\n", 
- " RMA cell line\n", + " cellosaurus\n", + " 1621\n", + " OCUB-M\n", " semapv:UnspecifiedMatching\n", " 0.8\n", " clo\n", @@ -933,31 +1222,31 @@ "" ], "text/plain": [ - " source prefix source identifier source name relation \\\n", - "0 clo 0001230 HEK293 skos:exactMatch \n", - "1 clo 0037238 HEK293-T-REx cell skos:exactMatch \n", - "2 clo 0037242 2B4 cell skos:exactMatch \n", - "3 clo 0037256 HBL-1 cell skos:exactMatch \n", - "4 clo 0037269 S16 cell skos:exactMatch \n", - ".. ... ... ... ... \n", - "776 clo 0008392 PC-12 cell skos:exactMatch \n", - "777 clo 0009227 Swiss-3T3 cell skos:exactMatch \n", - "778 clo 0009465 U-937 cell skos:exactMatch \n", - "779 clo 0009524 Vero cell skos:exactMatch \n", - "780 clo 0037201 RMA cell skos:exactMatch \n", + " source prefix source identifier source name relation \\\n", + "0 clo 0001008 697 cell skos:exactMatch \n", + "1 clo 0001088 143B cell skos:exactMatch \n", + "2 clo 0001230 HEK293 skos:exactMatch \n", + "3 clo 0001230 HEK293 skos:exactMatch \n", + "4 clo 0001234 293/CHE-Fc cell skos:exactMatch \n", + ".. ... ... ... ... \n", + "776 clo 0051547 RCB2084 cell skos:exactMatch \n", + "777 clo 0051567 RCB1902 cell skos:exactMatch \n", + "778 clo 0051568 RCB1974 cell skos:exactMatch \n", + "779 clo 0051569 RCB1975 cell skos:exactMatch \n", + "780 clo 0051609 RCB0881 cell skos:exactMatch \n", "\n", - " target prefix target identifier target name \\\n", - "0 bto 0000007 HEK-293 cell \n", - "1 bto 0005238 T-REx 293 cell \n", - "2 bto 0005682 2B4-T cell \n", - "3 bto 0002522 HBL-1 cell \n", - "4 bto 0002937 S-16 cell \n", - ".. ... ... ... 
\n", - "776 mesh D016716 PC12 Cells \n", - "777 mesh D041701 Swiss 3T3 Cells \n", - "778 mesh D020298 U937 Cells \n", - "779 mesh D014709 Vero Cells \n", - "780 obi 1110038 RMA cell line \n", + " target prefix target identifier target name \\\n", + "0 cellosaurus 0079 697 \n", + "1 cellosaurus 2270 143B \n", + "2 bto 0000007 HEK-293 cell \n", + "3 efo 0001182 HEK293 \n", + "4 cellosaurus 6352 293/CHE-Fc \n", + ".. ... ... ... \n", + "776 cellosaurus 1736 TALL-1 [Human adult T-ALL] \n", + "777 cellosaurus 1289 HSC-4 \n", + "778 cellosaurus 1675 SAS \n", + "779 cellosaurus 1288 HSC-3 \n", + "780 cellosaurus 1621 OCUB-M \n", "\n", " type confidence source \n", "0 semapv:UnspecifiedMatching 0.8 clo \n", @@ -975,91 +1264,47 @@ "[781 rows x 10 columns]" ] }, - "execution_count": 6, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "DESIRED_PREFIXES = {\"bto\", \"efo\", \"mesh\", \"cellosaurus\", \"obi\"}\n", - "\n", - "\n", "rows = []\n", - "for prefix in sorted(DESIRED_PREFIXES):\n", - " pyobo.get_id_name_mapping(prefix, strict=prefix != \"cellosaurus\")\n", - "\n", - " # later we can relax these constraints\n", - " external_skip = {\n", - " external_id for external_id, clo_ids in xx2[prefix].items() if len(clo_ids) > 1\n", - " }\n", - "\n", - " for clo_id, external_ids in xx1[prefix].items():\n", - " if len(external_ids) > 1:\n", - " continue\n", - " external_id = external_ids[0]\n", - " if external_id in external_skip:\n", - " continue\n", - "\n", - " clo_name = names[clo_id]\n", - " external_name = pyobo.get_name(prefix, external_id)\n", - " if not clo_name or not external_name:\n", - " print(\"can not look up name for\", prefix, external_id)\n", - " continue\n", - " rows.append(\n", - " PredictionTuple(\n", - " \"clo\",\n", - " clo_id,\n", - " clo_name,\n", - " \"skos:exactMatch\",\n", - " prefix,\n", - " external_id,\n", - " external_name,\n", - " \"semapv:UnspecifiedMatching\",\n", - " 0.8,\n", - " \"clo\",\n", - " )\n", + 
"for mapping in filter_mappings(mappings, m2m_mappings, progress=False):\n", + " s_name = pyobo.get_name(*mapping.s.pair)\n", + " if not s_name:\n", + " tqdm.write(f\"can not look up name for {mapping.s.curie}\")\n", + " continue\n", + " o_name = pyobo.get_name(*mapping.o.pair)\n", + " if not o_name:\n", + " tqdm.write(f\"can not look up name for {mapping.o.curie}\")\n", + " continue\n", + " \n", + " rows.append(\n", + " PredictionTuple(\n", + " *mapping.s.pair,\n", + " s_name,\n", + " \"skos:exactMatch\",\n", + " *mapping.o.pair,\n", + " o_name,\n", + " \"semapv:UnspecifiedMatching\",\n", + " 0.8,\n", + " \"clo\",\n", " )\n", + " )\n", "\n", "pd.DataFrame(rows, columns=PREDICTIONS_HEADER)" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "id": "d77b5def", "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "9344a284eab54762bf2117d88a9426f9", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Removing curated from predicted: 0%| | 0.00/41.8k [00:00