From d728c92d788a1dcd3eb6fb594afb032b2147dcbf Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Thu, 29 Jun 2023 23:09:50 +0200 Subject: [PATCH] Update notebook --- notebooks/Process CLO Mappings.ipynb | 1395 +++++++++++++++++++++ src/biomappings/resources/__init__.py | 45 +- src/biomappings/resources/predictions.tsv | 25 + src/biomappings/templates/home.html | 4 +- 4 files changed, 1450 insertions(+), 19 deletions(-) create mode 100644 notebooks/Process CLO Mappings.ipynb diff --git a/notebooks/Process CLO Mappings.ipynb b/notebooks/Process CLO Mappings.ipynb new file mode 100644 index 00000000..80596041 --- /dev/null +++ b/notebooks/Process CLO Mappings.ipynb @@ -0,0 +1,1395 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "506955b2", + "metadata": {}, + "source": [ + "# Processing CLO Mappings\n", + "\n", + "\n", + "The [Cell Line Ontology (CLO)](https://bioregistry.io/registry/clo) is a detailed resource; however, it does not follow the standard OBO modeling pattern for cross-references, which uses either `oboInOwl:hasDbXref` or a SKOS predicate pointing to a single CURIE encoded as a string. 
Instead, it uses `rdfs:seeAlso` with a combination of non-standard CURIEs that are either comma or semi-colon delimited.\n", + "\n", + "This notebook attempts to unpack and operationalize these cross-references.\n", + "\n", + "See also:\n", + "\n", + "- https://github.com/CLO-ontology/CLO/issues/103\n", + "- https://gist.github.com/cthoyt/a91ae12a94c7e1647e9d9d8fa61e80ce" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "676743ba", + "metadata": {}, + "outputs": [], + "source": [ + "from collections import Counter, defaultdict\n", + "\n", + "import bioontologies\n", + "import bioregistry\n", + "import click\n", + "import pandas as pd\n", + "import pyobo\n", + "from tqdm.auto import tqdm\n", + "\n", + "from semra.sources.clo import get_clo_mappings\n", + "from semra.api import summarize_prefixes, get_many_to_many, keep_prefixes, filter_mappings\n", + "from semra.io import get_sssom_df\n", + "\n", + "from biomappings import PredictionTuple\n", + "from biomappings.resources import PREDICTIONS_HEADER, append_prediction_tuples" + ] + }, + { + "cell_type": "markdown", + "id": "84102e56", + "metadata": {}, + "source": [ + "## Extraction and Processing\n", + "\n", + "The following cell uses [this script](https://github.com/biopragmatics/semra/blob/main/src/semra/sources/clo.py) in [SeMRA](https://github.com/biopragmatics/semra) to extract cross-references from CLO." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "2e7615a0", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "a77ef9ccbc444b719b1bb69f23ba30d5", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0.00/44.4k [00:00\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
namedescription
prefix
atccAmerican Type Culture CollectionThe American Type Culture Collection (ATCC) is...
baoBioAssay OntologyThe BioAssay Ontology (BAO) describes chemical...
biosampleBioSampleThe BioSample Database stores information abou...
btoBRENDA Tissue OntologyThe Brenda tissue ontology is a structured con...
cellosaurusCellosaurusThe Cellosaurus is a knowledge resource on cel...
chembl.cellChEMBL database of bioactive drug-like small m...Chemistry resources
cldbCell Line DatabaseThe Cell Line Data Base (CLDB) is a reference ...
cloCell Line OntologyThe Cell Line Ontology is a community-based on...
cosmic.cellCOSMIC Cell LinesCOSMIC, the Catalogue Of Somatic Mutations In ...
dsmzDeutsche Sammlung von Mikroorganismen und Zell...The Leibniz Institute DSMZ is the most diverse...
ebiscEuropean Bank for induced pluripotent Stem CellsCell line collections
ecaccEuropean Collection of Authenticated Cell CultureThe European Collection of Authenticated Cell ...
efoExperimental Factor OntologyThe Experimental Factor Ontology (EFO) provide...
hms.lincs.cellHMS LINCS CellDatabase contains all publicly available HMS L...
iclcInterlab Cell Line CollectionCell line collections
jcrbJRBC Cell BankCell line collections
meshMedical Subject HeadingsMeSH (Medical Subject Headings) is the Nationa...
obiOntology for Biomedical InvestigationsThe Ontology for Biomedical Investigations (OB...
pubchem.bioassayNCBI PubChem database of bioassay recordsPubChem provides information on the biological...
reoReagent OntologyThe Reagent Ontology (ReO) adheres to OBO Foun...
thermofisherThermo Fisher ScientificThermoFisher is a life sciences supply vendor.
\n", + "" + ], + "text/plain": [ + " name \\\n", + "prefix \n", + "atcc American Type Culture Collection \n", + "bao BioAssay Ontology \n", + "biosample BioSample \n", + "bto BRENDA Tissue Ontology \n", + "cellosaurus Cellosaurus \n", + "chembl.cell ChEMBL database of bioactive drug-like small m... \n", + "cldb Cell Line Database \n", + "clo Cell Line Ontology \n", + "cosmic.cell COSMIC Cell Lines \n", + "dsmz Deutsche Sammlung von Mikroorganismen und Zell... \n", + "ebisc European Bank for induced pluripotent Stem Cells \n", + "ecacc European Collection of Authenticated Cell Culture \n", + "efo Experimental Factor Ontology \n", + "hms.lincs.cell HMS LINCS Cell \n", + "iclc Interlab Cell Line Collection \n", + "jcrb JRBC Cell Bank \n", + "mesh Medical Subject Headings \n", + "obi Ontology for Biomedical Investigations \n", + "pubchem.bioassay NCBI PubChem database of bioassay records \n", + "reo Reagent Ontology \n", + "thermofisher Thermo Fisher Scientific \n", + "\n", + " description \n", + "prefix \n", + "atcc The American Type Culture Collection (ATCC) is... \n", + "bao The BioAssay Ontology (BAO) describes chemical... \n", + "biosample The BioSample Database stores information abou... \n", + "bto The Brenda tissue ontology is a structured con... \n", + "cellosaurus The Cellosaurus is a knowledge resource on cel... \n", + "chembl.cell Chemistry resources \n", + "cldb The Cell Line Data Base (CLDB) is a reference ... \n", + "clo The Cell Line Ontology is a community-based on... \n", + "cosmic.cell COSMIC, the Catalogue Of Somatic Mutations In ... \n", + "dsmz The Leibniz Institute DSMZ is the most diverse... \n", + "ebisc Cell line collections \n", + "ecacc The European Collection of Authenticated Cell ... \n", + "efo The Experimental Factor Ontology (EFO) provide... \n", + "hms.lincs.cell Database contains all publicly available HMS L... 
\n", + "iclc Cell line collections \n", + "jcrb Cell line collections \n", + "mesh MeSH (Medical Subject Headings) is the Nationa... \n", + "obi The Ontology for Biomedical Investigations (OB... \n", + "pubchem.bioassay PubChem provides information on the biological... \n", + "reo The Reagent Ontology (ReO) adheres to OBO Foun... \n", + "thermofisher ThermoFisher is a life sciences supply vendor. " + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "summarize_prefixes(mappings)" + ] + }, + { + "cell_type": "markdown", + "id": "611e7970", + "metadata": {}, + "source": [ + "Many of the resources cross-referenced by CLO aren't accessible in a structured format. Therefore, we can't programmatically look up names or synonyms. In some (but not all) cases, the resource has a site that can be used to manually examine information about a given record, but this ultimately leaves review very difficult.\n", + "\n", + "There might be an automated way to get the list of all resources that can be used with `pyobo.get_name`, but until that's figured out, the following is a shortlist of resources we can follow up on easily." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "eece0802", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "812" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "DESIRED_PREFIXES = {\"bto\", \"efo\", \"mesh\", \"cellosaurus\", \"obi\", \"clo\"}\n", + "\n", + "mappings = keep_prefixes(mappings, prefixes=DESIRED_PREFIXES, progress=False)\n", + "len(mappings)" + ] + }, + { + "cell_type": "markdown", + "id": "dbb07f85", + "metadata": {}, + "source": [ + "## Identify Inconsistencies\n", + "\n", + "The following cell identifies many-to-many mappings, e.g., when a given CLO has multiple cross-references to entities in another semantic space, or vice versa." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "ddd69547", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Preparing SSSOM: 0%| | 0.00/25.0 [00:00\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + 
" \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
subject_idsubject_labelpredicate_idobject_idobject_labelmapping_justificationmapping_setmapping_set_versionmapping_set_licensemapping_set_confidence
0clo:0001230HEK293oboInOwl:hasDbXrefcellosaurus:0045HEK293semapv:UnspecifiedMatchingclo2.1.178CC-BY-3.00.8
1clo:0037237293-derived celloboInOwl:hasDbXrefcellosaurus:0045HEK293semapv:UnspecifiedMatchingclo2.1.178CC-BY-3.00.8
2clo:0007050K 562 celloboInOwl:hasDbXrefcellosaurus:0004K-562semapv:UnspecifiedMatchingclo2.1.178CC-BY-3.00.8
3clo:0007059K-562 celloboInOwl:hasDbXrefcellosaurus:0004K-562semapv:UnspecifiedMatchingclo2.1.178CC-BY-3.00.8
4clo:0037163Ishikawa celloboInOwl:hasDbXrefcellosaurus:D199Ishikawa 3-H-12semapv:UnspecifiedMatchingclo2.1.178CC-BY-3.00.8
5clo:0037230Ishikawa 3-H-12 celloboInOwl:hasDbXrefcellosaurus:D199Ishikawa 3-H-12semapv:UnspecifiedMatchingclo2.1.178CC-BY-3.00.8
6clo:0037300BALL-1 celloboInOwl:hasDbXrefcellosaurus:1075BALL-1semapv:UnspecifiedMatchingclo2.1.178CC-BY-3.00.8
7clo:0051004RCB0256 celloboInOwl:hasDbXrefcellosaurus:1075BALL-1semapv:UnspecifiedMatchingclo2.1.178CC-BY-3.00.8
8clo:0051005RCB1882 celloboInOwl:hasDbXrefcellosaurus:1075BALL-1semapv:UnspecifiedMatchingclo2.1.178CC-BY-3.00.8
9clo:0037372HEK293T celloboInOwl:hasDbXrefcellosaurus:0063HEK293Tsemapv:UnspecifiedMatchingclo2.1.178CC-BY-3.00.8
10clo:0050894RCB2202 celloboInOwl:hasDbXrefcellosaurus:0063HEK293Tsemapv:UnspecifiedMatchingclo2.1.178CC-BY-3.00.8
11clo:0050405RCB2280 celloboInOwl:hasDbXrefcellosaurus:1272HCE-Tsemapv:UnspecifiedMatchingclo2.1.178CC-BY-3.00.8
12clo:0050406RCB1384 celloboInOwl:hasDbXrefcellosaurus:1272HCE-Tsemapv:UnspecifiedMatchingclo2.1.178CC-BY-3.00.8
13clo:0002585COR-L23 celloboInOwl:hasDbXrefefo:0002142CORL23semapv:UnspecifiedMatchingclo2.1.178CC-BY-3.00.8
14clo:0037287COR123 celloboInOwl:hasDbXrefefo:0002142CORL23semapv:UnspecifiedMatchingclo2.1.178CC-BY-3.00.8
15clo:0007634MDA-MB-231 celloboInOwl:hasDbXrefefo:0001209MDAMB231semapv:UnspecifiedMatchingclo2.1.178CC-BY-3.00.8
16clo:0037291MDAMB231 celloboInOwl:hasDbXrefefo:0001209MDAMB231semapv:UnspecifiedMatchingclo2.1.178CC-BY-3.00.8
17clo:0009034SK-BR-3 celloboInOwl:hasDbXrefefo:0001236SKBR3semapv:UnspecifiedMatchingclo2.1.178CC-BY-3.00.8
18clo:0037295SKBR3 celloboInOwl:hasDbXrefefo:0001236SKBR3semapv:UnspecifiedMatchingclo2.1.178CC-BY-3.00.8
19clo:0009040SK-MEL-1 celloboInOwl:hasDbXrefefo:0002332SKMEL1semapv:UnspecifiedMatchingclo2.1.178CC-BY-3.00.8
20clo:0037292SKMEL1oboInOwl:hasDbXrefefo:0002332SKMEL1semapv:UnspecifiedMatchingclo2.1.178CC-BY-3.00.8
21clo:00013453T3 celloboInOwl:hasDbXrefmesh:D0164753T3 Cellssemapv:UnspecifiedMatchingclo2.1.178CC-BY-3.00.8
22clo:00372613T3-derived celloboInOwl:hasDbXrefmesh:D0164753T3 Cellssemapv:UnspecifiedMatchingclo2.1.178CC-BY-3.00.8
23clo:0002596COS-1 celloboInOwl:hasDbXrefmesh:D019556COS Cellssemapv:UnspecifiedMatchingclo2.1.178CC-BY-3.00.8
24clo:0002597COS-7 celloboInOwl:hasDbXrefmesh:D019556COS Cellssemapv:UnspecifiedMatchingclo2.1.178CC-BY-3.00.8
\n", + "" + ], + "text/plain": [ + " subject_id subject_label predicate_id object_id \\\n", + "0 clo:0001230 HEK293 oboInOwl:hasDbXref cellosaurus:0045 \n", + "1 clo:0037237 293-derived cell oboInOwl:hasDbXref cellosaurus:0045 \n", + "2 clo:0007050 K 562 cell oboInOwl:hasDbXref cellosaurus:0004 \n", + "3 clo:0007059 K-562 cell oboInOwl:hasDbXref cellosaurus:0004 \n", + "4 clo:0037163 Ishikawa cell oboInOwl:hasDbXref cellosaurus:D199 \n", + "5 clo:0037230 Ishikawa 3-H-12 cell oboInOwl:hasDbXref cellosaurus:D199 \n", + "6 clo:0037300 BALL-1 cell oboInOwl:hasDbXref cellosaurus:1075 \n", + "7 clo:0051004 RCB0256 cell oboInOwl:hasDbXref cellosaurus:1075 \n", + "8 clo:0051005 RCB1882 cell oboInOwl:hasDbXref cellosaurus:1075 \n", + "9 clo:0037372 HEK293T cell oboInOwl:hasDbXref cellosaurus:0063 \n", + "10 clo:0050894 RCB2202 cell oboInOwl:hasDbXref cellosaurus:0063 \n", + "11 clo:0050405 RCB2280 cell oboInOwl:hasDbXref cellosaurus:1272 \n", + "12 clo:0050406 RCB1384 cell oboInOwl:hasDbXref cellosaurus:1272 \n", + "13 clo:0002585 COR-L23 cell oboInOwl:hasDbXref efo:0002142 \n", + "14 clo:0037287 COR123 cell oboInOwl:hasDbXref efo:0002142 \n", + "15 clo:0007634 MDA-MB-231 cell oboInOwl:hasDbXref efo:0001209 \n", + "16 clo:0037291 MDAMB231 cell oboInOwl:hasDbXref efo:0001209 \n", + "17 clo:0009034 SK-BR-3 cell oboInOwl:hasDbXref efo:0001236 \n", + "18 clo:0037295 SKBR3 cell oboInOwl:hasDbXref efo:0001236 \n", + "19 clo:0009040 SK-MEL-1 cell oboInOwl:hasDbXref efo:0002332 \n", + "20 clo:0037292 SKMEL1 oboInOwl:hasDbXref efo:0002332 \n", + "21 clo:0001345 3T3 cell oboInOwl:hasDbXref mesh:D016475 \n", + "22 clo:0037261 3T3-derived cell oboInOwl:hasDbXref mesh:D016475 \n", + "23 clo:0002596 COS-1 cell oboInOwl:hasDbXref mesh:D019556 \n", + "24 clo:0002597 COS-7 cell oboInOwl:hasDbXref mesh:D019556 \n", + "\n", + " object_label mapping_justification mapping_set \\\n", + "0 HEK293 semapv:UnspecifiedMatching clo \n", + "1 HEK293 semapv:UnspecifiedMatching clo \n", + "2 K-562 
semapv:UnspecifiedMatching clo \n", + "3 K-562 semapv:UnspecifiedMatching clo \n", + "4 Ishikawa 3-H-12 semapv:UnspecifiedMatching clo \n", + "5 Ishikawa 3-H-12 semapv:UnspecifiedMatching clo \n", + "6 BALL-1 semapv:UnspecifiedMatching clo \n", + "7 BALL-1 semapv:UnspecifiedMatching clo \n", + "8 BALL-1 semapv:UnspecifiedMatching clo \n", + "9 HEK293T semapv:UnspecifiedMatching clo \n", + "10 HEK293T semapv:UnspecifiedMatching clo \n", + "11 HCE-T semapv:UnspecifiedMatching clo \n", + "12 HCE-T semapv:UnspecifiedMatching clo \n", + "13 CORL23 semapv:UnspecifiedMatching clo \n", + "14 CORL23 semapv:UnspecifiedMatching clo \n", + "15 MDAMB231 semapv:UnspecifiedMatching clo \n", + "16 MDAMB231 semapv:UnspecifiedMatching clo \n", + "17 SKBR3 semapv:UnspecifiedMatching clo \n", + "18 SKBR3 semapv:UnspecifiedMatching clo \n", + "19 SKMEL1 semapv:UnspecifiedMatching clo \n", + "20 SKMEL1 semapv:UnspecifiedMatching clo \n", + "21 3T3 Cells semapv:UnspecifiedMatching clo \n", + "22 3T3 Cells semapv:UnspecifiedMatching clo \n", + "23 COS Cells semapv:UnspecifiedMatching clo \n", + "24 COS Cells semapv:UnspecifiedMatching clo \n", + "\n", + " mapping_set_version mapping_set_license mapping_set_confidence \n", + "0 2.1.178 CC-BY-3.0 0.8 \n", + "1 2.1.178 CC-BY-3.0 0.8 \n", + "2 2.1.178 CC-BY-3.0 0.8 \n", + "3 2.1.178 CC-BY-3.0 0.8 \n", + "4 2.1.178 CC-BY-3.0 0.8 \n", + "5 2.1.178 CC-BY-3.0 0.8 \n", + "6 2.1.178 CC-BY-3.0 0.8 \n", + "7 2.1.178 CC-BY-3.0 0.8 \n", + "8 2.1.178 CC-BY-3.0 0.8 \n", + "9 2.1.178 CC-BY-3.0 0.8 \n", + "10 2.1.178 CC-BY-3.0 0.8 \n", + "11 2.1.178 CC-BY-3.0 0.8 \n", + "12 2.1.178 CC-BY-3.0 0.8 \n", + "13 2.1.178 CC-BY-3.0 0.8 \n", + "14 2.1.178 CC-BY-3.0 0.8 \n", + "15 2.1.178 CC-BY-3.0 0.8 \n", + "16 2.1.178 CC-BY-3.0 0.8 \n", + "17 2.1.178 CC-BY-3.0 0.8 \n", + "18 2.1.178 CC-BY-3.0 0.8 \n", + "19 2.1.178 CC-BY-3.0 0.8 \n", + "20 2.1.178 CC-BY-3.0 0.8 \n", + "21 2.1.178 CC-BY-3.0 0.8 \n", + "22 2.1.178 CC-BY-3.0 0.8 \n", + "23 2.1.178 CC-BY-3.0 0.8 \n", 
+ "24 2.1.178 CC-BY-3.0 0.8 " + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "m2m_mappings = get_many_to_many(mappings)\n", + "get_sssom_df(m2m_mappings, add_labels=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "d3d7d3b8", + "metadata": {}, + "outputs": [], + "source": [ + "def make_prediction_tuples(xxx) -> list[PredictionTuple]:\n", + " rows = []\n", + " for mapping in xxx:\n", + " s_name = pyobo.get_name(*mapping.s.pair)\n", + " if not s_name:\n", + " tqdm.write(f\"can not look up name for {mapping.s.curie}\")\n", + " continue\n", + " o_name = pyobo.get_name(*mapping.o.pair)\n", + " if not o_name:\n", + " tqdm.write(f\"can not look up name for {mapping.o.curie}\")\n", + " continue\n", + " rows.append(\n", + " PredictionTuple(\n", + " *mapping.s.pair,\n", + " s_name,\n", + " \"skos:exactMatch\",\n", + " *mapping.o.pair,\n", + " o_name,\n", + " \"semapv:UnspecifiedMatching\",\n", + " 0.8,\n", + " \"clo\",\n", + " )\n", + " )\n", + " return rows" + ] + }, + { + "cell_type": "markdown", + "id": "f8d49d7d", + "metadata": {}, + "source": [ + "Add short set of xrefs that aren't exact for triage" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "767e8575", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "3aca1d2a6c1d4d29badeb1270d1fdf3f", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Standardizing mappings: 0.00mapping [00:00, ?mapping/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "1702cda6cb004554a3c971e2a247b7b9", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Removing curated from predicted: 0%| | 0.00/40.5k [00:00\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
source prefixsource identifiersource namerelationtarget prefixtarget identifiertarget nametypeconfidencesource
0clo0001008697 cellskos:exactMatchcellosaurus0079697semapv:UnspecifiedMatching0.8clo
1clo0001088143B cellskos:exactMatchcellosaurus2270143Bsemapv:UnspecifiedMatching0.8clo
2clo0001230HEK293skos:exactMatchbto0000007HEK-293 cellsemapv:UnspecifiedMatching0.8clo
3clo0001230HEK293skos:exactMatchefo0001182HEK293semapv:UnspecifiedMatching0.8clo
4clo0001234293/CHE-Fc cellskos:exactMatchcellosaurus6352293/CHE-Fcsemapv:UnspecifiedMatching0.8clo
.................................
776clo0051547RCB2084 cellskos:exactMatchcellosaurus1736TALL-1 [Human adult T-ALL]semapv:UnspecifiedMatching0.8clo
777clo0051567RCB1902 cellskos:exactMatchcellosaurus1289HSC-4semapv:UnspecifiedMatching0.8clo
778clo0051568RCB1974 cellskos:exactMatchcellosaurus1675SASsemapv:UnspecifiedMatching0.8clo
779clo0051569RCB1975 cellskos:exactMatchcellosaurus1288HSC-3semapv:UnspecifiedMatching0.8clo
780clo0051609RCB0881 cellskos:exactMatchcellosaurus1621OCUB-Msemapv:UnspecifiedMatching0.8clo
\n", + "

781 rows × 10 columns

\n", + "" + ], + "text/plain": [ + " source prefix source identifier source name relation \\\n", + "0 clo 0001008 697 cell skos:exactMatch \n", + "1 clo 0001088 143B cell skos:exactMatch \n", + "2 clo 0001230 HEK293 skos:exactMatch \n", + "3 clo 0001230 HEK293 skos:exactMatch \n", + "4 clo 0001234 293/CHE-Fc cell skos:exactMatch \n", + ".. ... ... ... ... \n", + "776 clo 0051547 RCB2084 cell skos:exactMatch \n", + "777 clo 0051567 RCB1902 cell skos:exactMatch \n", + "778 clo 0051568 RCB1974 cell skos:exactMatch \n", + "779 clo 0051569 RCB1975 cell skos:exactMatch \n", + "780 clo 0051609 RCB0881 cell skos:exactMatch \n", + "\n", + " target prefix target identifier target name \\\n", + "0 cellosaurus 0079 697 \n", + "1 cellosaurus 2270 143B \n", + "2 bto 0000007 HEK-293 cell \n", + "3 efo 0001182 HEK293 \n", + "4 cellosaurus 6352 293/CHE-Fc \n", + ".. ... ... ... \n", + "776 cellosaurus 1736 TALL-1 [Human adult T-ALL] \n", + "777 cellosaurus 1289 HSC-4 \n", + "778 cellosaurus 1675 SAS \n", + "779 cellosaurus 1288 HSC-3 \n", + "780 cellosaurus 1621 OCUB-M \n", + "\n", + " type confidence source \n", + "0 semapv:UnspecifiedMatching 0.8 clo \n", + "1 semapv:UnspecifiedMatching 0.8 clo \n", + "2 semapv:UnspecifiedMatching 0.8 clo \n", + "3 semapv:UnspecifiedMatching 0.8 clo \n", + "4 semapv:UnspecifiedMatching 0.8 clo \n", + ".. ... ... ... 
\n", + "776 semapv:UnspecifiedMatching 0.8 clo \n", + "777 semapv:UnspecifiedMatching 0.8 clo \n", + "778 semapv:UnspecifiedMatching 0.8 clo \n", + "779 semapv:UnspecifiedMatching 0.8 clo \n", + "780 semapv:UnspecifiedMatching 0.8 clo \n", + "\n", + "[781 rows x 10 columns]" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "rows = make_prediction_tuples(filter_mappings(mappings, m2m_mappings, progress=False))\n", + "pd.DataFrame(rows, columns=PREDICTIONS_HEADER)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d77b5def", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/src/biomappings/resources/__init__.py b/src/biomappings/resources/__init__.py index 9f2475d5..60ff1c7d 100644 --- a/src/biomappings/resources/__init__.py +++ b/src/biomappings/resources/__init__.py @@ -22,7 +22,7 @@ ) import bioregistry -from tqdm import tqdm +from tqdm.auto import tqdm from biomappings.utils import OVERRIDE_MIRIAM, RESOURCE_PATH, get_canonical_tuple @@ -187,7 +187,7 @@ def load_mappings() -> List[Dict[str, str]]: def append_true_mappings(m: Iterable[Mapping[str, str]], sort: bool = True) -> None: """Append new lines to the mappings table.""" - _write_helper(MAPPINGS_HEADER, m, TRUE_MAPPINGS_PATH, "a") + _write_helper(MAPPINGS_HEADER, m, TRUE_MAPPINGS_PATH, mode="a") if sort: lint_true_mappings() @@ -199,7 +199,7 @@ def append_true_mapping_tuples(mappings: Iterable[MappingTuple]) -> None: def write_true_mappings(m: Iterable[Mapping[str, str]]) -> None: 
"""Write mappigns to the true mappings file.""" - _write_helper(MAPPINGS_HEADER, m, TRUE_MAPPINGS_PATH, "w") + _write_helper(MAPPINGS_HEADER, m, TRUE_MAPPINGS_PATH, mode="w") def lint_true_mappings(*, standardize: bool = False) -> None: @@ -219,14 +219,14 @@ def load_false_mappings() -> List[Dict[str, str]]: def append_false_mappings(m: Iterable[Mapping[str, str]], sort: bool = True) -> None: """Append new lines to the false mappings table.""" - _write_helper(MAPPINGS_HEADER, m, FALSE_MAPPINGS_PATH, "a") + _write_helper(MAPPINGS_HEADER, m, FALSE_MAPPINGS_PATH, mode="a") if sort: lint_false_mappings() def write_false_mappings(m: Iterable[Mapping[str, str]]) -> None: """Write mappings to the false mappings file.""" - _write_helper(MAPPINGS_HEADER, m, FALSE_MAPPINGS_PATH, "w") + _write_helper(MAPPINGS_HEADER, m, FALSE_MAPPINGS_PATH, mode="w") def lint_false_mappings(*, standardize: bool = False) -> None: @@ -246,14 +246,14 @@ def load_unsure() -> List[Dict[str, str]]: def append_unsure_mappings(m: Iterable[Mapping[str, str]], sort: bool = True) -> None: """Append new lines to the "unsure" mappings table.""" - _write_helper(MAPPINGS_HEADER, m, UNSURE_PATH, "a") + _write_helper(MAPPINGS_HEADER, m, UNSURE_PATH, mode="a") if sort: lint_unsure_mappings() def write_unsure_mappings(m: Iterable[Mapping[str, str]]) -> None: """Write mappings to the unsure mappings file.""" - _write_helper(MAPPINGS_HEADER, m, UNSURE_PATH, "w") + _write_helper(MAPPINGS_HEADER, m, UNSURE_PATH, mode="w") def lint_unsure_mappings(*, standardize: bool = False) -> None: @@ -273,24 +273,35 @@ def load_predictions(*, path: Optional[Path] = None) -> List[Dict[str, str]]: def write_predictions(m: Iterable[Mapping[str, str]], *, path: Optional[Path] = None) -> None: """Write new content to the predictions table.""" - _write_helper(PREDICTIONS_HEADER, m, path or PREDICTIONS_PATH, "w") + _write_helper(PREDICTIONS_HEADER, m, path or PREDICTIONS_PATH, mode="w") def append_prediction_tuples( - 
prediction_tuples: Iterable[PredictionTuple], deduplicate: bool = True, sort: bool = True + prediction_tuples: Iterable[PredictionTuple], + *, + deduplicate: bool = True, + sort: bool = True, + standardize: bool = True, ) -> None: """Append new lines to the predictions table that come as canonical tuples.""" append_predictions( (prediction_tuple.as_dict() for prediction_tuple in set(prediction_tuples)), deduplicate=deduplicate, sort=sort, + standardize=standardize, ) def append_predictions( - mappings: Iterable[Mapping[str, str]], deduplicate: bool = True, sort: bool = True + mappings: Iterable[Mapping[str, str]], + *, + deduplicate: bool = True, + sort: bool = True, + standardize: bool = True, ) -> None: """Append new lines to the predictions table.""" + if standardize: + mappings = _standardize_mappings(mappings) if deduplicate: existing_mappings = { get_canonical_tuple(existing_mapping) @@ -305,7 +316,7 @@ def append_predictions( mapping for mapping in mappings if get_canonical_tuple(mapping) not in existing_mappings ) - _write_helper(PREDICTIONS_HEADER, mappings, PREDICTIONS_PATH, "a") + _write_helper(PREDICTIONS_HEADER, mappings, PREDICTIONS_PATH, mode="a") if sort: lint_predictions() @@ -341,15 +352,15 @@ def lint_predictions(standardize: bool = False) -> None: def _remove_redundant(mappings, tuple_cls, standardize: bool = False): if standardize: - mappings = ( - _standardize_mapping(mapping) - for mapping in tqdm( - mappings, desc="Standardizing mappings", unit_scale=True, unit="mapping" - ) - ) + mappings = _standardize_mappings(mappings) return (mapping.as_dict() for mapping in {tuple_cls.from_dict(mapping) for mapping in mappings}) +def _standardize_mappings(mappings, *, progress: bool = True): + for mapping in tqdm(mappings, desc="Standardizing mappings", unit_scale=True, unit="mapping", disable=not progress): + yield _standardize_mapping(mapping) + + def _standardize_mapping(mapping): """Standardize a mapping.""" for prefix_key, identifier_key in [ diff 
--git a/src/biomappings/resources/predictions.tsv b/src/biomappings/resources/predictions.tsv index bc2a79e0..4c3fe9f7 100644 --- a/src/biomappings/resources/predictions.tsv +++ b/src/biomappings/resources/predictions.tsv @@ -9457,6 +9457,31 @@ chebi CHEBI:9910 Usambarensine skos:exactMatch mesh C070902 usambarensine semapv chebi CHEBI:9925 sodium valproate skos:exactMatch mesh C420746 Epilim semapv:LexicalMatching 0.95 generate_chebi_mesh_mappings.py chebi CHEBI:9954 Verbenalin skos:exactMatch mesh C000511 cornin iridoid semapv:LexicalMatching 0.95 generate_chebi_mesh_mappings.py chebi CHEBI:9955 (R)-(+)-verbenone skos:exactMatch mesh C052875 verbenone semapv:LexicalMatching 0.95 generate_chebi_mesh_mappings.py +clo 0001230 HEK293 skos:exactMatch cellosaurus CVCL_0045 HEK293 semapv:UnspecifiedMatching 0.8 clo +clo 0001345 3T3 cell skos:exactMatch mesh D016475 3T3 Cells semapv:UnspecifiedMatching 0.8 clo +clo 0002585 COR-L23 cell skos:exactMatch efo 0002142 CORL23 semapv:UnspecifiedMatching 0.8 clo +clo 0002596 COS-1 cell skos:exactMatch mesh D019556 COS Cells semapv:UnspecifiedMatching 0.8 clo +clo 0002597 COS-7 cell skos:exactMatch mesh D019556 COS Cells semapv:UnspecifiedMatching 0.8 clo +clo 0007050 K 562 cell skos:exactMatch cellosaurus CVCL_0004 K-562 semapv:UnspecifiedMatching 0.8 clo +clo 0007059 K-562 cell skos:exactMatch cellosaurus CVCL_0004 K-562 semapv:UnspecifiedMatching 0.8 clo +clo 0007634 MDA-MB-231 cell skos:exactMatch efo 0001209 MDAMB231 semapv:UnspecifiedMatching 0.8 clo +clo 0009034 SK-BR-3 cell skos:exactMatch efo 0001236 SKBR3 semapv:UnspecifiedMatching 0.8 clo +clo 0009040 SK-MEL-1 cell skos:exactMatch efo 0002332 SKMEL1 semapv:UnspecifiedMatching 0.8 clo +clo 0037163 Ishikawa cell skos:exactMatch cellosaurus CVCL_D199 Ishikawa 3-H-12 semapv:UnspecifiedMatching 0.8 clo +clo 0037230 Ishikawa 3-H-12 cell skos:exactMatch cellosaurus CVCL_D199 Ishikawa 3-H-12 semapv:UnspecifiedMatching 0.8 clo +clo 0037237 293-derived cell skos:exactMatch 
cellosaurus CVCL_0045 HEK293 semapv:UnspecifiedMatching 0.8 clo +clo 0037261 3T3-derived cell skos:exactMatch mesh D016475 3T3 Cells semapv:UnspecifiedMatching 0.8 clo +clo 0037287 COR123 cell skos:exactMatch efo 0002142 CORL23 semapv:UnspecifiedMatching 0.8 clo +clo 0037291 MDAMB231 cell skos:exactMatch efo 0001209 MDAMB231 semapv:UnspecifiedMatching 0.8 clo +clo 0037292 SKMEL1 skos:exactMatch efo 0002332 SKMEL1 semapv:UnspecifiedMatching 0.8 clo +clo 0037295 SKBR3 cell skos:exactMatch efo 0001236 SKBR3 semapv:UnspecifiedMatching 0.8 clo +clo 0037300 BALL-1 cell skos:exactMatch cellosaurus CVCL_1075 BALL-1 semapv:UnspecifiedMatching 0.8 clo +clo 0037372 HEK293T cell skos:exactMatch cellosaurus CVCL_0063 HEK293T semapv:UnspecifiedMatching 0.8 clo +clo 0050405 RCB2280 cell skos:exactMatch cellosaurus CVCL_1272 HCE-T semapv:UnspecifiedMatching 0.8 clo +clo 0050406 RCB1384 cell skos:exactMatch cellosaurus CVCL_1272 HCE-T semapv:UnspecifiedMatching 0.8 clo +clo 0050894 RCB2202 cell skos:exactMatch cellosaurus CVCL_0063 HEK293T semapv:UnspecifiedMatching 0.8 clo +clo 0051004 RCB0256 cell skos:exactMatch cellosaurus CVCL_1075 BALL-1 semapv:UnspecifiedMatching 0.8 clo +clo 0051005 RCB1882 cell skos:exactMatch cellosaurus CVCL_1075 BALL-1 semapv:UnspecifiedMatching 0.8 clo doid DOID:0050041 Astrakhan spotted fever skos:exactMatch umls C0549160 North Asian Tick Typhus semapv:LexicalMatching 0.5555555555555556 https://github.com/biomappings/biomappings/blob/f293d0/scripts/generate_doid_mappings.py doid DOID:0050042 Indian tick typhus skos:exactMatch umls C0343768 India tick typhus semapv:LexicalMatching 0.5555555555555556 https://github.com/biomappings/biomappings/blob/f293d0/scripts/generate_doid_mappings.py doid DOID:0050134 cutaneous mycosis skos:exactMatch umls C0011630 Dermatomycoses semapv:LexicalMatching 0.5555555555555556 https://github.com/biomappings/biomappings/blob/f293d0/scripts/generate_doid_mappings.py diff --git a/src/biomappings/templates/home.html 
b/src/biomappings/templates/home.html index f79b7621..1ebbf648 100644 --- a/src/biomappings/templates/home.html +++ b/src/biomappings/templates/home.html @@ -1,7 +1,7 @@ {% extends "base.html" %} -{% import "bootstrap/form.html" as wtf %} -{% import "bootstrap/utils.html" as util %} +{% import "bootstrap4/form.html" as wtf %} +{% import "bootstrap4/utils.html" as util %} {% set remaining_rows = controller.count_predictions(query=query, source_query=source_query, target_query=target_query, prefix=prefix, same_text=same_text, provenance=provenance) %}