Skip to content

Commit

Permalink
Add CLO processing notebook (#133)
Browse files Browse the repository at this point in the history
The [Cell Line Ontology (CLO)](https://bioregistry.io/registry/clo) is a
detailed resource; however, it does not follow the standard OBO modeling
pattern for cross-references, which uses either a predicate from
[SKOS](https://bioregistry.io/skos) or `oboInOwl:hasDbXref` to point to
a single CURIE encoded as a string. Instead, it uses `rdfs:seeAlso` with
a combination of non-standard CURIEs that are either comma- or
semicolon-delimited.

Depends on:
- biopragmatics/bioregistry#896
- CLO-ontology/CLO#103
  • Loading branch information
cthoyt authored Jul 3, 2023
1 parent bd52829 commit dc39c45
Show file tree
Hide file tree
Showing 9 changed files with 1,553 additions and 7 deletions.
1,366 changes: 1,366 additions & 0 deletions notebooks/Process CLO Mappings.ipynb

Large diffs are not rendered by default.

38 changes: 38 additions & 0 deletions scripts/generate_clo_mesh_mappings.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# -*- coding: utf-8 -*-

"""Generate mappings from CLO to MeSH and EFO."""

import click
from more_click import verbose_option
from semra.sources.clo import get_clo_mappings

from biomappings.gilda_utils import append_gilda_predictions
from biomappings.mapping_graph import get_filter_from_semra
from biomappings.utils import get_script_url


@click.command()
@verbose_option
def main():
    """Generate CLO-MeSH mappings."""
    # Result of get_script_url for this file; passed on as the provenance of
    # the predictions.
    script_url = get_script_url(__file__)

    # The mappings obtained from SeMRA's CLO source are converted into a
    # filter dictionary and handed to the prediction step as ``custom_filter``.
    clo_filter = get_filter_from_semra(get_clo_mappings())

    append_gilda_predictions(
        "clo",
        ["mesh", "efo"],
        provenance=script_url,
        custom_filter=clo_filter,
    )


if __name__ == "__main__":
    main()
5 changes: 2 additions & 3 deletions src/biomappings/gilda_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

import logging
from collections import defaultdict
from typing import Iterable, Mapping, Optional, Tuple, Union
from typing import Iterable, Optional, Tuple, Union

import bioregistry
import pyobo
Expand All @@ -13,11 +13,10 @@
from pyobo.gilda_utils import get_grounder, iter_gilda_prediction_tuples

from biomappings.resources import PredictionTuple, append_prediction_tuples
from biomappings.utils import CMapping

logger = logging.getLogger(__name__)

CMapping = Mapping[str, Mapping[str, Mapping[str, str]]]


def append_gilda_predictions(
prefix: str,
Expand Down
25 changes: 21 additions & 4 deletions src/biomappings/mapping_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,32 @@

import itertools as itt
from collections import defaultdict
from typing import DefaultDict, Dict, Iterable, Mapping, Optional
from typing import TYPE_CHECKING, DefaultDict, Dict, Iterable, List, Optional

import networkx as nx
import pyobo

from biomappings.utils import CMapping

def get_custom_filter(
prefix: str, targets: Iterable[str]
) -> Mapping[str, Mapping[str, Mapping[str, str]]]:
if TYPE_CHECKING:
import semra

__all__ = [
"get_custom_filter",
"get_filter_from_semra",
"mutual_mapping_graph",
]


def get_filter_from_semra(mappings: List["semra.Mapping"]) -> CMapping:
    """Get a custom filter dictionary from a set of SeMRA mappings.

    :param mappings: The SeMRA mappings to index. Only the prefix and identifier
        of each mapping's subject (``mapping.s``) and object (``mapping.o``) are read.
    :returns: A nested dictionary of the form
        ``subject prefix -> object prefix -> subject identifier -> object identifier``.
        If several mappings share the same subject and object prefix, the last
        one in iteration order wins. Plain dictionaries are returned so that
        looking up a missing key raises :class:`KeyError` instead of silently
        inserting an empty entry.
    """
    rv: Dict[str, Dict[str, Dict[str, str]]] = {}
    for mapping in mappings:
        by_target = rv.setdefault(mapping.s.prefix, {})
        by_source_id = by_target.setdefault(mapping.o.prefix, {})
        by_source_id[mapping.s.identifier] = mapping.o.identifier
    return rv


def get_custom_filter(prefix: str, targets: Iterable[str]) -> CMapping:
"""Get a custom filter dictionary induced over the mutual mapping graph with all target prefixes.
:param prefix: The source prefix
Expand Down
48 changes: 48 additions & 0 deletions src/biomappings/resources/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,37 @@ def from_dict(cls, mapping: Mapping[str, str]) -> "PredictionTuple":
values.append(value)
return cls(*values) # type:ignore

@classmethod
def from_semra(cls, mapping, confidence: float) -> "PredictionTuple":
    """Instantiate from a SeMRA mapping.

    :param mapping: A SeMRA mapping that carries exactly one evidence, which
        must be a :class:`semra.SimpleEvidence` annotated with a mapping set
    :param confidence: The confidence value to store in the prediction tuple
    :returns: A prediction tuple built from the subject, predicate, and object
        of the mapping, the names looked up via PyOBO, the evidence's
        justification, the given confidence, and the mapping set's name
    :raises KeyError: if no name can be looked up for the subject or the object
    :raises ValueError: if the mapping does not have exactly one evidence or
        the evidence has no mapping set
    :raises TypeError: if the evidence is not a :class:`semra.SimpleEvidence`
    """
    # Imported lazily so these packages are only needed when this constructor is used
    import pyobo
    import semra

    s_name = pyobo.get_name(*mapping.s.pair)
    if not s_name:
        raise KeyError(f"could not look up name for {mapping.s.curie}")
    o_name = pyobo.get_name(*mapping.o.pair)
    if not o_name:
        raise KeyError(f"could not look up name for {mapping.o.curie}")
    # Assume that each mapping has a single simple evidence with a mapping set annotation
    if len(mapping.evidence) != 1:
        raise ValueError(
            f"expected exactly one evidence for {mapping.s.curie} -> {mapping.o.curie}, "
            f"got {len(mapping.evidence)}"
        )
    evidence = mapping.evidence[0]
    if not isinstance(evidence, semra.SimpleEvidence):
        raise TypeError(
            f"expected a semra.SimpleEvidence for {mapping.s.curie} -> {mapping.o.curie}, "
            f"got {type(evidence).__name__}"
        )
    if evidence.mapping_set is None:
        raise ValueError(
            f"evidence for {mapping.s.curie} -> {mapping.o.curie} has no mapping set"
        )
    return cls(  # type:ignore
        *mapping.s.pair,
        s_name,
        mapping.p.curie,
        *mapping.o.pair,
        o_name,
        evidence.justification.curie,
        confidence,
        evidence.mapping_set.name,
    )

@property
def source_curie(self) -> str:
"""Concatenate the source prefix and ID to a CURIE."""
Expand Down Expand Up @@ -421,3 +452,20 @@ def get_curated_filter() -> Mapping[str, Mapping[str, Mapping[str, str]]]:
for m in itt.chain(load_mappings(), load_false_mappings(), load_unsure()):
d[m["source prefix"]][m["target prefix"]][m["source identifier"]] = m["target identifier"]
return {k: dict(v) for k, v in d.items()}


def prediction_tuples_from_semra(
    mappings,
    *,
    confidence: float,
) -> List[PredictionTuple]:
    """Convert SeMRA mappings into prediction tuples.

    :param mappings: An iterable of SeMRA mappings
    :param confidence: The confidence passed to :meth:`PredictionTuple.from_semra`
        for every mapping
    :returns: The prediction tuples, in input order, for all mappings whose
        conversion did not raise a :class:`KeyError`
    """
    predictions = []
    for semra_mapping in mappings:
        try:
            prediction = PredictionTuple.from_semra(semra_mapping, confidence)
        except KeyError as error:
            # Report the failed conversion and move on to the next mapping
            tqdm.write(str(error))
        else:
            predictions.append(prediction)
    return predictions
12 changes: 12 additions & 0 deletions src/biomappings/resources/incorrect.tsv
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,18 @@ cl CL:2000004 pituitary gland cell skos:exactMatch mesh D010902 Pituitary Gland
cl CL:2000021 sebaceous gland cell skos:exactMatch mesh D012627 Sebaceous Glands semapv:ManualMappingCuration orcid:0000-0001-9439-5346
cl CL:2000022 cardiac septum cell skos:exactMatch mesh D006346 Heart Septum semapv:ManualMappingCuration orcid:0000-0001-9439-5346
cl CL:2000030 hypothalamus cell skos:exactMatch mesh D007031 Hypothalamus semapv:ManualMappingCuration orcid:0000-0001-9439-5346
clo 0001922 BE2 cell skos:exactMatch mesh D016175 B-Lymphocyte Subsets semapv:ManualMappingCuration orcid:0000-0003-4423-4370 semapv:LexicalMatching https://github.com/biomappings/biomappings/blob/746fde/scripts/generate_clo_mesh_mappings.py 0.502
clo 0002596 COS-1 cell skos:exactMatch mesh D019556 COS Cells semapv:ManualMappingCuration orcid:0000-0003-4423-4370 semapv:UnspecifiedMatching clo 0.8
clo 0002597 COS-7 cell skos:exactMatch mesh D019556 COS Cells semapv:ManualMappingCuration orcid:0000-0003-4423-4370 semapv:UnspecifiedMatching clo 0.8
clo 0002941 EPI cell skos:exactMatch mesh D015251 Epirubicin semapv:ManualMappingCuration orcid:0000-0003-4423-4370 semapv:LexicalMatching https://github.com/biomappings/biomappings/blob/746fde/scripts/generate_clo_mesh_mappings.py 0.556
clo 0003413 G cell skos:exactMatch mesh D019863 Gastrin-Secreting Cells semapv:ManualMappingCuration orcid:0000-0003-4423-4370 semapv:LexicalMatching https://github.com/biomappings/biomappings/blob/746fde/scripts/generate_clo_mesh_mappings.py 0.549
clo 0009279 TC-1 cell skos:exactMatch mesh D013602 T-Lymphocytes, Cytotoxic semapv:ManualMappingCuration orcid:0000-0003-4423-4370 semapv:LexicalMatching https://github.com/biomappings/biomappings/blob/746fde/scripts/generate_clo_mesh_mappings.py 0.549
clo 0037163 Ishikawa cell skos:exactMatch cellosaurus CVCL_D199 Ishikawa 3-H-12 semapv:ManualMappingCuration orcid:0000-0003-4423-4370 semapv:UnspecifiedMatching clo 0.8
clo 0037237 293-derived cell skos:exactMatch cellosaurus CVCL_0045 HEK293 semapv:ManualMappingCuration orcid:0000-0003-4423-4370 semapv:UnspecifiedMatching clo 0.8
clo 0037261 3T3-derived cell skos:exactMatch mesh D016475 3T3 Cells semapv:ManualMappingCuration orcid:0000-0003-4423-4370 semapv:UnspecifiedMatching clo 0.8
clo 0037287 COR123 cell skos:exactMatch efo 0002142 CORL23 semapv:ManualMappingCuration orcid:0000-0003-4423-4370 semapv:UnspecifiedMatching clo 0.8
clo 0051004 RCB0256 cell skos:exactMatch cellosaurus CVCL_1075 BALL-1 semapv:ManualMappingCuration orcid:0000-0003-4423-4370 semapv:UnspecifiedMatching clo 0.8
clo 0051005 RCB1882 cell skos:exactMatch cellosaurus CVCL_1075 BALL-1 semapv:ManualMappingCuration orcid:0000-0003-4423-4370 semapv:UnspecifiedMatching clo 0.8
doid DOID:0001816 angiosarcoma skos:exactMatch mesh D006394 Hemangiosarcoma semapv:ManualMappingCuration orcid:0000-0003-4423-4370
doid DOID:0001816 angiosarcoma skos:exactMatch umls C0018923 Hemangiosarcoma semapv:ManualMappingCuration orcid:0000-0003-4423-4370
doid DOID:0001816 angiosarcoma skos:exactMatch umls C0278592 Adult Angiosarcoma semapv:ManualMappingCuration orcid:0000-0003-4423-4370
Expand Down
20 changes: 20 additions & 0 deletions src/biomappings/resources/mappings.tsv
Original file line number Diff line number Diff line change
Expand Up @@ -3011,6 +3011,26 @@ cl CL:0008002 skeletal muscle fiber skos:exactMatch mesh D018485 Muscle Fibers,
cl CL:0010003 epithelial cell of alveolus of lung skos:exactMatch mesh D056809 Alveolar Epithelial Cells semapv:ManualMappingCuration orcid:0000-0001-9439-5346
cl CL:0010017 zygote skos:exactMatch mesh D015053 Zygote semapv:ManualMappingCuration orcid:0000-0001-9439-5346
cl CL:0010021 cardiac myoblast skos:exactMatch mesh D032386 Myoblasts, Cardiac semapv:ManualMappingCuration orcid:0000-0001-9439-5346
clo 0000031 cell line skos:exactMatch mesh D002460 Cell Line semapv:ManualMappingCuration orcid:0000-0003-4423-4370 semapv:LexicalMatching https://github.com/biomappings/biomappings/blob/746fde/scripts/generate_clo_mesh_mappings.py 0.762
clo 0001230 HEK293 skos:exactMatch cellosaurus CVCL_0045 HEK293 semapv:ManualMappingCuration orcid:0000-0003-4423-4370 semapv:UnspecifiedMatching clo 0.8
clo 0001345 3T3 cell skos:exactMatch mesh D016475 3T3 Cells semapv:ManualMappingCuration orcid:0000-0003-4423-4370 semapv:UnspecifiedMatching clo 0.8
clo 0001601 A549 cell skos:exactMatch mesh D000072283 A549 Cells semapv:ManualMappingCuration orcid:0000-0003-4423-4370 semapv:LexicalMatching https://github.com/biomappings/biomappings/blob/746fde/scripts/generate_clo_mesh_mappings.py 0.549
clo 0002585 COR-L23 cell skos:exactMatch efo 0002142 CORL23 semapv:ManualMappingCuration orcid:0000-0003-4423-4370 semapv:UnspecifiedMatching clo 0.8
clo 0003704 Hep G2 cell skos:exactMatch mesh D056945 Hep G2 Cells semapv:ManualMappingCuration orcid:0000-0003-4423-4370 semapv:LexicalMatching https://github.com/biomappings/biomappings/blob/746fde/scripts/generate_clo_mesh_mappings.py 0.549
clo 0007606 MCF7 cell skos:exactMatch mesh D061986 MCF-7 Cells semapv:ManualMappingCuration orcid:0000-0003-4423-4370 semapv:LexicalMatching https://github.com/biomappings/biomappings/blob/746fde/scripts/generate_clo_mesh_mappings.py 0.549
clo 0007634 MDA-MB-231 cell skos:exactMatch efo 0001209 MDAMB231 semapv:ManualMappingCuration orcid:0000-0003-4423-4370 semapv:UnspecifiedMatching clo 0.8
clo 0007634 MDA-MB-231 cell skos:exactMatch mesh D000092302 MDA-MB-231 Cells semapv:ManualMappingCuration orcid:0000-0003-4423-4370 semapv:LexicalMatching https://github.com/biomappings/biomappings/blob/746fde/scripts/generate_clo_mesh_mappings.py 0.549
clo 0007646 MDCK cell skos:exactMatch mesh D061985 Madin Darby Canine Kidney Cells semapv:ManualMappingCuration orcid:0000-0003-4423-4370 semapv:LexicalMatching https://github.com/biomappings/biomappings/blob/746fde/scripts/generate_clo_mesh_mappings.py 0.549
clo 0008395 PC-3 cell skos:exactMatch mesh D000078722 PC-3 Cells semapv:ManualMappingCuration orcid:0000-0003-4423-4370 semapv:LexicalMatching https://github.com/biomappings/biomappings/blob/746fde/scripts/generate_clo_mesh_mappings.py 0.549
clo 0008753 RAW 264.7 cell skos:exactMatch mesh D000067996 RAW 264.7 Cells semapv:ManualMappingCuration orcid:0000-0003-4423-4370 semapv:LexicalMatching https://github.com/biomappings/biomappings/blob/746fde/scripts/generate_clo_mesh_mappings.py 0.549
clo 0009348 THP-1 cell skos:exactMatch mesh D000074084 THP-1 Cells semapv:ManualMappingCuration orcid:0000-0003-4423-4370 semapv:LexicalMatching https://github.com/biomappings/biomappings/blob/746fde/scripts/generate_clo_mesh_mappings.py 0.549
clo 0036932 Hybridoma skos:exactMatch mesh D006825 Hybridomas semapv:ManualMappingCuration orcid:0000-0003-4423-4370 semapv:LexicalMatching https://github.com/biomappings/biomappings/blob/746fde/scripts/generate_clo_mesh_mappings.py 0.556
clo 0036936 Somatic cell hybrid skos:exactMatch mesh D006822 Hybrid Cells semapv:ManualMappingCuration orcid:0000-0003-4423-4370 semapv:LexicalMatching https://github.com/biomappings/biomappings/blob/746fde/scripts/generate_clo_mesh_mappings.py 0.549
clo 0037116 LNCaP cell skos:exactMatch efo 0002071 LNCAP semapv:ManualMappingCuration orcid:0000-0003-4423-4370 semapv:LexicalMatching https://github.com/biomappings/biomappings/blob/b250ea/scripts/generate_clo_mesh_mappings.py 0.53
clo 0037230 Ishikawa 3-H-12 cell skos:exactMatch cellosaurus CVCL_D199 Ishikawa 3-H-12 semapv:ManualMappingCuration orcid:0000-0003-4423-4370 semapv:UnspecifiedMatching clo 0.8
clo 0037291 MDAMB231 cell skos:exactMatch mesh D000092302 MDA-MB-231 Cells semapv:ManualMappingCuration orcid:0000-0003-4423-4370 semapv:LexicalMatching https://github.com/biomappings/biomappings/blob/746fde/scripts/generate_clo_mesh_mappings.py 0.549
clo 0037300 BALL-1 cell skos:exactMatch cellosaurus CVCL_1075 BALL-1 semapv:ManualMappingCuration orcid:0000-0003-4423-4370 semapv:UnspecifiedMatching clo 0.8
clo 0037339 tissue donor skos:exactMatch mesh D014019 Tissue Donors semapv:ManualMappingCuration orcid:0000-0003-4423-4370 semapv:LexicalMatching https://github.com/biomappings/biomappings/blob/746fde/scripts/generate_clo_mesh_mappings.py 0.54
doid DOID:0040002 aspirin allergy skos:exactMatch umls C0004058 Allergy to aspirin semapv:ManualMappingCuration orcid:0000-0003-4423-4370
doid DOID:0040004 amoxicillin allergy skos:exactMatch umls C0571417 Allergy to amoxicillin semapv:ManualMappingCuration orcid:0000-0003-4423-4370
doid DOID:0040005 ceftriaxone allergy skos:exactMatch umls C0571463 Allergy to ceftriaxone semapv:ManualMappingCuration orcid:0000-0003-4423-4370
Expand Down
Loading

0 comments on commit dc39c45

Please sign in to comment.