Merge pull request #147 from gyorilab/add-disgenet

Add DisGeNet processor

bgyori authored Dec 13, 2023
2 parents 6c7b4b4 + cac4e04 commit 6d27b5e
Showing 9 changed files with 341 additions and 80 deletions.
3 changes: 3 additions & 0 deletions README.md
@@ -31,6 +31,9 @@ non-causal contextual relations including properties, ontology, and data.
| [NIH reporter](https://reporter.nih.gov) | has_publication | The NIH Reporter Project represented by the source has an associated publication represented by the target. |
| [NIH reporter](https://reporter.nih.gov) | has_clinical_trial | The NIH Reporter Project represented by the source has an associated clinical trial represented by the target. |
| [NIH reporter](https://reporter.nih.gov) | has_patent | The NIH Reporter Project represented by the source has an associated patent represented by the target. |
| [DisGeNet](https://www.disgenet.org/) | gene_disease_association | Literature-curated associations between genes and diseases. |
| [DisGeNet](https://www.disgenet.org/) | variant_disease_association | Literature-curated associations between variants (e.g., identified by dbSNP) and diseases. |
| [DisGeNet](https://www.disgenet.org/) | variant_gene_association | Literature-curated associations between variants (e.g., from dbSNP) and genes. |
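
For illustration, a minimal sketch of how these three edge types are emitted by the processor added below, using the `Relation(source_ns, source_id, target_ns, target_id, rel_type, data)` constructor from `indra_cogex.representation`; the identifiers are hypothetical examples, not values taken from DisGeNet.

```python
from indra_cogex.representation import Relation

# Hypothetical identifiers, for illustration only.
gene_disease = Relation(
    "HGNC", "613", "MESH", "D000544", "gene_disease_association",
    {"source": "disgenet", "disgenet_score:float": 0.9},
)
variant_disease = Relation(
    "DBSNP", "rs429358", "MESH", "D000544", "variant_disease_association",
    {"source": "disgenet"},
)
variant_gene = Relation(
    "DBSNP", "rs429358", "HGNC", "613", "variant_gene_association",
    {"source": "disgenet"},
)
```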

## Installation

5 changes: 5 additions & 0 deletions docs/source/modules/sources/disgenet.rst
@@ -0,0 +1,5 @@
DisGeNet Processor (:py:mod:`indra_cogex.sources.disgenet`)
===========================================================
.. automodule:: indra_cogex.sources.disgenet
    :members:
    :show-inheritance:
1 change: 1 addition & 0 deletions docs/source/modules/sources/index.rst
@@ -21,3 +21,4 @@ INDRA CoGEx Sources
   pathways
   pubmed
   sider
   disgenet
7 changes: 5 additions & 2 deletions src/indra_cogex/sources/__init__.py
@@ -13,6 +13,7 @@
from .cellmarker import CellMarkerProcessor
from .chembl import ChemblIndicationsProcessor
from .clinicaltrials import ClinicaltrialsProcessor
from .disgenet import DisgenetProcessor
from .goa import GoaProcessor
from .hpoa import HpDiseasePhenotypeProcessor, HpPhenotypeGeneProcessor
from .indra_db import DbProcessor, EvidenceProcessor
@@ -48,7 +49,9 @@
"InterproProcessor",
"CellMarkerProcessor",
"JournalPublisherProcessor",
"DisgenetProcessor",
]

processor_resolver = Resolver.from_subclasses(Processor,
                                              skip=[WikiDataProcessor])
processor_resolver: Resolver[Processor] = Resolver.from_subclasses(
    Processor, skip=[WikiDataProcessor]
)
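
As a quick sanity check on the registration above, the new processor should be resolvable by name through `processor_resolver`; a sketch, assuming `class_resolver`'s usual name normalization (the `Processor` suffix is stripped):

```python
from indra_cogex.sources import processor_resolver

# "DisgenetProcessor" normalizes to the key "disgenet".
processor_cls = processor_resolver.lookup("disgenet")
assert processor_cls.name == "disgenet"
```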
222 changes: 222 additions & 0 deletions src/indra_cogex/sources/disgenet/__init__.py
@@ -0,0 +1,222 @@
# -*- coding: utf-8 -*-

"""Process DisGeNet, a resource for gene-disease and variant-disease associations."""

import logging
import pickle

import click
import pandas as pd
import pystow
from indra.databases import hgnc_client

from indra_cogex.representation import Node, Relation
from indra_cogex.sources.processor import Processor
from indra_cogex.sources.utils import UmlsMapper

__all__ = [
    "DisgenetProcessor",
]

logger = logging.getLogger(__name__)

SUBMODULE = pystow.module("indra", "cogex", "disgenet")

DOWNLOAD_BASE = "https://www.disgenet.org/static/disgenet_ap1/files/downloads"
CURATED_DISEASE_GENES_ASSOCIATIONS_URL = (
    f"{DOWNLOAD_BASE}/curated_gene_disease_associations.tsv.gz"
)
CURATED_DISEASE_VARIANT_ASSOCIATIONS_URL = (
    f"{DOWNLOAD_BASE}/curated_variant_disease_associations.tsv.gz"
)
CURATED_VARIANT_GENE_ASSOCIATIONS_URL = (
    f"{DOWNLOAD_BASE}/variant_to_gene_mappings.tsv.gz"
)


class DisgenetProcessor(Processor):
    """Processor for the DisGeNet database."""

    name = "disgenet"
    node_types = ["BioEntity"]
    gene_relation = "gene_disease_association"
    variant_relation = "variant_disease_association"
    variant_gene_relation = "variant_gene_association"

    def __init__(self):
        """Initialize the DisGeNet processor."""
        self.gene_df = load_disgenet_disease_gene(
            CURATED_DISEASE_GENES_ASSOCIATIONS_URL
        )
        self.variant_df = load_disgenet_disease_variant(
            CURATED_DISEASE_VARIANT_ASSOCIATIONS_URL
        )
        self.variant_gene_df = load_disgenet_variant_gene(
            CURATED_VARIANT_GENE_ASSOCIATIONS_URL
        )

    def get_nodes(self):  # noqa:D102
        # Collect diseases from both association tables so each grounded
        # disease is yielded exactly once.
        diseases = {
            tuple(row)
            for df in [self.gene_df, self.variant_df]
            for row in df[["disease_prefix", "disease_id", "disease_name"]].values
        }
        for prefix, identifier, name in diseases:
            yield Node.standardized(
                db_ns=prefix, db_id=identifier, name=name, labels=["BioEntity"]
            )
        for hgnc_id in self.gene_df["hgnc_id"].unique():
            yield Node.standardized(db_ns="HGNC", db_id=hgnc_id, labels=["BioEntity"])

        for dbsnp_id in self.variant_df["snpId"].unique():
            yield Node.standardized(
                db_ns="DBSNP", db_id=dbsnp_id, name=dbsnp_id, labels=["BioEntity"]
            )

    def get_relations(self):  # noqa:D102
        yield from _yield_gene_relations(self.gene_df, self.name, self.gene_relation)
        yield from _yield_variant_relations(
            self.variant_df, self.name, self.variant_relation
        )
        yield from _yield_variant_gene_relations(
            self.variant_gene_df, self.name, self.variant_gene_relation
        )


def _yield_gene_relations(df, name, relation):
    columns = [
        "hgnc_id",
        "disease_prefix",
        "disease_id",
        "DSI",
        "DPI",
        "score",
        "NofSnps",
        "NofPmids",
    ]
    # The unpacking order must match the column order above: score comes
    # before NofSnps. (The original unpacking swapped `snps` and `score`.)
    for hgnc_id, disease_prefix, disease_id, dsi, dpi, score, snps, papers in (
        df[columns].drop_duplicates().values
    ):
        # Property keys carry type annotations (":int", ":float") for the
        # Neo4j bulk import headers.
        data = {"snps:int": snps, "source": name, "papers:int": papers}
        if pd.notna(dsi):
            data["disgenet_dsi:float"] = dsi
        if pd.notna(dpi):
            data["disgenet_dpi:float"] = dpi
        if pd.notna(score):
            data["disgenet_score:float"] = score
        yield Relation("HGNC", hgnc_id, disease_prefix, disease_id, relation, data)


def _yield_variant_relations(df, name, relation):
    columns = [
        "snpId",
        "DSI",
        "DPI",
        "score",
        "disease_prefix",
        "disease_id",
        "NofPmids",
    ]
    for snp_id, dsi, dpi, score, disease_prefix, disease_id, papers in (
        df[columns].drop_duplicates().values
    ):
        data = {
            "source": name,
            "papers:int": papers,
        }
        if pd.notna(dsi):
            data["disgenet_dsi:float"] = dsi
        if pd.notna(dpi):
            data["disgenet_dpi:float"] = dpi
        if pd.notna(score):
            data["disgenet_score:float"] = score
        yield Relation("DBSNP", snp_id, disease_prefix, disease_id, relation, data)


def _yield_variant_gene_relations(df, name, relation):
    columns = ["snpId", "hgnc_id"]
    for snp_id, hgnc_id in df[columns].drop_duplicates().values:
        data = {"source": name}
        yield Relation("DBSNP", snp_id, "HGNC", hgnc_id, relation, data)


def load_disgenet_disease_gene(
    url,
    force: bool = False,
) -> pd.DataFrame:
    """Load and ground the curated disease-gene association file."""
    df = SUBMODULE.ensure_csv(
        url=url,
        read_csv_kwargs=dict(dtype={"geneId": str}),
        force=force,
    )
    _map_disease(df)
    # Filter out ungroundable diseases
    df = df[df["disease_prefix"].notna()]
    _map_entrez(df)
    # Filter out genes that could not be mapped to HGNC
    df = df[df["hgnc_id"].notna()]
    return df


def load_disgenet_disease_variant(
    url,
    force: bool = False,
) -> pd.DataFrame:
    """Load and ground the curated disease-variant association file."""
    df = SUBMODULE.ensure_csv(
        url=url,
        read_csv_kwargs=dict(dtype={"snpId": str}),
        force=force,
    )
    _map_disease(df)
    # Filter out ungroundable diseases
    df = df[df["disease_prefix"].notna()]
    return df


def _map_disease(df):
    # Building the UMLS mapper is expensive, so it is cached as a pickle
    # in the pystow module directory after the first run.
    mapper_path = SUBMODULE.join(name="umls_mapper.pkl")
    if mapper_path.is_file():
        click.echo("loading UMLS mapper")
        umls_mapper = pickle.loads(mapper_path.read_bytes())
        click.echo("done loading UMLS mapper")
    else:
        click.echo("building UMLS mapper")
        umls_mapper = UmlsMapper()
        click.echo("writing UMLS mapper")
        mapper_path.write_bytes(
            pickle.dumps(umls_mapper, protocol=pickle.HIGHEST_PROTOCOL)
        )
        click.echo("done writing UMLS mapper")

    click.echo("mapping UMLS")
    (
        df["disease_prefix"],
        df["disease_id"],
        df["disease_name"],
    ) = zip(*df["diseaseId"].map(umls_mapper.standardize))
    click.echo("done mapping UMLS")


def load_disgenet_variant_gene(
    url,
    force: bool = False,
) -> pd.DataFrame:
    """Load the curated variant-gene mapping file."""
    df = SUBMODULE.ensure_csv(
        url=url,
        read_csv_kwargs=dict(dtype={"geneId": str, "snpId": str}),
        force=force,
    )
    _map_entrez(df)
    df = df[df["hgnc_id"].notna()]
    return df


def _map_entrez(df):
    click.echo("mapping HGNC")
    # Map Entrez gene IDs to HGNC IDs; unmapped genes become None and are
    # filtered out by the callers.
    df["hgnc_id"] = df["geneId"].map(
        lambda s: hgnc_client.get_hgnc_from_entrez(s.strip())
    )
    click.echo("done mapping HGNC")
4 changes: 4 additions & 0 deletions src/indra_cogex/sources/disgenet/__main__.py
@@ -0,0 +1,4 @@
from . import DisgenetProcessor

if __name__ == "__main__":
    DisgenetProcessor.cli()
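
With this entry point in place, the processor can also be invoked as `python -m indra_cogex.sources.disgenet`; `cli()` is inherited from the `Processor` base class and presumably wraps the standard process-and-dump flow.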
84 changes: 6 additions & 78 deletions src/indra_cogex/sources/sider/__init__.py
@@ -9,18 +9,16 @@
import gilda
import gilda.grounder
import pandas as pd
import pyobo
import pystow
from biomappings import load_mappings
from tabulate import tabulate
from tqdm import tqdm

from indra.databases import biolookup_client
from indra.databases.identifiers import get_ns_id_from_identifiers
from indra.ontology.bio import bio_ontology
from indra_cogex.representation import Node, Relation, standardize
from indra_cogex.sources import Processor
from tabulate import tabulate
from tqdm import tqdm

from indra_cogex.representation import Node, Relation
from indra_cogex.sources import Processor
from indra_cogex.sources.utils import UmlsMapper

VERSION = "4.1"
SUBMODULE = pystow.module("indra", "cogex", "sider", VERSION)
@@ -43,77 +41,6 @@ def stitch_stereo_to_pubchem(cid: str) -> str:
    return re.sub(cid_to_pubchem_pattern, "\\1", cid)


class UmlsMapper:
    """A utility class for mapping out of UMLS."""

    prefixes = ["doid", "mesh", "hp", "efo", "mondo"]

    def __init__(self):
        """Prepare the UMLS mappings from PyOBO and Biomappings."""
        #: A dictionary from external prefix to UMLS id to external ID
        self.xrefs = {}

        for prefix in self.prefixes:
            self.xrefs[prefix] = {}
            # Get external to UMLS
            for external_id, umls_id in pyobo.get_filtered_xrefs(
                prefix, "umls"
            ).items():
                self.xrefs[prefix][umls_id] = external_id
            # Get UMLS to external
            for umls_id, external_id in pyobo.get_filtered_xrefs(
                "umls", prefix
            ).items():
                self.xrefs[prefix][umls_id] = external_id

        # Get manually curated UMLS mappings from biomappings
        biomappings_from_umls, biomappings_to_umls = Counter(), Counter()
        for mapping in load_mappings():
            if mapping["source prefix"] == "umls":
                target_prefix = mapping["target prefix"]
                biomappings_from_umls[target_prefix] += 1
                target_id = mapping["target identifier"]
                source_id = mapping["source identifier"]
                if target_prefix in self.xrefs:
                    self.xrefs[target_prefix][target_id] = source_id
                else:
                    self.xrefs[target_prefix] = {
                        target_id: source_id,
                    }
            elif mapping["target prefix"] == "umls":
                source_prefix = mapping["source prefix"]
                biomappings_to_umls[source_prefix] += 1
                source_id = mapping["source identifier"]
                target_id = mapping["target identifier"]
                if source_prefix in self.xrefs:
                    self.xrefs[source_prefix][source_id] = target_id
                else:
                    self.xrefs[source_prefix] = {
                        source_id: target_id,
                    }

        print("Mapping out of UMLS")
        print(tabulate(biomappings_from_umls.most_common()))
        print("Mapping into UMLS")
        print(tabulate(biomappings_to_umls.most_common()))

        print("Total xrefs")
        print(
            tabulate(
                [(prefix, len(self.xrefs[prefix])) for prefix in self.prefixes],
                headers=["Prefix", "Mappings"],
            )
        )

    def lookup(self, umls_id: str):
        for prefix in self.prefixes:
            xrefs = self.xrefs[prefix]
            identifier = xrefs.get(umls_id)
            if identifier is not None:
                return standardize(prefix, identifier)
        return "umls", umls_id, pyobo.get_name("umls", umls_id)


class SIDERSideEffectProcessor(Processor):
"""A processor for SIDER side effects."""

@@ -167,6 +94,7 @@ def __init__(self):
        umls_mapper = UmlsMapper()
        self.side_effects = {}
        for umls_id in self.df["UMLS CUI from MedDRA"].unique():
            # TODO replace with "standardize"
            prefix, identifier, name = umls_mapper.lookup(umls_id)
            db_ns, db_id = get_ns_id_from_identifiers(prefix, identifier)
            if db_ns is None:
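
The `UmlsMapper` class deleted here now lives in `indra_cogex.sources.utils` (imported above and reused by the new DisGeNet processor). A sketch of the shared interface, inferred from the call sites in this diff; the CUI is a hypothetical example:

```python
from indra_cogex.sources.utils import UmlsMapper

umls_mapper = UmlsMapper()

# lookup() maps a UMLS CUI to a preferred namespace (doid, mesh, hp, efo,
# mondo), falling back to UMLS itself when no xref is available.
prefix, identifier, name = umls_mapper.lookup("C0002395")

# standardize(), used by the DisGeNet processor's _map_disease(), is assumed
# to return a comparable (prefix, identifier, name) triple.
prefix, identifier, name = umls_mapper.standardize("C0002395")
```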