Merge pull request #147 from gyorilab/add-disgenet

Add DisGeNet processor

bgyori authored Dec 13, 2023
2 parents 6c7b4b4 + cac4e04 commit 6d27b5e
Showing 9 changed files with 341 additions and 80 deletions.
3 changes: 3 additions & 0 deletions README.md
@@ -31,6 +31,9 @@ non-causal contextual relations including properties, ontology, and data.
| [NIH reporter](https://reporter.nih.gov) | has_publication | The NIH Reporter Project represented by the source has an associated publication represented by the target. |
| [NIH reporter](https://reporter.nih.gov) | has_clinical_trial | The NIH Reporter Project represented by the source has an associated clinical trial represented by the target. |
| [NIH reporter](https://reporter.nih.gov) | has_patent | The NIH Reporter Project represented by the source has an associated patent represented by the target. |
| [DisGeNet](https://www.disgenet.org/) | gene_disease_association | Literature-curated associations between genes and diseases. |
| [DisGeNet](https://www.disgenet.org/) | variant_disease_association | Literature-curated associations between variants (e.g., identified by dbSNP) and diseases. |
| [DisGeNet](https://www.disgenet.org/) | variant_gene_association | Literature-curated associations between variants (e.g., from dbSNP) and genes. |
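
For illustration, a minimal sketch of how these three edge types are emitted by the processor added below, using the `Relation(source_ns, source_id, target_ns, target_id, rel_type, data)` constructor from `indra_cogex.representation`; the identifiers are hypothetical examples, not values taken from DisGeNet.

```python
from indra_cogex.representation import Relation

# Hypothetical identifiers, for illustration only.
gene_disease = Relation(
    "HGNC", "613", "MESH", "D000544", "gene_disease_association",
    {"source": "disgenet", "disgenet_score:float": 0.9},
)
variant_disease = Relation(
    "DBSNP", "rs429358", "MESH", "D000544", "variant_disease_association",
    {"source": "disgenet"},
)
variant_gene = Relation(
    "DBSNP", "rs429358", "HGNC", "613", "variant_gene_association",
    {"source": "disgenet"},
)
```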

## Installation

5 changes: 5 additions & 0 deletions docs/source/modules/sources/disgenet.rst
@@ -0,0 +1,5 @@
DisGeNet Processor (:py:mod:`indra_cogex.sources.disgenet`)
===========================================================
.. automodule:: indra_cogex.sources.disgenet
    :members:
    :show-inheritance:
1 change: 1 addition & 0 deletions docs/source/modules/sources/index.rst
@@ -21,3 +21,4 @@ INDRA CoGEx Sources
   pathways
   pubmed
   sider
   disgenet
7 changes: 5 additions & 2 deletions src/indra_cogex/sources/__init__.py
@@ -13,6 +13,7 @@
from .cellmarker import CellMarkerProcessor
from .chembl import ChemblIndicationsProcessor
from .clinicaltrials import ClinicaltrialsProcessor
from .disgenet import DisgenetProcessor
from .goa import GoaProcessor
from .hpoa import HpDiseasePhenotypeProcessor, HpPhenotypeGeneProcessor
from .indra_db import DbProcessor, EvidenceProcessor
@@ -48,7 +49,9 @@
"InterproProcessor",
"CellMarkerProcessor",
"JournalPublisherProcessor",
"DisgenetProcessor",
]

processor_resolver = Resolver.from_subclasses(Processor,
                                              skip=[WikiDataProcessor])
processor_resolver: Resolver[Processor] = Resolver.from_subclasses(
    Processor, skip=[WikiDataProcessor]
)
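
As a quick sanity check on the registration above, the new processor should be resolvable by name through `processor_resolver`; a sketch, assuming `class_resolver`'s usual name normalization (the `Processor` suffix is stripped):

```python
from indra_cogex.sources import processor_resolver

# "DisgenetProcessor" normalizes to the key "disgenet".
processor_cls = processor_resolver.lookup("disgenet")
assert processor_cls.name == "disgenet"
```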
222 changes: 222 additions & 0 deletions src/indra_cogex/sources/disgenet/__init__.py
@@ -0,0 +1,222 @@
# -*- coding: utf-8 -*-

"""Process DisGeNet, a resource for gene-disease and variant-disease associations."""

import logging
import pickle

import click
import pandas as pd
import pystow
from indra.databases import hgnc_client

from indra_cogex.representation import Node, Relation
from indra_cogex.sources.processor import Processor
from indra_cogex.sources.utils import UmlsMapper

__all__ = [
    "DisgenetProcessor",
]

logger = logging.getLogger(__name__)

SUBMODULE = pystow.module("indra", "cogex", "disgenet")

DOWNLOAD_BASE = "https://www.disgenet.org/static/disgenet_ap1/files/downloads"
CURATED_DISEASE_GENES_ASSOCIATIONS_URL = (
    f"{DOWNLOAD_BASE}/curated_gene_disease_associations.tsv.gz"
)
CURATED_DISEASE_VARIANT_ASSOCIATIONS_URL = (
    f"{DOWNLOAD_BASE}/curated_variant_disease_associations.tsv.gz"
)
CURATED_VARIANT_GENE_ASSOCIATIONS_URL = (
    f"{DOWNLOAD_BASE}/variant_to_gene_mappings.tsv.gz"
)


class DisgenetProcessor(Processor):
    """Processor for the DisGeNet database."""

    name = "disgenet"
    node_types = ["BioEntity"]
    gene_relation = "gene_disease_association"
    variant_relation = "variant_disease_association"
    variant_gene_relation = "variant_gene_association"

    def __init__(self):
        """Initialize the DisGeNet processor."""
        self.gene_df = load_disgenet_disease_gene(
            CURATED_DISEASE_GENES_ASSOCIATIONS_URL
        )
        self.variant_df = load_disgenet_disease_variant(
            CURATED_DISEASE_VARIANT_ASSOCIATIONS_URL
        )
        self.variant_gene_df = load_disgenet_variant_gene(
            CURATED_VARIANT_GENE_ASSOCIATIONS_URL
        )

    def get_nodes(self):  # noqa:D102
        # Collect diseases from both association tables so each grounded
        # disease is yielded exactly once.
        diseases = {
            tuple(row)
            for df in [self.gene_df, self.variant_df]
            for row in df[["disease_prefix", "disease_id", "disease_name"]].values
        }
        for prefix, identifier, name in diseases:
            yield Node.standardized(
                db_ns=prefix, db_id=identifier, name=name, labels=["BioEntity"]
            )
        for hgnc_id in self.gene_df["hgnc_id"].unique():
            yield Node.standardized(db_ns="HGNC", db_id=hgnc_id, labels=["BioEntity"])

        for dbsnp_id in self.variant_df["snpId"].unique():
            yield Node.standardized(
                db_ns="DBSNP", db_id=dbsnp_id, name=dbsnp_id, labels=["BioEntity"]
            )

    def get_relations(self):  # noqa:D102
        yield from _yield_gene_relations(self.gene_df, self.name, self.gene_relation)
        yield from _yield_variant_relations(
            self.variant_df, self.name, self.variant_relation
        )
        yield from _yield_variant_gene_relations(
            self.variant_gene_df, self.name, self.variant_gene_relation
        )


def _yield_gene_relations(df, name, relation):
    columns = [
        "hgnc_id",
        "disease_prefix",
        "disease_id",
        "DSI",
        "DPI",
        "score",
        "NofSnps",
        "NofPmids",
    ]
    # The unpacking order must match the column order above: score comes
    # before NofSnps. (The original unpacking swapped `snps` and `score`.)
    for hgnc_id, disease_prefix, disease_id, dsi, dpi, score, snps, papers in (
        df[columns].drop_duplicates().values
    ):
        # Property keys carry type annotations (":int", ":float") for the
        # Neo4j bulk import headers.
        data = {"snps:int": snps, "source": name, "papers:int": papers}
        if pd.notna(dsi):
            data["disgenet_dsi:float"] = dsi
        if pd.notna(dpi):
            data["disgenet_dpi:float"] = dpi
        if pd.notna(score):
            data["disgenet_score:float"] = score
        yield Relation("HGNC", hgnc_id, disease_prefix, disease_id, relation, data)


def _yield_variant_relations(df, name, relation):
    columns = [
        "snpId",
        "DSI",
        "DPI",
        "score",
        "disease_prefix",
        "disease_id",
        "NofPmids",
    ]
    for snp_id, dsi, dpi, score, disease_prefix, disease_id, papers in (
        df[columns].drop_duplicates().values
    ):
        data = {
            "source": name,
            "papers:int": papers,
        }
        if pd.notna(dsi):
            data["disgenet_dsi:float"] = dsi
        if pd.notna(dpi):
            data["disgenet_dpi:float"] = dpi
        if pd.notna(score):
            data["disgenet_score:float"] = score
        yield Relation("DBSNP", snp_id, disease_prefix, disease_id, relation, data)


def _yield_variant_gene_relations(df, name, relation):
    columns = ["snpId", "hgnc_id"]
    for snp_id, hgnc_id in df[columns].drop_duplicates().values:
        data = {"source": name}
        yield Relation("DBSNP", snp_id, "HGNC", hgnc_id, relation, data)


def load_disgenet_disease_gene(
    url,
    force: bool = False,
) -> pd.DataFrame:
    """Load and ground the curated disease-gene association file."""
    df = SUBMODULE.ensure_csv(
        url=url,
        read_csv_kwargs=dict(dtype={"geneId": str}),
        force=force,
    )
    _map_disease(df)
    # Filter out ungroundable diseases
    df = df[df["disease_prefix"].notna()]
    _map_entrez(df)
    # Filter out genes that could not be mapped to HGNC
    df = df[df["hgnc_id"].notna()]
    return df


def load_disgenet_disease_variant(
    url,
    force: bool = False,
) -> pd.DataFrame:
    """Load and ground the curated disease-variant association file."""
    df = SUBMODULE.ensure_csv(
        url=url,
        read_csv_kwargs=dict(dtype={"snpId": str}),
        force=force,
    )
    _map_disease(df)
    # Filter out ungroundable diseases
    df = df[df["disease_prefix"].notna()]
    return df


def _map_disease(df):
    # Building the UMLS mapper is expensive, so it is cached as a pickle
    # in the pystow module directory after the first run.
    mapper_path = SUBMODULE.join(name="umls_mapper.pkl")
    if mapper_path.is_file():
        click.echo("loading UMLS mapper")
        umls_mapper = pickle.loads(mapper_path.read_bytes())
        click.echo("done loading UMLS mapper")
    else:
        click.echo("building UMLS mapper")
        umls_mapper = UmlsMapper()
        click.echo("writing UMLS mapper")
        mapper_path.write_bytes(
            pickle.dumps(umls_mapper, protocol=pickle.HIGHEST_PROTOCOL)
        )
        click.echo("done writing UMLS mapper")

    click.echo("mapping UMLS")
    (
        df["disease_prefix"],
        df["disease_id"],
        df["disease_name"],
    ) = zip(*df["diseaseId"].map(umls_mapper.standardize))
    click.echo("done mapping UMLS")


def load_disgenet_variant_gene(
    url,
    force: bool = False,
) -> pd.DataFrame:
    """Load the curated variant-gene mapping file."""
    df = SUBMODULE.ensure_csv(
        url=url,
        read_csv_kwargs=dict(dtype={"geneId": str, "snpId": str}),
        force=force,
    )
    _map_entrez(df)
    df = df[df["hgnc_id"].notna()]
    return df


def _map_entrez(df):
    click.echo("mapping HGNC")
    # Map Entrez gene IDs to HGNC IDs; unmapped genes become None and are
    # filtered out by the callers.
    df["hgnc_id"] = df["geneId"].map(
        lambda s: hgnc_client.get_hgnc_from_entrez(s.strip())
    )
    click.echo("done mapping HGNC")
4 changes: 4 additions & 0 deletions src/indra_cogex/sources/disgenet/__main__.py
@@ -0,0 +1,4 @@
from . import DisgenetProcessor

if __name__ == "__main__":
    DisgenetProcessor.cli()
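
With this entry point in place, the processor can also be invoked as `python -m indra_cogex.sources.disgenet`; `cli()` is inherited from the `Processor` base class and presumably wraps the standard process-and-dump flow.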
84 changes: 6 additions & 78 deletions src/indra_cogex/sources/sider/__init__.py
@@ -9,18 +9,16 @@
import gilda
import gilda.grounder
import pandas as pd
import pyobo
import pystow
from biomappings import load_mappings
from tabulate import tabulate
from tqdm import tqdm

from indra.databases import biolookup_client
from indra.databases.identifiers import get_ns_id_from_identifiers
from indra.ontology.bio import bio_ontology
from indra_cogex.representation import Node, Relation, standardize
from indra_cogex.sources import Processor
from tabulate import tabulate
from tqdm import tqdm

from indra_cogex.representation import Node, Relation
from indra_cogex.sources import Processor
from indra_cogex.sources.utils import UmlsMapper

VERSION = "4.1"
SUBMODULE = pystow.module("indra", "cogex", "sider", VERSION)
@@ -43,77 +41,6 @@ def stitch_stereo_to_pubchem(cid: str) -> str:
    return re.sub(cid_to_pubchem_pattern, "\\1", cid)


class UmlsMapper:
    """A utility class for mapping out of UMLS."""

    prefixes = ["doid", "mesh", "hp", "efo", "mondo"]

    def __init__(self):
        """Prepare the UMLS mappings from PyOBO and Biomappings."""
        #: A dictionary from external prefix to UMLS id to external ID
        self.xrefs = {}

        for prefix in self.prefixes:
            self.xrefs[prefix] = {}
            # Get external to UMLS
            for external_id, umls_id in pyobo.get_filtered_xrefs(
                prefix, "umls"
            ).items():
                self.xrefs[prefix][umls_id] = external_id
            # Get UMLS to external
            for umls_id, external_id in pyobo.get_filtered_xrefs(
                "umls", prefix
            ).items():
                self.xrefs[prefix][umls_id] = external_id

        # Get manually curated UMLS mappings from biomappings
        biomappings_from_umls, biomappings_to_umls = Counter(), Counter()
        for mapping in load_mappings():
            if mapping["source prefix"] == "umls":
                target_prefix = mapping["target prefix"]
                biomappings_from_umls[target_prefix] += 1
                target_id = mapping["target identifier"]
                source_id = mapping["source identifier"]
                if target_prefix in self.xrefs:
                    self.xrefs[target_prefix][target_id] = source_id
                else:
                    self.xrefs[target_prefix] = {
                        target_id: source_id,
                    }
            elif mapping["target prefix"] == "umls":
                source_prefix = mapping["source prefix"]
                biomappings_to_umls[source_prefix] += 1
                source_id = mapping["source identifier"]
                target_id = mapping["target identifier"]
                if source_prefix in self.xrefs:
                    self.xrefs[source_prefix][source_id] = target_id
                else:
                    self.xrefs[source_prefix] = {
                        source_id: target_id,
                    }

        print("Mapping out of UMLS")
        print(tabulate(biomappings_from_umls.most_common()))
        print("Mapping into UMLS")
        print(tabulate(biomappings_to_umls.most_common()))

        print("Total xrefs")
        print(
            tabulate(
                [(prefix, len(self.xrefs[prefix])) for prefix in self.prefixes],
                headers=["Prefix", "Mappings"],
            )
        )

    def lookup(self, umls_id: str):
        for prefix in self.prefixes:
            xrefs = self.xrefs[prefix]
            identifier = xrefs.get(umls_id)
            if identifier is not None:
                return standardize(prefix, identifier)
        return "umls", umls_id, pyobo.get_name("umls", umls_id)


class SIDERSideEffectProcessor(Processor):
"""A processor for SIDER side effects."""

@@ -167,6 +94,7 @@ def __init__(self):
        umls_mapper = UmlsMapper()
        self.side_effects = {}
        for umls_id in self.df["UMLS CUI from MedDRA"].unique():
            # TODO replace with "standardize"
            prefix, identifier, name = umls_mapper.lookup(umls_id)
            db_ns, db_id = get_ns_id_from_identifiers(prefix, identifier)
            if db_ns is None:
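
The `UmlsMapper` class deleted here now lives in `indra_cogex.sources.utils` (imported above and reused by the new DisGeNet processor). A sketch of the shared interface, inferred from the call sites in this diff; the CUI is a hypothetical example:

```python
from indra_cogex.sources.utils import UmlsMapper

umls_mapper = UmlsMapper()

# lookup() maps a UMLS CUI to a preferred namespace (doid, mesh, hp, efo,
# mondo), falling back to UMLS itself when no xref is available.
prefix, identifier, name = umls_mapper.lookup("C0002395")

# standardize(), used by the DisGeNet processor's _map_disease(), is assumed
# to return a comparable (prefix, identifier, name) triple.
prefix, identifier, name = umls_mapper.standardize("C0002395")
```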