Add variant-gene mapping

gyorilab · Dec 12, 2023 · 8aafab4 · 8aafab4
1 parent cec82f8
commit 8aafab4
Showing 1 changed file with 58 additions and 18 deletions.
diff --git a/src/indra_cogex/sources/disgenet/__init__.py b/src/indra_cogex/sources/disgenet/__init__.py
@@ -29,6 +29,9 @@
 CURATED_DISEASE_VARIANT_ASSOCIATIONS_URL = (
     f"{DOWNLOAD_BASE}/curated_variant_disease_associations.tsv.gz"
 )
+CURATED_VARIANT_GENE_ASSOCIATIONS_URL = (
+    f"{DOWNLOAD_BASE}/variant_to_gene_mappings.tsv.gz"
+)
 
 TARGET_KEYS = {
     "NofSnps": int,
@@ -62,31 +65,43 @@ class DisgenetProcessor(Processor):
     node_types = ["BioEntity"]
     gene_relation = "gene_disease_association"
     variant_relation = "variant_disease_association"
+    variant_gene_relation = "variant_gene_association"
 
     def __init__(self):
         """Initialize the DisGeNet processor."""
         self.gene_df = load_disgenet_disease_gene(
-            CURATED_DISEASE_GENES_ASSOCIATIONS_URL
+            CURATED_DISEASE_GENES_ASSOCIATIONS_URL, gene_present=True,
+            disease_present=True
         )
         self.variant_df = load_disgenet_disease_gene(
-            CURATED_DISEASE_VARIANT_ASSOCIATIONS_URL, variant=True
+            CURATED_DISEASE_VARIANT_ASSOCIATIONS_URL, gene_present=False,
+            disease_present=True
+        )
+
+        self.variant_gene_df = load_disgenet_disease_gene(
+            CURATED_VARIANT_GENE_ASSOCIATIONS_URL, gene_present=True,
+            disease_present=False
         )
 
     def get_nodes(self):  # noqa:D102
         diseases = {
             tuple(row)
             for df in [self.gene_df, self.variant_df]
-            for row in df[["disease_prefix", "disease_id", "disease_name"]].values
+            for row in
+            df[["disease_prefix", "disease_id", "disease_name"]].values
         }
         for prefix, identifier, name in diseases:
             yield Node.standardized(
                 db_ns=prefix, db_id=identifier, name=name, labels=["BioEntity"]
             )
         for hgnc_id in self.gene_df["hgnc_id"].unique():
-            yield Node.standardized(db_ns="HGNC", db_id=hgnc_id, labels=["BioEntity"])
+            yield Node.standardized(db_ns="HGNC", db_id=hgnc_id,
+                                    labels=["BioEntity"])
+
         for dbsnp_id in self.variant_df["snpId"].unique():
             yield Node.standardized(
-                db_ns="DBSNP", db_id=dbsnp_id, name=dbsnp_id, labels=["BioEntity"]
+                db_ns="DBSNP", db_id=dbsnp_id, name=dbsnp_id,
+                labels=["BioEntity"]
             )
 
     def get_relations(self):  # noqa:D102
@@ -100,15 +115,21 @@ def get_relations(self):  # noqa:D102
         for hgnc_id, disease_prefix, disease_id, snps, papers in self.gene_df[
             columns
         ].values:
-            data = {"snps:int": snps, "source": self.name, "papers:int": papers}
+            data = {"snps:int": snps, "source": self.name,
+                    "papers:int": papers}
             yield Relation(
-                "HGNC", hgnc_id, disease_prefix, disease_id, self.gene_relation, data
+                "HGNC", hgnc_id, disease_prefix, disease_id,
+                self.gene_relation, data
             )
 
         yield from _yield_variant_relations(
             self.variant_df, self.name, self.variant_relation
         )
 
+        yield from _yield_from_variant_gene_relations(
+            self.variant_gene_df, self.name, self.variant_gene_relation
+        )
+
 
 def _yield_variant_relations(variants_df, name, relation):
     columns = [
@@ -130,11 +151,27 @@ def _yield_variant_relations(variants_df, name, relation):
             data["dsi:float"] = dsi
         if pd.notna(dpi):
             data["dpi:float"] = dpi
-        yield Relation("DBSNP", snp_id, disease_prefix, disease_id, relation, data)
+        yield Relation("DBSNP", snp_id, disease_prefix, disease_id,
+                       relation, data)
+
+
+def _yield_from_variant_gene_relations(variant_gene_df, name, relation):
+    """Yield a DBSNP-to-HGNC relation for every row of the mapping table.
+
+    Only the ``snpId`` and ``hgnc_id`` columns of ``variant_gene_df`` are
+    read; each relation is typed ``relation`` and is given
+    ``{"source": name}`` as its final argument.
+    """
+    pairs = variant_gene_df[["snpId", "hgnc_id"]].values
+    for dbsnp_id, gene_id in pairs:
+        yield Relation(
+            "DBSNP", dbsnp_id, "HGNC", gene_id, relation,
+            {"source": name},
+        )
 
 
 def load_disgenet_disease_gene(
-    url, force: bool = False, variant: bool = False
+    url, gene_present: bool, disease_present: bool, force: bool = False,
 ) -> pd.DataFrame:
     """Export disease-gene association file."""
     df = SUBMODULE.ensure_csv(
@@ -158,17 +195,19 @@ def load_disgenet_disease_gene(
         click.echo("done writing UMLS mapper")
 
     click.echo("mapping UMLS")
-    (
-        df["disease_prefix"],
-        df["disease_id"],
-        df["disease_name"],
-    ) = zip(*df["diseaseId"].map(umls_mapper.standardize))
-    click.echo("done mapping UMLS")
 
-    # Filter out ungroundable
-    df = df[df["disease_prefix"].notna()]
+    if disease_present:
+        (
+            df["disease_prefix"],
+            df["disease_id"],
+            df["disease_name"],
+        ) = zip(*df["diseaseId"].map(umls_mapper.standardize))
+        click.echo("done mapping UMLS")
 
-    if not variant:
+        # Filter out ungroundable
+        df = df[df["disease_prefix"].notna()]
+
+    if gene_present:
         click.echo("mapping HGNC")
         df["hgnc_id"] = df["geneId"].map(
             lambda s: hgnc_client.get_hgnc_from_entrez(s.strip())
@@ -177,3 +216,4 @@ def load_disgenet_disease_gene(
         df = df[df["hgnc_id"].notna()]
 
     return df
+