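Final cleanup: remove TARGET_KEYS and factor the shared relation-yielding, UMLS disease-mapping, and Entrez-to-HGNC mapping code into helper functions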

gyorilab · Dec 12, 2023 · 03ee98e
1 parent 6d949b4
commit 03ee98e
Showing 1 changed file with 33 additions and 85 deletions.
diff --git a/src/indra_cogex/sources/disgenet/__init__.py b/src/indra_cogex/sources/disgenet/__init__.py
@@ -33,30 +33,6 @@
     f"{DOWNLOAD_BASE}/variant_to_gene_mappings.tsv.gz"
 )
 
-TARGET_KEYS = {
-    "NofSnps": int,
-    "DSI": str,
-    "DPI": str,
-    "diseaseType": str,
-    "diseaseSemanticType": str,
-    "score": str,
-    "EI": str,
-    "YearInitial": int,
-    "YearFinal": int,
-    "NofPmids": str,
-    "source": lambda s: set(s.split(";")) if s and pd.notna(s) else s,
-    "GD_dis1GeneIdSet": int,
-    "GD_dis2GeneIdSet": int,
-    "GD_commonGeneIdSet": int,
-    "GD_jaccard": float,
-    "GD_pvalue": float,
-    "VD_variant1GeneIdSet": int,
-    "VD_variant2GeneIdSet": int,
-    "VD_commonVariantIdSet": int,
-    "VD_jaccard": float,
-    "VD_pvalue": float,
-}
-
 
 class DisgenetProcessor(Processor):
     """Processor for the DisGeNet database."""
@@ -99,31 +75,29 @@ def get_nodes(self):  # noqa:D102
             )
 
     def get_relations(self):  # noqa:D102
-        columns = [
-            "hgnc_id",
-            "disease_prefix",
-            "disease_id",
-            "NofSnps",
-            "NofPmids",
-        ]
-        for hgnc_id, disease_prefix, disease_id, snps, papers in self.gene_df[
-            columns
-        ].values:
-            data = {"snps:int": snps, "source": self.name, "papers:int": papers}
-            yield Relation(
-                "HGNC", hgnc_id, disease_prefix, disease_id, self.gene_relation, data
-            )
-
+        yield from _yield_gene_relations(self.gene_df, self.name, self.gene_relation)
         yield from _yield_variant_relations(
             self.variant_df, self.name, self.variant_relation
         )
-
         yield from _yield_variant_gene_relations(
             self.variant_gene_df, self.name, self.variant_gene_relation
         )
 
 
-def _yield_variant_relations(variants_df, name, relation):
+def _yield_gene_relations(df, name, relation):
+    columns = [
+        "hgnc_id",
+        "disease_prefix",
+        "disease_id",
+        "NofSnps",
+        "NofPmids",
+    ]
+    for hgnc_id, disease_prefix, disease_id, snps, papers in df[columns].values:
+        data = {"snps:int": snps, "source": name, "papers:int": papers}
+        yield Relation("HGNC", hgnc_id, disease_prefix, disease_id, relation, data)
+
+
+def _yield_variant_relations(df, name, relation):
     columns = [
         "snpId",
         "DSI",
@@ -132,9 +106,7 @@ def _yield_variant_relations(variants_df, name, relation):
         "disease_id",
         "NofPmids",
     ]
-    for snp_id, dsi, dpi, disease_prefix, disease_id, papers in variants_df[
-        columns
-    ].values:
+    for snp_id, dsi, dpi, disease_prefix, disease_id, papers in df[columns].values:
         data = {
             "source": name,
             "papers:int": papers,
@@ -146,9 +118,9 @@ def _yield_variant_relations(variants_df, name, relation):
         yield Relation("DBSNP", snp_id, disease_prefix, disease_id, relation, data)
 
 
-def _yield_variant_gene_relations(variant_gene_df, name, relation):
+def _yield_variant_gene_relations(df, name, relation):
     columns = ["snpId", "hgnc_id"]
-    for snp_id, hgnc_id in variant_gene_df[columns].values:
+    for snp_id, hgnc_id in df[columns].values:
         data = {"source": name}
         yield Relation("DBSNP", snp_id, "HGNC", hgnc_id, relation, data)
 
@@ -163,40 +135,11 @@ def load_disgenet_disease_gene(
         read_csv_kwargs=dict(dtype={"geneId": str}),
         force=force,
     )
-
-    mapper_path = SUBMODULE.join(name="umls_mapper.pkl")
-    if mapper_path.is_file():
-        click.echo("loading UMLS mapper")
-        umls_mapper = pickle.loads(mapper_path.read_bytes())
-        click.echo("done loading UMLS mapper")
-    else:
-        click.echo("loading UMLS mapper")
-        umls_mapper = UmlsMapper()
-        click.echo("writing UMLS mapper")
-        mapper_path.write_bytes(
-            pickle.dumps(umls_mapper, protocol=pickle.HIGHEST_PROTOCOL)
-        )
-        click.echo("done writing UMLS mapper")
-
-    click.echo("mapping UMLS")
-
-    (
-        df["disease_prefix"],
-        df["disease_id"],
-        df["disease_name"],
-    ) = zip(*df["diseaseId"].map(umls_mapper.standardize))
-    click.echo("done mapping UMLS")
-
+    _map_disease(df)
     # Filter out ungroundable
     df = df[df["disease_prefix"].notna()]
-
-    click.echo("mapping HGNC")
-    df["hgnc_id"] = df["geneId"].map(
-        lambda s: hgnc_client.get_hgnc_from_entrez(s.strip())
-    )
-    click.echo("done mapping HGNC")
+    _map_entrez(df)
     df = df[df["hgnc_id"].notna()]
-
     return df
 
 
@@ -209,6 +152,13 @@ def load_disgenet_disease_variant(
         read_csv_kwargs=dict(dtype={"snpId": str}),
         force=force,
     )
+    _map_disease(df)
+    # Filter out ungroundable
+    df = df[df["disease_prefix"].notna()]
+    return df
+
+
+def _map_disease(df):
     mapper_path = SUBMODULE.join(name="umls_mapper.pkl")
     if mapper_path.is_file():
         click.echo("loading UMLS mapper")
@@ -224,18 +174,14 @@ def load_disgenet_disease_variant(
         click.echo("done writing UMLS mapper")
 
     click.echo("mapping UMLS")
+
     (
         df["disease_prefix"],
         df["disease_id"],
         df["disease_name"],
     ) = zip(*df["diseaseId"].map(umls_mapper.standardize))
     click.echo("done mapping UMLS")
 
-    # Filter out ungroundable
-    df = df[df["disease_prefix"].notna()]
-
-    return df
-
 
 def load_disgenet_variant_gene(
     url,
@@ -246,12 +192,14 @@ def load_disgenet_variant_gene(
         read_csv_kwargs=dict(dtype={"geneId": str, "snpId": str}),
         force=force,
     )
+    _map_entrez(df)
+    df = df[df["hgnc_id"].notna()]
+    return df
+
 
+def _map_entrez(df):
     click.echo("mapping HGNC")
     df["hgnc_id"] = df["geneId"].map(
         lambda s: hgnc_client.get_hgnc_from_entrez(s.strip())
     )
     click.echo("done mapping HGNC")
-    df = df[df["hgnc_id"].notna()]
-
-    return df