diff --git a/src/indra_cogex/sources/disgenet/__init__.py b/src/indra_cogex/sources/disgenet/__init__.py index 9bda41531..44a9c478e 100644 --- a/src/indra_cogex/sources/disgenet/__init__.py +++ b/src/indra_cogex/sources/disgenet/__init__.py @@ -29,6 +29,9 @@ CURATED_DISEASE_VARIANT_ASSOCIATIONS_URL = ( f"{DOWNLOAD_BASE}/curated_variant_disease_associations.tsv.gz" ) +CURATED_VARIANT_GENE_ASSOCIATIONS_URL = ( + f"{DOWNLOAD_BASE}/variant_to_gene_mappings.tsv.gz" +) TARGET_KEYS = { "NofSnps": int, @@ -62,31 +65,43 @@ class DisgenetProcessor(Processor): node_types = ["BioEntity"] gene_relation = "gene_disease_association" variant_relation = "variant_disease_association" + variant_gene_relation = "variant_gene_association" def __init__(self): """Initialize the DisGeNet processor.""" self.gene_df = load_disgenet_disease_gene( - CURATED_DISEASE_GENES_ASSOCIATIONS_URL + CURATED_DISEASE_GENES_ASSOCIATIONS_URL, gene_present=True, + disease_present=True ) self.variant_df = load_disgenet_disease_gene( - CURATED_DISEASE_VARIANT_ASSOCIATIONS_URL, variant=True + CURATED_DISEASE_VARIANT_ASSOCIATIONS_URL, gene_present=False, + disease_present=True + ) + + self.variant_gene_df = load_disgenet_disease_gene( + CURATED_VARIANT_GENE_ASSOCIATIONS_URL, gene_present=True, + disease_present=False ) def get_nodes(self): # noqa:D102 diseases = { tuple(row) for df in [self.gene_df, self.variant_df] - for row in df[["disease_prefix", "disease_id", "disease_name"]].values + for row in + df[["disease_prefix", "disease_id", "disease_name"]].values } for prefix, identifier, name in diseases: yield Node.standardized( db_ns=prefix, db_id=identifier, name=name, labels=["BioEntity"] ) for hgnc_id in self.gene_df["hgnc_id"].unique(): - yield Node.standardized(db_ns="HGNC", db_id=hgnc_id, labels=["BioEntity"]) + yield Node.standardized(db_ns="HGNC", db_id=hgnc_id, + labels=["BioEntity"]) + for dbsnp_id in self.variant_df["snpId"].unique(): yield Node.standardized( - db_ns="DBSNP", db_id=dbsnp_id, name=dbsnp_id, labels=["BioEntity"] + db_ns="DBSNP", db_id=dbsnp_id, name=dbsnp_id, + labels=["BioEntity"] ) def get_relations(self): # noqa:D102 @@ -100,15 +115,21 @@ def get_relations(self): # noqa:D102 for hgnc_id, disease_prefix, disease_id, snps, papers in self.gene_df[ columns ].values: - data = {"snps:int": snps, "source": self.name, "papers:int": papers} + data = {"snps:int": snps, "source": self.name, + "papers:int": papers} yield Relation( - "HGNC", hgnc_id, disease_prefix, disease_id, self.gene_relation, data + "HGNC", hgnc_id, disease_prefix, disease_id, + self.gene_relation, data ) yield from _yield_variant_relations( self.variant_df, self.name, self.variant_relation ) + yield from _yield_from_variant_gene_relations( + self.variant_gene_df, self.name, self.variant_gene_relation + ) + def _yield_variant_relations(variants_df, name, relation): columns = [ @@ -130,11 +151,27 @@ def _yield_variant_relations(variants_df, name, relation): data["dsi:float"] = dsi if pd.notna(dpi): data["dpi:float"] = dpi - yield Relation("DBSNP", snp_id, disease_prefix, disease_id, relation, data) + yield Relation("DBSNP", snp_id, disease_prefix, disease_id, + relation, data) + + +def _yield_from_variant_gene_relations(variant_gene_df, name, relation): + columns = [ + "snpId", + "hgnc_id" + ] + for snp_id, hgnc_id in variant_gene_df[ + columns + ].values: + data = { + "source": name + } + yield Relation("DBSNP", snp_id, "HGNC", hgnc_id, + relation, data) def load_disgenet_disease_gene( - url, force: bool = False, variant: bool = False + url, gene_present: bool, disease_present: bool, force: bool = False, ) -> pd.DataFrame: """Export disease-gene association file.""" df = SUBMODULE.ensure_csv( @@ -158,17 +195,19 @@ def load_disgenet_disease_gene( click.echo("done writing UMLS mapper") click.echo("mapping UMLS") - ( - df["disease_prefix"], - df["disease_id"], - df["disease_name"], - ) = zip(*df["diseaseId"].map(umls_mapper.standardize)) - click.echo("done mapping UMLS") - # Filter out ungroundable - df = df[df["disease_prefix"].notna()] + if disease_present: + ( + df["disease_prefix"], + df["disease_id"], + df["disease_name"], + ) = zip(*df["diseaseId"].map(umls_mapper.standardize)) + click.echo("done mapping UMLS") - if not variant: + # Filter out ungroundable + df = df[df["disease_prefix"].notna()] + + if gene_present: click.echo("mapping HGNC") df["hgnc_id"] = df["geneId"].map( lambda s: hgnc_client.get_hgnc_from_entrez(s.strip()) @@ -177,3 +216,4 @@ def load_disgenet_disease_gene( df = df[df["hgnc_id"].notna()] return df +