Skip to content

Commit

Permalink
Add variant-gene mapping
Browse files Browse the repository at this point in the history
  • Loading branch information
nanglo123 committed Dec 12, 2023
1 parent cec82f8 commit 8aafab4
Showing 1 changed file with 58 additions and 18 deletions.
76 changes: 58 additions & 18 deletions src/indra_cogex/sources/disgenet/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,9 @@
CURATED_DISEASE_VARIANT_ASSOCIATIONS_URL = (
f"{DOWNLOAD_BASE}/curated_variant_disease_associations.tsv.gz"
)
CURATED_VARIANT_GENE_ASSOCIATIONS_URL = (
f"{DOWNLOAD_BASE}/variant_to_gene_mappings.tsv.gz"
)

TARGET_KEYS = {
"NofSnps": int,
Expand Down Expand Up @@ -62,31 +65,43 @@ class DisgenetProcessor(Processor):
node_types = ["BioEntity"]
gene_relation = "gene_disease_association"
variant_relation = "variant_disease_association"
variant_gene_relation = "variant_gene_association"

def __init__(self):
"""Initialize the DisGeNet processor."""
self.gene_df = load_disgenet_disease_gene(
CURATED_DISEASE_GENES_ASSOCIATIONS_URL
CURATED_DISEASE_GENES_ASSOCIATIONS_URL, gene_present=True,
disease_present=True
)
self.variant_df = load_disgenet_disease_gene(
CURATED_DISEASE_VARIANT_ASSOCIATIONS_URL, variant=True
CURATED_DISEASE_VARIANT_ASSOCIATIONS_URL, gene_present=False,
disease_present=True
)

self.variant_gene_df = load_disgenet_disease_gene(
CURATED_VARIANT_GENE_ASSOCIATIONS_URL, gene_present=True,
disease_present=False
)

def get_nodes(self): # noqa:D102
diseases = {
tuple(row)
for df in [self.gene_df, self.variant_df]
for row in df[["disease_prefix", "disease_id", "disease_name"]].values
for row in
df[["disease_prefix", "disease_id", "disease_name"]].values
}
for prefix, identifier, name in diseases:
yield Node.standardized(
db_ns=prefix, db_id=identifier, name=name, labels=["BioEntity"]
)
for hgnc_id in self.gene_df["hgnc_id"].unique():
yield Node.standardized(db_ns="HGNC", db_id=hgnc_id, labels=["BioEntity"])
yield Node.standardized(db_ns="HGNC", db_id=hgnc_id,
labels=["BioEntity"])

for dbsnp_id in self.variant_df["snpId"].unique():
yield Node.standardized(
db_ns="DBSNP", db_id=dbsnp_id, name=dbsnp_id, labels=["BioEntity"]
db_ns="DBSNP", db_id=dbsnp_id, name=dbsnp_id,
labels=["BioEntity"]
)

def get_relations(self): # noqa:D102
Expand All @@ -100,15 +115,21 @@ def get_relations(self): # noqa:D102
for hgnc_id, disease_prefix, disease_id, snps, papers in self.gene_df[
columns
].values:
data = {"snps:int": snps, "source": self.name, "papers:int": papers}
data = {"snps:int": snps, "source": self.name,
"papers:int": papers}
yield Relation(
"HGNC", hgnc_id, disease_prefix, disease_id, self.gene_relation, data
"HGNC", hgnc_id, disease_prefix, disease_id,
self.gene_relation, data
)

yield from _yield_variant_relations(
self.variant_df, self.name, self.variant_relation
)

yield from _yield_from_variant_gene_relations(
self.variant_gene_df, self.name, self.variant_gene_relation
)


def _yield_variant_relations(variants_df, name, relation):
columns = [
Expand All @@ -130,11 +151,27 @@ def _yield_variant_relations(variants_df, name, relation):
data["dsi:float"] = dsi
if pd.notna(dpi):
data["dpi:float"] = dpi
yield Relation("DBSNP", snp_id, disease_prefix, disease_id, relation, data)
yield Relation("DBSNP", snp_id, disease_prefix, disease_id,
relation, data)


def _yield_from_variant_gene_relations(variant_gene_df, name, relation):
    """Yield one variant-to-gene relation per row of the mapping table.

    Parameters
    ----------
    variant_gene_df :
        Data frame providing at least the ``snpId`` and ``hgnc_id`` columns.
    name :
        Value stored under the ``"source"`` key of each relation's data.
    relation :
        Relation type handed through unchanged to every ``Relation``.

    Yields
    ------
    Relation
        Built positionally from ``"DBSNP"``, the row's ``snpId``, ``"HGNC"``,
        the row's ``hgnc_id``, ``relation`` and a ``{"source": name}`` dict.

    Notes
    -----
    Rows are emitted in data frame order and are not de-duplicated here, so
    repeated (``snpId``, ``hgnc_id``) pairs produce repeated relations.
    """
    pairs = variant_gene_df[["snpId", "hgnc_id"]].values
    for variant_id, gene_id in pairs:
        # A fresh dict per relation so that no two relations share one
        # mutable data payload.
        yield Relation(
            "DBSNP", variant_id, "HGNC", gene_id, relation, {"source": name}
        )


def load_disgenet_disease_gene(
url, force: bool = False, variant: bool = False
url, gene_present: bool, disease_present: bool, force: bool = False,
) -> pd.DataFrame:
"""Export disease-gene association file."""
df = SUBMODULE.ensure_csv(
Expand All @@ -158,17 +195,19 @@ def load_disgenet_disease_gene(
click.echo("done writing UMLS mapper")

click.echo("mapping UMLS")
(
df["disease_prefix"],
df["disease_id"],
df["disease_name"],
) = zip(*df["diseaseId"].map(umls_mapper.standardize))
click.echo("done mapping UMLS")

# Filter out ungroundable
df = df[df["disease_prefix"].notna()]
if disease_present:
(
df["disease_prefix"],
df["disease_id"],
df["disease_name"],
) = zip(*df["diseaseId"].map(umls_mapper.standardize))
click.echo("done mapping UMLS")

if not variant:
# Filter out ungroundable
df = df[df["disease_prefix"].notna()]

if gene_present:
click.echo("mapping HGNC")
df["hgnc_id"] = df["geneId"].map(
lambda s: hgnc_client.get_hgnc_from_entrez(s.strip())
Expand All @@ -177,3 +216,4 @@ def load_disgenet_disease_gene(
df = df[df["hgnc_id"].notna()]

return df

0 comments on commit 8aafab4

Please sign in to comment.