Skip to content

Commit

Permalink
Black and minor refactor
Browse files Browse the repository at this point in the history
  • Loading branch information
cthoyt committed Dec 12, 2023
1 parent e7af087 commit 7d65d72
Show file tree
Hide file tree
Showing 2 changed files with 56 additions and 54 deletions.
106 changes: 54 additions & 52 deletions src/indra_cogex/sources/disgenet/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from indra_cogex.sources.utils import UmlsMapper

__all__ = [
"DisgenetGeneProcessor",
"DisgenetProcessor",
]

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -53,32 +53,33 @@
}


class DisgenetGeneProcessor(Processor):
class DisgenetProcessor(Processor):
"""Processor for the DisGeNet database."""

name = "disgenet_gene"
df: pd.DataFrame
name = "disgenet"
node_types = ["BioEntity"]
relation = "gene_disease_association"
gene_relation = "gene_disease_association"
variant_relation = "variant_disease_association"

def __init__(self):
"""Initialize the DisGeNet processor."""
self.df = load_disgenet_disease_gene(
CURATED_DISEASE_GENES_ASSOCIATIONS_URL)
self.gene_df = load_disgenet_disease_gene(
CURATED_DISEASE_GENES_ASSOCIATIONS_URL
)

def get_nodes(self): # noqa:D102
diseases = {
tuple(row)
for row in
self.df[["disease_prefix", "disease_id", "disease_name"]].values
for row in self.gene_df[
["disease_prefix", "disease_id", "disease_name"]
].values
}
for prefix, identifier, name in diseases:
yield Node.standardized(
db_ns=prefix, db_id=identifier, name=name, labels=["BioEntity"]
)
for hgnc_id in self.df["hgnc_id"].unique():
yield Node.standardized(db_ns="HGNC", db_id=hgnc_id,
labels=["BioEntity"])
for hgnc_id in self.gene_df["hgnc_id"].unique():
yield Node.standardized(db_ns="HGNC", db_id=hgnc_id, labels=["BioEntity"])

def get_relations(self): # noqa:D102
columns = [
Expand All @@ -88,46 +89,46 @@ def get_relations(self): # noqa:D102
"NofSnps",
"NofPmids",
]
for hgnc_id, disease_prefix, disease_id, snps, papers in self.df[
for hgnc_id, disease_prefix, disease_id, snps, papers in self.gene_df[
columns
].values:
data = {"snps:int": snps, "source": self.name,
"papers:int": papers}
data = {"snps:int": snps, "source": self.name, "papers:int": papers}
yield Relation(
"HGNC", hgnc_id, disease_prefix, disease_id, self.relation,
data
"HGNC", hgnc_id, disease_prefix, disease_id, self.gene_relation, data
)


class DisgenetVariantProcessor(Processor):
"""Processor for the DisGeNet database."""

name = "disgenet_variant"
df: pd.DataFrame
variant_df: pd.DataFrame
node_types = ["BioEntity"]
relation = "variant_disease_association"

def __init__(self):
"""Initialize the DisGeNet processor."""
self.df = load_disgenet_disease_gene(
CURATED_DISEASE_VARIANT_ASSOCIATIONS_URL, variant=True)
self.variant_df = load_disgenet_disease_gene(
CURATED_DISEASE_VARIANT_ASSOCIATIONS_URL, variant=True
)

def get_nodes(self): # noqa:D102
# The disease-variant association TSV has no column describing the
# disease prefix, so diseaseClass is used in its place
diseases = {
tuple(row)
for row in
self.df[["diseaseClass", "diseaseId", "diseaseName"]].values
for row in self.variant_df[
["diseaseClass", "diseaseId", "diseaseName"]
].values
}
for disease_class, identifier, name in diseases:
yield Node.standardized(
db_ns=disease_class, db_id=identifier, name=name,
labels=["BioEntity"]
db_ns=disease_class, db_id=identifier, name=name, labels=["BioEntity"]
)
for snp_id in self.variant_df["snpId"].unique():
yield Node.standardized(
db_ns="DBSNP", db_id=snp_id, name=snp_id, labels=["BioEntity"]
)
for snp_id in self.df["snpId"].unique():
yield Node.standardized(db_ns="DBSNP", db_id=snp_id,
labels=["BioEntity"])

def get_relations(self): # noqa:D102
# Use diseaseName instead of diseasePrefix as diseasePrefix does not
Expand All @@ -136,49 +137,50 @@ def get_relations(self): # noqa:D102
"snpId",
"DSI",
"DPI",
"diseaseName",
"disease_prefix",
"diseaseId",
"NofPmids",
]

for snp_id, dsi, dpi, disease_name, disease_id, papers in self.df[
for snp_id, dsi, dpi, disease_prefix, disease_id, papers in self.variant_df[
columns
].values:
data = {"dsi:float": dsi,
"dpi:float": dpi,
"source": self.name,
"papers:int": papers}
data = {
"dsi:float": dsi,
"dpi:float": dpi,
"source": self.name,
"papers:int": papers,
}
yield Relation(
"DBSNP", snp_id, disease_name, disease_id, self.relation,
data
"DBSNP", snp_id, disease_prefix, disease_id, self.relation, data
)


def load_disgenet_disease_gene(url, force: bool = False,
variant: bool = False) -> (pd.DataFrame):
def load_disgenet_disease_gene(
url, force: bool = False, variant: bool = False
) -> pd.DataFrame:
"""Load and filter a DisGeNet disease-gene or disease-variant association file."""
df = SUBMODULE.ensure_csv(
url=url,
read_csv_kwargs=dict(dtype={"geneId": str,
"snpId": str}),
read_csv_kwargs=dict(dtype={"geneId": str, "snpId": str}),
force=force,
)
umls_mapper = UmlsMapper()
(
df["disease_prefix"],
df["disease_id"],
df["disease_name"],
) = zip(*df["diseaseId"].map(umls_mapper.standardize))
df = df[df["disease_prefix"].notna()]

# several dpi and dsi fields are NaN
df = df[df["DSI"].notna()]
df = df[df["DPI"].notna()]

if not variant:
df["hgnc_id"] = df["geneId"].map(
lambda s: hgnc_client.get_hgnc_from_entrez(s.strip())
)
df = df[df["hgnc_id"].notna()]
umls_mapper = UmlsMapper()
(
df["disease_prefix"],
df["disease_id"],
df["disease_name"],
) = zip(*df["diseaseId"].map(umls_mapper.standardize))
df = df[df["disease_prefix"].notna()]
return df
elif variant:
# several dpi and dsi fields are NaN
df = df[df["DSI"].notna()]
df = df[df["DPI"].notna()]
return df

return df
4 changes: 2 additions & 2 deletions src/indra_cogex/sources/disgenet/__main__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from . import DisgenetGeneProcessor
from . import DisgenetProcessor

if __name__ == "__main__":
DisgenetGeneProcessor.cli()
DisgenetProcessor.cli()

0 comments on commit 7d65d72

Please sign in to comment.