Skip to content

Commit

Permalink
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Final cleanup
Browse files Browse the repository at this point in the history
cthoyt committed Dec 12, 2023
1 parent 6d949b4 commit 03ee98e
Showing 1 changed file with 33 additions and 85 deletions.
118 changes: 33 additions & 85 deletions src/indra_cogex/sources/disgenet/__init__.py
Original file line number Diff line number Diff line change
@@ -33,30 +33,6 @@
f"{DOWNLOAD_BASE}/variant_to_gene_mappings.tsv.gz"
)

def _split_sources(s):
    """Turn a ``;``-separated string into a set of its parts.

    Falsy values (``""``, ``None``) and missing values (NaN) are handed back
    unchanged instead of being split.
    """
    if s and pd.notna(s):
        return set(s.split(";"))
    return s


# Column name -> callable paired with that column.
# NOTE(review): presumably used to coerce raw column values; no consumer of
# this mapping is visible here - confirm before relying on it.
TARGET_KEYS = {
    "NofSnps": int,
    "DSI": str,
    "DPI": str,
    "diseaseType": str,
    "diseaseSemanticType": str,
    "score": str,
    "EI": str,
    "YearInitial": int,
    "YearFinal": int,
    "NofPmids": str,
    "source": _split_sources,
    "GD_dis1GeneIdSet": int,
    "GD_dis2GeneIdSet": int,
    "GD_commonGeneIdSet": int,
    "GD_jaccard": float,
    "GD_pvalue": float,
    "VD_variant1GeneIdSet": int,
    "VD_variant2GeneIdSet": int,
    "VD_commonVariantIdSet": int,
    "VD_jaccard": float,
    "VD_pvalue": float,
}


class DisgenetProcessor(Processor):
"""Processor for the DisGeNet database."""
@@ -99,31 +75,29 @@ def get_nodes(self): # noqa:D102
)

def get_relations(self):  # noqa:D102
    """Yield all relations produced by this processor.

    Relations are emitted in three consecutive groups, each delegated to a
    module-level helper: gene-disease (from ``self.gene_df``),
    variant-disease (from ``self.variant_df``) and variant-gene (from
    ``self.variant_gene_df``).
    """
    # Gene-disease relations come from the shared helper only. Looping over
    # ``self.gene_df`` inline as well would emit each of them a second time,
    # because the helper performs exactly the same iteration.
    yield from _yield_gene_relations(self.gene_df, self.name, self.gene_relation)
    yield from _yield_variant_relations(
        self.variant_df, self.name, self.variant_relation
    )
    yield from _yield_variant_gene_relations(
        self.variant_gene_df, self.name, self.variant_gene_relation
    )


def _yield_variant_relations(variants_df, name, relation):
def _yield_gene_relations(df, name, relation):
    """Yield one gene-disease ``Relation`` per row of ``df``.

    :param df: Data frame providing the ``hgnc_id``, ``disease_prefix``,
        ``disease_id``, ``NofSnps`` and ``NofPmids`` columns.
    :param name: Value stored under the ``source`` key of every relation's data.
    :param relation: Relation type passed through to every ``Relation``.
    :returns: A generator of ``Relation`` objects whose source node is in the
        ``HGNC`` namespace and whose target is the row's disease
        prefix/identifier pair.
    """
    columns = [
        "hgnc_id",
        "disease_prefix",
        "disease_id",
        "NofSnps",
        "NofPmids",
    ]
    # Select the columns once so each row unpacks positionally in a known order.
    for hgnc_id, disease_prefix, disease_id, snps, papers in df[columns].values:
        data = {"snps:int": snps, "source": name, "papers:int": papers}
        yield Relation("HGNC", hgnc_id, disease_prefix, disease_id, relation, data)


def _yield_variant_relations(df, name, relation):
columns = [
"snpId",
"DSI",
@@ -132,9 +106,7 @@ def _yield_variant_relations(variants_df, name, relation):
"disease_id",
"NofPmids",
]
for snp_id, dsi, dpi, disease_prefix, disease_id, papers in variants_df[
columns
].values:
for snp_id, dsi, dpi, disease_prefix, disease_id, papers in df[columns].values:
data = {
"source": name,
"papers:int": papers,
@@ -146,9 +118,9 @@ def _yield_variant_relations(variants_df, name, relation):
yield Relation("DBSNP", snp_id, disease_prefix, disease_id, relation, data)


def _yield_variant_gene_relations(df, name, relation):
    """Yield one variant-gene ``Relation`` per row of ``df``.

    :param df: Data frame providing the ``snpId`` and ``hgnc_id`` columns.
    :param name: Value stored under the ``source`` key of every relation's data.
    :param relation: Relation type passed through to every ``Relation``.
    :returns: A generator of ``Relation`` objects going from a ``DBSNP`` node
        to an ``HGNC`` node.
    """
    columns = ["snpId", "hgnc_id"]
    for snp_id, hgnc_id in df[columns].values:
        # A fresh dict per relation so no two relations share mutable data.
        data = {"source": name}
        yield Relation("DBSNP", snp_id, "HGNC", hgnc_id, relation, data)

@@ -163,40 +135,11 @@ def load_disgenet_disease_gene(
read_csv_kwargs=dict(dtype={"geneId": str}),
force=force,
)

mapper_path = SUBMODULE.join(name="umls_mapper.pkl")
if mapper_path.is_file():
click.echo("loading UMLS mapper")
umls_mapper = pickle.loads(mapper_path.read_bytes())
click.echo("done loading UMLS mapper")
else:
click.echo("loading UMLS mapper")
umls_mapper = UmlsMapper()
click.echo("writing UMLS mapper")
mapper_path.write_bytes(
pickle.dumps(umls_mapper, protocol=pickle.HIGHEST_PROTOCOL)
)
click.echo("done writing UMLS mapper")

click.echo("mapping UMLS")

(
df["disease_prefix"],
df["disease_id"],
df["disease_name"],
) = zip(*df["diseaseId"].map(umls_mapper.standardize))
click.echo("done mapping UMLS")

_map_disease(df)
# Filter out ungroundable
df = df[df["disease_prefix"].notna()]

click.echo("mapping HGNC")
df["hgnc_id"] = df["geneId"].map(
lambda s: hgnc_client.get_hgnc_from_entrez(s.strip())
)
click.echo("done mapping HGNC")
_map_entrez(df)
df = df[df["hgnc_id"].notna()]

return df


@@ -209,6 +152,13 @@ def load_disgenet_disease_variant(
read_csv_kwargs=dict(dtype={"snpId": str}),
force=force,
)
_map_disease(df)
# Filter out ungroundable
df = df[df["disease_prefix"].notna()]
return df


def _map_disease(df):
mapper_path = SUBMODULE.join(name="umls_mapper.pkl")
if mapper_path.is_file():
click.echo("loading UMLS mapper")
@@ -224,18 +174,14 @@ def load_disgenet_disease_variant(
click.echo("done writing UMLS mapper")

click.echo("mapping UMLS")

(
df["disease_prefix"],
df["disease_id"],
df["disease_name"],
) = zip(*df["diseaseId"].map(umls_mapper.standardize))
click.echo("done mapping UMLS")

# Filter out ungroundable
df = df[df["disease_prefix"].notna()]

return df


def load_disgenet_variant_gene(
url,
@@ -246,12 +192,14 @@ def load_disgenet_variant_gene(
read_csv_kwargs=dict(dtype={"geneId": str, "snpId": str}),
force=force,
)
_map_entrez(df)
df = df[df["hgnc_id"].notna()]
return df


def _map_entrez(df):
    """Add an ``hgnc_id`` column to ``df`` derived from its ``geneId`` column.

    The column is assigned on ``df`` itself, so the caller's frame is
    modified in place. Progress messages are echoed before and after the
    mapping.

    :param df: Data frame with a ``geneId`` column of string identifiers.
    :returns: The rows of ``df`` whose ``hgnc_id`` is not missing.
    """
    click.echo("mapping HGNC")
    # ``na_action="ignore"`` leaves missing gene identifiers as NaN instead of
    # passing them to the lambda, where ``.strip()`` would raise on a float.
    # NOTE(review): if ``df`` is itself a filtered slice of another frame,
    # this assignment may trigger pandas' SettingWithCopyWarning - confirm
    # against the callers.
    df["hgnc_id"] = df["geneId"].map(
        lambda s: hgnc_client.get_hgnc_from_entrez(s.strip()),
        na_action="ignore",
    )
    click.echo("done mapping HGNC")
    # Rows that could not be mapped to HGNC are dropped from the returned frame.
    return df[df["hgnc_id"].notna()]

0 comments on commit 03ee98e

Please sign in to comment.