Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: aggregate moa oncotree concepts during transformation #413

Merged
merged 3 commits into from
Dec 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 9 additions & 19 deletions src/metakb/harvesters/moa.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,24 +151,27 @@ def _harvest_assertion(self, assertion: dict, variants_list: list[dict]) -> dict
assertion_record = {
"id": assertion["assertion_id"],
"context": assertion["context"],
"deprecated": assertion["deprecated"],
"description": assertion["description"],
"disease": {
"name": assertion["disease"],
"oncotree_code": assertion["oncotree_code"],
"oncotree_term": assertion["oncotree_term"],
},
"therapy_name": assertion["therapy_name"],
"therapy_type": assertion["therapy_type"],
"clinical_significance": self._get_therapy(
assertion["therapy_resistance"], assertion["therapy_sensitivity"]
),
"therapy": {
"name": assertion["therapy_name"],
"type": assertion["therapy_type"],
"strategy": assertion["therapy_strategy"],
"resistance": assertion["therapy_resistance"],
"sensitivity": assertion["therapy_sensitivity"],
},
"predictive_implication": assertion["predictive_implication"],
"favorable_prognosis": assertion["favorable_prognosis"],
"created_on": assertion["created_on"],
"last_updated": assertion["last_updated"],
"submitted_by": assertion["submitted_by"],
"validated": assertion["validated"],
"source_ids": assertion["sources"][0]["source_id"],
"source_id": assertion["sources"][0]["source_id"],
}

for v in variants_list:
Expand All @@ -177,19 +180,6 @@ def _harvest_assertion(self, assertion: dict, variants_list: list[dict]) -> dict

return assertion_record

def _get_therapy(self, resistance: bool, sensitivity: bool) -> str | None:
    """Translate MOA therapy response flags into a response label.

    Resistance is checked first, so it wins when both flags are truthy.

    :param resistance: truthy if the assertion reports therapy resistance
    :param sensitivity: truthy if the assertion reports therapy sensitivity
    :return: ``"resistance"``, ``"sensitivity"``, or ``None`` when neither flag is set
    """
    # Ordered (flag, label) pairs; the first truthy flag decides the label
    flagged_labels = ((resistance, "resistance"), (sensitivity, "sensitivity"))
    for is_set, label in flagged_labels:
        if is_set:
            return label
    return None

def _get_feature(self, v: dict) -> dict:
"""Get feature name from the harvested variants

Expand Down
267 changes: 149 additions & 118 deletions src/metakb/transformers/moa.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,11 @@
from ga4gh.cat_vrs.core_models import CategoricalVariant, DefiningContextConstraint
from ga4gh.core import sha512t24u
from ga4gh.core.domain_models import (
CombinationTherapy,
Disease,
Gene,
TherapeuticAgent,
TherapeuticSubstituteGroup,
)
from ga4gh.core.entity_models import (
Coding,
Expand All @@ -21,7 +23,9 @@
)
from ga4gh.va_spec.profiles.var_study_stmt import (
AlleleOriginQualifier,
PrognosticPredicate,
TherapeuticResponsePredicate,
VariantPrognosticStudyStatement,
VariantTherapeuticResponseStudyStatement,
)
from ga4gh.vrs.models import Variation
Expand Down Expand Up @@ -85,133 +89,97 @@ async def transform(self, harvested_data: MoaHarvestedData) -> None:
self._add_documents(harvested_data.sources)

# Add variant study statement data (therapeutic response or prognostic). Will update `statements`
await self._add_variant_tr_study_stmts(harvested_data.assertions)
for assertion in harvested_data.assertions:
await self._add_variant_study_stmt(assertion)

async def _add_variant_tr_study_stmts(self, assertions: list[dict]) -> None:
"""Create Variant Therapeutic Response Study Statements from MOA assertions.
async def _add_variant_study_stmt(self, assertion: dict) -> None:
"""Create Variant Study Statements from MOA assertions.
Will add associated values to ``processed_data`` instance variable
(``therapeutic_procedures``, ``conditions``, and ``statements``).
``able_to_normalize`` and ``unable_to_normalize`` will
also be mutated for associated therapeutic_procedures and conditions.

:param assertions: A list of MOA assertion records
:param assertion: MOA assertion record
"""
for record in assertions:
assertion_id = f"moa.assertion:{record['id']}"
variant_id = record["variant"]["id"]
assertion_id = f"moa.assertion:{assertion['id']}"
variant_id = assertion["variant"]["id"]

# Check cache for variation record (which contains gene information)
variation_gene_map = self.able_to_normalize["variations"].get(variant_id)
if not variation_gene_map:
logger.debug(
"%s has no variation for variant_id %s", assertion_id, variant_id
)
continue

# Get predicate. We only support therapeutic resistance/sensitivity
if record["clinical_significance"] == "resistance":
predicate = TherapeuticResponsePredicate.RESISTANCE
elif record["clinical_significance"] == "sensitivity":
predicate = TherapeuticResponsePredicate.SENSITIVITY
else:
logger.debug(
"clinical_significance not supported: %s",
record["clinical_significance"],
)
continue
# Check cache for variation record (which contains gene information)
variation_gene_map = self.able_to_normalize["variations"].get(variant_id)
if not variation_gene_map:
logger.debug(
"%s has no variation for variant_id %s", assertion_id, variant_id
)
return

# Get strength
predictive_implication = (
assertion["predictive_implication"]
.strip()
.replace(" ", "_")
.replace("-", "_")
.upper()
)
moa_evidence_level = MoaEvidenceLevel[predictive_implication]
strength = self.evidence_level_to_vicc_concept_mapping[moa_evidence_level]

# Get strength
predictive_implication = (
record["predictive_implication"]
.strip()
.replace(" ", "_")
.replace("-", "_")
.upper()
# Add disease
moa_disease = self._add_disease(assertion["disease"])
if not moa_disease:
logger.debug(
"%s has no disease for disease %s", assertion_id, assertion["disease"]
)
moa_evidence_level = MoaEvidenceLevel[predictive_implication]
strength = self.evidence_level_to_vicc_concept_mapping[moa_evidence_level]
return

# Add therapeutic agent. We only support one therapy, so we will skip others
therapy_name = record["therapy_name"]
if not therapy_name:
logger.debug("%s has no therapy_name", assertion_id)
continue
# Add document
document = self.able_to_normalize["documents"].get(assertion["source_id"])

therapy_interaction_type = record["therapy_type"]

if "+" in therapy_name:
# Indicates multiple therapies
if therapy_interaction_type.upper() in {
"COMBINATION THERAPY",
"IMMUNOTHERAPY",
"RADIATION THERAPY",
"TARGETED THERAPY",
}:
therapeutic_procedure_type = (
TherapeuticProcedureType.COMBINATION_THERAPY
)
else:
# skipping HORMONE and CHEMOTHERAPY for now
continue
feature_type = assertion["variant"]["feature_type"]
if feature_type == "somatic_variant":
allele_origin_qualifier = AlleleOriginQualifier.SOMATIC
elif feature_type == "germline_variant":
allele_origin_qualifier = AlleleOriginQualifier.GERMLINE
else:
allele_origin_qualifier = None

params = {
"id": assertion_id,
"description": assertion["description"],
"strength": strength,
"subjectVariant": variation_gene_map["cv"],
"alleleOriginQualifier": allele_origin_qualifier,
"geneContextQualifier": variation_gene_map["moa_gene"],
"specifiedBy": self.processed_data.methods[0],
"reportedIn": [document],
}

therapies = [{"label": tn.strip()} for tn in therapy_name.split("+")]
therapeutic_digest = self._get_digest_for_str_lists(
[f"moa.therapy:{tn}" for tn in therapies]
)
therapeutic_procedure_id = f"moa.ctid:{therapeutic_digest}"
else:
therapeutic_procedure_id = f"moa.therapy:{therapy_name}"
therapies = [{"label": therapy_name}]
therapeutic_procedure_type = TherapeuticProcedureType.THERAPEUTIC_AGENT

moa_therapeutic = self._add_therapeutic_procedure(
therapeutic_procedure_id,
therapies,
therapeutic_procedure_type,
therapy_interaction_type,
if assertion["favorable_prognosis"] == "":
params["conditionQualifier"] = moa_disease
params["predicate"] = (
TherapeuticResponsePredicate.RESISTANCE
if assertion["therapy"]["resistance"]
else TherapeuticResponsePredicate.SENSITIVITY
)
params["objectTherapeutic"] = self._get_therapeutic_procedure(assertion)

if not moa_therapeutic:
if not params["objectTherapeutic"]:
logger.debug(
"%s has no therapeutic agent for therapy_name %s",
"%s has no therapeutic procedure for therapy_name %s",
assertion_id,
therapy_name,
)
continue

# Add disease
moa_disease = self._add_disease(record["disease"])
if not moa_disease:
logger.debug(
"%s has no disease for disease %s", assertion_id, record["disease"]
assertion["therapy"]["name"],
)
continue

# Add document
document = self.able_to_normalize["documents"].get(record["source_ids"])

feature_type = record["variant"]["feature_type"]
if feature_type == "somatic_variant":
allele_origin_qualifier = AlleleOriginQualifier.SOMATIC
elif feature_type == "germline_variant":
allele_origin_qualifier = AlleleOriginQualifier.GERMLINE
else:
allele_origin_qualifier = None

statement = VariantTherapeuticResponseStudyStatement(
id=assertion_id,
description=record["description"],
strength=strength,
predicate=predicate,
subjectVariant=variation_gene_map["cv"],
objectTherapeutic=moa_therapeutic,
conditionQualifier=moa_disease,
alleleOriginQualifier=allele_origin_qualifier,
geneContextQualifier=variation_gene_map["moa_gene"],
specifiedBy=self.processed_data.methods[0],
reportedIn=[document],
return
statement = VariantTherapeuticResponseStudyStatement(**params)
else:
params["objectCondition"] = moa_disease
params["predicate"] = (
PrognosticPredicate.BETTER_OUTCOME
if assertion["favorable_prognosis"]
else PrognosticPredicate.WORSE_OUTCOME
)
self.processed_data.statements.append(statement)
statement = VariantPrognosticStudyStatement(**params)

self.processed_data.statements.append(statement)

async def _add_categorical_variants(self, variants: list[dict]) -> None:
"""Create Categorical Variant objects for all MOA variant records.
Expand Down Expand Up @@ -437,6 +405,54 @@ def _add_documents(self, sources: list) -> None:
self.able_to_normalize["documents"][source_id] = document
self.processed_data.documents.append(document)

def _get_therapeutic_procedure(
    self, assertion: dict
) -> TherapeuticAgent | TherapeuticSubstituteGroup | CombinationTherapy | None:
    """Get the therapeutic procedure object for a MOA assertion.

    A ``+`` in the therapy name is treated as a combination of several therapies;
    otherwise the therapy is handled as a single therapeutic agent. The generated
    ID, therapy list, procedure type and MOA therapy type are handed to
    ``_add_therapeutic_procedure``, whose result is returned unchanged.

    :param assertion: MOA assertion record. Must provide ``id`` and a ``therapy``
        object with ``name`` and ``type``
    :return: Therapeutic procedure object, if found and able to be normalized.
        ``None`` if the assertion has no therapy name, or if it is a combination
        whose therapy type is missing or not supported
    """
    therapy = assertion["therapy"]
    therapy_name = therapy["name"]
    if not therapy_name:
        logger.debug("%s has no therapy_name", assertion["id"])
        return None

    therapy_interaction_type = therapy["type"]

    if "+" not in therapy_name:
        # Single therapy: the ID is derived directly from the therapy name
        return self._add_therapeutic_procedure(
            f"moa.therapy:{therapy_name}",
            [{"label": therapy_name}],
            TherapeuticProcedureType.THERAPEUTIC_AGENT,
            therapy_interaction_type,
        )

    # "+" indicates multiple therapies. A missing therapy type is treated like an
    # unsupported one (skipped) instead of raising AttributeError on ``.upper()``
    if (therapy_interaction_type or "").upper() not in {
        "COMBINATION THERAPY",
        "IMMUNOTHERAPY",
        "RADIATION THERAPY",
        "TARGETED THERAPY",
    }:
        # skipping HORMONE and CHEMOTHERAPY for now
        return None

    therapies = [{"label": tn.strip()} for tn in therapy_name.split("+")]
    # NOTE(review): ``tn`` here is the ``{"label": ...}`` dict, so the digest is
    # computed from the dict's string form rather than the bare label. Kept as is
    # because changing it would change every generated ``moa.ctid`` ID -- confirm
    # whether that is intended.
    therapeutic_digest = self._get_digest_for_str_lists(
        [f"moa.therapy:{tn}" for tn in therapies]
    )

    return self._add_therapeutic_procedure(
        f"moa.ctid:{therapeutic_digest}",
        therapies,
        TherapeuticProcedureType.COMBINATION_THERAPY,
        therapy_interaction_type,
    )

def _get_therapeutic_substitute_group(
self,
therapeutic_sub_group_id: str,
Expand Down Expand Up @@ -489,12 +505,17 @@ def _get_therapeutic_agent(self, therapy: dict) -> TherapeuticAgent | None:

def _add_disease(self, disease: dict) -> dict | None:
"""Create or get disease given MOA disease.

First looks in cache for existing disease, if not found will attempt to
normalize. Will generate a digest from the original MOA disease object. This
will be used as the key in the caches. Will add the generated digest to
``processed_data.conditions`` and ``able_to_normalize['conditions']`` if
normalize. Will generate a digest from the original MOA disease object oncotree
fields. This will be used as the key in the caches. Will add the generated digest
to ``processed_data.conditions`` and ``able_to_normalize['conditions']`` if
disease-normalizer is able to normalize. Else will add the generated digest to
``unable_to_normalize['conditions']``
``unable_to_normalize['conditions']``.

Since there may be duplicate Oncotree code/terms with different names, the first
name will be used as the Disease label. Others will be added to the
alternativeLabels field.

:param disease: MOA disease object
:return: Disease object if disease-normalizer was able to normalize
Expand All @@ -503,16 +524,26 @@ def _add_disease(self, disease: dict) -> dict | None:
return None

# Since MOA disease objects do not have an ID, we will create a digest from
# the original MOA disease object
disease_list = sorted([f"{k}:{v}" for k, v in disease.items() if v])
blob = json.dumps(disease_list, separators=(",", ":"), sort_keys=True).encode(
"ascii"
)
# the MOA disease object's Oncotree code, falling back to its Oncotree term.
# The `name` is as written in the source text. In an upcoming MOA release, these
# will have leading underscore to differentiate "raw" values
oncotree_code = disease["oncotree_code"]
oncotree_key = "oncotree_code" if oncotree_code else "oncotree_term"
oncotree_value = oncotree_code or disease[oncotree_key]
oncotree_kv = [f"{oncotree_key}:{oncotree_value}"]
blob = json.dumps(oncotree_kv, separators=(",", ":")).encode("ascii")
disease_id = sha512t24u(blob)

vrs_disease = self.able_to_normalize["conditions"].get(disease_id)
if vrs_disease:
source_disease_name = disease["name"]
if source_disease_name != vrs_disease.label:
vrs_disease.alternativeLabels = vrs_disease.alternativeLabels or []

if source_disease_name not in vrs_disease.alternativeLabels:
vrs_disease.alternativeLabels.append(source_disease_name)
return vrs_disease

vrs_disease = None
if disease_id not in self.unable_to_normalize["conditions"]:
vrs_disease = self._get_disease(disease)
Expand All @@ -523,7 +554,7 @@ def _add_disease(self, disease: dict) -> dict | None:
self.unable_to_normalize["conditions"].add(disease_id)
return vrs_disease

def _get_disease(self, disease: dict) -> dict | None:
def _get_disease(self, disease: dict) -> Disease | None:
"""Get Disease object for a MOA disease

:param disease: MOA disease record
Expand Down
Loading
Loading