From 4ac64395005a6d6b9a62cff2c5b00235d2d71c1e Mon Sep 17 00:00:00 2001 From: Kori Kuzma Date: Wed, 18 Dec 2024 10:08:18 -0500 Subject: [PATCH] fix: aggregate moa oncotree concepts during transformation (#413) close #409 * Internal digest is now created using `oncotree_code` or `oncotree_term` * Since there may be duplicate codes or terms with different source text disease names, the first record will be used as the `Disease` label and others will be added to `alternativeLabels` * Also fixed return type annotation in `_get_disease` --- src/metakb/transformers/moa.py | 35 ++++++++++++++++++++++++---------- 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/src/metakb/transformers/moa.py b/src/metakb/transformers/moa.py index 45df5501..35b9019f 100644 --- a/src/metakb/transformers/moa.py +++ b/src/metakb/transformers/moa.py @@ -505,12 +505,17 @@ def _get_therapeutic_agent(self, therapy: dict) -> TherapeuticAgent | None: def _add_disease(self, disease: dict) -> dict | None: """Create or get disease given MOA disease. + First looks in cache for existing disease, if not found will attempt to - normalize. Will generate a digest from the original MOA disease object. This - will be used as the key in the caches. Will add the generated digest to - ``processed_data.conditions`` and ``able_to_normalize['conditions']`` if + normalize. Will generate a digest from the original MOA disease object oncotree + fields. This will be used as the key in the caches. Will add the generated digest + to ``processed_data.conditions`` and ``able_to_normalize['conditions']`` if disease-normalizer is able to normalize. Else will add the generated digest to - ``unable_to_normalize['conditions']`` + ``unable_to_normalize['conditions']``. + + Since there may be duplicate Oncotree code/terms with different names, the first + name will be used as the Disease label. Others will be added to the + alternativeLabels field. 
:param disease: MOA disease object :return: Disease object if disease-normalizer was able to normalize @@ -519,16 +524,26 @@ def _add_disease(self, disease: dict) -> dict | None: return None # Since MOA disease objects do not have an ID, we will create a digest from - # the original MOA disease object - disease_list = sorted([f"{k}:{v}" for k, v in disease.items() if v]) - blob = json.dumps(disease_list, separators=(",", ":"), sort_keys=True).encode( - "ascii" - ) + # the disease's oncotree code (or its oncotree term when there is no code). + # The `name` is as written in the source text. In an upcoming MOA release, these + # will have a leading underscore to differentiate "raw" values + oncotree_code = disease["oncotree_code"] + oncotree_key = "oncotree_code" if oncotree_code else "oncotree_term" + oncotree_value = oncotree_code or disease[oncotree_key] + oncotree_kv = [f"{oncotree_key}:{oncotree_value}"] + blob = json.dumps(oncotree_kv, separators=(",", ":")).encode("ascii") disease_id = sha512t24u(blob) vrs_disease = self.able_to_normalize["conditions"].get(disease_id) if vrs_disease: + source_disease_name = disease["name"] + if source_disease_name != vrs_disease.label: + vrs_disease.alternativeLabels = vrs_disease.alternativeLabels or [] + + if source_disease_name not in vrs_disease.alternativeLabels: + vrs_disease.alternativeLabels.append(source_disease_name) return vrs_disease + vrs_disease = None if disease_id not in self.unable_to_normalize["conditions"]: vrs_disease = self._get_disease(disease) @@ -539,7 +554,7 @@ def _add_disease(self, disease: dict) -> dict | None: self.unable_to_normalize["conditions"].add(disease_id) return vrs_disease - def _get_disease(self, disease: dict) -> dict | None: + def _get_disease(self, disease: dict) -> Disease | None: """Get Disease object for a MOA disease :param disease: MOA disease record