From 4ac64395005a6d6b9a62cff2c5b00235d2d71c1e Mon Sep 17 00:00:00 2001
From: Kori Kuzma <korikuzma@gmail.com>
Date: Wed, 18 Dec 2024 10:08:18 -0500
Subject: [PATCH] fix: aggregate moa oncotree concepts during transformation
 (#413)

Closes #409

* Internal digest is now created from `oncotree_code`, falling back to
`oncotree_term` when the code is empty
* Since multiple records may share an Oncotree code or term but differ in
their source text disease names, the name from the first record is used as
the `Disease` label and any other names are added to `alternativeLabels`
* Also fixed the return type annotation of `_get_disease`
(`dict | None` -> `Disease | None`)
---
 src/metakb/transformers/moa.py | 35 ++++++++++++++++++++++++----------
 1 file changed, 25 insertions(+), 10 deletions(-)

diff --git a/src/metakb/transformers/moa.py b/src/metakb/transformers/moa.py
index 45df5501..35b9019f 100644
--- a/src/metakb/transformers/moa.py
+++ b/src/metakb/transformers/moa.py
@@ -505,12 +505,17 @@ def _get_therapeutic_agent(self, therapy: dict) -> TherapeuticAgent | None:
 
     def _add_disease(self, disease: dict) -> dict | None:
         """Create or get disease given MOA disease.
+
         First looks in cache for existing disease, if not found will attempt to
-        normalize. Will generate a digest from the original MOA disease object. This
-        will be used as the key in the caches. Will add the generated digest to
-        ``processed_data.conditions`` and ``able_to_normalize['conditions']`` if
+        normalize. Will generate a digest from the MOA disease object's Oncotree
+        fields. This will be used as the key in the caches. Will add the generated digest
+        to ``processed_data.conditions`` and ``able_to_normalize['conditions']`` if
         disease-normalizer is able to normalize. Else will add the generated digest to
-        ``unable_to_normalize['conditions']``
+        ``unable_to_normalize['conditions']``.
+
+        Since there may be duplicate Oncotree code/terms with different names, the first
+        name will be used as the Disease label. Others will be added to the
+        alternativeLabels field.
 
         :param disease: MOA disease object
         :return: Disease object if disease-normalizer was able to normalize
@@ -519,16 +524,26 @@ def _add_disease(self, disease: dict) -> dict | None:
             return None
 
         # Since MOA disease objects do not have an ID, we will create a digest from
-        # the original MOA disease object
-        disease_list = sorted([f"{k}:{v}" for k, v in disease.items() if v])
-        blob = json.dumps(disease_list, separators=(",", ":"), sort_keys=True).encode(
-            "ascii"
-        )
+        # the original MOA disease object.
+        # The `name` is as written in the source text. In an upcoming MOA release, these
+        # will have a leading underscore to differentiate "raw" values
+        oncotree_code = disease["oncotree_code"]
+        oncotree_key = "oncotree_code" if oncotree_code else "oncotree_term"
+        oncotree_value = oncotree_code or disease[oncotree_key]
+        oncotree_kv = [f"{oncotree_key}:{oncotree_value}"]
+        blob = json.dumps(oncotree_kv, separators=(",", ":")).encode("ascii")
         disease_id = sha512t24u(blob)
 
         vrs_disease = self.able_to_normalize["conditions"].get(disease_id)
         if vrs_disease:
+            source_disease_name = disease["name"]
+            if source_disease_name != vrs_disease.label:
+                vrs_disease.alternativeLabels = vrs_disease.alternativeLabels or []
+
+                if source_disease_name not in vrs_disease.alternativeLabels:
+                    vrs_disease.alternativeLabels.append(source_disease_name)
             return vrs_disease
+
         vrs_disease = None
         if disease_id not in self.unable_to_normalize["conditions"]:
             vrs_disease = self._get_disease(disease)
@@ -539,7 +554,7 @@ def _add_disease(self, disease: dict) -> dict | None:
                 self.unable_to_normalize["conditions"].add(disease_id)
         return vrs_disease
 
-    def _get_disease(self, disease: dict) -> dict | None:
+    def _get_disease(self, disease: dict) -> Disease | None:
         """Get Disease object for a MOA disease
 
         :param disease: MOA disease record