update

umccr · Sep 19, 2024 · ef3984b · ef3984b
1 parent 9d8a901
commit ef3984b
Show file tree

Hide file tree

Showing 5 changed files with 32 additions and 14 deletions.
diff --git a/lib/workload/stateless/stacks/metadata-manager/README.md b/lib/workload/stateless/stacks/metadata-manager/README.md
@@ -52,11 +52,11 @@ on the model of the record.
 
 | Model      | Prefix |
 |------------|--------|
-| Subject    | `sbj.` | 
-| Sample     | `smp.` | 
-| Library    | `lib.` | 
+| Subject    | `sbj.` |
+| Sample     | `smp.` |
+| Library    | `lib.` |
 | Individual | `idv.` |
-| Contact    | `ctc.` | 
+| Contact    | `ctc.` |
 | Project    | `prj.` |
 
 ## How things work
@@ -88,11 +88,11 @@ Some important notes of the sync:
 
 1. The sync will only run from the current year.
 2. The tracking sheet is the single source of truth for the current year. Any deletion or update to existing records
-   will be applied based on their internal IDs (`library_id`, `specimen_id`, and `subject_id`). For the library
+   will be applied based on their internal IDs (e.g. `library_id`, `subject_id`, etc.). For the library
    model, the deletion will only occur based on the current year's prefix. For example, syncing the 2024 tracking
-   sheet will only query libraries with `library_id` starting with `L24` to determine whether to delete it.
-3. `LibraryId` is treated as a unique value in the tracking sheet, so for any duplicated value (including from other
-   tabs) it will only recognize the last appearance.
+   sheet will only query libraries with `library_id` starting with `L24` to determine whether to delete it.
+3. `LibraryId` is treated as a unique value in the tracking sheet, so for any duplicated value the sync will only
+   recognize the last appearance.
 4. In cases where multiple records share the same unique identifier (such as SampleId), only the data from the most
    recent record is stored. For instance, if a SampleId appears twice with differing source values, only the values from
    the latter record will be retained.
@@ -122,7 +122,7 @@ python3 --version
 Python 3.12.2
 ```
 
-You would need to go to thisps microservice app directory from the root project
+You would need to go to this microservice app directory from the root project
 
 ```bash
 cd lib/workload/stateless/stacks/metadata-manager

diff --git a/lib/workload/stateless/stacks/metadata-manager/app/management/commands/__init__.py b/lib/workload/stateless/stacks/metadata-manager/app/management/commands/__init__.py
@@ -0,0 +1,3 @@
+import os
+os.environ['SSM_NAME_GDRIVE_ACCOUNT'] = "/umccr/google/drive/lims_service_account_json"
+os.environ["SSM_NAME_TRACKING_SHEET_ID"] = "/umccr/google/drive/tracking_sheet_id"
diff --git a/lib/workload/stateless/stacks/metadata-manager/app/models/project.py b/lib/workload/stateless/stacks/metadata-manager/app/models/project.py
@@ -29,7 +29,7 @@ class Project(BaseModel):
     )
 
     # Relationships
-    contact_set = models.ManyToManyField(Contact, related_name='project_set', blank=True, null=True)
+    contact_set = models.ManyToManyField(Contact, related_name='project_set', blank=True, )
 
     # history
     history = HistoricalRecords(m2m_fields=[contact_set])
diff --git a/lib/workload/stateless/stacks/metadata-manager/app/models/subject.py b/lib/workload/stateless/stacks/metadata-manager/app/models/subject.py
@@ -18,7 +18,7 @@ class Subject(BaseModel):
     )
 
     # Relationships
-    individual_set = models.ManyToManyField('Individual', related_name='subject_set', blank=True, null=True)
+    individual_set = models.ManyToManyField('Individual', related_name='subject_set', blank=True, )
 
     # history
     history = HistoricalRecords(m2m_fields=[individual_set])
diff --git a/lib/workload/stateless/stacks/metadata-manager/proc/service/tracking_sheet_srv.py b/lib/workload/stateless/stacks/metadata-manager/proc/service/tracking_sheet_srv.py
@@ -38,6 +38,7 @@ def persist_lab_metadata(df: pd.DataFrame, sheet_year: str):
     logger.info(f"Start processing LabMetadata")
 
     # Used for statistics
+    invalid_data = []
     stats = {
         "library": {
             "create_count": 0,
@@ -149,17 +150,21 @@ def persist_lab_metadata(df: pd.DataFrame, sheet_year: str):
             if is_smp_created:
                 stats['sample']['create_count'] += 1
             if is_smp_updated:
-                stats['sample']['create_update'] += 1
+                stats['sample']['update_count'] += 1
 
             # ------------------------------
             # Contact
             # ------------------------------
-            contact, _is_ctc_created, _is_ctc_updated = Contact.objects.update_or_create_if_needed(
+            contact, is_ctc_created, is_ctc_updated = Contact.objects.update_or_create_if_needed(
                 search_key={"contact_id": record.get('project_owner')},
                 data={
                     "contact_id": record.get('project_owner'),
                 }
             )
+            if is_ctc_created:
+                stats['contact']['create_count'] += 1
+            if is_ctc_updated:
+                stats['contact']['update_count'] += 1
 
             # ------------------------------
             # Project: Upsert project with contact as part of the project
@@ -170,6 +175,11 @@ def persist_lab_metadata(df: pd.DataFrame, sheet_year: str):
                     "project_id": record.get('project_name'),
                 }
             )
+            if is_prj_created:
+                stats['project']['create_count'] += 1
+            if is_prj_updated:
+                stats['project']['update_count'] += 1
+
             # link project to its contact
             try:
                 project.contact_set.get(orcabus_id=contact.orcabus_id)
@@ -218,14 +228,19 @@ def persist_lab_metadata(df: pd.DataFrame, sheet_year: str):
 
         except Exception as e:
             if any(record.values()):
-                logger.warning(f"Invalid record ({e}): {json.dumps(record, indent=2)}")
                 stats['invalid_record_count'] += 1
+                invalid_data.append({
+                    "reason": e,
+                    "data": record
+                })
             continue
 
     # clean up history for django-simple-history model if any
     # Only clean for the past 15 minutes as this is the maximum lambda cutoff
     clean_model_history(minutes=15)
 
+    logger.warning(f"Invalid record: {invalid_data}")
+    logger.info(f"Processed LabMetadata: {json.dumps(stats)}")
     return stats