Skip to content

Commit

Permalink
update
Browse files Browse the repository at this point in the history
  • Loading branch information
williamputraintan committed Sep 19, 2024
1 parent 9d8a901 commit ef3984b
Show file tree
Hide file tree
Showing 5 changed files with 32 additions and 14 deletions.
18 changes: 9 additions & 9 deletions lib/workload/stateless/stacks/metadata-manager/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -52,11 +52,11 @@ on the model of the record.

| Model | Prefix |
|------------|--------|
| Subject | `sbj.` |
| Sample | `smp.` |
| Library | `lib.` |
| Subject | `sbj.` |
| Sample | `smp.` |
| Library | `lib.` |
| Individual | `idv.` |
| Contact | `ctc.` |
| Contact | `ctc.` |
| Project | `prj.` |

## How things work
Expand Down Expand Up @@ -88,11 +88,11 @@ Some important notes of the sync:

1. The sync will only run from the current year.
2. The tracking sheet is the single source of truth for the current year. Any deletion or update to existing records
will be applied based on their internal IDs (`library_id`, `specimen_id`, and `subject_id`). For the library
will be applied based on their internal IDs (e.g. `library_id`, `subject_id`, etc.). For the library
model, the deletion will only occur based on the current year's prefix. For example, syncing the 2024 tracking
sheet will only query libraries with `library_id` starting with `L24` to determine whether to delete it.
3. `LibraryId` is treated as a unique value in the tracking sheet, so for any duplicated value (including from other
tabs) it will only recognize the last appearance.
sheet will only query libraries with `library_id` starting with `L24` to determine whether to delete it.
3. `LibraryId` is treated as a unique value in the tracking sheet, so for any duplicated value it will only recognize
the last appearance.
4. In cases where multiple records share the same unique identifier (such as SampleId), only the data from the most
recent record is stored. For instance, if a SampleId appears twice with differing source values, only the values from
the latter record will be retained.
Expand Down Expand Up @@ -122,7 +122,7 @@ python3 --version
Python 3.12.2
```

You would need to go to thisps microservice app directory from the root project
You would need to go to this microservice app directory from the root project

```bash
cd lib/workload/stateless/stacks/metadata-manager
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
import os
# Set the SSM_NAME_GDRIVE_ACCOUNT and SSM_NAME_TRACKING_SHEET_ID environment
# variables in one call.
# NOTE(review): the values look like SSM parameter paths -- confirm against
# the code that reads these variables.
os.environ.update(
    {
        "SSM_NAME_GDRIVE_ACCOUNT": "/umccr/google/drive/lims_service_account_json",
        "SSM_NAME_TRACKING_SHEET_ID": "/umccr/google/drive/tracking_sheet_id",
    }
)
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ class Project(BaseModel):
)

# Relationships
contact_set = models.ManyToManyField(Contact, related_name='project_set', blank=True, null=True)
contact_set = models.ManyToManyField(Contact, related_name='project_set', blank=True, )

# history
history = HistoricalRecords(m2m_fields=[contact_set])
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ class Subject(BaseModel):
)

# Relationships
individual_set = models.ManyToManyField('Individual', related_name='subject_set', blank=True, null=True)
individual_set = models.ManyToManyField('Individual', related_name='subject_set', blank=True, )

# history
history = HistoricalRecords(m2m_fields=[individual_set])
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ def persist_lab_metadata(df: pd.DataFrame, sheet_year: str):
logger.info(f"Start processing LabMetadata")

# Used for statistics
invalid_data = []
stats = {
"library": {
"create_count": 0,
Expand Down Expand Up @@ -149,17 +150,21 @@ def persist_lab_metadata(df: pd.DataFrame, sheet_year: str):
if is_smp_created:
stats['sample']['create_count'] += 1
if is_smp_updated:
stats['sample']['create_update'] += 1
stats['sample']['update_count'] += 1

# ------------------------------
# Contact
# ------------------------------
contact, _is_ctc_created, _is_ctc_updated = Contact.objects.update_or_create_if_needed(
contact, is_ctc_created, is_ctc_updated = Contact.objects.update_or_create_if_needed(
search_key={"contact_id": record.get('project_owner')},
data={
"contact_id": record.get('project_owner'),
}
)
if is_ctc_created:
stats['contact']['create_count'] += 1
if is_ctc_updated:
stats['contact']['update_count'] += 1

# ------------------------------
# Project: Upsert project with contact as part of the project
Expand All @@ -170,6 +175,11 @@ def persist_lab_metadata(df: pd.DataFrame, sheet_year: str):
"project_id": record.get('project_name'),
}
)
if is_prj_created:
stats['project']['create_count'] += 1
if is_prj_updated:
stats['project']['update_count'] += 1

# link project to its contact
try:
project.contact_set.get(orcabus_id=contact.orcabus_id)
Expand Down Expand Up @@ -218,14 +228,19 @@ def persist_lab_metadata(df: pd.DataFrame, sheet_year: str):

except Exception as e:
if any(record.values()):
logger.warning(f"Invalid record ({e}): {json.dumps(record, indent=2)}")
stats['invalid_record_count'] += 1
invalid_data.append({
"reason": e,
"data": record
})
continue

# clean up history for django-simple-history model if any
# Only clean for the past 15 minutes as this is the maximum lambda cutoff
clean_model_history(minutes=15)

logger.warning(f"Invalid record: {invalid_data}")
logger.info(f"Processed LabMetadata: {json.dumps(stats)}")
return stats


Expand Down

0 comments on commit ef3984b

Please sign in to comment.