From 9297e760edf8b21f75a04df07b3f9da0d601fbc9 Mon Sep 17 00:00:00 2001 From: Alexis Lucattini Date: Tue, 5 Nov 2024 21:21:28 +1100 Subject: [PATCH] Pull the snomed databases from github if they're not on the local system Workaround until codepipeline supports Git LFS --- .../generate_pieriandx_objects.py | 161 +++++++++++++++++- .../pieriandx_lookup/get_disease_label.py | 15 +- .../pieriandx_lookup/get_specimen_label.py | 13 +- 3 files changed, 183 insertions(+), 6 deletions(-) diff --git a/lib/workload/stateless/stacks/pieriandx-pipeline-manager/lambdas/generate_pieriandx_objects_py/generate_pieriandx_objects.py b/lib/workload/stateless/stacks/pieriandx-pipeline-manager/lambdas/generate_pieriandx_objects_py/generate_pieriandx_objects.py index 064600506..2d4532754 100644 --- a/lib/workload/stateless/stacks/pieriandx-pipeline-manager/lambdas/generate_pieriandx_objects_py/generate_pieriandx_objects.py +++ b/lib/workload/stateless/stacks/pieriandx-pipeline-manager/lambdas/generate_pieriandx_objects_py/generate_pieriandx_objects.py @@ -106,8 +106,6 @@ logger = logging.getLogger(__name__) - - def handler(event, context): # Set env vars set_icav2_env_vars() @@ -279,7 +277,6 @@ def handler(event, context): } - # # Idenitified Patient # if __name__ == "__main__": # import json @@ -621,3 +618,161 @@ def handler(event, context): # # "sequencerrun_s3_path": "s3://pdx-cgwxfer-test/melbournetest/240229_A00130_0288_BH5HM2DSXC__L2400160__V2__20241003f44a5496__20241003f44a5496", # # "sample_name": "L2400160" # # } + + +# PROD +# if __name__ == "__main__": +# import json +# from os import environ +# +# environ['AWS_PROFILE'] = 'umccr-production' +# environ['AWS_REGION'] = 'ap-southeast-2' +# environ['ICAV2_ACCESS_TOKEN_SECRET_ID'] = "ICAv2JWTKey-umccr-prod-service-production" +# print( +# json.dumps( +# handler( +# { +# "sequencerrun_s3_path_root": "s3://pdx-cgwxfer/melbourne", +# "portal_run_id": "20241105f6bc3fb9", +# "samplesheet_uri": "s3://pipeline-prod-cache-503977275616-ap-southeast-2/byob-icav2/production/analysis/cttsov2/202411053da6481e/Logs_Intermediates/SampleSheetValidation/SampleSheet_Intermediate.csv", +# "panel_name": "tso500_DRAGEN_ctDNA_v2_1_Universityofmelbourne", # pragma: allowlist secret +# "dag": { +# "dagName": "cromwell_tso500_ctdna_workflow_1.0.4", +# "dagDescription": "tso500_ctdna_workflow" +# }, +# "data_files": { +# "microsatOutputUri": "s3://pipeline-prod-cache-503977275616-ap-southeast-2/byob-icav2/production/analysis/cttsov2/202411053da6481e/Logs_Intermediates/DragenCaller/L2401560/L2401560.microsat_output.json", +# "tmbMetricsUri": "s3://pipeline-prod-cache-503977275616-ap-southeast-2/byob-icav2/production/analysis/cttsov2/202411053da6481e/Logs_Intermediates/Tmb/L2401560/L2401560.tmb.metrics.csv", +# "cnvVcfUri": "s3://pipeline-prod-cache-503977275616-ap-southeast-2/byob-icav2/production/analysis/cttsov2/202411053da6481e/Results/L2401560/L2401560.cnv.vcf.gz", +# "hardFilteredVcfUri": "s3://pipeline-prod-cache-503977275616-ap-southeast-2/byob-icav2/production/analysis/cttsov2/202411053da6481e/Results/L2401560/L2401560.hard-filtered.vcf.gz", +# "fusionsUri": "s3://pipeline-prod-cache-503977275616-ap-southeast-2/byob-icav2/production/analysis/cttsov2/202411053da6481e/Results/L2401560/L2401560_Fusions.csv", +# "metricsOutputUri": "s3://pipeline-prod-cache-503977275616-ap-southeast-2/byob-icav2/production/analysis/cttsov2/202411053da6481e/Results/L2401560/L2401560_MetricsOutput.tsv", +# "samplesheetUri": "s3://pipeline-prod-cache-503977275616-ap-southeast-2/byob-icav2/production/analysis/cttsov2/202411053da6481e/Logs_Intermediates/SampleSheetValidation/SampleSheet_Intermediate.csv" +# }, +# "case_metadata": { +# "isIdentified": False, +# "caseAccessionNumber": "L2401560__V2__20241105f6bc3fb9", +# "externalSpecimenId": "0042-61203", +# "sampleType": "patientcare", +# "specimenLabel": "primarySpecimen", +# "indication": "NA", +# "diseaseCode": 254637007, +# "specimenCode": "122561005", +# "sampleReception": { +# "dateAccessioned": "2024-11-05T16:11:36+1100", +# "dateCollected": "2024-10-23T23:00:00+1100", +# "dateReceived": "2024-10-24T00:00:00+1100" +# }, +# "study": { +# "id": "OCEANiC", +# "subjectIdentifier": "0042-61203" +# } +# }, +# "instrument_run_id": "241101_A01052_0236_BHVJNMDMXY" +# }, +# None +# ), +# indent=2 +# ) +# ) +# +# # Yields +# # { +# # "case_creation_obj": { +# # "identified": false, +# # "indication": "NA", +# # "panelName": "tso500_DRAGEN_ctDNA_v2_1_Universityofmelbourne", # pragma: allowlist secret +# # "sampleType": "patientcare", +# # "specimens": [ +# # { +# # "accessionNumber": "L2401560__V2__20241105f6bc3fb9", +# # "dateAccessioned": "2024-11-05T05:11:36Z", +# # "dateReceived": "2024-10-23T13:00:00Z", +# # "datecollected": "2024-10-23T12:00:00Z", +# # "externalSpecimenId": "0042-61203", +# # "name": "primarySpecimen", +# # "type": { +# # "code": "122561005", +# # "label": "Blood specimen from patient" +# # }, +# # "studyIdentifier": "OCEANiC", +# # "studySubjectIdentifier": "0042-61203" +# # } +# # ], +# # "dagDescription": "tso500_ctdna_workflow", +# # "dagName": "cromwell_tso500_ctdna_workflow_1.0.4", +# # "disease": { +# # "code": "254637007", +# # "label": "Non-small cell lung cancer" +# # } +# # }, +# # "sequencerrun_creation_obj": { +# # "runId": "241101_A01052_0236_BHVJNMDMXY__L2401560__V2__20241105f6bc3fb9__20241105f6bc3fb9", +# # "specimens": [ +# # { +# # "accessionNumber": "L2401560__V2__20241105f6bc3fb9", +# # "barcode": "ATTCAGAA-AGGCTATA", +# # "lane": "1", +# # "sampleId": "L2401560", +# # "sampleType": "DNA" +# # } +# # ], +# # "type": "pairedEnd" +# # }, +# # "informaticsjob_creation_obj": { +# # "input": [ +# # { +# # "accessionNumber": "L2401560__V2__20241105f6bc3fb9", +# # "sequencerRunInfos": [ +# # { +# # "runId": "241101_A01052_0236_BHVJNMDMXY__L2401560__V2__20241105f6bc3fb9__20241105f6bc3fb9", +# # "barcode": "ATTCAGAA-AGGCTATA", +# # "lane": "1", +# # "sampleId": "L2401560", +# # "sampleType": "DNA" +# # } +# # ] +# # } +# # ] +# # }, +# # "data_files": [ +# # { +# # "src_uri": "s3://pipeline-prod-cache-503977275616-ap-southeast-2/byob-icav2/production/analysis/cttsov2/202411053da6481e/Logs_Intermediates/DragenCaller/L2401560/L2401560.microsat_output.json", +# # "dest_uri": "s3://pdx-cgwxfer/melbourne/241101_A01052_0236_BHVJNMDMXY__L2401560__V2__20241105f6bc3fb9__20241105f6bc3fb9/Data/Intensities/BaseCalls/L2401560.microsat_output.json", +# # "needs_decompression": false, +# # "contents": null +# # }, +# # { +# # "src_uri": "s3://pipeline-prod-cache-503977275616-ap-southeast-2/byob-icav2/production/analysis/cttsov2/202411053da6481e/Logs_Intermediates/Tmb/L2401560/L2401560.tmb.metrics.csv", +# # "dest_uri": "s3://pdx-cgwxfer/melbourne/241101_A01052_0236_BHVJNMDMXY__L2401560__V2__20241105f6bc3fb9__20241105f6bc3fb9/Data/Intensities/BaseCalls/L2401560.tmb.metrics.csv", +# # "needs_decompression": false, +# # "contents": null +# # }, +# # { +# # "src_uri": "s3://pipeline-prod-cache-503977275616-ap-southeast-2/byob-icav2/production/analysis/cttsov2/202411053da6481e/Results/L2401560/L2401560.cnv.vcf.gz", +# # "dest_uri": "s3://pdx-cgwxfer/melbourne/241101_A01052_0236_BHVJNMDMXY__L2401560__V2__20241105f6bc3fb9__20241105f6bc3fb9/Data/Intensities/BaseCalls/L2401560.cnv.vcf", +# # "needs_decompression": true, +# # "contents": null +# # }, +# # { +# # "src_uri": "s3://pipeline-prod-cache-503977275616-ap-southeast-2/byob-icav2/production/analysis/cttsov2/202411053da6481e/Results/L2401560/L2401560.hard-filtered.vcf.gz", +# # "dest_uri": "s3://pdx-cgwxfer/melbourne/241101_A01052_0236_BHVJNMDMXY__L2401560__V2__20241105f6bc3fb9__20241105f6bc3fb9/Data/Intensities/BaseCalls/L2401560.hard-filtered.vcf", +# # "needs_decompression": true, +# # "contents": null +# # }, +# # { +# # "src_uri": "s3://pipeline-prod-cache-503977275616-ap-southeast-2/byob-icav2/production/analysis/cttsov2/202411053da6481e/Results/L2401560/L2401560_Fusions.csv", +# # "dest_uri": "s3://pdx-cgwxfer/melbourne/241101_A01052_0236_BHVJNMDMXY__L2401560__V2__20241105f6bc3fb9__20241105f6bc3fb9/Data/Intensities/BaseCalls/L2401560_Fusions.csv", +# # "needs_decompression": false, +# # "contents": null +# # }, +# # { +# # "src_uri": "s3://pipeline-prod-cache-503977275616-ap-southeast-2/byob-icav2/production/analysis/cttsov2/202411053da6481e/Results/L2401560/L2401560_MetricsOutput.tsv", +# # "dest_uri": "s3://pdx-cgwxfer/melbourne/241101_A01052_0236_BHVJNMDMXY__L2401560__V2__20241105f6bc3fb9__20241105f6bc3fb9/Data/Intensities/BaseCalls/L2401560_MetricsOutput.tsv", +# # "needs_decompression": false, +# # "contents": null +# # } +# # ], +# # "sequencerrun_s3_path": "s3://pdx-cgwxfer/melbourne/241101_A01052_0236_BHVJNMDMXY__L2401560__V2__20241105f6bc3fb9__20241105f6bc3fb9", +# # "sample_name": "L2401560" +# # } diff --git a/lib/workload/stateless/stacks/pieriandx-pipeline-manager/layers/src/pieriandx_pipeline_tools/pieriandx_lookup/get_disease_label.py b/lib/workload/stateless/stacks/pieriandx-pipeline-manager/layers/src/pieriandx_pipeline_tools/pieriandx_lookup/get_disease_label.py index fc6dbf73b..7d5c3fe5d 100644 --- a/lib/workload/stateless/stacks/pieriandx-pipeline-manager/layers/src/pieriandx_pipeline_tools/pieriandx_lookup/get_disease_label.py +++ b/lib/workload/stateless/stacks/pieriandx-pipeline-manager/layers/src/pieriandx_pipeline_tools/pieriandx_lookup/get_disease_label.py @@ -3,17 +3,19 @@ """ Given a disease code, get the disease label """ - # Imports +from gzip import BadGzipFile from pathlib import Path import pandas as pd from tempfile import NamedTemporaryFile +import requests from ..utils.compression_helpers import decompress_file # Compressed version of # https://velserapm.atlassian.net/wiki/download/attachments/86704490/SNOMED_CT%20Disease_trees.xlsx?version=1&modificationDate=1561395438000&api=v2 SNOMED_CT_DISEASE_TREE_FILE = Path(__file__).parent / "snomed_ct_disease_tree.json.gz" +SNOMED_CT_DISEASE_TREE_GITHUB_RAW_URL = "https://github.com/umccr/orcabus/raw/refs/heads/main/lib/workload/stateless/stacks/pieriandx-pipeline-manager/layers/src/pieriandx_pipeline_tools/pieriandx_lookup/snomed_ct_disease_tree.json.gz" def get_disease_tree() -> pd.DataFrame: @@ -26,7 +28,16 @@ def get_disease_tree() -> pd.DataFrame: """ # Decompress the disease tree file into a temp file decompressed_disease_tree_file = NamedTemporaryFile(suffix=".json") - decompress_file(SNOMED_CT_DISEASE_TREE_FILE, Path(decompressed_disease_tree_file.name)) + + try: + decompress_file(SNOMED_CT_DISEASE_TREE_FILE, Path(decompressed_disease_tree_file.name)) + except BadGzipFile: + # Git LFS not supported on CodePipeline Deployments + # Write to file + with NamedTemporaryFile(suffix=".json.gz") as download_h: + download_h.write(requests.get(SNOMED_CT_DISEASE_TREE_GITHUB_RAW_URL).content) + download_h.flush() + decompress_file(Path(download_h.name), Path(decompressed_disease_tree_file.name)) return pd.read_json(decompressed_disease_tree_file.name) diff --git a/lib/workload/stateless/stacks/pieriandx-pipeline-manager/layers/src/pieriandx_pipeline_tools/pieriandx_lookup/get_specimen_label.py b/lib/workload/stateless/stacks/pieriandx-pipeline-manager/layers/src/pieriandx_pipeline_tools/pieriandx_lookup/get_specimen_label.py index 5a2f656b5..ab9782de2 100644 --- a/lib/workload/stateless/stacks/pieriandx-pipeline-manager/layers/src/pieriandx_pipeline_tools/pieriandx_lookup/get_specimen_label.py +++ b/lib/workload/stateless/stacks/pieriandx-pipeline-manager/layers/src/pieriandx_pipeline_tools/pieriandx_lookup/get_specimen_label.py @@ -3,6 +3,7 @@ """ Given a specimen code, get the specimen label """ +from gzip import BadGzipFile #!/usr/bin/env python3 @@ -15,10 +16,12 @@ import pandas as pd from tempfile import NamedTemporaryFile from ..utils.compression_helpers import decompress_file +import requests # Compressed version of # https://velserapm.atlassian.net/wiki/download/attachments/86704490/SnomedCT-Term_For_SpecimenType.xls?version=1&modificationDate=1561395451000&api=v2 SNOMED_CT_SPECIMEN_TYPE_FILE = Path(__file__).parent / "snomed_ct_specimen_type.json.gz" +SNOMED_CT_SPECIMEN_TYPE_RAW_GITHUB_URL = "https://github.com/umccr/orcabus/raw/refs/heads/main/lib/workload/stateless/stacks/pieriandx-pipeline-manager/layers/src/pieriandx_pipeline_tools/pieriandx_lookup/snomed_ct_specimen_type.json.gz" def get_specimen_df() -> pd.DataFrame: @@ -31,7 +34,15 @@ def get_specimen_df() -> pd.DataFrame: """ # Decompress the specimen file into a temp file decompressed_specimen_df_file = NamedTemporaryFile(suffix=".json") - decompress_file(SNOMED_CT_SPECIMEN_TYPE_FILE, Path(decompressed_specimen_df_file.name)) + try: + decompress_file(SNOMED_CT_SPECIMEN_TYPE_FILE, Path(decompressed_specimen_df_file.name)) + except BadGzipFile: + # Git LFS not supported on CodePipeline Deployments + # Write to file + with NamedTemporaryFile(suffix=".json.gz") as download_h: + download_h.write(requests.get(SNOMED_CT_SPECIMEN_TYPE_RAW_GITHUB_URL).content) + download_h.flush() + decompress_file(Path(download_h.name), Path(decompressed_specimen_df_file.name)) return pd.read_json(decompressed_specimen_df_file.name)