Skip to content

Commit

Permalink
Pull the snomed databases from github if they're not on the local system
Browse files Browse the repository at this point in the history
Workaround until codepipeline supports Git LFS
  • Loading branch information
alexiswl committed Nov 5, 2024
1 parent e8e5c89 commit 9297e76
Show file tree
Hide file tree
Showing 3 changed files with 183 additions and 6 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -106,8 +106,6 @@
logger = logging.getLogger(__name__)




def handler(event, context):
# Set env vars
set_icav2_env_vars()
Expand Down Expand Up @@ -279,7 +277,6 @@ def handler(event, context):
}



# # Identified Patient
# if __name__ == "__main__":
# import json
Expand Down Expand Up @@ -621,3 +618,161 @@ def handler(event, context):
# # "sequencerrun_s3_path": "s3://pdx-cgwxfer-test/melbournetest/240229_A00130_0288_BH5HM2DSXC__L2400160__V2__20241003f44a5496__20241003f44a5496",
# # "sample_name": "L2400160"
# # }


# PROD
# if __name__ == "__main__":
# import json
# from os import environ
#
# environ['AWS_PROFILE'] = 'umccr-production'
# environ['AWS_REGION'] = 'ap-southeast-2'
# environ['ICAV2_ACCESS_TOKEN_SECRET_ID'] = "ICAv2JWTKey-umccr-prod-service-production"
# print(
# json.dumps(
# handler(
# {
# "sequencerrun_s3_path_root": "s3://pdx-cgwxfer/melbourne",
# "portal_run_id": "20241105f6bc3fb9",
# "samplesheet_uri": "s3://pipeline-prod-cache-503977275616-ap-southeast-2/byob-icav2/production/analysis/cttsov2/202411053da6481e/Logs_Intermediates/SampleSheetValidation/SampleSheet_Intermediate.csv",
# "panel_name": "tso500_DRAGEN_ctDNA_v2_1_Universityofmelbourne", # pragma: allowlist secret
# "dag": {
# "dagName": "cromwell_tso500_ctdna_workflow_1.0.4",
# "dagDescription": "tso500_ctdna_workflow"
# },
# "data_files": {
# "microsatOutputUri": "s3://pipeline-prod-cache-503977275616-ap-southeast-2/byob-icav2/production/analysis/cttsov2/202411053da6481e/Logs_Intermediates/DragenCaller/L2401560/L2401560.microsat_output.json",
# "tmbMetricsUri": "s3://pipeline-prod-cache-503977275616-ap-southeast-2/byob-icav2/production/analysis/cttsov2/202411053da6481e/Logs_Intermediates/Tmb/L2401560/L2401560.tmb.metrics.csv",
# "cnvVcfUri": "s3://pipeline-prod-cache-503977275616-ap-southeast-2/byob-icav2/production/analysis/cttsov2/202411053da6481e/Results/L2401560/L2401560.cnv.vcf.gz",
# "hardFilteredVcfUri": "s3://pipeline-prod-cache-503977275616-ap-southeast-2/byob-icav2/production/analysis/cttsov2/202411053da6481e/Results/L2401560/L2401560.hard-filtered.vcf.gz",
# "fusionsUri": "s3://pipeline-prod-cache-503977275616-ap-southeast-2/byob-icav2/production/analysis/cttsov2/202411053da6481e/Results/L2401560/L2401560_Fusions.csv",
# "metricsOutputUri": "s3://pipeline-prod-cache-503977275616-ap-southeast-2/byob-icav2/production/analysis/cttsov2/202411053da6481e/Results/L2401560/L2401560_MetricsOutput.tsv",
# "samplesheetUri": "s3://pipeline-prod-cache-503977275616-ap-southeast-2/byob-icav2/production/analysis/cttsov2/202411053da6481e/Logs_Intermediates/SampleSheetValidation/SampleSheet_Intermediate.csv"
# },
# "case_metadata": {
# "isIdentified": False,
# "caseAccessionNumber": "L2401560__V2__20241105f6bc3fb9",
# "externalSpecimenId": "0042-61203",
# "sampleType": "patientcare",
# "specimenLabel": "primarySpecimen",
# "indication": "NA",
# "diseaseCode": 254637007,
# "specimenCode": "122561005",
# "sampleReception": {
# "dateAccessioned": "2024-11-05T16:11:36+1100",
# "dateCollected": "2024-10-23T23:00:00+1100",
# "dateReceived": "2024-10-24T00:00:00+1100"
# },
# "study": {
# "id": "OCEANiC",
# "subjectIdentifier": "0042-61203"
# }
# },
# "instrument_run_id": "241101_A01052_0236_BHVJNMDMXY"
# },
# None
# ),
# indent=2
# )
# )
#
# # Yields
# # {
# # "case_creation_obj": {
# # "identified": false,
# # "indication": "NA",
# # "panelName": "tso500_DRAGEN_ctDNA_v2_1_Universityofmelbourne", # pragma: allowlist secret
# # "sampleType": "patientcare",
# # "specimens": [
# # {
# # "accessionNumber": "L2401560__V2__20241105f6bc3fb9",
# # "dateAccessioned": "2024-11-05T05:11:36Z",
# # "dateReceived": "2024-10-23T13:00:00Z",
# # "datecollected": "2024-10-23T12:00:00Z",
# # "externalSpecimenId": "0042-61203",
# # "name": "primarySpecimen",
# # "type": {
# # "code": "122561005",
# # "label": "Blood specimen from patient"
# # },
# # "studyIdentifier": "OCEANiC",
# # "studySubjectIdentifier": "0042-61203"
# # }
# # ],
# # "dagDescription": "tso500_ctdna_workflow",
# # "dagName": "cromwell_tso500_ctdna_workflow_1.0.4",
# # "disease": {
# # "code": "254637007",
# # "label": "Non-small cell lung cancer"
# # }
# # },
# # "sequencerrun_creation_obj": {
# # "runId": "241101_A01052_0236_BHVJNMDMXY__L2401560__V2__20241105f6bc3fb9__20241105f6bc3fb9",
# # "specimens": [
# # {
# # "accessionNumber": "L2401560__V2__20241105f6bc3fb9",
# # "barcode": "ATTCAGAA-AGGCTATA",
# # "lane": "1",
# # "sampleId": "L2401560",
# # "sampleType": "DNA"
# # }
# # ],
# # "type": "pairedEnd"
# # },
# # "informaticsjob_creation_obj": {
# # "input": [
# # {
# # "accessionNumber": "L2401560__V2__20241105f6bc3fb9",
# # "sequencerRunInfos": [
# # {
# # "runId": "241101_A01052_0236_BHVJNMDMXY__L2401560__V2__20241105f6bc3fb9__20241105f6bc3fb9",
# # "barcode": "ATTCAGAA-AGGCTATA",
# # "lane": "1",
# # "sampleId": "L2401560",
# # "sampleType": "DNA"
# # }
# # ]
# # }
# # ]
# # },
# # "data_files": [
# # {
# # "src_uri": "s3://pipeline-prod-cache-503977275616-ap-southeast-2/byob-icav2/production/analysis/cttsov2/202411053da6481e/Logs_Intermediates/DragenCaller/L2401560/L2401560.microsat_output.json",
# # "dest_uri": "s3://pdx-cgwxfer/melbourne/241101_A01052_0236_BHVJNMDMXY__L2401560__V2__20241105f6bc3fb9__20241105f6bc3fb9/Data/Intensities/BaseCalls/L2401560.microsat_output.json",
# # "needs_decompression": false,
# # "contents": null
# # },
# # {
# # "src_uri": "s3://pipeline-prod-cache-503977275616-ap-southeast-2/byob-icav2/production/analysis/cttsov2/202411053da6481e/Logs_Intermediates/Tmb/L2401560/L2401560.tmb.metrics.csv",
# # "dest_uri": "s3://pdx-cgwxfer/melbourne/241101_A01052_0236_BHVJNMDMXY__L2401560__V2__20241105f6bc3fb9__20241105f6bc3fb9/Data/Intensities/BaseCalls/L2401560.tmb.metrics.csv",
# # "needs_decompression": false,
# # "contents": null
# # },
# # {
# # "src_uri": "s3://pipeline-prod-cache-503977275616-ap-southeast-2/byob-icav2/production/analysis/cttsov2/202411053da6481e/Results/L2401560/L2401560.cnv.vcf.gz",
# # "dest_uri": "s3://pdx-cgwxfer/melbourne/241101_A01052_0236_BHVJNMDMXY__L2401560__V2__20241105f6bc3fb9__20241105f6bc3fb9/Data/Intensities/BaseCalls/L2401560.cnv.vcf",
# # "needs_decompression": true,
# # "contents": null
# # },
# # {
# # "src_uri": "s3://pipeline-prod-cache-503977275616-ap-southeast-2/byob-icav2/production/analysis/cttsov2/202411053da6481e/Results/L2401560/L2401560.hard-filtered.vcf.gz",
# # "dest_uri": "s3://pdx-cgwxfer/melbourne/241101_A01052_0236_BHVJNMDMXY__L2401560__V2__20241105f6bc3fb9__20241105f6bc3fb9/Data/Intensities/BaseCalls/L2401560.hard-filtered.vcf",
# # "needs_decompression": true,
# # "contents": null
# # },
# # {
# # "src_uri": "s3://pipeline-prod-cache-503977275616-ap-southeast-2/byob-icav2/production/analysis/cttsov2/202411053da6481e/Results/L2401560/L2401560_Fusions.csv",
# # "dest_uri": "s3://pdx-cgwxfer/melbourne/241101_A01052_0236_BHVJNMDMXY__L2401560__V2__20241105f6bc3fb9__20241105f6bc3fb9/Data/Intensities/BaseCalls/L2401560_Fusions.csv",
# # "needs_decompression": false,
# # "contents": null
# # },
# # {
# # "src_uri": "s3://pipeline-prod-cache-503977275616-ap-southeast-2/byob-icav2/production/analysis/cttsov2/202411053da6481e/Results/L2401560/L2401560_MetricsOutput.tsv",
# # "dest_uri": "s3://pdx-cgwxfer/melbourne/241101_A01052_0236_BHVJNMDMXY__L2401560__V2__20241105f6bc3fb9__20241105f6bc3fb9/Data/Intensities/BaseCalls/L2401560_MetricsOutput.tsv",
# # "needs_decompression": false,
# # "contents": null
# # }
# # ],
# # "sequencerrun_s3_path": "s3://pdx-cgwxfer/melbourne/241101_A01052_0236_BHVJNMDMXY__L2401560__V2__20241105f6bc3fb9__20241105f6bc3fb9",
# # "sample_name": "L2401560"
# # }
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,19 @@
"""
Given a disease code, get the disease label
"""

# Imports
from gzip import BadGzipFile
from pathlib import Path
import pandas as pd
from tempfile import NamedTemporaryFile
import requests

from ..utils.compression_helpers import decompress_file

# Compressed version of
# https://velserapm.atlassian.net/wiki/download/attachments/86704490/SNOMED_CT%20Disease_trees.xlsx?version=1&modificationDate=1561395438000&api=v2
SNOMED_CT_DISEASE_TREE_FILE = Path(__file__).parent / "snomed_ct_disease_tree.json.gz"
SNOMED_CT_DISEASE_TREE_GITHUB_RAW_URL = "https://github.com/umccr/orcabus/raw/refs/heads/main/lib/workload/stateless/stacks/pieriandx-pipeline-manager/layers/src/pieriandx_pipeline_tools/pieriandx_lookup/snomed_ct_disease_tree.json.gz"


def get_disease_tree() -> pd.DataFrame:
Expand All @@ -26,7 +28,16 @@ def get_disease_tree() -> pd.DataFrame:
"""
# Decompress the disease tree file into a temp file
decompressed_disease_tree_file = NamedTemporaryFile(suffix=".json")
decompress_file(SNOMED_CT_DISEASE_TREE_FILE, Path(decompressed_disease_tree_file.name))

try:
decompress_file(SNOMED_CT_DISEASE_TREE_FILE, Path(decompressed_disease_tree_file.name))
except BadGzipFile:
# Git LFS not supported on CodePipeline Deployments
# Write to file
with NamedTemporaryFile(suffix=".json.gz") as download_h:
download_h.write(requests.get(SNOMED_CT_DISEASE_TREE_GITHUB_RAW_URL).content)
download_h.flush()
decompress_file(Path(download_h.name), Path(decompressed_disease_tree_file.name))

return pd.read_json(decompressed_disease_tree_file.name)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
"""
Given a specimen code, get the specimen label
"""
from gzip import BadGzipFile

#!/usr/bin/env python3

Expand All @@ -15,10 +16,12 @@
import pandas as pd
from tempfile import NamedTemporaryFile
from ..utils.compression_helpers import decompress_file
import requests

# Compressed version of
# https://velserapm.atlassian.net/wiki/download/attachments/86704490/SnomedCT-Term_For_SpecimenType.xls?version=1&modificationDate=1561395451000&api=v2
SNOMED_CT_SPECIMEN_TYPE_FILE = Path(__file__).parent / "snomed_ct_specimen_type.json.gz"
SNOMED_CT_SPECIMEN_TYPE_RAW_GITHUB_URL = "https://github.com/umccr/orcabus/raw/refs/heads/main/lib/workload/stateless/stacks/pieriandx-pipeline-manager/layers/src/pieriandx_pipeline_tools/pieriandx_lookup/snomed_ct_specimen_type.json.gz"


def get_specimen_df() -> pd.DataFrame:
Expand All @@ -31,7 +34,15 @@ def get_specimen_df() -> pd.DataFrame:
"""
# Decompress the specimen file into a temp file
decompressed_specimen_df_file = NamedTemporaryFile(suffix=".json")
decompress_file(SNOMED_CT_SPECIMEN_TYPE_FILE, Path(decompressed_specimen_df_file.name))
try:
decompress_file(SNOMED_CT_SPECIMEN_TYPE_FILE, Path(decompressed_specimen_df_file.name))
except BadGzipFile:
# Git LFS not supported on CodePipeline Deployments
# Write to file
with NamedTemporaryFile(suffix=".json.gz") as download_h:
download_h.write(requests.get(SNOMED_CT_SPECIMEN_TYPE_RAW_GITHUB_URL).content)
download_h.flush()
decompress_file(Path(download_h.name), Path(decompressed_specimen_df_file.name))

return pd.read_json(decompressed_specimen_df_file.name)

Expand Down

0 comments on commit 9297e76

Please sign in to comment.