From f38855e3ed1ea270718f38c0cf5564737dac5f84 Mon Sep 17 00:00:00 2001 From: William Putra Intan <61998484+williamputraintan@users.noreply.github.com> Date: Wed, 15 Jan 2025 14:24:11 +0700 Subject: [PATCH] feat(sscheck): Add New SSCheck Stack (#808) * Migrate SSChecker to the Orca world * instruction && override_cycles * fix body parse * tests and fixes * updates * Update statelessStackCollectionClass.ts * Update handler.py --- Makefile | 1 + config/config.ts | 2 + config/stacks/sampleSheetChecker.ts | 19 + .../stacks/sample-sheet-check/Makefile | 11 + .../stacks/sample-sheet-check/README.md | 9 + .../sample-sheet-check-lambda/.gitignore | 8 + .../sample-sheet-check-lambda/Makefile | 27 + .../sample-sheet-check-lambda/README.md | 40 + .../sample-sheet-check-lambda/handler.py | 85 ++ .../lambda.Dockerfile | 12 + .../sample-sheet-check-lambda/main.py | 98 ++ .../requirements.txt | 3 + .../sample-sheet-check-lambda/src/__init__.py | 9 + .../sample-sheet-check-lambda/src/checker.py | 115 +++ .../sample-sheet-check-lambda/src/errors.py | 109 +++ .../sample-sheet-check-lambda/src/globals.py | 467 ++++++++++ .../sample-sheet-check-lambda/src/http.py | 55 ++ .../sample-sheet-check-lambda/src/logger.py | 93 ++ .../sample-sheet-check-lambda/src/metadata.py | 51 + .../src/samplesheet.py | 879 ++++++++++++++++++ .../src/v2_samplesheet_builder.py | 551 +++++++++++ .../tests/sample/mock-1.csv | 23 + .../tests/sample/mock-2.csv | 23 + .../tests/test_sample_sheet.py | 103 ++ .../stacks/sample-sheet-check/stack.ts | 63 ++ .../statelessStackCollectionClass.ts | 11 + test/stateless/deployment.test.ts | 15 + 27 files changed, 2882 insertions(+) create mode 100644 config/stacks/sampleSheetChecker.ts create mode 100644 lib/workload/stateless/stacks/sample-sheet-check/Makefile create mode 100644 lib/workload/stateless/stacks/sample-sheet-check/README.md create mode 100644 lib/workload/stateless/stacks/sample-sheet-check/sample-sheet-check-lambda/.gitignore create mode 100644 lib/workload/stateless/stacks/sample-sheet-check/sample-sheet-check-lambda/Makefile create mode 100644 lib/workload/stateless/stacks/sample-sheet-check/sample-sheet-check-lambda/README.md create mode 100644 lib/workload/stateless/stacks/sample-sheet-check/sample-sheet-check-lambda/handler.py create mode 100644 lib/workload/stateless/stacks/sample-sheet-check/sample-sheet-check-lambda/lambda.Dockerfile create mode 100644 lib/workload/stateless/stacks/sample-sheet-check/sample-sheet-check-lambda/main.py create mode 100644 lib/workload/stateless/stacks/sample-sheet-check/sample-sheet-check-lambda/requirements.txt create mode 100644 lib/workload/stateless/stacks/sample-sheet-check/sample-sheet-check-lambda/src/__init__.py create mode 100644 lib/workload/stateless/stacks/sample-sheet-check/sample-sheet-check-lambda/src/checker.py create mode 100644 lib/workload/stateless/stacks/sample-sheet-check/sample-sheet-check-lambda/src/errors.py create mode 100644 lib/workload/stateless/stacks/sample-sheet-check/sample-sheet-check-lambda/src/globals.py create mode 100644 lib/workload/stateless/stacks/sample-sheet-check/sample-sheet-check-lambda/src/http.py create mode 100644 lib/workload/stateless/stacks/sample-sheet-check/sample-sheet-check-lambda/src/logger.py create mode 100644 lib/workload/stateless/stacks/sample-sheet-check/sample-sheet-check-lambda/src/metadata.py create mode 100644 lib/workload/stateless/stacks/sample-sheet-check/sample-sheet-check-lambda/src/samplesheet.py create mode 100644 
lib/workload/stateless/stacks/sample-sheet-check/sample-sheet-check-lambda/src/v2_samplesheet_builder.py create mode 100644 lib/workload/stateless/stacks/sample-sheet-check/sample-sheet-check-lambda/tests/sample/mock-1.csv create mode 100644 lib/workload/stateless/stacks/sample-sheet-check/sample-sheet-check-lambda/tests/sample/mock-2.csv create mode 100644 lib/workload/stateless/stacks/sample-sheet-check/sample-sheet-check-lambda/tests/test_sample_sheet.py create mode 100644 lib/workload/stateless/stacks/sample-sheet-check/stack.ts diff --git a/Makefile b/Makefile index 0ae96bfe1..d8f343f6c 100644 --- a/Makefile +++ b/Makefile @@ -72,6 +72,7 @@ test-stateless-app-suite: @(cd lib/workload/stateless/stacks/fmannotator && $(MAKE) test) @(cd lib/workload/stateless/stacks/bclconvert-manager && $(MAKE) test) @(cd lib/workload/stateless/stacks/workflow-manager && $(MAKE) test) + @(cd lib/workload/stateless/stacks/sample-sheet-check && $(MAKE) test) # The default outer `test` target run all test in this repo test: test-stateful-iac test-stateless-iac test-stateful-app-suite test-stateless-app-suite diff --git a/config/config.ts b/config/config.ts index 98a63883f..3604b8316 100644 --- a/config/config.ts +++ b/config/config.ts @@ -66,6 +66,7 @@ import { getOraDecompressionManagerStackProps } from './stacks/oraDecompressionP import { getPgDDProps } from './stacks/pgDD'; import { getDataMigrateStackProps } from './stacks/dataMigrate'; import { getHtsgetProps } from './stacks/htsget'; +import { getSampleSheetCheckerProps } from './stacks/sampleSheetChecker'; interface EnvironmentConfig { name: string; @@ -135,6 +136,7 @@ export const getEnvironmentConfig = (stage: AppStage): EnvironmentConfig | null fmAnnotatorProps: getFmAnnotatorProps(), dataMigrateProps: getDataMigrateStackProps(stage), htsgetProps: getHtsgetProps(stage), + sampleSheetCheckerProps: getSampleSheetCheckerProps(stage), pgDDProps: getPgDDProps(stage), }, }; diff --git a/config/stacks/sampleSheetChecker.ts b/config/stacks/sampleSheetChecker.ts new file mode 100644 index 000000000..707c660a5 --- /dev/null +++ b/config/stacks/sampleSheetChecker.ts @@ -0,0 +1,19 @@ +import { SampleSheetCheckerStackProps } from '../../lib/workload/stateless/stacks/sample-sheet-check/stack'; +import { + AppStage, + cognitoApiGatewayConfig, + corsAllowOrigins, + logsApiGatewayConfig, +} from '../constants'; + +export const getSampleSheetCheckerProps = (stage: AppStage): SampleSheetCheckerStackProps => { + return { + apiGatewayConstructProps: { + ...cognitoApiGatewayConfig, + corsAllowOrigins: corsAllowOrigins[stage], + apiGwLogsConfig: logsApiGatewayConfig[stage], + apiName: 'SSCheck', + customDomainNamePrefix: 'sscheck-orcabus', + }, + }; +}; diff --git a/lib/workload/stateless/stacks/sample-sheet-check/Makefile b/lib/workload/stateless/stacks/sample-sheet-check/Makefile new file mode 100644 index 000000000..0bf72d345 --- /dev/null +++ b/lib/workload/stateless/stacks/sample-sheet-check/Makefile @@ -0,0 +1,11 @@ +# Variables +LAMBDA_DIR := ./sample-sheet-check-lambda + +# Default target +all: test + +# Run tests +test: + $(MAKE) -C $(LAMBDA_DIR) test + +.PHONY: all test diff --git a/lib/workload/stateless/stacks/sample-sheet-check/README.md b/lib/workload/stateless/stacks/sample-sheet-check/README.md new file mode 100644 index 000000000..9494b218d --- /dev/null +++ b/lib/workload/stateless/stacks/sample-sheet-check/README.md @@ -0,0 +1,9 @@ +# Sample Sheet Checker + +Deploy sample-sheet-checker in a lambda + +## Checker Implementation + +```sh +cd 
./sample-sheet-check-lambda
+```
diff --git a/lib/workload/stateless/stacks/sample-sheet-check/sample-sheet-check-lambda/.gitignore b/lib/workload/stateless/stacks/sample-sheet-check/sample-sheet-check-lambda/.gitignore
new file mode 100644
index 000000000..4a34b2271
--- /dev/null
+++ b/lib/workload/stateless/stacks/sample-sheet-check/sample-sheet-check-lambda/.gitignore
@@ -0,0 +1,8 @@
+.venv
+venv
+
+/tests/sample/
+!tests/sample/mock-*
+
+/log/
+SampleSheet_v2.csv
diff --git a/lib/workload/stateless/stacks/sample-sheet-check/sample-sheet-check-lambda/Makefile b/lib/workload/stateless/stacks/sample-sheet-check/sample-sheet-check-lambda/Makefile
new file mode 100644
index 000000000..fd06446f8
--- /dev/null
+++ b/lib/workload/stateless/stacks/sample-sheet-check/sample-sheet-check-lambda/Makefile
@@ -0,0 +1,27 @@
+# Makefile for Python project
+
+# Variables
+VENV_DIR = .venv
+PYTHON = $(VENV_DIR)/bin/python
+PIP = $(VENV_DIR)/bin/pip
+TEST_DIR = tests
+
+# Create virtual environment
+$(VENV_DIR)/bin/activate: requirements.txt
+	python3 -m venv $(VENV_DIR)
+	$(PIP) install -r requirements.txt
+
+# Install dependencies
+install: $(VENV_DIR)/bin/activate
+
+# Run tests
+test: install
+	$(PYTHON) -m unittest discover $(TEST_DIR)
+
+# Clean up
+clean:
+	rm -rf $(VENV_DIR)
+	find . -type f -name '*.pyc' -delete
+	find . -type d -name '__pycache__' -delete
+
+.PHONY: install test clean
diff --git a/lib/workload/stateless/stacks/sample-sheet-check/sample-sheet-check-lambda/README.md b/lib/workload/stateless/stacks/sample-sheet-check/sample-sheet-check-lambda/README.md
new file mode 100644
index 000000000..d38bf09b1
--- /dev/null
+++ b/lib/workload/stateless/stacks/sample-sheet-check/sample-sheet-check-lambda/README.md
@@ -0,0 +1,40 @@
+# Sample Sheet Checker
+
+## Setup
+
+### Using Python Environment
+
+```shell
+conda create -n orcabus_sscheck python=3.12
+conda activate orcabus_sscheck
+```
+
+### Running Locally
+
+To run the script and see the available options, use the following command:
+
+```shell
+python main.py -h
+
+usage: main.py [-h] --path PATH [--log-path LOG_PATH] [--skip-metadata-check] [--skip-v2] [--v2-filename V2_FILENAME]
+
+Run sample sheet check locally.
+
+options:
+  -h, --help            show this help message and exit
+  --path PATH           The path to the sample sheet file.
+  --log-path LOG_PATH   Name of the output file for the sscheck log file. Default: log/ss-checker.log
+  --skip-metadata-check
+                        Skip sample sheet check against metadata API (API token required).
+  --skip-v2, --skip-v2-samplesheet-output
+                        Skip generating the sample sheet v2. ('--skip-metadata-check' must be set to False).
+  --v2-filename V2_FILENAME
+                        Name of the output file for the generated sample sheet v2. Default: SampleSheet_v2.csv
+
+```
+
+Running example
+
+```shell
+ python main.py --path ./tests/sample/mock-1.csv
+```
diff --git a/lib/workload/stateless/stacks/sample-sheet-check/sample-sheet-check-lambda/handler.py b/lib/workload/stateless/stacks/sample-sheet-check/sample-sheet-check-lambda/handler.py
new file mode 100644
index 000000000..95efe0c39
--- /dev/null
+++ b/lib/workload/stateless/stacks/sample-sheet-check/sample-sheet-check-lambda/handler.py
@@ -0,0 +1,85 @@
+import base64
+import tempfile
+import logging
+from email.parser import BytesParser
+
+from src.checker import construct_sample_sheet, run_sample_sheet_content_check, run_sample_sheet_check_with_metadata, \
+    construct_logger
+from src.http import construct_body, construct_response
+from src.v2_samplesheet_builder import v1_to_v2_samplesheet
+
+# Logging
+LOG_PATH = "/tmp/samplesheet_check.log"
+logger = logging.getLogger()
+logger.setLevel(logging.INFO)
+
+
+def lambda_handler(event, context):
+    """
+    Parameters
+    ----------
+    event : Object
+        The request payload passed to the lambda
+    context : Object
+        The AWS Lambda runtime information
+
+    """
+    event_copy = event.copy()
+    event_copy['headers'] = event_copy.get('headers', {}).copy()
+    event_copy['headers'].pop('Authorization', None)
+    event_copy['headers'].pop('authorization', None)
+
+    logger.info(f"Processing (event, context): {event_copy}, {context}")
+
+    # Parse header
+    headers = event.get("headers", {})
+    origin = headers.get("origin", "")
+    authorization = headers.get("Authorization", headers.get("authorization", ""))
+    content_type = headers.get("Content-Type", headers.get("content-type", ""))
+
+    # Parse body payload
+    if event.get("isBase64Encoded", False):
+        body = base64.b64decode(event["body"])
+    else:
+        body = event["body"].encode()
+    ct = f"Content-Type: {content_type}\n\n".encode()
+    msg = BytesParser().parsebytes(ct + body)
+    if not msg.is_multipart():
+        body = construct_body(check_status="FAIL", error_message="Invalid body",
+                              v2_sample_sheet='')
+        response = construct_response(status_code=400, body=body, origin=origin)
+        return response
+
+    multipart_content = {}
+    for part in msg.get_payload():
+        multipart_content[part.get_param(
+            'name', header='content-disposition')] = part.get_payload(decode=True)
+
+    file_data = multipart_content["file"]
+    log_level = multipart_content["logLevel"].decode("utf-8")
+
+    # Save file to temp file
+    temporary_data = tempfile.NamedTemporaryFile(mode='w+', delete=False)
+    temporary_data.write(file_data.decode("utf-8"))
+    temporary_data.seek(0)
+
+    try:
+        construct_logger(log_path=LOG_PATH, log_level=log_level)
+
+        # Construct and run sample sheet checker
+        sample_sheet = construct_sample_sheet(temporary_data.name)
+        run_sample_sheet_content_check(sample_sheet)
+        run_sample_sheet_check_with_metadata(sample_sheet, authorization)
+
+        # run sample sheet v2 conversion
+        v2_sample_sheet_str = v1_to_v2_samplesheet(sample_sheet)
+
+    except Exception as e:
+        body = construct_body(check_status="FAIL", error_message=str(e), log_path=LOG_PATH,
+                              v2_sample_sheet='')
+        response = construct_response(status_code=200, body=body, origin=origin)
+        return response
+
+    body = construct_body(check_status='PASS', log_path=LOG_PATH, v2_sample_sheet=v2_sample_sheet_str)
+    response = construct_response(status_code=200, body=body, origin=origin)
+    return response
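The handler above accepts a `multipart/form-data` POST carrying a `file` part (the v1 samplesheet) and a `logLevel` part, with the caller's JWT in the `Authorization` header. A minimal client sketch using only the standard library — the endpoint URL (derived from the `sscheck-orcabus` domain prefix in `config/stacks/sampleSheetChecker.ts`) and the `Bearer` scheme are assumptions here, not part of this patch:

```python
import json
import os
import urllib.request
import uuid

# Hypothetical endpoint; 'sscheck-orcabus' comes from sampleSheetChecker.ts,
# the base domain is an assumption.
URL = "https://sscheck-orcabus.dev.umccr.org/"
TOKEN = os.environ["JWT_AUTH"]  # same variable main.py uses locally

boundary = uuid.uuid4().hex
with open("SampleSheet.csv", "rb") as f:
    file_bytes = f.read()

# Build the two parts the handler reads: 'file' and 'logLevel'
parts = (
    f'--{boundary}\r\nContent-Disposition: form-data; name="file"; filename="SampleSheet.csv"\r\n\r\n'.encode()
    + file_bytes
    + f'\r\n--{boundary}\r\nContent-Disposition: form-data; name="logLevel"\r\n\r\nINFO\r\n--{boundary}--\r\n'.encode()
)

req = urllib.request.Request(
    URL,
    data=parts,
    headers={
        # Whether the API expects a 'Bearer ' prefix is an assumption
        "Authorization": f"Bearer {TOKEN}",
        "Content-Type": f"multipart/form-data; boundary={boundary}",
    },
    method="POST",
)
with urllib.request.urlopen(req) as resp:
    result = json.loads(resp.read())

# construct_body() returns these keys
print(result["check_status"], result["error_message"])
```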
diff --git a/lib/workload/stateless/stacks/sample-sheet-check/sample-sheet-check-lambda/lambda.Dockerfile b/lib/workload/stateless/stacks/sample-sheet-check/sample-sheet-check-lambda/lambda.Dockerfile
new file mode 100644
index 000000000..bea298de8
--- /dev/null
+++ b/lib/workload/stateless/stacks/sample-sheet-check/sample-sheet-check-lambda/lambda.Dockerfile
@@ -0,0 +1,12 @@
+FROM public.ecr.aws/lambda/python:3.12
+
+WORKDIR ${LAMBDA_TASK_ROOT}
+
+# COPY all files
+COPY . .
+
+# Install the specified packages
+RUN pip install -r requirements.txt
+
+# Specify handler
+CMD [ "handler.lambda_handler" ]
diff --git a/lib/workload/stateless/stacks/sample-sheet-check/sample-sheet-check-lambda/main.py b/lib/workload/stateless/stacks/sample-sheet-check/sample-sheet-check-lambda/main.py
new file mode 100644
index 000000000..68e73f2da
--- /dev/null
+++ b/lib/workload/stateless/stacks/sample-sheet-check/sample-sheet-check-lambda/main.py
@@ -0,0 +1,98 @@
+import json
+import logging
+import os
+import argparse
+
+from src.checker import construct_sample_sheet, run_sample_sheet_content_check, run_sample_sheet_check_with_metadata
+from src.logger import set_logger
+from src.v2_samplesheet_builder import v1_to_v2_samplesheet
+
+
+def get_argument():
+    parser = argparse.ArgumentParser(
+        description="Run sample sheet check locally."
+    )
+    parser.add_argument(
+        "--path",
+        required=True,
+        help="The path to the sample sheet file.",
+    )
+
+    parser.add_argument(
+        "--log-path",
+        default="log/ss-checker.log",
+        help="Name of the output file for the sscheck log file. Default: log/ss-checker.log",
+    )
+
+    parser.add_argument(
+        "--skip-metadata-check", action="store_true", default=False,
+        help="Skip sample sheet check against metadata API (API token required)."
+    )
+
+    parser.add_argument(
+        "--skip-v2", "--skip-v2-samplesheet-output", action="store_true", default=False,
+        help="Skip generating the sample sheet v2. ('--skip-metadata-check' must be set to False)."
+    )
+
+    parser.add_argument(
+        "--v2-filename",
+        default="SampleSheet_v2.csv",
+        help="Name of the output file for the generated sample sheet v2. 
Default: SampleSheet_v2.csv", + ) + + args_input = parser.parse_args() + + print("#" * 30) + print(f"Sample sheet (SS) Path : {args_input.path}") + print(f"Log path : {args_input.log_path}") + print(f"Skip SS Check w/ metadata : {args_input.skip_metadata_check}") + print(f"Skip generating v2 : {True if args_input.skip_metadata_check is True else args_input.skip_v2}") + print(f"SS V2 output (if enabled) : {args_input.v2_filename}") + print("#" * 30) + + return args_input + + +if __name__ == "__main__": + args = get_argument() + filepath = args.path + log_path = args.log_path + v2_filename = args.v2_filename + result = { + "Check status": "PASS", "Log path": log_path, "V2 SampleSheet (if enabled)": v2_filename + } + + # Setup logger logistic + directory = os.path.dirname(log_path) + if directory: + os.makedirs(directory, exist_ok=True) + set_logger(log_path=log_path, log_level=logging.INFO) + + # Construct and run sample sheet checker + sample_sheet = construct_sample_sheet(filepath) + run_sample_sheet_content_check(sample_sheet) + + if not args.skip_metadata_check: + + token = os.environ.get("JWT_AUTH", None) + if token is None: + raise ValueError("JWT_AUTH environment variable is not set.") + + run_sample_sheet_check_with_metadata(sample_sheet, token) + + result = {"Check status": "PASS", "Log path": log_path} + + if not args.skip_v2 and not args.skip_metadata_check: + try: + + v2_sample_sheet_str = v1_to_v2_samplesheet(sample_sheet) + + with open(v2_filename, 'w') as file: + file.write(v2_sample_sheet_str) + except Exception as e: + logging.error(f"Error generating v2 sample sheet: {e}") + raise e + + result["V2 SampleSheet (if enabled)"] = v2_filename + + print("\n") + print(json.dumps(result, indent=4)) diff --git a/lib/workload/stateless/stacks/sample-sheet-check/sample-sheet-check-lambda/requirements.txt b/lib/workload/stateless/stacks/sample-sheet-check/sample-sheet-check-lambda/requirements.txt new file mode 100644 index 000000000..605e28823 --- /dev/null +++ b/lib/workload/stateless/stacks/sample-sheet-check/sample-sheet-check-lambda/requirements.txt @@ -0,0 +1,3 @@ +v2-samplesheet-maker==4.2.4.post20241110133537 +scipy==1.15.0 +pandas==2.2.3 diff --git a/lib/workload/stateless/stacks/sample-sheet-check/sample-sheet-check-lambda/src/__init__.py b/lib/workload/stateless/stacks/sample-sheet-check/sample-sheet-check-lambda/src/__init__.py new file mode 100644 index 000000000..ad694f581 --- /dev/null +++ b/lib/workload/stateless/stacks/sample-sheet-check/sample-sheet-check-lambda/src/__init__.py @@ -0,0 +1,9 @@ +import re + + +def camel_to_snake(name): + """ + Convert camel case to snake case + """ + name = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name) + return re.sub('([a-z0-9])([A-Z])', r'\1_\2', name).lower() \ No newline at end of file diff --git a/lib/workload/stateless/stacks/sample-sheet-check/sample-sheet-check-lambda/src/checker.py b/lib/workload/stateless/stacks/sample-sheet-check/sample-sheet-check-lambda/src/checker.py new file mode 100644 index 000000000..2a49a6f7c --- /dev/null +++ b/lib/workload/stateless/stacks/sample-sheet-check/sample-sheet-check-lambda/src/checker.py @@ -0,0 +1,115 @@ +import pandas as pd + +from src.errors import FileContentError +from src.logger import get_logger, set_basic_logger, set_logger +from src.samplesheet import SampleSheet, check_sample_sheet_for_index_clashes, check_samplesheet_header_metadata, \ + get_years_from_samplesheet, check_metadata_correspondence, check_global_override_cycles, \ + check_internal_override_cycles + +logger = 
set_basic_logger()
+
+
+def construct_logger(log_path, log_level):
+    """
+    Construct the logger for the samplesheet check.
+
+    Parameters
+    ----------
+    log_path : str
+        The path where the log file lives
+    log_level : str
+        The type of logging desired
+
+    """
+    set_logger(log_path=log_path, log_level=log_level)
+
+
+def construct_sample_sheet(sample_sheet_path: str):
+    """
+    Construct and parse the sample sheet content.
+
+    Returns
+    ----------
+    sample_sheet : SampleSheet
+        sample sheet data to be checked
+
+    """
+
+    try:
+        return SampleSheet(sample_sheet_path)
+
+    except Exception:
+        logger.error("Unable to parse SampleSheet from the given file.")
+        raise FileContentError
+
+
+def run_sample_sheet_content_check(sample_sheet: SampleSheet):
+    """
+    Run content checks on the samplesheet.
+
+    Parameters
+    ----------
+    sample_sheet : SampleSheet
+        sample sheet data to be checked
+
+    Raises
+    ----------
+    an error if any of the content checks fail
+
+    """
+    logger.info("Check samplesheet content")
+
+    # Run some consistency checks
+    logger.info("Get all years of samples in samplesheets")
+    years = get_years_from_samplesheet(sample_sheet)
+    if len(list(years)) == 1:
+        logger.info("SampleSheet contains IDs from year: {}".format(list(years)[0]))
+    else:
+        logger.info("SampleSheet contains IDs from {} years: {}".format(len(years), ', '.join(map(str, list(years)))))
+
+    logger.info('----------check_sample_sheet_header_metadata----------')
+    check_samplesheet_header_metadata(sample_sheet)
+    logger.info('----------check_sample_sheet_for_index_clashes----------')
+    check_sample_sheet_for_index_clashes(sample_sheet)
+
+
+def run_sample_sheet_check_with_metadata(sample_sheet: SampleSheet, auth_header: str):
+    """
+    Run checks of the sample sheet against the metadata API.
+
+    Parameters
+    ----------
+    sample_sheet : SampleSheet
+        sample sheet data to be checked
+    auth_header : str
+        JWT token used to query the metadata API
+
+    Raises
+    ----------
+    an error if any of the metadata checks fail
+
+    """
+
+    logger.info("Check sample sheet against metadata")
+
+    # Run through checks with metadata integrated
+    logger.info('----------set_metadata_from_api----------')
+    sample_sheet.set_metadata_from_api(auth_header)
+
+    logger.info('----------check_metadata_correspondence----------')
+    check_metadata_correspondence(sample_sheet)
+
+    logger.info('----------check_global_override_cycles----------')
+    check_global_override_cycles(sample_sheet)
+    logger.info('----------check_internal_override_cycles----------')
+    check_internal_override_cycles(sample_sheet)
+
+    logger.info("Info on the value_counts of the sample sheet (by assay, type and override cycles)")
+    sample_sheet_df = pd.DataFrame([{"assay": sample.library_series['assay'],
+                                     "type": sample.library_series['type'],
+                                     "override_cycles": sample.library_series['override_cycles']}
+                                    for sample in sample_sheet])
+    logger.info(f"Value Counts:\n{sample_sheet_df.value_counts()}")
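The two override-cycles checks above lean on the regex patterns defined in `src/globals.py`. As a standalone illustration of how those patterns tokenise an `OverrideCycles` value (pattern strings copied verbatim from `OVERRIDE_CYCLES_STR`; the sample value itself is hypothetical):

```python
import re

# Copied from src/globals.py
OVERRIDE_CYCLES_STR = {
    "cycles": r"(?:([INYU])(\d*))",
    "cycles_full_match": r"(?:[INYU]+(\d*))+",
}

# Hypothetical OverrideCycles value: one section per read / index read
override_cycles = "Y151;I8N2;I8;Y151"

for section in override_cycles.split(";"):
    # Each section must be entirely composed of [INYU]<count> tokens
    assert re.fullmatch(OVERRIDE_CYCLES_STR["cycles_full_match"], section)
    tokens = re.findall(OVERRIDE_CYCLES_STR["cycles"], section)
    print(section, tokens)
# Y151 [('Y', '151')]
# I8N2 [('I', '8'), ('N', '2')]
# I8 [('I', '8')]
# Y151 [('Y', '151')]
```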
diff --git a/lib/workload/stateless/stacks/sample-sheet-check/sample-sheet-check-lambda/src/errors.py b/lib/workload/stateless/stacks/sample-sheet-check/sample-sheet-check-lambda/src/errors.py
new file mode 100644
index 000000000..17cbe68cb
--- /dev/null
+++ b/lib/workload/stateless/stacks/sample-sheet-check/sample-sheet-check-lambda/src/errors.py
@@ -0,0 +1,109 @@
+#!/usr/bin/env python
+"""
+ERRORS
+"""
+
+
+class ColumnNotFoundError(Exception):
+    """
+    The column of the dataframe or excel spreadsheet was not found
+    """
+    pass
+
+
+class LibraryNotFoundError(Exception):
+    """
+    We could not find the library ID in the metadata spreadsheet
+    """
+    pass
+
+
+class MultipleLibraryError(Exception):
+    """
+    We found more than one library corresponding to this sample in the metadata sheet
+    """
+    pass
+
+
+class GetMetaDataError(Exception):
+    """
+    A collective error for LibraryNotFoundError and MultipleLibraryError
+    We failed to collect the requested metadata
+    """
+    pass
+
+
+class SampleSheetFormatError(Exception):
+    """
+    Config-like construction was not found
+    """
+    pass
+
+
+class SampleSheetHeaderError(Exception):
+    """
+    We failed to collect an attribute in the SampleSheet header
+    """
+    pass
+
+
+class SampleNotFoundError(Exception):
+    """
+    We failed to find a sample in the sample sheet with this ID
+    """
+    pass
+
+
+class SampleDuplicateError(Exception):
+    """
+    Sample with the same id already exists in the sample sheet
+    """
+    pass
+
+
+class SampleNameFormatError(Exception):
+    """
+    The sample name was not in the correct format
+    """
+    pass
+
+
+class SimilarIndexError(Exception):
+    """
+    Two indexes of separate samples were too similar
+    """
+    pass
+
+
+class MetaDataError(Exception):
+    """
+    Wrapper error for GetMetaDataError, LibraryNotFoundError and ColumnNotFoundError
+    """
+    pass
+
+
+class OverrideCyclesError(Exception):
+    """
+    Wrapper error - a sample's override cycles section in the metadata sheet wasn't correct
+    """
+    pass
+
+
+class InvalidColumnError(Exception):
+    """
+    This column is not recognised
+    """
+    pass
+
+
+class ApiCallError(Exception):
+    """
+    Failure on API calls
+    """
+    pass
+
+
+class FileContentError(Exception):
+    """
+    File content is not as expected
+    """
+    pass
\ No newline at end of file
diff --git a/lib/workload/stateless/stacks/sample-sheet-check/sample-sheet-check-lambda/src/globals.py b/lib/workload/stateless/stacks/sample-sheet-check/sample-sheet-check-lambda/src/globals.py
new file mode 100644
index 000000000..133acae2b
--- /dev/null
+++ b/lib/workload/stateless/stacks/sample-sheet-check/sample-sheet-check-lambda/src/globals.py
@@ -0,0 +1,467 @@
+#!/usr/bin/env python
+
+"""
+GLOBALS used in projects
+
+* METADATA SPREAD SHEET
+* SAMPLE SHEET REGEXES
+* GOOGLE LIMS
+* LOGS
+
+"""
+
+import re
+from typing import List
+
+from v2_samplesheet_maker.enums import FastqCompressionFormat
+
+METADATA_COLUMN_NAMES = {
+    "library_id": 'LibraryID',  # the internal ID for the library
+    "sample_name": 'SampleName',  # the sample name assigned by the lab
+    "sample_id": 'SampleID',  # the internal ID for the sample
+    "external_sample_id": 'ExternalSampleID',  # the external ID for the sample
+    "subject_id": 'SubjectID',  # the internal ID for the subject/patient
+    "external_subject_id": "ExternalSubjectID",  # The external subject ID
+    "phenotype": 'Phenotype',  # tumor, normal, negative-control, ...
+    "quality": 'Quality',  # Good, Poor, Borderline
+    "source": 'Source',  # tissue, FFPE, ...
+    "project_name": 'ProjectName',
+    "project_owner": 'ProjectOwner',
+    "experiment_id": "ExperimentID",
+    "type": 'Type',  # the sample type: WGS, WTS, 10X, ...
+    "assay": "Assay",  # the assay type; TsqNano, NebRNA ...
+    "override_cycles": "OverrideCycles",  # The Override cycles list for this run
+    "secondary_analysis": "Workflow",  # ?
+    "coverage": "Coverage (X)",  # ? 
+ "truseq_index": "TruSeq Index, unless stated", # FIXME - this is a terrible column name + "run": "Run#", + "comments": "Comments", + "rrna": "rRNA", + "qpc_id": "qPCR ID", + "sample_id_samplesheet": "Sample_ID (SampleSheet)" # FIXME - this is named 'Sample_ID (SampleSheet)' in the dev spreadsheet +} + + +""" +METADATA SPREAD SHEET +""" + +METADATA_VALIDATION_COLUMN_NAMES = { + "val_phenotype": "PhenotypeValues", + "val_quality": "QualityValues", + "val_source": "SourceValues", + "val_type": "TypeValues", + "val_project_name": "ProjectNameValues", + "val_project_owner": "ProjectOwnerValues", +} + +#METADATA_COLUMN_NAMES.update(METADATA_VALIDATION_COLUMN_NAMES) + +""" +SAMPLE SHEET DATA COLUMNS +""" + +REQUIRED_SAMPLE_SHEET_DATA_COLUMN_NAMES = { + "v1": ["Sample_ID", "Sample_Name", "index"], + "v2": ["Sample_ID", "index"] +} + +VALID_SAMPLE_SHEET_DATA_COLUMN_NAMES = { + # This is the standard + "v1": ["Lane", "Sample_ID", "Sample_Name", "Sample_Plate", "Sample_Well", + "Index_Plate_Well", "I7_Index_ID", "index", + "I5_Index_ID", "index2", "Sample_Project", "Description"], + # This is the future + "v2": ["Lane", "Sample_ID", "index", "index2", "Sample_Project"] +} + + +""" +SAMPLE SHEET REGEXES +""" + +EXPERIMENT_REGEX_STR = { + "top_up": r"(?:_topup\d?)", + "rerun": r"(?:_rerun\d?)" +} + +SAMPLE_ID_REGEX_STR = { + "sample_id_non_control": r"(?:PRJ|CCR|MDX|TGX)\d{6}", + "sample_id_control": r"(?:NTC|PTC)_\w+" +} + +SAMPLE_ID_REGEX_STR["sample_id"] = r"(?:(?:{})|(?:{}))".format( + SAMPLE_ID_REGEX_STR["sample_id_non_control"], + SAMPLE_ID_REGEX_STR["sample_id_control"] +) + +LIBRARY_REGEX_STR = { + "id_int": r"L\d{7}", + "id_ext": r"L{}".format(SAMPLE_ID_REGEX_STR["sample_id"]), + "year": r"(?:L|LPRJ)(\d{2})\d+" +} + +LIBRARY_REGEX_STR["id"] = r"(?:{}|{})(?:{}|{})?".format( + LIBRARY_REGEX_STR["id_int"], + LIBRARY_REGEX_STR["id_ext"], + EXPERIMENT_REGEX_STR["top_up"], # TODO - could a top_up/rerun exist? + EXPERIMENT_REGEX_STR["rerun"] +) + +SAMPLE_REGEX_OBJS = { + # Sample ID: https://regex101.com/r/Z7fvHt/1 + "sample_id": re.compile(SAMPLE_ID_REGEX_STR["sample_id"]), + # https://regex101.com/r/Z7fvHt/2 + "library_id": re.compile(LIBRARY_REGEX_STR["id"]), + # https://regex101.com/r/Yf2t8E/2 + "unique_id_full_match": re.compile("{}_{}".format(SAMPLE_ID_REGEX_STR["sample_id"], LIBRARY_REGEX_STR["id"])), + # https://regex101.com/r/Yf2t8E/3 + # Use brackets to capture the sample id and the library id + "unique_id": re.compile("({})_({})".format(SAMPLE_ID_REGEX_STR["sample_id"], LIBRARY_REGEX_STR["id"])), + # https://regex101.com/r/pkqI1n/1 + "topup": re.compile(EXPERIMENT_REGEX_STR["top_up"]), + # https://regex101.com/r/nNPwQu/1 + "year": re.compile(LIBRARY_REGEX_STR["year"]) +} + +SAMPLESHEET_REGEX_STR = { + "section_header": r"^\[(\S+)\](,+)?" +} + +SAMPLESHEET_REGEX_OBJS = { + # https://regex101.com/r/5nbe9I/1 + "section_header": re.compile(SAMPLESHEET_REGEX_STR["section_header"]) +} + +OVERRIDE_CYCLES_STR = { + "cycles": r"(?:([INYU])(\d*))", + "cycles_full_match": r"(?:[INYU]+(\d*))+", + "indexes": r"((?:[I])(\d*))" +} + +OVERRIDE_CYCLES_OBJS = { + # https://regex101.com/r/U7bJUI/1 + "cycles": re.compile(OVERRIDE_CYCLES_STR["cycles"]), + # https://regex101.com/r/U7bJUI/2 + "cycles_full_match": re.compile(OVERRIDE_CYCLES_STR["cycles_full_match"]), + "indexes": re.compile(OVERRIDE_CYCLES_STR["indexes"]) +} + + + +""" +LOGS +""" +LOGGER_STYLE = "%(asctime)s - %(levelname)-8s - %(module)-25s - %(funcName)-40s : LineNo. 
%(lineno)-4d - %(message)s"
+
+
+"""
+INDEX DISTANCES
+"""
+
+MIN_INDEX_HAMMING_DISTANCE = 3
+
+LOG_DIRECTORY = {
+    "samplesheet_check": "/tmp/samplesheet_check.log"
+}
+
+
+ADAPTERS_BY_KIT = {
+    "truseq": {
+        "adapter_read_1": "AGATCGGAAGAGCACACGTCTGAACTCCAGTCA",
+        "adapter_read_2": "AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT"
+    },
+    "nextera": {
+        "adapter_read_1": "CTGTCTCTTATACACATCT",
+        "adapter_read_2": ""  # null is default and keeps current samplesheet, "" removes value
+    },
+    "pcr_free_tagmentation": {
+        "adapter_read_1": "CTGTCTCTTATACACATCTCCGAGCCCACGAGAC+ATGTGTATAAGAGACA",
+        "adapter_read_2": "CTGTCTCTTATACACATCTCGCAGGGGATAGTCAGATGACGCTGCCGACGA+ATGTGTATAAGAGACA"
+    },
+    "agilent_sureselect_qxt": {
+        "adapter_read_1": "CTGTCTCTTGATCACA",
+        "adapter_read_2": ""  # null is default and keeps current samplesheet, "" removes value
+    },
+}
+
+V2_BCLCONVERT_BASESPACE_URN = "urn:ilmn:ica:pipeline:bf93b5cf-cb27-4dfa-846e-acd6eb081aca#BclConvert_v4_2_7"
+V2_BCLCONVERT_BASESPACE_SOFTWARE_VERSION = "4.2.7"
+
+V2_SAMPLESHEET_BCLCONVERT_ADAPTER_SETTINGS_BY_ASSAY_TYPE = {
+    # All 10X sample types
+    "10X:.*": {
+        "create_fastq_for_index_reads": True,
+        "minimum_trimmed_read_length": 8,
+        "mask_short_reads": 8,
+        # Remove adapters as suggested here:
+        # https://kb.10xgenomics.com/hc/en-us/articles/4424193781517-What-adapters-should-I-use-in-my-IEM-sample-sheet-
+        "adapter_read_1": "",
+        "adapter_read_2": ""
+    },
+    # TSO Assays
+    "ctDNA:ctTSOv2": {
+        "adapter_read_1": ADAPTERS_BY_KIT["nextera"]["adapter_read_1"],
+        "adapter_read_2": ADAPTERS_BY_KIT["nextera"]["adapter_read_1"],  # Not a typo, both adapter reads are the same
+        "adapter_behavior": "trim",
+        "minimum_trimmed_read_length": 35,
+        "mask_short_reads": 35,
+    },
+    "ctDNA:ctTSO|TSODNA|TSORNA": {
+        "adapter_read_1": ADAPTERS_BY_KIT["truseq"]["adapter_read_1"],
+        "adapter_read_2": ADAPTERS_BY_KIT["truseq"]["adapter_read_2"],
+        "adapter_behavior": "trim",
+        "minimum_trimmed_read_length": 35,
+        "mask_short_reads": 35,
+    },
+    # PCR Free Tagmentation Assays (rare)
+    ".*:PCR-Free-Tagmentation": {
+        "adapter_read_1": ADAPTERS_BY_KIT["pcr_free_tagmentation"]["adapter_read_1"],
+        "adapter_read_2": ADAPTERS_BY_KIT["pcr_free_tagmentation"]["adapter_read_2"],
+    },
+    # Minimum Adapter Overlap for all samples set to 3
+    ".*:.*": {
+        "minimum_adapter_overlap": 3
+    }
+}
+
+# Adapter settings can be set per sample
+V2_ADAPTER_SETTINGS = [
+    "barcode_mismatches_index1",
+    "barcode_mismatches_index2",
+    "adapter_read_1",
+    "adapter_read_2",
+    "adapter_behavior",
+    "adapter_stringency"
+]
+
+# Data specific rows for v2
+V2_DATA_ROWS = [
+    "sample_id",
+    "lane",
+    "index",
+    "index2",
+    "sample_project",
+    "sample_name",
+    "library_prep_kit_name"
+]
+
+# Non-adapter settings that can be data settings
+V2_SAMPLESHEET_DATA_SETTINGS = [
+    "override_cycles"
+]
+
+# Samplesheet settings in the BCLConvert_Settings section
+V2_SAMPLESHEET_GLOBAL_SETTINGS = {
+    "minimum_trimmed_read_length": int,
+    "minimum_adapter_overlap": int,
+    "mask_short_reads": int,
+    "override_cycles": int,
+    "trim_umi": bool,
+    "create_fastq_for_index_reads": bool,
+    "no_lane_splitting": bool,
+    "fastq_compression_format": FastqCompressionFormat,
+    "find_adapters_with_indels": bool,
+    "independent_index_collision_check": list,
+}
+
+# Add adapter settings to samplesheet settings
+V2_SAMPLESHEET_DATA_SETTINGS.extend(V2_ADAPTER_SETTINGS)
+
+# Add settings
+V2_DATA_ROWS.extend(V2_SAMPLESHEET_DATA_SETTINGS)
+
+V2_CTTSO_VALID_INDEXES = [
+    {"index_id": "UDP0001", "index": "GAACTGAGCG", "index2": "TCGTGGAGCG", 
"index_rev": "CGCTCAGTTC", "index2_rev": "CGCTCCACGA"}, + {"index_id": "UDP0002", "index": "AGGTCAGATA", "index2": "CTACAAGATA", "index_rev": "TATCTGACCT", "index2_rev": "TATCTTGTAG"}, + {"index_id": "UDP0003", "index": "CGTCTCATAT", "index2": "TATAGTAGCT", "index_rev": "ATATGAGACG", "index2_rev": "AGCTACTATA"}, + {"index_id": "UDP0004", "index": "ATTCCATAAG", "index2": "TGCCTGGTGG", "index_rev": "CTTATGGAAT", "index2_rev": "CCACCAGGCA"}, + {"index_id": "UDP0005", "index": "GACGAGATTA", "index2": "ACATTATCCT", "index_rev": "TAATCTCGTC", "index2_rev": "AGGATAATGT"}, + {"index_id": "UDP0006", "index": "AACATCGCGC", "index2": "GTCCACTTGT", "index_rev": "GCGCGATGTT", "index2_rev": "ACAAGTGGAC"}, + {"index_id": "UDP0007", "index": "CTAGTGCTCT", "index2": "TGGAACAGTA", "index_rev": "AGAGCACTAG", "index2_rev": "TACTGTTCCA"}, + {"index_id": "UDP0008", "index": "GATCAAGGCA", "index2": "CCTTGTTAAT", "index_rev": "TGCCTTGATC", "index2_rev": "ATTAACAAGG"}, + {"index_id": "UDP0009", "index": "GACTGAGTAG", "index2": "GTTGATAGTG", "index_rev": "CTACTCAGTC", "index2_rev": "CACTATCAAC"}, + {"index_id": "UDP0010", "index": "AGTCAGACGA", "index2": "ACCAGCGACA", "index_rev": "TCGTCTGACT", "index2_rev": "TGTCGCTGGT"}, + {"index_id": "UDP0011", "index": "CCGTATGTTC", "index2": "CATACACTGT", "index_rev": "GAACATACGG", "index2_rev": "ACAGTGTATG"}, + {"index_id": "UDP0012", "index": "GAGTCATAGG", "index2": "GTGTGGCGCT", "index_rev": "CCTATGACTC", "index2_rev": "AGCGCCACAC"}, + {"index_id": "UDP0013", "index": "CTTGCCATTA", "index2": "ATCACGAAGG", "index_rev": "TAATGGCAAG", "index2_rev": "CCTTCGTGAT"}, + {"index_id": "UDP0014", "index": "GAAGCGGCAC", "index2": "CGGCTCTACT", "index_rev": "GTGCCGCTTC", "index2_rev": "AGTAGAGCCG"}, + {"index_id": "UDP0015", "index": "TCCATTGCCG", "index2": "GAATGCACGA", "index_rev": "CGGCAATGGA", "index2_rev": "TCGTGCATTC"}, + {"index_id": "UDP0016", "index": "CGGTTACGGC", "index2": "AAGACTATAG", "index_rev": "GCCGTAACCG", "index2_rev": "CTATAGTCTT"}, + {"index_id": "UDP0017", "index": "GAGAATGGTT", "index2": "TCGGCAGCAA", "index_rev": "AACCATTCTC", "index2_rev": "TTGCTGCCGA"}, + {"index_id": "UDP0018", "index": "AGAGGCAACC", "index2": "CTAATGATGG", "index_rev": "GGTTGCCTCT", "index2_rev": "CCATCATTAG"}, + {"index_id": "UDP0019", "index": "CCATCATTAG", "index2": "GGTTGCCTCT", "index_rev": "CTAATGATGG", "index2_rev": "AGAGGCAACC"}, + {"index_id": "UDP0020", "index": "GATAGGCCGA", "index2": "CGCACATGGC", "index_rev": "TCGGCCTATC", "index2_rev": "GCCATGTGCG"}, + {"index_id": "UDP0021", "index": "ATGGTTGACT", "index2": "GGCCTGTCCT", "index_rev": "AGTCAACCAT", "index2_rev": "AGGACAGGCC"}, + {"index_id": "UDP0022", "index": "TATTGCGCTC", "index2": "CTGTGTTAGG", "index_rev": "GAGCGCAATA", "index2_rev": "CCTAACACAG"}, + {"index_id": "UDP0023", "index": "ACGCCTTGTT", "index2": "TAAGGAACGT", "index_rev": "AACAAGGCGT", "index2_rev": "ACGTTCCTTA"}, + {"index_id": "UDP0024", "index": "TTCTACATAC", "index2": "CTAACTGTAA", "index_rev": "GTATGTAGAA", "index2_rev": "TTACAGTTAG"}, + {"index_id": "UDP0025", "index": "AACCATAGAA", "index2": "GGCGAGATGG", "index_rev": "TTCTATGGTT", "index2_rev": "CCATCTCGCC"}, + {"index_id": "UDP0026", "index": "GGTTGCGAGG", "index2": "AATAGAGCAA", "index_rev": "CCTCGCAACC", "index2_rev": "TTGCTCTATT"}, + {"index_id": "UDP0027", "index": "TAAGCATCCA", "index2": "TCAATCCATT", "index_rev": "TGGATGCTTA", "index2_rev": "AATGGATTGA"}, + {"index_id": "UDP0028", "index": "ACCACGACAT", "index2": "TCGTATGCGG", "index_rev": "ATGTCGTGGT", "index2_rev": "CCGCATACGA"}, + {"index_id": 
"UDP0029", "index": "GCCGCACTCT", "index2": "TCCGACCTCG", "index_rev": "AGAGTGCGGC", "index2_rev": "CGAGGTCGGA"}, + {"index_id": "UDP0030", "index": "CCACCAGGCA", "index2": "CTTATGGAAT", "index_rev": "TGCCTGGTGG", "index2_rev": "ATTCCATAAG"}, + {"index_id": "UDP0031", "index": "GTGACACGCA", "index2": "GCTTACGGAC", "index_rev": "TGCGTGTCAC", "index2_rev": "GTCCGTAAGC"}, + {"index_id": "UDP0032", "index": "ACAGTGTATG", "index2": "GAACATACGG", "index_rev": "CATACACTGT", "index2_rev": "CCGTATGTTC"}, + {"index_id": "UDP0033", "index": "TGATTATACG", "index2": "GTCGATTACA", "index_rev": "CGTATAATCA", "index2_rev": "TGTAATCGAC"}, + {"index_id": "UDP0034", "index": "CAGCCGCGTA", "index2": "ACTAGCCGTG", "index_rev": "TACGCGGCTG", "index2_rev": "CACGGCTAGT"}, + {"index_id": "UDP0035", "index": "GGTAACTCGC", "index2": "AAGTTGGTGA", "index_rev": "GCGAGTTACC", "index2_rev": "TCACCAACTT"}, + {"index_id": "UDP0036", "index": "ACCGGCCGTA", "index2": "TGGCAATATT", "index_rev": "TACGGCCGGT", "index2_rev": "AATATTGCCA"}, + {"index_id": "UDP0037", "index": "TGTAATCGAC", "index2": "GATCACCGCG", "index_rev": "GTCGATTACA", "index2_rev": "CGCGGTGATC"}, + {"index_id": "UDP0038", "index": "GTGCAGACAG", "index2": "TACCATCCGT", "index_rev": "CTGTCTGCAC", "index2_rev": "ACGGATGGTA"}, + {"index_id": "UDP0039", "index": "CAATCGGCTG", "index2": "GCTGTAGGAA", "index_rev": "CAGCCGATTG", "index2_rev": "TTCCTACAGC"}, + {"index_id": "UDP0040", "index": "TATGTAGTCA", "index2": "CGCACTAATG", "index_rev": "TGACTACATA", "index2_rev": "CATTAGTGCG"}, + {"index_id": "UDP0041", "index": "ACTCGGCAAT", "index2": "GACAACTGAA", "index_rev": "ATTGCCGAGT", "index2_rev": "TTCAGTTGTC"}, + {"index_id": "UDP0042", "index": "GTCTAATGGC", "index2": "AGTGGTCAGG", "index_rev": "GCCATTAGAC", "index2_rev": "CCTGACCACT"}, + {"index_id": "UDP0043", "index": "CCATCTCGCC", "index2": "TTCTATGGTT", "index_rev": "GGCGAGATGG", "index2_rev": "AACCATAGAA"}, + {"index_id": "UDP0044", "index": "CTGCGAGCCA", "index2": "AATCCGGCCA", "index_rev": "TGGCTCGCAG", "index2_rev": "TGGCCGGATT"}, + {"index_id": "UDP0045", "index": "CGTTATTCTA", "index2": "CCATAAGGTT", "index_rev": "TAGAATAACG", "index2_rev": "AACCTTATGG"}, + {"index_id": "UDP0046", "index": "AGATCCATTA", "index2": "ATCTCTACCA", "index_rev": "TAATGGATCT", "index2_rev": "TGGTAGAGAT"}, + {"index_id": "UDP0047", "index": "GTCCTGGATA", "index2": "CGGTGGCGAA", "index_rev": "TATCCAGGAC", "index2_rev": "TTCGCCACCG"}, + {"index_id": "UDP0048", "index": "CAGTGGCACT", "index2": "TAACAATAGG", "index_rev": "AGTGCCACTG", "index2_rev": "CCTATTGTTA"}, + {"index_id": "UDP0049", "index": "AGTGTTGCAC", "index2": "CTGGTACACG", "index_rev": "GTGCAACACT", "index2_rev": "CGTGTACCAG"}, + {"index_id": "UDP0050", "index": "GACACCATGT", "index2": "TCAACGTGTA", "index_rev": "ACATGGTGTC", "index2_rev": "TACACGTTGA"}, + {"index_id": "UDP0051", "index": "CCTGTCTGTC", "index2": "ACTGTTGTGA", "index_rev": "GACAGACAGG", "index2_rev": "TCACAACAGT"}, + {"index_id": "UDP0052", "index": "TGATGTAAGA", "index2": "GTGCGTCCTT", "index_rev": "TCTTACATCA", "index2_rev": "AAGGACGCAC"}, + {"index_id": "UDP0053", "index": "GGAATTGTAA", "index2": "AGCACATCCT", "index_rev": "TTACAATTCC", "index2_rev": "AGGATGTGCT"}, + {"index_id": "UDP0054", "index": "GCATAAGCTT", "index2": "TTCCGTCGCA", "index_rev": "AAGCTTATGC", "index2_rev": "TGCGACGGAA"}, + {"index_id": "UDP0055", "index": "CTGAGGAATA", "index2": "CTTAACCACT", "index_rev": "TATTCCTCAG", "index2_rev": "AGTGGTTAAG"}, + {"index_id": "UDP0056", "index": "AACGCACGAG", "index2": "GCCTCGGATA", "index_rev": 
"CTCGTGCGTT", "index2_rev": "TATCCGAGGC"}, + {"index_id": "UDP0057", "index": "TCTATCCTAA", "index2": "CGTCGACTGG", "index_rev": "TTAGGATAGA", "index2_rev": "CCAGTCGACG"}, + {"index_id": "UDP0058", "index": "CTCGCTTCGG", "index2": "TACTAGTCAA", "index_rev": "CCGAAGCGAG", "index2_rev": "TTGACTAGTA"}, + {"index_id": "UDP0059", "index": "CTGTTGGTCC", "index2": "ATAGACCGTT", "index_rev": "GGACCAACAG", "index2_rev": "AACGGTCTAT"}, + {"index_id": "UDP0060", "index": "TTACCTGGAA", "index2": "ACAGTTCCAG", "index_rev": "TTCCAGGTAA", "index2_rev": "CTGGAACTGT"}, + {"index_id": "UDP0061", "index": "TGGCTAATCA", "index2": "AGGCATGTAG", "index_rev": "TGATTAGCCA", "index2_rev": "CTACATGCCT"}, + {"index_id": "UDP0062", "index": "AACACTGTTA", "index2": "GCAAGTCTCA", "index_rev": "TAACAGTGTT", "index2_rev": "TGAGACTTGC"}, + {"index_id": "UDP0063", "index": "ATTGCGCGGT", "index2": "TTGGCTCCGC", "index_rev": "ACCGCGCAAT", "index2_rev": "GCGGAGCCAA"}, + {"index_id": "UDP0064", "index": "TGGCGCGAAC", "index2": "AACTGATACT", "index_rev": "GTTCGCGCCA", "index2_rev": "AGTATCAGTT"}, + {"index_id": "UDP0065", "index": "TAATGTGTCT", "index2": "GTAAGGCATA", "index_rev": "AGACACATTA", "index2_rev": "TATGCCTTAC"}, + {"index_id": "UDP0066", "index": "ATACCAACGC", "index2": "AATTGCTGCG", "index_rev": "GCGTTGGTAT", "index2_rev": "CGCAGCAATT"}, + {"index_id": "UDP0067", "index": "AGGATGTGCT", "index2": "TTACAATTCC", "index_rev": "AGCACATCCT", "index2_rev": "GGAATTGTAA"}, + {"index_id": "UDP0068", "index": "CACGGAACAA", "index2": "AACCTAGCAC", "index_rev": "TTGTTCCGTG", "index2_rev": "GTGCTAGGTT"}, + {"index_id": "UDP0069", "index": "TGGAGTACTT", "index2": "TCTGTGTGGA", "index_rev": "AAGTACTCCA", "index2_rev": "TCCACACAGA"}, + {"index_id": "UDP0070", "index": "GTATTGACGT", "index2": "GGAATTCCAA", "index_rev": "ACGTCAATAC", "index2_rev": "TTGGAATTCC"}, + {"index_id": "UDP0071", "index": "CTTGTACACC", "index2": "AAGCGCGCTT", "index_rev": "GGTGTACAAG", "index2_rev": "AAGCGCGCTT"}, + {"index_id": "UDP0072", "index": "ACACAGGTGG", "index2": "TGAGCGTTGT", "index_rev": "CCACCTGTGT", "index2_rev": "ACAACGCTCA"}, + {"index_id": "UDP0073", "index": "CCTGCGGAAC", "index2": "ATCATAGGCT", "index_rev": "GTTCCGCAGG", "index2_rev": "AGCCTATGAT"}, + {"index_id": "UDP0074", "index": "TTCATAAGGT", "index2": "TGTTAGAAGG", "index_rev": "ACCTTATGAA", "index2_rev": "CCTTCTAACA"}, + {"index_id": "UDP0075", "index": "CTCTGCAGCG", "index2": "GATGGATGTA", "index_rev": "CGCTGCAGAG", "index2_rev": "TACATCCATC"}, + {"index_id": "UDP0076", "index": "CTGACTCTAC", "index2": "ACGGCCGTCA", "index_rev": "GTAGAGTCAG", "index2_rev": "TGACGGCCGT"}, + {"index_id": "UDP0077", "index": "TCTGGTATCC", "index2": "CGTTGCTTAC", "index_rev": "GGATACCAGA", "index2_rev": "GTAAGCAACG"}, + {"index_id": "UDP0078", "index": "CATTAGTGCG", "index2": "TGACTACATA", "index_rev": "CGCACTAATG", "index2_rev": "TATGTAGTCA"}, + {"index_id": "UDP0079", "index": "ACGGTCAGGA", "index2": "CGGCCTCGTT", "index_rev": "TCCTGACCGT", "index2_rev": "AACGAGGCCG"}, + {"index_id": "UDP0080", "index": "GGCAAGCCAG", "index2": "CAAGCATCCG", "index_rev": "CTGGCTTGCC", "index2_rev": "CGGATGCTTG"}, + {"index_id": "UDP0081", "index": "TGTCGCTGGT", "index2": "TCGTCTGACT", "index_rev": "ACCAGCGACA", "index2_rev": "AGTCAGACGA"}, + {"index_id": "UDP0082", "index": "ACCGTTACAA", "index2": "CTCATAGCGA", "index_rev": "TTGTAACGGT", "index2_rev": "TCGCTATGAG"}, + {"index_id": "UDP0083", "index": "TATGCCTTAC", "index2": "AGACACATTA", "index_rev": "GTAAGGCATA", "index2_rev": "TAATGTGTCT"}, + {"index_id": "UDP0084", 
"index": "ACAAGTGGAC", "index2": "GCGCGATGTT", "index_rev": "GTCCACTTGT", "index2_rev": "AACATCGCGC"}, + {"index_id": "UDP0085", "index": "TGGTACCTAA", "index2": "CATGAGTACT", "index_rev": "TTAGGTACCA", "index2_rev": "AGTACTCATG"}, + {"index_id": "UDP0086", "index": "TTGGAATTCC", "index2": "ACGTCAATAC", "index_rev": "GGAATTCCAA", "index2_rev": "GTATTGACGT"}, + {"index_id": "UDP0087", "index": "CCTCTACATG", "index2": "GATACCTCCT", "index_rev": "CATGTAGAGG", "index2_rev": "AGGAGGTATC"}, + {"index_id": "UDP0088", "index": "GGAGCGTGTA", "index2": "ATCCGTAAGT", "index_rev": "TACACGCTCC", "index2_rev": "ACTTACGGAT"}, + {"index_id": "UDP0089", "index": "GTCCGTAAGC", "index2": "CGTGTATCTT", "index_rev": "GCTTACGGAC", "index2_rev": "AAGATACACG"}, + {"index_id": "UDP0090", "index": "ACTTCAAGCG", "index2": "GAACCATGAA", "index_rev": "CGCTTGAAGT", "index2_rev": "TTCATGGTTC"}, + {"index_id": "UDP0091", "index": "TCAGAAGGCG", "index2": "GGCCATCATA", "index_rev": "CGCCTTCTGA", "index2_rev": "TATGATGGCC"}, + {"index_id": "UDP0092", "index": "GCGTTGGTAT", "index2": "ACATACTTCC", "index_rev": "ATACCAACGC", "index2_rev": "GGAAGTATGT"}, + {"index_id": "UDP0093", "index": "ACATATCCAG", "index2": "TATGTGCAAT", "index_rev": "CTGGATATGT", "index2_rev": "ATTGCACATA"}, + {"index_id": "UDP0094", "index": "TCATAGATTG", "index2": "GATTAAGGTG", "index_rev": "CAATCTATGA", "index2_rev": "CACCTTAATC"}, + {"index_id": "UDP0095", "index": "GTATTCCACC", "index2": "ATGTAGACAA", "index_rev": "GGTGGAATAC", "index2_rev": "TTGTCTACAT"}, + {"index_id": "UDP0096", "index": "CCTCCGTCCA", "index2": "CACATCGGTG", "index_rev": "TGGACGGAGG", "index2_rev": "CACCGATGTG"}, + {"index_id": "UDP0097", "index": "TGCCGGTCAG", "index2": "CCTGATACAA", "index_rev": "CTGACCGGCA", "index2_rev": "TTGTATCAGG"}, + {"index_id": "UDP0098", "index": "CACTCAATTC", "index2": "TTAAGTTGTG", "index_rev": "GAATTGAGTG", "index2_rev": "CACAACTTAA"}, + {"index_id": "UDP0099", "index": "TCTCACACGC", "index2": "CGGACAGTGA", "index_rev": "GCGTGTGAGA", "index2_rev": "TCACTGTCCG"}, + {"index_id": "UDP0100", "index": "TCAATGGAGA", "index2": "GCACTACAAC", "index_rev": "TCTCCATTGA", "index2_rev": "GTTGTAGTGC"}, + {"index_id": "UDP0101", "index": "ATATGCATGT", "index2": "TGGTGCCTGG", "index_rev": "ACATGCATAT", "index2_rev": "CCAGGCACCA"}, + {"index_id": "UDP0102", "index": "ATGGCGCCTG", "index2": "TCCACGGCCT", "index_rev": "CAGGCGCCAT", "index2_rev": "AGGCCGTGGA"}, + {"index_id": "UDP0103", "index": "TCCGTTATGT", "index2": "TTGTAGTGTA", "index_rev": "ACATAACGGA", "index2_rev": "TACACTACAA"}, + {"index_id": "UDP0104", "index": "GGTCTATTAA", "index2": "CCACGACACG", "index_rev": "TTAATAGACC", "index2_rev": "CGTGTCGTGG"}, + {"index_id": "UDP0105", "index": "CAGCAATCGT", "index2": "TGTGATGTAT", "index_rev": "ACGATTGCTG", "index2_rev": "ATACATCACA"}, + {"index_id": "UDP0106", "index": "TTCTGTAGAA", "index2": "GAGCGCAATA", "index_rev": "TTCTACAGAA", "index2_rev": "TATTGCGCTC"}, + {"index_id": "UDP0107", "index": "GAACGCAATA", "index2": "ATCTTACTGT", "index_rev": "TATTGCGTTC", "index2_rev": "ACAGTAAGAT"}, + {"index_id": "UDP0108", "index": "AGTACTCATG", "index2": "ATGTCGTGGT", "index_rev": "CATGAGTACT", "index2_rev": "ACCACGACAT"}, + {"index_id": "UDP0109", "index": "GGTAGAATTA", "index2": "GTAGCCATCA", "index_rev": "TAATTCTACC", "index2_rev": "TGATGGCTAC"}, + {"index_id": "UDP0110", "index": "TAATTAGCGT", "index2": "TGGTTAAGAA", "index_rev": "ACGCTAATTA", "index2_rev": "TTCTTAACCA"}, + {"index_id": "UDP0111", "index": "ATTAACAAGG", "index2": "TGTTGTTCGT", "index_rev": 
"CCTTGTTAAT", "index2_rev": "ACGAACAACA"}, + {"index_id": "UDP0112", "index": "TGATGGCTAC", "index2": "CCAACAACAT", "index_rev": "GTAGCCATCA", "index2_rev": "ATGTTGTTGG"}, + {"index_id": "UDP0113", "index": "GAATTACAAG", "index2": "ACCGGCTCAG", "index_rev": "CTTGTAATTC", "index2_rev": "CTGAGCCGGT"}, + {"index_id": "UDP0114", "index": "TAGAATTGGA", "index2": "GTTAATCTGA", "index_rev": "TCCAATTCTA", "index2_rev": "TCAGATTAAC"}, + {"index_id": "UDP0115", "index": "AGGCAGCTCT", "index2": "CGGCTAACGT", "index_rev": "AGAGCTGCCT", "index2_rev": "ACGTTAGCCG"}, + {"index_id": "UDP0116", "index": "ATCGGCGAAG", "index2": "TCCAAGAATT", "index_rev": "CTTCGCCGAT", "index2_rev": "AATTCTTGGA"}, + {"index_id": "UDP0117", "index": "CCGTGACCGA", "index2": "CCGAACGTTG", "index_rev": "TCGGTCACGG", "index2_rev": "CAACGTTCGG"}, + {"index_id": "UDP0118", "index": "ATACTTGTTC", "index2": "TAACCGCCGA", "index_rev": "GAACAAGTAT", "index2_rev": "TCGGCGGTTA"}, + {"index_id": "UDP0119", "index": "TCCGCCAATT", "index2": "CTCCGTGCTG", "index_rev": "AATTGGCGGA", "index2_rev": "CAGCACGGAG"}, + {"index_id": "UDP0120", "index": "AGGACAGGCC", "index2": "CATTCCAGCT", "index_rev": "GGCCTGTCCT", "index2_rev": "AGCTGGAATG"}, + {"index_id": "UDP0121", "index": "AGAGAACCTA", "index2": "GGTTATGCTA", "index_rev": "TAGGTTCTCT", "index2_rev": "TAGCATAACC"}, + {"index_id": "UDP0122", "index": "GATATTGTGT", "index2": "ACCACACGGT", "index_rev": "ACACAATATC", "index2_rev": "ACCGTGTGGT"}, + {"index_id": "UDP0123", "index": "CGTACAGGAA", "index2": "TAGGTTCTCT", "index_rev": "TTCCTGTACG", "index2_rev": "AGAGAACCTA"}, + {"index_id": "UDP0124", "index": "CTGCGTTACC", "index2": "TATGGCTCGA", "index_rev": "GGTAACGCAG", "index2_rev": "TCGAGCCATA"}, + {"index_id": "UDP0125", "index": "AGGCCGTGGA", "index2": "CTCGTGCGTT", "index_rev": "TCCACGGCCT", "index2_rev": "AACGCACGAG"}, + {"index_id": "UDP0126", "index": "AGGAGGTATC", "index2": "CCAGTTGGCA", "index_rev": "GATACCTCCT", "index2_rev": "TGCCAACTGG"}, + {"index_id": "UDP0127", "index": "GCTGACGTTG", "index2": "TGTTCGCATT", "index_rev": "CAACGTCAGC", "index2_rev": "AATGCGAACA"}, + {"index_id": "UDP0128", "index": "CTAATAACCG", "index2": "AACCGCATCG", "index_rev": "CGGTTATTAG", "index2_rev": "CGATGCGGTT"}, + {"index_id": "UDP0129", "index": "TCTAGGCGCG", "index2": "CGAAGGTTAA", "index_rev": "CGCGCCTAGA", "index2_rev": "TTAACCTTCG"}, + {"index_id": "UDP0130", "index": "ATAGCCAAGA", "index2": "AGTGCCACTG", "index_rev": "TCTTGGCTAT", "index2_rev": "CAGTGGCACT"}, + {"index_id": "UDP0131", "index": "TTCGGTGTGA", "index2": "GAACAAGTAT", "index_rev": "TCACACCGAA", "index2_rev": "ATACTTGTTC"}, + {"index_id": "UDP0132", "index": "ATGTAACGTT", "index2": "ACGATTGCTG", "index_rev": "AACGTTACAT", "index2_rev": "CAGCAATCGT"}, + {"index_id": "UDP0133", "index": "AACGAGGCCG", "index2": "ATACCTGGAT", "index_rev": "CGGCCTCGTT", "index2_rev": "ATCCAGGTAT"}, + {"index_id": "UDP0134", "index": "TGGTGTTATG", "index2": "TCCAATTCTA", "index_rev": "CATAACACCA", "index2_rev": "TAGAATTGGA"}, + {"index_id": "UDP0135", "index": "TGGCCTCTGT", "index2": "TGAGACAGCG", "index_rev": "ACAGAGGCCA", "index2_rev": "CGCTGTCTCA"}, + {"index_id": "UDP0136", "index": "CCAGGCACCA", "index2": "ACGCTAATTA", "index_rev": "TGGTGCCTGG", "index2_rev": "TAATTAGCGT"}, + {"index_id": "UDP0137", "index": "CCGGTTCCTA", "index2": "TATATTCGAG", "index_rev": "TAGGAACCGG", "index2_rev": "CTCGAATATA"}, + {"index_id": "UDP0138", "index": "GGCCAATATT", "index2": "CGGTCCGATA", "index_rev": "AATATTGGCC", "index2_rev": "TATCGGACCG"}, + {"index_id": "UDP0139", 
"index": "GAATACCTAT", "index2": "ACAATAGAGT", "index_rev": "ATAGGTATTC", "index2_rev": "ACTCTATTGT"}, + {"index_id": "UDP0140", "index": "TACGTGAAGG", "index2": "CGGTTATTAG", "index_rev": "CCTTCACGTA", "index2_rev": "CTAATAACCG"}, + {"index_id": "UDP0141", "index": "CTTATTGGCC", "index2": "GATAACAAGT", "index_rev": "GGCCAATAAG", "index2_rev": "ACTTGTTATC"}, + {"index_id": "UDP0142", "index": "ACAACTACTG", "index2": "AGTTATCACA", "index_rev": "CAGTAGTTGT", "index2_rev": "TGTGATAACT"}, + {"index_id": "UDP0143", "index": "GTTGGATGAA", "index2": "TTCCAGGTAA", "index_rev": "TTCATCCAAC", "index2_rev": "TTACCTGGAA"}, + {"index_id": "UDP0144", "index": "AATCCAATTG", "index2": "CATGTAGAGG", "index_rev": "CAATTGGATT", "index2_rev": "CCTCTACATG"}, + {"index_id": "UDP0145", "index": "TATGATGGCC", "index2": "GATTGTCATA", "index_rev": "GGCCATCATA", "index2_rev": "TATGACAATC"}, + {"index_id": "UDP0146", "index": "CGCAGCAATT", "index2": "ATTCCGCTAT", "index_rev": "AATTGCTGCG", "index2_rev": "ATAGCGGAAT"}, + {"index_id": "UDP0147", "index": "ACGTTCCTTA", "index2": "GACCGCTGTG", "index_rev": "TAAGGAACGT", "index2_rev": "CACAGCGGTC"}, + {"index_id": "UDP0148", "index": "CCGCGTATAG", "index2": "TAGGAACCGG", "index_rev": "CTATACGCGG", "index2_rev": "CCGGTTCCTA"}, + {"index_id": "UDP0149", "index": "GATTCTGAAT", "index2": "AGCGGTGGAC", "index_rev": "ATTCAGAATC", "index2_rev": "GTCCACCGCT"}, + {"index_id": "UDP0150", "index": "TAGAGAATAC", "index2": "TATAGATTCG", "index_rev": "GTATTCTCTA", "index2_rev": "CGAATCTATA"}, + {"index_id": "UDP0151", "index": "TTGTATCAGG", "index2": "ACAGAGGCCA", "index_rev": "CCTGATACAA", "index2_rev": "TGGCCTCTGT"}, + {"index_id": "UDP0152", "index": "CACAGCGGTC", "index2": "ATTCCTATTG", "index_rev": "GACCGCTGTG", "index2_rev": "CAATAGGAAT"}, + {"index_id": "UDP0153", "index": "CCACGCTGAA", "index2": "TATTCCTCAG", "index_rev": "TTCAGCGTGG", "index2_rev": "CTGAGGAATA"}, + {"index_id": "UDP0154", "index": "GTTCGGAGTT", "index2": "CGCCTTCTGA", "index_rev": "AACTCCGAAC", "index2_rev": "TCAGAAGGCG"}, + {"index_id": "UDP0155", "index": "ATAGCGGAAT", "index2": "GCGCAGAGTA", "index_rev": "ATTCCGCTAT", "index2_rev": "TACTCTGCGC"}, + {"index_id": "UDP0156", "index": "GCAATATTCA", "index2": "GGCGCCAATT", "index_rev": "TGAATATTGC", "index2_rev": "AATTGGCGCC"}, + {"index_id": "UDP0157", "index": "CTAGATTGCG", "index2": "AGATATGGCG", "index_rev": "CGCAATCTAG", "index2_rev": "CGCCATATCT"}, + {"index_id": "UDP0158", "index": "CGATGCGGTT", "index2": "CCTGCTTGGT", "index_rev": "AACCGCATCG", "index2_rev": "ACCAAGCAGG"}, + {"index_id": "UDP0159", "index": "TCCGGACTAG", "index2": "GACGAACAAT", "index_rev": "CTAGTCCGGA", "index2_rev": "ATTGTTCGTC"}, + {"index_id": "UDP0160", "index": "GTGACGGAGC", "index2": "TGGCGGTCCA", "index_rev": "GCTCCGTCAC", "index2_rev": "TGGACCGCCA"}, + {"index_id": "UDP0161", "index": "AATTCCATCT", "index2": "CTTCAGTTAC", "index_rev": "AGATGGAATT", "index2_rev": "GTAACTGAAG"}, + {"index_id": "UDP0162", "index": "TTAACGGTGT", "index2": "TCCTGACCGT", "index_rev": "ACACCGTTAA", "index2_rev": "ACGGTCAGGA"}, + {"index_id": "UDP0163", "index": "ACTTGTTATC", "index2": "CGCGCCTAGA", "index_rev": "GATAACAAGT", "index2_rev": "TCTAGGCGCG"}, + {"index_id": "UDP0164", "index": "CGTGTACCAG", "index2": "AGGATAAGTT", "index_rev": "CTGGTACACG", "index2_rev": "AACTTATCCT"}, + {"index_id": "UDP0165", "index": "TTAACCTTCG", "index2": "AGGCCAGACA", "index_rev": "CGAAGGTTAA", "index2_rev": "TGTCTGGCCT"}, + {"index_id": "UDP0166", "index": "CATATGCGAT", "index2": "CCTTGAACGG", "index_rev": 
"ATCGCATATG", "index2_rev": "CCGTTCAAGG"}, + {"index_id": "UDP0167", "index": "AGCCTATGAT", "index2": "CACCACCTAC", "index_rev": "ATCATAGGCT", "index2_rev": "GTAGGTGGTG"}, + {"index_id": "UDP0168", "index": "TATGACAATC", "index2": "TTGCTTGTAT", "index_rev": "GATTGTCATA", "index2_rev": "ATACAAGCAA"}, + {"index_id": "UDP0169", "index": "ATGTTGTTGG", "index2": "CAATCTATGA", "index_rev": "CCAACAACAT", "index2_rev": "TCATAGATTG"}, + {"index_id": "UDP0170", "index": "GCACCACCAA", "index2": "TGGTACTGAT", "index_rev": "TTGGTGGTGC", "index2_rev": "ATCAGTACCA"}, + {"index_id": "UDP0171", "index": "AGGCGTTCGC", "index2": "TTCATCCAAC", "index_rev": "GCGAACGCCT", "index2_rev": "GTTGGATGAA"}, + {"index_id": "UDP0172", "index": "CCTCCGGTTG", "index2": "CATAACACCA", "index_rev": "CAACCGGAGG", "index2_rev": "TGGTGTTATG"}, + {"index_id": "UDP0173", "index": "GTCCACCGCT", "index2": "TCCTATTAGC", "index_rev": "AGCGGTGGAC", "index2_rev": "GCTAATAGGA"}, + {"index_id": "UDP0174", "index": "ATTGTTCGTC", "index2": "TCTCTAGATT", "index_rev": "GACGAACAAT", "index2_rev": "AATCTAGAGA"}, + {"index_id": "UDP0175", "index": "GGACCAGTGG", "index2": "CGCGAGCCTA", "index_rev": "CCACTGGTCC", "index2_rev": "TAGGCTCGCG"}, + {"index_id": "UDP0176", "index": "CCTTCTAACA", "index2": "GATAAGCTCT", "index_rev": "TGTTAGAAGG", "index2_rev": "AGAGCTTATC"}, + {"index_id": "UDP0177", "index": "CTCGAATATA", "index2": "GAGATGTCGA", "index_rev": "TATATTCGAG", "index2_rev": "TCGACATCTC"}, + {"index_id": "UDP0178", "index": "GATCGTCGCG", "index2": "CTGGATATGT", "index_rev": "CGCGACGATC", "index2_rev": "ACATATCCAG"}, + {"index_id": "UDP0179", "index": "TATCCGAGGC", "index2": "GGCCAATAAG", "index_rev": "GCCTCGGATA", "index2_rev": "CTTATTGGCC"}, + {"index_id": "UDP0180", "index": "CGCTGTCTCA", "index2": "ATTACTCACC", "index_rev": "TGAGACAGCG", "index2_rev": "GGTGAGTAAT"}, + {"index_id": "UDP0181", "index": "AATGCGAACA", "index2": "AATTGGCGGA", "index_rev": "TGTTCGCATT", "index2_rev": "TCCGCCAATT"}, + {"index_id": "UDP0182", "index": "AATTCTTGGA", "index2": "TTGTCAACTT", "index_rev": "TCCAAGAATT", "index2_rev": "AAGTTGACAA"}, + {"index_id": "UDP0183", "index": "TTCCTACAGC", "index2": "GGCGAATTCT", "index_rev": "GCTGTAGGAA", "index2_rev": "AGAATTCGCC"}, + {"index_id": "UDP0184", "index": "ATCCAGGTAT", "index2": "CAACGTCAGC", "index_rev": "ATACCTGGAT", "index2_rev": "GCTGACGTTG"}, + {"index_id": "UDP0185", "index": "ACGGTCCAAC", "index2": "TCTTACATCA", "index_rev": "GTTGGACCGT", "index2_rev": "TGATGTAAGA"}, + {"index_id": "UDP0186", "index": "GTAACTTGGT", "index2": "CGCCATACCT", "index_rev": "ACCAAGTTAC", "index2_rev": "AGGTATGGCG"}, + {"index_id": "UDP0187", "index": "AGCGCCACAC", "index2": "CTAATGTCTT", "index_rev": "GTGTGGCGCT", "index2_rev": "AAGACATTAG"}, + {"index_id": "UDP0188", "index": "TGCTACTGCC", "index2": "CAACCGGAGG", "index_rev": "GGCAGTAGCA", "index2_rev": "CCTCCGGTTG"}, + {"index_id": "UDP0189", "index": "CAACACCGCA", "index2": "GGCAGTAGCA", "index_rev": "TGCGGTGTTG", "index2_rev": "TGCTACTGCC"}, + {"index_id": "UDP0190", "index": "CACCTTAATC", "index2": "TTAGGATAGA", "index_rev": "GATTAAGGTG", "index2_rev": "TCTATCCTAA"}, + {"index_id": "UDP0191", "index": "TTGAATGTTG", "index2": "CGCAATCTAG", "index_rev": "CAACATTCAA", "index2_rev": "CTAGATTGCG"}, + {"index_id": "UDP0192", "index": "CCGGTAACAC", "index2": "GAGTTGTACT", "index_rev": "GTGTTACCGG", "index2_rev": "AGTACAACTC"} +] \ No newline at end of file diff --git a/lib/workload/stateless/stacks/sample-sheet-check/sample-sheet-check-lambda/src/http.py 
b/lib/workload/stateless/stacks/sample-sheet-check/sample-sheet-check-lambda/src/http.py new file mode 100644 index 000000000..9a681f214 --- /dev/null +++ b/lib/workload/stateless/stacks/sample-sheet-check/sample-sheet-check-lambda/src/http.py @@ -0,0 +1,55 @@ +import json +from typing import Literal + + +def construct_body(check_status: Literal["PASS", "FAIL"], error_message: str = '', log_path='', + v2_sample_sheet: str = ''): + """ + Parameters + ---------- + check_status : One of 'PASS' or 'FAIL' + + error_message : The error message to return + log_path : The path to the log file + + v2_sample_sheet : The string representation of the v2 samplesheet + + Return + ---------- + error_message : str + any error message that stops the check + """ + + # Get Log Data + with open(log_path, 'r') as log_file: + log_text = log_file.read() + + body = { + "check_status": check_status, + "log_file": log_text, + "error_message": error_message, + "v2_sample_sheet": v2_sample_sheet + } + return json.dumps(body) + + +def construct_response(status_code, body, origin: str): + """Construct response from parameter""" + + if not origin.endswith('umccr.org'): + origin = 'https://umccr.org' + + response = { + 'statusCode': status_code, + 'headers': { + 'Access-Control-Allow-Headers': 'Content-Type', + 'Access-Control-Allow-Origin': origin, + 'Access-Control-Allow-Methods': 'OPTIONS,POST,GET', + 'Content-Type': 'application/json', + }, + } + + if body: + response['body'] = body + + return response diff --git a/lib/workload/stateless/stacks/sample-sheet-check/sample-sheet-check-lambda/src/logger.py b/lib/workload/stateless/stacks/sample-sheet-check/sample-sheet-check-lambda/src/logger.py new file mode 100644 index 000000000..8ebdbc90f --- /dev/null +++ b/lib/workload/stateless/stacks/sample-sheet-check/sample-sheet-check-lambda/src/logger.py @@ -0,0 +1,93 @@ +#!/usr/bin/env python3 + +import inspect +import logging +import os +from logging.handlers import RotatingFileHandler +from src.globals import LOGGER_STYLE + + +def get_caller_function(): + """ + Get the function that was used to call the previous + Some loggers report + :return: + """ + # Get the inspect stack trace + inspect_stack = inspect.stack() + + # Since we're already in a function, we need the third attribute + # i.e function of interest -> function that called this one -> this function + frame_info = inspect_stack[2] + + # Required attribute is' function + function_id = getattr(frame_info, "function", None) + + if function_id is None: + # Don't really want to break on this just yet but code is ready to go for it. 
+            return None
+        else:
+            return function_id
+
+
+def set_basic_logger():
+    """
+    Set up a basic stderr logger before the deployment configuration determines where logs are written
+    :return:
+    """
+    # Get a basic logger
+    logger = logging.getLogger()
+
+    # Get a stderr handler
+    console = logging.StreamHandler()
+
+    # Set level
+    console.setLevel(logging.DEBUG)
+
+    # Set format
+    formatter = logging.Formatter(LOGGER_STYLE)
+    console.setFormatter(formatter)
+
+    # Attach the console handler to the logger
+    logger.addHandler(console)
+
+    return logger
+
+
+def set_logger(log_path, log_level=logging.DEBUG):
+    """
+    Initialise a logger with a rotating file handler and a console handler
+    :return:
+    """
+    if os.path.exists(log_path):
+        os.remove(log_path)
+    with open(log_path, 'w') as f:
+        f.write("")
+
+    new_logger = logging.getLogger()
+    new_logger.setLevel(log_level)
+
+    # create a logging format
+    formatter = logging.Formatter(LOGGER_STYLE)
+
+    # create a file handler
+    file_handler = RotatingFileHandler(filename=log_path, mode='w', maxBytes=100000000, backupCount=5)
+    # Set Level
+    file_handler.setLevel(log_level)
+    file_handler.setFormatter(formatter)
+
+    # create a console handler
+    console_handler = logging.StreamHandler()
+    # Hard-coded as we don't need too much verbosity on the console side
+    console_handler.setLevel(logging.INFO)
+    console_handler.setFormatter(formatter)
+
+    # add the handlers to the logger
+    new_logger.addHandler(file_handler)
+    new_logger.addHandler(console_handler)
+
+
+def get_logger():
+    """
+    Return logger object
+    :return:
+    """
+    function_that_called_this_one = get_caller_function()
+    return logging.getLogger(function_that_called_this_one) diff --git a/lib/workload/stateless/stacks/sample-sheet-check/sample-sheet-check-lambda/src/metadata.py b/lib/workload/stateless/stacks/sample-sheet-check/sample-sheet-check-lambda/src/metadata.py new file mode 100644 index 000000000..c88589a1f --- /dev/null +++ b/lib/workload/stateless/stacks/sample-sheet-check/sample-sheet-check-lambda/src/metadata.py @@ -0,0 +1,51 @@
+import os
+import urllib.request
+from typing import List
+import json
+
+# Grab API constants from environment variables
+DOMAIN_NAME = os.environ.get("DATA_PORTAL_DOMAIN_NAME", "dev.umccr.org")
+METADATA_API_PATH = 'api/v1/library'
+METADATA_SUBDOMAIN = 'metadata'
+
+
+def get_metadata_record_from_array_of_field_name(auth_header: str, field_name: str,
+                                                 value_list: List[str]):
+    # Define header request
+    headers = {
+        'Authorization': auth_header
+    }
+
+    # Remove any duplicates for API efficiency
+    value_list = list(set(value_list))
+
+    # Result variable
+    query_result = []
+
+    max_number_of_library_per_api_call = 300
+    for i in range(0, len(value_list), max_number_of_library_per_api_call):
+
+        # Define start and stop element from the list
+        start_index = i
+        end_index = start_index + max_number_of_library_per_api_call
+
+        array_to_process = value_list[start_index:end_index]
+
+        # Define query string
+        query_param_string = f'&{field_name}='.join(array_to_process)
+        query_param_string = f'?{field_name}=' + query_param_string  # Prepend the field name
+
+        query_param_string = query_param_string + '&rowsPerPage=1000'  # Add rows per page (1000 rows is the maximum)
+
+        url = f"https://{METADATA_SUBDOMAIN.strip('.')}.{DOMAIN_NAME.strip('.')}/{METADATA_API_PATH.strip('/')}/{query_param_string}"
+        # Follow pagination links until no data is left
+        while url is not None:
+            req = urllib.request.Request(url, headers=headers)
+            with urllib.request.urlopen(req) as response:
+                if response.status < 200 or response.status >= 300:
+                    raise ValueError(f'Non-2xx status code returned: {response.status}')
+
+
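This fetch helper batches library ids into chunks of 300 and then follows each response's `links.next` cursor until the listing is exhausted. Below is a minimal sketch of that pagination contract against a stubbed fetcher: the `{"results": ..., "links": {"next": ...}}` response shape is taken from this module, while the page names and records are made up for illustration.

```python
# Hypothetical pages mimicking the {"results", "links": {"next"}} shape used above
PAGES = {
    "page1": {"results": [{"libraryId": "L2000001"}], "links": {"next": "page2"}},
    "page2": {"results": [{"libraryId": "L2000002"}], "links": {"next": None}},
}

def fetch(url: str) -> dict:
    # Stand-in for urllib.request.urlopen + json.loads
    return PAGES[url]

results = []
url = "page1"
while url is not None:
    response_json = fetch(url)
    results.extend(response_json["results"])
    url = response_json["links"]["next"]

assert [r["libraryId"] for r in results] == ["L2000001", "L2000002"]
```

Cursor-style pagination like this keeps the client logic identical whether the listing spans one page or many.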
response_json = json.loads(response.read().decode()) + query_result.extend(response_json["results"]) + url = response_json["links"]["next"] + return query_result diff --git a/lib/workload/stateless/stacks/sample-sheet-check/sample-sheet-check-lambda/src/samplesheet.py b/lib/workload/stateless/stacks/sample-sheet-check/sample-sheet-check-lambda/src/samplesheet.py new file mode 100644 index 000000000..0ebd800d8 --- /dev/null +++ b/lib/workload/stateless/stacks/sample-sheet-check/sample-sheet-check-lambda/src/samplesheet.py @@ -0,0 +1,879 @@ +#!/usr/bin/env python3 + +""" +Sample-sheet samplesheet to be used in the checker script +""" + +# Standards +from copy import deepcopy +from typing import List + +import pandas as pd +import collections +from scipy.spatial import distance + +from src.logger import get_logger +from src.errors import SampleSheetFormatError, SampleDuplicateError, SampleNotFoundError, \ + ColumnNotFoundError, LibraryNotFoundError, MultipleLibraryError, GetMetaDataError, SimilarIndexError, \ + SampleSheetHeaderError, MetaDataError, InvalidColumnError, SampleNameFormatError, OverrideCyclesError, \ + ApiCallError +from src.globals import SAMPLE_REGEX_OBJS, SAMPLESHEET_REGEX_OBJS, OVERRIDE_CYCLES_OBJS, \ + MIN_INDEX_HAMMING_DISTANCE +from src.globals import METADATA_COLUMN_NAMES, REQUIRED_SAMPLE_SHEET_DATA_COLUMN_NAMES, \ + VALID_SAMPLE_SHEET_DATA_COLUMN_NAMES +from src.metadata import get_metadata_record_from_array_of_field_name + +logger = get_logger() + + +class Sample: + """ + Sample on the sequencer + """ + + # Initialise attributes + def __init__(self, sample_id, sample_name, index, index2, lane, project): + """ + Initialise the sample object + :param sample_id: + :param sample_name: + :param index: + :param index2: + :param lane: + :param project + """ + + # Corresponds to the Sample_ID column in the sample sheet + # And Sample_ID (SampleSheet) in the metadata excel sheet + self.unique_id = sample_id + self.index = index # The i7 index + self.index2 = index2 # The i5 index - could be None if a single indexed flowcell + self.lane = lane # The lane of the sample + self.project = project # This may be useful at some point + self.sample_name = sample_name + + # Initialise read cycles and override_cycles + self.read_cycle_counts = [] + self.override_cycles = None + + # Initialise library + self.sample_id = None + self.library_id = None + + # Initialise year for easy usage + self.year = None + + # Initialise library_df for easy reference + self.library_series = None + + # Now calculate sample_id, library_id and year + self.set_sample_id_and_library_id_from_unique_id() + self.set_year_from_library_id() + + # Run checks on sample id and library id + self.check_unique_library_id_format() + self.check_library_id_format() + self.check_sample_id_format() + self.check_sample_name_format() + + def __str__(self): + return self.unique_id + + def set_sample_id_and_library_id_from_unique_id(self): + """ + From the unique_id, return the library id + MDX200001_L2000001 to [MDX200001, L2000001] + Use unique_id regex to ungroup each + Assumes fullmatch check has already been done + :return: + """ + + unique_id_regex_obj = SAMPLE_REGEX_OBJS["unique_id"].match(self.unique_id) + + # Check unique id regex match is not None + if unique_id_regex_obj is None: + logger.error(f"Could not split sample and library id from {self.unique_id}") + raise SampleNameFormatError + + # Sample ID is the first group and the library ID is the second group + self.sample_id = unique_id_regex_obj.group(1) + 
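`set_sample_id_and_library_id_from_unique_id` relies on a precompiled pattern from `src/globals.py` (`SAMPLE_REGEX_OBJS["unique_id"]`, not shown in this hunk). A rough stand-in for how a combined id such as `MDX200001_L2000001` splits into its sample and library parts; the pattern below is an illustrative approximation, not the real one:

```python
import re

# Assumed approximation of SAMPLE_REGEX_OBJS["unique_id"]; the real pattern lives in src/globals.py
UNIQUE_ID_REGEX = re.compile(r"^([A-Z0-9]+)_(L\d{7})")

match = UNIQUE_ID_REGEX.match("MDX200001_L2000001")
assert match is not None
sample_id, library_id = match.group(1), match.group(2)
assert (sample_id, library_id) == ("MDX200001", "L2000001")

# The year is then derived from the library id by prepending "20", e.g. L20... -> 2020
year = "20" + library_id[1:3]
assert year == "2020"
```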
self.library_id = unique_id_regex_obj.group(2) + + def check_sample_name_format(self): + """ + Ensure that the sample name is not null + :return: + """ + if self.sample_name is None or self.sample_name == "": + logger.error("Sample ID {} did not have corresponding Sample_Name".format(self.sample_id)) + raise SampleNameFormatError + + def check_sample_id_format(self): + """ + Ensure that the sample id is of the expected format + :return: + """ + sample_regex_obj = SAMPLE_REGEX_OBJS["sample_id"].fullmatch(self.sample_id) + if sample_regex_obj is None: + logger.error("Sample ID {} did not match the expected regex".format(self.sample_id)) + raise SampleNameFormatError + + def check_library_id_format(self): + """ + Ensure that the library id is of the expected format + :return: + """ + library_regex_obj = SAMPLE_REGEX_OBJS["library_id"].fullmatch(self.library_id) + + if library_regex_obj is None: + logger.error("Library ID {} did not match the expected regex".format(self.library_id)) + raise SampleNameFormatError + + def check_unique_library_id_format(self): + """ + Ensure that the sample id and the library id combined match the expected regex + :return: + """ + + unique_regex_obj = SAMPLE_REGEX_OBJS["unique_id_full_match"].fullmatch(self.unique_id) + + if unique_regex_obj is None: + logger.error("Sample / Library ID {} did not match the expected regex".format(self.unique_id)) + + def set_year_from_library_id(self): + """ + Get the year from the library id by appending 20 on the end + :return: + """ + year_re_match = SAMPLE_REGEX_OBJS.get("year").match(self.library_id) + if year_re_match is None: + logger.error("Could not get library ID from \"{}\"".format(self.library_id)) + raise SampleNameFormatError + # Year is truncated with 20 + self.year = '20{}'.format(year_re_match.group(1)) + + def set_override_cycles(self): + """ + Extract from the library metadata sheet the override cycles count and set as sample attribute + :return: + """ + self.override_cycles = self.library_series["override_cycles"] + + def set_metadata_row_for_sample(self, metadata_df): + """ + :param metadata_df: metadata data frame + :return: + """ + library_id_column_var = METADATA_COLUMN_NAMES["library_id"] + sample_id_column_var = METADATA_COLUMN_NAMES["sample_id"] + library_id_var = self.library_id + sample_id_var = self.sample_id + + # Query for specific dataframe value + library_row = metadata_df.loc[ + (metadata_df['library_id'] == library_id_var) & (metadata_df["sample_id"] == sample_id_var)] + + # Check result_df exist + number_of_rows = len(library_row) + if number_of_rows == 0: + logger.error("Got no rows back for library id '{}' and sample id '{}'" + "in columns {} and {} respectively".format(library_id_var, sample_id_var, + library_id_column_var, sample_id_column_var)) + raise LibraryNotFoundError + + # Check library_row is just one row + if not number_of_rows == 1: + logger.error("Got multiple rows back for library id '{}' and sample id '{}'" + "in columns {} and {} respectively".format(library_id_var, sample_id_var, + library_id_column_var, sample_id_column_var)) + raise MultipleLibraryError + + # Set the library df + self.library_series = library_row.squeeze() + + +class SampleSheet: + """ + SampleSheet object + """ + + def __init__(self, samplesheet_path=None, header=None, reads=None, settings=None, data=None, samples=None): + self.samplesheet_path = samplesheet_path + self.header = header + self.reads = reads + self.settings = settings + self.data = data + self.samples = samples + + self.metadata_df = 
pd.DataFrame() + + # Ensure that header, reads, settings are all None or all Not None + if not (self.header is None and self.reads is None and self.settings is None): + if not (self.header is not None and self.reads is not None and self.settings is not None): + logger.error("header, reads and settings configurations need to either all be set or all be 'None'") + raise NotImplementedError + else: + settings_defined = True + else: + settings_defined = False + + # Check we haven't double defined the configuration settings + if not (bool(self.samplesheet_path is not None) ^ settings_defined): + """ + We can't have the samplesheet_path defined and the sections also defined + """ + logger.error("Specify only the samplesheet path OR header, reads, settings") + raise NotImplementedError + # Check we haven't double defined the data settings + elif not ( + bool(self.samplesheet_path is not None) ^ bool(self.samples is not None) ^ bool(self.data is not None)): + """ + Only one of samplesheet_path and samples can be specified + Can we confirm this is legit + """ + logger.error("Specify only the samplesheet path OR data OR samples. The latter two options" + "will also need to have header, reads and settings defined") + raise NotImplementedError + + # If there's a samplesheet path, we need to read it + if self.samplesheet_path is not None: + self.read() + + def read(self): + """ + Read in the sample sheet object as a list of dicts + :return: + """ + with open(self.samplesheet_path, "r") as samplesheet_csv_h: + # Read samplesheet in + sample_sheet_sections = {} + current_section = None + current_section_item_list = [] + + for line in samplesheet_csv_h.readlines(): + # Check if blank line + if line.strip().rstrip(",") == "": + continue + # Check if the current line is a header + header_match_obj = SAMPLESHEET_REGEX_OBJS["section_header"].match(line.strip()) + if header_match_obj is not None and current_section is None: + # First line, don't need to write out previous section to obj + # Set current section to first group + current_section = header_match_obj.group(1) + current_section_item_list = [] + elif header_match_obj is not None and current_section is not None: + # A header further down, write out previous section and then reset sections + sample_sheet_sections[current_section] = current_section_item_list + # Now reset sections + current_section = header_match_obj.group(1) + current_section_item_list = [] + # Make sure the first line is a section + elif current_section is None and header_match_obj is None: + logger.error("Top line of csv was not a section header. 
Exiting") + raise SampleSheetFormatError + else: # We're in a section + if not current_section == "Data": + # Strip trailing slashes from line + current_section_item_list.append(line.strip().rstrip(",")) + else: + # Don't strip trailing slashes from line + current_section_item_list.append(line.strip()) + + # Write out the last section + sample_sheet_sections[current_section] = current_section_item_list + + # Now iterate through sections and map them to the appropriate objects + for section_name, section_str_list in sample_sheet_sections.items(): + if section_name == "Header": + # Convert to dict + self.header = {line.split(",", 1)[0]: line.split(",", 1)[-1] + for line in section_str_list} + elif section_name == "Settings": + # Convert to dict + self.settings = {line.split(",", 1)[0]: line.split(",", 1)[-1] + for line in section_str_list} + elif section_name == "Reads": + # List type + self.reads = section_str_list + elif section_name == "Data": + # Convert to dataframe + self.data = pd.DataFrame(columns=section_str_list[0].split(","), + data=[row.split(",") for row in + section_str_list[1:]]) + # Ensure each of the required SAMPLE_SHEET_DATA_COLUMNS exists + for column in REQUIRED_SAMPLE_SHEET_DATA_COLUMN_NAMES["v1"]: + if column not in self.data.columns.tolist(): + logger.error("Could not find column \"{}\" in samplesheet".format(column)) + raise ColumnNotFoundError + # Ensure each of the columns are valid columns + for column in self.data.columns.tolist(): + if column not in VALID_SAMPLE_SHEET_DATA_COLUMN_NAMES["v1"]: + logger.error("Could not find column \"{}\" in samplesheet".format(column)) + raise InvalidColumnError + # Strip Ns from index and index2 + self.data['index'] = self.data['index'].apply(lambda x: x.rstrip("N")) + if 'index2' in self.data.columns.tolist(): + self.data['index2'] = self.data['index2'].apply(lambda x: x.rstrip("N")) + # TO then also add sample attributes + # Write out each sample + self.convert_data_to_samples() + else: + # We're not familiar with how to handle this section + raise NotImplementedError + + def convert_data_to_samples(self): + """ + Take the data attribute to create a samples objects + :return: + """ + # Ensure this function has not been called inappropriately + if self.data is None: + logger.error("Tried to convert data attribute to samples object when data wasn't defined") + raise ValueError + + if self.samples is None: + self.samples = [] + + for row_index, sample_row in self.data.iterrows(): + # Set default lane to 1, so we can still compare indexes across + lane = sample_row["Lane"] if "Lane" in sample_row.keys() else 1 + + index2 = sample_row["index2"] if "index2" in sample_row.keys() else None + project = sample_row["Sample_Project"] if "Sample_Project" in sample_row.keys() else None + + self.samples.append( + Sample( + lane=lane, + sample_name=sample_row["Sample_Name"], + sample_id=sample_row["Sample_ID"], + index=sample_row["index"], + index2=index2, + project=project + ) + ) + + def add_sample(self, new_sample_to_add): + """ + Add sample to the list of samples + :param new_sample_to_add: + :return: + """ + for sample in self.samples: + if sample.id == new_sample_to_add.id: + logger.error("Sample with ID: {} already exists in sample sheet".format(sample.id)) + raise SampleDuplicateError + self.samples.append(new_sample_to_add) + + def remove_sample(self, sample_id_to_remove): + """ + Remove sample with this Sample_ID + :param sample_id_to_remove: + :return: + """ + for sample in self.samples: + if sample.id == sample_id_to_remove: + 
sample_to_remove = sample + break + else: + logger.error("Could not find sample {} when removing sample from sample sheet".format(sample_id_to_remove)) + raise SampleNotFoundError + + self.samples.remove(sample_to_remove) + + def get_lanes(self): + """ + Iterate through samples and get the set of lanes in the samples + :return: + """ + lanes = set() + + # For the purposes of testing, we'll just return '1' if lane is not specified + if "Lane" not in self.data.columns.tolist(): + logger.info("Attempting to get 'lanes' but no lanes defined, " + "returning set(1) for purpose of checking indexes") + return {1} + + for sample in self: + lanes.add(sample.lane) + + return lanes + + def write(self, samplesheet_h): + """ + Write samplesheet to file handle + :param samplesheet_h: + :return: + """ + # Write out header + samplesheet_h.write("[Header]\n") + samplesheet_h.write("\n".join(map(str, ["{},{}".format(key, value) + for key, value in self.header.items()]))) + # Add new line before the next section + samplesheet_h.write("\n\n") + # Write out reads + samplesheet_h.write("[Reads]\n") + samplesheet_h.write("\n".join(self.reads)) + # Add new line before the next section + samplesheet_h.write("\n\n") + # Write out settings + samplesheet_h.write("[Settings]\n") + samplesheet_h.write("\n".join(map(str, ["{},{}".format(key, value) + for key, value in self.settings.items()]))) + # Add new line before the next section + samplesheet_h.write("\n\n") + # Write out data + samplesheet_h.write("[Data]\n") + self.data.to_csv(samplesheet_h, index=False, header=True, sep=",") + # Add final new line + samplesheet_h.write("\n") + + def check_sample_uniqueness(self): + """ + Ensure all samples are unique + :return: + """ + + for s_i, sample in self.samples: + for s2_i, sample2 in self.samples: + # Check we already haven't done this comparison + if s_i >= s2_i: + continue + if sample.id == sample2.id: + logger.error("Found two samples with the same id: '{}'".format(sample.id)) + raise SampleDuplicateError + + def __iter__(self): + yield from self.samples + + def set_metadata_from_api(self, auth_header): + library_id_array = [] + + for sample in self: + library_id_array.append(sample.library_id) + + # check that the primary library for the topup exists + if SAMPLE_REGEX_OBJS["topup"].search(sample.library_id) is not None: + logger.info("{} is a top up sample. 
Investigating the previous sample".format(sample.unique_id)) + orig_unique_id = SAMPLE_REGEX_OBJS["topup"].sub('', sample.unique_id) + + unique_id_regex_obj = SAMPLE_REGEX_OBJS["unique_id"].match(orig_unique_id) + + # Sample ID is the first group and the library ID is the second group + topup_sample_id = unique_id_regex_obj.group(1) + topup_library_id = unique_id_regex_obj.group(2) + + # Appending these original sample/library id to the search query + library_id_array.append(topup_library_id) + + try: + metadata_response = get_metadata_record_from_array_of_field_name(auth_header=auth_header, + field_name='library_id', + value_list=library_id_array) + + except Exception as e: + raise ApiCallError("Fail to fetch metadata api for library id in the sample sheet") + + # Convert api result to panda dataframe + # Replicate what the old portal does + metadata_response = [ + { + "library_id": metadata["libraryId"], + "sample_id": metadata["sample"]["sampleId"], + "override_cycles": metadata["overrideCycles"], + "assay": metadata["assay"], + "type": metadata["type"], + "subject_id": metadata["subject"]["subjectId"], + } + for metadata in metadata_response + ] + self.metadata_df = pd.json_normalize(metadata_response) + + has_error = False + error_samples = [] + metadata_df = self.metadata_df + + for sample in self.samples: + try: + sample.set_metadata_row_for_sample(metadata_df=metadata_df) + except LibraryNotFoundError: + logger.error("Error trying to find library id in tracking sheet for sample {}".format(sample.sample_id)) + error_samples.append(sample.sample_id) + has_error = True + except MultipleLibraryError: + logger.error("Got multiple rows from tracking sheet for sample {}".format(sample.sample_id)) + error_samples.append(sample.sample_id) + has_error = True + except ApiCallError: + logger.error("API call fail") + has_error = True + else: + # Now we can set other things that may need to be done + # Once we can confirm the metadata + sample.set_override_cycles() + + if has_error: + raise GetMetaDataError("The following samples had issues - {}".format(", ".join(map(str, error_samples)))) + + +def get_years_from_samplesheet(samplesheet): + """ + Get a unique list of years used. 
+ Tells us which metadata sheets we'll need to access + :param samplesheet: Samplesheet object + :return: + """ + years = set() + for sample in samplesheet: + years.add(sample.year) + return years + + +def check_samplesheet_header_metadata(samplesheet): + """ + # Check that Assay and Experiment Name are defined in the SampleSheet header + :param samplesheet: + :return: + """ + logger.info("Checking SampleSheet metadata") + has_error = False + required_keys = ["Assay", "Experiment Name"] + + for key in required_keys: + if samplesheet.header.get(key, None) is None: + logger.error("{} not defined in Header!".format(key)) + has_error = True + + if has_error: + raise SampleSheetHeaderError + + return + + +def check_metadata_correspondence(samplesheet): + """ + Checking sample sheet data against metadata df + :param samplesheet: + :return: + """ + logger.info("Checking SampleSheet data against metadata") + has_error = False + + for sample in samplesheet: + # exclude 10X samples for now, as they usually don't comply + if sample.library_series["type"] == '10X': + logger.debug("Not checking metadata columns as this sample is '10X'") + continue + + # check presence of subject ID + if sample.library_series["subject_id"] == '': + logger.error(f"No subject ID for {sample.sample_id}") + raise SampleNotFoundError + + # check that the primary library for the topup exists + if SAMPLE_REGEX_OBJS["topup"].search(sample.library_id) is not None: + logger.info("{} is a top up sample. Investigating the previous sample".format(sample.unique_id)) + orig_unique_id = SAMPLE_REGEX_OBJS["topup"].sub('', sample.unique_id) + try: + # Recreate the original sample object + orig_sample = Sample( + sample_id=orig_unique_id, + sample_name=sample.sample_name, + index=None, + index2=None, + lane=None, + project=None + ) + # Try get metadata for sample row + orig_sample.set_metadata_row_for_sample(metadata_df=samplesheet.metadata_df) + except LibraryNotFoundError: + logger.error("Could not find library of original sample") + has_error = True + except MultipleLibraryError: + logger.error("It seems that there is multiple libraries for the original sample") + has_error = True + except ApiCallError: + logger.error("API call fails") + has_error = True + + if not has_error: + return + else: + raise MetaDataError + + +def check_sample_sheet_for_index_clashes(samplesheet): + """ + Ensure that two given indexes are not within one hamming distance of each other + :param samplesheet: + :return: + """ + logger.debug("Checking SampleSheet for index clashes") + has_error = False + + lanes = samplesheet.get_lanes() + + for lane in lanes: + for s_i, sample in enumerate(samplesheet.samples): + # Ensures samples are in the same lane + if not sample.lane == lane: + continue + logger.debug(f"Comparing indexes of sample {sample}") + for s2_i, sample_2 in enumerate(samplesheet.samples): + # Reset for each sample we're comparing against + sample_has_i7_error = False + # Ensures samples are in the same lane + if not sample_2.lane == lane: + continue + # Ensures we only do half of the n^2 logic. + if s2_i <= s_i: + # We've already done this comparison + # OR they're the same sample + continue + + logger.debug(f"Checking indexes of sample {sample} against {sample_2}") + if sample.unique_id == sample_2.unique_id: + # We're testing the sample on itself, next! 
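`check_sample_sheet_for_index_clashes` walks every pair of samples that share a lane and delegates the similarity test to `compare_two_indexes` further down, which scales scipy's fractional hamming distance back up by the index length. A small sketch of that test, assuming a minimum distance of 2 (the real `MIN_INDEX_HAMMING_DISTANCE` constant lives in `src/globals.py` and is not shown in this hunk):

```python
from scipy.spatial import distance

MIN_INDEX_HAMMING_DISTANCE = 2  # assumed value; the real constant lives in src/globals.py

def indexes_too_similar(first_index: str, second_index: str) -> bool:
    # Truncate to the shorter index, as compare_two_indexes does
    n = min(len(first_index), len(second_index))
    first_index, second_index = first_index[:n], second_index[:n]
    # scipy returns the *fraction* of mismatched positions, so scale by the length
    return distance.hamming(list(first_index), list(second_index)) * n < MIN_INDEX_HAMMING_DISTANCE

assert indexes_too_similar("GAATACCTAT", "GAATACCTAA")      # one mismatch - clash
assert not indexes_too_similar("GAATACCTAT", "TACGTGAAGG")  # many mismatches - fine
```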
+ continue + + # i7 check + # Strip i7 to min length of the two indexes + try: + compare_two_indexes(sample.index, sample_2.index) + except SimilarIndexError: + # Not a failure - we might have different i5 indexes for the sample + logger.debug("i7 indexes {} and {} are too similar to run in the same lane".format(sample.index, + sample_2.index)) + logger.debug("This may be okay if i5 indexes are different enough") + sample_has_i7_error = True + + # We may not have an i5 index - continue on to next sample if so + if sample.index2 is None or sample_2.index2 is None: + # If the i7 was too close then this is a fail + if sample_has_i7_error: + logger.error("i7 indexes {} and {} are too similar to run in the same lane".format(sample.index, + sample_2.index)) + has_error = True + continue + + # i5 check + # Strip i5 to min length of the two indexes + try: + compare_two_indexes(sample.index2, sample_2.index2) + except SimilarIndexError: + logger.debug("i5 indexes {} and {} are too similar to run in the same lane." + "This might be okay if i7 indexes are different enough".format(sample.index2, + sample_2.index2)) + if sample_has_i7_error: + logger.error("i7 indexes {} and {} are too similar to run in the same lane" + "with i5 indexes {} and {} are too similar to run in the same lane ".format( + sample.index, + sample_2.index, + sample.index2, + sample_2.index2) + ) + has_error = True + + if not has_error: + return + else: + raise SimilarIndexError + + +def check_internal_override_cycles(samplesheet): + """ + For each sample in the samplesheet, compare a given samples override cycles attributes with those + of the indexes of the samples. + i.e + If the sample has the override cycles Y151;I8;I8;Y151, we should expect the non-N lengths of i7 and i5 to both be 8. 
+    :param samplesheet:
+    :return:
+    """
+    for sample in samplesheet:
+        # Check override cycles attribute exists
+        if not sample.override_cycles:
+            logger.warning("Could not find override cycles for sample \"{}\"".format(sample.unique_id))
+            continue
+        index_count = 0
+        for cycle_set in sample.override_cycles.split(";"):
+            # Make sure the cycle set matches the index pattern, otherwise skip it
+            if OVERRIDE_CYCLES_OBJS["indexes"].match(cycle_set) is None:
+                logger.debug("Not an index cycle, skipping")
+                continue
+            # Get the length of the index
+            index_length = int(OVERRIDE_CYCLES_OBJS["indexes"].match(cycle_set).group(1).replace("I", ""))
+            index_count += 1
+            # Get the index value
+            if index_count == 1:
+                # Check against sample's i7 value
+                i7_length = len(sample.index.replace("N", ""))
+                if not i7_length == index_length:
+                    logger.warning(f"Sample '{sample.sample_id}' override cycle value '{sample.override_cycles}' "
+                                   f"does not match sample i7 '{sample.index}'")
+            elif index_count == 2 and sample.index2 is not None and not sample.index2 == "":
+                # Check against sample's i5 value
+                i5_length = len(sample.index2.replace("N", ""))
+                if not i5_length == index_length:
+                    logger.warning(f"Sample '{sample.sample_id}' override cycle value '{sample.override_cycles}' "
+                                   f"does not match sample i5 '{sample.index2}'")
+        # Make sure that if sample.index2 is not None but the override cycles count
+        # only made it to '1' then we throw a warning
+        if index_count == 1 and sample.index2 is not None and not sample.index2 == "":
+            logger.warning(f"Override cycles '{sample.override_cycles}' suggests only one index "
+                           f"but sample '{sample.sample_id}' has a second index '{sample.index2}'")
+
+
+def check_global_override_cycles(samplesheet) -> List:
+    """
+    Check that the override cycles exists,
+    matches the reads entered in the samplesheet
+    and is consistent with all other samples in the sample sheet.
+    :param samplesheet:
+    :return:
+    """
+    for sample in samplesheet:
+        # We've already initialised this attribute
+        if not len(sample.read_cycle_counts) == 0:
+            continue
+        # for Y151;I8N2;I8N2;Y151 to ["Y151", "I8N2", "I8N2", "Y151"]
+        if not sample.override_cycles:
+            logger.warning("Could not find override cycles for sample \"{}\"".format(sample.unique_id))
+            continue
+        for cycle_set in sample.override_cycles.split(";"):
+            # Make sure the cycle set is a full match
+            if OVERRIDE_CYCLES_OBJS["cycles_full_match"].fullmatch(cycle_set) is None:
+                logger.error("Couldn't interpret override cycles section {} from {}".format(
+                    cycle_set, sample.override_cycles
+                ))
+            read_cycles_sum = 0
+            # Run regex over each set
+            for re_match in OVERRIDE_CYCLES_OBJS["cycles"].findall(cycle_set):
+                # re_match is a tuple like ('Y', '151') or ('N', '')
+                if re_match[-1] == "":
+                    read_cycles_sum += 1
+                else:
+                    read_cycles_sum += int(re_match[-1])
+            sample.read_cycle_counts.append(read_cycles_sum)
+    # Now we ensure all samples have the same read_cycle counts
+    num_read_index_per_sample = set([len(sample.read_cycle_counts)
+                                     for sample in samplesheet
+                                     if not len(sample.read_cycle_counts) == 0])
+    # Check the number of read/index sections is the same for every sample
+    if len(num_read_index_per_sample) > 1:
+        logger.error("Found an error with override cycles matches")
+        for num_read_index in num_read_index_per_sample:
+            samples_with_this_num_read_index = [sample.sample_id
+                                                for sample in samplesheet
+                                                if len(sample.read_cycle_counts) == num_read_index]
+            logger.error("The following samples have {} read/index sections: {}".
+                         format(num_read_index, ", ".join(map(str, samples_with_this_num_read_index))))
+        raise OverrideCyclesError
+    elif len(num_read_index_per_sample) == 0:
+        logger.error("Found no override cycles matches")
+        raise OverrideCyclesError
+    else:
+        logger.info("Override cycles check 1/2 complete - "
+                    "All samples have the correct number of override cycles sections - {}".
+                    format(list(num_read_index_per_sample)[0]))
+
+    # For each segment - check that the counts are the same
+    section_cycle_counts = []
+    for read_index in range(list(num_read_index_per_sample)[0]):
+        num_cycles_in_read_per_sample = set([sample.read_cycle_counts[read_index]
+                                             for sample in samplesheet
+                                             if not len(sample.read_cycle_counts) == 0])
+        if len(num_cycles_in_read_per_sample) > 1:
+            logger.error("Found an error with override cycles matches for read/index section {}".format(read_index + 1))
+            for num_cycles in num_cycles_in_read_per_sample:
+                samples_with_this_cycle_count_in_this_read_index_section = \
+                    [sample.sample_id
+                     for sample in samplesheet
+                     if sample.read_cycle_counts[read_index] == num_cycles]
+                logger.error("The following samples have a mismatched cycle count for read/index section: {}\n"
+                             "CycleCount: {}\n"
+                             "Samples: {}".
+                             format(read_index + 1,
+                                    num_cycles,
+                                    ", ".join(map(str, samples_with_this_cycle_count_in_this_read_index_section))))
+            raise OverrideCyclesError
+        else:
+            section_cycle_counts.append(list(num_cycles_in_read_per_sample)[0])
+    else:
+        logger.info("Override cycles check 2/2 complete - "
+                    "All samples have the identical number of cycles per section - \"{}\"".
+                    format(", ".join(map(str, section_cycle_counts))))
+
+    return section_cycle_counts
+
+
+def compare_two_indexes(first_index, second_index):
+    """
+    Ensure that the hamming distance between the two indexes
+    is at least MIN_INDEX_HAMMING_DISTANCE
+    If one index is longer than the other - strip the longer one from the right
+    # scipy.spatial.distance.hamming
+    # https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.hamming.html
+    :param first_index:
+    :param second_index:
+    :return:
+    """
+
+    min_index_length = min(len(first_index), len(second_index))
+    first_index = first_index[0:min_index_length]
+    second_index = second_index[0:min_index_length]
+
+    # Ensure that both the indexes are the same length
+    if not len(first_index) == len(second_index):
+        logger.error("Index lengths {} and {} are not the same".format(
+            first_index, second_index
+        ))
+        raise SimilarIndexError
+
+    # hamming distance returns a float - we then multiply this by the index length
+    h_float = distance.hamming(list(first_index), list(second_index))
+
+    if not h_float * min_index_length >= MIN_INDEX_HAMMING_DISTANCE:
+        logger.debug("Indexes {} and {} are too similar".format(first_index, second_index))
+        raise SimilarIndexError
+    else:
+        return
+
+
+def get_grouped_samplesheets(samplesheet):
+    """
+    Group samples by their override-cycles setting.
+    Returns a dict mapping each override-cycles value to its own samplesheet.
+    :param samplesheet:
+    :return:
+    """
+    grouped_samplesheets = collections.defaultdict()
+
+    override_cycles_list = set([sample.override_cycles
+                                for sample in samplesheet])
+
+    for override_cycles in override_cycles_list:
+        samples_unique_ids_subset = [sample.unique_id
+                                     for sample in samplesheet
+                                     if sample.override_cycles == override_cycles]
+
+        # Create new samplesheet from old sheet
+        override_cycles_samplesheet = deepcopy(samplesheet)
+
+        # Truncate data
+        override_cycles_samplesheet.data = override_cycles_samplesheet.data.
\ + query("Sample_ID in @samples_unique_ids_subset") + + # Ensure we haven't just completely truncated everything + if override_cycles_samplesheet.data.shape[0] == 0: + logger.error("Here are the list of sample ids " + "that were meant to have the Override cycles setting \"{}\": {}".format( + override_cycles, ", ".join(map(str, samples_unique_ids_subset)))) + logger.error("We accidentally filtered our override cycles samplesheet to contain no samples") + raise ValueError + + # Append OverrideCycles setting to Settings in Samplesheet + override_cycles_samplesheet.settings["OverrideCycles"] = override_cycles + + # Append SampleSheet to list of grouped sample sheets + grouped_samplesheets[override_cycles] = override_cycles_samplesheet + + return grouped_samplesheets diff --git a/lib/workload/stateless/stacks/sample-sheet-check/sample-sheet-check-lambda/src/v2_samplesheet_builder.py b/lib/workload/stateless/stacks/sample-sheet-check/sample-sheet-check-lambda/src/v2_samplesheet_builder.py new file mode 100644 index 000000000..c805e36b6 --- /dev/null +++ b/lib/workload/stateless/stacks/sample-sheet-check/sample-sheet-check-lambda/src/v2_samplesheet_builder.py @@ -0,0 +1,551 @@ +#!/usr/bin/env python3 + +""" +Steps needed: + +Generate a json of the existing v1 samplesheet + +Convert the json to a v2 samplesheet +""" +import re +from copy import deepcopy +from pathlib import Path +from typing import Dict, List +from tempfile import NamedTemporaryFile +import pandas as pd + +from src.samplesheet import SampleSheet, check_global_override_cycles +from src.logger import get_logger +from src.globals import ( + V2_SAMPLESHEET_BCLCONVERT_ADAPTER_SETTINGS_BY_ASSAY_TYPE, + V2_ADAPTER_SETTINGS, V2_DATA_ROWS, V2_SAMPLESHEET_GLOBAL_SETTINGS, V2_SAMPLESHEET_DATA_SETTINGS, + V2_BCLCONVERT_BASESPACE_URN, + V2_CTTSO_VALID_INDEXES, V2_BCLCONVERT_BASESPACE_SOFTWARE_VERSION, EXPERIMENT_REGEX_STR +) +from v2_samplesheet_maker.functions.v2_samplesheet_writer import v2_samplesheet_writer + +from src import camel_to_snake + +logger = get_logger() + + +def get_cttso_index_id_from_index(index_str: str, index_type: str) -> str: + """ + Base function for get_cttso_i7_index_id_from_index and get_cttso_i5_index_id_from_index2 + """ + try: + return next( + filter( + lambda index_dict: index_dict.get(index_type) == index_str, + V2_CTTSO_VALID_INDEXES + ) + ).get("index_id") + except StopIteration: + logger.error(f"Could not get index id for {index_type} - {index_str}") + raise ValueError + + +def get_cttso_i7_index_id_from_index(i7_index_str: str) -> str: + return get_cttso_index_id_from_index(i7_index_str, "index") + + +def get_cttso_i5_index_id_from_index(i5_index_str: str, is_forward_index_orientation: bool = False) -> str: + if is_forward_index_orientation: + return get_cttso_index_id_from_index(i5_index_str, "index2") + else: + return get_cttso_index_id_from_index(i5_index_str, "index2_rev") + + +def get_bclconvert_adapter_setting_by_type_and_assay(sample_type: str, sample_assay: str, setting_name: str) -> str: + """ + This function retrieves the adapter setting for a given sample type and assay. + + Parameters: + sample_type (str): The type of the sample. + sample_assay (str): The assay of the sample. + setting_name (str): The name of the setting to be retrieved. + + Returns: + str: The value of the adapter setting for the given sample type and assay. 
+ """ + for key in V2_SAMPLESHEET_BCLCONVERT_ADAPTER_SETTINGS_BY_ASSAY_TYPE.keys(): + if re.match(key, f"{sample_type}:{sample_assay}"): + setting_value = V2_SAMPLESHEET_BCLCONVERT_ADAPTER_SETTINGS_BY_ASSAY_TYPE[key].get(setting_name, None) + if setting_value is not None: + return setting_value + else: + logger.debug(f"Could not get the bclconvert settings for this type / assay combination '{sample_type}' / '{sample_assay}'") + + +def get_bclconvert_settings_by_library_id(library_id: str, samplesheet: SampleSheet) -> Dict: + """ + This function retrieves the BCLConvert settings for a given library ID from a sample sheet. + + Parameters: + library_id (str): The ID of the library. + samplesheet (SampleSheet): The sample sheet from which to retrieve the settings. + + Returns: + Dict: A dictionary containing the BCLConvert settings for the given library ID. + """ + + # Get metadata for library id + library_id_metadata = samplesheet.metadata_df.query(f"library_id=='{library_id}'").squeeze() + + bclconvert_settings_dict = { + "override_cycles": library_id_metadata["override_cycles"], + "library_prep_kit_name": library_id_metadata["assay"] + } + + for adapter_setting in V2_ADAPTER_SETTINGS: + bclconvert_settings_dict.update( + { + adapter_setting: get_bclconvert_adapter_setting_by_type_and_assay( + library_id_metadata["type"], + library_id_metadata["assay"], + adapter_setting + ) + } + ) + + return bclconvert_settings_dict + + +def get_samplesheet_header_dict(samplesheet: SampleSheet) -> Dict: + """ + This function retrieves the header information from a given SampleSheet object and returns it as a dictionary. + + Parameters: + samplesheet (SampleSheet): The SampleSheet object from which to retrieve the header information. + + Returns: + Dict: A dictionary containing the header information from the SampleSheet object. + """ + header_dict = dict(samplesheet.header) + + # Update FileFormatVersion + header_dict['file_format_version'] = '2' + + # Convert Experiment Name to Run Name + header_dict['run_name'] = header_dict.pop('Experiment Name') + + # Convert Instrument Type to instrument_type + header_dict['instrument_type'] = header_dict.pop('Instrument Type') + + return header_dict + + +def get_reads_dict(samplesheet: SampleSheet) -> Dict: + """ + This function retrieves the read cycle information from a given SampleSheet object and returns it as a dictionary. + + Parameters: + samplesheet (SampleSheet): The SampleSheet object from which to retrieve the read cycle information. + + Returns: + Dict: A dictionary containing the read cycle information from the SampleSheet object. 
+ """ + # Count the cycle list + cycle_count_list = check_global_override_cycles(samplesheet) + + # Convert the reads list into a dict + reads_dict = { + "read_1_cycles": samplesheet.reads[0], + "read_2_cycles": samplesheet.reads[-1] + } + + # If we have dual-indexed + paired end reads, add the index cycles + if len(cycle_count_list) == 4: + # Confirm that we have matches between reads and cycle_count_list + if not samplesheet.reads[0] == cycle_count_list[0] and samplesheet.reads[-1] == cycle_count_list[-1]: + logger.warning("Got mismatch between reads in samplesheet and override cycle count list") + logger.warning(f"'{samplesheet.reads[0]}' vs '{cycle_count_list[0]}' and " + f"'{samplesheet.reads[-1]}' vs '{cycle_count_list[-1]}'") + else: + reads_dict.update( + { + "index_1_cycles": cycle_count_list[1], + "index_2_cycles": cycle_count_list[2] + } + ) + + return reads_dict + + +def get_bclconvert_settings_dict(samplesheet: SampleSheet) -> Dict: + """ + This function retrieves the BCLConvert settings for a given SampleSheet object. + + Parameters: + samplesheet (SampleSheet): The SampleSheet object from which to retrieve the BCLConvert settings. + + Returns: + Dict: A dictionary containing the BCLConvert settings for the given SampleSheet object. + """ + # Initialise settings dictionary + bclconvert_settings_dict = {} + + # Add global bclconvert settings + for setting_key, setting_value in samplesheet.settings.items(): + setting_key_snake_case = camel_to_snake(setting_key) + if setting_key_snake_case in V2_SAMPLESHEET_GLOBAL_SETTINGS.keys(): + bclconvert_settings_dict.update( + { + # Coerce type + setting_key_snake_case: setting_value + } + ) + + # Get BCLConvert settings by assay and type + bclconvert_settings_list = [] + for (sample_type, sample_assay), mini_sample_df in samplesheet.metadata_df.groupby(["type", "assay"]): + sample_bclconvert_settings = {} + for adapter_setting in V2_SAMPLESHEET_GLOBAL_SETTINGS.keys(): + sample_bclconvert_settings.update( + { + adapter_setting: get_bclconvert_adapter_setting_by_type_and_assay( + sample_type, + sample_assay, + adapter_setting + ) + } + ) + + bclconvert_settings_list.append(sample_bclconvert_settings) + + # Convert settings to a dataframe + bclconvert_settings_df = pd.DataFrame(bclconvert_settings_list) + + # Append settings in the samplesheet + bclconvert_settings_df = pd.concat( + [ + bclconvert_settings_df, + pd.DataFrame( + [ + pd.Series(bclconvert_settings_dict).reindex(V2_SAMPLESHEET_GLOBAL_SETTINGS.keys()) + ] + ) + ] + ) + + # Drop empty settings + bclconvert_settings_df = bclconvert_settings_df.dropna( + how='all', + axis='columns' + ) + + # Check that all global settings are uniform across all samples + has_error = False + bclconvert_settings_dict = {} + for column in bclconvert_settings_df.columns.tolist(): + if not bclconvert_settings_df[column].dropna().unique().shape[0] == 1: + logger.error(f"{column}: {bclconvert_settings_df[column].unique()}") + has_error = True + continue + # Create BCLConvert settings dict + bclconvert_settings_dict.update( + { + column: bclconvert_settings_df[column].dropna().unique().item() + } + ) + if has_error: + raise ValueError + + # Coerce types of settings + # Add global bclconvert settings + for setting_key, setting_value in bclconvert_settings_dict.items(): + bclconvert_settings_dict[setting_key] = V2_SAMPLESHEET_GLOBAL_SETTINGS[setting_key](setting_value) + + # Add in BCLConvert URN + # Add in Software Version too + bclconvert_settings_dict["urn"] = V2_BCLCONVERT_BASESPACE_URN + 
bclconvert_settings_dict["software_version"] = V2_BCLCONVERT_BASESPACE_SOFTWARE_VERSION + + # Return bclconvert settings dict + return bclconvert_settings_dict + + +def get_bclconvert_data_list(samplesheet: SampleSheet) -> List: + """ + This function retrieves the BCLConvert data list for a given SampleSheet object. + + Some hacky updates - + 1. Lowercase all keys + 2. Drop Sample_Project, and Sample_ID + 3. Rename Sample_Name (the library id) to SampleID + 4. Drop tailing N's from index and index2 + 5. Drop empty columns + + Parameters: + samplesheet (SampleSheet): The SampleSheet object from which to retrieve the BCLConvert data list. + + Returns: + List: A list containing the BCLConvert data for the given SampleSheet object. + """ + data_dict_list = [] + + for index, data_row in samplesheet.data.iterrows(): + # Drop datadict + data_dict = dict(data_row) + + # Add bclconvert settings + data_dict.update( + get_bclconvert_settings_by_library_id( + library_id=data_dict["Sample_Name"], + samplesheet=samplesheet + ) + ) + + # Append data dict + data_dict_list.append( + data_dict + ) + + # Convert to dataframe + data_dict_list_df = pd.DataFrame(data_dict_list) + + # Some hacky updates - + # 1. Lowercase all keys + # 2. Drop Sample_Project, and Sample_ID + # 3. Rename Sample_Name (the library id) to SampleID + # 4. Drop tailing N's from index and index2 + # 5. Drop empty columns + + # Lowercase all columns + data_dict_list_df = data_dict_list_df.rename( + columns={ + column: column.lower() + for column in data_dict_list_df.columns + } + ) + + # Drop Sample_Project and Sample_ID + data_dict_list_df = data_dict_list_df.drop( + columns=[ + "sample_project", + "sample_id" + ] + ) + + # Rename Sample_Name (the library id) to SampleID + data_dict_list_df = data_dict_list_df.rename( + columns={ + "sample_name": "sample_id" + } + ) + + # Select only columns in V2_DATA_ROWS + data_dict_list_df = data_dict_list_df[ + filter( + lambda col: col in V2_DATA_ROWS, + data_dict_list_df.columns + ) + ] + + # Drop tailing N's from index and index2 + for index_col in ['index', 'index2']: + data_dict_list_df[index_col] = data_dict_list_df[index_col].str.rstrip("N") + + # Strip topup and reruns from sample id names + data_dict_list_df["sample_id"] = data_dict_list_df["sample_id"].apply( + lambda sample_id: re.sub(EXPERIMENT_REGEX_STR["top_up"], "", sample_id) + ) + data_dict_list_df["sample_id"] = data_dict_list_df["sample_id"].apply( + lambda sample_id: re.sub(EXPERIMENT_REGEX_STR["rerun"], "", sample_id) + ) + + # Convert to dataframe and drop empty columns + data_dict_list_df = data_dict_list_df.replace( + { + "": pd.NA + } + ).dropna( + how='all', + axis='columns' + ) + + # Convert and return back as list + return data_dict_list_df.to_dict( + orient="records" + ) + + +def update_bclconvert_settings_on_data_list_settings(bclconvert_settings_dict: Dict, bclconvert_data_list: List) -> [Dict, pd.DataFrame]: + """ + This function updates the BCLConvert settings based on the data list settings. + + Parameters: + bclconvert_settings_dict (Dict): The dictionary containing the BCLConvert settings. + bclconvert_data_list (List): The list containing the BCLConvert data. + + Returns: + Tuple[Dict, pd.DataFrame]: A tuple containing the updated BCLConvert settings dictionary and a DataFrame of the BCLConvert data list. 
+ """ + # Always copy + bclconvert_settings_dict = deepcopy(bclconvert_settings_dict) + bclconvert_data_list = deepcopy(bclconvert_data_list) + + # Find uniform settings within BCLConvert_Data and move them to BCLConvert_Settings + bclconvert_data_list_df = pd.DataFrame(bclconvert_data_list) + for setting_column in V2_SAMPLESHEET_DATA_SETTINGS: + # Check column in dataframe first + if setting_column not in bclconvert_data_list_df.columns: + continue + # Move setting column to bclconvert settings dict + if bclconvert_data_list_df[setting_column].unique().shape[0] == 1: + # Add to bclconvert settings + bclconvert_settings_dict[setting_column] = bclconvert_data_list_df[setting_column].unique().item() + # Drop from data list + bclconvert_data_list_df = bclconvert_data_list_df.drop( + columns=setting_column + ) + + # Some items might be empty in the bclconvert settings dict, so drop them + for setting_name, setting_value in deepcopy(bclconvert_settings_dict).items(): + if setting_value is None or setting_value == "": + _ = bclconvert_settings_dict.pop(setting_name) + + # Drop minimum adapter overlap if no adapters are present + if bclconvert_settings_dict.get("minimum_adapter_overlap", None) is not None: + # Check settings + if ( + bclconvert_settings_dict.get("adapter_read_1", None) is None and + bclconvert_settings_dict.get("adapter_read_2", None) is None + ) and ( + "adapter_read_1" not in bclconvert_data_list_df.columns and + "adapter_read_2" not in bclconvert_data_list_df.columns + ): + logger.debug("Dropping minimum_adapter_overlap from bclconvert settings as no adapters are present") + _ = bclconvert_settings_dict.pop("minimum_adapter_overlap") + + return bclconvert_settings_dict, bclconvert_data_list_df.to_dict(orient="records") + + +def v1_samplesheet_to_json(samplesheet: SampleSheet) -> Dict: + """ + This function converts a version 1 SampleSheet object into a JSON format. + + Parameters: + samplesheet (SampleSheet): The version 1 SampleSheet object to be converted. + + Returns: + Dict: A dictionary representing the JSON format of the SampleSheet object. 
+ """ + # Get header dict + header_dict = get_samplesheet_header_dict(samplesheet) + + # Get reads dict (and add index cycles) + reads_dict = get_reads_dict(samplesheet) + + # Get bclconvert settings dict + bclconvert_settings_dict = get_bclconvert_settings_dict(samplesheet) + + # Add bclconvert settings by sample id + bclconvert_data_list = get_bclconvert_data_list(samplesheet) + + # Update bclconvert settings based on data list settings + # For now this is removing the minimum_adapter overlap if no other adapters are present + bclconvert_settings_dict, bclconvert_data_list = update_bclconvert_settings_on_data_list_settings( + bclconvert_settings_dict, + bclconvert_data_list + ) + + cloud_settings_section = { + "generated_version": "0.0.0", + "cloud_workflow": "ica_workflow_1" + } + + # Write out json + samplesheet_dict = { + "header": header_dict, + "reads": reads_dict, + "bclconvert_settings": bclconvert_settings_dict, + "bclconvert_data": bclconvert_data_list, + "cloud_settings": cloud_settings_section + } + + # Check if any cttso samples in the bclconvert data list + cttso_bclconvert_data_list = list( + filter( + lambda bclconvert_iter: bclconvert_iter.get("library_prep_kit_name", "").lower() == 'cttsov2', + bclconvert_data_list + ) + ) + if len(cttso_bclconvert_data_list) > 0: + # Has a cttso sample, lets generate the TSO500L_Settings and TSO500L_Data section - + # We don't make these 'Cloud_' settings as we don't want to run these through auto launch + # These are hardcoded + tso500l_settings = { + "adapter_read_1": "CTGTCTCTTATACACATCT", + "adapter_read_2": "CTGTCTCTTATACACATCT", + "adapter_behaviour": "trim", + "minimum_trimmed_read_length": 35, + "mask_short_reads": 35, + "override_cycles": "U7N1Y143;I10;I10;U7N1Y143" + } + tso500l_data = list( + map( + lambda bclconvert_data_item: { + "sample_id": bclconvert_data_item.get("sample_id"), + "index": bclconvert_data_item.get("index"), + "index2": bclconvert_data_item.get("index2"), + "sample_type": "DNA", + "lane": bclconvert_data_item.get("lane"), + "i7_index_id": get_cttso_i7_index_id_from_index(bclconvert_data_item.get("index")), + "i5_index_id": get_cttso_i5_index_id_from_index(bclconvert_data_item.get("index2")) + }, + cttso_bclconvert_data_list + ) + ) + + samplesheet_dict.update( + { + "tso500l_settings": tso500l_settings, + "tso500l_data": tso500l_data + } + ) + + return samplesheet_dict + + +def build_v2_samplesheet(samplesheet_json: dict) -> str: + """ + This function constructs a version 2 SampleSheet from a given JSON representation of a version 1 SampleSheet. + + Parameters: + samplesheet_json (dict): A dictionary representing the JSON format of a version 1 SampleSheet object. + + Returns: + str: A string representation of the version 2 SampleSheet. + """ + + with NamedTemporaryFile(prefix="v2_samplesheet_", suffix=".csv") as tmp_file_obj_h: + # Write to CSV + v2_samplesheet_writer( + samplesheet_json, + Path(tmp_file_obj_h.name) + ) + + # Read as str + with open(tmp_file_obj_h.name, "r") as f_h: + return f_h.read() + + +def v1_to_v2_samplesheet(samplesheet): + """ + This function converts a version 1 SampleSheet object into a version 2 SampleSheet string. + + Parameters: + samplesheet (SampleSheet): The version 1 SampleSheet object to be converted. + + Returns: + str: A string representation of the version 2 SampleSheet. 
+ """ + # Convert to samplesheet json + samplesheet_json = v1_samplesheet_to_json(samplesheet) + + # Build v2 samplesheet and convert to a string + v2_samplesheet_str = build_v2_samplesheet(samplesheet_json) + + # Return samplesheet as a string + return v2_samplesheet_str diff --git a/lib/workload/stateless/stacks/sample-sheet-check/sample-sheet-check-lambda/tests/sample/mock-1.csv b/lib/workload/stateless/stacks/sample-sheet-check/sample-sheet-check-lambda/tests/sample/mock-1.csv new file mode 100644 index 000000000..f5b08a7fd --- /dev/null +++ b/lib/workload/stateless/stacks/sample-sheet-check/sample-sheet-check-lambda/tests/sample/mock-1.csv @@ -0,0 +1,23 @@ +[Header],,,,,,,,,,, +IEMFileVersion,5,,,,,,,,,, +Experiment Name,-,,,,,,,,,, +Date,30/4/2020,,,,,,,,,, +Workflow,-,,,,,,,,,, +Application,-,,,,,,,,,, +Instrument Type,-,,,,,,,,,, +Assay,-,,,,,,,,,, +Index Adapters,-,,,,,,,,,, +Chemistry,-,,,,,,,,,, +,,,,,,,,,,, +[Reads],,,,,,,,,,, +50,,,,,,,,,,, +50,,,,,,,,,,, +,,,,,,,,,,, +[Settings],,,,,,,,,,, +Adapter,AGATCGG,,,,,,,,,, +AdapterRead2,AGATCGG,,,,,,,,,, +,,,,,,,,,,, +[Data],,,,,,,,,,, +Lane,Sample_ID,Sample_Name,Sample_Plate,Sample_Well,Index_Plate_Well,I7_Index_ID,index,I5_Index_ID,index2,Sample_Project,Description +3,MDX200001_L2000001,L0000000_topup,,,ABCD,ABCD,ABCD,ABCD,ABCD,ABCD,ABCD +1,MDX200002_L2000002,,,,ABCD,ABCD,ABCD,ABCD,ABCD,ABCD,ABCD diff --git a/lib/workload/stateless/stacks/sample-sheet-check/sample-sheet-check-lambda/tests/sample/mock-2.csv b/lib/workload/stateless/stacks/sample-sheet-check/sample-sheet-check-lambda/tests/sample/mock-2.csv new file mode 100644 index 000000000..81ed92df0 --- /dev/null +++ b/lib/workload/stateless/stacks/sample-sheet-check/sample-sheet-check-lambda/tests/sample/mock-2.csv @@ -0,0 +1,23 @@ +[Header],,,,,,,,,,, +IEMFileVersion,5,,,,,,,,,, +Experiment Name,-,,,,,,,,,, +Date,30/4/2020,,,,,,,,,, +Workflow,-,,,,,,,,,, +Application,-,,,,,,,,,, +Instrument Type,-,,,,,,,,,, +Assay,-,,,,,,,,,, +Index Adapters,-,,,,,,,,,, +Chemistry,-,,,,,,,,,, +,,,,,,,,,,, +[Reads],,,,,,,,,,, +50,,,,,,,,,,, +50,,,,,,,,,,, +,,,,,,,,,,, +[Settings],,,,,,,,,,, +Adapter,AGATCGG,,,,,,,,,, +AdapterRead2,AGATCGG,,,,,,,,,, +,,,,,,,,,,, +[Data],,,,,,,,,,, +Lane,Sample_ID,Sample_Name,Sample_Plate,Sample_Well,Index_Plate_Well,I7_Index_ID,index,I5_Index_ID,index2,Sample_Project,Description +3,MDX200001_L2000001,L0000000_topup,,,ABCD,ABCD,ABCD,ABCD,ABCD,ABCD,ABCD +1,MDX200001_L2000002,L0000000_topup,,,ABCD,ABCD,ABCD,ABCD,ABCD,ABCD,ABCD diff --git a/lib/workload/stateless/stacks/sample-sheet-check/sample-sheet-check-lambda/tests/test_sample_sheet.py b/lib/workload/stateless/stacks/sample-sheet-check/sample-sheet-check-lambda/tests/test_sample_sheet.py new file mode 100644 index 000000000..3b503de98 --- /dev/null +++ b/lib/workload/stateless/stacks/sample-sheet-check/sample-sheet-check-lambda/tests/test_sample_sheet.py @@ -0,0 +1,103 @@ +""" +Unit tests for samplesheet checker + +run: python -m unittest samplesheet/tests/test_samplesheet_check.py + +""" + +import logging +import os + +from unittest import TestCase, mock, main + +from src.checker import run_sample_sheet_check_with_metadata, run_sample_sheet_content_check +from src.samplesheet import SampleSheet +from src.errors import SampleNameFormatError +from src.errors import GetMetaDataError, SampleSheetHeaderError, SimilarIndexError, \ + MetaDataError, OverrideCyclesError + +dirname = os.path.dirname(__file__) +SAMPLE1_PATH = os.path.join(dirname, "./sample/mock-1.csv") +SAMPLE2_PATH = os.path.join(dirname, 
"./sample/mock-2.csv") + + +class TestSamplesheetCheckUnitTestCase(TestCase): + sample_sheet = None + + @classmethod + def setUpClass(cls) -> None: + logging.disable(logging.CRITICAL) + print("\n---Running sample sheet check unit tests---") + + def test_fail_sample_name_check(self): + with self.assertRaises(SampleNameFormatError): + SampleSheet(SAMPLE1_PATH) + + def test_success_sample_format(self): + sample_sheet = SampleSheet(SAMPLE2_PATH) + try: + run_sample_sheet_content_check(sample_sheet) + except Exception as e: + self.fail("Should not raise an exception", e) + + @mock.patch('src.checker.check_sample_sheet_for_index_clashes', mock.MagicMock( + side_effect=SimilarIndexError("Found at least two indexes that were too similar to each other"))) + def test_run_check_SimilarIndexError(self): + ss = SampleSheet(SAMPLE2_PATH) + with self.assertRaises(SimilarIndexError) as context: + run_sample_sheet_content_check(ss) + self.assertEqual(str(context.exception), "Found at least two indexes that were too similar to each other", + "Expected error") + + @mock.patch('src.checker.check_samplesheet_header_metadata', + mock.MagicMock( + side_effect=SampleSheetHeaderError("Samplesheet header did not have the appropriate attributes"))) + def test_run_check_SampleSheetHeaderError(self): + ss = SampleSheet(SAMPLE2_PATH) + with self.assertRaises(SampleSheetHeaderError) as context: + run_sample_sheet_content_check(ss) + self.assertEqual(str(context.exception), "Samplesheet header did not have the appropriate attributes") + + @mock.patch.object(SampleSheet, 'set_metadata_from_api', + mock.MagicMock(side_effect=GetMetaDataError("Unable to get metadata"))) + def test_run_check_GetMetaDataError(self): + ss = SampleSheet(SAMPLE2_PATH) + with self.assertRaises(GetMetaDataError) as context: + run_sample_sheet_check_with_metadata(ss, "MOCK_JWT") + self.assertEqual(str(context.exception), "Unable to get metadata") + + @mock.patch('src.checker.check_metadata_correspondence', + mock.MagicMock(side_effect=MetaDataError("Metadata could not be extracted"))) + @mock.patch.object(SampleSheet, 'set_metadata_from_api', mock.MagicMock(return_value=None)) + def test_run_check_MetaDataError(self): + + ss = SampleSheet(SAMPLE2_PATH) + with self.assertRaises(MetaDataError) as context: + run_sample_sheet_check_with_metadata(ss, "MOCK_JWT") + self.assertEqual(str(context.exception), "Metadata could not be extracted") + + @mock.patch('src.checker.check_global_override_cycles', + mock.MagicMock(side_effect=OverrideCyclesError("Override cycles check failed"))) + @mock.patch.object(SampleSheet, 'set_metadata_from_api', mock.MagicMock(return_value=None)) + @mock.patch('src.checker.check_metadata_correspondence', + mock.MagicMock(return_value=mock.MagicMock(return_value=None))) + def test_run_check_globalOverrideCyclesError(self): + + ss = SampleSheet(SAMPLE2_PATH) + with self.assertRaises(OverrideCyclesError) as context: + run_sample_sheet_check_with_metadata(ss, "MOCK_JWT") + self.assertEqual(str(context.exception), "Override cycles check failed") + + @mock.patch('src.checker.check_internal_override_cycles', + mock.MagicMock(side_effect=OverrideCyclesError("Override cycles check failed"))) + @mock.patch.object(SampleSheet, 'set_metadata_from_api', mock.MagicMock(return_value=None)) + @mock.patch('src.checker.check_metadata_correspondence', + mock.MagicMock(return_value=mock.MagicMock(return_value=None))) + @mock.patch('src.checker.check_global_override_cycles', + mock.MagicMock(return_value=mock.MagicMock(return_value=None))) + def 
+    @mock.patch('src.checker.check_internal_override_cycles',
+                mock.MagicMock(side_effect=OverrideCyclesError("Override cycles check failed")))
+    @mock.patch.object(SampleSheet, 'set_metadata_from_api', mock.MagicMock(return_value=None))
+    @mock.patch('src.checker.check_metadata_correspondence',
+                mock.MagicMock(return_value=None))
+    @mock.patch('src.checker.check_global_override_cycles',
+                mock.MagicMock(return_value=None))
+    def test_run_check_internalOverrideCyclesError(self):
+        ss = SampleSheet(SAMPLE2_PATH)
+        with self.assertRaises(OverrideCyclesError) as context:
+            run_sample_sheet_check_with_metadata(ss, "MOCK_JWT")
+        self.assertEqual(str(context.exception), "Override cycles check failed")
diff --git a/lib/workload/stateless/stacks/sample-sheet-check/stack.ts b/lib/workload/stateless/stacks/sample-sheet-check/stack.ts
new file mode 100644
index 000000000..98022ede9
--- /dev/null
+++ b/lib/workload/stateless/stacks/sample-sheet-check/stack.ts
@@ -0,0 +1,63 @@
+import { Construct } from 'constructs';
+import { Duration, Stack, StackProps } from 'aws-cdk-lib';
+import { HttpLambdaIntegration } from 'aws-cdk-lib/aws-apigatewayv2-integrations';
+import { HttpMethod, HttpRoute, HttpRouteKey } from 'aws-cdk-lib/aws-apigatewayv2';
+
+import { ApiGatewayConstruct, ApiGatewayConstructProps } from '../../../components/api-gateway';
+import path from 'path';
+import { Architecture, DockerImageCode, DockerImageFunction } from 'aws-cdk-lib/aws-lambda';
+import { StringParameter } from 'aws-cdk-lib/aws-ssm';
+import { RetentionDays } from 'aws-cdk-lib/aws-logs';
+import { Effect, PolicyStatement } from 'aws-cdk-lib/aws-iam';
+
+export interface SampleSheetCheckerStackProps {
+  /**
+   * The props for api-gateway
+   */
+  apiGatewayConstructProps: ApiGatewayConstructProps;
+}
+
+export class SampleSheetCheckerStack extends Stack {
+  constructor(scope: Construct, id: string, props: StackProps & SampleSheetCheckerStackProps) {
+    super(scope, id, props);
+
+    const apiGW = new ApiGatewayConstruct(
+      this,
+      'OrcaBusAPI-SampleSheetChecker',
+      props.apiGatewayConstructProps
+    );
+
+    const domainName = StringParameter.valueForStringParameter(this, 'umccr_domain');
+
+    const sscheckLambda = new DockerImageFunction(this, 'SSCheckLambda', {
+      code: DockerImageCode.fromImageAsset(path.join(__dirname, 'sample-sheet-check-lambda'), {
+        file: 'lambda.Dockerfile',
+      }),
+      logRetention: RetentionDays.TWO_WEEKS,
+      architecture: Architecture.ARM_64,
+      timeout: Duration.seconds(28),
+      memorySize: 1024,
+      environment: {
+        DATA_PORTAL_DOMAIN_NAME: domainName,
+      },
+      initialPolicy: [
+        // Explicitly deny writing to CloudWatch Logs, so no log output is recorded for this function
+        new PolicyStatement({
+          effect: Effect.DENY,
+          actions: ['logs:CreateLogGroup', 'logs:CreateLogStream', 'logs:PutLogEvents'],
+          resources: ['arn:aws:logs:*:*:*'],
+        }),
+      ],
+    });
+
+    // Add the Lambda integration for the HTTP API Gateway
+    const apiIntegration = new HttpLambdaIntegration('ApiLambdaIntegration', sscheckLambda);
+
+    // Route all POST requests through to the sample sheet check Lambda
+    new HttpRoute(this, 'PostHttpRoute', {
+      httpApi: apiGW.httpApi,
+      integration: apiIntegration,
+      routeKey: HttpRouteKey.with(`/{PROXY+}`, HttpMethod.POST),
+    });
+  }
+}
diff --git a/lib/workload/stateless/statelessStackCollectionClass.ts b/lib/workload/stateless/statelessStackCollectionClass.ts
index b0ac853d4..8e0d7ca7b 100644
--- a/lib/workload/stateless/statelessStackCollectionClass.ts
+++ b/lib/workload/stateless/statelessStackCollectionClass.ts
@@ -82,6 +82,10 @@ import {
 import { PgDDStack, PgDDStackProps } from './stacks/pg-dd/deploy/stack';
 import { DataMigrateStack, DataMigrateStackProps } from './stacks/data-migrate/deploy/stack';
 import { HtsgetStack, HtsgetStackConfigurableProps } from './stacks/htsget/stack';
+import {
+  SampleSheetCheckerStack,
+  SampleSheetCheckerStackProps,
+} from './stacks/sample-sheet-check/stack';

 export interface StatelessStackCollectionProps {
   metadataManagerStackProps: MetadataManagerStackProps;
@@ -109,6 +113,7 @@ export interface StatelessStackCollectionProps {
   fmAnnotatorProps: FMAnnotatorConfigurableProps;
   dataMigrateProps: DataMigrateStackProps;
   htsgetProps: HtsgetStackConfigurableProps;
+  sampleSheetCheckerProps: SampleSheetCheckerStackProps;
   pgDDProps?: PgDDStackProps;
 }

@@ -140,6 +145,7 @@ export class StatelessStackCollection {
   readonly dataMigrate: Stack;
   readonly htsgetStack: Stack;
   readonly pgDDStack: Stack;
+  readonly sampleSheetCheckerStack: Stack;

   constructor(
     scope: Construct,
@@ -328,6 +334,11 @@ export class StatelessStackCollection {
       role: fileManagerStack.role,
     });

+    this.sampleSheetCheckerStack = new SampleSheetCheckerStack(scope, 'SampleSheetCheckerStack', {
+      ...this.createTemplateProps(env, 'SampleSheetCheckerStack'),
+      ...statelessConfiguration.sampleSheetCheckerProps,
+    });
+
     if (statelessConfiguration.pgDDProps) {
       this.pgDDStack = new PgDDStack(scope, 'PgDDStack', {
         ...this.createTemplateProps(env, 'PgDDStack'),
diff --git a/test/stateless/deployment.test.ts b/test/stateless/deployment.test.ts
index 95e275450..eddc5ead1 100644
--- a/test/stateless/deployment.test.ts
+++ b/test/stateless/deployment.test.ts
@@ -138,6 +138,21 @@ function applyNagSuppression(stackId: string, stack: Stack) {
   // for each stack specific
   switch (stackId) {
+    case 'SampleSheetCheckerStack':
+      NagSuppressions.addResourceSuppressionsByPath(
+        stack,
+        `/SampleSheetCheckerStack/LogRetentionaae0aa3c5b4d4f87b02d85b201efdd8a/ServiceRole/DefaultPolicy/Resource`,
+        [
+          {
+            id: 'AwsSolutions-IAM5',
+            reason: 'Used to deny sending logs to CloudWatch Logs.',
+          },
+        ],
+        true
+      );
+
+      break;
+
     case 'MetadataManagerStack':
       NagSuppressions.addResourceSuppressionsByPath(
         stack,