From 6be8ca288e8a8df1ff97102178d05f02e3964cf3 Mon Sep 17 00:00:00 2001
From: kedhammar
Date: Wed, 11 Sep 2024 16:12:42 +0200
Subject: [PATCH] handle special idxs

---
Review notes (this section is ignored by `git am`):
- Line breaks and indentation were reconstructed from the hunk line counts
  and Python block structure; verify context-line whitespace against the
  target file (or apply with `git apply --ignore-whitespace`).
- 10X dual-index branch: TENX_DUAL_PAT has no capturing group, so
  findall(label)[0] is the matched label string and [0][0] / [0][1] were its
  first two characters. The subscript now applies to the looked-up entry,
  which is assumed to be an (i7, i5) pair -- confirm against
  data/Chromium_10X_indexes.
- Ordinary-index branch: IDX_PAT has two capturing groups, so findall()
  yields (idx1, idx2) tuples; the `"-" in ...` / `.split("-")` path could
  never work and is replaced by unpacking the groups.
- NOTE(review): revcomp() is called but is neither defined nor imported in
  this patch -- confirm it already exists in the target file.
- The post-image blob id on the `index` line predates these corrections.

 scripts/generate_aviti_run_manifest.py | 102 ++++++++++++++++++-------
 1 file changed, 74 insertions(+), 28 deletions(-)

diff --git a/scripts/generate_aviti_run_manifest.py b/scripts/generate_aviti_run_manifest.py
index 66414773..7b28338f 100644
--- a/scripts/generate_aviti_run_manifest.py
+++ b/scripts/generate_aviti_run_manifest.py
@@ -1,5 +1,6 @@
 #!/usr/bin/env python
 
+import json
 import logging
 import os
 import re
@@ -13,12 +14,18 @@
 from genologics.lims import Lims
 from Levenshtein import hamming as distance
 
+from data.Chromium_10X_indexes import Chromium_10X_indexes
 from scilifelab_epps.epp import upload_file
 from scilifelab_epps.wrapper import epp_decorator
 from scripts.generate_minknow_samplesheet import get_pool_sample_label_mapping
 
 TIMESTAMP = dt.now().strftime("%y%m%d_%H%M%S")
-LABEL_SEQ_SUBSTRING = re.compile(r"[ACGT]{4,}(-[ACGT]{4,})?")
+
+# Pre-compile regexes in global scope:
+IDX_PAT = re.compile("([ATCG]{4,}N*)-?([ATCG]*)")
+TENX_SINGLE_PAT = re.compile("SI-(?:GA|NA)-[A-H][1-9][0-2]?")
+TENX_DUAL_PAT = re.compile("SI-(?:TT|NT|NN|TN|TS)-[A-H][1-9][0-2]?")
+SMARTSEQ_PAT = re.compile("SMARTSEQ[1-9]?-[1-9][0-9]?[A-P]")
 
 # Set up Element PhiX control sets, keys are options in LIMS dropdown UDF
 PHIX_SETS = {
@@ -51,6 +58,13 @@
     },
 }
 
+# Load SS3 indexes
+SMARTSEQ3_indexes_json = (
+    "/opt/gls/clarity/users/glsai/repos/scilifelab_epps/data/SMARTSEQ3_indexes.json"
+)
+with open(SMARTSEQ3_indexes_json) as file:
+    SMARTSEQ3_indexes = json.loads(file.read())
+
 
 def get_flowcell_id(process: Process) -> str:
     flowcell_ids = [
@@ -105,6 +119,47 @@ def get_settings_section() -> str:
     return settings_section
 
 
+def idxs_from_label(label: str) -> list[str | tuple[str, str]]:
+    """From a LIMS reagent label, return list whose elements are
+    single indices or tuples of dual index pairs.
+    """
+
+    # Initialize result
+    idxs = []
+
+    # Expand 10X single indexes
+    if TENX_SINGLE_PAT.findall(label):
+        for tenXidx in Chromium_10X_indexes[TENX_SINGLE_PAT.findall(label)[0]]:
+            idxs.append(tenXidx)
+    # Case of 10X dual indexes
+    elif TENX_DUAL_PAT.findall(label):
+        i7_idx = Chromium_10X_indexes[TENX_DUAL_PAT.findall(label)[0]][0]
+        i5_idx = Chromium_10X_indexes[TENX_DUAL_PAT.findall(label)[0]][1]
+        idxs.append((i7_idx, revcomp(i5_idx)))
+    # Case of SS3 indexes
+    elif SMARTSEQ_PAT.findall(label):
+        for i7_idx in SMARTSEQ3_indexes[label][0]:
+            for i5_idx in SMARTSEQ3_indexes[label][1]:
+                idxs.append((i7_idx, revcomp(i5_idx)))
+    # NoIndex cases
+    elif label.replace(",", "").upper() == "NOINDEX" or (
+        label.replace(",", "").upper() == ""
+    ):
+        raise AssertionError("NoIndex cases not allowed.")
+    # Ordinary indexes
+    elif IDX_PAT.findall(label):
+        # IDX_PAT has two groups, so findall() yields (idx1, idx2) tuples
+        idx1, idx2 = IDX_PAT.findall(label)[0]
+        if idx2:
+            idxs.append((idx1, idx2))
+        else:
+            idxs.append(idx1)
+    else:
+        raise AssertionError(f"Could not parse index from '{label}'.")
+
+    return idxs
+
+
 def get_samples_section(process: Process) -> str:
     """Generate the [SAMPLES] section of the AVITI run manifest and return it as a string."""
 
@@ -134,28 +189,13 @@ def get_samples_section(process: Process) -> str:
         ), "Unequal number of samples and reagent labels."
 
         sample2label: dict[str, str] = get_pool_sample_label_mapping(art_out)
-        samples = art_out.samples
-        labels = art_out.reagent_labels
-
-        assert len(set(labels)) == len(labels), "Detected non-unique reagent labels."
+        assert len(set(art_out.reagent_labels)) == len(
+            art_out.reagent_labels
+        ), "Detected non-unique reagent labels."
+        samples = art_out.samples
 
         # Iterate over samples
         for sample in samples:
-            lims_label = sample2label[sample.name]
-
-            # Parse sample index
-            label_seq_match = re.search(LABEL_SEQ_SUBSTRING, lims_label)
-            assert (
-                label_seq_match is not None
-            ), f"Could not parse label sequence from {lims_label}"
-            label_seq = label_seq_match.group(0)
-
-            if "-" in label_seq:
-                index1, index2 = label_seq.split("-")
-            else:
-                index1 = label_seq
-                index2 = ""
-
             # Project name and sequencing setup
             if sample.project:
                 project = sample.project.name.replace(".", "__").replace(",", "")
@@ -164,15 +204,21 @@ def get_samples_section(process: Process) -> str:
                 project = "Control"
                 seq_setup = "0-0"
 
-            row = {}
-            row["SampleName"] = sample.name
-            row["Index1"] = index1
-            row["Index2"] = index2
-            row["Lane"] = lane
-            row["Project"] = project
-            row["Recipe"] = seq_setup
+            # Add row(s), depending on index type
+            lims_label = sample2label[sample.name]
+            for idx in idxs_from_label(lims_label):
+                row = {}
+                row["SampleName"] = sample.name
+                if isinstance(idx, tuple):
+                    row["Index1"], row["Index2"] = idx
+                else:
+                    row["Index1"] = idx
+                    row["Index2"] = ""
+                row["Lane"] = lane
+                row["Project"] = project
+                row["Recipe"] = seq_setup
 
-            lane_rows.append(row)
+                lane_rows.append(row)
 
         # Add PhiX controls if added:
         phix_loaded: bool = art_out.udf["% phiX"] != 0