From c217533991f1ef0d95f6c21eb0f0daaaedd682f1 Mon Sep 17 00:00:00 2001 From: Robert Ernst Date: Thu, 18 Jan 2024 17:37:34 +0100 Subject: [PATCH 01/30] WIP, new samplesheet writer. --- clarity_epp.py | 10 +- clarity_epp/export/illumina.py | 256 +++++++++++++++++++++++++++++++++ config.py | 19 +++ 3 files changed, 278 insertions(+), 7 deletions(-) diff --git a/clarity_epp.py b/clarity_epp.py index e0ffd81..1c89751 100755 --- a/clarity_epp.py +++ b/clarity_epp.py @@ -48,8 +48,8 @@ def export_hamilton(args): def export_illumina(args): """Export (updated) illumina samplesheet.""" - clarity_epp.export.illumina.update_samplesheet( - lims, args.process_id, args.artifact_id, args.output_file, args.conversion_tool + clarity_epp.export.illumina.create_samplesheet( + lims, args.process_id, args.output_file ) @@ -287,13 +287,9 @@ def placement_pipetting(args): parser_export_hamilton.set_defaults(func=export_hamilton) parser_export_illumina = subparser_export.add_parser( - 'illumina', help='Export updated illumina samplesheet', parents=[output_parser] + 'illumina', help='Export illumina samplesheet', parents=[output_parser] ) parser_export_illumina.add_argument('process_id', help='Clarity lims process id') - parser_export_illumina.add_argument('artifact_id', help='Clarity lims samplesheet artifact id') - parser_export_illumina.add_argument( - '-c', '--conversion_tool', choices=['bcl2fastq', 'bclconvert'], default='bcl2fastq', help='Illumina conversion tool' - ) parser_export_illumina.set_defaults(func=export_illumina) parser_export_labels = subparser_export.add_parser('labels', help='Export container labels', parents=[output_parser]) diff --git a/clarity_epp/export/illumina.py b/clarity_epp/export/illumina.py index 11aff7c..a0d48c1 100644 --- a/clarity_epp/export/illumina.py +++ b/clarity_epp/export/illumina.py @@ -10,6 +10,262 @@ import config +def get_project(projects, urgent=False): + """Get a project name for sample.""" + if urgent: # Sort projects for urgent samples on name + projects_sorted = sorted(projects.items(), key=operator.itemgetter(0)) + for project in projects_sorted: + if project[1] < 9: + return project[0] # return first project with < 9 samples + + # Sort projects on number of samples, if not urgent or no projects left with <9 samples + projects_sorted = sorted(projects.items(), key=operator.itemgetter(1)) + return projects_sorted[0][0] # return project with least amount of samples. + + +def get_override_cycles(read_len, umi_len, index_len, max_index_len): + """Get override cycles per sample.""" + # TODO: Adjust for ortientation on index 2 + + # Read cycles, Trim last base from read cycles + read_1_cycle = f'Y{read_len[0]-1}N1' + read_2_cycle = f'Y{read_len[1]-1}N1' + + # Adjust read cycles if umi present + if umi_len[0]: + read_1_cycle = f'U{umi_len[0]}Y{read_len[0]-1-umi_len[0]}N1' + if umi_len[1]: + read_2_cycle = f'U{umi_len[1]}Y{read_len[1]-1-umi_len[1]}N1' + + # Index cycles + index_1_cycle = f'I{index_len[0]}' + index_2_cycle = f'I{index_len[1]}' + + # Adjust if index length is shorter than max index length + if index_len[0] < max_index_len[0]: + n_bases = max_index_len[0] - index_len[0] + index_1_cycle = f'I{index_len[0]}N{n_bases}' + + if index_len[1] < max_index_len[1]: + n_bases = max_index_len[1] - index_len[1] + index_2_cycle = f'I{index_len[1]}N{n_bases}' + + override_cycles = ';'.join([ + read_1_cycle, # read 1 + index_1_cycle, # index 1 + index_2_cycle, # index 2 + read_2_cycle, # read 2 + ]) + + return override_cycles + + +def parse_sample_artifacts(sample_artifacts, process): + families = {} + samplesheet_samples = {} + + for sample_artifact in sample_artifacts: + # Find sample artifact index, expected pattern = " (index1-index2)" + sample_index = re.search(r".*\(([ACTGN]+)-([ACTGN]+)\)$", sample_artifact.reagent_labels[0]) + sample_sequence_name = get_sequence_name(sample_artifact) + + for sample in sample_artifact.samples: + # Dx production sample + if ( + 'Dx Familienummer' in list(sample.udf) and + 'Dx NICU Spoed' in list(sample.udf) and + 'Dx Protocolomschrijving' in list(sample.udf) and + 'Dx Stoftest code' in list(sample.udf) + ): + # Skip Mengfractie samples + if sample.udf['Dx Stoftest code'] == config.stoftestcode_wes_duplo: + continue + + # Get sample conversion_settings + sample_conversion_setting = config.conversion_settings['default'] + newest_protocol = sample.udf['Dx Protocolomschrijving'].split(';')[0] + for protocol_code in config.conversion_settings: + if protocol_code in newest_protocol: + sample_conversion_setting = config.conversion_settings[protocol_code] + break + + # Get sample override cycles + sample_override_cycles = get_override_cycles( + read_len=[process.udf['Read 1 Cycles'], process.udf['Read 2 Cycles']], + umi_len=sample_conversion_setting['umi_len'], + trim_last_base=True, + index_len=[len(sample_index.group(1)), len(sample_index.group(2))], + max_index_len=[process.udf['Index Read 1'], process.udf['Index Read 2']] + ) + + # Set family and create if not exist + family = sample.udf['Dx Familienummer'] + if family not in families: + families[family] = { + 'samples': [], + 'NICU': False, + 'project_type': sample_conversion_setting['project'], + 'split_project_type': sample_conversion_setting['split_project'], + 'urgent': False, + 'deviating': False # merge, deep sequencing (5x), etc samples + } + + # Update family information + if sample.udf['Dx Onderzoeksreden'] == 'Research': # Dx research sample + for onderzoeksindicatie in config.research_onderzoeksindicatie_project: + if sample.udf['Dx Onderzoeksindicatie'] == onderzoeksindicatie: + project_type = config.research_onderzoeksindicatie_project[onderzoeksindicatie] + families[family]['project_type'] = project_type + families[family]['split_project_type'] = False + break + + else: # Dx clinic sample + if sample.udf['Dx NICU Spoed']: + families[family]['NICU'] = True + families[family]['project_type'] = 'NICU_{0}'.format(sample.udf['Dx Familienummer']) + families[family]['split_project_type'] = False + + # Set urgent status + if 'Dx Spoed' in list(sample.udf) and sample.udf['Dx Spoed']: + families[family]['urgent'] = True + + # Set deviating status, remove urgent status if deviating + if ( + ('Dx Mergen' in list(sample.udf) and sample.udf['Dx Mergen']) or + ('Dx Exoomequivalent' in list(sample.udf) and sample.udf['Dx Exoomequivalent'] > 1) + ): + families[family]['deviating'] = True + families[family]['urgent'] = False + + else: # Other samples + # Use project name as family name and Remove 'dx' (ignore case) and strip leading space or _ + family = re.sub('^dx[ _]*', '', sample.project.name, flags=re.IGNORECASE) + if family not in families: + families[family] = { + 'samples': [], + 'NICU': False, + 'project_type': family, + 'split_project_type': False, + 'urgent': False, + 'deviating': False + } + + # Setup override cycles + if 'Dx Override Cycles' in list(sample.udf) and sample.udf['Dx Override Cycles']: + sample_override_cycles = sample.udf['Dx Override Cycles'] + else: + sample_override_cycles = get_override_cycles( + read_len=[process.udf['Read 1 Cycles'], process.udf['Read 2 Cycles']], + umi_len=config.conversion_settings['default']['umi_len'], + trim_last_base=config.conversion_settings['default']['trim_last_base'], + index_len=[len(sample_index.group(1)), len(sample_index.group(2))], + max_index_len=[process.udf['Index Read 1'], process.udf['Index Read 2']] + ) + + # Add sample to samplesheet_samples + samplesheet_samples[sample_sequence_name] = { + 'index_1': sample_index.group(1), + 'index_2': sample_index.group(2), + 'override_cycles': sample_override_cycles, + } + + # Add sample to family + if sample_sequence_name not in families[family]['samples']: + families[family]['samples'].append(sample_sequence_name) + + # Get all project types and count samples + project_types = {} + for family in families.values(): + if family['project_type'] in project_types: + project_types[family['project_type']]['sample_count'] += len(family['samples']) + else: + project_types[family['project_type']] = { + 'sample_count': len(family['samples']), + 'projects': {}, + 'split_project_type': family['split_project_type'] + } + + # Define projects per project_type + for project_type in project_types: + project_types[project_type]['index'] = 0 + if project_types[project_type]['split_project_type']: + for i in range(0, int(project_types[project_type]['sample_count']/9+1)): + project_types[project_type]['projects']['{0}_{1}'.format(project_type, i+1)] = 0 + else: + project_types[project_type]['projects'][project_type] = 0 + + # Set sample projects + # Urgent families / samples, skip deviating + for family in [family for family in families.values() if family['urgent'] and not family['deviating']]: + family_project = get_project(project_types[family['project_type']]['projects'], urgent=True) + for sample_sequence_name in family['samples']: + samplesheet_samples[sample_sequence_name]['project'] = family_project + project_types[family['project_type']]['projects'][family_project] += 1 + + # Deviating families / samples + for family in [family for family in families.values() if family['deviating']]: + family_project = get_project(project_types[family['project_type']]['projects']) + for sample_sequence_name in family['samples']: + samplesheet_samples[sample_sequence_name]['project'] = family_project + project_types[family['project_type']]['projects'][family_project] += 1 + + # Non urgent and non deviating families / samples + normal_families = [family for family in families.values() if not family['urgent'] and not family['deviating']] + for family in sorted(normal_families, key=lambda fam: (len(fam['samples'])), reverse=True): + family_project = get_project(project_types[family['project_type']]['projects']) + for sample_sequence_name in family['samples']: + samplesheet_samples[sample_sequence_name]['project'] = family_project + project_types[family['project_type']]['projects'][family_project] += 1 + + return samplesheet_samples + + +def create_samplesheet(lims, process_id, output_file): + """Create illumina samplesheet v2.""" + # Default trim last base + + process = Process(lims, id=process_id) + sample_artifacts = get_sample_artifacts_from_pool(lims, process.analytes()[0][0]) + samplesheet_samples = parse_sample_artifacts(sample_artifacts, process) + + # Create SampleSheet + # TODO: Add orientation support for index 2 + # TODO: Compare with novaseq 6000 samplesheets + sample_sheet = [] + + # Header + sample_sheet.append('[Header]') + sample_sheet.append('FileFormatVersion,2') + + # Reads + sample_sheet.append('[Reads]') + sample_sheet.append('Read1Cycles,{0}'.format(process.udf['Read 1 Cycles'])) + sample_sheet.append('Read2Cycles,{0}'.format(process.udf['Read 2 Cycles'])) + + # BCLConvert_Settings + sample_sheet.append('[BCLConvert_Settings]') + sample_sheet.append('AdapterRead1,{0}'.format(process.udf['Adapter'])) + sample_sheet.append('AdapterRead2,{0}'.format(process.udf['Adapter Read 2'])) + sample_sheet.append('FindAdaptersWithIndels,true') + + # BCLConvert_Data + sample_sheet.append('[BCLConvert_Data]') + sample_sheet.append('Sample_ID,index,index2,OverrideCycles,Sample_Project') + + for sample in samplesheet_samples: + sample_sheet.append( + '{sample_name},{index_1},{index_2},{override_cycles},{project}'.format( + sample_name=sample, + index_1=samplesheet_samples[sample]['index_1'], + index_2=samplesheet_samples[sample]['index_2'], + override_cycles=samplesheet_samples[sample]['override_cycles'], + project=samplesheet_samples[sample]['project'] + ) + ) + + output_file.write('\n'.join(sample_sheet)) + + def update_samplesheet(lims, process_id, artifact_id, output_file, conversion_tool): """Update illumina samplesheet.""" process = Process(lims, id=process_id) diff --git a/config.py b/config.py index c002932..97b13ed 100755 --- a/config.py +++ b/config.py @@ -87,6 +87,25 @@ 'AUTOMATED - NovaSeq Run (NovaSeq 6000 v3.1)', ] +# BCLConvert conversion settings +conversion_settings = { + 'default': { + 'project': 'unknown', + 'split_project': False, + 'umi_len': [0, 0], + }, + 'elidS34226467': { + 'project': 'CREv4', + 'split_project': True, + 'umi_len': [5, 5], + }, + 'elidS31285117': { + 'project': 'SSv7', + 'split_project': True, + 'umi_len': [5, 5], + }, +} + # Post sequencing workflow sequencing_workflow = '1701' # DEV Dx Illumina Sequencing v1.2 post_sequencing_workflow = '1204' # DEV Dx Bioinformatica analyses v1.1 From 824434fa94147fb09479c489402c3654ba016a6f Mon Sep 17 00:00:00 2001 From: Robert Ernst Date: Fri, 19 Jan 2024 15:16:11 +0100 Subject: [PATCH 02/30] Add orientation fixes --- clarity_epp/export/illumina.py | 72 ++++++++++++++++++++++------------ 1 file changed, 47 insertions(+), 25 deletions(-) diff --git a/clarity_epp/export/illumina.py b/clarity_epp/export/illumina.py index a0d48c1..ee82063 100644 --- a/clarity_epp/export/illumina.py +++ b/clarity_epp/export/illumina.py @@ -23,9 +23,8 @@ def get_project(projects, urgent=False): return projects_sorted[0][0] # return project with least amount of samples. -def get_override_cycles(read_len, umi_len, index_len, max_index_len): +def get_override_cycles(read_len, umi_len, index_len, max_index_len, index_2_orientation): """Get override cycles per sample.""" - # TODO: Adjust for ortientation on index 2 # Read cycles, Trim last base from read cycles read_1_cycle = f'Y{read_len[0]-1}N1' @@ -48,7 +47,10 @@ def get_override_cycles(read_len, umi_len, index_len, max_index_len): if index_len[1] < max_index_len[1]: n_bases = max_index_len[1] - index_len[1] - index_2_cycle = f'I{index_len[1]}N{n_bases}' + if index_2_orientation == 'RC': + index_2_cycle = f'I{index_len[1]}N{n_bases}' + else: # index_2_orientation == 'F + index_2_cycle = f'N{n_bases}I{index_len[1]}' override_cycles = ';'.join([ read_1_cycle, # read 1 @@ -60,7 +62,7 @@ def get_override_cycles(read_len, umi_len, index_len, max_index_len): return override_cycles -def parse_sample_artifacts(sample_artifacts, process): +def get_samplesheet_samples(sample_artifacts, process, index_2_orientation): families = {} samplesheet_samples = {} @@ -93,9 +95,9 @@ def parse_sample_artifacts(sample_artifacts, process): sample_override_cycles = get_override_cycles( read_len=[process.udf['Read 1 Cycles'], process.udf['Read 2 Cycles']], umi_len=sample_conversion_setting['umi_len'], - trim_last_base=True, index_len=[len(sample_index.group(1)), len(sample_index.group(2))], - max_index_len=[process.udf['Index Read 1'], process.udf['Index Read 2']] + max_index_len=[process.udf['Index Read 1'], process.udf['Index Read 2']], + index_2_orientation=index_2_orientation ) # Set family and create if not exist @@ -157,9 +159,9 @@ def parse_sample_artifacts(sample_artifacts, process): sample_override_cycles = get_override_cycles( read_len=[process.udf['Read 1 Cycles'], process.udf['Read 2 Cycles']], umi_len=config.conversion_settings['default']['umi_len'], - trim_last_base=config.conversion_settings['default']['trim_last_base'], index_len=[len(sample_index.group(1)), len(sample_index.group(2))], - max_index_len=[process.udf['Index Read 1'], process.udf['Index Read 2']] + max_index_len=[process.udf['Index Read 1'], process.udf['Index Read 2']], + index_2_orientation=index_2_orientation ) # Add sample to samplesheet_samples @@ -168,6 +170,10 @@ def parse_sample_artifacts(sample_artifacts, process): 'index_2': sample_index.group(2), 'override_cycles': sample_override_cycles, } + if index_2_orientation == 'RC': # Reverse complement index 2 + samplesheet_samples[sample_sequence_name]['index_2'] = clarity_epp.export.utils.reverse_complement( + samplesheet_samples[sample_sequence_name]['index_2'] + ) # Add sample to family if sample_sequence_name not in families[family]['samples']: @@ -222,15 +228,16 @@ def parse_sample_artifacts(sample_artifacts, process): def create_samplesheet(lims, process_id, output_file): """Create illumina samplesheet v2.""" - # Default trim last base - process = Process(lims, id=process_id) - sample_artifacts = get_sample_artifacts_from_pool(lims, process.analytes()[0][0]) - samplesheet_samples = parse_sample_artifacts(sample_artifacts, process) + index_2_orientation = config.index_2_orientation[process.type.name] + + # Get samples samples per lane + samplesheet_samples = [] + for lane in process.analytes()[0]: + sample_artifacts = get_sample_artifacts_from_pool(lims, process.analytes()[0][0]) + samplesheet_samples.append(get_samplesheet_samples(sample_artifacts, process, index_2_orientation)) # Create SampleSheet - # TODO: Add orientation support for index 2 - # TODO: Compare with novaseq 6000 samplesheets sample_sheet = [] # Header @@ -250,18 +257,33 @@ def create_samplesheet(lims, process_id, output_file): # BCLConvert_Data sample_sheet.append('[BCLConvert_Data]') - sample_sheet.append('Sample_ID,index,index2,OverrideCycles,Sample_Project') - - for sample in samplesheet_samples: - sample_sheet.append( - '{sample_name},{index_1},{index_2},{override_cycles},{project}'.format( - sample_name=sample, - index_1=samplesheet_samples[sample]['index_1'], - index_2=samplesheet_samples[sample]['index_2'], - override_cycles=samplesheet_samples[sample]['override_cycles'], - project=samplesheet_samples[sample]['project'] + if len(samplesheet_samples) == 1: # All samples on all lanes + lane = 0 + sample_sheet.append('Sample_ID,index,index2,OverrideCycles,Sample_Project') + for sample in samplesheet_samples[lane]: + sample_sheet.append( + '{sample_name},{index_1},{index_2},{override_cycles},{project}'.format( + sample_name=sample, + index_1=samplesheet_samples[lane][sample]['index_1'], + index_2=samplesheet_samples[lane][sample]['index_2'], + override_cycles=samplesheet_samples[lane][sample]['override_cycles'], + project=samplesheet_samples[lane][sample]['project'] + ) ) - ) + else: # Samples divided over lanes + sample_sheet.append('Lane,Sample_ID,index,index2,OverrideCycles,Sample_Project') + for lane, lane_samples in enumerate(samplesheet_samples): + for sample in lane_samples: + sample_sheet.append( + '{lane},{sample_name},{index_1},{index_2},{override_cycles},{project}'.format( + lane=lane+1, + sample_name=sample, + index_1=samplesheet_samples[lane][sample]['index_1'], + index_2=samplesheet_samples[lane][sample]['index_2'], + override_cycles=samplesheet_samples[lane][sample]['override_cycles'], + project=samplesheet_samples[lane][sample]['project'] + ) + ) output_file.write('\n'.join(sample_sheet)) From 18d322c3873c2ea72aca790a5596ffe53e47b8aa Mon Sep 17 00:00:00 2001 From: Robert Ernst Date: Fri, 19 Jan 2024 15:17:27 +0100 Subject: [PATCH 03/30] Add orientation fixes --- config.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/config.py b/config.py index 97b13ed..7553e7b 100755 --- a/config.py +++ b/config.py @@ -88,6 +88,12 @@ ] # BCLConvert conversion settings +index_2_orientation = { + # Orientation options: F=forward or RC=reverse complement + # https://knowledge.illumina.com/software/general/software-general-reference_material-list/000001800 + 'Dx Library pool denatureren en laden (NovaSeq) v1.3': 'RC', + 'Dx Library pool denatureren en laden (NovaSeqXPlus) v1.0': 'F', +} conversion_settings = { 'default': { 'project': 'unknown', From e816fc23c3ce17e04867531b484031302b1cafdd Mon Sep 17 00:00:00 2001 From: Robert Ernst Date: Fri, 19 Jan 2024 15:18:16 +0100 Subject: [PATCH 04/30] Remove old update samplesheet code --- clarity_epp/export/illumina.py | 286 +-------------------------------- 1 file changed, 1 insertion(+), 285 deletions(-) diff --git a/clarity_epp/export/illumina.py b/clarity_epp/export/illumina.py index ee82063..549a8e3 100644 --- a/clarity_epp/export/illumina.py +++ b/clarity_epp/export/illumina.py @@ -1,9 +1,8 @@ """Illumina export functions.""" import operator import re -import csv -from genologics.entities import Process, Artifact +from genologics.entities import Process from .. import get_sequence_name, get_sample_artifacts_from_pool import clarity_epp.export.utils @@ -286,286 +285,3 @@ def create_samplesheet(lims, process_id, output_file): ) output_file.write('\n'.join(sample_sheet)) - - -def update_samplesheet(lims, process_id, artifact_id, output_file, conversion_tool): - """Update illumina samplesheet.""" - process = Process(lims, id=process_id) - trim_last_base = True # Used to set Read1EndWithCycle - - def get_project(projects, urgent=False): - """Inner function to get a project name for samples.""" - if urgent: # Sort projects for urgent samples on name - projects_sorted = sorted(projects.items(), key=operator.itemgetter(0)) - for project in projects_sorted: - if project[1] < 9: - return project[0] # return first project with < 9 samples - - # Sort projects on number of samples, if not urgent or no projects left with <9 samples - projects_sorted = sorted(projects.items(), key=operator.itemgetter(1)) - return projects_sorted[0][0] # return project with least amount of samples. - - # Parse families - families = {} - sample_artifacts = get_sample_artifacts_from_pool(lims, process.analytes()[0][0]) - - for sample_artifact in sample_artifacts: - for sample in sample_artifact.samples: - if ( - 'Dx Familienummer' in list(sample.udf) and - 'Dx NICU Spoed' in list(sample.udf) and - 'Dx Protocolomschrijving' in list(sample.udf) - ): - # Dx production sample - family = sample.udf['Dx Familienummer'] - - # Create family if not exist - if family not in families: - families[family] = { - 'samples': [], - 'NICU': False, - 'project_type': 'unknown_project', - 'split_project_type': False, - 'urgent': False, - 'deviating': False # merge, deep sequencing (5x), etc samples - } - - # Update family information - if sample.udf['Dx Onderzoeksreden'] == 'Research': # Dx research sample - for onderzoeksindicatie in config.research_onderzoeksindicatie_project: - if sample.udf['Dx Onderzoeksindicatie'] == onderzoeksindicatie: - project_type = config.research_onderzoeksindicatie_project[onderzoeksindicatie] - families[family]['project_type'] = project_type - families[family]['split_project_type'] = False - break - - else: # Dx clinic sample - newest_protocol = sample.udf['Dx Protocolomschrijving'].split(';')[0] - if 'SNP fingerprint MIP' in newest_protocol and not families[family]['NICU']: - project_type = 'Fingerprint' - families[family]['project_type'] = project_type - families[family]['split_project_type'] = False - trim_last_base = False - elif 'PID09.V7_smMIP' in newest_protocol and not families[family]['NICU']: - project_type = 'ERARE' - families[family]['project_type'] = project_type - families[family]['split_project_type'] = False - trim_last_base = False - elif sample.udf['Dx NICU Spoed']: - families[family]['NICU'] = True - project_type = 'NICU_{0}'.format(sample.udf['Dx Familienummer']) - families[family]['project_type'] = project_type - families[family]['split_project_type'] = False - elif 'elidS30409818' in newest_protocol and not families[family]['NICU']: - project_type = 'CREv2' - families[family]['project_type'] = project_type - families[family]['split_project_type'] = True - elif 'elidS31285117' in newest_protocol and not families[family]['NICU']: - project_type = 'SSv7' - families[family]['project_type'] = project_type - families[family]['split_project_type'] = True - elif 'elidS34226467' in newest_protocol and not families[family]['NICU']: - project_type = 'CREv4' - families[family]['project_type'] = project_type - families[family]['split_project_type'] = True - - # Set urgent status - if 'Dx Spoed' in list(sample.udf) and sample.udf['Dx Spoed']: - families[family]['urgent'] = True - - # Set deviating status, remove urgent status if deviating - if ( - ('Dx Mergen' in list(sample.udf) and sample.udf['Dx Mergen']) or - ('Dx Exoomequivalent' in list(sample.udf) and sample.udf['Dx Exoomequivalent'] > 1) - ): - families[family]['deviating'] = True - families[family]['urgent'] = False - - else: # Other samples - if 'GIAB' in sample.name.upper() and not sample.project: # GIAB control samples - family = 'GIAB' - else: - family = sample.project.name - # Remove 'dx' (ignore case) and strip leading space or _ - family = re.sub('^dx[ _]*', '', family, flags=re.IGNORECASE) - if family not in families: - families[family] = { - 'samples': [], - 'NICU': False, - 'project_type': family, - 'split_project_type': False, - 'urgent': False, - 'deviating': False - } - - # Add sample_artifact to family - if sample_artifact not in families[family]['samples']: - families[family]['samples'].append(sample_artifact) - - # Get all project types and count samples - project_types = {} - for family in families.values(): - if family['project_type'] in project_types: - project_types[family['project_type']]['sample_count'] += len(family['samples']) - else: - project_types[family['project_type']] = { - 'sample_count': len(family['samples']), - 'projects': {}, - 'split_project_type': family['split_project_type'] - } - - # Define projects per project_type - for project_type in project_types: - project_types[project_type]['index'] = 0 - if project_types[project_type]['split_project_type']: - for i in range(0, int(project_types[project_type]['sample_count']/9+1)): - project_types[project_type]['projects']['{0}_{1}'.format(project_type, i+1)] = 0 - else: - project_types[project_type]['projects'][project_type] = 0 - - # Set sample projects - sample_projects = {} - sample_sequence_names = {} - - # Urgent families / samples, skip deviating - for family in [family for family in families.values() if family['urgent'] and not family['deviating']]: - family_project = get_project(project_types[family['project_type']]['projects'], urgent=True) - for sample_artifact in family['samples']: - sample_sequence_name = get_sequence_name(sample_artifact) - for sample in sample_artifact.samples: - sample_sequence_names[sample.name] = sample_sequence_name - sample_projects[sample_sequence_name] = family_project - project_types[family['project_type']]['projects'][family_project] += 1 - - # Deviating families / samples - for family in [family for family in families.values() if family['deviating']]: - family_project = get_project(project_types[family['project_type']]['projects']) - for sample_artifact in family['samples']: - sample_sequence_name = get_sequence_name(sample_artifact) - for sample in sample_artifact.samples: - sample_sequence_names[sample.name] = sample_sequence_name - sample_projects[sample_sequence_name] = family_project - project_types[family['project_type']]['projects'][family_project] += 1 - - # Non urgent and non deviating families / samples - normal_families = [family for family in families.values() if not family['urgent'] and not family['deviating']] - for family in sorted(normal_families, key=lambda fam: (len(fam['samples'])), reverse=True): - family_project = get_project(project_types[family['project_type']]['projects']) - for sample_artifact in family['samples']: - sample_sequence_name = get_sequence_name(sample_artifact) - for sample in sample_artifact.samples: - sample_sequence_names[sample.name] = sample_sequence_name - sample_projects[sample_sequence_name] = family_project - project_types[family['project_type']]['projects'][family_project] += 1 - - # Check sequencer type - # NextSeq runs need to reverse complement 'index2' for dual barcodes and 'index' for single barcodes. - if 'nextseq' in process.type.name.lower(): - nextseq_run = True - else: - nextseq_run = False - - # Edit clarity samplesheet - sample_header = '' # empty until [data] section - settings_section = False - samplesheet_artifact = Artifact(lims, id=artifact_id) - file_id = samplesheet_artifact.files[0].id - - # Setup custom settings - custom_settings = '' - - if conversion_tool == 'bcl2fastq' and trim_last_base: - custom_settings = ( - 'Read1EndWithCycle,{read_1_value}\n' - 'Read2EndWithCycle,{read_2_value}\n' - ).format( - read_1_value=process.udf['Read 1 Cycles']-1, read_2_value=process.udf['Read 2 Cycles']-1 - ) - - elif conversion_tool == 'bclconvert': - # Setup OverrideCycles - if trim_last_base or process.udf['UMI - Trim']: - override_cycles = [ - '', # read 1 - 'I{0}'.format(process.udf['Index Read 1']), # index 1 - 'I{0}'.format(process.udf['Index Read 2']), # index 2 - '', # read 2 - ] - - if trim_last_base and process.udf['UMI - Trim']: - override_cycles[0] = 'U{umi}Y{read}N1'.format( - umi=process.udf['UMI - Read 1 Length'], - read=process.udf['Read 1 Cycles'] - process.udf['UMI - Read 1 Length'] - 1 - ) - override_cycles[3] = 'U{umi}Y{read}N1'.format( - umi=process.udf['UMI - Read 2 Length'], - read=process.udf['Read 2 Cycles'] - process.udf['UMI - Read 2 Length'] - 1 - ) - custom_settings = 'TrimUMI,1\n' - - elif trim_last_base: - override_cycles[0] = 'Y{read}N1'.format(read=process.udf['Read 1 Cycles'] - 1) - override_cycles[3] = 'Y{read}N1'.format(read=process.udf['Read 2 Cycles'] - 1) - - elif process.udf['UMI - Trim']: - override_cycles[0] = 'U{umi}Y{read}'.format( - umi=process.udf['UMI - Read 1 Length'], - read=process.udf['Read 1 Cycles'] - process.udf['UMI - Read 1 Length'] - ) - override_cycles[3] = 'U{umi}Y{read}'.format( - umi=process.udf['UMI - Read 2 Length'], - read=process.udf['Read 2 Cycles'] - process.udf['UMI - Read 2 Length'] - ) - custom_settings = 'TrimUMI,1\n' - - custom_settings = '{settings}OverrideCycles,{override_cycles}\n'.format( - settings=custom_settings, - override_cycles=';'.join(override_cycles) - ) - - for data in csv.reader( - lims.get_file_contents(id=file_id).rstrip().split('\n'), - quotechar='"', delimiter=',', quoting=csv.QUOTE_ALL, skipinitialspace=True - ): - if data[0] == '[Settings]' and custom_settings: - output_file.write('{line}\n'.format(line=','.join(data))) - output_file.write(custom_settings) - settings_section = True - - elif data[0] == '[Data]' and custom_settings and not settings_section: - output_file.write('[Settings]\n') - output_file.write(custom_settings) - output_file.write('{line}\n'.format(line=','.join(data))) - - elif data[0] == 'Sample_ID': # Samples header line - sample_header = data - sample_id_index = sample_header.index('Sample_ID') - sample_name_index = sample_header.index('Sample_Name') - sample_project_index = sample_header.index('Sample_Project') - - if 'index2' in sample_header: - index_index = sample_header.index('index2') - else: - index_index = sample_header.index('index') - - output_file.write('{line}\n'.format(line=','.join(data))) - - elif sample_header: # Samples header seen, so continue with samples. - sample_name = data[sample_name_index].split(',')[0] - if sample_name in sample_sequence_names: - data[sample_name_index] = sample_sequence_names[sample_name] - - # Set Sample_Project - if data[sample_name_index] in sample_projects: - data[sample_project_index] = sample_projects[data[sample_name_index]] - - # Overwrite Sample_ID with Sample_name to get correct conversion output folder structure - data[sample_id_index] = data[sample_name_index] - - # Reverse complement index for NextSeq runs - if nextseq_run: - data[index_index] = clarity_epp.export.utils.reverse_complement(data[index_index]) - - output_file.write('{line}\n'.format(line=','.join(data))) - else: # Leave other lines untouched. - output_file.write('{line}\n'.format(line=','.join(data))) From eeb11c48a7e928bb178f5ee84838053d935b6cec Mon Sep 17 00:00:00 2001 From: Robert Ernst Date: Fri, 19 Jan 2024 16:06:33 +0100 Subject: [PATCH 05/30] Update sequencing workflow --- config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config.py b/config.py index 7553e7b..076eb83 100755 --- a/config.py +++ b/config.py @@ -113,7 +113,7 @@ } # Post sequencing workflow -sequencing_workflow = '1701' # DEV Dx Illumina Sequencing v1.2 +sequencing_workflow = '2052' # DEV Dx Illumina Sequencing v1.3 post_sequencing_workflow = '1204' # DEV Dx Bioinformatica analyses v1.1 post_bioinf_workflow = '1803' # DEV Dx NGS WES onderzoeken afronden v2.0 From 890feda60ec49ce95f67330a56ece30f278c25f4 Mon Sep 17 00:00:00 2001 From: Robert Ernst Date: Mon, 22 Jan 2024 14:28:06 +0100 Subject: [PATCH 06/30] Add illumina tests --- tests/test_export_illumina.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 tests/test_export_illumina.py diff --git a/tests/test_export_illumina.py b/tests/test_export_illumina.py new file mode 100644 index 0000000..fd5eb9e --- /dev/null +++ b/tests/test_export_illumina.py @@ -0,0 +1,21 @@ +from clarity_epp.export import illumina + + +def test_get_override_cycles(): + # Magnis prep with legacy index settings (8, 8) + assert illumina.get_override_cycles([151, 151], [5, 5], [8, 8], [8, 8], 'RC') == 'U5Y145N1;I8;I8;U5Y145N1' + assert illumina.get_override_cycles([151, 151], [5, 5], [8, 8], [8, 8], 'F') == 'U5Y145N1;I8;I8;U5Y145N1' + + # Magnis prep with new default index settings (19, 10) + assert illumina.get_override_cycles([151, 151], [5, 5], [8, 8], [19, 10], 'RC') == 'U5Y145N1;I8N11;I8N2;U5Y145N1' + assert illumina.get_override_cycles([151, 151], [5, 5], [8, 8], [19, 10], 'F') == 'U5Y145N1;I8N11;N2I8;U5Y145N1' + + +def test_get_project(): + assert illumina.get_project({'SSv7_1': 1, 'SSv7_2': 0, 'SSv7_3': 0}) == 'SSv7_2' + assert illumina.get_project({'SSv7_1': 1, 'SSv7_2': 0, 'SSv7_3': 0}, urgent=True) == 'SSv7_1' + assert illumina.get_project({'SSv7_1': 3, 'SSv7_2': 1, 'SSv7_3': 0}) == 'SSv7_3' + assert illumina.get_project({'SSv7_1': 3, 'SSv7_2': 1, 'SSv7_3': 0}, urgent=True) == 'SSv7_1' + assert illumina.get_project({'SSv7_1': 3, 'SSv7_2': 1, 'SSv7_3': 1}) == 'SSv7_2' + assert illumina.get_project({'SSv7_1': 3, 'SSv7_2': 1, 'SSv7_3': 0}, urgent=True) == 'SSv7_1' + assert illumina.get_project({'SSv7_1': 9, 'SSv7_2': 5, 'SSv7_3': 5}, urgent=True) == 'SSv7_2' From 2b45a43cb89652ce7e1483d60af755992b22d0a2 Mon Sep 17 00:00:00 2001 From: Robert Ernst Date: Mon, 22 Jan 2024 17:18:02 +0100 Subject: [PATCH 07/30] Move to get_sample_sequence_index to utils and test --- clarity_epp/export/illumina.py | 14 +++++++------- clarity_epp/export/utils.py | 11 +++++++++++ tests/test_export_utils.py | 7 +++++++ 3 files changed, 25 insertions(+), 7 deletions(-) diff --git a/clarity_epp/export/illumina.py b/clarity_epp/export/illumina.py index 549a8e3..28fa276 100644 --- a/clarity_epp/export/illumina.py +++ b/clarity_epp/export/illumina.py @@ -5,7 +5,7 @@ from genologics.entities import Process from .. import get_sequence_name, get_sample_artifacts_from_pool -import clarity_epp.export.utils +from clarity_epp.export.utils import get_sample_sequence_index, reverse_complement import config @@ -67,7 +67,7 @@ def get_samplesheet_samples(sample_artifacts, process, index_2_orientation): for sample_artifact in sample_artifacts: # Find sample artifact index, expected pattern = " (index1-index2)" - sample_index = re.search(r".*\(([ACTGN]+)-([ACTGN]+)\)$", sample_artifact.reagent_labels[0]) + sample_index = get_sample_sequence_index(sample_artifact.reagent_labels[0]) sample_sequence_name = get_sequence_name(sample_artifact) for sample in sample_artifact.samples: @@ -94,7 +94,7 @@ def get_samplesheet_samples(sample_artifacts, process, index_2_orientation): sample_override_cycles = get_override_cycles( read_len=[process.udf['Read 1 Cycles'], process.udf['Read 2 Cycles']], umi_len=sample_conversion_setting['umi_len'], - index_len=[len(sample_index.group(1)), len(sample_index.group(2))], + index_len=[len(sample_index[0]), len(sample_index[1])], max_index_len=[process.udf['Index Read 1'], process.udf['Index Read 2']], index_2_orientation=index_2_orientation ) @@ -158,19 +158,19 @@ def get_samplesheet_samples(sample_artifacts, process, index_2_orientation): sample_override_cycles = get_override_cycles( read_len=[process.udf['Read 1 Cycles'], process.udf['Read 2 Cycles']], umi_len=config.conversion_settings['default']['umi_len'], - index_len=[len(sample_index.group(1)), len(sample_index.group(2))], + index_len=[len(sample_index[0]), len(sample_index[1])], max_index_len=[process.udf['Index Read 1'], process.udf['Index Read 2']], index_2_orientation=index_2_orientation ) # Add sample to samplesheet_samples samplesheet_samples[sample_sequence_name] = { - 'index_1': sample_index.group(1), - 'index_2': sample_index.group(2), + 'index_1': sample_index[0], + 'index_2': sample_index[1], 'override_cycles': sample_override_cycles, } if index_2_orientation == 'RC': # Reverse complement index 2 - samplesheet_samples[sample_sequence_name]['index_2'] = clarity_epp.export.utils.reverse_complement( + samplesheet_samples[sample_sequence_name]['index_2'] = reverse_complement( samplesheet_samples[sample_sequence_name]['index_2'] ) diff --git a/clarity_epp/export/utils.py b/clarity_epp/export/utils.py index 1f9d20c..52b1987 100755 --- a/clarity_epp/export/utils.py +++ b/clarity_epp/export/utils.py @@ -1,4 +1,5 @@ """Utility functions used for creating samplesheets.""" +import re def sort_96_well_plate(wells): @@ -84,3 +85,13 @@ def get_well_index(well, one_based=False): return wells.index(well) + 1 else: return wells.index(well) + + +def get_sample_sequence_index(reagent_label): + """Return sample sequence indices [index1, index2] from reagent label. + expected reagent label pattern = " (index1-index2) + """ + sample_index_search = re.search(r".*\(([ACTGN]+)-([ACTGN]+)\)$", reagent_label) + sample_index = [sample_index_search.group(1), sample_index_search.group(2)] + + return sample_index \ No newline at end of file diff --git a/tests/test_export_utils.py b/tests/test_export_utils.py index 70bfea8..0918057 100644 --- a/tests/test_export_utils.py +++ b/tests/test_export_utils.py @@ -21,3 +21,10 @@ def test_sort_artifact_list(): def test_get_well_index(): assert utils.get_well_index('A1') == 0 assert utils.get_well_index('A1', one_based=True) == 1 + + +def test_get_sample_sequence_index(): + assert utils.get_sample_sequence_index('Dx 12D NEXTflex UDI 48 (TTAGAGTC-TGTGACGA)') == ['TTAGAGTC', 'TGTGACGA'] + assert utils.get_sample_sequence_index('Dx 10G NEXTflex custom UDI 79 (TGAGGCGC-GGAGACCA)') == ['TGAGGCGC', 'GGAGACCA'] + assert utils.get_sample_sequence_index('Dx 01G Agilent SureSelect XT HS2 UDI_v2 007 (GCAGGTTC-AGAAGCAA)') == ['GCAGGTTC', 'AGAAGCAA'] + assert utils.get_sample_sequence_index('Dx 02B Agilent SureSelect XT HS2 UDI_v1 010 (TAGAGCTC-CTACCGAA)') == ['TAGAGCTC', 'CTACCGAA'] From 0d33a9ab908d1bf4d6c74d32b0395b2b87e324f8 Mon Sep 17 00:00:00 2001 From: Robert Ernst Date: Mon, 22 Jan 2024 17:19:51 +0100 Subject: [PATCH 08/30] Enable unit test @ master --- .github/workflows/python.yml | 51 ++++++++++++++++++------------------ 1 file changed, 25 insertions(+), 26 deletions(-) diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index 6f567d9..5f7fcff 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -5,13 +5,12 @@ name: Python (flake8, pytest) on: push: - branches: [main, develop] + branches: [master, develop] pull_request: - branches: [main, develop] + branches: [master, develop] jobs: build: - runs-on: ubuntu-20.04 strategy: fail-fast: false @@ -19,26 +18,26 @@ jobs: python-version: [3.6] steps: - - uses: actions/checkout@v3 - with: - fetch-depth: 0 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v3 - with: - python-version: ${{ matrix.python-version }} - - name: "Install Apache package" - run: sudo apt install -y apache2-dev - - name: Install dependencies - run: | - python -m pip install --upgrade pip - python -m pip install flake8 pytest - if [ -f requirements.txt ]; then pip install -r requirements.txt; fi - - name: Lint with flake8 - run: | - # stop the build if there are Python syntax errors or undefined names - flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics - # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide - flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics - - name: Test with pytest - run: | - pytest \ No newline at end of file + - uses: actions/checkout@v3 + with: + fetch-depth: 0 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v3 + with: + python-version: ${{ matrix.python-version }} + - name: "Install Apache package" + run: sudo apt install -y apache2-dev + - name: Install dependencies + run: | + python -m pip install --upgrade pip + python -m pip install flake8 pytest + if [ -f requirements.txt ]; then pip install -r requirements.txt; fi + - name: Lint with flake8 + run: | + # stop the build if there are Python syntax errors or undefined names + flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics + # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide + flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics + - name: Test with pytest + run: | + pytest From 20e8018ccc85b1be5ac139ce8d3aec1f6dc50bdc Mon Sep 17 00:00:00 2001 From: Robert Ernst Date: Thu, 25 Jan 2024 16:50:53 +0100 Subject: [PATCH 09/30] Fix for single index samples --- clarity_epp/export/illumina.py | 63 +++++++++++++++++----------------- clarity_epp/export/utils.py | 8 ++--- tests/test_export_illumina.py | 6 ++-- tests/test_export_utils.py | 7 ++++ 4 files changed, 46 insertions(+), 38 deletions(-) diff --git a/clarity_epp/export/illumina.py b/clarity_epp/export/illumina.py index 28fa276..6994f24 100644 --- a/clarity_epp/export/illumina.py +++ b/clarity_epp/export/illumina.py @@ -24,38 +24,35 @@ def get_project(projects, urgent=False): def get_override_cycles(read_len, umi_len, index_len, max_index_len, index_2_orientation): """Get override cycles per sample.""" - - # Read cycles, Trim last base from read cycles - read_1_cycle = f'Y{read_len[0]-1}N1' - read_2_cycle = f'Y{read_len[1]-1}N1' - - # Adjust read cycles if umi present - if umi_len[0]: - read_1_cycle = f'U{umi_len[0]}Y{read_len[0]-1-umi_len[0]}N1' - if umi_len[1]: - read_2_cycle = f'U{umi_len[1]}Y{read_len[1]-1-umi_len[1]}N1' - - # Index cycles - index_1_cycle = f'I{index_len[0]}' - index_2_cycle = f'I{index_len[1]}' - - # Adjust if index length is shorter than max index length - if index_len[0] < max_index_len[0]: - n_bases = max_index_len[0] - index_len[0] - index_1_cycle = f'I{index_len[0]}N{n_bases}' - - if index_len[1] < max_index_len[1]: - n_bases = max_index_len[1] - index_len[1] - if index_2_orientation == 'RC': - index_2_cycle = f'I{index_len[1]}N{n_bases}' - else: # index_2_orientation == 'F - index_2_cycle = f'N{n_bases}I{index_len[1]}' + read_cycles = ['', ''] + index_cycles = ['', ''] + + for idx in range(len(read_cycles)): + if umi_len[idx]: # read cycle with umi + read_cycle = f'U{umi_len[idx]}Y{read_len[idx]-1-umi_len[idx]}N1' + else: # read cycle without umi + read_cycle = f'Y{read_len[idx]-1}N1' + read_cycles[idx] = read_cycle + + for idx in range(len(index_cycles)): + if index_len[idx]: + if index_len[idx] < max_index_len[idx]: + n_bases = max_index_len[idx] - index_len[idx] + if idx == 1 and index_2_orientation == 'F': # Index 2 in forward orientation (NovaSeq X Plus) + index_cycle = f'N{n_bases}I{index_len[idx]}' + else: + index_cycle = f'I{index_len[idx]}N{n_bases}' + else: + index_cycle = f'I{index_len[idx]}' + else: # empty index, single index library + index_cycle = f'N{index_len[idx]}' + index_cycles[idx] = index_cycle override_cycles = ';'.join([ - read_1_cycle, # read 1 - index_1_cycle, # index 1 - index_2_cycle, # index 2 - read_2_cycle, # read 2 + read_cycles[0], # read 1 + index_cycles[0], # index 1 + index_cycles[1], # index 2 + read_cycles[1], # read 2 ]) return override_cycles @@ -66,9 +63,11 @@ def get_samplesheet_samples(sample_artifacts, process, index_2_orientation): samplesheet_samples = {} for sample_artifact in sample_artifacts: - # Find sample artifact index, expected pattern = " (index1-index2)" - sample_index = get_sample_sequence_index(sample_artifact.reagent_labels[0]) sample_sequence_name = get_sequence_name(sample_artifact) + sample_index = get_sample_sequence_index(sample_artifact.reagent_labels[0]) + # Adjust empty second index for single index samples + if len(sample_index) == 1: + sample_index.append('') for sample in sample_artifact.samples: # Dx production sample diff --git a/clarity_epp/export/utils.py b/clarity_epp/export/utils.py index 52b1987..549e9fa 100755 --- a/clarity_epp/export/utils.py +++ b/clarity_epp/export/utils.py @@ -89,9 +89,9 @@ def get_well_index(well, one_based=False): def get_sample_sequence_index(reagent_label): """Return sample sequence indices [index1, index2] from reagent label. - expected reagent label pattern = " (index1-index2) + expected reagent label pattern = " (index1-index2)" or " (index1)" """ - sample_index_search = re.search(r".*\(([ACTGN]+)-([ACTGN]+)\)$", reagent_label) - sample_index = [sample_index_search.group(1), sample_index_search.group(2)] + sample_index_search = re.search(r"\(([ACTGN-]+)\)$", reagent_label) + sample_index = sample_index_search.group(1).split('-') - return sample_index \ No newline at end of file + return sample_index diff --git a/tests/test_export_illumina.py b/tests/test_export_illumina.py index fd5eb9e..da43b9e 100644 --- a/tests/test_export_illumina.py +++ b/tests/test_export_illumina.py @@ -2,12 +2,14 @@ def test_get_override_cycles(): - # Magnis prep with legacy index settings (8, 8) + # Magnis prep with legacy index settings (8, 8) - NovaSeq 6000 assert illumina.get_override_cycles([151, 151], [5, 5], [8, 8], [8, 8], 'RC') == 'U5Y145N1;I8;I8;U5Y145N1' + # Magnis prep with legacy index settings (8, 8) - NovaSeq X Plus assert illumina.get_override_cycles([151, 151], [5, 5], [8, 8], [8, 8], 'F') == 'U5Y145N1;I8;I8;U5Y145N1' - # Magnis prep with new default index settings (19, 10) + # Magnis prep with new default index settings (19, 10) - NovaSeq 6000 assert illumina.get_override_cycles([151, 151], [5, 5], [8, 8], [19, 10], 'RC') == 'U5Y145N1;I8N11;I8N2;U5Y145N1' + # Magnis prep with new default index settings (19, 10) - NovaSeq X Plus assert illumina.get_override_cycles([151, 151], [5, 5], [8, 8], [19, 10], 'F') == 'U5Y145N1;I8N11;N2I8;U5Y145N1' diff --git a/tests/test_export_utils.py b/tests/test_export_utils.py index 0918057..f5c16c3 100644 --- a/tests/test_export_utils.py +++ b/tests/test_export_utils.py @@ -24,7 +24,14 @@ def test_get_well_index(): def test_get_sample_sequence_index(): + # Dual Index assert utils.get_sample_sequence_index('Dx 12D NEXTflex UDI 48 (TTAGAGTC-TGTGACGA)') == ['TTAGAGTC', 'TGTGACGA'] assert utils.get_sample_sequence_index('Dx 10G NEXTflex custom UDI 79 (TGAGGCGC-GGAGACCA)') == ['TGAGGCGC', 'GGAGACCA'] assert utils.get_sample_sequence_index('Dx 01G Agilent SureSelect XT HS2 UDI_v2 007 (GCAGGTTC-AGAAGCAA)') == ['GCAGGTTC', 'AGAAGCAA'] assert utils.get_sample_sequence_index('Dx 02B Agilent SureSelect XT HS2 UDI_v1 010 (TAGAGCTC-CTACCGAA)') == ['TAGAGCTC', 'CTACCGAA'] + + # Single Index + assert utils.get_sample_sequence_index('Dx 12D NEXTflex UDI 48 (TTAGAGTC)') == ['TTAGAGTC'] + assert utils.get_sample_sequence_index('Dx 10G NEXTflex custom UDI 79 (TGAGGCGC)') == ['TGAGGCGC'] + assert utils.get_sample_sequence_index('Dx 01G Agilent SureSelect XT HS2 UDI_v2 007 (GCAGGTTC)') == ['GCAGGTTC'] + assert utils.get_sample_sequence_index('Dx 02B Agilent SureSelect XT HS2 UDI_v1 010 (TAGAGCTC)') == ['TAGAGCTC'] From 1dc667d65401f3492033746ee2024c6cf02776bc Mon Sep 17 00:00:00 2001 From: Robert Ernst Date: Fri, 2 Feb 2024 16:20:55 +0100 Subject: [PATCH 10/30] Add some extra fields --- clarity_epp/export/illumina.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/clarity_epp/export/illumina.py b/clarity_epp/export/illumina.py index 6994f24..53266e0 100644 --- a/clarity_epp/export/illumina.py +++ b/clarity_epp/export/illumina.py @@ -241,11 +241,14 @@ def create_samplesheet(lims, process_id, output_file): # Header sample_sheet.append('[Header]') sample_sheet.append('FileFormatVersion,2') + sample_sheet.append('RunName,{0}'.format(process.udf['Experiment Name'])) # Reads sample_sheet.append('[Reads]') sample_sheet.append('Read1Cycles,{0}'.format(process.udf['Read 1 Cycles'])) sample_sheet.append('Read2Cycles,{0}'.format(process.udf['Read 2 Cycles'])) + sample_sheet.append('Index1Cycles,{0}'.format(process.udf['Index Read 1'])) + sample_sheet.append('Index2Cycles,{0}'.format(process.udf['Index Read 2'])) # BCLConvert_Settings sample_sheet.append('[BCLConvert_Settings]') From 163d21e7e2df3eafb0f432c8c12ee60fe75775c0 Mon Sep 17 00:00:00 2001 From: Robert Ernst Date: Fri, 2 Feb 2024 17:01:04 +0100 Subject: [PATCH 11/30] Add 0 Mismatches setting for barcodes --- clarity_epp/export/illumina.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/clarity_epp/export/illumina.py b/clarity_epp/export/illumina.py index 53266e0..14b8e92 100644 --- a/clarity_epp/export/illumina.py +++ b/clarity_epp/export/illumina.py @@ -255,6 +255,8 @@ def create_samplesheet(lims, process_id, output_file): sample_sheet.append('AdapterRead1,{0}'.format(process.udf['Adapter'])) sample_sheet.append('AdapterRead2,{0}'.format(process.udf['Adapter Read 2'])) sample_sheet.append('FindAdaptersWithIndels,true') + sample_sheet.append('BarcodeMismatchesIndex1,0') + sample_sheet.append('BarcodeMismatchesIndex2,0') # BCLConvert_Data sample_sheet.append('[BCLConvert_Data]') From 67f35c6c357de25306a5527ae47f9fa56d4acc8f Mon Sep 17 00:00:00 2001 From: Robert Ernst Date: Mon, 12 Feb 2024 17:32:52 +0100 Subject: [PATCH 12/30] Code review changes --- clarity_epp/export/illumina.py | 92 ++++++++++++++++------------------ 1 file changed, 44 insertions(+), 48 deletions(-) diff --git a/clarity_epp/export/illumina.py b/clarity_epp/export/illumina.py index 14b8e92..b75994b 100644 --- a/clarity_epp/export/illumina.py +++ b/clarity_epp/export/illumina.py @@ -10,7 +10,9 @@ def get_project(projects, urgent=False): - """Get a project name for sample.""" + """Get a project name from projects dict ({'project_name': sample_count, ...}) + If urgent is True, return the first project with < 9 samples, else return the project with the least amount of samples. + """ if urgent: # Sort projects for urgent samples on name projects_sorted = sorted(projects.items(), key=operator.itemgetter(0)) for project in projects_sorted: @@ -72,10 +74,10 @@ def get_samplesheet_samples(sample_artifacts, process, index_2_orientation): for sample in sample_artifact.samples: # Dx production sample if ( - 'Dx Familienummer' in list(sample.udf) and - 'Dx NICU Spoed' in list(sample.udf) and - 'Dx Protocolomschrijving' in list(sample.udf) and - 'Dx Stoftest code' in list(sample.udf) + 'Dx Familienummer' in sample.udf and + 'Dx NICU Spoed' in sample.udf and + 'Dx Protocolomschrijving' in sample.udf and + 'Dx Stoftest code' in sample.udf ): # Skip Mengfractie samples if sample.udf['Dx Stoftest code'] == config.stoftestcode_wes_duplo: @@ -85,7 +87,7 @@ def get_samplesheet_samples(sample_artifacts, process, index_2_orientation): sample_conversion_setting = config.conversion_settings['default'] newest_protocol = sample.udf['Dx Protocolomschrijving'].split(';')[0] for protocol_code in config.conversion_settings: - if protocol_code in newest_protocol: + if protocol_code in newest_protocol: # Look for protocol code (elid number) in newest protocol sample_conversion_setting = config.conversion_settings[protocol_code] break @@ -236,56 +238,50 @@ def create_samplesheet(lims, process_id, output_file): samplesheet_samples.append(get_samplesheet_samples(sample_artifacts, process, index_2_orientation)) # Create SampleSheet - sample_sheet = [] - - # Header - sample_sheet.append('[Header]') - sample_sheet.append('FileFormatVersion,2') - sample_sheet.append('RunName,{0}'.format(process.udf['Experiment Name'])) - - # Reads - sample_sheet.append('[Reads]') - sample_sheet.append('Read1Cycles,{0}'.format(process.udf['Read 1 Cycles'])) - sample_sheet.append('Read2Cycles,{0}'.format(process.udf['Read 2 Cycles'])) - sample_sheet.append('Index1Cycles,{0}'.format(process.udf['Index Read 1'])) - sample_sheet.append('Index2Cycles,{0}'.format(process.udf['Index Read 2'])) - - # BCLConvert_Settings - sample_sheet.append('[BCLConvert_Settings]') - sample_sheet.append('AdapterRead1,{0}'.format(process.udf['Adapter'])) - sample_sheet.append('AdapterRead2,{0}'.format(process.udf['Adapter Read 2'])) - sample_sheet.append('FindAdaptersWithIndels,true') - sample_sheet.append('BarcodeMismatchesIndex1,0') - sample_sheet.append('BarcodeMismatchesIndex2,0') + sample_sheet = [ + # Header + "[Header]", + "FileFormatVersion,2", + f"RunName,{process.udf['Experiment Name']}", + # Reads + "[Reads]", + f"Read1Cycles,{process.udf['Read 1 Cycles']}", + f"Read2Cycles,{process.udf['Read 2 Cycles']}", + f"Index1Cycles,{process.udf['Index Read 1']}", + f"Index2Cycles,{process.udf['Index Read 2']}", + # BCLConvert_Settings + "[BCLConvert_Settings]", + f"AdapterRead1,{process.udf['Adapter']}", + f"AdapterRead2,{process.udf['Adapter Read 2']}", + "FindAdaptersWithIndels,true", + "BarcodeMismatchesIndex1,0", + "BarcodeMismatchesIndex2,0", + "[BCLConvert_Data]" + ] # BCLConvert_Data - sample_sheet.append('[BCLConvert_Data]') + # Set header for single or multiple lanes conversion + bcl_convert_data_header = "Sample_ID,index,index2,OverrideCycles,Sample_Project" if len(samplesheet_samples) == 1: # All samples on all lanes - lane = 0 - sample_sheet.append('Sample_ID,index,index2,OverrideCycles,Sample_Project') - for sample in samplesheet_samples[lane]: - sample_sheet.append( - '{sample_name},{index_1},{index_2},{override_cycles},{project}'.format( + multiple_lanes = False + else: + multiple_lanes = True + bcl_convert_data_header = f"Lane,{bcl_convert_data_header}" # Add lane column to header if multiple lanes conversion + sample_sheet.append(bcl_convert_data_header) + + # Add samples to SampleSheet + for lane, lane_samples in enumerate(samplesheet_samples): + for sample in lane_samples: + bcl_convert_data_row = "{sample_name},{index_1},{index_2},{override_cycles},{project}".format( sample_name=sample, index_1=samplesheet_samples[lane][sample]['index_1'], index_2=samplesheet_samples[lane][sample]['index_2'], override_cycles=samplesheet_samples[lane][sample]['override_cycles'], project=samplesheet_samples[lane][sample]['project'] ) - ) - else: # Samples divided over lanes - sample_sheet.append('Lane,Sample_ID,index,index2,OverrideCycles,Sample_Project') - for lane, lane_samples in enumerate(samplesheet_samples): - for sample in lane_samples: - sample_sheet.append( - '{lane},{sample_name},{index_1},{index_2},{override_cycles},{project}'.format( - lane=lane+1, - sample_name=sample, - index_1=samplesheet_samples[lane][sample]['index_1'], - index_2=samplesheet_samples[lane][sample]['index_2'], - override_cycles=samplesheet_samples[lane][sample]['override_cycles'], - project=samplesheet_samples[lane][sample]['project'] - ) - ) + if multiple_lanes: # Add lane number to row if multiple lanes conversion + bcl_convert_data_row = f"{lane+1},{bcl_convert_data_row}" + sample_sheet.append(bcl_convert_data_row) + # Write SampleSheet to file output_file.write('\n'.join(sample_sheet)) From 02fe411ef0b9d4793eb10c981454b369032d4f8a Mon Sep 17 00:00:00 2001 From: Robert Ernst Date: Mon, 12 Feb 2024 17:35:54 +0100 Subject: [PATCH 13/30] Layout --- clarity_epp/export/illumina.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clarity_epp/export/illumina.py b/clarity_epp/export/illumina.py index b75994b..f35b8de 100644 --- a/clarity_epp/export/illumina.py +++ b/clarity_epp/export/illumina.py @@ -256,10 +256,10 @@ def create_samplesheet(lims, process_id, output_file): "FindAdaptersWithIndels,true", "BarcodeMismatchesIndex1,0", "BarcodeMismatchesIndex2,0", + # BCLConvert_Data "[BCLConvert_Data]" ] - # BCLConvert_Data # Set header for single or multiple lanes conversion bcl_convert_data_header = "Sample_ID,index,index2,OverrideCycles,Sample_Project" if len(samplesheet_samples) == 1: # All samples on all lanes From 0c5ee77a6bfa0b07969a3658333eedcf8f917b7c Mon Sep 17 00:00:00 2001 From: Robert Ernst Date: Tue, 13 Feb 2024 11:59:01 +0100 Subject: [PATCH 14/30] Add indications_exome_equivalent --- clarity_epp/upload/samples.py | 4 ++++ config.py | 3 +++ 2 files changed, 7 insertions(+) diff --git a/clarity_epp/upload/samples.py b/clarity_epp/upload/samples.py index b858e0c..cba9527 100644 --- a/clarity_epp/upload/samples.py +++ b/clarity_epp/upload/samples.py @@ -166,6 +166,10 @@ def from_helix(lims, email_settings, input_file): if udf_data['Dx Onderzoeksindicatie'] == 'DSD00' and udf_data['Dx Familie status'] == 'Kind': udf_data['Dx Geslacht'] = 'Onbekend' + # Set 'Dx Exoomequivalent' for specific indications + if udf_data['Dx Onderzoeksindicatie'] in config.indications_exome_equivalent: + udf_data['Dx Exoomequivalent'] = config.indications_exome_equivalent[udf_data['Dx Onderzoeksindicatie']] + # Check 'Dx Familienummer' and correct if '/' in udf_data['Dx Familienummer']: udf_data['Dx Import warning'] = ';'.join([ diff --git a/config.py b/config.py index c002932..4b45714 100755 --- a/config.py +++ b/config.py @@ -30,6 +30,9 @@ stoftestcode_mip: '1651', # DEV Dx smMIP v1.2 } +# Update exome equivalent for certain indications +indications_exome_equivalent = {'UBA1': 5, 'PID09': 5} + # Export meetw protocol steps WES meetw_zui_wes_processes = [ 'Dx Sample registratie zuivering v1.1', From 94848dba6fce1b124818c97dd20be1f176d20190 Mon Sep 17 00:00:00 2001 From: Robert Ernst Date: Thu, 15 Feb 2024 21:47:14 +0100 Subject: [PATCH 15/30] Change volume --- clarity_epp/export/manual_pipetting.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clarity_epp/export/manual_pipetting.py b/clarity_epp/export/manual_pipetting.py index 81cea0c..09a007e 100755 --- a/clarity_epp/export/manual_pipetting.py +++ b/clarity_epp/export/manual_pipetting.py @@ -681,9 +681,9 @@ def samplesheet_pool_samples(lims, process_id, output_file): input_sample = input_artifact.samples[0] # Asume one sample if 'Dx Exoomequivalent' in input_sample.udf: - volume = 5 * input_sample.udf['Dx Exoomequivalent'] + volume = 4 * input_sample.udf['Dx Exoomequivalent'] else: - volume = 5 + volume = 4 output_file.write( '{sample}\t{container}\t{well}\t{pool}\t{volume}\n'.format( From 6ccd9cdd2ed98d39ab3d2ceba603f0e9132a225a Mon Sep 17 00:00:00 2001 From: Robert Ernst Date: Thu, 15 Feb 2024 22:53:24 +0100 Subject: [PATCH 16/30] Add extra required conversion settings --- clarity_epp/export/illumina.py | 34 +++++++++++++++++++++------------- config.py | 20 ++++++++++++++++---- 2 files changed, 37 insertions(+), 17 deletions(-) diff --git a/clarity_epp/export/illumina.py b/clarity_epp/export/illumina.py index f35b8de..f536868 100644 --- a/clarity_epp/export/illumina.py +++ b/clarity_epp/export/illumina.py @@ -24,7 +24,7 @@ def get_project(projects, urgent=False): return projects_sorted[0][0] # return project with least amount of samples. -def get_override_cycles(read_len, umi_len, index_len, max_index_len, index_2_orientation): +def get_override_cycles(read_len, umi_len, index_len, max_index_len, index_2_conversion_orientation): """Get override cycles per sample.""" read_cycles = ['', ''] index_cycles = ['', ''] @@ -40,7 +40,7 @@ def get_override_cycles(read_len, umi_len, index_len, max_index_len, index_2_ori if index_len[idx]: if index_len[idx] < max_index_len[idx]: n_bases = max_index_len[idx] - index_len[idx] - if idx == 1 and index_2_orientation == 'F': # Index 2 in forward orientation (NovaSeq X Plus) + if idx == 1 and index_2_conversion_orientation == 'F': # Index 2 in forward orientation (NovaSeq X Plus) index_cycle = f'N{n_bases}I{index_len[idx]}' else: index_cycle = f'I{index_len[idx]}N{n_bases}' @@ -60,7 +60,7 @@ def get_override_cycles(read_len, umi_len, index_len, max_index_len, index_2_ori return override_cycles -def get_samplesheet_samples(sample_artifacts, process, index_2_orientation): +def get_samplesheet_samples(sample_artifacts, process, index_2_conversion_orientation): families = {} samplesheet_samples = {} @@ -83,12 +83,12 @@ def get_samplesheet_samples(sample_artifacts, process, index_2_orientation): if sample.udf['Dx Stoftest code'] == config.stoftestcode_wes_duplo: continue - # Get sample conversion_settings - sample_conversion_setting = config.conversion_settings['default'] + # Get sample conversion settings + sample_conversion_setting = config.sample_conversion_settings['default'] newest_protocol = sample.udf['Dx Protocolomschrijving'].split(';')[0] - for protocol_code in config.conversion_settings: + for protocol_code in config.sample_conversion_settings: if protocol_code in newest_protocol: # Look for protocol code (elid number) in newest protocol - sample_conversion_setting = config.conversion_settings[protocol_code] + sample_conversion_setting = config.sample_conversion_settings[protocol_code] break # Get sample override cycles @@ -97,7 +97,7 @@ def get_samplesheet_samples(sample_artifacts, process, index_2_orientation): umi_len=sample_conversion_setting['umi_len'], index_len=[len(sample_index[0]), len(sample_index[1])], max_index_len=[process.udf['Index Read 1'], process.udf['Index Read 2']], - index_2_orientation=index_2_orientation + index_2_conversion_orientation=index_2_conversion_orientation ) # Set family and create if not exist @@ -158,10 +158,10 @@ def get_samplesheet_samples(sample_artifacts, process, index_2_orientation): else: sample_override_cycles = get_override_cycles( read_len=[process.udf['Read 1 Cycles'], process.udf['Read 2 Cycles']], - umi_len=config.conversion_settings['default']['umi_len'], + umi_len=config.sample_conversion_settings['default']['umi_len'], index_len=[len(sample_index[0]), len(sample_index[1])], max_index_len=[process.udf['Index Read 1'], process.udf['Index Read 2']], - index_2_orientation=index_2_orientation + index_2_conversion_orientation=index_2_conversion_orientation ) # Add sample to samplesheet_samples @@ -170,7 +170,7 @@ def get_samplesheet_samples(sample_artifacts, process, index_2_orientation): 'index_2': sample_index[1], 'override_cycles': sample_override_cycles, } - if index_2_orientation == 'RC': # Reverse complement index 2 + if index_2_conversion_orientation == 'RC': # Reverse complement index 2 samplesheet_samples[sample_sequence_name]['index_2'] = reverse_complement( samplesheet_samples[sample_sequence_name]['index_2'] ) @@ -229,19 +229,25 @@ def get_samplesheet_samples(sample_artifacts, process, index_2_orientation): def create_samplesheet(lims, process_id, output_file): """Create illumina samplesheet v2.""" process = Process(lims, id=process_id) - index_2_orientation = config.index_2_orientation[process.type.name] + sequencer_conversion_settings = config.sequencer_conversion_settings[process.type.name] # Get samples samples per lane samplesheet_samples = [] for lane in process.analytes()[0]: sample_artifacts = get_sample_artifacts_from_pool(lims, process.analytes()[0][0]) - samplesheet_samples.append(get_samplesheet_samples(sample_artifacts, process, index_2_orientation)) + samplesheet_samples.append( + get_samplesheet_samples( + sample_artifacts, process, sequencer_conversion_settings['index_2_conversion_orientation'] + ) + ) # Create SampleSheet sample_sheet = [ # Header "[Header]", "FileFormatVersion,2", + f"InstrumentPlatform,{sequencer_conversion_settings['instrument_platform']}", + f"IndexOrientation,{sequencer_conversion_settings['index_orientation']}", f"RunName,{process.udf['Experiment Name']}", # Reads "[Reads]", @@ -251,6 +257,8 @@ def create_samplesheet(lims, process_id, output_file): f"Index2Cycles,{process.udf['Index Read 2']}", # BCLConvert_Settings "[BCLConvert_Settings]", + f"SoftwareVersion,{sequencer_conversion_settings['software_version']}", + f"FastqCompressionFormat,{sequencer_conversion_settings['fastq_compression_format']}", f"AdapterRead1,{process.udf['Adapter']}", f"AdapterRead2,{process.udf['Adapter Read 2']}", "FindAdaptersWithIndels,true", diff --git a/config.py b/config.py index 09619a9..dd9a1d2 100755 --- a/config.py +++ b/config.py @@ -91,13 +91,25 @@ ] # BCLConvert conversion settings -index_2_orientation = { +sequencer_conversion_settings = { # Orientation options: F=forward or RC=reverse complement # https://knowledge.illumina.com/software/general/software-general-reference_material-list/000001800 - 'Dx Library pool denatureren en laden (NovaSeq) v1.3': 'RC', - 'Dx Library pool denatureren en laden (NovaSeqXPlus) v1.0': 'F', + 'Dx Library pool denatureren en laden (NovaSeq) v1.3': { + 'index_2_conversion_orientation': 'RC', + 'instrument_platform': 'NovaSeq', + 'index_orientation': 'Forward', + 'software_version': '4.1.7', + 'fastq_compression_format': 'GZIP', + }, + 'Dx Library pool denatureren en laden (NovaSeqXPlus) v1.0': { + 'index_2_conversion_orientation': 'F', + 'instrument_platform': 'NovaSeqXPlus', + 'index_orientation': 'Forward', + 'software_version': '4.1.7', + 'fastq_compression_format': 'GZIP', + }, } -conversion_settings = { +sample_conversion_settings = { 'default': { 'project': 'unknown', 'split_project': False, From 4bf19f0d122f62e65effb3878a8b29953b95f508 Mon Sep 17 00:00:00 2001 From: Robert Ernst Date: Tue, 20 Feb 2024 15:48:54 +0100 Subject: [PATCH 17/30] Tweaks based on lab input --- clarity_epp/export/illumina.py | 2 +- config.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/clarity_epp/export/illumina.py b/clarity_epp/export/illumina.py index f536868..5b00707 100644 --- a/clarity_epp/export/illumina.py +++ b/clarity_epp/export/illumina.py @@ -261,7 +261,7 @@ def create_samplesheet(lims, process_id, output_file): f"FastqCompressionFormat,{sequencer_conversion_settings['fastq_compression_format']}", f"AdapterRead1,{process.udf['Adapter']}", f"AdapterRead2,{process.udf['Adapter Read 2']}", - "FindAdaptersWithIndels,true", + "FindAdaptersWithIndels,TRUE", "BarcodeMismatchesIndex1,0", "BarcodeMismatchesIndex2,0", # BCLConvert_Data diff --git a/config.py b/config.py index dd9a1d2..3659e7b 100755 --- a/config.py +++ b/config.py @@ -99,14 +99,14 @@ 'instrument_platform': 'NovaSeq', 'index_orientation': 'Forward', 'software_version': '4.1.7', - 'fastq_compression_format': 'GZIP', + 'fastq_compression_format': 'gzip', }, 'Dx Library pool denatureren en laden (NovaSeqXPlus) v1.0': { 'index_2_conversion_orientation': 'F', - 'instrument_platform': 'NovaSeqXPlus', + 'instrument_platform': 'NovaSeqXSeries', 'index_orientation': 'Forward', 'software_version': '4.1.7', - 'fastq_compression_format': 'GZIP', + 'fastq_compression_format': 'gzip', }, } sample_conversion_settings = { From 94316306d58c95487e62c212a461a8dd7b885175 Mon Sep 17 00:00:00 2001 From: Robert Ernst Date: Wed, 21 Feb 2024 14:23:40 +0100 Subject: [PATCH 18/30] python3 --- clarity_epp/export/bioanalyzer.py | 2 +- clarity_epp/export/manual_pipetting.py | 2 ++ clarity_epp/export/tapestation.py | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/clarity_epp/export/bioanalyzer.py b/clarity_epp/export/bioanalyzer.py index 1e619f6..0eefe96 100644 --- a/clarity_epp/export/bioanalyzer.py +++ b/clarity_epp/export/bioanalyzer.py @@ -16,7 +16,7 @@ def samplesheet(lims, process_id, output_file): } # Get sample placement - for placement, artifact in process.output_containers()[0].placements.iteritems(): + for placement, artifact in process.output_containers()[0].placements.items(): placement = ''.join(placement.split(':')) plate[placement]['name'] = artifact.name plate[placement]['comment'] = '' diff --git a/clarity_epp/export/manual_pipetting.py b/clarity_epp/export/manual_pipetting.py index 09a007e..95400c2 100755 --- a/clarity_epp/export/manual_pipetting.py +++ b/clarity_epp/export/manual_pipetting.py @@ -362,6 +362,8 @@ def samplesheet_multiplex_sequence_pool(lims, process_id, output_file): # print header output_file.write('Naam\tuL\n') + print(total_sample_count) + print(input_pools) # Last calcuations and print sample for input_pool in input_pools: input_pool_load_pM = (float(process.udf['Dx Laadconcentratie (pM)'])/total_sample_count) * input_pool['sample_count'] diff --git a/clarity_epp/export/tapestation.py b/clarity_epp/export/tapestation.py index b7fcc32..f39d52e 100644 --- a/clarity_epp/export/tapestation.py +++ b/clarity_epp/export/tapestation.py @@ -10,7 +10,7 @@ def samplesheet(lims, process_id, output_file): process = Process(lims, id=process_id) well_plate = {} - for placement, artifact in process.output_containers()[0].placements.iteritems(): + for placement, artifact in process.output_containers()[0].placements.items(): placement = ''.join(placement.split(':')) well_plate[placement] = artifact.name.split('_')[0] From f40ed380c72f3cc24b84fe48d69ada138b6f7bfa Mon Sep 17 00:00:00 2001 From: Robert Ernst Date: Wed, 21 Feb 2024 14:33:30 +0100 Subject: [PATCH 19/30] python3 --- clarity_epp/export/hamilton.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clarity_epp/export/hamilton.py b/clarity_epp/export/hamilton.py index d4c589b..0002df4 100755 --- a/clarity_epp/export/hamilton.py +++ b/clarity_epp/export/hamilton.py @@ -11,7 +11,7 @@ def samplesheet_filling_out(lims, process_id, output_file): process = Process(lims, id=process_id) well_plate = {} - for placement, artifact in process.output_containers()[0].placements.iteritems(): + for placement, artifact in process.output_containers()[0].placements.items(): placement = ''.join(placement.split(':')) well_plate[placement] = artifact.samples[0].udf['Dx Fractienummer'] @@ -29,7 +29,7 @@ def samplesheet_purify(lims, process_id, output_file): parent_process_barcode = process.parent_processes()[0].output_containers()[0].name well_plate = {} - for placement, artifact in process.output_containers()[0].placements.iteritems(): + for placement, artifact in process.output_containers()[0].placements.items(): placement = ''.join(placement.split(':')) well_plate[placement] = artifact.samples[0].udf['Dx Fractienummer'] From 9192ec4612f29e07b2a5f92d2073a2a9614d6fda Mon Sep 17 00:00:00 2001 From: Robert Ernst Date: Fri, 23 Feb 2024 09:03:05 +0100 Subject: [PATCH 20/30] Remove debug prints --- clarity_epp/export/manual_pipetting.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/clarity_epp/export/manual_pipetting.py b/clarity_epp/export/manual_pipetting.py index 95400c2..09a007e 100755 --- a/clarity_epp/export/manual_pipetting.py +++ b/clarity_epp/export/manual_pipetting.py @@ -362,8 +362,6 @@ def samplesheet_multiplex_sequence_pool(lims, process_id, output_file): # print header output_file.write('Naam\tuL\n') - print(total_sample_count) - print(input_pools) # Last calcuations and print sample for input_pool in input_pools: input_pool_load_pM = (float(process.udf['Dx Laadconcentratie (pM)'])/total_sample_count) * input_pool['sample_count'] From 9be5ff25fed7bfd7b1235b972747d50864899024 Mon Sep 17 00:00:00 2001 From: Robert Ernst Date: Thu, 29 Feb 2024 09:25:38 +0100 Subject: [PATCH 21/30] Fix 'map' object is not subscriptable --- clarity_epp/qc/qubit.py | 2 +- clarity_epp/upload/tecan.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/clarity_epp/qc/qubit.py b/clarity_epp/qc/qubit.py index fcc07b6..2c16aee 100644 --- a/clarity_epp/qc/qubit.py +++ b/clarity_epp/qc/qubit.py @@ -9,7 +9,7 @@ def set_qc_flag(lims, process_id, cutoff=10): """Set qubit qc flags based on Dx Concentratie fluorescentie (ng/ul) values.""" process = Process(lims, id=process_id) artifacts = process.result_files() - concentration_range = map(float, re.findall('[\d\.]+', process.udf['Concentratiebereik (ng/ul)'])) + concentration_range = list(map(float, re.findall('[\d\.]+', process.udf['Concentratiebereik (ng/ul)']))) samples_measurements = {} for artifact in artifacts: diff --git a/clarity_epp/upload/tecan.py b/clarity_epp/upload/tecan.py index d0e33c0..1dbbaec 100644 --- a/clarity_epp/upload/tecan.py +++ b/clarity_epp/upload/tecan.py @@ -10,7 +10,7 @@ def results_qc(lims, process_id): """Upload tecan results to artifacts.""" process = Process(lims, id=process_id) - concentration_range = map(float, re.findall('[\d\.]+', process.udf['Concentratiebereik (ng/ul)'])) + concentration_range = list(map(float, re.findall('[\d\.]+', process.udf['Concentratiebereik (ng/ul)']))) # Parse output file for output in process.all_outputs(unique=True): From b45a587131f0ddd6fc2ac9abfc902f2226ee009f Mon Sep 17 00:00:00 2001 From: Robert Ernst Date: Thu, 29 Feb 2024 11:58:36 +0100 Subject: [PATCH 22/30] Add default setting for TrimUMI --- clarity_epp/export/illumina.py | 1 + 1 file changed, 1 insertion(+) diff --git a/clarity_epp/export/illumina.py b/clarity_epp/export/illumina.py index 5b00707..ece8512 100644 --- a/clarity_epp/export/illumina.py +++ b/clarity_epp/export/illumina.py @@ -264,6 +264,7 @@ def create_samplesheet(lims, process_id, output_file): "FindAdaptersWithIndels,TRUE", "BarcodeMismatchesIndex1,0", "BarcodeMismatchesIndex2,0", + "TrimUMI,TRUE", # BCLConvert_Data "[BCLConvert_Data]" ] From 2e6bcec0b67640d53009d413cfeb031d36643cff Mon Sep 17 00:00:00 2001 From: Robert Ernst Date: Thu, 29 Feb 2024 13:19:06 +0100 Subject: [PATCH 23/30] Change or to and --- clarity_epp/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clarity_epp/__init__.py b/clarity_epp/__init__.py index e41654a..61e1013 100644 --- a/clarity_epp/__init__.py +++ b/clarity_epp/__init__.py @@ -38,8 +38,8 @@ def get_sample_artifacts_from_pool(lims, pool_artifact): # Check if sample_artifact with 2 samples are from the same person if len(sample_artifact.samples) == 2: if ( - 'Dx Persoons ID' in sample_artifact.samples[0].udf or - 'Dx Persoons ID' in sample_artifact.samples[1].udf or + 'Dx Persoons ID' in sample_artifact.samples[0].udf and + 'Dx Persoons ID' in sample_artifact.samples[1].udf and sample_artifact.samples[0].udf['Dx Persoons ID'] == sample_artifact.samples[1].udf['Dx Persoons ID'] ): sample_artifacts.append(sample_artifact) From b6a9dbd9a52df668e816694facf0727822546c13 Mon Sep 17 00:00:00 2001 From: Robert Ernst Date: Fri, 1 Mar 2024 14:19:53 +0100 Subject: [PATCH 24/30] Fix lane, merge, ped exports --- clarity_epp/export/illumina.py | 28 +++++++++++++++------------- clarity_epp/export/merge.py | 11 ++++++++++- clarity_epp/export/ped.py | 12 +++++++++++- 3 files changed, 36 insertions(+), 15 deletions(-) diff --git a/clarity_epp/export/illumina.py b/clarity_epp/export/illumina.py index ece8512..89f21fb 100644 --- a/clarity_epp/export/illumina.py +++ b/clarity_epp/export/illumina.py @@ -231,14 +231,16 @@ def create_samplesheet(lims, process_id, output_file): process = Process(lims, id=process_id) sequencer_conversion_settings = config.sequencer_conversion_settings[process.type.name] + # Get output container assume one flowcell per sequencing run + output_container = process.output_containers()[0] + # Get samples samples per lane - samplesheet_samples = [] - for lane in process.analytes()[0]: - sample_artifacts = get_sample_artifacts_from_pool(lims, process.analytes()[0][0]) - samplesheet_samples.append( - get_samplesheet_samples( - sample_artifacts, process, sequencer_conversion_settings['index_2_conversion_orientation'] - ) + samplesheet_samples = {} + for lane_idx, lane_artifact in output_container.get_placements().items(): + lane_idx = lane_idx.split(':')[0] + sample_artifacts = get_sample_artifacts_from_pool(lims, lane_artifact) + samplesheet_samples[lane_idx] = get_samplesheet_samples( + sample_artifacts, process, sequencer_conversion_settings['index_2_conversion_orientation'] ) # Create SampleSheet @@ -279,17 +281,17 @@ def create_samplesheet(lims, process_id, output_file): sample_sheet.append(bcl_convert_data_header) # Add samples to SampleSheet - for lane, lane_samples in enumerate(samplesheet_samples): + for lane, lane_samples in samplesheet_samples.items(): for sample in lane_samples: bcl_convert_data_row = "{sample_name},{index_1},{index_2},{override_cycles},{project}".format( sample_name=sample, - index_1=samplesheet_samples[lane][sample]['index_1'], - index_2=samplesheet_samples[lane][sample]['index_2'], - override_cycles=samplesheet_samples[lane][sample]['override_cycles'], - project=samplesheet_samples[lane][sample]['project'] + index_1=lane_samples[sample]['index_1'], + index_2=lane_samples[sample]['index_2'], + override_cycles=lane_samples[sample]['override_cycles'], + project=lane_samples[sample]['project'] ) if multiple_lanes: # Add lane number to row if multiple lanes conversion - bcl_convert_data_row = f"{lane+1},{bcl_convert_data_row}" + bcl_convert_data_row = f"{lane},{bcl_convert_data_row}" sample_sheet.append(bcl_convert_data_row) # Write SampleSheet to file diff --git a/clarity_epp/export/merge.py b/clarity_epp/export/merge.py index bab9afd..7095800 100644 --- a/clarity_epp/export/merge.py +++ b/clarity_epp/export/merge.py @@ -7,7 +7,16 @@ def create_file(lims, process_id, output_file): """Create mege file.""" process = Process(lims, id=process_id) - sample_artifacts = get_sample_artifacts_from_pool(lims, process.analytes()[0][0]) + + # Get output container assume one flowcell per sequencing run + output_container = process.output_containers()[0] + + # Get unique sample artifacts in run + sample_artifacts = [] + for lane_artifact in output_container.get_placements().values(): + for sample_artifact in get_sample_artifacts_from_pool(lims, lane_artifact): + if sample_artifact not in sample_artifacts: + sample_artifacts.append(sample_artifact) output_file.write('Sample\tMerge 1 Sample\tMerge 1 Sequencing Run\tMerge 2 Sample\tMerge 2 Sequencing Run\n') diff --git a/clarity_epp/export/ped.py b/clarity_epp/export/ped.py index 836d7af..a659308 100644 --- a/clarity_epp/export/ped.py +++ b/clarity_epp/export/ped.py @@ -7,7 +7,17 @@ def create_file(lims, process_id, output_file): """Create ped file.""" process = Process(lims, id=process_id) - sample_artifacts = get_sample_artifacts_from_pool(lims, process.analytes()[0][0]) + + # Get output container assume one flowcell per sequencing run + output_container = process.output_containers()[0] + + # Get unique sample artifacts in run + sample_artifacts = [] + for lane_artifact in output_container.get_placements().values(): + for sample_artifact in get_sample_artifacts_from_pool(lims, lane_artifact): + if sample_artifact not in sample_artifacts: + sample_artifacts.append(sample_artifact) + ped_families = {} for sample_artifact in sample_artifacts: From a2ceef60d3ee669b70a90b74ae5a0a5d8611719b Mon Sep 17 00:00:00 2001 From: Robert Ernst Date: Fri, 1 Mar 2024 15:31:48 +0100 Subject: [PATCH 25/30] Sort by lane --- clarity_epp/export/illumina.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clarity_epp/export/illumina.py b/clarity_epp/export/illumina.py index 89f21fb..f4ee596 100644 --- a/clarity_epp/export/illumina.py +++ b/clarity_epp/export/illumina.py @@ -281,7 +281,7 @@ def create_samplesheet(lims, process_id, output_file): sample_sheet.append(bcl_convert_data_header) # Add samples to SampleSheet - for lane, lane_samples in samplesheet_samples.items(): + for lane, lane_samples in sorted(samplesheet_samples.items()): for sample in lane_samples: bcl_convert_data_row = "{sample_name},{index_1},{index_2},{override_cycles},{project}".format( sample_name=sample, From af57ce0897a9662ccda2a6e780ed833530b9075f Mon Sep 17 00:00:00 2001 From: Robert Ernst Date: Mon, 4 Mar 2024 09:47:19 +0100 Subject: [PATCH 26/30] Add duplicate code todo --- clarity_epp/export/merge.py | 1 + clarity_epp/export/ped.py | 1 + 2 files changed, 2 insertions(+) diff --git a/clarity_epp/export/merge.py b/clarity_epp/export/merge.py index 7095800..e360f5f 100644 --- a/clarity_epp/export/merge.py +++ b/clarity_epp/export/merge.py @@ -12,6 +12,7 @@ def create_file(lims, process_id, output_file): output_container = process.output_containers()[0] # Get unique sample artifacts in run + # TODO: This is a copy of the code from ped.py. It should be refactored to a common function. sample_artifacts = [] for lane_artifact in output_container.get_placements().values(): for sample_artifact in get_sample_artifacts_from_pool(lims, lane_artifact): diff --git a/clarity_epp/export/ped.py b/clarity_epp/export/ped.py index a659308..822b0e3 100644 --- a/clarity_epp/export/ped.py +++ b/clarity_epp/export/ped.py @@ -12,6 +12,7 @@ def create_file(lims, process_id, output_file): output_container = process.output_containers()[0] # Get unique sample artifacts in run + # TODO: This is a copy of the code from merge.py. It should be refactored to a common function. sample_artifacts = [] for lane_artifact in output_container.get_placements().values(): for sample_artifact in get_sample_artifacts_from_pool(lims, lane_artifact): From 4eca97919cfa3d84b4298e5089c24f9410a310bd Mon Sep 17 00:00:00 2001 From: Robert Ernst Date: Tue, 5 Mar 2024 16:22:48 +0100 Subject: [PATCH 27/30] Fix file encoding --- clarity_epp/upload/tecan.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clarity_epp/upload/tecan.py b/clarity_epp/upload/tecan.py index 1dbbaec..bd025e0 100644 --- a/clarity_epp/upload/tecan.py +++ b/clarity_epp/upload/tecan.py @@ -21,7 +21,7 @@ def results_qc(lims, process_id): measurements = {} sample_measurements = {} - for line in lims.get_file_contents(tecan_result_file.id).data.split('\n'): + for line in lims.get_file_contents(tecan_result_file.id).data.decode('utf-8').split('\n'): if not line.startswith('<>'): data = line.rstrip().split('\t') for index, value in enumerate(data[1:]): From d1969acce452e9556bad725654ad6389a8639ee2 Mon Sep 17 00:00:00 2001 From: Robert Ernst Date: Fri, 8 Mar 2024 15:15:25 +0100 Subject: [PATCH 28/30] Fix for NovaSeqXPlus workflow configuration --- clarity_epp/placement/artifact.py | 15 ++++++++---- clarity_epp/placement/pool.py | 38 ++++++++++++++++++------------- 2 files changed, 32 insertions(+), 21 deletions(-) diff --git a/clarity_epp/placement/artifact.py b/clarity_epp/placement/artifact.py index aeead75..d96bd1f 100644 --- a/clarity_epp/placement/artifact.py +++ b/clarity_epp/placement/artifact.py @@ -3,6 +3,7 @@ from genologics.entities import Process, Workflow from .. import get_sequence_name +from clarity_epp.export.utils import sort_artifact_list import config @@ -17,19 +18,23 @@ def set_sequence_name(lims, process_id): def set_runid_name(lims, process_id): """Change artifact name to run id.""" process = Process(lims, id=process_id) - analyte = process.analytes()[0][0] input_artifact = process.all_inputs()[0] - container_name = analyte.container.name + # Fix for NovaSeqXPlus workflow configuration + # TODO: Set NovaSeqXPlus step to 'Analysis' type. + if 'NovaSeqXPlus' in input_artifact.parent_process.type.name: + input_artifact = input_artifact.parent_process.all_inputs()[0] # Find sequencing process # Assume one sequence process per input artifact for sequence_process_type in config.sequence_process_types: sequence_processes = lims.get_processes(type=sequence_process_type, inputartifactlimsid=input_artifact.id) for sequence_process in sequence_processes: - if sequence_process.analytes()[0][0].container.name == container_name: - analyte.name = sequence_process.udf['Run ID'] - analyte.put() + sequence_process_lanes = sorted(sequence_process.analytes()[0], key=sort_artifact_list) + for lane_idx, lane in enumerate(sorted(process.analytes()[0], key=sort_artifact_list)): + if sequence_process_lanes[lane_idx].container.name == lane.container.name: + lane.name = sequence_process.udf['Run ID'] + lane.put() def route_to_workflow(lims, process_id, workflow): diff --git a/clarity_epp/placement/pool.py b/clarity_epp/placement/pool.py index ae69b92..f733046 100644 --- a/clarity_epp/placement/pool.py +++ b/clarity_epp/placement/pool.py @@ -14,6 +14,11 @@ def unpooling(lims, process_id): pool_artifact = process.all_inputs()[0] pool_artifact_parent_process = pool_artifact.parent_process + # Fix for NovaSeqXPlus workflow configuration + # TODO: Set NovaSeqXPlus step to 'Analysis' type. + if 'laden' not in pool_artifact_parent_process.type.name.lower(): + pool_artifact_parent_process = pool_artifact_parent_process.all_inputs()[0].parent_process + run_id = pool_artifact.name # Assume run id is set as pool name using placement/artifact/set_runid_name sample_artifacts = [] # sample artifacts before pooling sample_projects = {} @@ -35,22 +40,23 @@ def unpooling(lims, process_id): sample_projects[data[sample_index]] = data[project_index] # Parse sequencing run samples and move Dx samples to post sequencing workflow - for sample_artifact in get_sample_artifacts_from_pool(lims, pool_artifact): - sample = sample_artifact.samples[0] # Asume all samples metadata is identical. - - # Set sample sequencing run and project - sample_artifact.udf['Dx Sequencing Run ID'] = run_id - # Use sample.name for external (clarity_portal) samples - if 'Sample Type' in sample.udf and 'library' in sample.udf['Sample Type']: - sample_artifact.udf['Dx Sequencing Run Project'] = sample_projects[sample.name] - else: # Use sample_artifact.name for Dx samples (upload via Helix) - sample_artifact.udf['Dx Sequencing Run Project'] = sample_projects[sample_artifact.name] - sample_artifact.put() - - # Only move DX production samples to post sequencing workflow - if sample.project and sample.project.udf['Application'] == 'DX': - sample_artifacts.append(sample_artifact) - + # for lane in + for lane in process.all_inputs(): + for sample_artifact in get_sample_artifacts_from_pool(lims, lane): + sample = sample_artifact.samples[0] # Asume all samples metadata is identical. + + # Set sample sequencing run and project + sample_artifact.udf['Dx Sequencing Run ID'] = run_id + # Use sample.name for external (clarity_portal) samples + if 'Sample Type' in sample.udf and 'library' in sample.udf['Sample Type']: + sample_artifact.udf['Dx Sequencing Run Project'] = sample_projects[sample.name] + else: # Use sample_artifact.name for Dx samples (upload via Helix) + sample_artifact.udf['Dx Sequencing Run Project'] = sample_projects[sample_artifact.name] + sample_artifact.put() + + # Only move DX production samples to post sequencing workflow + if sample_artifact not in sample_artifacts and sample.project and sample.project.udf['Application'] == 'DX': + sample_artifacts.append(sample_artifact) lims.route_artifacts(sample_artifacts, workflow_uri=Workflow(lims, id=config.post_sequencing_workflow).uri) From 36ed301967bc030823ea8b85f7bc411b5fc50dd3 Mon Sep 17 00:00:00 2001 From: Robert Ernst Date: Fri, 8 Mar 2024 15:24:39 +0100 Subject: [PATCH 29/30] Add Dx NovaSeqXPlus Run v1.0 --- config.py | 1 + 1 file changed, 1 insertion(+) diff --git a/config.py b/config.py index 3659e7b..d6102c7 100755 --- a/config.py +++ b/config.py @@ -88,6 +88,7 @@ 'Dx NextSeq Run v1.0', 'Dx NextSeq Run v1.1', 'Dx Automated NovaSeq Run (standaard) v1.0', 'Dx Automated NovaSeq Run (standaard) v1.1', 'AUTOMATED - NovaSeq Run (NovaSeq 6000 v3.1)', + 'Dx NovaSeqXPlus Run v1.0' ] # BCLConvert conversion settings From f345fc95b0cb25ddc58af8bac30b8ae4224c49db Mon Sep 17 00:00:00 2001 From: Robert Ernst Date: Fri, 8 Mar 2024 15:28:52 +0100 Subject: [PATCH 30/30] Remove debug comment --- clarity_epp/placement/pool.py | 1 - 1 file changed, 1 deletion(-) diff --git a/clarity_epp/placement/pool.py b/clarity_epp/placement/pool.py index f733046..3e4b26e 100644 --- a/clarity_epp/placement/pool.py +++ b/clarity_epp/placement/pool.py @@ -40,7 +40,6 @@ def unpooling(lims, process_id): sample_projects[data[sample_index]] = data[project_index] # Parse sequencing run samples and move Dx samples to post sequencing workflow - # for lane in for lane in process.all_inputs(): for sample_artifact in get_sample_artifacts_from_pool(lims, lane): sample = sample_artifact.samples[0] # Asume all samples metadata is identical.