diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml
index 6f567d9..5f7fcff 100644
--- a/.github/workflows/python.yml
+++ b/.github/workflows/python.yml
@@ -5,13 +5,12 @@ name: Python (flake8, pytest)
 on:
   push:
-    branches: [main, develop]
+    branches: [master, develop]
   pull_request:
-    branches: [main, develop]
+    branches: [master, develop]

 jobs:
   build:
-    runs-on: ubuntu-20.04

     strategy:
       fail-fast: false
@@ -19,26 +18,26 @@ jobs:
         python-version: [3.6]

     steps:
-    - uses: actions/checkout@v3
-      with:
-        fetch-depth: 0
-    - name: Set up Python ${{ matrix.python-version }}
-      uses: actions/setup-python@v3
-      with:
-        python-version: ${{ matrix.python-version }}
-    - name: "Install Apache package"
-      run: sudo apt install -y apache2-dev
-    - name: Install dependencies
-      run: |
-        python -m pip install --upgrade pip
-        python -m pip install flake8 pytest
-        if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
-    - name: Lint with flake8
-      run: |
-        # stop the build if there are Python syntax errors or undefined names
-        flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
-        # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
-        flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
-    - name: Test with pytest
-      run: |
-        pytest
\ No newline at end of file
+      - uses: actions/checkout@v3
+        with:
+          fetch-depth: 0
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v3
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: "Install Apache package"
+        run: sudo apt install -y apache2-dev
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          python -m pip install flake8 pytest
+          if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
+      - name: Lint with flake8
+        run: |
+          # stop the build if there are Python syntax errors or undefined names
+          flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
+          # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
+          flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
+      - name: Test with pytest
+        run: |
+          pytest
diff --git a/clarity_epp.py b/clarity_epp.py
index e0ffd81..1c89751 100755
--- a/clarity_epp.py
+++ b/clarity_epp.py
@@ -48,8 +48,8 @@ def export_hamilton(args):

 def export_illumina(args):
     """Export (updated) illumina samplesheet."""
-    clarity_epp.export.illumina.update_samplesheet(
-        lims, args.process_id, args.artifact_id, args.output_file, args.conversion_tool
+    clarity_epp.export.illumina.create_samplesheet(
+        lims, args.process_id, args.output_file
     )


@@ -287,13 +287,9 @@ def placement_pipetting(args):
     parser_export_hamilton.set_defaults(func=export_hamilton)

     parser_export_illumina = subparser_export.add_parser(
-        'illumina', help='Export updated illumina samplesheet', parents=[output_parser]
+        'illumina', help='Export illumina samplesheet', parents=[output_parser]
     )
     parser_export_illumina.add_argument('process_id', help='Clarity lims process id')
-    parser_export_illumina.add_argument('artifact_id', help='Clarity lims samplesheet artifact id')
-    parser_export_illumina.add_argument(
-        '-c', '--conversion_tool', choices=['bcl2fastq', 'bclconvert'], default='bcl2fastq', help='Illumina conversion tool'
-    )
     parser_export_illumina.set_defaults(func=export_illumina)

     parser_export_labels = subparser_export.add_parser('labels', help='Export container labels', parents=[output_parser])
diff --git a/clarity_epp/__init__.py b/clarity_epp/__init__.py
index e41654a..61e1013 100644
--- a/clarity_epp/__init__.py
+++ b/clarity_epp/__init__.py
@@ -38,8 +38,8 @@ def get_sample_artifacts_from_pool(lims, pool_artifact):
         # Check if sample_artifact with 2 samples are from the same person
         if len(sample_artifact.samples) == 2:
             if (
-                'Dx Persoons ID' in sample_artifact.samples[0].udf or
-                'Dx Persoons ID' in sample_artifact.samples[1].udf or
+                'Dx Persoons ID' in sample_artifact.samples[0].udf and
+                'Dx Persoons ID' in sample_artifact.samples[1].udf and
                 sample_artifact.samples[0].udf['Dx Persoons ID'] == sample_artifact.samples[1].udf['Dx Persoons ID']
             ):
                 sample_artifacts.append(sample_artifact)
diff --git a/clarity_epp/export/bioanalyzer.py b/clarity_epp/export/bioanalyzer.py
index 1e619f6..0eefe96 100644
--- a/clarity_epp/export/bioanalyzer.py
+++ b/clarity_epp/export/bioanalyzer.py
@@ -16,7 +16,7 @@ def samplesheet(lims, process_id, output_file):
     }

     # Get sample placement
-    for placement, artifact in process.output_containers()[0].placements.iteritems():
+    for placement, artifact in process.output_containers()[0].placements.items():
         placement = ''.join(placement.split(':'))
         plate[placement]['name'] = artifact.name
         plate[placement]['comment'] = ''
diff --git a/clarity_epp/export/hamilton.py b/clarity_epp/export/hamilton.py
index d4c589b..0002df4 100755
--- a/clarity_epp/export/hamilton.py
+++ b/clarity_epp/export/hamilton.py
@@ -11,7 +11,7 @@ def samplesheet_filling_out(lims, process_id, output_file):
     process = Process(lims, id=process_id)
     well_plate = {}

-    for placement, artifact in process.output_containers()[0].placements.iteritems():
+    for placement, artifact in process.output_containers()[0].placements.items():
         placement = ''.join(placement.split(':'))
         well_plate[placement] = artifact.samples[0].udf['Dx Fractienummer']

@@ -29,7 +29,7 @@ def samplesheet_purify(lims, process_id, output_file):
     parent_process_barcode = process.parent_processes()[0].output_containers()[0].name
     well_plate = {}

-    for placement, artifact in process.output_containers()[0].placements.iteritems():
+    for placement, artifact in process.output_containers()[0].placements.items():
         placement = ''.join(placement.split(':'))
         well_plate[placement] = artifact.samples[0].udf['Dx Fractienummer']
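The or -> and change in get_sample_artifacts_from_pool() above is easy to misread, so a worked illustration may help. With 'or', a pool artifact whose samples both lack 'Dx Persoons ID' still reaches the ID comparison and raises KeyError, while an artifact where only one sample carries the UDF is accepted without the IDs ever being compared. A self-contained sketch, with plain dicts standing in for the genologics UDF mappings (variable names are illustrative only):

    # Two samples pooled into one artifact; only the first carries the UDF.
    udf_a = {'Dx Persoons ID': '123'}
    udf_b = {}

    same_person_or = (
        'Dx Persoons ID' in udf_a or
        'Dx Persoons ID' in udf_b or
        udf_a['Dx Persoons ID'] == udf_b['Dx Persoons ID']
    )
    print(same_person_or)  # True -> artifact wrongly accepted under the old logic

    same_person_and = (
        'Dx Persoons ID' in udf_a and
        'Dx Persoons ID' in udf_b and
        udf_a['Dx Persoons ID'] == udf_b['Dx Persoons ID']
    )
    print(same_person_and)  # False -> artifact correctly skipped under the new logic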
diff --git a/clarity_epp/export/illumina.py b/clarity_epp/export/illumina.py
index 11aff7c..f4ee596 100644
--- a/clarity_epp/export/illumina.py
+++ b/clarity_epp/export/illumina.py
@@ -1,53 +1,113 @@
 """Illumina export functions."""
 import operator
 import re
-import csv

-from genologics.entities import Process, Artifact
+from genologics.entities import Process

 from .. import get_sequence_name, get_sample_artifacts_from_pool
-import clarity_epp.export.utils
+from clarity_epp.export.utils import get_sample_sequence_index, reverse_complement
 import config


-def update_samplesheet(lims, process_id, artifact_id, output_file, conversion_tool):
-    """Update illumina samplesheet."""
-    process = Process(lims, id=process_id)
-    trim_last_base = True  # Used to set Read1EndWithCycle
+def get_project(projects, urgent=False):
+    """Get a project name from a projects dict ({'project_name': sample_count, ...}).
+    If urgent is True, return the first project with < 9 samples, else return the project with the fewest samples.
+    """
+    if urgent:  # Sort projects for urgent samples on name
+        projects_sorted = sorted(projects.items(), key=operator.itemgetter(0))
+        for project in projects_sorted:
+            if project[1] < 9:
+                return project[0]  # return first project with < 9 samples
+
+    # Sort projects on number of samples, if not urgent or no projects left with < 9 samples
+    projects_sorted = sorted(projects.items(), key=operator.itemgetter(1))
+    return projects_sorted[0][0]  # return project with the fewest samples
+
+
+def get_override_cycles(read_len, umi_len, index_len, max_index_len, index_2_conversion_orientation):
+    """Get override cycles per sample."""
+    read_cycles = ['', '']
+    index_cycles = ['', '']
+
+    for idx in range(len(read_cycles)):
+        if umi_len[idx]:  # read cycle with umi
+            read_cycle = f'U{umi_len[idx]}Y{read_len[idx]-1-umi_len[idx]}N1'
+        else:  # read cycle without umi
+            read_cycle = f'Y{read_len[idx]-1}N1'
+        read_cycles[idx] = read_cycle
+
+    for idx in range(len(index_cycles)):
+        if index_len[idx]:
+            if index_len[idx] < max_index_len[idx]:
+                n_bases = max_index_len[idx] - index_len[idx]
+                if idx == 1 and index_2_conversion_orientation == 'F':  # Index 2 in forward orientation (NovaSeq X Plus)
+                    index_cycle = f'N{n_bases}I{index_len[idx]}'
+                else:
+                    index_cycle = f'I{index_len[idx]}N{n_bases}'
+            else:
+                index_cycle = f'I{index_len[idx]}'
+        else:  # empty index, single index library
+            index_cycle = f'N{index_len[idx]}'
+        index_cycles[idx] = index_cycle
+
+    override_cycles = ';'.join([
+        read_cycles[0],  # read 1
+        index_cycles[0],  # index 1
+        index_cycles[1],  # index 2
+        read_cycles[1],  # read 2
+    ])

-    def get_project(projects, urgent=False):
-        """Inner function to get a project name for samples."""
-        if urgent:  # Sort projects for urgent samples on name
-            projects_sorted = sorted(projects.items(), key=operator.itemgetter(0))
-            for project in projects_sorted:
-                if project[1] < 9:
-                    return project[0]  # return first project with < 9 samples
+    return override_cycles

-        # Sort projects on number of samples, if not urgent or no projects left with <9 samples
-        projects_sorted = sorted(projects.items(), key=operator.itemgetter(1))
-        return projects_sorted[0][0]  # return project with least amount of samples.

-    # Parse families
+def get_samplesheet_samples(sample_artifacts, process, index_2_conversion_orientation):
     families = {}
-    sample_artifacts = get_sample_artifacts_from_pool(lims, process.analytes()[0][0])
+    samplesheet_samples = {}

     for sample_artifact in sample_artifacts:
+        sample_sequence_name = get_sequence_name(sample_artifact)
+        sample_index = get_sample_sequence_index(sample_artifact.reagent_labels[0])
+        # Adjust empty second index for single index samples
+        if len(sample_index) == 1:
+            sample_index.append('')
+
         for sample in sample_artifact.samples:
+            # Dx production sample
             if (
-                'Dx Familienummer' in list(sample.udf) and
-                'Dx NICU Spoed' in list(sample.udf) and
-                'Dx Protocolomschrijving' in list(sample.udf)
+                'Dx Familienummer' in sample.udf and
+                'Dx NICU Spoed' in sample.udf and
+                'Dx Protocolomschrijving' in sample.udf and
+                'Dx Stoftest code' in sample.udf
             ):
-                # Dx production sample
-                family = sample.udf['Dx Familienummer']
+                # Skip Mengfractie samples
+                if sample.udf['Dx Stoftest code'] == config.stoftestcode_wes_duplo:
+                    continue
+
+                # Get sample conversion settings
+                sample_conversion_setting = config.sample_conversion_settings['default']
+                newest_protocol = sample.udf['Dx Protocolomschrijving'].split(';')[0]
+                for protocol_code in config.sample_conversion_settings:
+                    if protocol_code in newest_protocol:  # Look for protocol code (elid number) in newest protocol
+                        sample_conversion_setting = config.sample_conversion_settings[protocol_code]
+                        break
+
+                # Get sample override cycles
+                sample_override_cycles = get_override_cycles(
+                    read_len=[process.udf['Read 1 Cycles'], process.udf['Read 2 Cycles']],
+                    umi_len=sample_conversion_setting['umi_len'],
+                    index_len=[len(sample_index[0]), len(sample_index[1])],
+                    max_index_len=[process.udf['Index Read 1'], process.udf['Index Read 2']],
+                    index_2_conversion_orientation=index_2_conversion_orientation
+                )

-                # Create family if not exist
+                # Set family and create if not exist
+                family = sample.udf['Dx Familienummer']
                 if family not in families:
                     families[family] = {
                         'samples': [],
                         'NICU': False,
-                        'project_type': 'unknown_project',
-                        'split_project_type': False,
+                        'project_type': sample_conversion_setting['project'],
+                        'split_project_type': sample_conversion_setting['split_project'],
                         'urgent': False,
                         'deviating': False  # merge, deep sequencing (5x), etc samples
                     }
@@ -62,34 +122,10 @@ def get_project(projects, urgent=False):
                         break
                 else:
                     # Dx clinic sample
-                    newest_protocol = sample.udf['Dx Protocolomschrijving'].split(';')[0]
-                    if 'SNP fingerprint MIP' in newest_protocol and not families[family]['NICU']:
-                        project_type = 'Fingerprint'
-                        families[family]['project_type'] = project_type
-                        families[family]['split_project_type'] = False
-                        trim_last_base = False
-                    elif 'PID09.V7_smMIP' in newest_protocol and not families[family]['NICU']:
-                        project_type = 'ERARE'
-                        families[family]['project_type'] = project_type
-                        families[family]['split_project_type'] = False
-                        trim_last_base = False
-                    elif sample.udf['Dx NICU Spoed']:
+                    if sample.udf['Dx NICU Spoed']:
                         families[family]['NICU'] = True
-                        project_type = 'NICU_{0}'.format(sample.udf['Dx Familienummer'])
-                        families[family]['project_type'] = project_type
+                        families[family]['project_type'] = 'NICU_{0}'.format(sample.udf['Dx Familienummer'])
                         families[family]['split_project_type'] = False
-                    elif 'elidS30409818' in newest_protocol and not families[family]['NICU']:
-                        project_type = 'CREv2'
-                        families[family]['project_type'] = project_type
-                        families[family]['split_project_type'] = True
-                    elif 'elidS31285117' in newest_protocol and not families[family]['NICU']:
-                        project_type = 'SSv7'
-                        families[family]['project_type'] = project_type
-                        families[family]['split_project_type'] = True
-                    elif 'elidS34226467' in newest_protocol and not families[family]['NICU']:
-                        project_type = 'CREv4'
-                        families[family]['project_type'] = project_type
-                        families[family]['split_project_type'] = True

                     # Set urgent status
                     if 'Dx Spoed' in list(sample.udf) and sample.udf['Dx Spoed']:
@@ -104,12 +140,8 @@
                         families[family]['urgent'] = False
             else:
                 # Other samples
-                if 'GIAB' in sample.name.upper() and not sample.project:  # GIAB control samples
-                    family = 'GIAB'
-                else:
-                    family = sample.project.name
-                    # Remove 'dx' (ignore case) and strip leading space or _
-                    family = re.sub('^dx[ _]*', '', family, flags=re.IGNORECASE)
+                # Use project name as family name and remove 'dx' (ignore case) and strip leading space or _
+                family = re.sub('^dx[ _]*', '', sample.project.name, flags=re.IGNORECASE)
                 if family not in families:
                     families[family] = {
                         'samples': [],
@@ -120,9 +152,32 @@
                         'deviating': False
                     }

-            # Add sample_artifact to family
-            if sample_artifact not in families[family]['samples']:
-                families[family]['samples'].append(sample_artifact)
+                # Setup override cycles
+                if 'Dx Override Cycles' in list(sample.udf) and sample.udf['Dx Override Cycles']:
+                    sample_override_cycles = sample.udf['Dx Override Cycles']
+                else:
+                    sample_override_cycles = get_override_cycles(
+                        read_len=[process.udf['Read 1 Cycles'], process.udf['Read 2 Cycles']],
+                        umi_len=config.sample_conversion_settings['default']['umi_len'],
+                        index_len=[len(sample_index[0]), len(sample_index[1])],
+                        max_index_len=[process.udf['Index Read 1'], process.udf['Index Read 2']],
+                        index_2_conversion_orientation=index_2_conversion_orientation
+                    )
+
+            # Add sample to samplesheet_samples
+            samplesheet_samples[sample_sequence_name] = {
+                'index_1': sample_index[0],
+                'index_2': sample_index[1],
+                'override_cycles': sample_override_cycles,
+            }
+            if index_2_conversion_orientation == 'RC':  # Reverse complement index 2
+                samplesheet_samples[sample_sequence_name]['index_2'] = reverse_complement(
+                    samplesheet_samples[sample_sequence_name]['index_2']
+                )
+
+            # Add sample to family
+            if sample_sequence_name not in families[family]['samples']:
+                families[family]['samples'].append(sample_sequence_name)

     # Get all project types and count samples
     project_types = {}
@@ -146,148 +201,98 @@
             project_types[project_type]['projects'][project_type] = 0

     # Set sample projects
-    sample_projects = {}
-    sample_sequence_names = {}
-
     # Urgent families / samples, skip deviating
     for family in [family for family in families.values() if family['urgent'] and not family['deviating']]:
         family_project = get_project(project_types[family['project_type']]['projects'], urgent=True)
-        for sample_artifact in family['samples']:
-            sample_sequence_name = get_sequence_name(sample_artifact)
-            for sample in sample_artifact.samples:
-                sample_sequence_names[sample.name] = sample_sequence_name
-            sample_projects[sample_sequence_name] = family_project
+        for sample_sequence_name in family['samples']:
+            samplesheet_samples[sample_sequence_name]['project'] = family_project
         project_types[family['project_type']]['projects'][family_project] += 1

     # Deviating families / samples
     for family in [family for family in families.values() if family['deviating']]:
         family_project = get_project(project_types[family['project_type']]['projects'])
-        for sample_artifact in family['samples']:
-            sample_sequence_name = get_sequence_name(sample_artifact)
-            for sample in sample_artifact.samples:
-                sample_sequence_names[sample.name] = sample_sequence_name
-            sample_projects[sample_sequence_name] = family_project
+        for sample_sequence_name in family['samples']:
+            samplesheet_samples[sample_sequence_name]['project'] = family_project
         project_types[family['project_type']]['projects'][family_project] += 1

     # Non urgent and non deviating families / samples
     normal_families = [family for family in families.values() if not family['urgent'] and not family['deviating']]
     for family in sorted(normal_families, key=lambda fam: (len(fam['samples'])), reverse=True):
         family_project = get_project(project_types[family['project_type']]['projects'])
-        for sample_artifact in family['samples']:
-            sample_sequence_name = get_sequence_name(sample_artifact)
-            for sample in sample_artifact.samples:
-                sample_sequence_names[sample.name] = sample_sequence_name
-            sample_projects[sample_sequence_name] = family_project
+        for sample_sequence_name in family['samples']:
+            samplesheet_samples[sample_sequence_name]['project'] = family_project
         project_types[family['project_type']]['projects'][family_project] += 1

-    # Check sequencer type
-    # NextSeq runs need to reverse complement 'index2' for dual barcodes and 'index' for single barcodes.
-    if 'nextseq' in process.type.name.lower():
-        nextseq_run = True
-    else:
-        nextseq_run = False
-
-    # Edit clarity samplesheet
-    sample_header = ''  # empty until [data] section
-    settings_section = False
-    samplesheet_artifact = Artifact(lims, id=artifact_id)
-    file_id = samplesheet_artifact.files[0].id
-
-    # Setup custom settings
-    custom_settings = ''
-
-    if conversion_tool == 'bcl2fastq' and trim_last_base:
-        custom_settings = (
-            'Read1EndWithCycle,{read_1_value}\n'
-            'Read2EndWithCycle,{read_2_value}\n'
-        ).format(
-            read_1_value=process.udf['Read 1 Cycles']-1, read_2_value=process.udf['Read 2 Cycles']-1
-        )
+    return samplesheet_samples

-    elif conversion_tool == 'bclconvert':
-        # Setup OverrideCycles
-        if trim_last_base or process.udf['UMI - Trim']:
-            override_cycles = [
-                '',  # read 1
-                'I{0}'.format(process.udf['Index Read 1']),  # index 1
-                'I{0}'.format(process.udf['Index Read 2']),  # index 2
-                '',  # read 2
-            ]
-
-            if trim_last_base and process.udf['UMI - Trim']:
-                override_cycles[0] = 'U{umi}Y{read}N1'.format(
-                    umi=process.udf['UMI - Read 1 Length'],
-                    read=process.udf['Read 1 Cycles'] - process.udf['UMI - Read 1 Length'] - 1
-                )
-                override_cycles[3] = 'U{umi}Y{read}N1'.format(
-                    umi=process.udf['UMI - Read 2 Length'],
-                    read=process.udf['Read 2 Cycles'] - process.udf['UMI - Read 2 Length'] - 1
-                )
-                custom_settings = 'TrimUMI,1\n'
-            elif trim_last_base:
-                override_cycles[0] = 'Y{read}N1'.format(read=process.udf['Read 1 Cycles'] - 1)
-                override_cycles[3] = 'Y{read}N1'.format(read=process.udf['Read 2 Cycles'] - 1)
-            elif process.udf['UMI - Trim']:
-                override_cycles[0] = 'U{umi}Y{read}'.format(
-                    umi=process.udf['UMI - Read 1 Length'],
-                    read=process.udf['Read 1 Cycles'] - process.udf['UMI - Read 1 Length']
-                )
-                override_cycles[3] = 'U{umi}Y{read}'.format(
-                    umi=process.udf['UMI - Read 2 Length'],
-                    read=process.udf['Read 2 Cycles'] - process.udf['UMI - Read 2 Length']
-                )
-                custom_settings = 'TrimUMI,1\n'
-
-        custom_settings = '{settings}OverrideCycles,{override_cycles}\n'.format(
-            settings=custom_settings,
-            override_cycles=';'.join(override_cycles)
-        )

+def create_samplesheet(lims, process_id, output_file):
+    """Create illumina samplesheet v2."""
+    process = Process(lims, id=process_id)
+    sequencer_conversion_settings = config.sequencer_conversion_settings[process.type.name]
+
+    # Get output container, assume one flowcell per sequencing run
+    output_container = process.output_containers()[0]
+
+    # Get samples per lane
+    samplesheet_samples = {}
+    for lane_idx, lane_artifact in output_container.get_placements().items():
+        lane_idx = lane_idx.split(':')[0]
+        sample_artifacts = get_sample_artifacts_from_pool(lims, lane_artifact)
+        samplesheet_samples[lane_idx] = get_samplesheet_samples(
+            sample_artifacts, process, sequencer_conversion_settings['index_2_conversion_orientation']
+        )
+
+    # Create SampleSheet
+    sample_sheet = [
+        # Header
+        "[Header]",
+        "FileFormatVersion,2",
+        f"InstrumentPlatform,{sequencer_conversion_settings['instrument_platform']}",
+        f"IndexOrientation,{sequencer_conversion_settings['index_orientation']}",
+        f"RunName,{process.udf['Experiment Name']}",
+        # Reads
+        "[Reads]",
+        f"Read1Cycles,{process.udf['Read 1 Cycles']}",
+        f"Read2Cycles,{process.udf['Read 2 Cycles']}",
+        f"Index1Cycles,{process.udf['Index Read 1']}",
+        f"Index2Cycles,{process.udf['Index Read 2']}",
+        # BCLConvert_Settings
+        "[BCLConvert_Settings]",
+        f"SoftwareVersion,{sequencer_conversion_settings['software_version']}",
+        f"FastqCompressionFormat,{sequencer_conversion_settings['fastq_compression_format']}",
+        f"AdapterRead1,{process.udf['Adapter']}",
+        f"AdapterRead2,{process.udf['Adapter Read 2']}",
+        "FindAdaptersWithIndels,TRUE",
+        "BarcodeMismatchesIndex1,0",
+        "BarcodeMismatchesIndex2,0",
+        "TrimUMI,TRUE",
+        # BCLConvert_Data
+        "[BCLConvert_Data]"
+    ]

-    for data in csv.reader(
-        lims.get_file_contents(id=file_id).rstrip().split('\n'),
-        quotechar='"', delimiter=',', quoting=csv.QUOTE_ALL, skipinitialspace=True
-    ):
-        if data[0] == '[Settings]' and custom_settings:
-            output_file.write('{line}\n'.format(line=','.join(data)))
-            output_file.write(custom_settings)
-            settings_section = True
-
-        elif data[0] == '[Data]' and custom_settings and not settings_section:
-            output_file.write('[Settings]\n')
-            output_file.write(custom_settings)
-            output_file.write('{line}\n'.format(line=','.join(data)))
-
-        elif data[0] == 'Sample_ID':  # Samples header line
-            sample_header = data
-            sample_id_index = sample_header.index('Sample_ID')
-            sample_name_index = sample_header.index('Sample_Name')
-            sample_project_index = sample_header.index('Sample_Project')
-
-            if 'index2' in sample_header:
-                index_index = sample_header.index('index2')
-            else:
-                index_index = sample_header.index('index')
-
-            output_file.write('{line}\n'.format(line=','.join(data)))
-
-        elif sample_header:  # Samples header seen, so continue with samples.
-            sample_name = data[sample_name_index].split(',')[0]
-            if sample_name in sample_sequence_names:
-                data[sample_name_index] = sample_sequence_names[sample_name]
-
-                # Set Sample_Project
-                if data[sample_name_index] in sample_projects:
-                    data[sample_project_index] = sample_projects[data[sample_name_index]]
-
-                # Overwrite Sample_ID with Sample_name to get correct conversion output folder structure
-                data[sample_id_index] = data[sample_name_index]
-
-                # Reverse complement index for NextSeq runs
-                if nextseq_run:
-                    data[index_index] = clarity_epp.export.utils.reverse_complement(data[index_index])
-
-            output_file.write('{line}\n'.format(line=','.join(data)))
-        else:  # Leave other lines untouched.
-            output_file.write('{line}\n'.format(line=','.join(data)))
+    # Set header for single or multiple lanes conversion
+    bcl_convert_data_header = "Sample_ID,index,index2,OverrideCycles,Sample_Project"
+    if len(samplesheet_samples) == 1:  # All samples on all lanes
+        multiple_lanes = False
+    else:
+        multiple_lanes = True
+        bcl_convert_data_header = f"Lane,{bcl_convert_data_header}"  # Add lane column to header if multiple lanes conversion
+    sample_sheet.append(bcl_convert_data_header)
+
+    # Add samples to SampleSheet
+    for lane, lane_samples in sorted(samplesheet_samples.items()):
+        for sample in lane_samples:
+            bcl_convert_data_row = "{sample_name},{index_1},{index_2},{override_cycles},{project}".format(
+                sample_name=sample,
+                index_1=lane_samples[sample]['index_1'],
+                index_2=lane_samples[sample]['index_2'],
+                override_cycles=lane_samples[sample]['override_cycles'],
+                project=lane_samples[sample]['project']
+            )
+            if multiple_lanes:  # Add lane number to row if multiple lanes conversion
+                bcl_convert_data_row = f"{lane},{bcl_convert_data_row}"
+            sample_sheet.append(bcl_convert_data_row)

+    # Write SampleSheet to file
+    output_file.write('\n'.join(sample_sheet))
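A worked example of the OverrideCycles arithmetic implemented above, mirroring the new unit tests at the bottom of this diff: a 2x151 Magnis run with 5 bp UMIs and 8 bp indices, converted on a sequencer configured for 19/10 index cycles. The last base of each read is masked (N1), and the index padding moves to the front of index 2 on the NovaSeq X Plus because its index 2 is read in forward orientation:

    read_len, umi_len = 151, 5
    index_len, max_index_1, max_index_2 = 8, 19, 10

    read_cycle = f'U{umi_len}Y{read_len - 1 - umi_len}N1'   # 'U5Y145N1': 5 UMI + 145 read + 1 masked base
    index_1 = f'I{index_len}N{max_index_1 - index_len}'     # 'I8N11': 8 index + 11 padding cycles
    index_2_rc = f'I{index_len}N{max_index_2 - index_len}'  # 'I8N2': NovaSeq 6000, 'RC' orientation
    index_2_f = f'N{max_index_2 - index_len}I{index_len}'   # 'N2I8': NovaSeq X Plus, 'F' orientation

    print(';'.join([read_cycle, index_1, index_2_rc, read_cycle]))  # U5Y145N1;I8N11;I8N2;U5Y145N1
    print(';'.join([read_cycle, index_1, index_2_f, read_cycle]))   # U5Y145N1;I8N11;N2I8;U5Y145N1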
diff --git a/clarity_epp/export/manual_pipetting.py b/clarity_epp/export/manual_pipetting.py
index 81cea0c..09a007e 100755
--- a/clarity_epp/export/manual_pipetting.py
+++ b/clarity_epp/export/manual_pipetting.py
@@ -681,9 +681,9 @@ def samplesheet_pool_samples(lims, process_id, output_file):
         input_sample = input_artifact.samples[0]  # Asume one sample

         if 'Dx Exoomequivalent' in input_sample.udf:
-            volume = 5 * input_sample.udf['Dx Exoomequivalent']
+            volume = 4 * input_sample.udf['Dx Exoomequivalent']
         else:
-            volume = 5
+            volume = 4

         output_file.write(
             '{sample}\t{container}\t{well}\t{pool}\t{volume}\n'.format(
diff --git a/clarity_epp/export/merge.py b/clarity_epp/export/merge.py
index bab9afd..e360f5f 100644
--- a/clarity_epp/export/merge.py
+++ b/clarity_epp/export/merge.py
@@ -7,7 +7,17 @@ def create_file(lims, process_id, output_file):
     """Create mege file."""
     process = Process(lims, id=process_id)
-    sample_artifacts = get_sample_artifacts_from_pool(lims, process.analytes()[0][0])
+
+    # Get output container, assume one flowcell per sequencing run
+    output_container = process.output_containers()[0]
+
+    # Get unique sample artifacts in run
+    # TODO: This is a copy of the code from ped.py. It should be refactored into a common function.
+    sample_artifacts = []
+    for lane_artifact in output_container.get_placements().values():
+        for sample_artifact in get_sample_artifacts_from_pool(lims, lane_artifact):
+            if sample_artifact not in sample_artifacts:
+                sample_artifacts.append(sample_artifact)

     output_file.write('Sample\tMerge 1 Sample\tMerge 1 Sequencing Run\tMerge 2 Sample\tMerge 2 Sequencing Run\n')
diff --git a/clarity_epp/export/ped.py b/clarity_epp/export/ped.py
index 836d7af..822b0e3 100644
--- a/clarity_epp/export/ped.py
+++ b/clarity_epp/export/ped.py
@@ -7,7 +7,18 @@ def create_file(lims, process_id, output_file):
     """Create ped file."""
     process = Process(lims, id=process_id)
-    sample_artifacts = get_sample_artifacts_from_pool(lims, process.analytes()[0][0])
+
+    # Get output container, assume one flowcell per sequencing run
+    output_container = process.output_containers()[0]
+
+    # Get unique sample artifacts in run
+    # TODO: This is a copy of the code from merge.py. It should be refactored into a common function.
+    sample_artifacts = []
+    for lane_artifact in output_container.get_placements().values():
+        for sample_artifact in get_sample_artifacts_from_pool(lims, lane_artifact):
+            if sample_artifact not in sample_artifacts:
+                sample_artifacts.append(sample_artifact)
+
     ped_families = {}

     for sample_artifact in sample_artifacts:
diff --git a/clarity_epp/export/tapestation.py b/clarity_epp/export/tapestation.py
index b7fcc32..f39d52e 100644
--- a/clarity_epp/export/tapestation.py
+++ b/clarity_epp/export/tapestation.py
@@ -10,7 +10,7 @@ def samplesheet(lims, process_id, output_file):
     process = Process(lims, id=process_id)
     well_plate = {}

-    for placement, artifact in process.output_containers()[0].placements.iteritems():
+    for placement, artifact in process.output_containers()[0].placements.items():
         placement = ''.join(placement.split(':'))
         well_plate[placement] = artifact.name.split('_')[0]

diff --git a/clarity_epp/export/utils.py b/clarity_epp/export/utils.py
index 1f9d20c..549e9fa 100755
--- a/clarity_epp/export/utils.py
+++ b/clarity_epp/export/utils.py
@@ -1,4 +1,5 @@
 """Utility functions used for creating samplesheets."""
+import re


 def sort_96_well_plate(wells):
@@ -84,3 +85,13 @@ def get_well_index(well, one_based=False):
         return wells.index(well) + 1
     else:
         return wells.index(well)
+
+
+def get_sample_sequence_index(reagent_label):
+    """Return sample sequence indices [index1, index2] from reagent label.
+    Expected reagent label pattern: "<label name> (index1-index2)" or "<label name> (index1)".
+    """
+    sample_index_search = re.search(r"\(([ACTGN-]+)\)$", reagent_label)
+    sample_index = sample_index_search.group(1).split('-')
+
+    return sample_index
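The lane-walking loops added to merge.py and ped.py above are identical, and both carry a TODO to refactor them into a common function. A possible shared helper, sketched under the same one-flowcell-per-run assumption; the function name and placement are suggestions, not part of this diff:

    from clarity_epp import get_sample_artifacts_from_pool

    def get_unique_sample_artifacts_from_run(lims, process):
        """Return the unique sample artifacts across all lanes of a sequencing run."""
        output_container = process.output_containers()[0]  # one flowcell per run

        sample_artifacts = []
        for lane_artifact in output_container.get_placements().values():
            for sample_artifact in get_sample_artifacts_from_pool(lims, lane_artifact):
                if sample_artifact not in sample_artifacts:  # keep first occurrence, drop duplicates
                    sample_artifacts.append(sample_artifact)
        return sample_artifacts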
diff --git a/clarity_epp/placement/artifact.py b/clarity_epp/placement/artifact.py
index aeead75..d96bd1f 100644
--- a/clarity_epp/placement/artifact.py
+++ b/clarity_epp/placement/artifact.py
@@ -3,6 +3,7 @@
 from genologics.entities import Process, Workflow

 from .. import get_sequence_name
+from clarity_epp.export.utils import sort_artifact_list
 import config


@@ -17,19 +18,23 @@ def set_sequence_name(lims, process_id):
 def set_runid_name(lims, process_id):
     """Change artifact name to run id."""
     process = Process(lims, id=process_id)
-    analyte = process.analytes()[0][0]
     input_artifact = process.all_inputs()[0]

-    container_name = analyte.container.name
+    # Fix for NovaSeqXPlus workflow configuration
+    # TODO: Set NovaSeqXPlus step to 'Analysis' type.
+    if 'NovaSeqXPlus' in input_artifact.parent_process.type.name:
+        input_artifact = input_artifact.parent_process.all_inputs()[0]

     # Find sequencing process
     # Assume one sequence process per input artifact
     for sequence_process_type in config.sequence_process_types:
         sequence_processes = lims.get_processes(type=sequence_process_type, inputartifactlimsid=input_artifact.id)
         for sequence_process in sequence_processes:
-            if sequence_process.analytes()[0][0].container.name == container_name:
-                analyte.name = sequence_process.udf['Run ID']
-                analyte.put()
+            sequence_process_lanes = sorted(sequence_process.analytes()[0], key=sort_artifact_list)
+            for lane_idx, lane in enumerate(sorted(process.analytes()[0], key=sort_artifact_list)):
+                if sequence_process_lanes[lane_idx].container.name == lane.container.name:
+                    lane.name = sequence_process.udf['Run ID']
+                    lane.put()


 def route_to_workflow(lims, process_id, workflow):
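The set_runid_name() fix above and the unpooling() fix below apply the same workaround for the NovaSeqXPlus workflow configuration: when the direct parent process is not the step the code expects (detected by name — 'NovaSeqXPlus' in the type name above, no 'laden' in it below), they hop one process upstream. A sketch of the pattern as used above; the helper name is illustrative and does not exist in the codebase:

    def resolve_sequencing_input(input_artifact):
        """Return the artifact to query sequencing processes for.

        Mirrors the workaround in set_runid_name(): the interposed NovaSeqXPlus
        step is skipped by taking the artifact that went into it instead.
        Illustrative only; obsolete once the step is an 'Analysis' type step.
        """
        parent_process = input_artifact.parent_process
        if 'NovaSeqXPlus' in parent_process.type.name:
            return parent_process.all_inputs()[0]
        return input_artifact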
diff --git a/clarity_epp/placement/pool.py b/clarity_epp/placement/pool.py
index ae69b92..3e4b26e 100644
--- a/clarity_epp/placement/pool.py
+++ b/clarity_epp/placement/pool.py
@@ -14,6 +14,11 @@ def unpooling(lims, process_id):
     pool_artifact = process.all_inputs()[0]
     pool_artifact_parent_process = pool_artifact.parent_process

+    # Fix for NovaSeqXPlus workflow configuration
+    # TODO: Set NovaSeqXPlus step to 'Analysis' type.
+    if 'laden' not in pool_artifact_parent_process.type.name.lower():
+        pool_artifact_parent_process = pool_artifact_parent_process.all_inputs()[0].parent_process
+
     run_id = pool_artifact.name  # Assume run id is set as pool name using placement/artifact/set_runid_name
     sample_artifacts = []  # sample artifacts before pooling
     sample_projects = {}
@@ -35,22 +40,22 @@ def unpooling(lims, process_id):
                 sample_projects[data[sample_index]] = data[project_index]

     # Parse sequencing run samples and move Dx samples to post sequencing workflow
-    for sample_artifact in get_sample_artifacts_from_pool(lims, pool_artifact):
-        sample = sample_artifact.samples[0]  # Asume all samples metadata is identical.
-
-        # Set sample sequencing run and project
-        sample_artifact.udf['Dx Sequencing Run ID'] = run_id
-        # Use sample.name for external (clarity_portal) samples
-        if 'Sample Type' in sample.udf and 'library' in sample.udf['Sample Type']:
-            sample_artifact.udf['Dx Sequencing Run Project'] = sample_projects[sample.name]
-        else:  # Use sample_artifact.name for Dx samples (upload via Helix)
-            sample_artifact.udf['Dx Sequencing Run Project'] = sample_projects[sample_artifact.name]
-        sample_artifact.put()
-
-        # Only move DX production samples to post sequencing workflow
-        if sample.project and sample.project.udf['Application'] == 'DX':
-            sample_artifacts.append(sample_artifact)
-
+    for lane in process.all_inputs():
+        for sample_artifact in get_sample_artifacts_from_pool(lims, lane):
+            sample = sample_artifact.samples[0]  # Assume all sample metadata is identical.
+
+            # Set sample sequencing run and project
+            sample_artifact.udf['Dx Sequencing Run ID'] = run_id
+            # Use sample.name for external (clarity_portal) samples
+            if 'Sample Type' in sample.udf and 'library' in sample.udf['Sample Type']:
+                sample_artifact.udf['Dx Sequencing Run Project'] = sample_projects[sample.name]
+            else:  # Use sample_artifact.name for Dx samples (upload via Helix)
+                sample_artifact.udf['Dx Sequencing Run Project'] = sample_projects[sample_artifact.name]
+            sample_artifact.put()
+
+            # Only move DX production samples to post sequencing workflow
+            if sample_artifact not in sample_artifacts and sample.project and sample.project.udf['Application'] == 'DX':
+                sample_artifacts.append(sample_artifact)

     lims.route_artifacts(sample_artifacts, workflow_uri=Workflow(lims, id=config.post_sequencing_workflow).uri)
diff --git a/clarity_epp/qc/qubit.py b/clarity_epp/qc/qubit.py
index fcc07b6..2c16aee 100644
--- a/clarity_epp/qc/qubit.py
+++ b/clarity_epp/qc/qubit.py
@@ -9,7 +9,7 @@ def set_qc_flag(lims, process_id, cutoff=10):
     """Set qubit qc flags based on Dx Concentratie fluorescentie (ng/ul) values."""
     process = Process(lims, id=process_id)
     artifacts = process.result_files()
-    concentration_range = map(float, re.findall('[\d\.]+', process.udf['Concentratiebereik (ng/ul)']))
+    concentration_range = list(map(float, re.findall('[\d\.]+', process.udf['Concentratiebereik (ng/ul)'])))

     samples_measurements = {}
     for artifact in artifacts:
diff --git a/clarity_epp/upload/samples.py b/clarity_epp/upload/samples.py
index b858e0c..cba9527 100644
--- a/clarity_epp/upload/samples.py
+++ b/clarity_epp/upload/samples.py
@@ -166,6 +166,10 @@ def from_helix(lims, email_settings, input_file):
         if udf_data['Dx Onderzoeksindicatie'] == 'DSD00' and udf_data['Dx Familie status'] == 'Kind':
             udf_data['Dx Geslacht'] = 'Onbekend'

+        # Set 'Dx Exoomequivalent' for specific indications
+        if udf_data['Dx Onderzoeksindicatie'] in config.indications_exome_equivalent:
+            udf_data['Dx Exoomequivalent'] = config.indications_exome_equivalent[udf_data['Dx Onderzoeksindicatie']]
+
         # Check 'Dx Familienummer' and correct
         if '/' in udf_data['Dx Familienummer']:
             udf_data['Dx Import warning'] = ';'.join([
diff --git a/clarity_epp/upload/tecan.py b/clarity_epp/upload/tecan.py
index d0e33c0..bd025e0 100644
--- a/clarity_epp/upload/tecan.py
+++ b/clarity_epp/upload/tecan.py
@@ -10,7 +10,7 @@ def results_qc(lims, process_id):
     """Upload tecan results to artifacts."""
     process = Process(lims, id=process_id)
-    concentration_range = map(float, re.findall('[\d\.]+', process.udf['Concentratiebereik (ng/ul)']))
+    concentration_range = list(map(float, re.findall('[\d\.]+', process.udf['Concentratiebereik (ng/ul)'])))

     # Parse output file
     for output in process.all_outputs(unique=True):
@@ -21,7 +21,7 @@ def results_qc(lims, process_id):
         measurements = {}
         sample_measurements = {}
-        for line in lims.get_file_contents(tecan_result_file.id).data.split('\n'):
+        for line in lims.get_file_contents(tecan_result_file.id).data.decode('utf-8').split('\n'):
             if not line.startswith('<>'):
                 data = line.rstrip().split('\t')
                 for index, value in enumerate(data[1:]):
diff --git a/config.py b/config.py
index c002932..d6102c7 100755
--- a/config.py
+++ b/config.py
@@ -30,6 +30,9 @@
     stoftestcode_mip: '1651',  # DEV Dx smMIP v1.2
 }

+# Update exome equivalent for certain indications
+indications_exome_equivalent = {'UBA1': 5, 'PID09': 5}
+
 # Export meetw protocol steps WES
 meetw_zui_wes_processes = [
     'Dx Sample registratie zuivering v1.1',
@@ -85,10 +88,48 @@
     'Dx NextSeq Run v1.0', 'Dx NextSeq Run v1.1',
     'Dx Automated NovaSeq Run (standaard) v1.0', 'Dx Automated NovaSeq Run (standaard) v1.1',
     'AUTOMATED - NovaSeq Run (NovaSeq 6000 v3.1)',
+    'Dx NovaSeqXPlus Run v1.0'
 ]

+# BCLConvert conversion settings
+sequencer_conversion_settings = {
+    # Orientation options: F=forward or RC=reverse complement
+    # https://knowledge.illumina.com/software/general/software-general-reference_material-list/000001800
+    'Dx Library pool denatureren en laden (NovaSeq) v1.3': {
+        'index_2_conversion_orientation': 'RC',
+        'instrument_platform': 'NovaSeq',
+        'index_orientation': 'Forward',
+        'software_version': '4.1.7',
+        'fastq_compression_format': 'gzip',
+    },
+    'Dx Library pool denatureren en laden (NovaSeqXPlus) v1.0': {
+        'index_2_conversion_orientation': 'F',
+        'instrument_platform': 'NovaSeqXSeries',
+        'index_orientation': 'Forward',
+        'software_version': '4.1.7',
+        'fastq_compression_format': 'gzip',
+    },
+}
+
+sample_conversion_settings = {
+    'default': {
+        'project': 'unknown',
+        'split_project': False,
+        'umi_len': [0, 0],
+    },
+    'elidS34226467': {
+        'project': 'CREv4',
+        'split_project': True,
+        'umi_len': [5, 5],
+    },
+    'elidS31285117': {
+        'project': 'SSv7',
+        'split_project': True,
+        'umi_len': [5, 5],
+    },
+}
+
 # Post sequencing workflow
-sequencing_workflow = '1701'  # DEV Dx Illumina Sequencing v1.2
+sequencing_workflow = '2052'  # DEV Dx Illumina Sequencing v1.3
 post_sequencing_workflow = '1204'  # DEV Dx Bioinformatica analyses v1.1
 post_bioinf_workflow = '1803'  # DEV Dx NGS WES onderzoeken afronden v2.0
diff --git a/tests/test_export_illumina.py b/tests/test_export_illumina.py
new file mode 100644
index 0000000..da43b9e
--- /dev/null
+++ b/tests/test_export_illumina.py
@@ -0,0 +1,23 @@
+from clarity_epp.export import illumina
+
+
+def test_get_override_cycles():
+    # Magnis prep with legacy index settings (8, 8) - NovaSeq 6000
+    assert illumina.get_override_cycles([151, 151], [5, 5], [8, 8], [8, 8], 'RC') == 'U5Y145N1;I8;I8;U5Y145N1'
+    # Magnis prep with legacy index settings (8, 8) - NovaSeq X Plus
+    assert illumina.get_override_cycles([151, 151], [5, 5], [8, 8], [8, 8], 'F') == 'U5Y145N1;I8;I8;U5Y145N1'
+
+    # Magnis prep with new default index settings (19, 10) - NovaSeq 6000
+    assert illumina.get_override_cycles([151, 151], [5, 5], [8, 8], [19, 10], 'RC') == 'U5Y145N1;I8N11;I8N2;U5Y145N1'
+    # Magnis prep with new default index settings (19, 10) - NovaSeq X Plus
+    assert illumina.get_override_cycles([151, 151], [5, 5], [8, 8], [19, 10], 'F') == 'U5Y145N1;I8N11;N2I8;U5Y145N1'
+
+
+def test_get_project():
+    assert illumina.get_project({'SSv7_1': 1, 'SSv7_2': 0, 'SSv7_3': 0}) == 'SSv7_2'
+    assert illumina.get_project({'SSv7_1': 1, 'SSv7_2': 0, 'SSv7_3': 0}, urgent=True) == 'SSv7_1'
+    assert illumina.get_project({'SSv7_1': 3, 'SSv7_2': 1, 'SSv7_3': 0}) == 'SSv7_3'
+    assert illumina.get_project({'SSv7_1': 3, 'SSv7_2': 1, 'SSv7_3': 0}, urgent=True) == 'SSv7_1'
+    assert illumina.get_project({'SSv7_1': 3, 'SSv7_2': 1, 'SSv7_3': 1}) == 'SSv7_2'
+    assert illumina.get_project({'SSv7_1': 3, 'SSv7_2': 1, 'SSv7_3': 0}, urgent=True) == 'SSv7_1'
+    assert illumina.get_project({'SSv7_1': 9, 'SSv7_2': 5, 'SSv7_3': 5}, urgent=True) == 'SSv7_2'
diff --git a/tests/test_export_utils.py b/tests/test_export_utils.py
index 70bfea8..f5c16c3 100644
--- a/tests/test_export_utils.py
+++ b/tests/test_export_utils.py
@@ -21,3 +21,17 @@ def test_sort_artifact_list():

 def test_get_well_index():
     assert utils.get_well_index('A1') == 0
     assert utils.get_well_index('A1', one_based=True) == 1
+
+
+def test_get_sample_sequence_index():
+    # Dual Index
+    assert utils.get_sample_sequence_index('Dx 12D NEXTflex UDI 48 (TTAGAGTC-TGTGACGA)') == ['TTAGAGTC', 'TGTGACGA']
+    assert utils.get_sample_sequence_index('Dx 10G NEXTflex custom UDI 79 (TGAGGCGC-GGAGACCA)') == ['TGAGGCGC', 'GGAGACCA']
+    assert utils.get_sample_sequence_index('Dx 01G Agilent SureSelect XT HS2 UDI_v2 007 (GCAGGTTC-AGAAGCAA)') == ['GCAGGTTC', 'AGAAGCAA']
+    assert utils.get_sample_sequence_index('Dx 02B Agilent SureSelect XT HS2 UDI_v1 010 (TAGAGCTC-CTACCGAA)') == ['TAGAGCTC', 'CTACCGAA']
+
+    # Single Index
+    assert utils.get_sample_sequence_index('Dx 12D NEXTflex UDI 48 (TTAGAGTC)') == ['TTAGAGTC']
+    assert utils.get_sample_sequence_index('Dx 10G NEXTflex custom UDI 79 (TGAGGCGC)') == ['TGAGGCGC']
+    assert utils.get_sample_sequence_index('Dx 01G Agilent SureSelect XT HS2 UDI_v2 007 (GCAGGTTC)') == ['GCAGGTTC']
+    assert utils.get_sample_sequence_index('Dx 02B Agilent SureSelect XT HS2 UDI_v1 010 (TAGAGCTC)') == ['TAGAGCTC']
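How the new indications_exome_equivalent config plays together with the changes above: from_helix() sets 'Dx Exoomequivalent' at import time for the listed indications, and samplesheet_pool_samples() multiplies the (now 4 instead of 5) base pooling volume by it. A plain-dict sketch of the arithmetic:

    indications_exome_equivalent = {'UBA1': 5, 'PID09': 5}

    def pool_volume(sample_udf):
        """Mirror the volume rule in samplesheet_pool_samples()."""
        if 'Dx Exoomequivalent' in sample_udf:
            return 4 * sample_udf['Dx Exoomequivalent']
        return 4

    udf_data = {'Dx Onderzoeksindicatie': 'UBA1'}
    if udf_data['Dx Onderzoeksindicatie'] in indications_exome_equivalent:
        udf_data['Dx Exoomequivalent'] = indications_exome_equivalent[udf_data['Dx Onderzoeksindicatie']]

    print(pool_volume(udf_data))  # 20 -> a UBA1 sample is pooled at 5x the base volume
    print(pool_volume({}))        # 4  -> base volume for everything else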