diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index cac7308e..73d0cdbb 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -49,7 +49,16 @@ jobs: - "23.04.0" - "latest-everything" profile: - ["test_tcr", "test_no_umi", "test_nocluster", "test_fetchimgt", "test_assembled_hs", "test_assembled_mm"] + [ + "test_tcr", + "test_no_umi", + "test_nocluster", + "test_fetchimgt", + "test_assembled_hs", + "test_assembled_mm", + "test_clontech_umi", + "test_nebnext_umi", + ] fail-fast: false steps: - name: Check out pipeline code diff --git a/CHANGELOG.md b/CHANGELOG.md index a2f9f389..944bc3e1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. ### `Added` - [#294](https://github.com/nf-core/airrflow/pull/294) Merge template updates nf-core/tools v2.11.1 +- [#299](https://github.com/nf-core/airrflow/pull/299) Add profile for common NEB and TAKARA protocols ### `Fixed` diff --git a/bin/log_parsing.py b/bin/log_parsing.py index dce9c831..d262f51b 100755 --- a/bin/log_parsing.py +++ b/bin/log_parsing.py @@ -52,7 +52,7 @@ df_process_list = [] for process in processes: - find = subprocess.check_output(["find", process, "-name", "*command_log.txt"]) + find = subprocess.check_output(["find", process, "-name", "*command_log*"]) log_files = find.decode().split("\n") log_files = list(filter(None, log_files)) @@ -90,50 +90,37 @@ elif process in ["mask_primers", "filter_by_sequence_quality"]: s_code = [] + s_readtype = [] output_file = [] - seqs_R1 = [] - seqs_R2 = [] - pass_R1 = [] - pass_R2 = [] - fail_R1 = [] - fail_R2 = [] + n_seqs = [] + n_pass = [] + n_fail = [] process_name = [] for logfile in log_files: - c = 0 + if "_R1" in logfile: + s_readtype.append("R1") + elif "_R2" in logfile: + s_readtype.append("R2") with open(logfile, "r") as f: for line in f: if " START>" in line: - if c < 1: - s_code.append(logfile.split("/")[1].split("_command_log")[0]) - - 
process_name.append(process) + s_code.append(logfile.split("/")[1].split("_command_log")[0]) + process_name.append(process) elif "SEQUENCES>" in line: - if c < 1: - seqs_R1.append(line.strip().removeprefix("SEQUENCES> ")) - else: - seqs_R2.append(line.strip().removeprefix("SEQUENCES> ")) + n_seqs.append(line.strip().removeprefix("SEQUENCES> ")) elif "PASS>" in line: - if c < 1: - pass_R1.append(line.strip().removeprefix("PASS> ")) - else: - pass_R2.append(line.strip().removeprefix("PASS> ")) + n_pass.append(line.strip().removeprefix("PASS> ")) elif "FAIL>" in line: - if c < 1: - fail_R1.append(line.strip().removeprefix("FAIL> ")) - c += 1 - else: - fail_R2.append(line.strip().removeprefix("FAIL> ")) + n_fail.append(line.strip().removeprefix("FAIL> ")) df_process = pd.DataFrame.from_dict( { "Sample": s_code, - "start_R1": seqs_R1, - "start_R2": seqs_R2, - "pass_R1": pass_R1, - "pass_R2": pass_R2, - "fail_R1": fail_R1, - "fail_R2": fail_R2, + "readtype": s_readtype, + "start": n_seqs, + "pass": n_pass, + "fail": n_fail, "process": process_name, } ) @@ -344,48 +331,13 @@ df_process_list.append(df_process) -# Getting table colnames - -colnames = [ - "Sample", - "Sequences_R1", - "Sequences_R2", - "Filtered_quality_R1", - "Filtered_quality_R2", - "Mask_primers_R1", - "Mask_primers_R2", - "Paired", - "Build_consensus", - "Assemble_pairs", - "Unique", - "Representative_2", - "Igblast", -] - - -values = [ - df_process_list[0].sort_values(by=["Sample"]).iloc[:, 0].tolist(), - df_process_list[0].sort_values(by=["Sample"]).loc[:, "start_R1"].tolist(), - df_process_list[0].sort_values(by=["Sample"]).loc[:, "start_R2"].tolist(), - df_process_list[0].sort_values(by=["Sample"]).loc[:, "pass_R1"].tolist(), - df_process_list[0].sort_values(by=["Sample"]).loc[:, "pass_R2"].tolist(), - df_process_list[1].sort_values(by=["Sample"]).loc[:, "pass_R1"].tolist(), - df_process_list[1].sort_values(by=["Sample"]).loc[:, "pass_R2"].tolist(), - 
df_process_list[2].sort_values(by=["Sample"]).loc[:, "pass_pairs"].tolist(), - df_process_list[4].sort_values(by=["Sample"]).loc[:, "pass_pairs"].tolist(), - df_process_list[5].sort_values(by=["Sample"]).loc[:, "pass_pairs"].tolist(), - df_process_list[6].sort_values(by=["Sample"]).loc[:, "unique"].tolist(), - df_process_list[7].sort_values(by=["Sample"]).loc[:, "repres_2"].tolist(), - df_process_list[7].sort_values(by=["Sample"]).loc[:, "pass_igblast"].tolist(), -] - # Tables provide extra info and help debugging df_process_list[0].to_csv( path_or_buf="Table_all_details_filter_quality.tsv", sep="\t", header=True, - index=False, + index=True, ) df_process_list[1].to_csv(path_or_buf="Table_all_details_mask_primers.tsv", sep="\t", header=True, index=False) df_process_list[2].to_csv(path_or_buf="Table_all_details_paired.tsv", sep="\t", header=True, index=False) @@ -393,7 +345,7 @@ path_or_buf="Table_all_details_build_consensus.tsv", sep="\t", header=True, - index=False, + index=True, ) df_process_list[4].to_csv(path_or_buf="Table_all_details_repaired.tsv", sep="\t", header=True, index=False) df_process_list[5].to_csv( @@ -413,6 +365,43 @@ index=False, ) +# Getting table colnames + +colnames = [ + "Sample", + "Sequences_R1", + "Sequences_R2", + "Filtered_quality_R1", + "Filtered_quality_R2", + "Mask_primers_R1", + "Mask_primers_R2", + "Paired", + "Build_consensus", + "Assemble_pairs", + "Unique", + "Representative_2", + "Igblast", +] + +print(df_process_list[0].sort_values(by=["Sample"]).pivot(index="Sample", columns="readtype")) + +values = [ + df_process_list[2].sort_values(by=["Sample"]).iloc[:, 0].tolist(), + df_process_list[0].sort_values(by=["Sample"]).pivot(index="Sample", columns="readtype")["start"]["R1"].tolist(), + df_process_list[0].sort_values(by=["Sample"]).pivot(index="Sample", columns="readtype")["start"]["R2"].tolist(), + df_process_list[0].sort_values(by=["Sample"]).pivot(index="Sample", columns="readtype")["pass"]["R1"].tolist(), + 
df_process_list[0].sort_values(by=["Sample"]).pivot(index="Sample", columns="readtype")["pass"]["R2"].tolist(), + df_process_list[1].sort_values(by=["Sample"]).pivot(index="Sample", columns="readtype")["pass"]["R1"].tolist(), + df_process_list[1].sort_values(by=["Sample"]).pivot(index="Sample", columns="readtype")["pass"]["R2"].tolist(), + df_process_list[2].sort_values(by=["Sample"]).loc[:, "pass_pairs"].tolist(), + df_process_list[4].sort_values(by=["Sample"]).loc[:, "pass_pairs"].tolist(), + df_process_list[5].sort_values(by=["Sample"]).loc[:, "pass_pairs"].tolist(), + df_process_list[6].sort_values(by=["Sample"]).loc[:, "unique"].tolist(), + df_process_list[7].sort_values(by=["Sample"]).loc[:, "repres_2"].tolist(), + df_process_list[7].sort_values(by=["Sample"]).loc[:, "pass_igblast"].tolist(), +] + + final_table = dict(zip(colnames, values)) print(final_table) df_final_table = pd.DataFrame.from_dict(final_table) diff --git a/conf/clontech_umi_bcr.config b/conf/clontech_umi_bcr.config new file mode 100644 index 00000000..f5458dfb --- /dev/null +++ b/conf/clontech_umi_bcr.config @@ -0,0 +1,40 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. 
+ + Use as follows: + nextflow run nf-core/airrflow -profile clontech_umi_bcr, --outdir + +---------------------------------------------------------------------------------------- +*/ + +params { + config_profile_name = 'Takara Bio / Clontech SMARTer v2' + config_profile_description = 'Profile to run pipeline for the Takara Bio / Clontech SMARTer v2 (UMI) BCR protocol profile' + + mode = 'fastq' + + library_generation_method = 'dt_5p_race_umi' + + cprimers = 'https://bitbucket.org/kleinstein/immcantation/raw/c98269b194e9c6262fe3b098be3600ba7f64b85c/protocols/Universal/Human_IG_CRegion_RC.fasta' + + // primer options + cprimer_position = 'R1' + cprimer_start = 0 + vprimer_start = 0 + umi_length = 12 + umi_position = 'R2' + cluster_sets = false + + + // Mask primer options + maskprimers_align = true + primer_extract_len = 7 + primer_mask_mode = 'cut' + primer_maxlen = 70 + primer_r1_maxerror = 0.2 + assemblepairs_sequential = true + primer_consensus = 0.6 +} diff --git a/conf/clontech_umi_tcr.config b/conf/clontech_umi_tcr.config new file mode 100644 index 00000000..d620dcee --- /dev/null +++ b/conf/clontech_umi_tcr.config @@ -0,0 +1,44 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. 
+ + Use as follows: + nextflow run nf-core/airrflow -profile clontech_umi_tcr, --outdir + +---------------------------------------------------------------------------------------- +*/ + +params { + config_profile_name = 'Takara Bio / Clontech SMARTer v2 TCR' + config_profile_description = 'Profile to run pipeline for the Takara Bio / Clontech SMARTer v2 (UMI) TCR protocol profile' + + mode = 'fastq' + + library_generation_method = 'dt_5p_race_umi' + + cprimers = 'https://bitbucket.org/kleinstein/immcantation/raw/16f94088c1df5c7a0ee1c9ea8b403cd4d2488e8a/protocols/Universal/Human_TR_CRegion_RC.fasta' + + // primer options + cprimer_position = 'R1' + cprimer_start = 0 + vprimer_start = 0 + umi_length = 12 + umi_position = 'R2' + cluster_sets = false + + + // Mask primer options + maskprimers_align = true + primer_extract_len = 7 + primer_mask_mode = 'cut' + primer_maxlen = 70 + primer_r1_maxerror = 0.2 + assemblepairs_sequential = true + primer_consensus = 0.6 + + // TCR options + clonal_threshold = 0 + skip_lineage = true +} diff --git a/conf/modules.config b/conf/modules.config index 3d69f9b4..7ac94a44 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -146,6 +146,36 @@ process { ] } + withName: PRESTO_MASKPRIMERS_ALIGN { + publishDir = [ + path: { "${params.outdir}/presto/02-maskprimers/${meta.id}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + ext.args = '--skiprc --pf CREGION' + ext.args2 = '-f ID CREGION ERROR' + } + + withName: PRESTO_ALIGN_CREGION { + publishDir = [ + path: { "${params.outdir}/presto/internal_cregion/${meta.id}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } + ] + ext.args = '--skiprc --revpr --pf CREGION' + ext.args2 = '-f ID PRIMER ERROR --outname cregion_alignment' + } + + withName: PRESTO_MASKPRIMERS_EXTRACT { + publishDir = [ + path: { "${params.outdir}/presto/02-maskprimers/${meta.id}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + ext.args = '--barcode --bf BARCODE' + ext.args2 = '-f ID PRIMER ERROR PRSTART' + } + withName: PRESTO_MASKPRIMERS_POSTASSEMBLY_SANS_UMI { publishDir = [ path: { "${params.outdir}/presto/03-maskprimers/${meta.id}" }, @@ -160,6 +190,16 @@ process { mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] + ext.args = "--coord illumina" + } + + withName: PRESTO_PAIRSEQ_ALIGN { + publishDir = [ + path: { "${params.outdir}/presto/03-pairseq/${meta.id}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + ext.args = '--1f CREGION --coord illumina' } withName: PRESTO_CLUSTERSETS { @@ -184,9 +224,20 @@ process { mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] - ext.args = '' - ext.args2 = '' - ext.args3 = 'ID BARCODE SEQCOUNT PRIMER PRCOUNT PRCONS PRFREQ CONSCOUNT' + ext.args = '--pf PRIMER' + ext.args2 = '--pf PRIMER' + ext.args3 = '-f ID BARCODE SEQCOUNT PRIMER PRCOUNT PRCONS PRFREQ CONSCOUNT ERROR' + } + + withName: PRESTO_BUILDCONSENSUS_ALIGN { + publishDir = [ + path: { "${params.outdir}/presto/06-build-consensus/${meta.id}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } + ] + ext.args = '--pf CREGION' + ext.args2 = '--pf CREGION' + ext.args3 = '-f ID BARCODE SEQCOUNT PRIMER PRCOUNT PRCONS PRFREQ CONSCOUNT ERROR' } withName: PRESTO_POSTCONSENSUS_PAIRSEQ { @@ -207,6 +258,16 @@ process { ext.args2 = '-f ID BARCODE SEQCOUNT PRIMER PRCOUNT PRCONS PRFREQ CONSCOUNT LENGTH OVERLAP ERROR PVALUE' } + withName: PRESTO_ASSEMBLEPAIRS_SEQUENTIAL { + publishDir = [ + path: { "${params.outdir}/presto/08-assemble-pairs/${meta.id}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + ext.args = '--coord presto --rc tail --1f CONSCOUNT --2f CONSCOUNT PRCONS --minlen 8 --maxerror 0.3 --alpha 1e-5 --scanrev --minident 0.5 --evalue 1e-5 --maxhits 100 --aligner blastn' + ext.args2 = '-f ID REFID LENGTH OVERLAP GAP ERROR PVALUE EVALUE1 EVALUE2 IDENTITY FIELDS1 FIELDS2' + } + withName: PRESTO_ASSEMBLEPAIRS_SANS_UMI { publishDir = [ path: { "${params.outdir}/presto/01-assemble-pairs/${meta.id}" }, @@ -232,6 +293,14 @@ process { ext.args = 'PRCONS PRCONS' } + withName: PRESTO_PARSEHEADERS_CREGION { + publishDir = [ + enabled: false + ] + ext.subcommand = 'rename' + ext.args = '-f PRCONS -k CREGION' + } + withName: PRESTO_PARSEHEADERS_PRIMERS_SANS_UMI { publishDir = [ enabled: false @@ -256,6 +325,26 @@ process { ext.args2 = '-f HEADER DUPCOUNT CONSCOUNT' } + withName: PRESTO_COLLAPSESEQ_ALIGN { + publishDir = [ + path: { "${params.outdir}/presto/09-collapseseq/${meta.id}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + ext.args = '-n 0 --inner --uf CREGION --cf CONSCOUNT --act sum --keepmiss' + ext.args2 = '-f HEADER DUPCOUNT CONSCOUNT' + } + + withName: PRESTO_COLLAPSESEQ_CREGION { + publishDir = [ + path: { "${params.outdir}/presto/09-collapseseq/${meta.id}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } + ] + ext.args = '-n 0 --inner --uf PRCONS CREGION --cf CONSCOUNT --act sum --keepmiss' + ext.args2 = '-f HEADER DUPCOUNT CONSCOUNT' + } + withName: PRESTO_COLLAPSESEQ_SANS_UMI { publishDir = [ path: { "${params.outdir}/presto/04-collapseseq/${meta.id}" }, diff --git a/conf/nebnext_umi_bcr.config b/conf/nebnext_umi_bcr.config new file mode 100644 index 00000000..7467bffe --- /dev/null +++ b/conf/nebnext_umi_bcr.config @@ -0,0 +1,38 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. + + Use as follows: + nextflow run nf-core/airrflow -profile nebnext_umi_bcr, --outdir + +---------------------------------------------------------------------------------------- +*/ + +params { + config_profile_name = 'NEBNext - AbSeq BCR profile' + config_profile_description = 'Profile to run pipeline for the NEBNext - AbSeq (UMI) BCR experimental protocol' + + mode = 'fastq' + cprimers = 'https://bitbucket.org/kleinstein/immcantation/raw/354f49228a43b4c2858d67fb09886126b314e317/protocols/AbSeq/AbSeq_R1_Human_IG_Primers.fasta' + race_linker = 'https://bitbucket.org/kleinstein/immcantation/raw/354f49228a43b4c2858d67fb09886126b314e317/protocols/AbSeq/AbSeq_R2_TS.fasta' + + library_generation_method = 'dt_5p_race_umi' + cprimer_position = 'R1' + cprimer_start = 0 + umi_length = 17 + umi_position = 'R2' + cluster_sets = false + + //presto options + primer_r1_maxerror = 0.2 + primer_r2_maxerror = 0.5 + assemblepairs_sequential = true + maskprimers_align = false + align_cregion = true + internal_cregion_sequences = 'https://bitbucket.org/kleinstein/immcantation/raw/2025594fd9a2a64df4444070171d6fc00c4e78c7/protocols/AbSeq/AbSeq_Human_IG_InternalCRegion.fasta' + cregion_maxlen = 100 + cregion_maxerror = 0.3 + 
cregion_mask_mode = 'tag' +} diff --git a/conf/nebnext_umi_tcr.config b/conf/nebnext_umi_tcr.config new file mode 100644 index 00000000..e030d952 --- /dev/null +++ b/conf/nebnext_umi_tcr.config @@ -0,0 +1,41 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for the NEBNext - AbSeq (UMI) TCR protocol profile +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines the preset parameters for processing data generated with the NEBNext - AbSeq (UMI) TCR protocol. + + Use as follows: + nextflow run nf-core/airrflow -profile nebnext_umi_tcr,<docker/singularity> --outdir <OUTDIR> + +---------------------------------------------------------------------------------------- +*/ + +params { + config_profile_name = 'NEBNext - AbSeq TCR profile' + config_profile_description = 'Profile to run pipeline for the NEBNext - AbSeq (UMI) TCR experimental protocol' + + mode = 'fastq' + cprimers = 'https://bitbucket.org/kleinstein/immcantation/raw/16f94088c1df5c7a0ee1c9ea8b403cd4d2488e8a/protocols/AbSeq/AbSeq_R1_Human_TR_Primers.fasta' + race_linker = 'https://bitbucket.org/kleinstein/immcantation/raw/354f49228a43b4c2858d67fb09886126b314e317/protocols/AbSeq/AbSeq_R2_TS.fasta' + + library_generation_method = 'dt_5p_race_umi' + cprimer_position = 'R1' + cprimer_start = 0 + umi_length = 17 + umi_position = 'R2' + cluster_sets = false + + //presto options + primer_r1_maxerror = 0.2 + primer_r2_maxerror = 0.5 + assemblepairs_sequential = true + maskprimers_align = false + align_cregion = false + cregion_maxlen = 100 + cregion_maxerror = 0.3 + cregion_mask_mode = 'tag' + + //TCR options + clonal_threshold = 0 + skip_lineage = true +} diff --git a/conf/test_clontech_umi.config b/conf/test_clontech_umi.config new file mode 100644 index 00000000..552a7434 --- /dev/null +++ b/conf/test_clontech_umi.config @@ -0,0 +1,31 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests 
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. + + Use as follows: + nextflow run nf-core/airrflow -profile test_clontech_umi, --outdir + +---------------------------------------------------------------------------------------- +*/ +includeConfig 'clontech_umi_bcr.config' + +params { + config_profile_name = 'Test profile for TAKARA protocol' + config_profile_description = 'Minimal test dataset to check pipeline function' + + // Limit resources so that this can run on GitHub Actions + max_cpus = 2 + max_memory = '6.GB' + max_time = '6.h' + + // Input data + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/airrflow/testdata-clontech/samplesheet.tsv' + + imgtdb_base = 'https://raw.githubusercontent.com/nf-core/test-datasets/airrflow/database-cache/imgtdb_base.zip' + igblast_base = 'https://raw.githubusercontent.com/nf-core/test-datasets/airrflow/database-cache/igblast_base.zip' + + clonal_threshold = 0.1 + +} diff --git a/conf/test_nebnext_umi.config b/conf/test_nebnext_umi.config new file mode 100644 index 00000000..c96b16b3 --- /dev/null +++ b/conf/test_nebnext_umi.config @@ -0,0 +1,32 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. 
+ + Use as follows: + nextflow run nf-core/airrflow -profile test_nebnext_umi, --outdir + +---------------------------------------------------------------------------------------- +*/ + +includeConfig 'nebnext_umi_bcr.config' + +params { + config_profile_name = 'Test profile for NEBNext protocol' + config_profile_description = 'Minimal test dataset to check pipeline function' + + // Limit resources so that this can run on GitHub Actions + max_cpus = 2 + max_memory = '6.GB' + max_time = '6.h' + + // Input data + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/airrflow/testdata-neb/samplesheet.tsv' + + imgtdb_base = 'https://raw.githubusercontent.com/nf-core/test-datasets/airrflow/database-cache/imgtdb_base.zip' + igblast_base = 'https://raw.githubusercontent.com/nf-core/test-datasets/airrflow/database-cache/igblast_base.zip' + + clonal_threshold = 0.1 + +} diff --git a/docs/usage.md b/docs/usage.md index aafc8349..dd60f84b 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -29,6 +29,16 @@ nextflow run nf-core/airrflow \ --outdir results ``` +You can optionally set a protocol profile if you're running the pipeline with data from one of the supported profiles. The full list of supported profiles can be found in the section [Supported protocol profiles](#supported-protocol-profiles). 
An example command running the NEBNext UMI protocol profile with docker containers is: + +```bash +nextflow run nf-core/airrflow \ +-profile nebnext_umi_bcr,docker \ +--mode fastq \ +--input input_samplesheet.tsv \ +--outdir results +``` + A typical command for running the pipeline departing from **single-cell AIRR rearrangement tables or assembled bulk sequencing fasta** data is: ```bash @@ -165,6 +175,76 @@ nf-core/airrflow offers full support for the [AIRR standards 1.4](https://docs.a | biomaterial_provider | Samplesheet column | | Name of sample biomaterial provider | | library_generation_method | Parameter | `--library_generation_method` | Generic type of library generation | +## Supported protocol profiles + +### NEBNext Immune Sequencing Kit + +- [New England Biolabs NEBNext Immune sequencing kit](https://www.neb.com/en-us/products/e6320-nebnext-immune-sequencing-kit-human#Product%20Information) + +You can use the `nebnext_umi_bcr` or `nebnext_umi_tcr` preset defaults for analyzing bulk fastq sequencing data that was generated with the NEBNext Immune Sequencing Kit. An example using docker containers for the analysis is: + +```bash +nextflow run nf-core/airrflow -r <release> \ +-profile nebnext_umi_bcr,docker \ +--input input_samplesheet.tsv \ +--outdir results +``` + +This profile executes the commands based on the pRESTO pre-set pipeline [presto-abseq.sh](https://bitbucket.org/kleinstein/immcantation/src/master/pipelines/presto-abseq.sh). A summary of the performed steps is: + +- Filter sequences by base quality. +- Score and mask the provided R1 primers and R2 template switch oligo. Primer defaults are taken from the [Immcantation repository](https://bitbucket.org/kleinstein/immcantation/src/master/protocols/AbSeq/). +- Pair sequences, build UMI consensus sequence. +- Assemble read pairs with the pRESTO `AssemblePairs sequential` option. +- Align and annotate the internal C Region (for the BCR specific protocol) for a more specific isotype annotation. 
+- Remove duplicate sequences and filter to sequences with at least 2 supporting sources. + +Please note that the default primer sequences and internal CRegion sequences are for human. If you wish to run this protocol on mouse or other species, please provide the alternative primers: + +```bash +nextflow run nf-core/airrflow -r <release> \ +-profile nebnext_umi_bcr,docker \ +--input input_samplesheet.tsv \ +--cprimers <path/to/cprimers.fasta> \ +--internal_cregion_sequences <path/to/internal_cregion_sequences.fasta> \ +--outdir results +``` + +### Clontech / Takara SMARTer Human BCR Profiling kit + +- [TaKaRa SMARTer Human BCR kit](https://www.takarabio.com/products/next-generation-sequencing/immune-profiling/human-repertoire/human-bcr-profiling-kit-for-illumina-sequencing) + +You can use the `clontech_umi_bcr` or `clontech_umi_tcr` preset defaults for analyzing bulk fastq sequencing data that was generated with the Takara SMARTer Human Profiling kit. An example using docker containers for the analysis is: + +```bash +nextflow run nf-core/airrflow -r <release> \ +-profile clontech_umi_bcr,docker \ +--input input_samplesheet.tsv \ +--outdir results +``` + +This profile executes the sequence assembly commands based on the pRESTO pre-set pipeline [presto-clontech-umi.sh](https://bitbucket.org/kleinstein/immcantation/src/master/pipelines/presto-clontech-umi.sh). A summary of the performed steps is: + +- Filter sequences by base quality. +- Align and annotate the universal C region sequences in the R1 reads. Defaults are taken from the [Immcantation repository](https://bitbucket.org/kleinstein/immcantation/src/master/protocols/Universal/). +- Identify the primer sequences and UMI (12 nt length) in the R2 reads. +- Pair sequences, build UMI consensus sequence. +- Assemble read pairs with the pRESTO `AssemblePairs sequential` option. +- Align and annotate the C Region sequences. +- Remove duplicate sequences and filter to sequences with at least 2 supporting sources. + +After the sequence assembly steps, the remaining steps are common for all protocols. 
+ +Please note that the default primer sequences and internal CRegion sequences are for human. If you wish to run this protocol on mouse or other species, please provide the alternative primer sequences: + +```bash +nextflow run nf-core/airrflow -r <release> \ +-profile clontech_umi_bcr,docker \ +--input input_samplesheet.tsv \ +--cprimers <path/to/cprimers.fasta> \ +--outdir results +``` + ## Supported bulk library generation methods (protocols) When processing bulk sequencing data departing from raw `fastq` reads, several sequencing protocols are supported which can be provided with the parameter `--library_generation_method`. diff --git a/modules/local/presto/presto_assemblepairs_sequential.nf b/modules/local/presto/presto_assemblepairs_sequential.nf new file mode 100644 index 00000000..40e0e1b6 --- /dev/null +++ b/modules/local/presto/presto_assemblepairs_sequential.nf @@ -0,0 +1,37 @@ +process PRESTO_ASSEMBLEPAIRS_SEQUENTIAL { + tag "$meta.id" + label 'process_long_parallelized' + label 'immcantation' + + conda "bioconda::presto=0.7.1 bioconda::igblast=1.21.0 conda-forge::wget=1.20.1 conda-forge::biopython=1.79" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/mulled-v2-865ad74e0cfd6de39e9e3ade759d826fce726425:25073cb5e81f4a0dcd2f99ddd308510b3461df7e-0' : + 'biocontainers/mulled-v2-865ad74e0cfd6de39e9e3ade759d826fce726425:25073cb5e81f4a0dcd2f99ddd308510b3461df7e-0' }" + + input: + tuple val(meta), path(R1), path(R2) // reads in fastq format + path(igblast) // igblast references + + output: + tuple val(meta), path("*_assemble-pass.fastq"), emit: reads + path("*_command_log.txt"), emit: logs + path("*.log") + path("*_table.tab") + path "versions.yml" , emit: versions + + script: + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' + """ + AssemblePairs.py sequential -1 $R2 -2 $R1 --nproc ${task.cpus} \\ + -r "${igblast}/fasta/imgt_${meta.species}_${meta.locus.toLowerCase()}_v.fasta" \\ + $args \\ + --outname ${meta.id} --log ${meta.id}.log > ${meta.id}_command_log.txt + ParseLog.py -l ${meta.id}.log $args2 + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + presto: \$( AssemblePairs.py --version | awk -F' ' '{print \$2}' ) + END_VERSIONS + """ +} diff --git a/modules/local/presto/presto_buildconsensus.nf b/modules/local/presto/presto_buildconsensus.nf index 6f5d9b20..2a85c3ea 100644 --- a/modules/local/presto/presto_buildconsensus.nf +++ b/modules/local/presto/presto_buildconsensus.nf @@ -25,9 +25,9 @@ process PRESTO_BUILDCONSENSUS { def args2 = task.ext.args2 ?: '' def args3 = task.ext.args3 ?: '' """ - BuildConsensus.py -s $R1 --bf ${barcode_field} --nproc ${task.cpus} --pf PRIMER --prcons ${params.primer_consensus} --maxerror ${params.buildconsensus_maxerror} --maxgap ${params.buildconsensus_maxgap} ${args} --outname ${meta.id}_R1 --log ${meta.id}_R1.log > ${meta.id}_command_log.txt - BuildConsensus.py -s $R2 --bf ${barcode_field} --nproc ${task.cpus} --pf PRIMER --prcons ${params.primer_consensus} --maxerror ${params.buildconsensus_maxerror} --maxgap ${params.buildconsensus_maxgap} ${args2} --outname ${meta.id}_R2 --log ${meta.id}_R2.log >> 
${meta.id}_command_log.txt - ParseLog.py -l ${meta.id}_R1.log ${meta.id}_R2.log -f ${args3} + BuildConsensus.py -s $R1 --bf ${barcode_field} --nproc ${task.cpus} --prcons ${params.primer_consensus} --maxerror ${params.buildconsensus_maxerror} --maxgap ${params.buildconsensus_maxgap} ${args} --outname ${meta.id}_R1 --log ${meta.id}_R1.log > ${meta.id}_command_log.txt + BuildConsensus.py -s $R2 --bf ${barcode_field} --nproc ${task.cpus} --prcons ${params.primer_consensus} --maxerror ${params.buildconsensus_maxerror} --maxgap ${params.buildconsensus_maxgap} ${args2} --outname ${meta.id}_R2 --log ${meta.id}_R2.log >> ${meta.id}_command_log.txt + ParseLog.py -l ${meta.id}_R1.log ${meta.id}_R2.log ${args3} cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/presto/presto_filterseq.nf b/modules/local/presto/presto_filterseq.nf index 4af4267b..a7733147 100644 --- a/modules/local/presto/presto_filterseq.nf +++ b/modules/local/presto/presto_filterseq.nf @@ -13,7 +13,7 @@ process PRESTO_FILTERSEQ { output: tuple val(meta), path("*R1_quality-pass.fastq"), path("*R2_quality-pass.fastq") , emit: reads - path "*_command_log.txt" , emit: logs + path "*_command_log_R?.txt" , emit: logs path "versions.yml" , emit: versions path "*_R1.log" path "*_R2.log" @@ -21,8 +21,8 @@ process PRESTO_FILTERSEQ { script: """ - FilterSeq.py quality -s $R1 -q ${params.filterseq_q} --outname ${meta.id}_R1 --log ${R1.baseName}_R1.log --nproc ${task.cpus} > ${meta.id}_command_log.txt - FilterSeq.py quality -s $R2 -q ${params.filterseq_q} --outname ${meta.id}_R2 --log ${R2.baseName}_R2.log --nproc ${task.cpus} >> ${meta.id}_command_log.txt + FilterSeq.py quality -s $R1 -q ${params.filterseq_q} --outname ${meta.id}_R1 --log ${R1.baseName}_R1.log --nproc ${task.cpus} > ${meta.id}_command_log_R1.txt + FilterSeq.py quality -s $R2 -q ${params.filterseq_q} --outname ${meta.id}_R2 --log ${R2.baseName}_R2.log --nproc ${task.cpus} >> ${meta.id}_command_log_R2.txt ParseLog.py -l 
${R1.baseName}_R1.log ${R2.baseName}_R2.log -f ID QUALITY cat <<-END_VERSIONS > versions.yml diff --git a/modules/local/presto/presto_maskprimers.nf b/modules/local/presto/presto_maskprimers.nf index 99aab4dd..48e66a84 100644 --- a/modules/local/presto/presto_maskprimers.nf +++ b/modules/local/presto/presto_maskprimers.nf @@ -15,7 +15,7 @@ process PRESTO_MASKPRIMERS { output: tuple val(meta), path("*_R1_primers-pass.fastq"), path("*_R2_primers-pass.fastq") , emit: reads - path "*_command_log.txt", emit: logs + path "*_command_log_R?.txt", emit: logs path "*_R1.log" path "*_R2.log" path "*.tab", emit: log_tab @@ -28,8 +28,8 @@ process PRESTO_MASKPRIMERS { def primer_start_R1 = (params.index_file | params.umi_position == 'R1') ? "--start ${params.umi_length + params.cprimer_start} --barcode" : "--start ${params.cprimer_start}" def primer_start_R2 = (params.umi_position == 'R2') ? "--start ${params.umi_length + params.vprimer_start} --barcode" : "--start ${params.vprimer_start}" """ - MaskPrimers.py score --nproc ${task.cpus} -s $R1 -p ${cprimers} $primer_start_R1 $revpr --maxerror ${params.primer_maxerror} --mode ${params.primer_mask_mode} --outname ${meta.id}_R1 --log ${meta.id}_R1.log > ${meta.id}_command_log.txt - MaskPrimers.py score --nproc ${task.cpus} -s $R2 -p ${vprimers} $primer_start_R2 $revpr --maxerror ${params.primer_maxerror} --mode ${params.primer_mask_mode} --outname ${meta.id}_R2 --log ${meta.id}_R2.log >> ${meta.id}_command_log.txt + MaskPrimers.py score --nproc ${task.cpus} -s $R1 -p ${cprimers} $primer_start_R1 $revpr --maxerror ${params.primer_r1_maxerror} --mode ${params.primer_mask_mode} --outname ${meta.id}_R1 --log ${meta.id}_R1.log > ${meta.id}_command_log_R1.txt + MaskPrimers.py score --nproc ${task.cpus} -s $R2 -p ${vprimers} $primer_start_R2 $revpr --maxerror ${params.primer_r2_maxerror} --mode ${params.primer_mask_mode} --outname ${meta.id}_R2 --log ${meta.id}_R2.log > ${meta.id}_command_log_R2.txt ParseLog.py -l ${meta.id}_R1.log 
${meta.id}_R2.log -f ID PRIMER ERROR cat <<-END_VERSIONS > versions.yml @@ -41,8 +41,8 @@ process PRESTO_MASKPRIMERS { def primer_start_R1 = (params.index_file | params.umi_position == 'R1') ? "--start ${params.umi_length + params.vprimer_start} --barcode" : "--start ${params.vprimer_start}" def primer_start_R2 = (params.umi_position == 'R2') ? "--start ${params.umi_length + params.cprimer_start} --barcode" : "--start ${params.cprimer_start}" """ - MaskPrimers.py score --nproc ${task.cpus} -s $R1 -p ${vprimers} $primer_start_R1 $revpr --maxerror ${params.primer_maxerror} --mode ${params.primer_mask_mode} --outname ${meta.id}_R1 --log ${meta.id}_R1.log > ${meta.id}_command_log.txt - MaskPrimers.py score --nproc ${task.cpus} -s $R2 -p ${cprimers} $primer_start_R2 $revpr --maxerror ${params.primer_maxerror} --mode ${params.primer_mask_mode} --outname ${meta.id}_R2 --log ${meta.id}_R2.log >> ${meta.id}_command_log.txt + MaskPrimers.py score --nproc ${task.cpus} -s $R1 -p ${vprimers} $primer_start_R1 $revpr --maxerror ${params.primer_r1_maxerror} --mode ${params.primer_mask_mode} --outname ${meta.id}_R1 --log ${meta.id}_R1.log > ${meta.id}_command_log_R1.txt + MaskPrimers.py score --nproc ${task.cpus} -s $R2 -p ${cprimers} $primer_start_R2 $revpr --maxerror ${params.primer_r2_maxerror} --mode ${params.primer_mask_mode} --outname ${meta.id}_R2 --log ${meta.id}_R2.log > ${meta.id}_command_log_R2.txt ParseLog.py -l "${meta.id}_R1.log" "${meta.id}_R2.log" -f ID PRIMER ERROR cat <<-END_VERSIONS > versions.yml diff --git a/modules/local/presto/presto_maskprimers_align.nf b/modules/local/presto/presto_maskprimers_align.nf new file mode 100644 index 00000000..055e5d93 --- /dev/null +++ b/modules/local/presto/presto_maskprimers_align.nf @@ -0,0 +1,45 @@ +process PRESTO_MASKPRIMERS_ALIGN { + tag "$meta.id" + label "process_high" + label 'immcantation' + + conda "bioconda::presto=0.7.1" + container "${ workflow.containerEngine == 'singularity' && 
!task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/presto:0.7.1--pyhdfd78af_0' : + 'biocontainers/presto:0.7.1--pyhdfd78af_0' }" + + input: + tuple val(meta), path(R1) + path(cprimers) + val(max_len) + val(max_error) + val(mask_mode) + + output: + tuple val(meta), path("*_R1_primers-pass.fastq") , emit: reads + path "*_command_log_R1.txt", emit: logs + path "*_R1.log" + path "*.tab", emit: log_tab + path "versions.yml" , emit: versions + + script: + def args = task.ext.args?: '' + def args2 = task.ext.args2?: '' + """ + MaskPrimers.py align --nproc ${task.cpus} \\ + -s $R1 \\ + -p ${cprimers} \\ + --maxlen ${max_len} \\ + --maxerror ${max_error} \\ + --mode ${mask_mode} \\ + $args \\ + --outname ${meta.id}_R1 \\ + --log ${meta.id}_R1.log > ${meta.id}_command_log_R1.txt + ParseLog.py -l ${meta.id}_R1.log $args2 + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + presto: \$( MaskPrimers.py --version | awk -F' ' '{print \$2}' ) + END_VERSIONS + """ +} diff --git a/modules/local/presto/presto_maskprimers_extract.nf b/modules/local/presto/presto_maskprimers_extract.nf new file mode 100644 index 00000000..661389e0 --- /dev/null +++ b/modules/local/presto/presto_maskprimers_extract.nf @@ -0,0 +1,40 @@ +process PRESTO_MASKPRIMERS_EXTRACT { + tag "$meta.id" + label "process_high" + label 'immcantation' + + conda "bioconda::presto=0.7.1" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/presto:0.7.1--pyhdfd78af_0' : + 'biocontainers/presto:0.7.1--pyhdfd78af_0' }" + + input: + tuple val(meta), path(R2) + + output: + tuple val(meta), path("*_R2_primers-pass.fastq") , emit: reads + path "*_command_log_R2.txt", emit: logs + path "*_R2.log" + path "*.tab", emit: log_tab + path "versions.yml" , emit: versions + + script: + def args = task.ext.args?: '' + def args2 = task.ext.args2?: '' + """ + MaskPrimers.py extract --nproc ${task.cpus} \\ + -s $R2 \\ + --start ${params.umi_length} \\ + --len ${params.primer_extract_len} \\ + $args \\ + --mode ${params.primer_mask_mode} \\ + --outname ${meta.id}_R2 \\ + --log ${meta.id}_R2.log >> ${meta.id}_command_log_R2.txt + ParseLog.py -l ${meta.id}_R2.log $args2 + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + presto: \$( MaskPrimers.py --version | awk -F' ' '{print \$2}' ) + END_VERSIONS + """ +} diff --git a/modules/local/presto/presto_maskprimers_postassembly.nf b/modules/local/presto/presto_maskprimers_postassembly.nf index 982b6f46..91427c90 100644 --- a/modules/local/presto/presto_maskprimers_postassembly.nf +++ b/modules/local/presto/presto_maskprimers_postassembly.nf @@ -24,10 +24,10 @@ process PRESTO_MASKPRIMERS_POSTASSEMBLY { def revpr = params.primer_revpr ? 
'--revpr' : '' if (params.cprimer_position == "R1") { """ - MaskPrimers.py score --nproc ${task.cpus} -s $reads -p ${cprimers} --start ${params.cprimer_start} --maxerror ${params.primer_maxerror} \ + MaskPrimers.py score --nproc ${task.cpus} -s $reads -p ${cprimers} --start ${params.cprimer_start} --maxerror ${params.primer_r1_maxerror} \ --mode ${params.primer_mask_mode} --outname ${meta.id}-FWD \ --log ${meta.id}-FWD.log > ${meta.id}_command_log.txt - MaskPrimers.py score --nproc ${task.cpus} -s ${meta.id}-FWD_primers-pass.fastq -p ${vprimers} --start ${params.vprimer_start} --maxerror ${params.primer_maxerror} \ + MaskPrimers.py score --nproc ${task.cpus} -s ${meta.id}-FWD_primers-pass.fastq -p ${vprimers} --start ${params.vprimer_start} --maxerror ${params.primer_r2_maxerror} \ --mode ${params.primer_mask_mode} --outname ${meta.id}-REV $revpr \ --log ${meta.id}-REV.log >> ${meta.id}_command_log.txt ParseLog.py -l ${meta.id}-FWD.log ${meta.id}-REV.log -f ID PRIMER ERROR @@ -39,10 +39,10 @@ process PRESTO_MASKPRIMERS_POSTASSEMBLY { """ } else if (params.cprimer_position == "R2") { """ - MaskPrimers.py score --nproc ${task.cpus} -s $reads -p ${vprimers} --start ${params.cprimer_start} --maxerror ${params.primer_maxerror} \ + MaskPrimers.py score --nproc ${task.cpus} -s $reads -p ${vprimers} --start ${params.vprimer_start} --maxerror ${params.primer_r1_maxerror} \ --mode ${params.primer_mask_mode} --outname ${meta.id}-FWD \ --log ${meta.id}-FWD.log > ${meta.id}_command_log.txt - MaskPrimers.py score --nproc ${task.cpus} -s ${meta.id}-FWD_primers-pass.fastq -p ${cprimers} --start ${params.vprimer_start} --maxerror ${params.primer_maxerror} \ + MaskPrimers.py score --nproc ${task.cpus} -s ${meta.id}-FWD_primers-pass.fastq -p ${cprimers} --start ${params.cprimer_start} --maxerror ${params.primer_r2_maxerror} \ --mode ${params.primer_mask_mode} --outname ${meta.id}-REV $revpr \ --log ${meta.id}-REV.log >> ${meta.id}_command_log.txt ParseLog.py -l ${meta.id}-FWD.log 
${meta.id}-REV.log -f ID PRIMER ERROR diff --git a/modules/local/presto/presto_pairseq.nf b/modules/local/presto/presto_pairseq.nf index 1027c880..40ac33b1 100644 --- a/modules/local/presto/presto_pairseq.nf +++ b/modules/local/presto/presto_pairseq.nf @@ -18,8 +18,9 @@ process PRESTO_PAIRSEQ { script: def copyfield = (params.index_file | params.umi_position == 'R1') ? "--1f BARCODE" : "--2f BARCODE" + def args = task.ext.args?: '' """ - PairSeq.py -1 ${meta.id}_R1.fastq -2 ${meta.id}_R2.fastq $copyfield --coord illumina > ${meta.id}_command_log.txt + PairSeq.py -1 ${meta.id}_R1.fastq -2 ${meta.id}_R2.fastq $copyfield $args > ${meta.id}_command_log.txt cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/nextflow.config b/nextflow.config index afc381c6..d0bd6c3c 100644 --- a/nextflow.config +++ b/nextflow.config @@ -47,14 +47,34 @@ params { // -------------------------- // sequence assembly options // -------------------------- + // Filter sequences filterseq_q = 20 - primer_maxerror = 0.2 + + // Mask primers + + primer_r1_maxerror = 0.2 + primer_r2_maxerror = 0.2 primer_mask_mode = 'cut' + maskprimers_align = false + primer_extract_len = 0 + primer_maxlen = 50 + + // Build consensus primer_consensus = 0.6 buildconsensus_maxerror = 0.1 buildconsensus_maxgap = 0.5 cluster_sets = true + // Assemble pairs + assemblepairs_sequential = false + + // internal cregion + align_cregion = false + internal_cregion_sequences = null + cregion_maxlen = 100 + cregion_maxerror = 0.3 + cregion_mask_mode = 'tag' + // ----------------------- // vdj annotation options // ----------------------- @@ -268,7 +288,12 @@ profiles { test_assembled_immcantation_devel_mm { includeConfig 'conf/test_assembled_immcantation_devel_mm.config' } test_nocluster { includeConfig 'conf/test_nocluster.config' } test_fetchimgt { includeConfig 'conf/test_fetchimgt.config' } - test_igblast { includeConfig 'conf/test_igblast.config' } + test_clontech_umi { includeConfig 
'conf/test_clontech_umi.config' } + test_nebnext_umi { includeConfig 'conf/test_nebnext_umi.config' } + nebnext_umi_tcr { includeConfig 'conf/nebnext_umi_tcr.config' } + nebnext_umi_bcr { includeConfig 'conf/nebnext_umi_bcr.config' } + clontech_umi_bcr { includeConfig 'conf/clontech_umi_bcr.config' } + clontech_umi_tcr { includeConfig 'conf/clontech_umi_tcr.config' } } // Set default registry for Apptainer, Docker, Podman and Singularity independent of -profile diff --git a/nextflow_schema.json b/nextflow_schema.json index 387b396d..f1507f50 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -218,17 +218,11 @@ "description": "Quality threshold for pRESTO FilterSeq sequence filtering.", "fa_icon": "fas fa-filter" }, - "primer_maxerror": { - "type": "number", - "default": 0.2, - "description": "Maximum primer scoring error in the pRESTO MaskPrimer step for the C and/or V region primers identification.", - "fa_icon": "fas fa-align-center" - }, "primer_consensus": { "type": "number", "default": 0.6, "description": "Maximum error for building the primer consensus in the pRESTO Buildconsensus step.", - "fa_icon": "fas fa-align-center" + "fa_icon": "fas fa-align-left" }, "primer_mask_mode": { "type": "string", @@ -253,8 +247,70 @@ "cluster_sets": { "type": "boolean", "default": true, - "fa_icon": "fas fa-layer-group", + "fa_icon": "fas fa-align-center", "description": "Cluster sequences by similarity regardless of any annotation with pRESTO ClusterSets and annotate the cluster ID additionally to the UMI barcode." + }, + "primer_r1_maxerror": { + "type": "number", + "default": 0.2, + "fa_icon": "fas fa-align-left", + "description": "Maximum allowed error for R1 primer alignment." + }, + "primer_r2_maxerror": { + "type": "number", + "default": 0.2, + "fa_icon": "fas fa-align-right", + "description": "Maximum allowed error for R2 primer alignment." 
+ }, + "maskprimers_align": { + "type": "boolean", + "fa_icon": "fas fa-align-center", + "description": "Align primers instead of scoring them. Used for protocols without primer fixed positions." + }, + "primer_extract_len": { + "type": "integer", + "default": 0, + "fa_icon": "fas fa-align-center", + "description": "Length of the extracted primers with MaskPrimer extract." + }, + "primer_maxlen": { + "type": "integer", + "default": 50, + "fa_icon": "fas fa-align-center", + "description": "Maximum allowed primer length when aligning the primers." + }, + "assemblepairs_sequential": { + "type": "boolean", + "fa_icon": "fas fa-align-center", + "description": "Use AssemblePairs sequential instead of AssemblePairs align when assembling read pairs." + }, + "align_cregion": { + "type": "boolean", + "fa_icon": "fas fa-align-center", + "description": "Align internal C-region for a more precise isotype characterization." + }, + "internal_cregion_sequences": { + "type": "string", + "fa_icon": "fas fa-align-center", + "description": "Provide internal C-region sequences for a more precise C-region characterization. Then also set the `align_cregion` flag." + }, + "cregion_maxlen": { + "type": "integer", + "default": 100, + "fa_icon": "fas fa-align-center", + "description": "Maximum allowed length when aligning the internal C-region." + }, + "cregion_maxerror": { + "type": "number", + "default": 0.3, + "fa_icon": "fas fa-align-center", + "description": "Maximum allowed error when aligning the internal C-region." + }, + "cregion_mask_mode": { + "type": "string", + "default": "tag", + "fa_icon": "fas fa-mask", + "description": "Mask mode for C-region alignment." 
} }, "fa_icon": "fas fa-align-center" diff --git a/subworkflows/local/databases.nf b/subworkflows/local/databases.nf new file mode 100644 index 00000000..594b340e --- /dev/null +++ b/subworkflows/local/databases.nf @@ -0,0 +1,54 @@ +include { FETCH_DATABASES } from '../../modules/local/fetch_databases' +include { UNZIP_DB as UNZIP_IGBLAST } from '../../modules/local/unzip_db' +include { UNZIP_DB as UNZIP_IMGT } from '../../modules/local/unzip_db' + +workflow DATABASES { + + take: + + main: + ch_versions = Channel.empty() + + // FETCH DATABASES + if( !params.fetch_imgt ){ + if (params.igblast_base.endsWith(".zip")) { + Channel.fromPath("${params.igblast_base}") + .ifEmpty{ error "IGBLAST DB not found: ${params.igblast_base}" } + .set { ch_igblast_zipped } + UNZIP_IGBLAST( ch_igblast_zipped.collect() ) + ch_igblast = UNZIP_IGBLAST.out.unzipped + ch_versions = ch_versions.mix(UNZIP_IGBLAST.out.versions) + } else { + Channel.fromPath("${params.igblast_base}") + .ifEmpty { error "IGBLAST DB not found: ${params.igblast_base}" } + .set { ch_igblast } + } + } + + if( !params.fetch_imgt ){ + if (params.imgtdb_base.endsWith(".zip")) { + Channel.fromPath("${params.imgtdb_base}") + .ifEmpty{ error "IMGTDB not found: ${params.imgtdb_base}" } + .set { ch_imgt_zipped } + UNZIP_IMGT( ch_imgt_zipped.collect() ) + ch_imgt = UNZIP_IMGT.out.unzipped + ch_versions = ch_versions.mix(UNZIP_IMGT.out.versions) + } else { + Channel.fromPath("${params.imgtdb_base}") + .ifEmpty { error "IMGT DB not found: ${params.imgtdb_base}" } + .set { ch_imgt } + } + } + + if (params.fetch_imgt) { + FETCH_DATABASES() + ch_igblast = FETCH_DATABASES.out.igblast + ch_imgt = FETCH_DATABASES.out.imgt + ch_versions = ch_versions.mix(FETCH_DATABASES.out.versions) + } + + emit: + versions = ch_versions + imgt = ch_imgt + igblast = ch_igblast +} diff --git a/subworkflows/local/presto_umi.nf b/subworkflows/local/presto_umi.nf index 8d7d8713..0fdd7cd6 100644 --- a/subworkflows/local/presto_umi.nf +++ 
b/subworkflows/local/presto_umi.nf @@ -9,16 +9,25 @@ include { FASTP } from '../../modules/n //PRESTO include { PRESTO_FILTERSEQ as PRESTO_FILTERSEQ_UMI } from '../../modules/local/presto/presto_filterseq' include { PRESTO_MASKPRIMERS as PRESTO_MASKPRIMERS_UMI } from '../../modules/local/presto/presto_maskprimers' +include { PRESTO_MASKPRIMERS_ALIGN } from '../../modules/local/presto/presto_maskprimers_align' +include { PRESTO_MASKPRIMERS_EXTRACT } from '../../modules/local/presto/presto_maskprimers_extract' +include { PRESTO_MASKPRIMERS_ALIGN as PRESTO_ALIGN_CREGION } from '../../modules/local/presto/presto_maskprimers_align' include { PRESTO_PAIRSEQ as PRESTO_PAIRSEQ_UMI } from '../../modules/local/presto/presto_pairseq' +include { PRESTO_PAIRSEQ as PRESTO_PAIRSEQ_ALIGN } from '../../modules/local/presto/presto_pairseq' include { PRESTO_CLUSTERSETS as PRESTO_CLUSTERSETS_UMI } from '../../modules/local/presto/presto_clustersets' include { PRESTO_PARSE_CLUSTER as PRESTO_PARSE_CLUSTER_UMI } from '../../modules/local/presto/presto_parse_cluster' include { PRESTO_BUILDCONSENSUS as PRESTO_BUILDCONSENSUS_UMI } from '../../modules/local/presto/presto_buildconsensus' +include { PRESTO_BUILDCONSENSUS as PRESTO_BUILDCONSENSUS_ALIGN } from '../../modules/local/presto/presto_buildconsensus' include { PRESTO_POSTCONSENSUS_PAIRSEQ as PRESTO_POSTCONSENSUS_PAIRSEQ_UMI } from '../../modules/local/presto/presto_postconsensus_pairseq' include { PRESTO_ASSEMBLEPAIRS as PRESTO_ASSEMBLEPAIRS_UMI } from '../../modules/local/presto/presto_assemblepairs' +include { PRESTO_ASSEMBLEPAIRS_SEQUENTIAL } from '../../modules/local/presto/presto_assemblepairs_sequential' include { PRESTO_PARSEHEADERS as PRESTO_PARSEHEADERS_COLLAPSE_UMI } from '../../modules/local/presto/presto_parseheaders' +include { PRESTO_PARSEHEADERS as PRESTO_PARSEHEADERS_CREGION } from '../../modules/local/presto/presto_parseheaders' include { PRESTO_PARSEHEADERS_PRIMERS as PRESTO_PARSEHEADERS_PRIMERS_UMI } from 
'../../modules/local/presto/presto_parseheaders_primers' include { PRESTO_PARSEHEADERS_METADATA as PRESTO_PARSEHEADERS_METADATA_UMI } from '../../modules/local/presto/presto_parseheaders_metadata' include { PRESTO_COLLAPSESEQ as PRESTO_COLLAPSESEQ_UMI } from '../../modules/local/presto/presto_collapseseq' +include { PRESTO_COLLAPSESEQ as PRESTO_COLLAPSESEQ_ALIGN } from '../../modules/local/presto/presto_collapseseq' +include { PRESTO_COLLAPSESEQ as PRESTO_COLLAPSESEQ_CREGION } from '../../modules/local/presto/presto_collapseseq' include { PRESTO_SPLITSEQ as PRESTO_SPLITSEQ_UMI} from '../../modules/local/presto/presto_splitseq' @@ -28,6 +37,8 @@ workflow PRESTO_UMI { ch_cprimers // channel: [ cprimers.fasta ] ch_vprimers // channel: [ vprimers.fasta ] ch_adapter_fasta // channel: [ adapters.fasta ] + ch_internal_cregion // channel: [ internal_cregions.fasta ] + ch_igblast main: @@ -91,24 +102,63 @@ workflow PRESTO_UMI { ch_versions = ch_versions.mix(PRESTO_FILTERSEQ_UMI.out.versions) // Mask primers - PRESTO_MASKPRIMERS_UMI ( - PRESTO_FILTERSEQ_UMI.out.reads, - ch_cprimers.collect(), - ch_vprimers.collect() - ) - ch_versions = ch_versions.mix(PRESTO_MASKPRIMERS_UMI.out.versions) + if (params.maskprimers_align) { + + ch_reads_R1 = PRESTO_FILTERSEQ_UMI.out.reads + .map{ reads -> [reads[0], reads[1]] }.dump(tag: 'ch_reads_R1') + ch_reads_R2 = PRESTO_FILTERSEQ_UMI.out.reads + .map{ reads -> [reads[0], reads[2]] }.dump(tag: 'ch_reads_R2') + PRESTO_MASKPRIMERS_ALIGN( + ch_reads_R1, + ch_cprimers.collect(), + params.primer_maxlen, + params.primer_r1_maxerror, + params.primer_mask_mode + ) + PRESTO_MASKPRIMERS_EXTRACT( + ch_reads_R2 + ) - // Pre-consensus pair - PRESTO_PAIRSEQ_UMI ( - PRESTO_MASKPRIMERS_UMI.out.reads - ) - ch_versions = ch_versions.mix(PRESTO_PAIRSEQ_UMI.out.versions) + ch_versions = ch_versions.mix(PRESTO_MASKPRIMERS_ALIGN.out.versions) + // Merge again R1 and R2 by sample ID. 
+ ch_maskprimers_reads_R1 = PRESTO_MASKPRIMERS_ALIGN.out.reads.map{ reads -> [reads[0].id, reads[0], reads[1]]}.dump(tag: 'ch_maskprimers_reads_R1') + ch_maskprimers_reads_R2 = PRESTO_MASKPRIMERS_EXTRACT.out.reads.map{ reads -> [reads[0].id, reads[0], reads[1]]}.dump(tag: 'ch_maskprimers_reads_R2') + ch_maskprimers_reads = ch_maskprimers_reads_R1.join(ch_maskprimers_reads_R2) + .map{ it -> [it[1], it[2], it[4]] }.dump(tag: 'ch_maskprimers_reads_after_remerge') + + ch_maskprimers_logs = PRESTO_MASKPRIMERS_ALIGN.out.logs + ch_maskprimers_logs = ch_maskprimers_logs.mix(PRESTO_MASKPRIMERS_EXTRACT.out.logs) + + PRESTO_PAIRSEQ_ALIGN( ch_maskprimers_reads ) + ch_versions = ch_versions.mix(PRESTO_PAIRSEQ_ALIGN.out.versions) + ch_for_clustersets = PRESTO_PAIRSEQ_ALIGN.out.reads + ch_pairseq_logs = PRESTO_PAIRSEQ_ALIGN.out.logs + + } else { + + PRESTO_MASKPRIMERS_UMI ( + PRESTO_FILTERSEQ_UMI.out.reads, + ch_cprimers.collect(), + ch_vprimers.collect() + ) + ch_versions = ch_versions.mix(PRESTO_MASKPRIMERS_UMI.out.versions) + ch_maskprimers_logs = PRESTO_MASKPRIMERS_UMI.out.logs + + // Pre-consensus pair + PRESTO_PAIRSEQ_UMI ( + PRESTO_MASKPRIMERS_UMI.out.reads + ) + ch_versions = ch_versions.mix(PRESTO_PAIRSEQ_UMI.out.versions) + ch_for_clustersets = PRESTO_PAIRSEQ_UMI.out.reads + ch_pairseq_logs = PRESTO_PAIRSEQ_UMI.out.logs + + } if (params.cluster_sets) { // Cluster sequences by similarity PRESTO_CLUSTERSETS_UMI ( - PRESTO_PAIRSEQ_UMI.out.reads + ch_for_clustersets ) ch_versions = ch_versions.mix(PRESTO_CLUSTERSETS_UMI.out.versions) @@ -121,61 +171,128 @@ workflow PRESTO_UMI { ch_clustersets_logs = PRESTO_CLUSTERSETS_UMI.out.logs.collect() } else { - ch_for_buildconsensus = PRESTO_PAIRSEQ_UMI.out.reads + ch_for_buildconsensus = ch_for_clustersets ch_clustersets_logs = Channel.empty() } // Build consensus of sequences with same UMI barcode - PRESTO_BUILDCONSENSUS_UMI ( - ch_for_buildconsensus - ) - ch_versions = ch_versions.mix(PRESTO_BUILDCONSENSUS_UMI.out.versions) + if 
(params.maskprimers_align) { + PRESTO_BUILDCONSENSUS_ALIGN ( + ch_for_buildconsensus + ) + ch_versions = ch_versions.mix(PRESTO_BUILDCONSENSUS_ALIGN.out.versions) + ch_postconsensus = PRESTO_BUILDCONSENSUS_ALIGN.out.reads + ch_buildconsensus_logs = PRESTO_BUILDCONSENSUS_ALIGN.out.logs + } else { + PRESTO_BUILDCONSENSUS_UMI ( + ch_for_buildconsensus + ) + ch_versions = ch_versions.mix(PRESTO_BUILDCONSENSUS_UMI.out.versions) + ch_postconsensus = PRESTO_BUILDCONSENSUS_UMI.out.reads + ch_buildconsensus_logs = PRESTO_BUILDCONSENSUS_UMI.out.logs + } // Post-consensus pair PRESTO_POSTCONSENSUS_PAIRSEQ_UMI ( - PRESTO_BUILDCONSENSUS_UMI.out.reads + ch_postconsensus ) ch_versions = ch_versions.mix(PRESTO_POSTCONSENSUS_PAIRSEQ_UMI.out.versions) - // Assemble read pairs - PRESTO_ASSEMBLEPAIRS_UMI ( - PRESTO_POSTCONSENSUS_PAIRSEQ_UMI.out.reads - ) - ch_versions = ch_versions.mix(PRESTO_ASSEMBLEPAIRS_UMI.out.versions) + if (params.assemblepairs_sequential){ + // Assemble read pairs sequential + PRESTO_ASSEMBLEPAIRS_SEQUENTIAL ( + PRESTO_POSTCONSENSUS_PAIRSEQ_UMI.out.reads, + ch_igblast.collect() + ) + ch_versions = ch_versions.mix(PRESTO_ASSEMBLEPAIRS_SEQUENTIAL.out.versions) + ch_assemblepairs_reads = PRESTO_ASSEMBLEPAIRS_SEQUENTIAL.out.reads + ch_assemblepairs_logs = PRESTO_ASSEMBLEPAIRS_SEQUENTIAL.out.logs + } else { + // Assemble read pairs align + PRESTO_ASSEMBLEPAIRS_UMI ( + PRESTO_POSTCONSENSUS_PAIRSEQ_UMI.out.reads + ) + ch_versions = ch_versions.mix(PRESTO_ASSEMBLEPAIRS_UMI.out.versions) + ch_assemblepairs_reads = PRESTO_ASSEMBLEPAIRS_UMI.out.reads + ch_assemblepairs_logs = PRESTO_ASSEMBLEPAIRS_UMI.out.logs + } + + + if (params.align_cregion) { + PRESTO_ALIGN_CREGION( + ch_assemblepairs_reads, + ch_internal_cregion.collect(), + params.cregion_maxlen, + params.cregion_maxerror, + params.cregion_mask_mode + ) + ch_parseheaders_reads = PRESTO_ALIGN_CREGION.out.reads + } else { + ch_parseheaders_reads = ch_assemblepairs_reads + } // Generate QC stats after reads paired and 
filtered but before collapsed FASTQC_POSTASSEMBLY_UMI ( - PRESTO_ASSEMBLEPAIRS_UMI.out.reads + ch_assemblepairs_reads ) ch_versions = ch_versions.mix(FASTQC_POSTASSEMBLY_UMI.out.versions) // Combine UMI duplicate count PRESTO_PARSEHEADERS_COLLAPSE_UMI ( - PRESTO_ASSEMBLEPAIRS_UMI.out.reads + ch_parseheaders_reads ) ch_versions = ch_versions.mix(PRESTO_PARSEHEADERS_COLLAPSE_UMI.out.versions) - // Annotate primers in C_PRIMER and V_PRIMER field - PRESTO_PARSEHEADERS_PRIMERS_UMI ( - PRESTO_PARSEHEADERS_COLLAPSE_UMI.out.reads - ) - ch_versions = ch_versions.mix(PRESTO_PARSEHEADERS_PRIMERS_UMI.out.versions) + // Annotate primer fields and collapse duplicates + if (params.maskprimers_align) { + // Rename primer field to CREGION + PRESTO_PARSEHEADERS_CREGION ( + PRESTO_PARSEHEADERS_COLLAPSE_UMI.out.reads + ) + ch_versions = ch_versions.mix(PRESTO_PARSEHEADERS_CREGION.out.versions) - // Annotate metadata on primer headers + // Collapse duplicates + PRESTO_COLLAPSESEQ_ALIGN ( + PRESTO_PARSEHEADERS_CREGION.out.reads + ) + ch_versions = ch_versions.mix(PRESTO_COLLAPSESEQ_ALIGN.out.versions) + ch_collapsed = PRESTO_COLLAPSESEQ_ALIGN.out.reads + ch_collapse_logs = PRESTO_COLLAPSESEQ_ALIGN.out.logs + + } else { + // Annotate primers in C_PRIMER and V_PRIMER field + PRESTO_PARSEHEADERS_PRIMERS_UMI ( + PRESTO_PARSEHEADERS_COLLAPSE_UMI.out.reads + ) + ch_versions = ch_versions.mix(PRESTO_PARSEHEADERS_PRIMERS_UMI.out.versions) + + if (params.align_cregion) { + PRESTO_COLLAPSESEQ_CREGION ( + PRESTO_PARSEHEADERS_PRIMERS_UMI.out.reads + ) + ch_versions = ch_versions.mix(PRESTO_COLLAPSESEQ_CREGION.out.versions) + ch_collapsed = PRESTO_COLLAPSESEQ_CREGION.out.reads + ch_collapse_logs = PRESTO_COLLAPSESEQ_CREGION.out.logs + } else { + // Collapse duplicates + PRESTO_COLLAPSESEQ_UMI ( + PRESTO_PARSEHEADERS_PRIMERS_UMI.out.reads + ) + ch_versions = ch_versions.mix(PRESTO_COLLAPSESEQ_UMI.out.versions) + ch_collapsed = PRESTO_COLLAPSESEQ_UMI.out.reads + ch_collapse_logs = 
PRESTO_COLLAPSESEQ_UMI.out.logs + } + } + + // Annotate metadata on read headers PRESTO_PARSEHEADERS_METADATA_UMI ( - PRESTO_PARSEHEADERS_PRIMERS_UMI.out.reads + ch_collapsed ) ch_versions = ch_versions.mix(PRESTO_PARSEHEADERS_METADATA_UMI.out.versions) - // Mark and count duplicate sequences with different UMI barcodes (DUPCOUNT) - PRESTO_COLLAPSESEQ_UMI ( - PRESTO_PARSEHEADERS_METADATA_UMI.out.reads - ) - ch_versions = ch_versions.mix(PRESTO_COLLAPSESEQ_UMI.out.versions) - // Filter out sequences with less than 2 representative duplicates with different UMIs PRESTO_SPLITSEQ_UMI ( - PRESTO_COLLAPSESEQ_UMI.out.reads + PRESTO_PARSEHEADERS_METADATA_UMI.out.reads ) ch_versions = ch_versions.mix(PRESTO_SPLITSEQ_UMI.out.versions) @@ -186,12 +303,12 @@ workflow PRESTO_UMI { fastp_reads_html = FASTP.out.html.collect{ meta,html -> html } fastqc_postassembly_gz = FASTQC_POSTASSEMBLY_UMI.out.zip presto_filterseq_logs = PRESTO_FILTERSEQ_UMI.out.logs - presto_maskprimers_logs = PRESTO_MASKPRIMERS_UMI.out.logs.collect() - presto_pairseq_logs = PRESTO_PAIRSEQ_UMI.out.logs.collect() + presto_maskprimers_logs = ch_maskprimers_logs.collect() + presto_pairseq_logs = ch_pairseq_logs.collect() presto_clustersets_logs = ch_clustersets_logs - presto_buildconsensus_logs = PRESTO_BUILDCONSENSUS_UMI.out.logs.collect() + presto_buildconsensus_logs = ch_buildconsensus_logs.collect() presto_postconsensus_pairseq_logs = PRESTO_POSTCONSENSUS_PAIRSEQ_UMI.out.logs.collect() - presto_assemblepairs_logs = PRESTO_ASSEMBLEPAIRS_UMI.out.logs.collect() - presto_collapseseq_logs = PRESTO_COLLAPSESEQ_UMI.out.logs.collect() + presto_assemblepairs_logs = ch_assemblepairs_logs.collect() + presto_collapseseq_logs = ch_collapse_logs.collect() presto_splitseq_logs = PRESTO_SPLITSEQ_UMI.out.logs.collect() } diff --git a/subworkflows/local/sequence_assembly.nf b/subworkflows/local/sequence_assembly.nf index 26bfd3cd..0ae99c83 100644 --- a/subworkflows/local/sequence_assembly.nf +++ 
b/subworkflows/local/sequence_assembly.nf @@ -50,7 +50,8 @@ include { FASTQC } from '../../modules/nf-core/fastqc/main' workflow SEQUENCE_ASSEMBLY { take: - ch_input // channel: + ch_input // channel: reads + ch_igblast main: @@ -84,6 +85,11 @@ workflow SEQUENCE_ASSEMBLY { if (params.umi_length < 2) { error "The 'specific_pcr_umi' library generation method requires setting the '--umi_length' to a value greater than 1." } + if (params.internal_cregion_sequences) { + ch_internal_cregion = Channel.fromPath(params.internal_cregion_sequences, checkIfExists: true) + } else { + ch_internal_cregion = Channel.of([]) + } } else if (params.library_generation_method == 'specific_pcr') { if (params.vprimers) { ch_vprimers_fasta = Channel.fromPath(params.vprimers, checkIfExists: true) @@ -103,11 +109,16 @@ workflow SEQUENCE_ASSEMBLY { } else { params.umi_length = 0 } + if (params.internal_cregion_sequences) { + error "Please do not set '--internal_cregion_sequences' when using the 'specific_pcr' library generation method without UMIs." + } } else if (params.library_generation_method == 'dt_5p_race_umi') { if (params.vprimers) { error "The oligo-dT 5'-RACE UMI library generation method does not accept V-region primers, please provide a linker with '--race_linker' instead or select another library method option." } else if (params.race_linker) { ch_vprimers_fasta = Channel.fromPath(params.race_linker, checkIfExists: true) + } else if (params.maskprimers_align) { + ch_vprimers_fasta = Channel.of([]) } else { error "The oligo-dT 5'-RACE UMI library generation method requires a linker or Template Switch Oligo sequence, please provide it with the option '--race_linker'." } @@ -119,11 +130,18 @@ workflow SEQUENCE_ASSEMBLY { if (params.umi_length < 2) { error "The oligo-dT 5'-RACE UMI 'dt_5p_race_umi' library generation method requires specifying the '--umi_length' to a value greater than 1." 
} + if (params.internal_cregion_sequences) { + ch_internal_cregion = Channel.fromPath(params.internal_cregion_sequences, checkIfExists: true) + } else { + ch_internal_cregion = Channel.of([]) + } } else if (params.library_generation_method == 'dt_5p_race') { if (params.vprimers) { error "The oligo-dT 5'-RACE library generation method does not accept V-region primers, please provide a linker with '--race_linker' instead or select another library method option." } else if (params.race_linker) { ch_vprimers_fasta = Channel.fromPath(params.race_linker, checkIfExists: true) + } else if (params.maskprimers_align) { + ch_vprimers_fasta = Channel.of([]) } else { error "The oligo-dT 5'-RACE library generation method requires a linker or Template Switch Oligo sequence, please provide it with the option '--race_linker'." } @@ -137,6 +155,9 @@ workflow SEQUENCE_ASSEMBLY { } else { params.umi_length = 0 } + if (params.internal_cregion_sequences) { + error "Please do not set '--internal_cregion_sequences' when using the 'dt_5p_race' library generation method without UMIs." + } } else { error "The provided library generation method is not supported. Please check the docs for `--library_generation_method`." } @@ -145,7 +166,8 @@ workflow SEQUENCE_ASSEMBLY { if (params.index_file & params.umi_position == 'R2') {error "Please do not set `--umi_position` option if index file with UMIs is provided."} if (params.umi_length < 0) {error "Please provide the UMI barcode length in the option `--umi_length`. To run without UMIs, set umi_length to 0."} if (!params.index_file & params.umi_start != 0) {error "Setting a UMI start position is only allowed when providing the UMIs in a separate index read file. 
If so, please provide the `--index_file` flag as well."} - + if (params.maskprimers_align & params.umi_position == 'R1') {error "The maskprimers align option is only supported with UMI barcodes in the R2 reads (reads containing V region)."} + if (params.maskprimers_align & params.cprimer_position == 'R2') {error "The maskprimers align option is only supported with Cprimers in the R1 reads (reads containing C region)."} // // SUBWORKFLOW: Read in samplesheet, validate and stage input files @@ -190,7 +212,9 @@ workflow SEQUENCE_ASSEMBLY { ch_reads, ch_cprimers_fasta, ch_vprimers_fasta, - ch_adapter_fasta + ch_adapter_fasta, + ch_internal_cregion, + ch_igblast.collect() ) ch_presto_fasta = PRESTO_UMI.out.fasta ch_presto_software = PRESTO_UMI.out.software diff --git a/subworkflows/local/vdj_annotation.nf b/subworkflows/local/vdj_annotation.nf index c80d3503..4ac2b9df 100644 --- a/subworkflows/local/vdj_annotation.nf +++ b/subworkflows/local/vdj_annotation.nf @@ -1,6 +1,3 @@ -include { FETCH_DATABASES } from '../../modules/local/fetch_databases' -include { UNZIP_DB as UNZIP_IGBLAST } from '../../modules/local/unzip_db' -include { UNZIP_DB as UNZIP_IMGT } from '../../modules/local/unzip_db' include { CHANGEO_ASSIGNGENES } from '../../modules/local/changeo/changeo_assigngenes' include { CHANGEO_MAKEDB } from '../../modules/local/changeo/changeo_makedb' include { CHANGEO_PARSEDB_SPLIT } from '../../modules/local/changeo/changeo_parsedb_split' @@ -15,52 +12,13 @@ workflow VDJ_ANNOTATION { take: ch_fasta // [meta, fasta] ch_validated_samplesheet + ch_igblast + ch_imgt main: ch_versions = Channel.empty() ch_logs = Channel.empty() - // FETCH DATABASES - // TODO: this can take a long time, and the progress shows 0%. Would be - // nice to have some better progress reporting. - // And maybe run this as 2 separate steps, one for IMGT and one for IgBLAST? 
- if( !params.fetch_imgt ){ - if (params.igblast_base.endsWith(".zip")) { - Channel.fromPath("${params.igblast_base}") - .ifEmpty{ error "IGBLAST DB not found: ${params.igblast_base}" } - .set { ch_igblast_zipped } - UNZIP_IGBLAST( ch_igblast_zipped.collect() ) - ch_igblast = UNZIP_IGBLAST.out.unzipped - ch_versions = ch_versions.mix(UNZIP_IGBLAST.out.versions) - } else { - Channel.fromPath("${params.igblast_base}") - .ifEmpty { error "IGBLAST DB not found: ${params.igblast_base}" } - .set { ch_igblast } - } - } - - if( !params.fetch_imgt ){ - if (params.imgtdb_base.endsWith(".zip")) { - Channel.fromPath("${params.imgtdb_base}") - .ifEmpty{ error "IMGTDB not found: ${params.imgtdb_base}" } - .set { ch_imgt_zipped } - UNZIP_IMGT( ch_imgt_zipped.collect() ) - ch_imgt = UNZIP_IMGT.out.unzipped - ch_versions = ch_versions.mix(UNZIP_IMGT.out.versions) - } else { - Channel.fromPath("${params.imgtdb_base}") - .ifEmpty { error "IMGT DB not found: ${params.imgtdb_base}" } - .set { ch_imgt } - } - } - - if (params.fetch_imgt) { - FETCH_DATABASES() - ch_igblast = FETCH_DATABASES.out.igblast - ch_imgt = FETCH_DATABASES.out.imgt - ch_versions = ch_versions.mix(FETCH_DATABASES.out.versions) - } - CHANGEO_ASSIGNGENES ( ch_fasta, ch_igblast.collect() diff --git a/workflows/airrflow.nf b/workflows/airrflow.nf index d1cc3c13..dd1c6d8f 100644 --- a/workflows/airrflow.nf +++ b/workflows/airrflow.nf @@ -54,6 +54,7 @@ include { CHANGEO_CONVERTDB_FASTA as CHANGEO_CONVERTDB_FASTA_FROM_AIRR } from '. 
// // SUBWORKFLOW: Consisting of a mix of local and nf-core/modules // +include { DATABASES } from '../subworkflows/local/databases' include { SEQUENCE_ASSEMBLY } from '../subworkflows/local/sequence_assembly' include { ASSEMBLED_INPUT_CHECK } from '../subworkflows/local/assembled_input_check' include { VDJ_ANNOTATION } from '../subworkflows/local/vdj_annotation' @@ -88,10 +89,16 @@ workflow AIRRFLOW { ch_versions = Channel.empty() ch_reassign_logs = Channel.empty() + // Download or fetch databases + DATABASES() + if ( params.mode == "fastq" ) { // Perform sequence assembly if input type is fastq - SEQUENCE_ASSEMBLY( ch_input ) + SEQUENCE_ASSEMBLY( + ch_input, + DATABASES.out.igblast.collect() + ) ch_fasta = SEQUENCE_ASSEMBLY.out.fasta ch_versions = ch_versions.mix(SEQUENCE_ASSEMBLY.out.versions) @@ -153,7 +160,9 @@ workflow AIRRFLOW { // Perform V(D)J annotation and filtering VDJ_ANNOTATION( ch_fasta, - ch_validated_samplesheet.collect() + ch_validated_samplesheet.collect(), + DATABASES.out.igblast.collect(), + DATABASES.out.imgt.collect() ) ch_versions = ch_versions.mix( VDJ_ANNOTATION.out.versions )