diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index e3ff5e02..bd430bf4 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -62,6 +62,8 @@ jobs: "test_10x_sc", "test_clontech_umi", "test_nebnext_umi", + "test_rnaseq_bulk", + "test_rnaseq_sc", ] fail-fast: false steps: diff --git a/conf/modules.config b/conf/modules.config index 3dc63fa9..dd315b85 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -422,6 +422,14 @@ process { ] } + withName: CHANGEO_PARSEDB_SELECT_LOCUS { + publishDir = [ + path: { "${params.outdir}/vdj_annotation/select-locus/${meta.id}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + withName: CHANGEO_PARSEDB_SPLIT { publishDir = [ path: { "${params.outdir}/vdj_annotation/04-select-productive/${meta.id}" }, diff --git a/conf/test_rnaseq_bulk.config b/conf/test_rnaseq_bulk.config new file mode 100644 index 00000000..eb10e0d9 --- /dev/null +++ b/conf/test_rnaseq_bulk.config @@ -0,0 +1,26 @@ +/* + * ------------------------------------------------- + * Nextflow config file for running tests + * ------------------------------------------------- + * Defines bundled input files and everything required + * to run a fast and simple test. Use as follows: + * nextflow run nf-core/airrflow -profile test_rnaseq_bulk, + */ + +params { + config_profile_name = 'Test bulk RNA-seq based workflow using TRUST4' + config_profile_description = 'Minimal test dataset to check pipeline function with raw bulk RNA-seq data' + + // Limit resources so that this can run on GitHub Actions + max_cpus = 2 + max_memory = 6.GB + max_time = 48.h + + // params + mode = 'fastq' + library_generation_method = 'trust4' + clonal_threshold = 0 + + // Input data + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/airrflow/testdata-rnaseq/rnaseq_metadata.tsv' +} diff --git a/conf/test_rnaseq_sc.config b/conf/test_rnaseq_sc.config new file mode 100644 index 00000000..de2bd2f5 --- /dev/null +++ b/conf/test_rnaseq_sc.config @@ -0,0 +1,30 @@ +/* + * ------------------------------------------------- + * Nextflow config file for running tests + * ------------------------------------------------- + * Defines bundled input files and everything required + * to run a fast and simple test. 
Use as follows: + * nextflow run nf-core/airrflow -profile test_rnaseq_sc, + */ + +params { + config_profile_name = 'Test single-cell RNA-seq based workflow using TRUST4' + config_profile_description = 'Minimal test dataset to check pipeline function with raw single-cell RNA-seq data' + + // Limit resources so that this can run on GitHub Actions + max_cpus = 2 + max_memory = 6.GB + max_time = 48.h + + // params + mode = 'fastq' + library_generation_method = 'trust4' + clonal_threshold = 0 + barcode_read = "R1" + umi_read = "R1" + read_format = "bc:0:15,um:16:27" + skip_lineage = true + + // Input data + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/airrflow/testdata-rnaseq/sc_rnaseq_metadata.tsv' +} diff --git a/docs/usage.md b/docs/usage.md index c3844547..5d5b4aab 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -41,13 +41,13 @@ nextflow run nf-core/airrflow \ A typical command to run the pipeline from **single cell raw fastq files** is: ```bash -nextflow run nf-core/airrflow -r dev \ +nextflow run nf-core/airrflow \ -profile \ --mode fastq \ --input input_samplesheet.tsv \ --library_generation_method sc_10x_genomics \ --reference_10x reference/refdata-cellranger-vdj-GRCh38-alts-ensembl-5.0.0.tar.gz \ ---outdir ./results +--outdir results ``` A typical command for running the pipeline departing from **single-cell AIRR rearrangement tables or assembled bulk sequencing fasta** data is: @@ -121,7 +121,7 @@ If you wish to share such profile (such as upload as supplementary material for ## Input samplesheet -### Fastq input samplesheet (bulk sequencing) +### Fastq input samplesheet (bulk AIRR sequencing) The required input file for processing raw BCR or TCR bulk targeted sequencing data is a sample sheet in TSV format (tab separated). The columns `sample_id`, `filename_R1`, `filename_R2`, `subject_id`, `species`, `tissue`, `pcr_target_locus`, `single_cell`, `sex`, `age` and `biomaterial_provider` are required. An example samplesheet is: @@ -143,7 +143,7 @@ The required input file for processing raw BCR or TCR bulk targeted sequencing d - `age`: Subject biological age. - `single_cell`: TRUE or FALSE. -Other optional columns can be added. These columns will be available when building the contrasts for the repertoire comparison report. It is recommended that these columns also follow the AIRR nomenclature. Examples are: +Other optional columns can be added. These columns will be available as metadata in the final repertoire table. It is recommended that these columns also follow the AIRR nomenclature. Examples are: - `intervention`: Description of intervention. - `disease_diagnosis`: Diagnosis of subject. @@ -151,19 +151,19 @@ Other optional columns can be added. These columns will be available when buildi - `collection_time_point_reference`: Event in the study schedule to which `Sample collection time` relates to (e.g. primary vaccination, intervention start). - `cell_subset`: Commonly-used designation of isolated cell population. -The metadata specified in the input file will then be automatically annotated in a column with the same header in the tables generated by the pipeline. +It is possible to provide several fastq files per sample (e.g. sequenced over different chips or lanes). In this case the different fastq files per sample will be merged together prior to processing. Provide one fastq pair R1/R2 per row, and the same `sample_id` field for these rows. 
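+For illustration, a sample sequenced over two lanes could be given as two rows that share the same `sample_id` (hypothetical file names; the remaining required columns are omitted here for brevity):
+
+| sample_id | filename_R1                      | filename_R2                      | subject_id | species | pcr_target_locus | tissue | single_cell |
+| --------- | -------------------------------- | -------------------------------- | ---------- | ------- | ---------------- | ------ | ----------- |
+| sample01  | sample01_S1_L001_R1_001.fastq.gz | sample01_S1_L001_R2_001.fastq.gz | Subject02  | human   | IG               | blood  | FALSE       |
+| sample01  | sample01_S1_L002_R1_001.fastq.gz | sample01_S1_L002_R2_001.fastq.gz | Subject02  | human   | IG               | blood  | FALSE       |
+
+Both rows are then merged into a single sample before processing.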
### Fastq input samplesheet (single cell sequencing) -The required input file for processing raw BCR or TCR single cell targeted sequencing data is a sample sheet in TSV format (tab separated). The columns `sample_id`, `filename_R1`, `filename_R2`, `subject_id`, `species`, `tissue`, `pcr_target_locus`, `single_cell`, `sex`, `age` and `biomaterial_provider` are required. You can refer to the bulk fastq input section for documentation on the individual columns. +The required input file for processing raw BCR or TCR single cell targeted sequencing data is a sample sheet in TSV format (tab separated). The columns `sample_id`, `filename_R1`, `filename_R2`, `subject_id`, `species`, `tissue`, `pcr_target_locus`, `single_cell`, `sex`, `age` and `biomaterial_provider` are required. Any other columns you add will be available in the final repertoire file as extra metadata fields. You can refer to the bulk fastq input section for documentation on the individual columns. An example samplesheet is: -| sample_id | filename_R1 | filename_R2 | subject_id | species | pcr_target_locus | tissue | sex | age | biomaterial_provider | single_cell | intervention | collection_time_point_relative | cell_subset | -| --------- | ------------------------------- | ------------------------------- | ---------- | ------- | ---------------- | ------ | ------ | --- | -------------------- | ----------- | -------------- | ------------------------------ | ------------ | -| sample01 | sample1_S1_L001_R1_001.fastq.gz | sample1_S1_L001_R2_001.fastq.gz | Subject02 | human | IG | blood | NA | 53 | sequencing_facility | FALSE | Drug_treatment | Baseline | plasmablasts | -| sample02 | sample2_S1_L001_R1_001.fastq.gz | sample2_S1_L001_R2_001.fastq.gz | Subject02 | human | TR | blood | female | 78 | sequencing_facility | FALSE | Drug_treatment | Baseline | plasmablasts | +| sample_id | filename_R1 | filename_R2 | subject_id | species | pcr_target_locus | tissue | sex | age | biomaterial_provider | single_cell | +| --------- | -------------------------------- | -------------------------------- | ---------- | ------- | ---------------- | ------ | ------ | --- | -------------------- | ----------- | +| sample01 | sample01_S1_L001_R1_001.fastq.gz | sample01_S1_L001_R2_001.fastq.gz | Subject02 | human | IG | blood | NA | 53 | sequencing_facility | TRUE | +| sample02 | sample02_S1_L001_R1_001.fastq.gz | sample02_S1_L001_R2_001.fastq.gz | Subject02 | human | TR | blood | female | 78 | sequencing_facility | TRUE | -> FASTQ files must confirm the 10xGenomics cellranger naming conventions
->**`[SAMPLE-NAME]`_S1_L00`[LANE-NUMBER]`_`[READ-TYPE]`_001.fastq.gz**
+> FASTQ files must conform to the 10xGenomics cellranger naming conventions, with the same sample name as provided in the `sample_id` column
+>**`[SAMPLE-NAME]`_S`[CHIP-NUMBER]`_L00`[LANE-NUMBER]`_`[R1/R2]`_001.fastq.gz**
 >
 > Read type is one of
 >
@@ -172,6 +172,13 @@ An example samplesheet is:
 > - `I1`: Sample index read (optional)
 > - `R1`: Read 1
 > - `R2`: Read 2

+It is possible to provide several fastq files per sample (e.g. sequenced over different chips or lanes). In this case the different fastq files per sample will be provided to the same cellranger process. These rows should then have an identical `sample_id` field.
+
+### Fastq input samplesheet (untargeted bulk or sc RNA sequencing)
+
+When running the untargeted protocol, BCR or TCR sequences will be extracted from the untargeted bulk or single-cell RNA sequencing with tools such as [TRUST4](https://github.com/liulab-dfci/TRUST4).
+The required input file is the same as for the [Fastq bulk AIRR samplesheet](#fastq-input-samplesheet-bulk-airr-sequencing) or [Fastq single-cell AIRR samplesheet](#fastq-input-samplesheet-single-cell-sequencing) depending on the input data type (bulk RNAseq or single-cell RNAseq).
+
 ### Assembled input samplesheet (bulk or single-cell sequencing)

 The required input file for processing raw BCR or TCR bulk targeted sequencing data is a sample sheet in TSV format (tab separated). The columns `sample_id`, `filename`, `subject_id`, `species`, `tissue`, `single_cell`, `sex`, `age` and `biomaterial_provider` are required. All fields are explained in the previous section, with the only difference being that there is only one `filename` column for the assembled input samplesheet. The provided file will be different from assembled single-cell or bulk data:
@@ -468,6 +475,42 @@ nextflow run nf-core/airrflow -r dev \
 - The 10xGenomics reference can be downloaded from the [download page](https://www.10xgenomics.com/support/software/cell-ranger/downloads)
 - To generate a V(D)J segment fasta file as reference from IMGT one can follow the [cellranger docs](https://support.10xgenomics.com/single-cell-vdj/software/pipelines/latest/advanced/references#imgt).

+## Supported unselected RNA-seq based methods
+
+nf-core/airrflow supports unselected bulk or single-cell RNA-seq fastq files as input. [TRUST4](https://github.com/liulab-dfci/TRUST4) is used to extract TCR/BCR sequences from these files. The resulting AIRR tables are then fed into airrflow's Immcantation based workflow.
+To use unselected RNA-seq based input, specify `--library_generation_method trust4`.
+
+### Bulk RNA-seq
+
+A typical command to run the pipeline from **bulk RNA-seq fastq files** is:
+
+```bash
+nextflow run nf-core/airrflow \
+-profile \
+--mode fastq \
+--input input_samplesheet.tsv \
+--library_generation_method trust4 \
+--outdir results
+```
+
+### Single-cell RNA-seq
+
+A typical command to run the pipeline from **single-cell RNA-seq fastq files** is:
+
+```bash
+nextflow run nf-core/airrflow \
+-profile \
+--mode fastq \
+--input input_samplesheet.tsv \
+--library_generation_method trust4 \
+--barcode_read R1 \
+--umi_read R1 \
+--read_format bc:0:15,um:16:27 \
+--outdir results
+```
+
+- If UMIs are present, the read containing them must be specified using the `--umi_read` parameter; the read containing the cell barcodes is set with `--barcode_read`.
+- The `--read_format` parameter can be used to specify the barcode and UMI positions within the reads (see the TRUST4 [docs](https://github.com/liulab-dfci/TRUST4?tab=readme-ov-file#10x-genomics-data-and-barcode-based-single-cell-data)).
+
 ## Core Nextflow arguments

 :::note
diff --git a/modules.json b/modules.json
index 3a6e053c..e561d2ec 100644
--- a/modules.json
+++ b/modules.json
@@ -34,6 +34,11 @@
            "branch": "master",
            "git_sha": "b7ebe95761cd389603f9cc0e0dc384c0f663815a",
            "installed_by": ["modules"]
+        },
+        "trust4": {
+            "branch": "master",
+            "git_sha": "bbb9636dbe460f45fe786d0866f8fd7337e4fc7a",
+            "installed_by": ["modules"]
            }
        }
    },
diff --git a/modules/local/changeo/changeo_parsedb_select.nf b/modules/local/changeo/changeo_parsedb_select_locus.nf
similarity index 72%
rename from modules/local/changeo/changeo_parsedb_select.nf
rename to modules/local/changeo/changeo_parsedb_select_locus.nf
index 2bba4916..32805c26 100644
--- a/modules/local/changeo/changeo_parsedb_select.nf
+++ b/modules/local/changeo/changeo_parsedb_select_locus.nf
@@ -1,4 +1,4 @@
-process CHANGEO_PARSEDB_SELECT {
+process CHANGEO_PARSEDB_SELECT_LOCUS {
     tag "$meta.id"
     label 'process_low'
     label 'immcantation'
@@ -18,25 +18,21 @@ process CHANGEO_PARSEDB_SELECT {
     path "versions.yml" , emit: versions

     script:
-    def args = task.ext.args ?: ''
-    def args2 = task.ext.args2 ?: ''
     if (meta.locus.toUpperCase() == 'IG'){
         """
-        ParseDb.py select -d $tab $args --outname ${meta.id} > ${meta.id}_select_command_log.txt
+        ParseDb.py select -d $tab -f locus -u "IG[HKL]" --regex --outname ${meta.id} > ${meta.id}_select_command_log.txt

         cat <<-END_VERSIONS > versions.yml
         "${task.process}":
-            igblastn: \$( igblastn -version | grep -o "igblast[0-9\\. ]\\+" | grep -o "[0-9\\. ]\\+" )
             changeo: \$( ParseDb.py --version | awk -F' ' '{print \$2}' )
         END_VERSIONS
         """
     } else if (meta.locus.toUpperCase() == 'TR'){
         """
-        ParseDb.py select -d $tab $args2 --outname ${meta.id} > "${meta.id}_command_log.txt"
+        ParseDb.py select -d $tab -f locus -u "TR[ABDG]" --regex --outname ${meta.id} > "${meta.id}_command_log.txt"

         cat <<-END_VERSIONS > versions.yml
         "${task.process}":
-            igblastn: \$( igblastn -version | grep -o "igblast[0-9\\. ]\\+" | grep -o "[0-9\\. ]\\+" )
             changeo: \$( ParseDb.py --version | awk -F' ' '{print \$2}' )
         END_VERSIONS
         """
diff --git a/modules/local/prepare_trust4_reference.nf b/modules/local/prepare_trust4_reference.nf
new file mode 100644
index 00000000..dce204c8
--- /dev/null
+++ b/modules/local/prepare_trust4_reference.nf
@@ -0,0 +1,24 @@
+process PREPARE_TRUST4_REFERENCE {
+    tag "$meta.id"
+    label 'process_medium'
+
+    conda "bioconda::trust4=1.0.13"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+ 'https://depot.galaxyproject.org/singularity/trust4:1.0.13--h43eeafb_0': + 'biocontainers/trust4:1.0.13--h43eeafb_0' }" + + input: + tuple val(meta), path(R1), path(R2) + path(reference_igblast) + + output: + tuple val(meta), path("trust4_reference.fa") , emit: trust4_reference + + script: + """ + cat ${reference_igblast}/fasta/imgt_${meta.species.toLowerCase()}_*.fasta \\ + ${reference_igblast}/fasta/imgt_${meta.species.toLowerCase()}_*.fasta >> trust4_reference.fa + """ + + +} diff --git a/modules/nf-core/trust4/environment.yml b/modules/nf-core/trust4/environment.yml new file mode 100644 index 00000000..9270eee2 --- /dev/null +++ b/modules/nf-core/trust4/environment.yml @@ -0,0 +1,9 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +name: "trust4" +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - "bioconda::trust4=1.0.13" diff --git a/modules/nf-core/trust4/main.nf b/modules/nf-core/trust4/main.nf new file mode 100644 index 00000000..1d822fb8 --- /dev/null +++ b/modules/nf-core/trust4/main.nf @@ -0,0 +1,105 @@ +process TRUST4 { + tag "$meta.id" + label 'process_medium' + + conda "bioconda::trust4=1.0.13" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/trust4:1.0.13--h43eeafb_0': + 'biocontainers/trust4:1.0.13--h43eeafb_0' }" + + input: + tuple val(meta), path(bam), path(reads) + tuple val(meta2), path(fasta) + tuple val(meta3), path(vdj_reference) + tuple val(meta4), val(barcode_read) + tuple val(meta5), val(umi_read) + + output: + tuple val(meta), path("*.tsv") , emit: tsv + tuple val(meta), path("*_airr.tsv") , emit: airr_files + tuple val(meta), path("${meta.id}_airr.tsv") , emit: airr_tsv + tuple val(meta), path("*_report.tsv") , emit: report_tsv + tuple val(meta), path("*.fa") , emit: fasta + tuple val(meta), path("*.out") , emit: out + tuple val(meta), path("*.fq") , emit: fq + tuple val(meta), path("**") , emit: outs + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def bam_mode = bam ? "-b ${bam}" : '' + def single_end_mode = reads && meta.single_end ? "-u ${reads}" : '' + // reference is optional for fastq input + def reference = vdj_reference ? "--ref ${vdj_reference}" : "" + // separate forward from reverse pairs + def (forward, reverse) = reads.collate(2).transpose() + def paired_end_mode = reads && (meta.single_end == false) ? "-1 ${forward[0]} -2 ${reverse[0]}" : '' + // read format is optional + def readFormat = params.read_format ? 
"--readFormat ${params.read_format}" : '' + // add barcode information if present + if (barcode_read) { + if (barcode_read == "R1") { + barcode = "--barcode ${forward[0]}" + } else if (barcode_read == "R2") { + barcode = "--barcode ${reverse[0]}" + } + } + else { + barcode = '' + } + // add umi information if present + if (umi_read) { + if (umi_read == "R1") { + umi = "--UMI ${forward[0]}" + } else if (umi_read == "R2") { + umi = "--UMI ${reverse[0]}" + } + } + else { + umi = '' + } + + """ + run-trust4 \\ + ${bam_mode} \\ + ${single_end_mode} \\ + ${paired_end_mode} \\ + ${barcode} \\ + ${readFormat} \\ + ${umi} \\ + -t $task.cpus \\ + -f ${fasta} \\ + -o ${prefix} \\ + ${reference} \\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + trust4: \$(run-trust4 2>&1 | grep -o 'v[0-9.]*-r[0-9]*' | sed 's/^/TRUST4 using /' ) + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}_airr.tsv + touch ${prefix}_airr_align.tsv + touch ${prefix}_report.tsv + touch ${prefix}_assembled_reads.fa + touch ${prefix}_annot.fa + touch ${prefix}_cdr3.out + touch ${prefix}_raw.out + touch ${prefix}_final.out + touch ${prefix}_toassemble.fq + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + trust4: \$(run-trust4 2>&1 | grep -o 'v[0-9.]*-r[0-9]*' | sed 's/^/TRUST4 using /' ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/trust4/meta.yml b/modules/nf-core/trust4/meta.yml new file mode 100644 index 00000000..89bc4d29 --- /dev/null +++ b/modules/nf-core/trust4/meta.yml @@ -0,0 +1,75 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json +name: "trust4" +description: Run TRUST4 on RNA-seq data +keywords: + - sort + - example + - genomics +tools: + - "trust4": + description: "TCR and BCR assembly from bulk or single-cell RNA-seq data" + homepage: "https://github.com/liulab-dfci/TRUST4" + documentation: "https://github.com/liulab-dfci/TRUST4" + tool_dev_url: "https://github.com/liulab-dfci/TRUST4" + licence: ["GPL v3"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1', single_end:false ]` + - bam: + type: file + description: BAM file from bulk or single-cell RNA-seq data + pattern: "*.bam" + - reads: + type: file + description: List of input FastQ files of size 1 and 2 for single-end and paired-end data, respectively + - fasta: + type: file + description: Path to the fasta file coordinate and sequence of V/D/J/C genes + - ref: + type: file + description: Path to detailed V/D/J/C gene reference file, such as from IMGT database. + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
`[ id:'sample1', single_end:false ]` + - tsv: + type: file + description: tsv files created by TRUST4 + pattern: "*.tsv" + - airr_tsv: + type: file + description: TRUST4 results in AIRR format + pattern: "*_airr.tsv" + - report_tsv: + type: file + description: TRUST4 report in tsv format + pattern: "*_report.tsv" + - fasta: + type: file + description: Fasta files created by TRUST4 + pattern: "*.fa" + - out: + type: file + description: Further report files + pattern: "*.out" + - fq: + type: file + description: Fastq files created by TRUST4 + pattern: "*.fq" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + +authors: + - "@mapo9, @Joaodemeirelles" +maintainers: + - "@mapo9" diff --git a/nextflow.config b/nextflow.config index e51ff966..993b0de5 100644 --- a/nextflow.config +++ b/nextflow.config @@ -86,6 +86,7 @@ params { fetch_imgt = false save_databases = true isotype_column = 'c_call' + skip_alignment_filter = false // ----------------------- // bulk filtering options @@ -123,6 +124,13 @@ params { // ----------------------- reference_10x = null + // ----------------------- + // raw RNA seq input options + // ----------------------- + barcode_read = null + read_format = null + umi_read = null + // ----------------------- // generic nf-core options @@ -305,6 +313,8 @@ profiles { test_10x_sc { includeConfig 'conf/test_10x_sc.config' } test_clontech_umi { includeConfig 'conf/test_clontech_umi.config' } test_nebnext_umi { includeConfig 'conf/test_nebnext_umi.config' } + test_rnaseq_bulk { includeConfig 'conf/test_rnaseq_bulk.config' } + test_rnaseq_sc { includeConfig 'conf/test_rnaseq_sc.config' } nebnext_umi_tcr { includeConfig 'conf/nebnext_umi_tcr.config' } nebnext_umi_bcr { includeConfig 'conf/nebnext_umi_bcr.config' } clontech_umi_bcr { includeConfig 'conf/clontech_umi_bcr.config' } diff --git a/nextflow_schema.json b/nextflow_schema.json index 1c32a276..92412c39 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -61,7 +61,14 @@ "type": "string", "fa_icon": "fas fa-flask", "description": "Protocol used for the V(D)J amplicon sequencing library generation.", - "enum": ["specific_pcr_umi", "specific_pcr", "dt_5p_race", "dt_5p_race_umi", "sc_10x_genomics"], + "enum": [ + "specific_pcr_umi", + "specific_pcr", + "dt_5p_race", + "dt_5p_race_umi", + "sc_10x_genomics", + "trust4" + ], "help_text": "Available protocols are:\n- `specific_pcr_umi`: RT-PCR using transcript-specific primers containing UMIs.\n- `specific_pcr`: RT-PCR using transcript-specific primers.\n- `dt_5p_race_umi`: 5\u2019-RACE PCR using oligo-dT primers and template switch primers containing UMI.\n- `dt_5p_race`: 5\u2019-RACE PCR (i.e. RT is followed by a template switch (TS) step) using oligo-dT primers.\n- `sc_10x_genomics`:10x genomics library preparation protocol for scVDJ sequencing." }, "race_linker": { @@ -336,19 +343,22 @@ "save_databases": { "type": "boolean", "description": "Save databases so you can use the cache in future runs.", - "fa_icon": "fas fa-file-download" + "fa_icon": "fas fa-file-download", + "default": true }, "reference_fasta": { "type": "string", "description": "Path to the germline reference fasta.", "help_text": "By default, we provide a pre-downloaded version of the IMGT database. It is also possible to provide a custom reference fasta database. 
To fetch a fresh version of IMGT, set the `--fetch_imgt` parameter instead.",
-            "fa_icon": "fas fa-database"
+            "fa_icon": "fas fa-database",
+            "default": "https://raw.githubusercontent.com/nf-core/test-datasets/airrflow/database-cache/imgtdb_base.zip"
        },
        "reference_igblast": {
            "type": "string",
            "description": "Path to the cached igblast database.",
            "help_text": "By default, we provide a pre-downloaded version of the IMGT database. It is also possible to provide a custom reference fasta database. To fetch a fresh version of IMGT, set the `--fetch_imgt` parameter instead.",
-            "fa_icon": "fas fa-database"
+            "fa_icon": "fas fa-database",
+            "default": "https://raw.githubusercontent.com/nf-core/test-datasets/airrflow/database-cache/igblast_base.zip"
        },
        "fetch_imgt": {
            "type": "boolean",
@@ -478,6 +488,36 @@
            "help_text": "Options for running raw single cell data.",
            "fa_icon": "fab fa-pagelines"
        },
+        "rnaseq_based_analysis_options": {
+            "title": "Unselected RNA-seq based analysis options",
+            "type": "object",
+            "description": "Options specific for raw unselected RNA-seq input.",
+            "default": "",
+            "properties": {
+                "barcode_read": {
+                    "type": "string",
+                    "description": "Specifies which read holds the cell barcodes.",
+                    "enum": ["R1", "R2"],
+                    "fa_icon": "fas fa-terminal",
+                    "help_text": "Read file containing the cell barcodes."
+                },
+                "umi_read": {
+                    "type": "string",
+                    "description": "Indicate whether UMI indices are recorded in the R1 or R2 fastq file.",
+                    "help_text": "Read file containing 10x Genomics-like UMIs.",
+                    "enum": ["R1", "R2"],
+                    "fa_icon": "fas fa-barcode"
+                },
+                "read_format": {
+                    "type": "string",
+                    "description": "Specifies where in the read the barcodes and UMIs can be found.",
+                    "help_text": "For further information see the TRUST4 [docs](https://github.com/liulab-dfci/TRUST4?tab=readme-ov-file#10x-genomics-data-and-barcode-based-single-cell-data).",
+                    "fa_icon": "fas fa-terminal"
+                }
+            },
+            "help_text": "Options for running raw RNA-seq data.",
+            "fa_icon": "fab fa-pagelines"
+        },
        "report_options": {
            "title": "Report options",
            "type": "object",
@@ -790,6 +830,9 @@
        {
            "$ref": "#/definitions/single_cell_analysis_options"
        },
+        {
+            "$ref": "#/definitions/rnaseq_based_analysis_options"
+        },
        {
            "$ref": "#/definitions/institutional_config_options"
        },
@@ -799,5 +842,10 @@
        {
            "$ref": "#/definitions/generic_options"
        }
-    ]
+    ],
+    "properties": {
+        "skip_alignment_filter": {
+            "type": "boolean"
+        }
+    }
}
diff --git a/subworkflows/local/fastq_input_check.nf b/subworkflows/local/fastq_input_check.nf
index e14cfc21..91412b15 100644
--- a/subworkflows/local/fastq_input_check.nf
+++ b/subworkflows/local/fastq_input_check.nf
@@ -29,8 +29,8 @@ workflow FASTQ_INPUT_CHECK {
    ch_versions = SAMPLESHEET_CHECK.out.versions

-    // Merge multi-lane sample fastq for protocols except for 10x genomics (cellranger handles multi-fastq per sample)
-    if (params.library_generation_method == 'sc_10x_genomics') {
+    // Merge multi-lane sample fastq for all protocols except 10x Genomics and TRUST4 (cellranger handles multi-fastq per sample)
+    if (params.library_generation_method == 'sc_10x_genomics' || params.library_generation_method == 'trust4') {
        ch_merged_reads = ch_reads.single.mix( ch_reads.multiple )
@@ -85,10 +85,9 @@ def create_fastq_channels(LinkedHashMap col) {
        }
        array = [ meta, [ file(col.filename_R1), file(col.filename_R2), file(col.filename_I1) ] ]
    } else {
-        array = [ meta, [ file(col.filename_R1), file(col.filename_R2) ] ]
        if (params.index_file) {
-            error "ERROR: --index_file was provided but the index file path is not
specified in the samplesheet!" + error "ERROR: Index file path was provided but the index file path is not specified in the samplesheet!" } } return array diff --git a/subworkflows/local/repertoire_analysis_reporting.nf b/subworkflows/local/repertoire_analysis_reporting.nf index 2a796751..905c2d85 100644 --- a/subworkflows/local/repertoire_analysis_reporting.nf +++ b/subworkflows/local/repertoire_analysis_reporting.nf @@ -30,7 +30,7 @@ workflow REPERTOIRE_ANALYSIS_REPORTING { main: ch_versions = Channel.empty() - if (params.mode == "fastq" && params.library_generation_method != "sc_10x_genomics") { + if (params.mode == "fastq" && params.library_generation_method != "sc_10x_genomics" && params.library_generation_method != "trust4" ) { PARSE_LOGS( ch_presto_filterseq_logs, ch_presto_maskprimers_logs, diff --git a/subworkflows/local/rnaseq_input.nf b/subworkflows/local/rnaseq_input.nf new file mode 100644 index 00000000..6469e6b5 --- /dev/null +++ b/subworkflows/local/rnaseq_input.nf @@ -0,0 +1,135 @@ +include { PREPARE_TRUST4_REFERENCE } from '../../modules/local/prepare_trust4_reference' +include { TRUST4 } from '../../modules/nf-core/trust4/main' +include { FASTQ_INPUT_CHECK } from '../../subworkflows/local/fastq_input_check' +include { CHANGEO_PARSEDB_SELECT_LOCUS } from '../../modules/local/changeo/changeo_parsedb_select_locus' +include { CHANGEO_CONVERTDB_FASTA as CHANGEO_CONVERTDB_FASTA_FROM_AIRR } from '../../modules/local/changeo/changeo_convertdb_fasta' +include { FASTP } from '../../modules/nf-core/fastp/main' +include { RENAME_FASTQ as RENAME_FASTQ_TRUST4 } from '../../modules/local/rename_fastq' + + + +workflow RNASEQ_INPUT { + + take: + ch_input + ch_igblast_reference + + main: + + ch_versions = Channel.empty() + ch_logs = Channel.empty() + + // + // read in samplesheet, validate and stage input fies + // + FASTQ_INPUT_CHECK( + ch_input + ) + ch_versions = ch_versions.mix(FASTQ_INPUT_CHECK.out.versions) + + ch_reads = FASTQ_INPUT_CHECK.out.reads + + + // validate library generation method parameters + if (params.vprimers) { + error "The TRUST4 library generation method does not require V-region primers, please provide a reference file instead or select another library method option." + } else if (params.race_linker) { + error "The TRUST4 10X genomics library generation method does not require the --race_linker parameter, please provide a reference file instead or select another library method option." + } + if (params.cprimers) { + error "The TRUST4 library generation method does not require C-region primers, please provide a reference file instead or select another library method option." + } + if (params.umi_length > 0) { + error "TRUST4 library generation method does not require to set the UMI length, please provide a reference file instead or select another library method option." + } + if (params.reference_10x) { + error "The TRUST4 library generation method does not require this reference, please provide a compliant reference file instead or select another library method option." 
+ } + + // Fastp + save_merged = false + FASTP ( + ch_reads, + [], + [], + save_merged + ) + ch_versions = ch_versions.mix(FASTP.out.versions) + + ch_rename_fastq = FASTP.out.reads.map { meta, reads -> [meta, reads[0], reads[1]] } + + // rename fastp output + RENAME_FASTQ_TRUST4( + ch_rename_fastq + ) + + ch_reads_fastp_filtered = RENAME_FASTQ_TRUST4.out.reads.dump(tag: "fastp_filtered") + + PREPARE_TRUST4_REFERENCE( + ch_reads_fastp_filtered, + ch_igblast_reference + ) + + + // create trust4 input + ch_reads_trust4 = ch_reads_fastp_filtered.map{ meta, read_1, read_2 -> [ meta, [], [read_1, read_2] ] } + + PREPARE_TRUST4_REFERENCE.out.trust4_reference.dump(tag: "trust4_reference") + + ch_reads_trust4.dump(tag: "trust4_input") + + // create barcode and umi channels for nf-core trust4 module + barcode_channel = ch_reads_fastp_filtered.map { meta, read_1, read_2 -> [meta, params.barcode_read] } + umi_channel = ch_reads_fastp_filtered.map { meta, read_1, read_2 -> [meta, params.umi_read] } + + TRUST4( + ch_reads_trust4, + PREPARE_TRUST4_REFERENCE.out.trust4_reference, + Channel.of([[], []]).collect(), + barcode_channel, + umi_channel + ) + + ch_trust4_out = TRUST4.out.outs + + // check whether input is sc or bulk and extract respective airr file for downstream processing + ch_trust4_out + .branch { + meta, out_files -> + bulk : meta["single_cell"] == "false" + return [ meta, out_files.find { it.endsWith("${meta.id}_airr.tsv") } ] + sc : meta["single_cell"] == "true" + return [ meta, out_files.find { it.endsWith("${meta.id}_barcode_airr.tsv") } ] + } + .set { ch_trust4_airr_file } + + + // create channel with airr file + ch_trust4_airr_file.bulk.mix ( ch_trust4_airr_file.sc ).set { ch_trust4_airr } + + // select only provided locus + CHANGEO_PARSEDB_SELECT_LOCUS(ch_trust4_airr) + + + // convert airr tsv to fasta + CHANGEO_CONVERTDB_FASTA_FROM_AIRR( + CHANGEO_PARSEDB_SELECT_LOCUS.out.tab + ) + + ch_fasta = CHANGEO_CONVERTDB_FASTA_FROM_AIRR.out.fasta + + + emit: + versions = ch_versions + // fastp + fastp_reads_json = FASTP.out.json.collect{ meta,json -> json } + fastp_reads_html = FASTP.out.html.collect{ meta,html -> html } + // complete trust4 output + outs = ch_trust4_out + // trust4 airr file + airr = ch_trust4_airr + // trust4 output converted to FASTA format + fasta = ch_fasta + samplesheet = FASTQ_INPUT_CHECK.out.samplesheet + +} diff --git a/subworkflows/local/vdj_annotation.nf b/subworkflows/local/vdj_annotation.nf index 692320ec..18d052bf 100644 --- a/subworkflows/local/vdj_annotation.nf +++ b/subworkflows/local/vdj_annotation.nf @@ -38,19 +38,25 @@ workflow VDJ_ANNOTATION { ch_assigned_tab = CHANGEO_MAKEDB.out.tab ch_assignment_logs = CHANGEO_MAKEDB.out.logs - // Apply quality filters: - // - locus should match v_call chain - // - seq alignment min length informative positions 200 - // - max 10% N nucleotides - FILTER_QUALITY( - ch_assigned_tab - ) - ch_logs = ch_logs.mix(FILTER_QUALITY.out.logs) - ch_versions = ch_versions.mix(FILTER_QUALITY.out.versions) + if (!params.skip_alignment_filter){ + // Apply quality filters: + // - locus should match v_call chain + // - seq alignment min length informative positions 200 + // - max 10% N nucleotides + FILTER_QUALITY( + ch_assigned_tab + ) + ch_for_parsedb_split = FILTER_QUALITY.out.tab + ch_logs = ch_logs.mix(FILTER_QUALITY.out.logs) + ch_versions = ch_versions.mix(FILTER_QUALITY.out.versions) + } else { + ch_for_parsedb_split = ch_assigned_tab + } + if (params.productive_only) { CHANGEO_PARSEDB_SPLIT ( - FILTER_QUALITY.out.tab + 
ch_for_parsedb_split ) ch_logs = ch_logs.mix(CHANGEO_PARSEDB_SPLIT.out.logs) ch_versions = ch_versions.mix(CHANGEO_PARSEDB_SPLIT.out.versions) diff --git a/workflows/airrflow.nf b/workflows/airrflow.nf index 4d228544..9a401da9 100644 --- a/workflows/airrflow.nf +++ b/workflows/airrflow.nf @@ -43,6 +43,7 @@ include { CLONAL_ANALYSIS } from '../subworkflows/local/clonal_ana include { REPERTOIRE_ANALYSIS_REPORTING } from '../subworkflows/local/repertoire_analysis_reporting' include { SC_RAW_INPUT } from '../subworkflows/local/sc_raw_input' include { FASTQ_INPUT_CHECK } from '../subworkflows/local/fastq_input_check' +include { RNASEQ_INPUT } from '../subworkflows/local/rnaseq_input' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -94,24 +95,51 @@ workflow AIRRFLOW { ch_validated_samplesheet = SC_RAW_INPUT.out.samplesheet.collect() - ch_presto_filterseq_logs = Channel.empty() - ch_presto_maskprimers_logs = Channel.empty() - ch_presto_pairseq_logs = Channel.empty() - ch_presto_clustersets_logs = Channel.empty() - ch_presto_buildconsensus_logs = Channel.empty() - ch_presto_postconsensus_pairseq_logs = Channel.empty() - ch_presto_assemblepairs_logs = Channel.empty() - ch_presto_collapseseq_logs = Channel.empty() - ch_presto_splitseq_logs = Channel.empty() - ch_fastp_html = Channel.empty() - ch_fastp_json = Channel.empty() - ch_fastqc_postassembly_mqc = Channel.empty() - } else { - // Perform sequence assembly if input type is fastq from bulk sequencing data - SEQUENCE_ASSEMBLY( - ch_input, - DATABASES.out.igblast.collect() - ) + ch_presto_filterseq_logs = Channel.empty() + ch_presto_maskprimers_logs = Channel.empty() + ch_presto_pairseq_logs = Channel.empty() + ch_presto_clustersets_logs = Channel.empty() + ch_presto_buildconsensus_logs = Channel.empty() + ch_presto_postconsensus_pairseq_logs = Channel.empty() + ch_presto_assemblepairs_logs = Channel.empty() + ch_presto_collapseseq_logs = Channel.empty() + ch_presto_splitseq_logs = Channel.empty() + ch_fastp_html = Channel.empty() + ch_fastp_json = Channel.empty() + ch_fastqc_postassembly_mqc = Channel.empty() + + } else if (params.library_generation_method == "trust4") { + // Extract VDJ sequences from "general" RNA seq data using TRUST4 + + RNASEQ_INPUT ( + ch_input, + DATABASES.out.igblast.collect() + ) + + ch_fasta = RNASEQ_INPUT.out.fasta + ch_versions = ch_versions.mix(RNASEQ_INPUT.out.versions) + + ch_validated_samplesheet = RNASEQ_INPUT.out.samplesheet.collect() + + ch_presto_filterseq_logs = Channel.empty() + ch_presto_maskprimers_logs = Channel.empty() + ch_presto_pairseq_logs = Channel.empty() + ch_presto_clustersets_logs = Channel.empty() + ch_presto_buildconsensus_logs = Channel.empty() + ch_presto_postconsensus_pairseq_logs = Channel.empty() + ch_presto_assemblepairs_logs = Channel.empty() + ch_presto_collapseseq_logs = Channel.empty() + ch_presto_splitseq_logs = Channel.empty() + ch_fastp_html = RNASEQ_INPUT.out.fastp_reads_html + ch_fastp_json = RNASEQ_INPUT.out.fastp_reads_json + ch_fastqc_postassembly_mqc = Channel.empty() + } + else { + // Perform sequence assembly if input type is fastq from bulk sequencing data + SEQUENCE_ASSEMBLY( + ch_input, + DATABASES.out.igblast.collect() + ) ch_fasta = SEQUENCE_ASSEMBLY.out.fasta ch_versions = ch_versions.mix(SEQUENCE_ASSEMBLY.out.versions)
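A minimal sketch of how the new TRUST4 path can be exercised end-to-end from a checkout of this branch, using the test profiles added above (assumes the standard `docker` profile is available; output directory names are arbitrary):

```bash
# Bulk RNA-seq test profile (conf/test_rnaseq_bulk.config)
nextflow run . -profile test_rnaseq_bulk,docker --outdir results_rnaseq_bulk

# Single-cell RNA-seq test profile (conf/test_rnaseq_sc.config)
nextflow run . -profile test_rnaseq_sc,docker --outdir results_rnaseq_sc

# The new --skip_alignment_filter boolean (nextflow_schema.json, vdj_annotation.nf)
# can be appended to either command to bypass the alignment quality filter.
```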