diff --git a/.nf-core.yml b/.nf-core.yml
index b37758c0..c1acdeb5 100644
--- a/.nf-core.yml
+++ b/.nf-core.yml
@@ -1,5 +1,5 @@
lint:
- files_exist: false
+ files_exist:
- assets/nf-core-tomte_logo_light.png
- docs/images/nf-core-tomte_logo_light.png
- docs/images/nf-core-tomte_logo_dark.png
@@ -8,7 +8,7 @@ lint:
- docs/images/tomte_pipeline_metromap.png
- conf/modules.config
- .github/ISSUE_TEMPLATE/config.yml
- files_unchanged: false
+ files_unchanged:
- assets/sendmail_template.txt
- .github/CONTRIBUTING.md
- .github/ISSUE_TEMPLATE/bug_report.yml
diff --git a/CHANGELOG.md b/CHANGELOG.md
index e931e4cb..feec0b23 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,23 +3,41 @@
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
-## 2.2.1 - Scrooge [2024-08-28]
+## X.X.X - [XXXX-XX-XX]
+
+### `Added`
+
+- Functionality to create DROP databases and to add samples to existing ones [#147](https://github.com/genomic-medicine-sweden/tomte/pull/147)
### `Fixed`
-- After an update, MultiQC was not outputing data for RnaSeqMetrics so an earlier version will be used [#156](https://github.com/genomic-medicine-sweden/tomte/pull/156)
+### `Parameters`
### `Changed`
-- Downgraded multiqc version [#156](https://github.com/genomic-medicine-sweden/tomte/pull/156)
- Updated modules ensemblvep/filtervep, ensemblvep/vep [#159](https://github.com/genomic-medicine-sweden/tomte/pull/159)
- Updated gencode version from 37 to 46 [#159](https://github.com/genomic-medicine-sweden/tomte/pull/159)
+- Updated the modules that use DROP: drop_config_runAE, drop_config_runAS, drop_sample_annot, and drop_filter_results [#147](https://github.com/genomic-medicine-sweden/tomte/pull/147)
| Tool | Old version | New version |
| -------------------- | ----------- | ----------- |
-| multiqc | 1.24.1 | 1.21 |
| ensemblvep/filtervep | 110 | 112 |
| ensemblvep/vep | 110 | 112 |
+| DROP | 1.3.3 | 1.4.0 |
+
+## 2.2.1 - Scrooge [2024-08-28]
+
+### `Fixed`
+
+- After an update, MultiQC was not outputting data for RnaSeqMetrics so an earlier version will be used [#156](https://github.com/genomic-medicine-sweden/tomte/pull/156)
+
+### `Changed`
+
+- Downgraded multiqc version [#156](https://github.com/genomic-medicine-sweden/tomte/pull/156)
+
+| Tool | Old version | New version |
+| ------- | ----------- | ----------- |
+| multiqc | 1.24.1 | 1.21 |
## 2.2.0 - TioDeNadal [2024-08-27]
diff --git a/bin/drop_config.py b/bin/drop_config.py
index 09abd544..856bf06a 100755
--- a/bin/drop_config.py
+++ b/bin/drop_config.py
@@ -6,7 +6,7 @@
from typing import Dict, Any
from copy import deepcopy
-SCRIPT_VERSION = "v1.0"
+SCRIPT_VERSION = "v2.1"
CONFIG_YAML = {
"projectTitle": "DROP: Detection of RNA Outliers Pipeline",
"root": None,
@@ -86,6 +86,7 @@ def update_config(
gtf: Path,
genome_assembly: str,
drop_group_samples: str,
+ drop_other_group_samples: str,
padjcutoff: float,
zscorecutoff: float,
drop_module: str,
@@ -99,13 +100,14 @@ def update_config(
config_copy: Dict[str, Any] = deepcopy(CONFIG_YAML)
config_copy["genome"] = genome_name
- config_copy["root"] = "output"
+ config_copy["root"] = str(Path.cwd() / "output")
config_copy["htmlOutputPath"] = "output/html"
config_copy["sampleAnnotation"] = "sample_annotation.tsv"
config_copy["geneAnnotation"][gtf_without_ext] = str(gtf)
config_copy["geneAnnotation"].pop("gtf", None)
config_copy["exportCounts"]["geneAnnotations"] = [gtf_without_ext]
config_copy["genomeAssembly"] = genome_assembly
+ config_copy["exportCounts"]["excludeGroups"] = [drop_other_group_samples]
# Export counts
if drop_module == "AE":
@@ -165,6 +167,12 @@ def parse_args(argv=None):
help="Specify drop group to analyse",
required=True,
)
+ parser.add_argument(
+ "--drop_other_group_samples",
+ type=str,
+ help="Specify the drop group to exclude in exportCounts",
+ required=True,
+ )
parser.add_argument(
"--padjcutoff",
type=float,
@@ -200,6 +208,7 @@ def main():
gtf=args.gtf,
genome_assembly=args.genome_assembly,
drop_group_samples=args.drop_group_samples,
+ drop_other_group_samples=args.drop_other_group_samples,
padjcutoff=args.padjcutoff,
zscorecutoff=args.zscorecutoff,
drop_module=args.drop_module,
diff --git a/bin/drop_sample_annot.py b/bin/drop_sample_annot.py
index 31f0602f..38a77f46 100755
--- a/bin/drop_sample_annot.py
+++ b/bin/drop_sample_annot.py
@@ -2,10 +2,10 @@
import argparse
import csv
-from pandas import read_csv, DataFrame, concat
+from pandas import read_csv, DataFrame, concat, isna
import os
-SCRIPT_VERSION = "v1.0"
+SCRIPT_VERSION = "v1.1"
SAMPLE_ANNOTATION_COLUMNS = [
"RNA_ID",
"RNA_BAM_FILE",
@@ -25,19 +25,24 @@
def write_sample_annotation_to_tsv(
- bam: str, samples: str, strandedness: str, single_end: str, drop_group_sample: str, out_file: str
+ bam: str,
+ samples: str,
+ strandedness: str,
+ single_end: str,
+ drop_group_sample: str,
+ out_file: str,
):
"""Write the Sample Annotation tsv file."""
with open(out_file, "w") as tsv_file:
- writer = csv.DictWriter(tsv_file, fieldnames=SAMPLE_ANNOTATION_COLUMNS, delimiter="\t")
+ writer = csv.DictWriter(
+ tsv_file, fieldnames=SAMPLE_ANNOTATION_COLUMNS, delimiter="\t"
+ )
writer.writeheader()
for index, id in enumerate(samples):
sa_dict: dict = {}.fromkeys(SAMPLE_ANNOTATION_COLUMNS, "NA")
sa_dict["RNA_ID"] = id
sa_dict["DROP_GROUP"] = drop_group_sample
- sa_dict["GENE_COUNTS_FILE"] = "NA"
- sa_dict["GENE_ANNOTATION"] = "NA"
- sa_dict["STRAND"] = strandedness[index]
+ sa_dict["STRAND"] = is_stranded(strandedness[index])
sa_dict["PAIRED_END"] = is_paired_end(single_end[index])
sa_dict["RNA_BAM_FILE"] = bam[index]
writer.writerow(sa_dict)
@@ -50,24 +55,74 @@ def is_paired_end(single_end: str) -> bool:
return False
+def is_stranded(strandedness: str) -> str:
+ """Logical function to determine sample strandedness"""
+ if strandedness.lower() == "reverse":
+ return "reverse"
+ elif strandedness.lower() == "forward":
+ return "yes"
+ else:
+ return "no"
+
+
+def count_mode(sample_count_mode: str) -> str:
+ """Logical function to determine if count mode is given or default "IntersectionStrict" should be used"""
+ if isna(sample_count_mode) or sample_count_mode == "" or sample_count_mode == "NA":
+ return "IntersectionStrict"
+ else:
+ return sample_count_mode
+
+
+def count_overlaps(sample_count_overlap: str) -> str:
+ """Logical function to determine if count overlap is given or default "TRUE" should be used"""
+ if (
+ isna(sample_count_overlap)
+ or sample_count_overlap == ""
+ or sample_count_overlap == "NA"
+ ):
+ return True
+ else:
+ return sample_count_overlap
+
+
def write_final_annot_to_tsv(ref_count_file: str, ref_annot: str, out_file: str):
"""
Concatenates the Sample Annotation produced by SampleAnnotation with the one
- provided for the reference samples, checking for duplicate sample IDs
+ provided for the reference samples, if one is provided, checking for duplicate sample IDs
"""
df_samples: DataFrame = read_csv("drop_annotation_given_samples.tsv", sep="\t")
- df_reference: DataFrame = read_csv(ref_annot, sep="\t")
- df_reference["GENE_COUNTS_FILE"] = ref_count_file
- df_reference["SPLICE_COUNTS_DIR"] = df_reference["SPLICE_COUNTS_DIR"].str.rstrip("/").apply(os.path.basename)
- df_reference["DROP_GROUP"] = df_reference["DROP_GROUP"].str.replace(" ", "")
- df_samples["COUNT_OVERLAPS"] = df_reference["COUNT_OVERLAPS"].iloc[0]
- df_samples["COUNT_MODE"] = df_reference["COUNT_MODE"].iloc[0]
- df_samples["HPO_TERMS"] = df_reference["HPO_TERMS"].iloc[0]
- for id in df_samples["RNA_ID"]:
- df_reference = df_reference[df_reference["RNA_ID"].str.contains(id) == False]
- df: DataFrame = concat([df_samples, df_reference]).reset_index(drop=True)
- df.fillna("NA", inplace=True)
- df.to_csv(out_file, index=False, sep="\t")
+ if ref_annot == "None" or ref_count_file == "None":
+ print(
+ "No reference samples were provided by the user see usage of --ref_count_file and --ref_annot if you want to provide reference samples"
+ )
+ if df_samples.shape[0] < 50:
+ print(
+ "At least 30 samples are required for Aberrant Splicing and 50 for Aberrant expression"
+ )
+ print(f"Only {df_samples.shape[0]} samples were provided by the user")
+ df_samples.fillna("NA", inplace=True)
+ df_samples["COUNT_MODE"] = "IntersectionStrict"
+ df_samples["COUNT_OVERLAPS"] = True
+ df_samples.to_csv(out_file, index=False, sep="\t")
+ else:
+ df_reference: DataFrame = read_csv(ref_annot, sep="\t")
+ df_reference["GENE_COUNTS_FILE"] = ref_count_file
+ df_reference["SPLICE_COUNTS_DIR"] = (
+ df_reference["SPLICE_COUNTS_DIR"].str.rstrip("/").apply(os.path.basename)
+ )
+ df_reference["DROP_GROUP"] = df_reference["DROP_GROUP"].str.replace(" ", "")
+ df_samples["COUNT_OVERLAPS"] = count_overlaps(
+ df_reference["COUNT_OVERLAPS"].iloc[0]
+ )
+ df_samples["COUNT_MODE"] = count_mode(df_reference["COUNT_MODE"].iloc[0])
+ df_samples["HPO_TERMS"] = df_reference["HPO_TERMS"].iloc[0]
+ for id in df_samples["RNA_ID"]:
+ df_reference = df_reference[
+ df_reference["RNA_ID"].str.contains(id) == False
+ ]
+ df: DataFrame = concat([df_samples, df_reference]).reset_index(drop=True)
+ df.fillna("NA", inplace=True)
+ df.to_csv(out_file, index=False, sep="\t")
def parse_args(argv=None):
@@ -76,15 +131,51 @@ def parse_args(argv=None):
formatter_class=argparse.MetavarTypeHelpFormatter,
description="""Generate DROP sample annotation for patients.""",
)
- parser.add_argument("--bam", type=str, nargs="+", help="bam files for the analyzed samples", required=True)
- parser.add_argument("--samples", type=str, nargs="+", help="corresponding sample name", required=True)
- parser.add_argument("--strandedness", type=str, nargs="+", help="strandedness of RNA", required=True)
- parser.add_argument("--single_end", type=str, nargs="+", help="is the sample paired end?", required=True)
parser.add_argument(
- "--ref_count_file", type=str, help="A tsv file of gene counts for reference samples.", required=True
+ "--bam",
+ type=str,
+ nargs="+",
+ help="bam files for the analyzed samples",
+ required=True,
+ )
+ parser.add_argument(
+ "--samples",
+ type=str,
+ nargs="+",
+ help="corresponding sample name",
+ required=True,
+ )
+ parser.add_argument(
+ "--strandedness", type=str, nargs="+", help="strandedness of RNA", required=True
+ )
+ parser.add_argument(
+ "--single_end",
+ type=str,
+ nargs="+",
+ help="is the sample paired end?",
+ required=True,
+ )
+ parser.add_argument(
+ "--ref_count_file",
+ type=str,
+ default="None",
+ help="A tsv file of gene counts for reference samples.",
+ required=False,
+ )
+ parser.add_argument(
+ "--ref_annot",
+ type=str,
+ default="None",
+ help="Path to reference annotation tsv",
+ required=False,
+ )
+ parser.add_argument(
+ "--drop_group_sample",
+ type=str,
+ default="None",
+ help="Drop group of analyzed samples",
+ required=False,
)
- parser.add_argument("--ref_annot", type=str, help="Path to reference annotation tsv", required=True)
- parser.add_argument("--drop_group_sample", type=str, help="Drop group of analyzed samples", required=True)
parser.add_argument("--output", type=str, help="Path to save to", required=True)
parser.add_argument("--version", action="version", version=SCRIPT_VERSION)
return parser.parse_args(argv)
@@ -101,7 +192,12 @@ def main():
drop_group_sample=args.drop_group_sample,
out_file="drop_annotation_given_samples.tsv",
)
- write_final_annot_to_tsv(ref_count_file=args.ref_count_file, ref_annot=args.ref_annot, out_file=args.output)
+
+ write_final_annot_to_tsv(
+ ref_count_file=args.ref_count_file,
+ ref_annot=args.ref_annot,
+ out_file=args.output,
+ )
if __name__ == "__main__":
diff --git a/conf/base.config b/conf/base.config
index ed50b00c..47ea66d9 100644
--- a/conf/base.config
+++ b/conf/base.config
@@ -62,4 +62,17 @@ process {
errorStrategy = 'retry'
maxRetries = 2
}
+ if (!params.skip_export_counts_drop) {
+ withLabel:process_drop {
+ cpus = { check_max( 36 * task.attempt, 'cpus' ) }
+ memory = { check_max( 144.GB , 'memory' ) }
+ time = { check_max( 48.h * task.attempt, 'time' ) }
+ }
+ } else {
+ withLabel:process_drop {
+ cpus = { check_max( 12 * task.attempt, 'cpus' ) }
+ memory = { check_max( 72.GB * task.attempt, 'memory' ) }
+ time = { check_max( 16.h * task.attempt, 'time' ) }
+ }
+ }
}
diff --git a/docs/usage.md b/docs/usage.md
index 721fc425..ef53022e 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -225,16 +225,17 @@ The mandatory and optional parameters for each category are tabulated below.
DROP - aberrant expression
-| Mandatory | Optional |
-| ------------------------------------- | --------------------------------- |
-| reference_drop_annot_file1 | skip_drop_ae2 |
-| reference_drop_count_file | drop_group_samples_ae3 |
-| fasta | drop_padjcutoff_ae4 |
-| gtf | drop_zscorecutoff5 |
-| | gene_panel_clinical_filter |
-| | skip_downsample6 |
-| | num_reads7 |
-| | genome8 |
+| Mandatory | Optional |
+| ------------------------------------- | ----------------------------------- |
+| reference_drop_annot_file1 | skip_drop_ae2 |
+| reference_drop_count_file | drop_group_samples_ae3 |
+| fasta | drop_padjcutoff_ae4 |
+| gtf | drop_zscorecutoff5 |
+| | gene_panel_clinical_filter |
+| | skip_downsample6 |
+| | num_reads7 |
+| | genome8 |
+| | skip_export_counts_drop9 |
1 To get more information on how to format it, see below
2 If it is not provided by the user, the default value is false
@@ -244,18 +245,20 @@ DROP - aberrant expression
6 If it is not provided by the user, the default value is false
7 If it is not provided by the user, the default value is 120000000
8 If it is not provided by the user, the default value is GRCh38
+9 If it is not provided by the user, the default value is true
DROP - aberrant splicing
-| Mandatory | Optional |
-| ------------------------------------- | --------------------------------- |
-| reference_drop_annot_file1 | skip_drop_as2 |
-| reference_drop_splice_folder | drop_group_samples_as3 |
-| | drop_padjcutoff_as4 |
-| | gene_panel_clinical_filter |
-| | skip_downsample5 |
-| | num_reads6 |
-| | genome7 |
+| Mandatory | Optional |
+| ------------------------------------- | ----------------------------------- |
+| reference_drop_annot_file1 | skip_drop_as2 |
+| reference_drop_splice_folder | drop_group_samples_as3 |
+| | drop_padjcutoff_as4 |
+| | gene_panel_clinical_filter |
+| | skip_downsample5 |
+| | num_reads6 |
+| | genome7 |
+| | skip_export_counts_drop8 |
1 To get more information on how to format it, see below
2 If it is not provided by the user, the default value is false
@@ -264,16 +267,32 @@ DROP - aberrant splicing
5 If it is not provided by the user, the default value is false
6 If it is not provided by the user, the default value is 120000000
7 If it is not provided by the user, the default value is GRCh38
+8 If it is not provided by the user, the default value is true
##### Preparing input for DROP
If you want to run [DROP](https://github.com/gagneurlab/drop) aberrant expression or aberrant splicing you have to provide reference counts, splice counts and a sample sheet. The sample sheet should contain the columns as those in the [test sample annotation](../test_data/drop_data/sampleAnnotation.tsv), you do not need to include the samples you are running through the pipeline in the sample sheet.
-To obtain the gene counts and splice counts you will have to download the counts from one of the [available databases](https://github.com/gagneurlab/drop#datasets) or run drop locally with your own samples. If you choose the second option, you should start by runnig the module(s) you want to export counts for. Afterwards, you need to run the exportCounts module. Make sure that your config has only the modules you want to export and have already run as , that only existing groups are mentioned in the config, and that exportCounts excludGroups is null or contains a group of samples you want to exclude. Finally, run:
+###### Preparing your DROP control database
-```console
-snakemake exportCounts --cores 1
-```
+You have several options on how to create such a database. You can either build it or download it from one of the [available databases](https://github.com/gagneurlab/drop#datasets).
+
+To build your own database you will need at least 50 samples for aberrant expression; if you only run aberrant splicing, 30 samples will suffice, but the DROP authors recommend having at least around 100 for both modules. You can use Tomte to build your own database; to do so, we recommend running it with the following parameters:
+
+- `--skip_export_counts_drop false` this switch will ensure that a folder called exported_counts is created
+- `--skip_drop_as false` if you want to get a database for aberrant splicing
+- `--skip_drop_ae false` if you want to get a database for aberrant expression
+- `--skip_subsample_region false` if you have sequenced any material with overrepresented regions (such as hemoglobin in whole blood) we recommend to remove it by setting this parameter to false and providing a bed with the overrepresented region with `--subsample_bed`
+- `--skip_downsample false` if you have very deeply sequenced samples, we recommend to downsample, the default is 60M read pairs
+- `--skip_build_tracks true`, `--skip_stringtie true`, `--skip_vep true` as most users will be interested in getting the database rather than other downstream results
+
+Running DROP with many samples requires a lot of time and memory, which is why we recommend subsampling overrepresented regions and downsampling if you have deeply sequenced samples. If your run fails for either of these reasons, try to relaunch it from the work directory where DROP was run so that DROP continues from the point where it failed (if you restart the pipeline with `-resume`, it will begin from the start and will likely fail in the same way).
+
+To restart DROP, start by finding the work directory where it was run. You can do so by opening the execution trace file in the pipeline_info folder and looking at the hash of the processes named `TOMTE:ANALYSE_TRANSCRIPTS:DROP_CONFIG_RUN_AE` and `TOMTE:ANALYSE_TRANSCRIPTS:DROP_CONFIG_RUN_AS`. The work directory used to run the pipeline, followed by the hash, should be enough information to find the folder where DROP was run. Tomte should have set everything up in that directory, so go into it and restart the run by executing the script `.command.sh` from within the container created by Tomte. If you want to run it with Slurm, remember to add a header specifying the number of cores, the time limit, etc.
+
+If you want to add samples to an existing database, follow the same steps described above, making sure that you also provide the database you want to add samples to by using `--reference_drop_annot_file` and `--reference_drop_count_file` and/or `--reference_drop_splice_folder`. In this scenario, make sure that the same references were used for the database as for the new set of samples.
+
+If you prefer to run DROP locally, outside of Tomte, follow the instructions given by the [authors of DROP](https://github.com/gagneurlab/drop).
## Running the pipeline
diff --git a/modules/local/drop_config_runAE.nf b/modules/local/drop_config_runAE.nf
index 16eb7c41..e36c200f 100644
--- a/modules/local/drop_config_runAE.nf
+++ b/modules/local/drop_config_runAE.nf
@@ -1,13 +1,13 @@
process DROP_CONFIG_RUN_AE {
tag "DROP_CONFIG_RUN_AE"
- label 'process_high'
+ label 'process_drop'
// Exit if running this module with -profile conda / -profile mamba
if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) {
exit 1, "Local DROP module does not support Conda. Please use Docker / Singularity / Podman instead."
}
- container "docker.io/clinicalgenomics/drop:1.3.3"
+ container "docker.io/clinicalgenomics/drop:1.4.0"
input:
tuple val(meta), path(fasta), path(fai)
@@ -18,14 +18,17 @@ process DROP_CONFIG_RUN_AE {
path ref_splice_folder
val(genome)
val(drop_group_samples_ae)
+ val(drop_group_samples_as)
val(drop_padjcutoff_ae)
val(drop_zScoreCutoff)
+ val(skip_export_counts_drop)
output:
path('config.yaml') , emit: config_drop
path('output') , emit: drop_ae_out
path('OUTRIDER_results_all.Rds'), emit: drop_ae_rds
path('gene_name_mapping*') , emit: drop_gene_name
+ path('exported_counts') , emit: gene_counts_ae, optional: true
path "versions.yml" , emit: versions
when:
@@ -35,6 +38,7 @@ process DROP_CONFIG_RUN_AE {
def args = task.ext.args ?: ''
def genome_assembly = "${genome}".contains("h37") ? "hg19" : "${genome}"
def drop_group = "${drop_group_samples_ae}".replace(" ","")
+ def drop_other_group_samples = "${drop_group_samples_as}".replace(" ","")
def zscorecutoff = drop_zScoreCutoff ? "--zscorecutoff ${drop_zScoreCutoff}" : ''
"""
@@ -49,12 +53,20 @@ process DROP_CONFIG_RUN_AE {
--drop_module AE \\
--genome_assembly $genome_assembly \\
--drop_group_samples $drop_group \\
+ --drop_other_group_samples $drop_other_group_samples \\
--padjcutoff ${drop_padjcutoff_ae} \\
$zscorecutoff \\
--output config.yaml
snakemake aberrantExpression --cores ${task.cpus} --rerun-triggers mtime $args
+ if [[ $skip_export_counts_drop == false ]]; then
+ snakemake exportCounts --cores 1
+ mkdir -p exported_counts
+ cp sample_annotation.tsv exported_counts/.
+ cp output/processed_results/exported_counts/*/geneCounts.tsv.gz exported_counts/.
+ fi
+
cp output/processed_results/aberrant_expression/*/outrider/outrider/OUTRIDER_results_all.Rds .
cp output/processed_data/preprocess/*/gene_name_mapping_*.tsv .
@@ -71,6 +83,9 @@ process DROP_CONFIG_RUN_AE {
touch OUTRIDER_results_all.Rds
touch gene_name_mapping_.tsv
mkdir output
+ if [[ $skip_export_counts_drop == false ]]; then
+ mkdir exported_counts
+ fi
cat <<-END_VERSIONS > versions.yml
"${task.process}":
diff --git a/modules/local/drop_config_runAS.nf b/modules/local/drop_config_runAS.nf
index 9c8a650d..a90a6e97 100644
--- a/modules/local/drop_config_runAS.nf
+++ b/modules/local/drop_config_runAS.nf
@@ -1,13 +1,13 @@
process DROP_CONFIG_RUN_AS {
tag "DROP_CONFIG_RUN_AS"
- label 'process_high'
+ label 'process_drop'
// Exit if running this module with -profile conda / -profile mamba
if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) {
exit 1, "Local DROP module does not support Conda. Please use Docker / Singularity / Podman instead."
}
- container "docker.io/clinicalgenomics/drop:1.3.3"
+ container "docker.io/clinicalgenomics/drop:1.4.0"
input:
tuple val(meta), path(fasta), path(fai)
@@ -18,13 +18,16 @@ process DROP_CONFIG_RUN_AS {
path ref_splice_folder
val(genome)
val(drop_group_samples_as)
+ val(drop_group_samples_ae)
val(drop_padjcutoff_as)
+ val(skip_export_counts_drop)
output:
path('config.yaml') , emit: config_drop
path('output') , emit: drop_as_out
path('FRASER_results_fraser--*'), emit: drop_as_tsv
path('gene_name_mapping*') , emit: drop_gene_name
+ path('exported_counts') , emit: gene_counts_as, optional: true
path "versions.yml" , emit: versions
when:
@@ -34,6 +37,7 @@ process DROP_CONFIG_RUN_AS {
def args = task.ext.args ?: ''
def genome_assembly = "${genome}".contains("h37") ? "hg19" : "${genome}"
def drop_group = "${drop_group_samples_as}".replace(" ","")
+ def drop_other_group_samples = "${drop_group_samples_ae}".replace(" ","")
"""
TMPDIR=\$PWD
HOME=\$PWD
@@ -47,11 +51,19 @@ process DROP_CONFIG_RUN_AS {
--drop_module AS \\
--genome_assembly $genome_assembly \\
--drop_group_samples $drop_group \\
+ --drop_other_group_samples $drop_other_group_samples \\
--padjcutoff ${drop_padjcutoff_as} \\
--output config.yaml
snakemake aberrantSplicing --cores ${task.cpus} --rerun-triggers mtime $args
+ if [[ $skip_export_counts_drop == false ]]; then
+ snakemake exportCounts --cores 1
+ mkdir -p exported_counts
+ cp sample_annotation.tsv exported_counts/.
+ cp output/processed_results/exported_counts/*/*.gz exported_counts/.
+ fi
+
cp output/html/AberrantSplicing/FRASER_results_fraser--*.tsv .
cp output/processed_data/preprocess/*/gene_name_mapping_*.tsv .
@@ -68,6 +80,9 @@ process DROP_CONFIG_RUN_AS {
touch FRASER_results_fraser--.tsv
touch gene_name_mapping_.tsv
mkdir output
+ if [[ $skip_export_counts_drop == false ]]; then
+ mkdir exported_counts
+ fi
cat <<-END_VERSIONS > versions.yml
"${task.process}":
diff --git a/modules/local/drop_filter_results.nf b/modules/local/drop_filter_results.nf
index 3f4e5cd0..e893757e 100644
--- a/modules/local/drop_filter_results.nf
+++ b/modules/local/drop_filter_results.nf
@@ -7,7 +7,7 @@ process DROP_FILTER_RESULTS {
exit 1, "Local DROP module does not support Conda. Please use Docker / Singularity / Podman instead."
}
- container "docker.io/clinicalgenomics/drop:1.3.3"
+ container "docker.io/clinicalgenomics/drop:1.4.0"
input:
val(case_info)
diff --git a/modules/local/drop_sample_annot.nf b/modules/local/drop_sample_annot.nf
index 248d8b19..371fe8e8 100644
--- a/modules/local/drop_sample_annot.nf
+++ b/modules/local/drop_sample_annot.nf
@@ -1,5 +1,5 @@
process DROP_SAMPLE_ANNOT {
- tag "DROP_sample_annot"
+ tag "DROP_annot_file"
label 'process_low'
// Exit if running this module with -profile conda / -profile mamba
@@ -7,11 +7,10 @@ process DROP_SAMPLE_ANNOT {
exit 1, "Local DROP module does not support Conda. Please use Docker / Singularity / Podman instead."
}
- container "docker.io/clinicalgenomics/drop:1.3.3"
+ container "docker.io/clinicalgenomics/drop:1.4.0"
input:
- path(bam)
- val(samples)
+ tuple val(ids), val(single_ends), val(strandednesses), path(bam), path(bai)
path(ref_gene_counts)
path(ref_annot)
val(drop_group_samples_ae)
@@ -25,18 +24,20 @@ process DROP_SAMPLE_ANNOT {
task.ext.when == null || task.ext.when
script:
- def ids = "${samples.id}".replace("[","").replace("]","").replace(",","")
- def strandedness = "${samples.strandedness}".replace("[","").replace("]","").replace(",","")
- def single_end = "${samples.single_end}".replace("[","").replace("]","").replace(",","")
+ def id = "${ids}".replace("[","").replace("]","").replace(",","")
+ def single_end = "${single_ends}".replace("[","").replace("]","").replace(",","")
+ def strandedness = "${strandednesses}".replace("[","").replace("]","").replace(",","")
def drop_group = "${drop_group_samples_ae},${drop_group_samples_as}".replace(" ","").replace("[","").replace("]","")
+ def reference_count_file = ref_gene_counts ? "--ref_count_file ${ref_gene_counts}" : ''
+ def reference_annotation = ref_annot ? "--ref_annot ${ref_annot}" : ''
"""
$baseDir/bin/drop_sample_annot.py \\
--bam ${bam} \\
- --samples $ids \\
+ --samples $id \\
--strandedness $strandedness \\
--single_end $single_end \\
- --ref_count_file ${ref_gene_counts} \\
- --ref_annot ${ref_annot} \\
+ $reference_count_file \\
+ $reference_annotation \\
--drop_group_sample $drop_group \\
--output sample_annotation.tsv
diff --git a/nextflow.config b/nextflow.config
index 036b8884..99597edc 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -50,6 +50,7 @@ params {
skip_stringtie = false
skip_drop_ae = false
skip_drop_as = false
+ skip_export_counts_drop = true
drop_group_samples_ae = 'outrider'
drop_group_samples_as = 'fraser'
drop_padjcutoff_ae = 0.05
diff --git a/nextflow_schema.json b/nextflow_schema.json
index ba8bcbfd..f2cbcec1 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -311,6 +311,12 @@
"description": "Skip DROP Aberrant Splicing module",
"fa_icon": "fas fa-toggle-off"
},
+ "skip_export_counts_drop": {
+ "type": "boolean",
+ "fa_icon": "fas fa-toggle-on",
+ "default": true,
+ "description": "Skip exporting counts from DROP. When not skipped, counts are exported for the DROP modules that were run. Read usage for further information."
+ },
"drop_group_samples_ae": {
"type": "string",
"default": "outrider",
diff --git a/subworkflows/local/analyse_transcripts.nf b/subworkflows/local/analyse_transcripts.nf
index 0bfdd250..2a9f4951 100644
--- a/subworkflows/local/analyse_transcripts.nf
+++ b/subworkflows/local/analyse_transcripts.nf
@@ -28,6 +28,7 @@ workflow ANALYSE_TRANSCRIPTS {
ch_gene_panel_clinical_filter // channel: [optional] [ path(tsv) ]
case_info // channel: [optional] [ val(case_id) ]
skip_drop_ae // parameter: [mandatory] default: 'false'
+ skip_export_counts_drop // parameter: [mandatory] default: 'true'
main:
ch_versions = Channel.empty()
@@ -36,13 +37,20 @@ workflow ANALYSE_TRANSCRIPTS {
// Generates count files for samples and merges them with reference count file
// Generates sample annotation
- star_samples = gene_counts.map{ meta, cnt_file -> meta }.collect()
ch_bam_files = ch_bam_ds_bai.collect{it[1]}
+
+ ch_bam_ds_bai
+ .map { meta, bam, bai ->
+ [ meta.id, meta.single_end, meta.strandedness, bam, bai ]
+ }
+ .collect(flat:false)
+ .map { it.transpose() }
+ .set { ch_bam_files_annot }
+
DROP_SAMPLE_ANNOT(
- ch_bam_files,
- star_samples,
- ch_ref_drop_count_file,
- ch_ref_drop_annot_file,
+ ch_bam_files_annot,
+ ch_ref_drop_count_file.ifEmpty([]),
+ ch_ref_drop_annot_file.ifEmpty([]),
drop_group_samples_ae,
drop_group_samples_as
)
@@ -55,12 +63,14 @@ workflow ANALYSE_TRANSCRIPTS {
ch_gtf,
DROP_SAMPLE_ANNOT.out.drop_annot,
ch_bam_bai_files,
- ch_ref_drop_count_file,
- ch_ref_drop_splice_folder,
+ ch_ref_drop_count_file.ifEmpty([]),
+ ch_ref_drop_splice_folder.ifEmpty([]),
genome,
drop_group_samples_ae,
+ drop_group_samples_as,
drop_padjcutoff_ae,
- drop_zscorecutoff
+ drop_zscorecutoff,
+ skip_export_counts_drop
)
// Generates config file and runs Aberrant splicing module
@@ -69,11 +79,13 @@ workflow ANALYSE_TRANSCRIPTS {
ch_gtf,
DROP_SAMPLE_ANNOT.out.drop_annot,
ch_bam_bai_files,
- ch_ref_drop_count_file,
- ch_ref_drop_splice_folder,
+ ch_ref_drop_count_file.ifEmpty([]),
+ ch_ref_drop_splice_folder.ifEmpty([]),
genome,
drop_group_samples_as,
- drop_padjcutoff_as
+ drop_group_samples_ae,
+ drop_padjcutoff_as,
+ skip_export_counts_drop
)
ch_out_drop_ae_rds = DROP_CONFIG_RUN_AE.out.drop_ae_rds ? DROP_CONFIG_RUN_AE.out.drop_ae_rds.collect()
diff --git a/test_data/vep_to_download.csv b/test_data/vep_to_download.csv
index 31df0439..d34ac75f 100644
--- a/test_data/vep_to_download.csv
+++ b/test_data/vep_to_download.csv
@@ -1,4 +1,4 @@
name,path_for_wget
-clinvar_20240806.vcf.gz,ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/weekly/clinvar.vcf.gz
-clinvar_20240806.vcf.gz.tbi,ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/weekly/clinvar.vcf.gz.tbi
+clinvar_20240902.vcf.gz,ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/weekly/clinvar_20240902.vcf.gz
+clinvar_20240902.vcf.gz.tbi,ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/weekly/clinvar_20240902.vcf.gz.tbi
homo_sapiens_merged,https://ftp.ensembl.org/pub/release-112/variation/indexed_vep_cache/homo_sapiens_merged_vep_112_GRCh38.tar.gz
diff --git a/test_data/vep_to_download_37.csv b/test_data/vep_to_download_37.csv
index dc2d4e24..793d1d1a 100644
--- a/test_data/vep_to_download_37.csv
+++ b/test_data/vep_to_download_37.csv
@@ -1,4 +1,4 @@
name,path_for_wget
-clinvar_20240806.vcf.gz,ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh37/weekly/clinvar.vcf.gz
-clinvar_20240806.vcf.gz.tbi,ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh37/weekly/clinvar.vcf.gz.tbi
+clinvar_20240902.vcf.gz,ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh37/weekly/clinvar_20240902.vcf.gz
+clinvar_20240902.vcf.gz.tbi,ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh37/weekly/clinvar_20240902.vcf.gz.tbi
homo_sapiens_merged,https://ftp.ensembl.org/pub/release-112/variation/indexed_vep_cache/homo_sapiens_merged_vep_112_GRCh37.tar.gz
diff --git a/workflows/tomte.nf b/workflows/tomte.nf
index fa25d1e1..ed2172b7 100644
--- a/workflows/tomte.nf
+++ b/workflows/tomte.nf
@@ -176,7 +176,8 @@ workflow TOMTE {
params.drop_zscorecutoff,
ch_gene_panel_clinical_filter,
ch_case_info,
- params.skip_drop_ae
+ params.skip_drop_ae,
+ params.skip_export_counts_drop
)
ch_versions = ch_versions.mix(ANALYSE_TRANSCRIPTS.out.versions)