From e9fbca1c6d2b7b5d2ed67666739a7c04d81d667e Mon Sep 17 00:00:00 2001 From: Lucpen Date: Tue, 30 Jul 2024 13:48:11 +0200 Subject: [PATCH 01/19] feat add make database --- bin/drop_config.py | 11 +- bin/drop_sample_annot.py | 122 +++++++++++++++++----- conf/base.config | 13 +++ docs/usage.md | 65 ++++++++---- modules/local/drop_config_runAE.nf | 15 ++- modules/local/drop_config_runAS.nf | 15 ++- modules/local/drop_sample_annot.nf | 15 ++- nextflow.config | 1 + nextflow_schema.json | 19 ++-- subworkflows/local/analyse_transcripts.nf | 22 +++- workflows/tomte.nf | 4 +- 11 files changed, 228 insertions(+), 74 deletions(-) diff --git a/bin/drop_config.py b/bin/drop_config.py index 09abd544..1bfa8faa 100755 --- a/bin/drop_config.py +++ b/bin/drop_config.py @@ -6,7 +6,7 @@ from typing import Dict, Any from copy import deepcopy -SCRIPT_VERSION = "v1.0" +SCRIPT_VERSION = "v2.0" CONFIG_YAML = { "projectTitle": "DROP: Detection of RNA Outliers Pipeline", "root": None, @@ -86,6 +86,7 @@ def update_config( gtf: Path, genome_assembly: str, drop_group_samples: str, + drop_other_group_samples: str, padjcutoff: float, zscorecutoff: float, drop_module: str, @@ -106,6 +107,7 @@ def update_config( config_copy["geneAnnotation"].pop("gtf", None) config_copy["exportCounts"]["geneAnnotations"] = [gtf_without_ext] config_copy["genomeAssembly"] = genome_assembly + config_copy["exportCounts"]["excludeGroups"] = [drop_other_group_samples] # Export counts if drop_module == "AE": @@ -165,6 +167,12 @@ def parse_args(argv=None): help="Specify drop group to analyse", required=True, ) + parser.add_argument( + "--drop_other_group_samples", + type=str, + help="Specify the drop group to exclude in exportCounts", + required=True, + ) parser.add_argument( "--padjcutoff", type=float, @@ -200,6 +208,7 @@ def main(): gtf=args.gtf, genome_assembly=args.genome_assembly, drop_group_samples=args.drop_group_samples, + drop_other_group_samples=args.drop_other_group_samples, padjcutoff=args.padjcutoff, zscorecutoff=args.zscorecutoff, drop_module=args.drop_module, diff --git a/bin/drop_sample_annot.py b/bin/drop_sample_annot.py index 31f0602f..d3c0df6f 100755 --- a/bin/drop_sample_annot.py +++ b/bin/drop_sample_annot.py @@ -5,7 +5,7 @@ from pandas import read_csv, DataFrame, concat import os -SCRIPT_VERSION = "v1.0" +SCRIPT_VERSION = "v1.1" SAMPLE_ANNOTATION_COLUMNS = [ "RNA_ID", "RNA_BAM_FILE", @@ -25,11 +25,18 @@ def write_sample_annotation_to_tsv( - bam: str, samples: str, strandedness: str, single_end: str, drop_group_sample: str, out_file: str + bam: str, + samples: str, + strandedness: str, + single_end: str, + drop_group_sample: str, + out_file: str, ): """Write the Sample Annotation tsv file.""" with open(out_file, "w") as tsv_file: - writer = csv.DictWriter(tsv_file, fieldnames=SAMPLE_ANNOTATION_COLUMNS, delimiter="\t") + writer = csv.DictWriter( + tsv_file, fieldnames=SAMPLE_ANNOTATION_COLUMNS, delimiter="\t" + ) writer.writeheader() for index, id in enumerate(samples): sa_dict: dict = {}.fromkeys(SAMPLE_ANNOTATION_COLUMNS, "NA") @@ -37,7 +44,7 @@ def write_sample_annotation_to_tsv( sa_dict["DROP_GROUP"] = drop_group_sample sa_dict["GENE_COUNTS_FILE"] = "NA" sa_dict["GENE_ANNOTATION"] = "NA" - sa_dict["STRAND"] = strandedness[index] + sa_dict["STRAND"] = is_stranded(strandedness[index]) sa_dict["PAIRED_END"] = is_paired_end(single_end[index]) sa_dict["RNA_BAM_FILE"] = bam[index] writer.writerow(sa_dict) @@ -50,24 +57,49 @@ def is_paired_end(single_end: str) -> bool: return False +def is_stranded(strandedness: str) -> str: + 
"""Logical funciton to determine if a sample is paired end""" + if strandedness.lower() == "reverse": + return "reverse" + elif strandedness.lower() == "forward": + return "yes" + else: + return "no" + + def write_final_annot_to_tsv(ref_count_file: str, ref_annot: str, out_file: str): """ Concatenates the Sample Annotation produced by SampleAnnotation with the one - provided for the reference samples, checking for duplicate sample IDs + provided for the reference samples, if one is provided, checking for duplicate sample IDs """ df_samples: DataFrame = read_csv("drop_annotation_given_samples.tsv", sep="\t") - df_reference: DataFrame = read_csv(ref_annot, sep="\t") - df_reference["GENE_COUNTS_FILE"] = ref_count_file - df_reference["SPLICE_COUNTS_DIR"] = df_reference["SPLICE_COUNTS_DIR"].str.rstrip("/").apply(os.path.basename) - df_reference["DROP_GROUP"] = df_reference["DROP_GROUP"].str.replace(" ", "") - df_samples["COUNT_OVERLAPS"] = df_reference["COUNT_OVERLAPS"].iloc[0] - df_samples["COUNT_MODE"] = df_reference["COUNT_MODE"].iloc[0] - df_samples["HPO_TERMS"] = df_reference["HPO_TERMS"].iloc[0] - for id in df_samples["RNA_ID"]: - df_reference = df_reference[df_reference["RNA_ID"].str.contains(id) == False] - df: DataFrame = concat([df_samples, df_reference]).reset_index(drop=True) - df.fillna("NA", inplace=True) - df.to_csv(out_file, index=False, sep="\t") + if ref_annot == "None" or ref_count_file == "None": + print( + "No reference samples were provided by the user see usage of --ref_count_file and --ref_annot if you want to provide reference samples" + ) + if df_samples.shape[0] < 50: + print( + "At least 30 samples are required for Aberrant Splicing and 50 for Aberrant expression" + ) + print(f"Only {df_samples.shape[0]} samples were provided by the user") + df_samples.to_csv(out_file, index=False, sep="\t") + else: + df_reference: DataFrame = read_csv(ref_annot, sep="\t") + df_reference["GENE_COUNTS_FILE"] = ref_count_file + df_reference["SPLICE_COUNTS_DIR"] = ( + df_reference["SPLICE_COUNTS_DIR"].str.rstrip("/").apply(os.path.basename) + ) + df_reference["DROP_GROUP"] = df_reference["DROP_GROUP"].str.replace(" ", "") + df_samples["COUNT_OVERLAPS"] = df_reference["COUNT_OVERLAPS"].iloc[0] + df_samples["COUNT_MODE"] = df_reference["COUNT_MODE"].iloc[0] + df_samples["HPO_TERMS"] = df_reference["HPO_TERMS"].iloc[0] + for id in df_samples["RNA_ID"]: + df_reference = df_reference[ + df_reference["RNA_ID"].str.contains(id) == False + ] + df: DataFrame = concat([df_samples, df_reference]).reset_index(drop=True) + df.fillna("NA", inplace=True) + df.to_csv(out_file, index=False, sep="\t") def parse_args(argv=None): @@ -76,15 +108,50 @@ def parse_args(argv=None): formatter_class=argparse.MetavarTypeHelpFormatter, description="""Generate DROP sample annotation for patients.""", ) - parser.add_argument("--bam", type=str, nargs="+", help="bam files for the analyzed samples", required=True) - parser.add_argument("--samples", type=str, nargs="+", help="corresponding sample name", required=True) - parser.add_argument("--strandedness", type=str, nargs="+", help="strandedness of RNA", required=True) - parser.add_argument("--single_end", type=str, nargs="+", help="is the sample paired end?", required=True) parser.add_argument( - "--ref_count_file", type=str, help="A tsv file of gene counts for reference samples.", required=True + "--bam", + type=str, + nargs="+", + help="bam files for the analyzed samples", + required=True, + ) + parser.add_argument( + "--samples", + type=str, + nargs="+", + 
help="corresponding sample name", + required=True, + ) + parser.add_argument( + "--strandedness", type=str, nargs="+", help="strandedness of RNA", required=True + ) + parser.add_argument( + "--single_end", + type=str, + nargs="+", + help="is the sample paired end?", + required=True, + ) + parser.add_argument( + "--ref_count_file", + type=str, + help="A tsv file of gene counts for reference samples.", + required=True, + ) + parser.add_argument( + "--ref_annot", + type=str, + default="None", + help="Path to reference annotation tsv", + required=False, + ) + parser.add_argument( + "--drop_group_sample", + type=str, + default="None", + help="Drop group of analyzed samples", + required=False, ) - parser.add_argument("--ref_annot", type=str, help="Path to reference annotation tsv", required=True) - parser.add_argument("--drop_group_sample", type=str, help="Drop group of analyzed samples", required=True) parser.add_argument("--output", type=str, help="Path to save to", required=True) parser.add_argument("--version", action="version", version=SCRIPT_VERSION) return parser.parse_args(argv) @@ -101,7 +168,12 @@ def main(): drop_group_sample=args.drop_group_sample, out_file="drop_annotation_given_samples.tsv", ) - write_final_annot_to_tsv(ref_count_file=args.ref_count_file, ref_annot=args.ref_annot, out_file=args.output) + + write_final_annot_to_tsv( + ref_count_file=args.ref_count_file, + ref_annot=args.ref_annot, + out_file=args.output, + ) if __name__ == "__main__": diff --git a/conf/base.config b/conf/base.config index 918fad7f..66a938bc 100644 --- a/conf/base.config +++ b/conf/base.config @@ -59,4 +59,17 @@ process { errorStrategy = 'retry' maxRetries = 2 } + if (params.skip_export_counts_drop) { + withLabel:process_drop { + cpus = { check_max( 36 * task.attempt, 'cpus' ) } + memory = { check_max( 144.GB , 'memory' ) } + time = { check_max( 48.h * task.attempt, 'time' ) } + } + } else { + withLabel:process_drop { + cpus = { check_max( 12 * task.attempt, 'cpus' ) } + memory = { check_max( 72.GB * task.attempt, 'memory' ) } + time = { check_max( 16.h * task.attempt, 'time' ) } + } + } } diff --git a/docs/usage.md b/docs/usage.md index 1f1c3b6f..e0f163ca 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -215,16 +215,17 @@ The mandatory and optional parameters for each category are tabulated below. DROP - aberrant expression -| Mandatory | Optional | -| ------------------------------------- | --------------------------------- | -| reference_drop_annot_file1 | skip_drop_ae2 | -| reference_drop_count_file | drop_group_samples_ae3 | -| fasta | drop_padjcutoff_ae4 | -| gtf | drop_zscorecutoff5 | -| | gene_panel_clinical_filter | -| | skip_downsample6 | -| | num_reads7 | -| | genome8 | +| Mandatory | Optional | +| ------------------------------------- | ----------------------------------- | +| reference_drop_annot_file1 | skip_drop_ae2 | +| reference_drop_count_file | drop_group_samples_ae3 | +| fasta | drop_padjcutoff_ae4 | +| gtf | drop_zscorecutoff5 | +| | gene_panel_clinical_filter | +| | skip_downsample6 | +| | num_reads7 | +| | genome8 | +| | skip_export_counts_drop9 | 1 To get more information on how to format it, see below
2 If it is not provided by the user, the default value is false
@@ -234,18 +235,20 @@ DROP - aberrant expression 6 If it is not provided by the user, the default value is false
7 If it is not provided by the user, the default value is 120000000
 8 If it is not provided by the user, the default value is GRCh38
+9 If it is not provided by the user, the default value is true
DROP - aberrant splicing -| Mandatory | Optional | -| ------------------------------------- | --------------------------------- | -| reference_drop_annot_file1 | skip_drop_as2 | -| reference_drop_splice_folder | drop_group_samples_as3 | -| | drop_padjcutoff_as4 | -| | gene_panel_clinical_filter | -| | skip_downsample5 | -| | num_reads6 | -| | genome7 | +| Mandatory | Optional | +| ------------------------------------- | ----------------------------------- | +| reference_drop_annot_file1 | skip_drop_as2 | +| reference_drop_splice_folder | drop_group_samples_as3 | +| | drop_padjcutoff_as4 | +| | gene_panel_clinical_filter | +| | skip_downsample5 | +| | num_reads6 | +| | genome7 | +| | skip_export_counts_drop8 | 1 To get more information on how to format it, see below
2 If it is not provided by the user, the default value is false
@@ -254,16 +257,32 @@ DROP - aberrant splicing 5 If it is not provided by the user, the default value is false
6 If it is not provided by the user, the default value is 120000000
 7 If it is not provided by the user, the default value is GRCh38
+8 If it is not provided by the user, the default value is true
##### Preparing input for DROP If you want to run [DROP](https://github.com/gagneurlab/drop) aberrant expression or aberrant splicing you have to provide reference counts, splice counts and a sample sheet. The sample sheet should contain the columns as those in the [test sample annotation](../test_data/drop_data/sampleAnnotation.tsv), you do not need to include the samples you are running through the pipeline in the sample sheet. -To obtain the gene counts and splice counts you will have to download the counts from one of the [available databases](https://github.com/gagneurlab/drop#datasets) or run drop locally with your own samples. If you choose the second option, you should start by runnig the module(s) you want to export counts for. Afterwards, you need to run the exportCounts module. Make sure that your config has only the modules you want to export and have already run as , that only existing groups are mentioned in the config, and that exportCounts excludGroups is null or contains a group of samples you want to exclude. Finally, run: +###### Preparing your DROP control database -```console -snakemake exportCounts --cores 1 -``` +You have several options on how to create such a database. You can either build it or download it from one of the [available databases](https://github.com/gagneurlab/drop#datasets). + +To build your own database you will need at least 50 for aberrant expression, if you only run aberrant splicing 30 samples will suffice but DROP authors recommend to have at least around 100 for both modules. You can use Tomte to build your own database, to do so we recommend to run with the following parameters: + +- `--skip_export_counts_drop false` this switch will ensure that a folder called export_counts is created +- `--skip_drop_as false` if you want to get a database for aberrant splicing +- `--skip_drop_ae false` if you want to get a database for aberrant expression +- `--skip_subsample_region false` if you have sequenced any material with overrrepresented regions (such as hemoglobin in whole blood) we recommend to remove it by setting this parameter to false and providing a bed with thet overrepresented region with `--subsample_bed` +- `--skip_downsample false` if you have very deeply sequenced samples, we recommend to downsample, the default is 60M read pairs +- `--skip_build_tracks true`, `--skip_stringtie true`, `--skip_vep true` as most users will be interested in getting the database rather than other downstream results + +Running DROP with many samples requires a lot of time and a lot of memory, that is why we recommend to subsample overrepresented regions and downsampled if you have deeply sequenced samples. If your run fails for either of this reasons, try to relaunched it from the work directory where DROP was run so that DROP continues from the point where it failed (if you restart the pipeline with `-resume` it will begin from the start and it will likely fail in the same way). + +To restart DROP, start by finding the work directory where it was run. You can do so by opening the execution trace file in the pipeline_info folder and looking at the hash of the processes with name `TOMTE:ANALYSE_TRANSCRIPTS:DROP_CONFIG_RUN_AE` and `TOMTE:ANALYSE_TRANSCRIPTS:DROP_CONFIG_RUN_AS`. The work directory used to run the pipeline followed by the hash should be enough information to find the folder where DROP was run. 
Tomte should have set everything up in that directory so go into it and restart the run by running from the container created by Tomte the script `.command.sh`. If you want to run it with slurm remember to add a header with number of cores, time... + +If you want to add samples to an existing database, follow the same steps described above, making sure that you also provide the database you want to add samples to by using `--reference_drop_annot_file` and `--reference_drop_count_file` and/or `--reference_drop_splice_folder`. In this case scenerio, make sure that you have used the same references for the database as for the new set of samples. + +If you prefer to run DROP locally outside from Tomte follow instructions given by the [authors of DROP](https://github.com/gagneurlab/drop) ## Running the pipeline diff --git a/modules/local/drop_config_runAE.nf b/modules/local/drop_config_runAE.nf index 16eb7c41..492dbffb 100644 --- a/modules/local/drop_config_runAE.nf +++ b/modules/local/drop_config_runAE.nf @@ -1,6 +1,6 @@ process DROP_CONFIG_RUN_AE { tag "DROP_CONFIG_RUN_AE" - label 'process_high' + label 'process_drop' // Exit if running this module with -profile conda / -profile mamba if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { @@ -18,14 +18,17 @@ process DROP_CONFIG_RUN_AE { path ref_splice_folder val(genome) val(drop_group_samples_ae) + val(drop_group_samples_as) val(drop_padjcutoff_ae) val(drop_zScoreCutoff) + val(skip_export_counts_drop) output: path('config.yaml') , emit: config_drop path('output') , emit: drop_ae_out path('OUTRIDER_results_all.Rds'), emit: drop_ae_rds path('gene_name_mapping*') , emit: drop_gene_name + path('geneCounts.tsv.gz') , emit: gene_counts_ae, optional: true path "versions.yml" , emit: versions when: @@ -35,6 +38,7 @@ process DROP_CONFIG_RUN_AE { def args = task.ext.args ?: '' def genome_assembly = "${genome}".contains("h37") ? "hg19" : "${genome}" def drop_group = "${drop_group_samples_ae}".replace(" ","") + def drop_other_group_samples = "${drop_group_samples_as}".replace(" ","") def zscorecutoff = drop_zScoreCutoff ? "--zscorecutoff ${drop_zScoreCutoff}" : '' """ @@ -49,12 +53,20 @@ process DROP_CONFIG_RUN_AE { --drop_module AE \\ --genome_assembly $genome_assembly \\ --drop_group_samples $drop_group \\ + --drop_other_group_samples $drop_other_group_samples \\ --padjcutoff ${drop_padjcutoff_ae} \\ $zscorecutoff \\ --output config.yaml snakemake aberrantExpression --cores ${task.cpus} --rerun-triggers mtime $args + if [[ !skip_export_counts_drop ]]; then + snakemake exportCounts --cores 1 + mkdir exported_counts + cp sample_annotation.tsv exported_counts/. + cp output/processed_results/exported_counts/*/geneCounts.tsv.gz exported_counts/. + fi + cp output/processed_results/aberrant_expression/*/outrider/outrider/OUTRIDER_results_all.Rds . cp output/processed_data/preprocess/*/gene_name_mapping_*.tsv . 
@@ -71,6 +83,7 @@ process DROP_CONFIG_RUN_AE { touch OUTRIDER_results_all.Rds touch gene_name_mapping_.tsv mkdir output + mkdir exported_counts cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/drop_config_runAS.nf b/modules/local/drop_config_runAS.nf index 9c8a650d..5107879a 100644 --- a/modules/local/drop_config_runAS.nf +++ b/modules/local/drop_config_runAS.nf @@ -1,6 +1,6 @@ process DROP_CONFIG_RUN_AS { tag "DROP_CONFIG_RUN_AS" - label 'process_high' + label 'process_drop' // Exit if running this module with -profile conda / -profile mamba if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { @@ -18,13 +18,16 @@ process DROP_CONFIG_RUN_AS { path ref_splice_folder val(genome) val(drop_group_samples_as) + val(drop_group_samples_ae) val(drop_padjcutoff_as) + val(skip_export_counts_drop) output: path('config.yaml') , emit: config_drop path('output') , emit: drop_as_out path('FRASER_results_fraser--*'), emit: drop_as_tsv path('gene_name_mapping*') , emit: drop_gene_name + path('exported_counts') , emit: gene_counts_as, optional: true path "versions.yml" , emit: versions when: @@ -34,6 +37,7 @@ process DROP_CONFIG_RUN_AS { def args = task.ext.args ?: '' def genome_assembly = "${genome}".contains("h37") ? "hg19" : "${genome}" def drop_group = "${drop_group_samples_as}".replace(" ","") + def drop_other_group_samples = "${drop_group_samples_ae}".replace(" ","") """ TMPDIR=\$PWD HOME=\$PWD @@ -47,11 +51,19 @@ process DROP_CONFIG_RUN_AS { --drop_module AS \\ --genome_assembly $genome_assembly \\ --drop_group_samples $drop_group \\ + --drop_other_group_samples $drop_other_group_samples \\ --padjcutoff ${drop_padjcutoff_as} \\ --output config.yaml snakemake aberrantSplicing --cores ${task.cpus} --rerun-triggers mtime $args + if [[ !skip_export_counts_drop ]]; then + snakemake exportCounts --cores 1 + mkdir exported_counts + cp sample_annotation.tsv exported_counts/. + cp output/processed_results/exported_counts/*/*.gz exported_counts/. + fi + cp output/html/AberrantSplicing/FRASER_results_fraser--*.tsv . cp output/processed_data/preprocess/*/gene_name_mapping_*.tsv . 
@@ -68,6 +80,7 @@ process DROP_CONFIG_RUN_AS { touch FRASER_results_fraser--.tsv touch gene_name_mapping_.tsv mkdir output + mkdir exported_counts cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/drop_sample_annot.nf b/modules/local/drop_sample_annot.nf index 248d8b19..e2ea3980 100644 --- a/modules/local/drop_sample_annot.nf +++ b/modules/local/drop_sample_annot.nf @@ -1,5 +1,5 @@ process DROP_SAMPLE_ANNOT { - tag "DROP_sample_annot" + tag "DROP_annot_file" label 'process_low' // Exit if running this module with -profile conda / -profile mamba @@ -10,8 +10,7 @@ process DROP_SAMPLE_ANNOT { container "docker.io/clinicalgenomics/drop:1.3.3" input: - path(bam) - val(samples) + tuple val(ids), val(single_ends), val(strandednesses), path(bam), path(bai) path(ref_gene_counts) path(ref_annot) val(drop_group_samples_ae) @@ -25,14 +24,14 @@ process DROP_SAMPLE_ANNOT { task.ext.when == null || task.ext.when script: - def ids = "${samples.id}".replace("[","").replace("]","").replace(",","") - def strandedness = "${samples.strandedness}".replace("[","").replace("]","").replace(",","") - def single_end = "${samples.single_end}".replace("[","").replace("]","").replace(",","") + def id = "${ids}".replace("[","").replace("]","").replace(",","") + def single_end = "${single_ends}".replace("[","").replace("]","").replace(",","") + def strandedness = "${strandednesses}".replace("[","").replace("]","").replace(",","") def drop_group = "${drop_group_samples_ae},${drop_group_samples_as}".replace(" ","").replace("[","").replace("]","") """ $baseDir/bin/drop_sample_annot.py \\ --bam ${bam} \\ - --samples $ids \\ + --samples $id \\ --strandedness $strandedness \\ --single_end $single_end \\ --ref_count_file ${ref_gene_counts} \\ @@ -55,4 +54,4 @@ process DROP_SAMPLE_ANNOT { drop_sample_annot: \$(\$baseDir/bin/drop_sample_annot --version ) END_VERSIONS """ -} +} \ No newline at end of file diff --git a/nextflow.config b/nextflow.config index 5c334c80..f4d33a9f 100644 --- a/nextflow.config +++ b/nextflow.config @@ -45,6 +45,7 @@ params { skip_stringtie = false skip_drop_ae = false skip_drop_as = false + skip_export_counts_drop = true drop_group_samples_ae = 'outrider' drop_group_samples_as = 'fraser' drop_padjcutoff_ae = 0.05 diff --git a/nextflow_schema.json b/nextflow_schema.json index bd08defe..0ad98314 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -44,7 +44,8 @@ "save_mapped_as_cram": { "type": "boolean", "description": "Do you want to save bam as cram", - "fa_icon": "far fa-file-archive" + "fa_icon": "far fa-file-archive", + "default": true } } }, @@ -59,7 +60,8 @@ "description": "Name of iGenomes reference, etiher hg19/GRCh37 or hg38/GRCh38, it is case sensitive.", "fa_icon": "fas fa-book", "enum": ["hg19", "GRCh37", "hg38", "GRCh38"], - "help_text": "If using a reference genome configured in the pipeline using iGenomes, use this parameter to give the ID for the reference. This is then used to build the full paths for all required reference genome files e.g. `--genome GRCh38`. \n\nSee the [nf-core website docs](https://nf-co.re/usage/reference_genomes) for more details." + "help_text": "If using a reference genome configured in the pipeline using iGenomes, use this parameter to give the ID for the reference. This is then used to build the full paths for all required reference genome files e.g. `--genome GRCh38`. 
\n\nSee the [nf-core website docs](https://nf-co.re/usage/reference_genomes) for more details.", + "default": "GRCh38" }, "fasta": { "type": "string", @@ -200,13 +202,11 @@ }, "skip_subsample_region": { "type": "boolean", - "default": false, "description": "Turn off subsampling of the region. The region is defined by the subsample_bed parameter and the fraction is given by seed_frac", "fa_icon": "fas fa-toggle-off" }, "skip_downsample": { "type": "boolean", - "default": false, "description": "Skip downsampling before expression/splicing analysis. The number of reads to be used is defined by num_reads.", "fa_icon": "fas fa-toggle-off" }, @@ -255,34 +255,35 @@ }, "skip_build_tracks": { "type": "boolean", - "default": false, "description": "Skip building splice junction tracks for IGV.", "fa_icon": "fas fa-toggle-off" }, "skip_stringtie": { "type": "boolean", - "default": false, "description": "Skip analysis with StringTie", "fa_icon": "fas fa-toggle-off" }, "skip_vep": { "type": "boolean", - "default": false, "description": "Skip Ensembl Variant Effect Predictor", "fa_icon": "fas fa-toggle-off" }, "skip_drop_ae": { "type": "boolean", - "default": false, "description": "Skip DROP Aberrant Expression module ", "fa_icon": "fas fa-toggle-off" }, "skip_drop_as": { "type": "boolean", - "default": false, "description": "Skip DROP Aberrant Splicing module", "fa_icon": "fas fa-toggle-off" }, + "skip_export_counts_drop": { + "type": "boolean", + "fa_icon": "fas fa-toggle-on", + "default": true, + "description": "Skip export counts for DROP. It will export information from those modules run. Read usage for further information." + }, "drop_group_samples_ae": { "type": "string", "default": "outrider", diff --git a/subworkflows/local/analyse_transcripts.nf b/subworkflows/local/analyse_transcripts.nf index 0bfdd250..fac39b7d 100644 --- a/subworkflows/local/analyse_transcripts.nf +++ b/subworkflows/local/analyse_transcripts.nf @@ -28,6 +28,7 @@ workflow ANALYSE_TRANSCRIPTS { ch_gene_panel_clinical_filter // channel: [optional] [ path(tsv) ] case_info // channel: [optional] [ val(case_id) ] skip_drop_ae // parameter: [mandatory] default: 'false' + skip_export_counts_drop // parameter: [mandatory] default: 'true' main: ch_versions = Channel.empty() @@ -36,11 +37,18 @@ workflow ANALYSE_TRANSCRIPTS { // Generates count files for samples and merges them with reference count file // Generates sample annotation - star_samples = gene_counts.map{ meta, cnt_file -> meta }.collect() ch_bam_files = ch_bam_ds_bai.collect{it[1]} + + ch_bam_ds_bai + .map { meta, bam, bai -> + [ meta.id, meta.single_end, meta.strandedness, bam, bai ] + } + .collect(flat:false) + .map { it.transpose() } + .set { ch_bam_files_annot } + DROP_SAMPLE_ANNOT( - ch_bam_files, - star_samples, + ch_bam_files_annot, ch_ref_drop_count_file, ch_ref_drop_annot_file, drop_group_samples_ae, @@ -59,8 +67,10 @@ workflow ANALYSE_TRANSCRIPTS { ch_ref_drop_splice_folder, genome, drop_group_samples_ae, + drop_group_samples_as, drop_padjcutoff_ae, - drop_zscorecutoff + drop_zscorecutoff, + skip_export_counts_drop ) // Generates config file and runs Aberrant splicing module @@ -73,7 +83,9 @@ workflow ANALYSE_TRANSCRIPTS { ch_ref_drop_splice_folder, genome, drop_group_samples_as, - drop_padjcutoff_as + drop_group_samples_ae, + drop_padjcutoff_as, + skip_export_counts_drop ) ch_out_drop_ae_rds = DROP_CONFIG_RUN_AE.out.drop_ae_rds ? 
DROP_CONFIG_RUN_AE.out.drop_ae_rds.collect() diff --git a/workflows/tomte.nf b/workflows/tomte.nf index 9d00ea20..b13b614f 100644 --- a/workflows/tomte.nf +++ b/workflows/tomte.nf @@ -21,6 +21,7 @@ include { CALL_VARIANTS } from '../subworkflows/local/call_variants' include { ALLELE_SPECIFIC_CALLING } from '../subworkflows/local/allele_specific_calling' include { ANNOTATE_SNV } from '../subworkflows/local/annotate_snv' include { IGV_TRACKS } from '../subworkflows/local/igv_tracks' +//include { BUILD_DROP_DATABASE } from '../subworkflows/local/drop_database' include { methodsDescriptionText } from '../subworkflows/local/utils_nfcore_tomte_pipeline' // @@ -155,7 +156,8 @@ workflow TOMTE { params.drop_zscorecutoff, ch_gene_panel_clinical_filter, ch_case_info, - params.skip_drop_ae + params.skip_drop_ae, + params.skip_export_counts_drop ) ch_versions = ch_versions.mix(ANALYSE_TRANSCRIPTS.out.versions) From c1d5a490009e7b5242b980dcb2b557073328e3aa Mon Sep 17 00:00:00 2001 From: Lucpen Date: Tue, 30 Jul 2024 13:55:25 +0200 Subject: [PATCH 02/19] fix linting --- modules/local/drop_sample_annot.nf | 2 +- subworkflows/local/analyse_transcripts.nf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/local/drop_sample_annot.nf b/modules/local/drop_sample_annot.nf index e2ea3980..216fd02e 100644 --- a/modules/local/drop_sample_annot.nf +++ b/modules/local/drop_sample_annot.nf @@ -54,4 +54,4 @@ process DROP_SAMPLE_ANNOT { drop_sample_annot: \$(\$baseDir/bin/drop_sample_annot --version ) END_VERSIONS """ -} \ No newline at end of file +} diff --git a/subworkflows/local/analyse_transcripts.nf b/subworkflows/local/analyse_transcripts.nf index fac39b7d..cb808b68 100644 --- a/subworkflows/local/analyse_transcripts.nf +++ b/subworkflows/local/analyse_transcripts.nf @@ -38,7 +38,7 @@ workflow ANALYSE_TRANSCRIPTS { // Generates sample annotation ch_bam_files = ch_bam_ds_bai.collect{it[1]} - + ch_bam_ds_bai .map { meta, bam, bai -> [ meta.id, meta.single_end, meta.strandedness, bam, bai ] From 3ef078ba95c0c3c997325c4952839fbc716865f2 Mon Sep 17 00:00:00 2001 From: Lucpen Date: Wed, 21 Aug 2024 16:05:08 +0200 Subject: [PATCH 03/19] Prevent module skipping if no annotation or counts provided --- bin/drop_sample_annot.py | 6 +++--- modules/local/drop_sample_annot.nf | 6 ++++-- subworkflows/local/analyse_transcripts.nf | 12 ++++++------ 3 files changed, 13 insertions(+), 11 deletions(-) diff --git a/bin/drop_sample_annot.py b/bin/drop_sample_annot.py index d3c0df6f..56033cdf 100755 --- a/bin/drop_sample_annot.py +++ b/bin/drop_sample_annot.py @@ -42,8 +42,6 @@ def write_sample_annotation_to_tsv( sa_dict: dict = {}.fromkeys(SAMPLE_ANNOTATION_COLUMNS, "NA") sa_dict["RNA_ID"] = id sa_dict["DROP_GROUP"] = drop_group_sample - sa_dict["GENE_COUNTS_FILE"] = "NA" - sa_dict["GENE_ANNOTATION"] = "NA" sa_dict["STRAND"] = is_stranded(strandedness[index]) sa_dict["PAIRED_END"] = is_paired_end(single_end[index]) sa_dict["RNA_BAM_FILE"] = bam[index] @@ -82,6 +80,7 @@ def write_final_annot_to_tsv(ref_count_file: str, ref_annot: str, out_file: str) "At least 30 samples are required for Aberrant Splicing and 50 for Aberrant expression" ) print(f"Only {df_samples.shape[0]} samples were provided by the user") + df_samples.fillna("NA", inplace=True) df_samples.to_csv(out_file, index=False, sep="\t") else: df_reference: DataFrame = read_csv(ref_annot, sep="\t") @@ -135,8 +134,9 @@ def parse_args(argv=None): parser.add_argument( "--ref_count_file", type=str, + default="None", help="A tsv file of gene counts for 
reference samples.", - required=True, + required=False, ) parser.add_argument( "--ref_annot", diff --git a/modules/local/drop_sample_annot.nf b/modules/local/drop_sample_annot.nf index 216fd02e..7514c862 100644 --- a/modules/local/drop_sample_annot.nf +++ b/modules/local/drop_sample_annot.nf @@ -28,14 +28,16 @@ process DROP_SAMPLE_ANNOT { def single_end = "${single_ends}".replace("[","").replace("]","").replace(",","") def strandedness = "${strandednesses}".replace("[","").replace("]","").replace(",","") def drop_group = "${drop_group_samples_ae},${drop_group_samples_as}".replace(" ","").replace("[","").replace("]","") + def reference_count_file = ref_gene_counts ? "--ref_count_file ${ref_gene_counts}" : '' + def reference_annotation = ref_annot ? "--ref_annot ${ref_annot}" : '' """ $baseDir/bin/drop_sample_annot.py \\ --bam ${bam} \\ --samples $id \\ --strandedness $strandedness \\ --single_end $single_end \\ - --ref_count_file ${ref_gene_counts} \\ - --ref_annot ${ref_annot} \\ + $reference_count_file \\ + $reference_annotation \\ --drop_group_sample $drop_group \\ --output sample_annotation.tsv diff --git a/subworkflows/local/analyse_transcripts.nf b/subworkflows/local/analyse_transcripts.nf index cb808b68..2a9f4951 100644 --- a/subworkflows/local/analyse_transcripts.nf +++ b/subworkflows/local/analyse_transcripts.nf @@ -49,8 +49,8 @@ workflow ANALYSE_TRANSCRIPTS { DROP_SAMPLE_ANNOT( ch_bam_files_annot, - ch_ref_drop_count_file, - ch_ref_drop_annot_file, + ch_ref_drop_count_file.ifEmpty([]), + ch_ref_drop_annot_file.ifEmpty([]), drop_group_samples_ae, drop_group_samples_as ) @@ -63,8 +63,8 @@ workflow ANALYSE_TRANSCRIPTS { ch_gtf, DROP_SAMPLE_ANNOT.out.drop_annot, ch_bam_bai_files, - ch_ref_drop_count_file, - ch_ref_drop_splice_folder, + ch_ref_drop_count_file.ifEmpty([]), + ch_ref_drop_splice_folder.ifEmpty([]), genome, drop_group_samples_ae, drop_group_samples_as, @@ -79,8 +79,8 @@ workflow ANALYSE_TRANSCRIPTS { ch_gtf, DROP_SAMPLE_ANNOT.out.drop_annot, ch_bam_bai_files, - ch_ref_drop_count_file, - ch_ref_drop_splice_folder, + ch_ref_drop_count_file.ifEmpty([]), + ch_ref_drop_splice_folder.ifEmpty([]), genome, drop_group_samples_as, drop_group_samples_ae, From 994147f1b3568f8b6ab2177bd232cf6b7159865b Mon Sep 17 00:00:00 2001 From: Lucpen Date: Fri, 30 Aug 2024 09:21:41 +0200 Subject: [PATCH 04/19] update DROP and fix drop_config --- CHANGELOG.md | 18 ++++++++++++++++++ bin/drop_config.py | 4 ++-- modules/local/drop_config_runAE.nf | 2 +- modules/local/drop_config_runAS.nf | 2 +- modules/local/drop_filter_results.nf | 2 +- modules/local/drop_sample_annot.nf | 2 +- 6 files changed, 24 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 343e2e69..4bd0ffd8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,24 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
+## X.X.X - [XXXX-XX-XX] + +### `Added` + +- Functionality to create DROP databases and to add samples to existing ones [#147](https://github.com/genomic-medicine-sweden/tomte/pull/147) + +### `Fixed` + +### `Parameters` + +### `Changed` + +- Updated modules using drop drop_config_runAE, drop_config_runAS, drop_sample_annot, and drop_filter_results [#147](https://github.com/genomic-medicine-sweden/tomte/pull/147) + +| Tool | Old version | New version | +| ---- | ----------- | ----------- | +| DROP | 1.3.3 | 1.4.0 | + ## 2.2.1 - Scrooge [2024-08-28] ### `Fixed` diff --git a/bin/drop_config.py b/bin/drop_config.py index 1bfa8faa..856bf06a 100755 --- a/bin/drop_config.py +++ b/bin/drop_config.py @@ -6,7 +6,7 @@ from typing import Dict, Any from copy import deepcopy -SCRIPT_VERSION = "v2.0" +SCRIPT_VERSION = "v2.1" CONFIG_YAML = { "projectTitle": "DROP: Detection of RNA Outliers Pipeline", "root": None, @@ -100,7 +100,7 @@ def update_config( config_copy: Dict[str, Any] = deepcopy(CONFIG_YAML) config_copy["genome"] = genome_name - config_copy["root"] = "output" + config_copy["root"] = str(Path.cwd() / "output") config_copy["htmlOutputPath"] = "output/html" config_copy["sampleAnnotation"] = "sample_annotation.tsv" config_copy["geneAnnotation"][gtf_without_ext] = str(gtf) diff --git a/modules/local/drop_config_runAE.nf b/modules/local/drop_config_runAE.nf index 492dbffb..ff8622ee 100644 --- a/modules/local/drop_config_runAE.nf +++ b/modules/local/drop_config_runAE.nf @@ -7,7 +7,7 @@ process DROP_CONFIG_RUN_AE { exit 1, "Local DROP module does not support Conda. Please use Docker / Singularity / Podman instead." } - container "docker.io/clinicalgenomics/drop:1.3.3" + container "docker.io/clinicalgenomics/drop:1.4.0" input: tuple val(meta), path(fasta), path(fai) diff --git a/modules/local/drop_config_runAS.nf b/modules/local/drop_config_runAS.nf index 5107879a..2c56d2c1 100644 --- a/modules/local/drop_config_runAS.nf +++ b/modules/local/drop_config_runAS.nf @@ -7,7 +7,7 @@ process DROP_CONFIG_RUN_AS { exit 1, "Local DROP module does not support Conda. Please use Docker / Singularity / Podman instead." } - container "docker.io/clinicalgenomics/drop:1.3.3" + container "docker.io/clinicalgenomics/drop:1.4.0" input: tuple val(meta), path(fasta), path(fai) diff --git a/modules/local/drop_filter_results.nf b/modules/local/drop_filter_results.nf index 3f4e5cd0..e893757e 100644 --- a/modules/local/drop_filter_results.nf +++ b/modules/local/drop_filter_results.nf @@ -7,7 +7,7 @@ process DROP_FILTER_RESULTS { exit 1, "Local DROP module does not support Conda. Please use Docker / Singularity / Podman instead." } - container "docker.io/clinicalgenomics/drop:1.3.3" + container "docker.io/clinicalgenomics/drop:1.4.0" input: val(case_info) diff --git a/modules/local/drop_sample_annot.nf b/modules/local/drop_sample_annot.nf index 7514c862..371fe8e8 100644 --- a/modules/local/drop_sample_annot.nf +++ b/modules/local/drop_sample_annot.nf @@ -7,7 +7,7 @@ process DROP_SAMPLE_ANNOT { exit 1, "Local DROP module does not support Conda. Please use Docker / Singularity / Podman instead." 
} - container "docker.io/clinicalgenomics/drop:1.3.3" + container "docker.io/clinicalgenomics/drop:1.4.0" input: tuple val(ids), val(single_ends), val(strandednesses), path(bam), path(bai) From 9556308966f6baae8735cfc86c6e41a8421d6b55 Mon Sep 17 00:00:00 2001 From: Lucpen Date: Wed, 4 Sep 2024 10:08:53 +0200 Subject: [PATCH 05/19] fix CHANGELOG --- CHANGELOG.md | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 77fe5c67..c4aa182c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,10 +16,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### `Changed` - Updated modules using drop drop_config_runAE, drop_config_runAS, drop_sample_annot, and drop_filter_results [#147](https://github.com/genomic-medicine-sweden/tomte/pull/147) +- Updated modules ensemblvep/filtervep, ensemblvep/vep [#159](https://github.com/genomic-medicine-sweden/tomte/pull/159) +- Updated gencode version from 37 to 46 [#159](https://github.com/genomic-medicine-sweden/tomte/pull/159) -| Tool | Old version | New version | -| ---- | ----------- | ----------- | -| DROP | 1.3.3 | 1.4.0 | +| Tool | Old version | New version | +| -------------------- | ----------- | ----------- | +| DROP | 1.3.3 | 1.4.0 | +| ensemblvep/filtervep | 110 | 112 | +| ensemblvep/vep | 110 | 112 | ## 2.2.1 - Scrooge [2024-08-28] @@ -30,14 +34,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### `Changed` - Downgraded multiqc version [#156](https://github.com/genomic-medicine-sweden/tomte/pull/156) -- Updated modules ensemblvep/filtervep, ensemblvep/vep [#159](https://github.com/genomic-medicine-sweden/tomte/pull/159) -- Updated gencode version from 37 to 46 [#159](https://github.com/genomic-medicine-sweden/tomte/pull/159) -| Tool | Old version | New version | -| -------------------- | ----------- | ----------- | -| multiqc | 1.24.1 | 1.21 | -| ensemblvep/filtervep | 110 | 112 | -| ensemblvep/vep | 110 | 112 | +| Tool | Old version | New version | +| ------- | ----------- | ----------- | +| multiqc | 1.24.1 | 1.21 | ## 2.2.0 - TioDeNadal [2024-08-27] From afffbb1e07dd2623ae7fc03ecbe48b8908f7e0d8 Mon Sep 17 00:00:00 2001 From: Lucpen Date: Wed, 4 Sep 2024 10:10:55 +0200 Subject: [PATCH 06/19] fix CHANGELOG --- CHANGELOG.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c4aa182c..b83f4619 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,15 +15,16 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### `Changed` -- Updated modules using drop drop_config_runAE, drop_config_runAS, drop_sample_annot, and drop_filter_results [#147](https://github.com/genomic-medicine-sweden/tomte/pull/147) - Updated modules ensemblvep/filtervep, ensemblvep/vep [#159](https://github.com/genomic-medicine-sweden/tomte/pull/159) - Updated gencode version from 37 to 46 [#159](https://github.com/genomic-medicine-sweden/tomte/pull/159) +- Updated modules using drop drop_config_runAE, drop_config_runAS, drop_sample_annot, and drop_filter_results [#147](https://github.com/genomic-medicine-sweden/tomte/pull/147) + | Tool | Old version | New version | | -------------------- | ----------- | ----------- | -| DROP | 1.3.3 | 1.4.0 | | ensemblvep/filtervep | 110 | 112 | | ensemblvep/vep | 110 | 112 | +| DROP | 1.3.3 | 1.4.0 | ## 2.2.1 - Scrooge [2024-08-28] From 52dbc7a5f071ed31f4f43fa3e4817c58d51562f8 Mon Sep 17 00:00:00 2001 From: Lucpen Date: Wed, 4 Sep 
2024 10:13:31 +0200 Subject: [PATCH 07/19] run prettier --- CHANGELOG.md | 1 - 1 file changed, 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b83f4619..feec0b23 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,7 +19,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Updated gencode version from 37 to 46 [#159](https://github.com/genomic-medicine-sweden/tomte/pull/159) - Updated modules using drop drop_config_runAE, drop_config_runAS, drop_sample_annot, and drop_filter_results [#147](https://github.com/genomic-medicine-sweden/tomte/pull/147) - | Tool | Old version | New version | | -------------------- | ----------- | ----------- | | ensemblvep/filtervep | 110 | 112 | From 19a243538804446385334faf386cc3f705f7cc7d Mon Sep 17 00:00:00 2001 From: Lucpen Date: Wed, 4 Sep 2024 10:19:41 +0200 Subject: [PATCH 08/19] update clinvar version --- test_data/vep_to_download.csv | 4 ++-- test_data/vep_to_download_37.csv | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/test_data/vep_to_download.csv b/test_data/vep_to_download.csv index 31df0439..d34ac75f 100644 --- a/test_data/vep_to_download.csv +++ b/test_data/vep_to_download.csv @@ -1,4 +1,4 @@ name,path_for_wget -clinvar_20240806.vcf.gz,ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/weekly/clinvar.vcf.gz -clinvar_20240806.vcf.gz.tbi,ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/weekly/clinvar.vcf.gz.tbi +clinvar_20240902.vcf.gz,ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/weekly/clinvar_20240902.vcf.gz +clinvar_20240902.vcf.gz.tbi,ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/weekly/clinvar_20240902.vcf.gz.tbi homo_sapiens_merged,https://ftp.ensembl.org/pub/release-112/variation/indexed_vep_cache/homo_sapiens_merged_vep_112_GRCh38.tar.gz diff --git a/test_data/vep_to_download_37.csv b/test_data/vep_to_download_37.csv index dc2d4e24..793d1d1a 100644 --- a/test_data/vep_to_download_37.csv +++ b/test_data/vep_to_download_37.csv @@ -1,4 +1,4 @@ name,path_for_wget -clinvar_20240806.vcf.gz,ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh37/weekly/clinvar.vcf.gz -clinvar_20240806.vcf.gz.tbi,ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh37/weekly/clinvar.vcf.gz.tbi +clinvar_20240902.vcf.gz,ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh37/weekly/clinvar_20240902.vcf.gz +clinvar_20240902.vcf.gz.tbi,ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh37/weekly/clinvar_20240902.vcf.gz.tbi homo_sapiens_merged,https://ftp.ensembl.org/pub/release-112/variation/indexed_vep_cache/homo_sapiens_merged_vep_112_GRCh37.tar.gz From f3d2d0c88bc71ad0787aefb1f17534b65e327f6a Mon Sep 17 00:00:00 2001 From: Lucpen Date: Tue, 17 Sep 2024 07:52:03 +0200 Subject: [PATCH 09/19] add count mode and overlap to sampan --- bin/drop_sample_annot.py | 36 ++++++++++++++++++++++++++++++++---- 1 file changed, 32 insertions(+), 4 deletions(-) diff --git a/bin/drop_sample_annot.py b/bin/drop_sample_annot.py index 56033cdf..e05b7d79 100755 --- a/bin/drop_sample_annot.py +++ b/bin/drop_sample_annot.py @@ -2,7 +2,7 @@ import argparse import csv -from pandas import read_csv, DataFrame, concat +from pandas import read_csv, DataFrame, concat, isna import os SCRIPT_VERSION = "v1.1" @@ -56,7 +56,7 @@ def is_paired_end(single_end: str) -> bool: def is_stranded(strandedness: str) -> str: - """Logical funciton to determine if a sample is paired end""" + """Logical funciton to determine sample strandness""" if strandedness.lower() == "reverse": return "reverse" elif strandedness.lower() == "forward": @@ -65,6 +65,30 @@ 
def is_stranded(strandedness: str) -> str: return "no" +def count_mode(sample_count_mode: str) -> str: + """Logical funciton to determine if count mode is given or default "IntersectionStrict" should be used""" + print("Hello") + print(sample_count_mode) + if isna(sample_count_mode) or sample_count_mode == "" or sample_count_mode == "NA": + return "IntersectionStrict" + else: + return sample_count_mode + + +def count_overlaps(sample_count_overlap: str) -> str: + """Logical funciton to determine if count overlap is given or default "TRUE" should be used""" + print("Hello") + print(sample_count_overlap) + if ( + isna(sample_count_overlap) + or sample_count_overlap == "" + or sample_count_overlap == "NA" + ): + return True + else: + return sample_count_overlap + + def write_final_annot_to_tsv(ref_count_file: str, ref_annot: str, out_file: str): """ Concatenates the Sample Annotation produced by SampleAnnotation with the one @@ -81,6 +105,8 @@ def write_final_annot_to_tsv(ref_count_file: str, ref_annot: str, out_file: str) ) print(f"Only {df_samples.shape[0]} samples were provided by the user") df_samples.fillna("NA", inplace=True) + df_samples["COUNT_MODE"] = "IntersectionStrict" + df_samples["COUNT_OVERLAPS"] = True df_samples.to_csv(out_file, index=False, sep="\t") else: df_reference: DataFrame = read_csv(ref_annot, sep="\t") @@ -89,8 +115,10 @@ def write_final_annot_to_tsv(ref_count_file: str, ref_annot: str, out_file: str) df_reference["SPLICE_COUNTS_DIR"].str.rstrip("/").apply(os.path.basename) ) df_reference["DROP_GROUP"] = df_reference["DROP_GROUP"].str.replace(" ", "") - df_samples["COUNT_OVERLAPS"] = df_reference["COUNT_OVERLAPS"].iloc[0] - df_samples["COUNT_MODE"] = df_reference["COUNT_MODE"].iloc[0] + df_samples["COUNT_OVERLAPS"] = count_overlaps( + df_reference["COUNT_OVERLAPS"].iloc[0] + ) + df_samples["COUNT_MODE"] = count_mode(df_reference["COUNT_MODE"].iloc[0]) df_samples["HPO_TERMS"] = df_reference["HPO_TERMS"].iloc[0] for id in df_samples["RNA_ID"]: df_reference = df_reference[ From 28e537f34dbda6def1de3f7332cd4087be11e1ae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luc=C3=ADa=20Pe=C3=B1a-P=C3=A9rez?= Date: Mon, 23 Sep 2024 10:46:41 +0200 Subject: [PATCH 10/19] Update conf/base.config --- conf/base.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/base.config b/conf/base.config index b1add1f3..47ea66d9 100644 --- a/conf/base.config +++ b/conf/base.config @@ -62,7 +62,7 @@ process { errorStrategy = 'retry' maxRetries = 2 } - if (params.skip_export_counts_drop) { + if (!params.skip_export_counts_drop) { withLabel:process_drop { cpus = { check_max( 36 * task.attempt, 'cpus' ) } memory = { check_max( 144.GB , 'memory' ) } From ba154d2d3b14c3cb24db0f6e5c70bce35ee5a85c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luc=C3=ADa=20Pe=C3=B1a-P=C3=A9rez?= Date: Wed, 9 Oct 2024 08:09:16 +0200 Subject: [PATCH 11/19] Update modules/local/drop_config_runAE.nf --- modules/local/drop_config_runAE.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/local/drop_config_runAE.nf b/modules/local/drop_config_runAE.nf index ff8622ee..053ca4d1 100644 --- a/modules/local/drop_config_runAE.nf +++ b/modules/local/drop_config_runAE.nf @@ -62,7 +62,7 @@ process DROP_CONFIG_RUN_AE { if [[ !skip_export_counts_drop ]]; then snakemake exportCounts --cores 1 - mkdir exported_counts + mkdir -p exported_counts cp sample_annotation.tsv exported_counts/. cp output/processed_results/exported_counts/*/geneCounts.tsv.gz exported_counts/. 
fi From aeaf98b4d9fe4e3f90e7027c62d7fc6a861f7220 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luc=C3=ADa=20Pe=C3=B1a-P=C3=A9rez?= Date: Wed, 9 Oct 2024 08:14:06 +0200 Subject: [PATCH 12/19] Update modules/local/drop_config_runAS.nf --- modules/local/drop_config_runAS.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/local/drop_config_runAS.nf b/modules/local/drop_config_runAS.nf index 2c56d2c1..1f045710 100644 --- a/modules/local/drop_config_runAS.nf +++ b/modules/local/drop_config_runAS.nf @@ -59,7 +59,7 @@ process DROP_CONFIG_RUN_AS { if [[ !skip_export_counts_drop ]]; then snakemake exportCounts --cores 1 - mkdir exported_counts + mkdir -p exported_counts cp sample_annotation.tsv exported_counts/. cp output/processed_results/exported_counts/*/*.gz exported_counts/. fi From 3d1beaaa7f5501fe63257e039e4259d9f54b45ad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luc=C3=ADa=20Pe=C3=B1a-P=C3=A9rez?= Date: Wed, 9 Oct 2024 08:18:21 +0200 Subject: [PATCH 13/19] Update modules/local/drop_config_runAE.nf --- modules/local/drop_config_runAE.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/local/drop_config_runAE.nf b/modules/local/drop_config_runAE.nf index 053ca4d1..e10c2ecd 100644 --- a/modules/local/drop_config_runAE.nf +++ b/modules/local/drop_config_runAE.nf @@ -28,7 +28,7 @@ process DROP_CONFIG_RUN_AE { path('output') , emit: drop_ae_out path('OUTRIDER_results_all.Rds'), emit: drop_ae_rds path('gene_name_mapping*') , emit: drop_gene_name - path('geneCounts.tsv.gz') , emit: gene_counts_ae, optional: true + path('exported_counts') , emit: gene_counts_ae, optional: true path "versions.yml" , emit: versions when: From 694d5e65b32bf77ad285c0f8c9d6ba192ed33921 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luc=C3=ADa=20Pe=C3=B1a-P=C3=A9rez?= Date: Wed, 9 Oct 2024 08:20:50 +0200 Subject: [PATCH 14/19] Update modules/local/drop_config_runAE.nf --- modules/local/drop_config_runAE.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/local/drop_config_runAE.nf b/modules/local/drop_config_runAE.nf index e10c2ecd..40a270c1 100644 --- a/modules/local/drop_config_runAE.nf +++ b/modules/local/drop_config_runAE.nf @@ -28,7 +28,7 @@ process DROP_CONFIG_RUN_AE { path('output') , emit: drop_ae_out path('OUTRIDER_results_all.Rds'), emit: drop_ae_rds path('gene_name_mapping*') , emit: drop_gene_name - path('exported_counts') , emit: gene_counts_ae, optional: true + path('exported_counts') , emit: gene_counts_ae, optional: true path "versions.yml" , emit: versions when: From 9f6f8628e88d742d76ffb4ec95e32dfed905557a Mon Sep 17 00:00:00 2001 From: Lucpen Date: Thu, 10 Oct 2024 09:07:09 +0200 Subject: [PATCH 15/19] fix .nf-core.yml --- .nf-core.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.nf-core.yml b/.nf-core.yml index b37758c0..c1acdeb5 100644 --- a/.nf-core.yml +++ b/.nf-core.yml @@ -1,5 +1,5 @@ lint: - files_exist: false + files_exist: - assets/nf-core-tomte_logo_light.png - docs/images/nf-core-tomte_logo_light.png - docs/images/nf-core-tomte_logo_dark.png @@ -8,7 +8,7 @@ lint: - docs/images/tomte_pipeline_metromap.png - conf/modules.config - .github/ISSUE_TEMPLATE/config.yml - files_unchanged: false + files_unchanged: - assets/sendmail_template.txt - .github/CONTRIBUTING.md - .github/ISSUE_TEMPLATE/bug_report.yml From 1de53e5e6ecac74989f96a6da3926f21adb15a04 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luc=C3=ADa=20Pe=C3=B1a-P=C3=A9rez?= Date: Thu, 10 Oct 2024 09:57:42 +0200 Subject: [PATCH 16/19] Apply suggestions from 
code review Co-authored-by: Felix Lenner <52530259+fellen31@users.noreply.github.com> --- bin/drop_sample_annot.py | 4 ---- docs/usage.md | 4 ++-- nextflow.config | 2 +- workflows/tomte.nf | 1 - 4 files changed, 3 insertions(+), 8 deletions(-) diff --git a/bin/drop_sample_annot.py b/bin/drop_sample_annot.py index e05b7d79..6dde5fbb 100755 --- a/bin/drop_sample_annot.py +++ b/bin/drop_sample_annot.py @@ -67,8 +67,6 @@ def is_stranded(strandedness: str) -> str: def count_mode(sample_count_mode: str) -> str: """Logical funciton to determine if count mode is given or default "IntersectionStrict" should be used""" - print("Hello") - print(sample_count_mode) if isna(sample_count_mode) or sample_count_mode == "" or sample_count_mode == "NA": return "IntersectionStrict" else: @@ -77,8 +75,6 @@ def count_mode(sample_count_mode: str) -> str: def count_overlaps(sample_count_overlap: str) -> str: """Logical funciton to determine if count overlap is given or default "TRUE" should be used""" - print("Hello") - print(sample_count_overlap) if ( isna(sample_count_overlap) or sample_count_overlap == "" diff --git a/docs/usage.md b/docs/usage.md index bfdf0923..ef53022e 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -282,11 +282,11 @@ To build your own database you will need at least 50 for aberrant expression, if - `--skip_export_counts_drop false` this switch will ensure that a folder called export_counts is created - `--skip_drop_as false` if you want to get a database for aberrant splicing - `--skip_drop_ae false` if you want to get a database for aberrant expression -- `--skip_subsample_region false` if you have sequenced any material with overrrepresented regions (such as hemoglobin in whole blood) we recommend to remove it by setting this parameter to false and providing a bed with thet overrepresented region with `--subsample_bed` +- `--skip_subsample_region false` if you have sequenced any material with overrepresented regions (such as hemoglobin in whole blood) we recommend to remove it by setting this parameter to false and providing a bed with the overrepresented region with `--subsample_bed` - `--skip_downsample false` if you have very deeply sequenced samples, we recommend to downsample, the default is 60M read pairs - `--skip_build_tracks true`, `--skip_stringtie true`, `--skip_vep true` as most users will be interested in getting the database rather than other downstream results -Running DROP with many samples requires a lot of time and a lot of memory, that is why we recommend to subsample overrepresented regions and downsampled if you have deeply sequenced samples. If your run fails for either of this reasons, try to relaunched it from the work directory where DROP was run so that DROP continues from the point where it failed (if you restart the pipeline with `-resume` it will begin from the start and it will likely fail in the same way). +Running DROP with many samples requires a lot of time and a lot of memory, that is why we recommend to subsample overrepresented regions and downsample if you have deeply sequenced samples. If your run fails for either of this reasons, try to relaunch it from the work directory where DROP was run so that DROP continues from the point where it failed (if you restart the pipeline with `-resume` it will begin from the start and it will likely fail in the same way). To restart DROP, start by finding the work directory where it was run. 
You can do so by opening the execution trace file in the pipeline_info folder and looking at the hash of the processes with name `TOMTE:ANALYSE_TRANSCRIPTS:DROP_CONFIG_RUN_AE` and `TOMTE:ANALYSE_TRANSCRIPTS:DROP_CONFIG_RUN_AS`. The work directory used to run the pipeline followed by the hash should be enough information to find the folder where DROP was run. Tomte should have set everything up in that directory so go into it and restart the run by running from the container created by Tomte the script `.command.sh`. If you want to run it with slurm remember to add a header with number of cores, time... diff --git a/nextflow.config b/nextflow.config index f5497b9f..99597edc 100644 --- a/nextflow.config +++ b/nextflow.config @@ -50,7 +50,7 @@ params { skip_stringtie = false skip_drop_ae = false skip_drop_as = false - skip_export_counts_drop = true + skip_export_counts_drop = true drop_group_samples_ae = 'outrider' drop_group_samples_as = 'fraser' drop_padjcutoff_ae = 0.05 diff --git a/workflows/tomte.nf b/workflows/tomte.nf index 19061def..ed2172b7 100644 --- a/workflows/tomte.nf +++ b/workflows/tomte.nf @@ -22,7 +22,6 @@ include { CALL_VARIANTS } from '../subworkflows/local/call_variants' include { ALLELE_SPECIFIC_CALLING } from '../subworkflows/local/allele_specific_calling' include { ANNOTATE_SNV } from '../subworkflows/local/annotate_snv' include { IGV_TRACKS } from '../subworkflows/local/igv_tracks' -//include { BUILD_DROP_DATABASE } from '../subworkflows/local/drop_database' include { methodsDescriptionText } from '../subworkflows/local/utils_nfcore_tomte_pipeline' // From 420b8b381127d91588c3cdd9131690741a70a18a Mon Sep 17 00:00:00 2001 From: Lucpen Date: Thu, 10 Oct 2024 10:17:36 +0200 Subject: [PATCH 17/19] apply reviewers suggestion --- modules/local/drop_config_runAE.nf | 4 +++- modules/local/drop_config_runAS.nf | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/modules/local/drop_config_runAE.nf b/modules/local/drop_config_runAE.nf index 40a270c1..da45fc3d 100644 --- a/modules/local/drop_config_runAE.nf +++ b/modules/local/drop_config_runAE.nf @@ -83,7 +83,9 @@ process DROP_CONFIG_RUN_AE { touch OUTRIDER_results_all.Rds touch gene_name_mapping_.tsv mkdir output - mkdir exported_counts + if [[ !skip_export_counts_drop ]]; then + mkdir exported_counts + fi cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/drop_config_runAS.nf b/modules/local/drop_config_runAS.nf index 1f045710..c5ea0342 100644 --- a/modules/local/drop_config_runAS.nf +++ b/modules/local/drop_config_runAS.nf @@ -80,7 +80,9 @@ process DROP_CONFIG_RUN_AS { touch FRASER_results_fraser--.tsv touch gene_name_mapping_.tsv mkdir output - mkdir exported_counts + if [[ !skip_export_counts_drop ]]; then + mkdir exported_counts + fi cat <<-END_VERSIONS > versions.yml "${task.process}": From 64c02063fbb9c25f9ea54a21992d41d2e349bf20 Mon Sep 17 00:00:00 2001 From: Lucpen Date: Thu, 10 Oct 2024 11:35:12 +0200 Subject: [PATCH 18/19] apply reviewers suggestion --- modules/local/drop_config_runAE.nf | 4 ++-- modules/local/drop_config_runAS.nf | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/modules/local/drop_config_runAE.nf b/modules/local/drop_config_runAE.nf index da45fc3d..e36c200f 100644 --- a/modules/local/drop_config_runAE.nf +++ b/modules/local/drop_config_runAE.nf @@ -60,7 +60,7 @@ process DROP_CONFIG_RUN_AE { snakemake aberrantExpression --cores ${task.cpus} --rerun-triggers mtime $args - if [[ !skip_export_counts_drop ]]; then + if [[ 
$skip_export_counts_drop == false ]]; then snakemake exportCounts --cores 1 mkdir -p exported_counts cp sample_annotation.tsv exported_counts/. @@ -83,7 +83,7 @@ process DROP_CONFIG_RUN_AE { touch OUTRIDER_results_all.Rds touch gene_name_mapping_.tsv mkdir output - if [[ !skip_export_counts_drop ]]; then + if [[ $skip_export_counts_drop == false ]]; then mkdir exported_counts fi diff --git a/modules/local/drop_config_runAS.nf b/modules/local/drop_config_runAS.nf index c5ea0342..a90a6e97 100644 --- a/modules/local/drop_config_runAS.nf +++ b/modules/local/drop_config_runAS.nf @@ -57,7 +57,7 @@ process DROP_CONFIG_RUN_AS { snakemake aberrantSplicing --cores ${task.cpus} --rerun-triggers mtime $args - if [[ !skip_export_counts_drop ]]; then + if [[ $skip_export_counts_drop == false ]]; then snakemake exportCounts --cores 1 mkdir -p exported_counts cp sample_annotation.tsv exported_counts/. @@ -80,7 +80,7 @@ process DROP_CONFIG_RUN_AS { touch FRASER_results_fraser--.tsv touch gene_name_mapping_.tsv mkdir output - if [[ !skip_export_counts_drop ]]; then + if [[ $skip_export_counts_drop == false ]]; then mkdir exported_counts fi From 893fa4af3f1f780dc0e1eb12a9e4959c970462d7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luc=C3=ADa=20Pe=C3=B1a-P=C3=A9rez?= Date: Fri, 11 Oct 2024 11:33:35 +0200 Subject: [PATCH 19/19] Apply suggestions from code review Co-authored-by: Annick Renevey <47788523+rannick@users.noreply.github.com> --- bin/drop_sample_annot.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/drop_sample_annot.py b/bin/drop_sample_annot.py index 6dde5fbb..38a77f46 100755 --- a/bin/drop_sample_annot.py +++ b/bin/drop_sample_annot.py @@ -66,7 +66,7 @@ def is_stranded(strandedness: str) -> str: def count_mode(sample_count_mode: str) -> str: - """Logical funciton to determine if count mode is given or default "IntersectionStrict" should be used""" + """Logical function to determine if count mode is given or default "IntersectionStrict" should be used""" if isna(sample_count_mode) or sample_count_mode == "" or sample_count_mode == "NA": return "IntersectionStrict" else: