diff --git a/library/tasks/FastQC.wdl b/library/tasks/FastQC.wdl
new file mode 100644
index 000000000..4be0fe717
--- /dev/null
+++ b/library/tasks/FastQC.wdl
@@ -0,0 +1,50 @@
+task FastQC {
+  Array[File] fastq_files
+  File? limits_file
+  Int startRead = 250000
+  Int nRead = 250000
+  String docker = "quay.io/biocontainers/fastqc:0.11.8--1"
+  Int machine_mem_mb = 3850
+  Int disk = 100
+  Int preemptible = 3
+
+  String dollar = "$"
+  parameter_meta {
+    fastq_files: "input fastq files"
+    limits_file: "(optional) limits file to use with fastqc"
+    startRead: "(optional) start fastqc at the nth read of the file"
+    nRead: "(optional) use (at most) n reads for fastqc"
+    docker: "(optional) the docker image containing the runtime environment for this task"
+    disk: "(optional) the amount of disk space (GiB) to provision for this task"
+    preemptible: "(optional) if non-zero, request a pre-emptible instance and allow for this number of preemptions before running the task on a non preemptible machine"
+    machine_mem_mb: "(optional) the amount of memory (MiB) to provision for this task"
+
+  }
+
+  command <<<
+    set -e
+
+    mkdir outputs
+    declare -a fastqs=()
+    for fastq in ${sep=' ' fastq_files}
+    do
+      outname=`basename ${dollar}fastq .fastq.gz`_skip${startRead}_read${nRead}.fastq
+      zcat ${dollar}fastq | head -n ${4*(startRead + nRead)} | tail -n ${4*nRead} > ${dollar}outname
+      fastqs+=(${dollar}outname)
+    done
+
+    fastqc ${dollar}{fastqs[@]} -o outputs ${"--limits " + limits_file}
+  >>>
+
+  runtime {
+    docker: docker
+    memory: "${machine_mem_mb} MiB"
+    disks: "local-disk ${disk} HDD"
+    preemptible: preemptible
+  }
+
+  output {
+    Array[File] fastqc_htmls = glob("outputs/*.html")
+    Array[File] fastqc_zips = glob("outputs/*.zip")
+  }
+}
\ No newline at end of file
diff --git a/pipelines/optimus/Optimus.wdl b/pipelines/optimus/Optimus.wdl
index d40f73fb8..c79aafcd5 100644
--- a/pipelines/optimus/Optimus.wdl
+++ b/pipelines/optimus/Optimus.wdl
@@ -11,6 +11,7 @@ import "RunEmptyDrops.wdl" as RunEmptyDrops
 import "ZarrUtils.wdl" as ZarrUtils
 import "Picard.wdl" as Picard
 import "UmiCorrection.wdl" as UmiCorrection
+import "FastQC.wdl" as FastQC
 
 workflow Optimus {
   meta {
@@ -25,6 +26,9 @@ workflow Optimus {
   Array[File]? i1_fastq
   String sample_id
 
+  # fastqc input
+  File? fastqc_limits
+
   # organism reference parameters
   File tar_star_reference
   File annotations_gtf
@@ -51,6 +55,7 @@ workflow Optimus {
    r2_fastq: "reverse read, contains cDNA fragment generated from captured mRNA"
    i1_fastq: "(optional) index read, for demultiplexing of multiple samples on one flow cell."
    sample_id: "name of sample matching this file, inserted into read group header"
+   fastqc_limits: "(optional) limits file for fastqc"
    tar_star_reference: "star genome reference"
    annotations_gtf: "gtf containing annotations for gene tagging (must match star reference)"
    ref_genome_fasta: "genome fasta file (must match star reference)"
@@ -77,6 +82,13 @@ workflow Optimus {
          r2_unmapped_bam = FastqToUBam.bam_output,
          whitelist = whitelist
      }
+
+      call FastQC.FastQC as FastQC {
+        input:
+          fastq_files = [r1_fastq[index], r2_fastq[index], non_optional_i1_fastq[index]],
+          limits_file = fastqc_limits,
+          disk = ceil((size(r1_fastq[index], "GiB") + size(r2_fastq[index], "GiB") + size(non_optional_i1_fastq[index], "GiB")) * 1.2 + 10)
+      }
     }
 
     # if the index is not passed, proceed without it.
@@ -87,9 +99,18 @@ workflow Optimus {
          r2_unmapped_bam = FastqToUBam.bam_output,
          whitelist = whitelist
      }
+
+      call FastQC.FastQC as FastQCNoIndex {
+        input:
+          fastq_files = [r1_fastq[index], r2_fastq[index]],
+          limits_file = fastqc_limits,
+          disk = ceil((size(r1_fastq[index], "GiB") + size(r2_fastq[index], "GiB")) * 1.2 + 10)
+      }
     }
 
     File barcoded_bam = select_first([AttachBarcodes.bam_output, AttachBarcodesNoIndex.bam_output])
+    Array[File] fastqc_output_htmls = select_first([FastQC.fastqc_htmls,FastQCNoIndex.fastqc_htmls])
+    Array[File] fastqc_output_zips = select_first([FastQC.fastqc_zips,FastQCNoIndex.fastqc_zips])
   }
 
   call Merge.MergeSortBamFiles as MergeUnsorted {
@@ -222,5 +243,9 @@ workflow Optimus {
 
     # zarr
     Array[File]? zarr_output_files = OptimusZarrConversion.zarr_output_files
+
+    # fastqc
+    Array[File] fastqc_htmls = flatten(fastqc_output_htmls)
+    Array[File] fastqc_zips = flatten(fastqc_output_zips)
   }
 }
diff --git a/pipelines/smartseq2_single_sample/SmartSeq2SingleSample.wdl b/pipelines/smartseq2_single_sample/SmartSeq2SingleSample.wdl
index 8e92aa4b4..ec4159f63 100644
--- a/pipelines/smartseq2_single_sample/SmartSeq2SingleSample.wdl
+++ b/pipelines/smartseq2_single_sample/SmartSeq2SingleSample.wdl
@@ -3,6 +3,7 @@ import "Picard.wdl" as Picard
 import "RSEM.wdl" as RSEM
 import "GroupMetricsOutputs.wdl" as GroupQCs
 import "ZarrUtils.wdl" as ZarrUtils
+import "FastQC.wdl" as FastQC
 
 workflow SmartSeq2SingleCell {
   meta {
@@ -28,6 +29,8 @@ workflow SmartSeq2SingleCell {
   String output_name
   File fastq1
   File fastq2
+  # fastqc
+  File? fastqc_limits
   Int max_retries = 0
 
   # whether to convert the outputs to Zarr format, by default it's set to true
@@ -48,12 +51,20 @@ workflow SmartSeq2SingleCell {
    output_name: "Output name, can include path"
    fastq1: "R1 in paired end reads"
    fastq2: "R2 in paired end reads"
+   fastqc_limits: "(optional) limits file for fastqc"
    max_retries: "(optional) retry this number of times if task fails -- use with caution, see skylab README for details"
    output_zarr: "whether to run the taks that converts the outputs to Zarr format, by default it's true"
   }
 
   String quality_control_output_basename = output_name + "_qc"
 
+  call FastQC.FastQC {
+    input:
+      fastq_files = [fastq1, fastq2],
+      limits_file = fastqc_limits,
+      disk = ceil((size(fastq1, "GiB") + size(fastq2, "GiB")) * 1.2 + 10)
+  }
+
   call HISAT2.HISAT2PairedEnd {
     input:
       hisat2_ref = hisat2_ref_index,
@@ -149,5 +160,9 @@ workflow SmartSeq2SingleCell {
 
     # zarr
     Array[File]? zarr_output_files = SmartSeq2ZarrConversion.zarr_output_files
+
+    #fastqc
+    Array[File] fastqc_htmls = FastQC.fastqc_htmls
+    Array[File] fastqc_zips = FastQC.fastqc_zips
   }
 }
diff --git a/test/optimus/pr/ValidateOptimus.wdl b/test/optimus/pr/ValidateOptimus.wdl
index 34ed4f321..c8e0e7c6b 100644
--- a/test/optimus/pr/ValidateOptimus.wdl
+++ b/test/optimus/pr/ValidateOptimus.wdl
@@ -3,13 +3,16 @@ task ValidateOptimus {
     File matrix
     File gene_metrics
     File cell_metrics
-
+    Array[File] fastqc_htmls
+    Int n_fastqc_zips
     Int required_disk = ceil((size(bam, "G") + size(matrix, "G")) * 1.1)
     String expected_bam_hash
     String expected_matrix_hash
     String expected_gene_metric_hash
     String expected_cell_metric_hash
+    Int expected_n_fastqc_zips
+    Array[String] expected_fastqc_html_strings
 
 
     command <<<
 
@@ -51,6 +54,25 @@ task ValidateOptimus {
       fail=true
     fi
 
+    #search for expected strings in fastqc html
+    for string in "${sep='" "' expected_fastqc_html_strings}"; do
+      fail_html=true
+      for htmlfile in ${sep=' ' fastqc_htmls}; do
+        if grep "$string" $htmlfile; then
+          fail_html=false
+        fi
+      done
+      if [ $fail_html == "true" ]; then
+        >&2 echo "expected string ($string) not found in fastqc html files"
+        fail=true
+      fi
+    done
+
+    if [ ${expected_n_fastqc_zips} != ${n_fastqc_zips} ]; then
+      >&2 echo "number of fastqc zip (${n_fastqc_zips}) did not match expected number (${expected_n_fastqc_zips})"
+      fail=true
+    fi
+
     if [ $fail == "true" ]; then exit 1; fi
 
   >>>
diff --git a/test/optimus/pr/dependencies.json b/test/optimus/pr/dependencies.json
index e6add7ae1..b99094612 100644
--- a/test/optimus/pr/dependencies.json
+++ b/test/optimus/pr/dependencies.json
@@ -14,5 +14,6 @@
   "RunEmptyDrops.wdl": "library/tasks/RunEmptyDrops.wdl",
   "ZarrUtils.wdl": "library/tasks/ZarrUtils.wdl",
   "Picard.wdl": "library/tasks/Picard.wdl",
-  "UmiCorrection.wdl": "library/tasks/UmiCorrection.wdl"
+  "UmiCorrection.wdl": "library/tasks/UmiCorrection.wdl",
+  "FastQC.wdl": "library/tasks/FastQC.wdl"
 }
diff --git a/test/optimus/pr/test_inputs.json b/test/optimus/pr/test_inputs.json
index 0e33ab9c6..7b4b85d17 100644
--- a/test/optimus/pr/test_inputs.json
+++ b/test/optimus/pr/test_inputs.json
@@ -3,6 +3,10 @@
   "TestOptimusPR.expected_matrix_hash": "aec7a79dc7b85a5d621509a3f4fa2192",
   "TestOptimusPR.expected_cell_metric_hash": "45cc8be253445201b02d102d2d096a0c",
   "TestOptimusPR.expected_gene_metric_hash": "d636671dfcff6ec8068987d0f5780334",
+  "TestOptimusPR.expected_fastqc_html_strings": ["pbmc8k_S1_L007_I1_001_skip250000_read250000.fastq FastQC Report","Sequence length8",
+                                                 "pbmc8k_S1_L007_R1_001_skip250000_read250000.fastq FastQC Report","Sequence length26",
+                                                 "pbmc8k_S1_L007_R2_001_skip250000_read250000.fastq FastQC Report","Sequence length98"],
+  "TestOptimusPR.expected_n_fastqc_zips": 6,
   "TestOptimusPR.r1_fastq": [
     "gs://hca-dcp-mint-test-data/10x/demo/fastqs/pbmc8k_S1_L007_R1_001.fastq.gz",
     "gs://hca-dcp-mint-test-data/10x/demo/fastqs/pbmc8k_S1_L007_R1_001.fastq.gz"
diff --git a/test/optimus/pr/test_optimus_PR.wdl b/test/optimus/pr/test_optimus_PR.wdl
index 4d097694e..794072acc 100644
--- a/test/optimus/pr/test_optimus_PR.wdl
+++ b/test/optimus/pr/test_optimus_PR.wdl
@@ -10,6 +10,8 @@ workflow TestOptimusPR {
   String expected_matrix_hash
   String expected_gene_metric_hash
   String expected_cell_metric_hash
+  Array[String] expected_fastqc_html_strings # a set of strings to search for in the fastqc html outputs
+  Int expected_n_fastqc_zips
 
   # Optimus inputs
   Array[File] r1_fastq
@@ -40,10 +42,14 @@ workflow TestOptimusPR {
       matrix = target.matrix,
      gene_metrics = target.gene_metrics,
      cell_metrics = target.cell_metrics,
+      fastqc_htmls = target.fastqc_htmls,
+      n_fastqc_zips = length(target.fastqc_zips),
      expected_bam_hash = expected_bam_hash,
      expected_matrix_hash = expected_matrix_hash,
      expected_cell_metric_hash = expected_cell_metric_hash,
-      expected_gene_metric_hash = expected_gene_metric_hash
+      expected_gene_metric_hash = expected_gene_metric_hash,
+      expected_fastqc_html_strings = expected_fastqc_html_strings,
+      expected_n_fastqc_zips = expected_n_fastqc_zips
   }
 
 }
diff --git a/test/optimus/scientific/dependencies.json b/test/optimus/scientific/dependencies.json
index 15a4b38d4..834f80ec4 100644
--- a/test/optimus/scientific/dependencies.json
+++ b/test/optimus/scientific/dependencies.json
@@ -12,5 +12,6 @@
   "SequenceDataWithMoleculeTagMetrics.wdl": "library/tasks/SequenceDataWithMoleculeTagMetrics.wdl",
   "TagSortBam.wdl": "library/tasks/TagSortBam.wdl",
   "Picard.wdl": "library/tasks/Picard.wdl",
-  "UmiCorrection.wdl": "library/tasks/UmiCorrection.wdl"
+  "UmiCorrection.wdl": "library/tasks/UmiCorrection.wdl",
+  "FastQC.wdl": "library/tasks/FastQC.wdl"
 }
diff --git a/test/smartseq2_single_sample/pr/ValidateSmartSeq2SingleCell.wdl b/test/smartseq2_single_sample/pr/ValidateSmartSeq2SingleCell.wdl
index 4f0244148..7268176e1 100644
--- a/test/smartseq2_single_sample/pr/ValidateSmartSeq2SingleCell.wdl
+++ b/test/smartseq2_single_sample/pr/ValidateSmartSeq2SingleCell.wdl
@@ -5,6 +5,12 @@ task ValidateSmartSeq2SingleCell {
   File target_metrics
   String expected_metrics_hash
 
+  Array[File] fastqc_htmls
+  Array[String] expected_fastqc_html_strings
+
+  Int expected_n_fastqc_zips
+  Int n_fastqc_zips
+
   command <<<
 
     # catch intermittent failures
@@ -28,6 +34,25 @@ task ValidateSmartSeq2SingleCell {
       fail=true
     fi
 
+    #search for expected strings in fastqc html
+    for string in "${sep='" "' expected_fastqc_html_strings}"; do
+      fail_html=true
+      for htmlfile in ${sep=' ' fastqc_htmls}; do
+        if grep "$string" $htmlfile; then
+          fail_html=false
+        fi
+      done
+      if [ $fail_html == "true" ]; then
+        >&2 echo "expected string ($string) not found in fastqc html files"
+        fail=true
+      fi
+    done
+
+    if [ ${expected_n_fastqc_zips} != ${n_fastqc_zips} ]; then
+      >&2 echo "number of fastqc zip (${n_fastqc_zips}) did not match expected number (${expected_n_fastqc_zips})"
+      fail=true
+    fi
+
     if [ $fail == "true" ]; then exit 1; fi
 
   >>>
diff --git a/test/smartseq2_single_sample/pr/dependencies.json b/test/smartseq2_single_sample/pr/dependencies.json
index 2c10e0518..c01f27f4c 100644
--- a/test/smartseq2_single_sample/pr/dependencies.json
+++ b/test/smartseq2_single_sample/pr/dependencies.json
@@ -5,5 +5,6 @@
   "Picard.wdl": "library/tasks/Picard.wdl",
   "RSEM.wdl": "library/tasks/RSEM.wdl",
   "GroupMetricsOutputs.wdl": "library/tasks/GroupMetricsOutputs.wdl",
-  "ZarrUtils.wdl": "library/tasks/ZarrUtils.wdl"
+  "ZarrUtils.wdl": "library/tasks/ZarrUtils.wdl",
+  "FastQC.wdl": "library/tasks/FastQC.wdl"
 }
diff --git a/test/smartseq2_single_sample/pr/test_inputs.json b/test/smartseq2_single_sample/pr/test_inputs.json
index b28535cba..176e5bfd6 100644
--- a/test/smartseq2_single_sample/pr/test_inputs.json
+++ b/test/smartseq2_single_sample/pr/test_inputs.json
@@ -14,5 +14,8 @@
   "TestSmartSeq2SingleCellPR.sample_name":"SRR1294925",
   "TestSmartSeq2SingleCellPR.output_name":"SRR1294925",
   "TestSmartSeq2SingleCellPR.expected_counts_hash": "135a3fbb959583db17713dc8b9d7fe33",
-  "TestSmartSeq2SingleCellPR.expected_metrics_hash": "99bd9903ac8dc77eb1c047ffa8eb42ed"
+  "TestSmartSeq2SingleCellPR.expected_metrics_hash": "99bd9903ac8dc77eb1c047ffa8eb42ed",
+  "TestSmartSeq2SingleCellPR.expected_fastqc_html_strings": ["SRR1294925_1_skip250000_read250000.fastq FastQC Report","Sequence length25",
+                                                             "SRR1294925_2_skip250000_read250000.fastq FastQC Report"],
+  "TestSmartSeq2SingleCellPR.expected_n_fastqc_zips": 2
 }
diff --git a/test/smartseq2_single_sample/pr/test_smartseq2_single_sample_PR.wdl b/test/smartseq2_single_sample/pr/test_smartseq2_single_sample_PR.wdl
index 75b7826f8..5f7dfd09d 100644
--- a/test/smartseq2_single_sample/pr/test_smartseq2_single_sample_PR.wdl
+++ b/test/smartseq2_single_sample/pr/test_smartseq2_single_sample_PR.wdl
@@ -23,6 +23,8 @@ workflow TestSmartSeq2SingleCellPR {
   String output_name
   File fastq1
   File fastq2
+  Array[String] expected_fastqc_html_strings
+  Int expected_n_fastqc_zips
 
   call target_wdl.SmartSeq2SingleCell as target_workflow {
     input:
@@ -47,7 +49,11 @@
      counts = target_workflow.rsem_gene_results,
      expected_counts_hash = expected_counts_hash,
      target_metrics = target_workflow.insert_size_metrics,
-      expected_metrics_hash = expected_metrics_hash
+      expected_metrics_hash = expected_metrics_hash,
+      fastqc_htmls = target_workflow.fastqc_htmls,
+      expected_fastqc_html_strings = expected_fastqc_html_strings,
+      n_fastqc_zips = length(target_workflow.fastqc_zips),
+      expected_n_fastqc_zips = expected_n_fastqc_zips
   }
 
 }