Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add FastQC to Optimus and SmartSeq2 #202

Open
wants to merge 15 commits into
base: master
Choose a base branch
from
50 changes: 50 additions & 0 deletions library/tasks/FastQC.wdl
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
task FastQC {
Array[File] fastq_files
File? limits_file
Int startRead = 250000
Int nRead = 250000
String docker = "quay.io/biocontainers/fastqc:0.11.8--1"
Int machine_mem_mb = 3850
Int disk = 100
Int preemptible = 3

String dollar = "$"
parameter_meta {
fastq_files: "input fastq files"
limits_file: "(optional) limits file to use with fastqc"
startRead: "(optional) start fastqc at the nth read of the file"
nRead: "(optional) use (at most) n reads for fastqc"
docker: "(optional) the docker image containing the runtime environment for this task"
disk: "(optional) the amount of disk space (GiB) to provision for this task"
preemptible: "(optional) if non-zero, request a pre-emptible instance and allow for this number of preemptions before running the task on a non preemptible machine"
machine_mem_mb: "(optional) the amount of memory (MiB) to provision for this task"

}

command <<<
set -e

mkdir outputs
declare -a fastqs=()
for fastq in ${sep=' ' fastq_files}
do
outname=`basename ${dollar}fastq .fastq.gz`_skip${startRead}_read${nRead}.fastq
zcat ${dollar}fastq | head -n ${4*(startRead + nRead)} | tail -n ${4*nRead} > ${dollar}outname
fastqs+=(${dollar}outname)
done

fastqc ${dollar}{fastqs[@]} -o outputs ${"--limits " + limits_file}
>>>

runtime {
docker: docker
memory: "${machine_mem_mb} MiB"
disks: "local-disk ${disk} HDD"
preemptible: preemptible
}

output {
Array[File] fastqc_htmls = glob("outputs/*.html")
Array[File] fastqc_zips = glob("outputs/*.zip")
}
}
25 changes: 25 additions & 0 deletions pipelines/optimus/Optimus.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import "RunEmptyDrops.wdl" as RunEmptyDrops
import "ZarrUtils.wdl" as ZarrUtils
import "Picard.wdl" as Picard
import "UmiCorrection.wdl" as UmiCorrection
import "FastQC.wdl" as FastQC

workflow Optimus {
meta {
Expand All @@ -25,6 +26,9 @@ workflow Optimus {
Array[File]? i1_fastq
String sample_id

# fastqc input
File? fastqc_limits

# organism reference parameters
File tar_star_reference
File annotations_gtf
Expand All @@ -51,6 +55,7 @@ workflow Optimus {
r2_fastq: "reverse read, contains cDNA fragment generated from captured mRNA"
i1_fastq: "(optional) index read, for demultiplexing of multiple samples on one flow cell."
sample_id: "name of sample matching this file, inserted into read group header"
fastqc_limits: "(optional) limits file for fastqc"
tar_star_reference: "star genome reference"
annotations_gtf: "gtf containing annotations for gene tagging (must match star reference)"
ref_genome_fasta: "genome fasta file (must match star reference)"
Expand All @@ -77,6 +82,13 @@ workflow Optimus {
r2_unmapped_bam = FastqToUBam.bam_output,
whitelist = whitelist
}

call FastQC.FastQC as FastQC {
input:
fastq_files = [r1_fastq[index], r2_fastq[index], non_optional_i1_fastq[index]],
limits_file = fastqc_limits,
disk = ceil((size(r1_fastq[index], "GiB") + size(r2_fastq[index], "GiB") + size(non_optional_i1_fastq[index], "GiB")) * 1.2 + 10)
}
}

# if the index is not passed, proceed without it.
Expand All @@ -87,9 +99,18 @@ workflow Optimus {
r2_unmapped_bam = FastqToUBam.bam_output,
whitelist = whitelist
}

call FastQC.FastQC as FastQCNoIndex {
input:
fastq_files = [r1_fastq[index], r2_fastq[index]],
limits_file = fastqc_limits,
disk = ceil((size(r1_fastq[index], "GiB") + size(r2_fastq[index], "GiB")) * 1.2 + 10)
}
}

File barcoded_bam = select_first([AttachBarcodes.bam_output, AttachBarcodesNoIndex.bam_output])
Array[File] fastqc_output_htmls = select_first([FastQC.fastqc_htmls,FastQCNoIndex.fastqc_htmls])
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If you are expecting multiple zips or htmls then I think select_all() would be better here.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since only one of FastQC and FastQCNoIndex will ever be run, using select_all() would just result in an extra unnecessary level of array.

Array[File] fastqc_output_zips = select_first([FastQC.fastqc_zips,FastQCNoIndex.fastqc_zips])
}

call Merge.MergeSortBamFiles as MergeUnsorted {
Expand Down Expand Up @@ -222,5 +243,9 @@ workflow Optimus {

# zarr
Array[File]? zarr_output_files = OptimusZarrConversion.zarr_output_files

# fastqc
Array[File] fastqc_htmls = flatten(fastqc_output_htmls)
Array[File] fastqc_zips = flatten(fastqc_output_zips)
}
}
15 changes: 15 additions & 0 deletions pipelines/smartseq2_single_sample/SmartSeq2SingleSample.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ import "Picard.wdl" as Picard
import "RSEM.wdl" as RSEM
import "GroupMetricsOutputs.wdl" as GroupQCs
import "ZarrUtils.wdl" as ZarrUtils
import "FastQC.wdl" as FastQC

workflow SmartSeq2SingleCell {
meta {
Expand All @@ -28,6 +29,8 @@ workflow SmartSeq2SingleCell {
String output_name
File fastq1
File fastq2
# fastqc
File? fastqc_limits
Int max_retries = 0

# whether to convert the outputs to Zarr format, by default it's set to true
Expand All @@ -48,12 +51,20 @@ workflow SmartSeq2SingleCell {
output_name: "Output name, can include path"
fastq1: "R1 in paired end reads"
fastq2: "R2 in paired end reads"
fastqc_limits: "(optional) limits file for fastqc"
max_retries: "(optional) retry this number of times if task fails -- use with caution, see skylab README for details"
output_zarr: "whether to run the taks that converts the outputs to Zarr format, by default it's true"
}

String quality_control_output_basename = output_name + "_qc"

call FastQC.FastQC {
input:
fastq_files = [fastq1, fastq2],
limits_file = fastqc_limits,
disk = ceil((size(fastq1, "GiB") + size(fastq2, "GiB")) * 1.2 + 10)
}

call HISAT2.HISAT2PairedEnd {
input:
hisat2_ref = hisat2_ref_index,
Expand Down Expand Up @@ -149,5 +160,9 @@ workflow SmartSeq2SingleCell {

# zarr
Array[File]? zarr_output_files = SmartSeq2ZarrConversion.zarr_output_files

#fastqc
Array[File] fastqc_htmls = FastQC.fastqc_htmls
Array[File] fastqc_zips = FastQC.fastqc_zips
}
}
24 changes: 23 additions & 1 deletion test/optimus/pr/ValidateOptimus.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,16 @@ task ValidateOptimus {
File matrix
File gene_metrics
File cell_metrics

Array[File] fastqc_htmls
Int n_fastqc_zips
Int required_disk = ceil((size(bam, "G") + size(matrix, "G")) * 1.1)

String expected_bam_hash
String expected_matrix_hash
String expected_gene_metric_hash
String expected_cell_metric_hash
Int expected_n_fastqc_zips
Array[String] expected_fastqc_html_strings

command <<<

Expand Down Expand Up @@ -51,6 +54,25 @@ task ValidateOptimus {
fail=true
fi

#search for expected strings in fastqc html
for string in "${sep='" "' expected_fastqc_html_strings}"; do
fail_html=true
for htmlfile in ${sep=' ' fastqc_htmls}; do
if grep "$string" $htmlfile; then
fail_html=false
fi
done
if [ $fail_html == "true" ]; then
>&2 echo "expected string ($string) not found in fastqc html files"
fail=true
fi
done

if [ ${expected_n_fastqc_zips} != ${n_fastqc_zips} ]; then
>&2 echo "number of fastqc zip (${n_fastqc_zips}) did not match expected number (${expected_n_fastqc_zips})"
fail=true
fi

if [ $fail == "true" ]; then exit 1; fi

>>>
Expand Down
3 changes: 2 additions & 1 deletion test/optimus/pr/dependencies.json
Original file line number Diff line number Diff line change
Expand Up @@ -14,5 +14,6 @@
"RunEmptyDrops.wdl": "library/tasks/RunEmptyDrops.wdl",
"ZarrUtils.wdl": "library/tasks/ZarrUtils.wdl",
"Picard.wdl": "library/tasks/Picard.wdl",
"UmiCorrection.wdl": "library/tasks/UmiCorrection.wdl"
"UmiCorrection.wdl": "library/tasks/UmiCorrection.wdl",
"FastQC.wdl": "library/tasks/FastQC.wdl"
}
4 changes: 4 additions & 0 deletions test/optimus/pr/test_inputs.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,10 @@
"TestOptimusPR.expected_matrix_hash": "aec7a79dc7b85a5d621509a3f4fa2192",
"TestOptimusPR.expected_cell_metric_hash": "45cc8be253445201b02d102d2d096a0c",
"TestOptimusPR.expected_gene_metric_hash": "d636671dfcff6ec8068987d0f5780334",
"TestOptimusPR.expected_fastqc_html_strings": ["pbmc8k_S1_L007_I1_001_skip250000_read250000.fastq FastQC Report","Sequence length</td><td>8",
"pbmc8k_S1_L007_R1_001_skip250000_read250000.fastq FastQC Report","Sequence length</td><td>26",
"pbmc8k_S1_L007_R2_001_skip250000_read250000.fastq FastQC Report","Sequence length</td><td>98"],
"TestOptimusPR.expected_n_fastqc_zips": 6,
"TestOptimusPR.r1_fastq": [
"gs://hca-dcp-mint-test-data/10x/demo/fastqs/pbmc8k_S1_L007_R1_001.fastq.gz",
"gs://hca-dcp-mint-test-data/10x/demo/fastqs/pbmc8k_S1_L007_R1_001.fastq.gz"
Expand Down
8 changes: 7 additions & 1 deletion test/optimus/pr/test_optimus_PR.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ workflow TestOptimusPR {
String expected_matrix_hash
String expected_gene_metric_hash
String expected_cell_metric_hash
Array[String] expected_fastqc_html_strings # a set of strings to search for in the fastqc html outputs
Int expected_n_fastqc_zips

# Optimus inputs
Array[File] r1_fastq
Expand Down Expand Up @@ -40,10 +42,14 @@ workflow TestOptimusPR {
matrix = target.matrix,
gene_metrics = target.gene_metrics,
cell_metrics = target.cell_metrics,
fastqc_htmls = target.fastqc_htmls,
n_fastqc_zips = length(target.fastqc_zips),
expected_bam_hash = expected_bam_hash,
expected_matrix_hash = expected_matrix_hash,
expected_cell_metric_hash = expected_cell_metric_hash,
expected_gene_metric_hash = expected_gene_metric_hash
expected_gene_metric_hash = expected_gene_metric_hash,
expected_fastqc_html_strings = expected_fastqc_html_strings,
expected_n_fastqc_zips = expected_n_fastqc_zips
}

}
3 changes: 2 additions & 1 deletion test/optimus/scientific/dependencies.json
Original file line number Diff line number Diff line change
Expand Up @@ -12,5 +12,6 @@
"SequenceDataWithMoleculeTagMetrics.wdl": "library/tasks/SequenceDataWithMoleculeTagMetrics.wdl",
"TagSortBam.wdl": "library/tasks/TagSortBam.wdl",
"Picard.wdl": "library/tasks/Picard.wdl",
"UmiCorrection.wdl": "library/tasks/UmiCorrection.wdl"
"UmiCorrection.wdl": "library/tasks/UmiCorrection.wdl",
"FastQC.wdl": "library/tasks/FastQC.wdl"
}
25 changes: 25 additions & 0 deletions test/smartseq2_single_sample/pr/ValidateSmartSeq2SingleCell.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,12 @@ task ValidateSmartSeq2SingleCell {
File target_metrics
String expected_metrics_hash

Array[File] fastqc_htmls
Array[String] expected_fastqc_html_strings

Int expected_n_fastqc_zips
Int n_fastqc_zips

command <<<

# catch intermittent failures
Expand All @@ -28,6 +34,25 @@ task ValidateSmartSeq2SingleCell {
fail=true
fi

#search for expected strings in fastqc html
for string in "${sep='" "' expected_fastqc_html_strings}"; do
fail_html=true
for htmlfile in ${sep=' ' fastqc_htmls}; do
if grep "$string" $htmlfile; then
fail_html=false
fi
done
if [ $fail_html == "true" ]; then
>&2 echo "expected string ($string) not found in fastqc html files"
fail=true
fi
done

if [ ${expected_n_fastqc_zips} != ${n_fastqc_zips} ]; then
>&2 echo "number of fastqc zip (${n_fastqc_zips}) did not match expected number (${expected_n_fastqc_zips})"
fail=true
fi

if [ $fail == "true" ]; then exit 1; fi

>>>
Expand Down
3 changes: 2 additions & 1 deletion test/smartseq2_single_sample/pr/dependencies.json
Original file line number Diff line number Diff line change
Expand Up @@ -5,5 +5,6 @@
"Picard.wdl": "library/tasks/Picard.wdl",
"RSEM.wdl": "library/tasks/RSEM.wdl",
"GroupMetricsOutputs.wdl": "library/tasks/GroupMetricsOutputs.wdl",
"ZarrUtils.wdl": "library/tasks/ZarrUtils.wdl"
"ZarrUtils.wdl": "library/tasks/ZarrUtils.wdl",
"FastQC.wdl": "library/tasks/FastQC.wdl"
}
5 changes: 4 additions & 1 deletion test/smartseq2_single_sample/pr/test_inputs.json
Original file line number Diff line number Diff line change
Expand Up @@ -14,5 +14,8 @@
"TestSmartSeq2SingleCellPR.sample_name":"SRR1294925",
"TestSmartSeq2SingleCellPR.output_name":"SRR1294925",
"TestSmartSeq2SingleCellPR.expected_counts_hash": "135a3fbb959583db17713dc8b9d7fe33",
"TestSmartSeq2SingleCellPR.expected_metrics_hash": "99bd9903ac8dc77eb1c047ffa8eb42ed"
"TestSmartSeq2SingleCellPR.expected_metrics_hash": "99bd9903ac8dc77eb1c047ffa8eb42ed",
"TestSmartSeq2SingleCellPR.expected_fastqc_html_strings": ["SRR1294925_1_skip250000_read250000.fastq FastQC Report","Sequence length</td><td>25",
"SRR1294925_2_skip250000_read250000.fastq FastQC Report"],
"TestSmartSeq2SingleCellPR.expected_n_fastqc_zips": 2
}
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ workflow TestSmartSeq2SingleCellPR {
String output_name
File fastq1
File fastq2
Array[String] expected_fastqc_html_strings
Int expected_n_fastqc_zips

call target_wdl.SmartSeq2SingleCell as target_workflow {
input:
Expand All @@ -47,7 +49,11 @@ workflow TestSmartSeq2SingleCellPR {
counts = target_workflow.rsem_gene_results,
expected_counts_hash = expected_counts_hash,
target_metrics = target_workflow.insert_size_metrics,
expected_metrics_hash = expected_metrics_hash
expected_metrics_hash = expected_metrics_hash,
fastqc_htmls = target_workflow.fastqc_htmls,
expected_fastqc_html_strings = expected_fastqc_html_strings,
n_fastqc_zips = length(target_workflow.fastqc_zips),
expected_n_fastqc_zips = expected_n_fastqc_zips
}

}