HumanCellAtlas · kachulis · May 13, 2019 · May 13, 2019 · May 14, 2019 · May 14, 2019
diff --git a/library/tasks/FastQC.wdl b/library/tasks/FastQC.wdl
@@ -0,0 +1,50 @@
+task FastQC {
+    Array[File] fastq_files
+    File? limits_file
+    Int startRead = 250000
+    Int nRead = 250000
+    String docker = "quay.io/biocontainers/fastqc:0.11.8--1"
+    Int machine_mem_mb = 3850
+    Int disk = 100
+    Int preemptible = 3
+
+    String dollar = "$"
+    parameter_meta {
+        fastq_files: "input fastq files"
+        limits_file: "(optional) limits file to use with fastqc"
+        startRead: "(optional) start fastqc at the nth read of the file"
+        nRead: "(optional) use (at most) n reads for fastqc"
+        docker: "(optional) the docker image containing the runtime environment for this task"
+        disk: "(optional) the amount of disk space (GiB) to provision for this task"
+        preemptible: "(optional) if non-zero, request a pre-emptible instance and allow for this number of preemptions before running the task on a non preemptible machine"
+        machine_mem_mb: "(optional) the amount of memory (MiB) to provision for this task"
+
+    }
+
+    command <<<
+        set -e
+
+        mkdir outputs
+        declare -a fastqs=()
+        for fastq in ${sep=' ' fastq_files}
+        do
+            outname=`basename ${dollar}fastq .fastq.gz`_skip${startRead}_read${nRead}.fastq
+            zcat ${dollar}fastq | head -n ${4*(startRead + nRead)} | tail -n ${4*nRead} > ${dollar}outname
+            fastqs+=(${dollar}outname)
+        done
+
+        fastqc ${dollar}{fastqs[@]} -o outputs ${"--limits " + limits_file}
+    >>>
+
+    runtime {
+        docker: docker
+        memory: "${machine_mem_mb} MiB"
+        disks: "local-disk ${disk} HDD"
+        preemptible: preemptible
+    }
+
+    output {
+        Array[File] fastqc_htmls = glob("outputs/*.html")
+        Array[File] fastqc_zips = glob("outputs/*.zip")
+    }
+}
diff --git a/pipelines/optimus/Optimus.wdl b/pipelines/optimus/Optimus.wdl
@@ -11,6 +11,7 @@ import "RunEmptyDrops.wdl" as RunEmptyDrops
 import "ZarrUtils.wdl" as ZarrUtils
 import "Picard.wdl" as Picard
 import "UmiCorrection.wdl" as UmiCorrection
+import "FastQC.wdl" as FastQC
 
 workflow Optimus {
   meta {
@@ -25,6 +26,9 @@ workflow Optimus {
   Array[File]? i1_fastq
   String sample_id
 
+  # fastqc input
+  File? fastqc_limits
+
   # organism reference parameters
   File tar_star_reference
   File annotations_gtf
@@ -51,6 +55,7 @@ workflow Optimus {
     r2_fastq: "reverse read, contains cDNA fragment generated from captured mRNA"
     i1_fastq: "(optional) index read, for demultiplexing of multiple samples on one flow cell."
     sample_id: "name of sample matching this file, inserted into read group header"
+    fastqc_limits: "(optional) limits file for fastqc"
     tar_star_reference: "star genome reference"
     annotations_gtf: "gtf containing annotations for gene tagging (must match star reference)"
     ref_genome_fasta: "genome fasta file (must match star reference)"
@@ -77,6 +82,13 @@ workflow Optimus {
           r2_unmapped_bam = FastqToUBam.bam_output,
           whitelist = whitelist
       }
+
+      call FastQC.FastQC as FastQC {
+        input:
+          fastq_files = [r1_fastq[index], r2_fastq[index], non_optional_i1_fastq[index]],
+          limits_file = fastqc_limits,
+          disk = ceil((size(r1_fastq[index], "GiB") + size(r2_fastq[index], "GiB") + size(non_optional_i1_fastq[index], "GiB")) * 1.2 + 10)
+      }
     }
 
     # if the index is not passed, proceed without it.
@@ -87,9 +99,18 @@ workflow Optimus {
           r2_unmapped_bam = FastqToUBam.bam_output,
           whitelist = whitelist
       }
+
+      call FastQC.FastQC as FastQCNoIndex {
+        input:
+          fastq_files = [r1_fastq[index], r2_fastq[index]],
+          limits_file = fastqc_limits,
+          disk = ceil((size(r1_fastq[index], "GiB") + size(r2_fastq[index], "GiB")) * 1.2 + 10)
+      }
     }
 
     File barcoded_bam = select_first([AttachBarcodes.bam_output, AttachBarcodesNoIndex.bam_output])
+    Array[File] fastqc_output_htmls = select_first([FastQC.fastqc_htmls,FastQCNoIndex.fastqc_htmls])
+    Array[File] fastqc_output_zips = select_first([FastQC.fastqc_zips,FastQCNoIndex.fastqc_zips])
   }
 
   call Merge.MergeSortBamFiles as MergeUnsorted {
@@ -222,5 +243,9 @@ workflow Optimus {
 
       # zarr
       Array[File]? zarr_output_files = OptimusZarrConversion.zarr_output_files
+
+      # fastqc
+      Array[File] fastqc_htmls = flatten(fastqc_output_htmls)
+      Array[File] fastqc_zips = flatten(fastqc_output_zips)
   }
 }
diff --git a/pipelines/smartseq2_single_sample/SmartSeq2SingleSample.wdl b/pipelines/smartseq2_single_sample/SmartSeq2SingleSample.wdl
@@ -3,6 +3,7 @@ import "Picard.wdl" as Picard
 import "RSEM.wdl" as RSEM
 import "GroupMetricsOutputs.wdl" as GroupQCs
 import "ZarrUtils.wdl" as ZarrUtils
+import "FastQC.wdl" as FastQC
 
 workflow SmartSeq2SingleCell {
   meta {
@@ -28,6 +29,8 @@ workflow SmartSeq2SingleCell {
   String output_name
   File fastq1
   File fastq2
+  # fastqc
+  File? fastqc_limits
   Int max_retries = 0
 
   # whether to convert the outputs to Zarr format, by default it's set to true
@@ -48,12 +51,20 @@ workflow SmartSeq2SingleCell {
     output_name: "Output name, can include path"
     fastq1: "R1 in paired end reads"
     fastq2: "R2 in paired end reads"
+    fastqc_limits: "(optional) limits file for fastqc"
     max_retries: "(optional) retry this number of times if task fails -- use with caution, see skylab README for details"
     output_zarr: "whether to run the taks that converts the outputs to Zarr format, by default it's true"
   }
 
   String quality_control_output_basename = output_name + "_qc"
 
+  call FastQC.FastQC {
+    input:
+      fastq_files = [fastq1, fastq2],
+      limits_file = fastqc_limits,
+      disk = ceil((size(fastq1, "GiB") + size(fastq2, "GiB")) * 1.2 + 10)
+  }
+
   call HISAT2.HISAT2PairedEnd {
     input:
       hisat2_ref = hisat2_ref_index,
@@ -149,5 +160,9 @@ workflow SmartSeq2SingleCell {
 
     # zarr
     Array[File]? zarr_output_files = SmartSeq2ZarrConversion.zarr_output_files
+
+    #fastqc
+    Array[File] fastqc_htmls = FastQC.fastqc_htmls
+    Array[File] fastqc_zips = FastQC.fastqc_zips
   }
 }
diff --git a/test/optimus/pr/ValidateOptimus.wdl b/test/optimus/pr/ValidateOptimus.wdl
@@ -3,13 +3,16 @@ task ValidateOptimus {
       File matrix
       File gene_metrics
       File cell_metrics
-
+      Array[File] fastqc_htmls
+      Int n_fastqc_zips
       Int required_disk = ceil((size(bam, "G") + size(matrix, "G")) * 1.1)
 
       String expected_bam_hash
       String expected_matrix_hash
       String expected_gene_metric_hash
       String expected_cell_metric_hash
+      Int expected_n_fastqc_zips
+      Array[String] expected_fastqc_html_strings
 
   command <<<
 
@@ -51,6 +54,25 @@ task ValidateOptimus {
       fail=true
     fi
 
+    #search for expected strings in fastqc html
+    for string in "${sep='" "' expected_fastqc_html_strings}"; do
+        fail_html=true
+        for htmlfile in ${sep=' ' fastqc_htmls}; do
+          if grep "$string" $htmlfile; then
+            fail_html=false
+          fi
+        done
+        if [ $fail_html == "true" ]; then
+          >&2 echo "expected string ($string) not found in fastqc html files"
+          fail=true
+        fi
+    done
+
+    if [ ${expected_n_fastqc_zips} != ${n_fastqc_zips} ]; then
+      >&2 echo "number of fastqc zip (${n_fastqc_zips}) did not match expected number (${expected_n_fastqc_zips})"
+      fail=true
+    fi
+
     if [ $fail == "true" ]; then exit 1; fi
 
   >>>

diff --git a/test/optimus/pr/dependencies.json b/test/optimus/pr/dependencies.json
@@ -14,5 +14,6 @@
   "RunEmptyDrops.wdl": "library/tasks/RunEmptyDrops.wdl",
   "ZarrUtils.wdl": "library/tasks/ZarrUtils.wdl",
   "Picard.wdl": "library/tasks/Picard.wdl",
-  "UmiCorrection.wdl": "library/tasks/UmiCorrection.wdl"
+  "UmiCorrection.wdl": "library/tasks/UmiCorrection.wdl",
+  "FastQC.wdl": "library/tasks/FastQC.wdl"
 }
diff --git a/test/optimus/pr/test_inputs.json b/test/optimus/pr/test_inputs.json
@@ -3,6 +3,10 @@
   "TestOptimusPR.expected_matrix_hash": "aec7a79dc7b85a5d621509a3f4fa2192",
   "TestOptimusPR.expected_cell_metric_hash": "45cc8be253445201b02d102d2d096a0c",
   "TestOptimusPR.expected_gene_metric_hash": "d636671dfcff6ec8068987d0f5780334",
+  "TestOptimusPR.expected_fastqc_html_strings": ["pbmc8k_S1_L007_I1_001_skip250000_read250000.fastq FastQC Report","Sequence length</td><td>8",
+    "pbmc8k_S1_L007_R1_001_skip250000_read250000.fastq FastQC Report","Sequence length</td><td>26",
+    "pbmc8k_S1_L007_R2_001_skip250000_read250000.fastq FastQC Report","Sequence length</td><td>98"],
+  "TestOptimusPR.expected_n_fastqc_zips": 6,
   "TestOptimusPR.r1_fastq": [
     "gs://hca-dcp-mint-test-data/10x/demo/fastqs/pbmc8k_S1_L007_R1_001.fastq.gz",
     "gs://hca-dcp-mint-test-data/10x/demo/fastqs/pbmc8k_S1_L007_R1_001.fastq.gz"

diff --git a/test/optimus/pr/test_optimus_PR.wdl b/test/optimus/pr/test_optimus_PR.wdl
@@ -10,6 +10,8 @@ workflow TestOptimusPR {
   String expected_matrix_hash
   String expected_gene_metric_hash
   String expected_cell_metric_hash
+  Array[String] expected_fastqc_html_strings # a set of strings to search for in the fastqc html outputs
+  Int expected_n_fastqc_zips
 
   # Optimus inputs
   Array[File] r1_fastq
@@ -40,10 +42,14 @@ workflow TestOptimusPR {
       matrix = target.matrix,
       gene_metrics = target.gene_metrics,
       cell_metrics = target.cell_metrics,
+      fastqc_htmls = target.fastqc_htmls,
+      n_fastqc_zips = length(target.fastqc_zips),
       expected_bam_hash = expected_bam_hash,
       expected_matrix_hash = expected_matrix_hash,
       expected_cell_metric_hash = expected_cell_metric_hash,
-      expected_gene_metric_hash = expected_gene_metric_hash
+      expected_gene_metric_hash = expected_gene_metric_hash,
+      expected_fastqc_html_strings = expected_fastqc_html_strings,
+      expected_n_fastqc_zips = expected_n_fastqc_zips
   }
 
 }
diff --git a/test/optimus/scientific/dependencies.json b/test/optimus/scientific/dependencies.json
@@ -12,5 +12,6 @@
   "SequenceDataWithMoleculeTagMetrics.wdl": "library/tasks/SequenceDataWithMoleculeTagMetrics.wdl",
   "TagSortBam.wdl": "library/tasks/TagSortBam.wdl",
   "Picard.wdl": "library/tasks/Picard.wdl",
-  "UmiCorrection.wdl": "library/tasks/UmiCorrection.wdl"
+  "UmiCorrection.wdl": "library/tasks/UmiCorrection.wdl",
+  "FastQC.wdl": "library/tasks/FastQC.wdl"
 }
diff --git a/test/smartseq2_single_sample/pr/ValidateSmartSeq2SingleCell.wdl b/test/smartseq2_single_sample/pr/ValidateSmartSeq2SingleCell.wdl
@@ -5,6 +5,12 @@ task ValidateSmartSeq2SingleCell {
       File target_metrics
       String expected_metrics_hash
 
+      Array[File] fastqc_htmls
+      Array[String] expected_fastqc_html_strings
+
+      Int expected_n_fastqc_zips
+      Int n_fastqc_zips
+
   command <<<
 
     # catch intermittent failures
@@ -28,6 +34,25 @@ task ValidateSmartSeq2SingleCell {
       fail=true
     fi
 
+    #search for expected strings in fastqc html
+    for string in "${sep='" "' expected_fastqc_html_strings}"; do
+        fail_html=true
+        for htmlfile in ${sep=' ' fastqc_htmls}; do
+          if grep "$string" $htmlfile; then
+            fail_html=false
+          fi
+        done
+        if [ $fail_html == "true" ]; then
+          >&2 echo "expected string ($string) not found in fastqc html files"
+          fail=true
+        fi
+    done
+
+    if [ ${expected_n_fastqc_zips} != ${n_fastqc_zips} ]; then
+      >&2 echo "number of fastqc zip (${n_fastqc_zips}) did not match expected number (${expected_n_fastqc_zips})"
+      fail=true
+    fi
+
     if [ $fail == "true" ]; then exit 1; fi
 
   >>>

diff --git a/test/smartseq2_single_sample/pr/dependencies.json b/test/smartseq2_single_sample/pr/dependencies.json
@@ -5,5 +5,6 @@
   "Picard.wdl": "library/tasks/Picard.wdl",
   "RSEM.wdl": "library/tasks/RSEM.wdl",
   "GroupMetricsOutputs.wdl": "library/tasks/GroupMetricsOutputs.wdl",
-  "ZarrUtils.wdl": "library/tasks/ZarrUtils.wdl"
+  "ZarrUtils.wdl": "library/tasks/ZarrUtils.wdl",
+  "FastQC.wdl": "library/tasks/FastQC.wdl"
 }
diff --git a/test/smartseq2_single_sample/pr/test_inputs.json b/test/smartseq2_single_sample/pr/test_inputs.json
@@ -14,5 +14,8 @@
   "TestSmartSeq2SingleCellPR.sample_name":"SRR1294925",
   "TestSmartSeq2SingleCellPR.output_name":"SRR1294925",
   "TestSmartSeq2SingleCellPR.expected_counts_hash": "135a3fbb959583db17713dc8b9d7fe33",
-  "TestSmartSeq2SingleCellPR.expected_metrics_hash": "99bd9903ac8dc77eb1c047ffa8eb42ed"
+  "TestSmartSeq2SingleCellPR.expected_metrics_hash": "99bd9903ac8dc77eb1c047ffa8eb42ed",
+  "TestSmartSeq2SingleCellPR.expected_fastqc_html_strings": ["SRR1294925_1_skip250000_read250000.fastq FastQC Report","Sequence length</td><td>25",
+    "SRR1294925_2_skip250000_read250000.fastq FastQC Report"],
+  "TestSmartSeq2SingleCellPR.expected_n_fastqc_zips": 2
 }
diff --git a/test/smartseq2_single_sample/pr/test_smartseq2_single_sample_PR.wdl b/test/smartseq2_single_sample/pr/test_smartseq2_single_sample_PR.wdl
@@ -23,6 +23,8 @@ workflow TestSmartSeq2SingleCellPR {
   String output_name
   File fastq1
   File fastq2
+  Array[String] expected_fastqc_html_strings
+  Int expected_n_fastqc_zips
 
   call target_wdl.SmartSeq2SingleCell as target_workflow {
     input:
@@ -47,7 +49,11 @@ workflow TestSmartSeq2SingleCellPR {
      counts = target_workflow.rsem_gene_results,
      expected_counts_hash = expected_counts_hash,
      target_metrics = target_workflow.insert_size_metrics,
-     expected_metrics_hash = expected_metrics_hash
+     expected_metrics_hash = expected_metrics_hash,
+     fastqc_htmls = target_workflow.fastqc_htmls,
+     expected_fastqc_html_strings = expected_fastqc_html_strings,
+     n_fastqc_zips = length(target_workflow.fastqc_zips),
+     expected_n_fastqc_zips = expected_n_fastqc_zips
   }
 
 }