Skip to content

Latest commit

 

History

History
654 lines (513 loc) · 19.1 KB

quick-start-simple-workflow.md

File metadata and controls

654 lines (513 loc) · 19.1 KB

Quick Start - Simple Workflow

Here is a workflow that trims reads, gzips them, and then aligns them using bowtie2. The initial file structure is 'data/raw/Sample_01', 'data/raw/Sample_02', and 'data/raw/Sample_03'

Directory Structure

This is our starting directory structure. BioX will also make the outdirs as the workflow is submitted.

my_analysis
    data/raw
      Sample_01/
         Sample_01_read1.fastq.gz
         Sample_01_read2.fastq.gz
      Sample_02/
         Sample_02_read1.fastq.gz
         Sample_02_read2.fastq.gz
      Sample_03/
         Sample_03_read1.fastq.gz
         Sample_03_read2.fastq.gz

Workflow Configuration

This is the my_workflow.yml file.

---
global:
    # ROOT Directory configurations
    # Aligning against raw reads
    - indir: "data/raw"
    - outdir: "data/processed"
    # Find Samples in indir
    - sample_rule: (Sample.*)$
    - by_sample_outdir: 1
    - find_by_dir: 1
    # Additional Dir Setup
    - trimmomatic_dir: "data/processed/{$sample}/trimmomatic"
    - bowtie2_dir: "data/processed/{$sample}/bowtie2"
    # Add additional variables to make the template a bit cleaner
    - TR1: "{$self->trimmomatic_dir}/{$sample}_read1_trimmomatic"
    - TR2: "{$self->trimmomatic_dir}/{$sample}_read2_trimmomatic"
    # NOTE(review): the generated bowtie2 commands shown later in this
    # document use data/genome.fa - confirm which path is intended here.
    - bowtie2_reference: "genome.fa"
rules:
    - trimmomatic:
        # Rule 1: paired-end trimming. OUTPUT->[0..3] are the PE/SE FASTQ
        # files; OUTPUT->[4] is the log path handed to -trimlog.
        local:
            - outdir: "{$self->trimmomatic_dir}"
            - INPUT:
              - "{$self->indir}/{$sample}_read1.fastq.gz"
              - "{$self->indir}/{$sample}_read2.fastq.gz"
            - OUTPUT:
              - "{$self->TR1}_1PE.fastq"
              - "{$self->TR1}_1SE.fastq"
              - "{$self->TR2}_2PE.fastq"
              - "{$self->TR2}_2SE.fastq"
              - "{$self->trimmomatic_dir}/{$sample}_trimmomatic.log"
        process: |
             #TASK tags={$sample}
             trimmomatic \
                PE -threads 6 \
                -trimlog {$self->OUTPUT->[4]} \
                {$self->INPUT->[0]} \
                {$self->INPUT->[1]} \
                {$self->OUTPUT->[0]} \
                {$self->OUTPUT->[1]} \
                {$self->OUTPUT->[2]} \
                {$self->OUTPUT->[3]} \
                TRAILING:3 LEADING:3 SLIDINGWINDOW:4:15 MINLEN:36
    - trimmomatic_gzip:
        # Rule 2: one gzip task per INPUT file. indir and outdir are both the
        # trimmomatic dir; each OUTPUT is the matching INPUT path plus ".gz".
        local:
          - indir: "{$self->trimmomatic_dir}"
          - outdir: "{$self->trimmomatic_dir}"
          - INPUT:
            - "{$self->TR1}_1PE.fastq"
            - "{$self->TR2}_2PE.fastq"
            - "{$self->TR1}_1SE.fastq"
            - "{$self->TR2}_2SE.fastq"
          - OUTPUT:
            - "{$self->TR1}_1PE.fastq.gz"
            - "{$self->TR2}_2PE.fastq.gz"
            - "{$self->TR1}_1SE.fastq.gz"
            - "{$self->TR2}_2SE.fastq.gz"
        process: |
            #TASK tags={$sample}
            gzip -f {$self->INPUT->[0]}

            #TASK tags={$sample}
            gzip -f {$self->INPUT->[1]}

            #TASK tags={$sample}
            gzip -f {$self->INPUT->[2]}

            #TASK tags={$sample}
            gzip -f {$self->INPUT->[3]}
    - bowtie2:
        # Rule 3: align the two gzipped paired-end (PE) files. OUTPUT is a
        # single value here rather than a list.
        local:
              - indir: "{$self->trimmomatic_dir}"
              - outdir: "{$self->bowtie2_dir}"
              - INPUT:
                - "{$self->TR1}_1PE.fastq.gz"
                - "{$self->TR2}_2PE.fastq.gz"
              - OUTPUT: "{$self->bowtie2_dir}/{$sample}_aligned.sam"
        process: |
                #TASK tags={$sample}
                bowtie2 -p 7 -x {$self->bowtie2_reference} \
                -1 {$self->INPUT->[0]} \
                -2 {$self->INPUT->[1]} \
                -S {$self->OUTPUT}

Workflow Script - Piece by Piece

The workflow script that BioX generates is very generously commented. Here we will break it down piece by piece.

Workflow Script - Global Parameters

BioX starts off by telling you how you ran it, and what your global parameters were.

#!/usr/bin/env bash


#
# Generated at: 2017-05-30T12:28:55
# This file was generated with the following options
#    run
#    -w    resequencing-human-complete-part1-trimm_reads_chunk.yml
#    -o    reseq.sh
#

#

# Starting Workflow

#
#
# Global Variables:
#    indir:
#        - data/raw
#    outdir:
#        - data/processed
#    sample_rule: (Sample.*)$
#    by_sample_outdir: 1
#    find_by_dir: 1
#    trimmomatic_dir: data/processed/{$sample}/trimmomatic
#    bowtie2_dir: data/processed/{$sample}/bowtie2
#    TR1: {$self->trimmomatic_dir}/{$sample}_read1_trimmomatic
#    TR2: {$self->trimmomatic_dir}/{$sample}_read2_trimmomatic
#    bowtie2_reference: data/genome.fa
#

Workflow Script - Rule 1 Trimmomatic

Comment Annotation

Once again, BioX adds quite a bit of information. First comes the 'Starting RuleX' block, then the variable declarations. If you are using the carry-over capabilities of BioX for the INPUT/OUTPUT or indir/outdir, you will see them here.

There is a very important distinction to make here with variables. We declared _TR1_ and _TR2_ in the global key. All variables declared in the global key are available throughout the workflow, but local variables are only available within the rule that declares them. An important exception to this is the stash variable, which is carried through from rule to rule. You can find out more about the stash in Advanced - Using the stash.

#
#

#
# Starting trimmomatic
#


#
# Variables
# Indir: data/raw
# Outdir: {$self->trimmomatic_dir}
#
# Local Variables:
#
#    outdir:
#        - '{$self->trimmomatic_dir}'

#    INPUT:
#        - '{$self->indir}/{$sample}_read1.fastq.gz'
#        - '{$self->indir}/{$sample}_read2.fastq.gz'

#    OUTPUT:
#        - '{$self->TR1}_1PE.fastq'
#        - '{$self->TR1}_1SE.fastq'
#        - '{$self->TR2}_2PE.fastq'
#        - '{$self->TR2}_2SE.fastq'
#        - '{$self->trimmomatic_dir}/{$sample}_trimmomatic.log'

#

#
### HPC Directives
#HPC jobname=trimmomatic
#

Bash!

BioX is written to be as transparent as possible. It outputs bash that can be run directly on the command line, so the generated script could be run as is. Sometimes when troubleshooting I like to take a single sample and a single command and run exactly what BioX gives, just to make sure everything runs the way I expect.

By default BioX resolves all special directory variables as absolute paths.

#TASK tags=Sample_01
trimmomatic \
PE -threads 6 \
-trimlog /home/user/my_analysis/data/processed/Sample_01/trimmomatic/Sample_01_trimmomatic.log \
/home/user/my_analysis/data/raw/Sample_01/Sample_01_read1.fastq.gz \
/home/user/my_analysis/data/raw/Sample_01/Sample_01_read2.fastq.gz \
/home/user/my_analysis/data/processed/Sample_01/trimmomatic/Sample_01_read1_trimmomatic_1PE.fastq \
/home/user/my_analysis/data/processed/Sample_01/trimmomatic/Sample_01_read1_trimmomatic_1SE.fastq \
/home/user/my_analysis/data/processed/Sample_01/trimmomatic/Sample_01_read2_trimmomatic_2PE.fastq \
/home/user/my_analysis/data/processed/Sample_01/trimmomatic/Sample_01_read2_trimmomatic_2SE.fastq \
TRAILING:3 LEADING:3 SLIDINGWINDOW:4:15 MINLEN:36


#TASK tags=Sample_02
trimmomatic \
PE -threads 6 \
-trimlog /home/user/my_analysis/data/processed/Sample_02/trimmomatic/Sample_02_trimmomatic.log \
/home/user/my_analysis/data/raw/Sample_02/Sample_02_read1.fastq.gz \
/home/user/my_analysis/data/raw/Sample_02/Sample_02_read2.fastq.gz \
/home/user/my_analysis/data/processed/Sample_02/trimmomatic/Sample_02_read1_trimmomatic_1PE.fastq \
/home/user/my_analysis/data/processed/Sample_02/trimmomatic/Sample_02_read1_trimmomatic_1SE.fastq \
/home/user/my_analysis/data/processed/Sample_02/trimmomatic/Sample_02_read2_trimmomatic_2PE.fastq \
/home/user/my_analysis/data/processed/Sample_02/trimmomatic/Sample_02_read2_trimmomatic_2SE.fastq \
TRAILING:3 LEADING:3 SLIDINGWINDOW:4:15 MINLEN:36


#TASK tags=Sample_03
trimmomatic \
PE -threads 6 \
-trimlog /home/user/my_analysis/data/processed/Sample_03/trimmomatic/Sample_03_trimmomatic.log \
/home/user/my_analysis/data/raw/Sample_03/Sample_03_read1.fastq.gz \
/home/user/my_analysis/data/raw/Sample_03/Sample_03_read2.fastq.gz \
/home/user/my_analysis/data/processed/Sample_03/trimmomatic/Sample_03_read1_trimmomatic_1PE.fastq \
/home/user/my_analysis/data/processed/Sample_03/trimmomatic/Sample_03_read1_trimmomatic_1SE.fastq \
/home/user/my_analysis/data/processed/Sample_03/trimmomatic/Sample_03_read2_trimmomatic_2PE.fastq \
/home/user/my_analysis/data/processed/Sample_03/trimmomatic/Sample_03_read2_trimmomatic_2SE.fastq \
TRAILING:3 LEADING:3 SLIDINGWINDOW:4:15 MINLEN:36

Workflow Script - Rule 2 TrimmomaticGzip

Comment Annotation

This looks very similar to what we had before. Now we have 4 _INPUT_ entries and 4 _OUTPUT_ entries, one for each of the FASTQ files trimmomatic produced. We aren't moving any files around, so both the indir and outdir are defined as the trimmomatic dir.

#
#

#
# Starting trimmomatic_gzip
#


#
# Variables
# Indir: {$self->trimmomatic_dir}
# Outdir: {$self->trimmomatic_dir}
#
# Local Variables:
#
#    indir:
#        - '{$self->trimmomatic_dir}'

#    outdir:
#        - '{$self->trimmomatic_dir}'

#    INPUT:
#        - '{$self->TR1}_1PE.fastq'
#        - '{$self->TR2}_2PE.fastq'
#        - '{$self->TR1}_1SE.fastq'
#        - '{$self->TR2}_2SE.fastq'

#    OUTPUT:
#        - '{$self->TR1}_1PE.fastq.gz'
#        - '{$self->TR2}_2PE.fastq.gz'
#        - '{$self->TR1}_1SE.fastq.gz'
#        - '{$self->TR2}_2SE.fastq.gz'

#

#
### HPC Directives
#HPC jobname=trimmomatic_gzip
#

Bash!

Same as before, BioX gives us a bash file of commands.

For the sake of readability I removed the full path name, but in the real world 'data/processed' would be '/home/user/my_analysis/data/processed'.

#TASK tags=Sample_01
gzip -f data/processed/Sample_01/trimmomatic/Sample_01_read1_trimmomatic_1PE.fastq
#TASK tags=Sample_01
gzip -f data/processed/Sample_01/trimmomatic/Sample_01_read2_trimmomatic_2PE.fastq
#TASK tags=Sample_01
gzip -f data/processed/Sample_01/trimmomatic/Sample_01_read1_trimmomatic_1SE.fastq
#TASK tags=Sample_01
gzip -f data/processed/Sample_01/trimmomatic/Sample_01_read2_trimmomatic_2SE.fastq


#TASK tags=Sample_02
gzip -f data/processed/Sample_02/trimmomatic/Sample_02_read1_trimmomatic_1PE.fastq
#TASK tags=Sample_02
gzip -f data/processed/Sample_02/trimmomatic/Sample_02_read2_trimmomatic_2PE.fastq
#TASK tags=Sample_02
gzip -f data/processed/Sample_02/trimmomatic/Sample_02_read1_trimmomatic_1SE.fastq
#TASK tags=Sample_02
gzip -f data/processed/Sample_02/trimmomatic/Sample_02_read2_trimmomatic_2SE.fastq


#TASK tags=Sample_03
gzip -f data/processed/Sample_03/trimmomatic/Sample_03_read1_trimmomatic_1PE.fastq
#TASK tags=Sample_03
gzip -f data/processed/Sample_03/trimmomatic/Sample_03_read2_trimmomatic_2PE.fastq
#TASK tags=Sample_03
gzip -f data/processed/Sample_03/trimmomatic/Sample_03_read1_trimmomatic_1SE.fastq
#TASK tags=Sample_03
gzip -f data/processed/Sample_03/trimmomatic/Sample_03_read2_trimmomatic_2SE.fastq

Workflow Script - Rule 3 Bowtie2

Comment Annotation

What we see is very similar to what we saw in the previous rules.

#
#

#
# Starting bowtie2
#


#
# Variables
# Indir: {$self->trimmomatic_dir}
# Outdir: {$self->bowtie2_dir}
#
# Local Variables:
#
#    indir:
#        - '{$self->trimmomatic_dir}'

#    outdir:
#        - '{$self->bowtie2_dir}'

#    INPUT:
#        - '{$self->TR1}_1PE.fastq.gz'
#        - '{$self->TR2}_2PE.fastq.gz'

#    OUTPUT: {$self->bowtie2_dir}/{$sample}_aligned.sam

#

#
### HPC Directives
#HPC jobname=bowtie2
#

Notice that since we have a single _OUTPUT_, it can be declared as a key/value pair instead of a key/list.

# THIS IS A LIST

# INPUT:
# - '{$self->TR1}_1PE.fastq.gz'
# - '{$self->TR2}_2PE.fastq.gz'

# THIS IS A VALUE
# OUTPUT: {$self->bowtie2_dir}/{$sample}_aligned.sam

Bash!

#TASK tags=Sample_01
bowtie2 -p 7 -x data/genome.fa \
-1 data/processed/Sample_01/trimmomatic/Sample_01_read1_trimmomatic_1PE.fastq.gz \
-2 data/processed/Sample_01/trimmomatic/Sample_01_read2_trimmomatic_2PE.fastq.gz \
-S data/processed/Sample_01/bowtie2/Sample_01_aligned.sam


#TASK tags=Sample_02
bowtie2 -p 7 -x data/genome.fa \
-1 data/processed/Sample_02/trimmomatic/Sample_02_read1_trimmomatic_1PE.fastq.gz \
-2 data/processed/Sample_02/trimmomatic/Sample_02_read2_trimmomatic_2PE.fastq.gz \
-S data/processed/Sample_02/bowtie2/Sample_02_aligned.sam


#TASK tags=Sample_03
bowtie2 -p 7 -x data/genome.fa \
-1 data/processed/Sample_03/trimmomatic/Sample_03_read1_trimmomatic_1PE.fastq.gz \
-2 data/processed/Sample_03/trimmomatic/Sample_03_read2_trimmomatic_2PE.fastq.gz \
-S data/processed/Sample_03/bowtie2/Sample_03_aligned.sam

Altogether Now

This is the entire workflow as a whole.

Starting with trimmomatic_gzip, I removed the '/home/user/my_analysis' in order to make the file more readable.

#!/usr/bin/env bash


#
# Generated at: 2017-05-30T12:28:55
# This file was generated with the following options
#    run
#    -w    resequencing-human-complete-part1-trimm_reads_chunk.yml
#    -o    reseq.sh
#

#

# Starting Workflow

#
#
# Global Variables:
#    indir:
#        - data/raw
#    outdir:
#        - data/processed
#    sample_rule: (Sample.*)$
#    by_sample_outdir: 1
#    find_by_dir: 1
#    trimmomatic_dir: data/processed/{$sample}/trimmomatic
#    bowtie2_dir: data/processed/{$sample}/bowtie2
#    TR1: {$self->trimmomatic_dir}/{$sample}_read1_trimmomatic
#    TR2: {$self->trimmomatic_dir}/{$sample}_read2_trimmomatic
#    bowtie2_reference: data/genome.fa
#

#
#

#
# Starting trimmomatic
#


#
# Variables
# Indir: data/raw
# Outdir: {$self->trimmomatic_dir}
#
# Local Variables:
#
#    outdir:
#        - '{$self->trimmomatic_dir}'

#    INPUT:
#        - '{$self->indir}/{$sample}_read1.fastq.gz'
#        - '{$self->indir}/{$sample}_read2.fastq.gz'

#    OUTPUT:
#        - '{$self->TR1}_1PE.fastq'
#        - '{$self->TR1}_1SE.fastq'
#        - '{$self->TR2}_2PE.fastq'
#        - '{$self->TR2}_2SE.fastq'
#        - '{$self->trimmomatic_dir}/{$sample}_trimmomatic.log'

#

#
### HPC Directives
#HPC jobname=trimmomatic
#


#TASK tags=Sample_01
trimmomatic \
PE -threads 6 \
-trimlog /home/user/my_analysis/data/processed/Sample_01/trimmomatic/Sample_01_trimmomatic.log \
/home/user/my_analysis/data/raw/Sample_01/Sample_01_read1.fastq.gz \
/home/user/my_analysis/data/raw/Sample_01/Sample_01_read2.fastq.gz \
/home/user/my_analysis/data/processed/Sample_01/trimmomatic/Sample_01_read1_trimmomatic_1PE.fastq \
/home/user/my_analysis/data/processed/Sample_01/trimmomatic/Sample_01_read1_trimmomatic_1SE.fastq \
/home/user/my_analysis/data/processed/Sample_01/trimmomatic/Sample_01_read2_trimmomatic_2PE.fastq \
/home/user/my_analysis/data/processed/Sample_01/trimmomatic/Sample_01_read2_trimmomatic_2SE.fastq \
TRAILING:3 LEADING:3 SLIDINGWINDOW:4:15 MINLEN:36


#TASK tags=Sample_02
trimmomatic \
PE -threads 6 \
-trimlog /home/user/my_analysis/data/processed/Sample_02/trimmomatic/Sample_02_trimmomatic.log \
/home/user/my_analysis/data/raw/Sample_02/Sample_02_read1.fastq.gz \
/home/user/my_analysis/data/raw/Sample_02/Sample_02_read2.fastq.gz \
/home/user/my_analysis/data/processed/Sample_02/trimmomatic/Sample_02_read1_trimmomatic_1PE.fastq \
/home/user/my_analysis/data/processed/Sample_02/trimmomatic/Sample_02_read1_trimmomatic_1SE.fastq \
/home/user/my_analysis/data/processed/Sample_02/trimmomatic/Sample_02_read2_trimmomatic_2PE.fastq \
/home/user/my_analysis/data/processed/Sample_02/trimmomatic/Sample_02_read2_trimmomatic_2SE.fastq \
TRAILING:3 LEADING:3 SLIDINGWINDOW:4:15 MINLEN:36


#TASK tags=Sample_03
trimmomatic \
PE -threads 6 \
-trimlog /home/user/my_analysis/data/processed/Sample_03/trimmomatic/Sample_03_trimmomatic.log \
/home/user/my_analysis/data/raw/Sample_03/Sample_03_read1.fastq.gz \
/home/user/my_analysis/data/raw/Sample_03/Sample_03_read2.fastq.gz \
/home/user/my_analysis/data/processed/Sample_03/trimmomatic/Sample_03_read1_trimmomatic_1PE.fastq \
/home/user/my_analysis/data/processed/Sample_03/trimmomatic/Sample_03_read1_trimmomatic_1SE.fastq \
/home/user/my_analysis/data/processed/Sample_03/trimmomatic/Sample_03_read2_trimmomatic_2PE.fastq \
/home/user/my_analysis/data/processed/Sample_03/trimmomatic/Sample_03_read2_trimmomatic_2SE.fastq \
TRAILING:3 LEADING:3 SLIDINGWINDOW:4:15 MINLEN:36

#######################################################
###FROM HERE DOWN I REMOVED /home/user/my_analysis/
#######################################################
#
#

#
# Starting trimmomatic_gzip
#


#
# Variables
# Indir: {$self->trimmomatic_dir}
# Outdir: {$self->trimmomatic_dir}
#
# Local Variables:
#
#    indir:
#        - '{$self->trimmomatic_dir}'

#    outdir:
#        - '{$self->trimmomatic_dir}'

#    INPUT:
#        - '{$self->TR1}_1PE.fastq'
#        - '{$self->TR2}_2PE.fastq'
#        - '{$self->TR1}_1SE.fastq'
#        - '{$self->TR2}_2SE.fastq'

#    OUTPUT:
#        - '{$self->TR1}_1PE.fastq.gz'
#        - '{$self->TR2}_2PE.fastq.gz'
#        - '{$self->TR1}_1SE.fastq.gz'
#        - '{$self->TR2}_2SE.fastq.gz'

#

#
### HPC Directives
#HPC jobname=trimmomatic_gzip
#


#TASK tags=Sample_01
gzip -f data/processed/Sample_01/trimmomatic/Sample_01_read1_trimmomatic_1PE.fastq
#TASK tags=Sample_01
gzip -f data/processed/Sample_01/trimmomatic/Sample_01_read2_trimmomatic_2PE.fastq
#TASK tags=Sample_01
gzip -f data/processed/Sample_01/trimmomatic/Sample_01_read1_trimmomatic_1SE.fastq
#TASK tags=Sample_01
gzip -f data/processed/Sample_01/trimmomatic/Sample_01_read2_trimmomatic_2SE.fastq


#TASK tags=Sample_02
gzip -f data/processed/Sample_02/trimmomatic/Sample_02_read1_trimmomatic_1PE.fastq
#TASK tags=Sample_02
gzip -f data/processed/Sample_02/trimmomatic/Sample_02_read2_trimmomatic_2PE.fastq
#TASK tags=Sample_02
gzip -f data/processed/Sample_02/trimmomatic/Sample_02_read1_trimmomatic_1SE.fastq
#TASK tags=Sample_02
gzip -f data/processed/Sample_02/trimmomatic/Sample_02_read2_trimmomatic_2SE.fastq


#TASK tags=Sample_03
gzip -f data/processed/Sample_03/trimmomatic/Sample_03_read1_trimmomatic_1PE.fastq
#TASK tags=Sample_03
gzip -f data/processed/Sample_03/trimmomatic/Sample_03_read2_trimmomatic_2PE.fastq
#TASK tags=Sample_03
gzip -f data/processed/Sample_03/trimmomatic/Sample_03_read1_trimmomatic_1SE.fastq
#TASK tags=Sample_03
gzip -f data/processed/Sample_03/trimmomatic/Sample_03_read2_trimmomatic_2SE.fastq


#
#

#
# Starting bowtie2
#


#
# Variables
# Indir: {$self->trimmomatic_dir}
# Outdir: {$self->bowtie2_dir}
#
# Local Variables:
#
#    indir:
#        - '{$self->trimmomatic_dir}'

#    outdir:
#        - '{$self->bowtie2_dir}'

#    INPUT:
#        - '{$self->TR1}_1PE.fastq.gz'
#        - '{$self->TR2}_2PE.fastq.gz'

#    OUTPUT: {$self->bowtie2_dir}/{$sample}_aligned.sam

#

#
### HPC Directives
#HPC jobname=bowtie2
#


#TASK tags=Sample_01
bowtie2 -p 7 -x data/genome.fa \
-1 data/processed/Sample_01/trimmomatic/Sample_01_read1_trimmomatic_1PE.fastq.gz \
-2 data/processed/Sample_01/trimmomatic/Sample_01_read2_trimmomatic_2PE.fastq.gz \
-S data/processed/Sample_01/bowtie2/Sample_01_aligned.sam


#TASK tags=Sample_02
bowtie2 -p 7 -x data/genome.fa \
-1 data/processed/Sample_02/trimmomatic/Sample_02_read1_trimmomatic_1PE.fastq.gz \
-2 data/processed/Sample_02/trimmomatic/Sample_02_read2_trimmomatic_2PE.fastq.gz \
-S data/processed/Sample_02/bowtie2/Sample_02_aligned.sam


#TASK tags=Sample_03
bowtie2 -p 7 -x data/genome.fa \
-1 data/processed/Sample_03/trimmomatic/Sample_03_read1_trimmomatic_1PE.fastq.gz \
-2 data/processed/Sample_03/trimmomatic/Sample_03_read2_trimmomatic_2PE.fastq.gz \
-S data/processed/Sample_03/bowtie2/Sample_03_aligned.sam