# rnaseq_without_DGE-Mouse10.yml
---
global:
# Root input and output directories
- indir: data/processed
- outdir: data/analysis
# Sample discovery
- sample_rule: '(Sample.*)$'
- by_sample_outdir: 1
- find_by_dir: 1
# Analysis directory layout
- analysis_dir: data/analysis/
- raw_dir: data/raw
- rename_dir: data/raw/rename
- trimmomatic_dir: 'data/processed/{$sample}/trimmomatic'
- trimmomatic_fastqc_dir: 'data/processed/{$sample}/trimmomatic_fastqc'
- htseq_dir: 'data/analysis/Counts'
# data_dir and the paths derived from it are very site-specific -
# be sure to change these!
- data_dir: '/scratch/Reference_Genomes/Public/Vertebrate_mammalian/Mus_musculus/ENSEMBL-release-82-GRCm38'
- ANNOTATION: '{$self->data_dir}/tr_Mus_musculus_GRCm38/Mus_musculus.GRCm38.82'
- REFERENCE: '{$self->data_dir}/Mus_musculus.GRCm38.dna.toplevel'
# HPC directives declared at global scope; the rules below also declare
# their own HPC entries (walltime, mem, cpus_per_task, ...).
- HPC:
  - module: 'gencore gencore_dev gencore_rnaseq'
  - partition: 'serial'
  - commands_per_node: 1
  - cpus_per_task: 1
rules:
- create_analysis_dirs:
    ## Creates the shared sub-directories of analysis_dir up front.
    ## FPKMs, BAMs, Transcripts and Counts are written to by later rules in
    ## this file; CUFFDIFF and Counts/DESeq2 are created here but are not
    ## referenced by any other rule in this file.
    local:
    - create_outdir: 0
    - override_process: 1
    - HPC:
      - walltime: '02:00:00'
      - mem: '4GB'
      - commands_per_node: 100
    process: |
      mkdir -p {$self->analysis_dir}/FPKMs && \
      mkdir -p {$self->analysis_dir}/BAMs && \
      mkdir -p {$self->analysis_dir}/Transcripts && \
      mkdir -p {$self->analysis_dir}/CUFFDIFF && \
      mkdir -p {$self->analysis_dir}/Counts/DESeq2
- tophat2:
    ## Runs tophat2 once per sample on the two trimmed read files
    ## (read1 *_1PE / read2 *_2PE) found in trimmomatic_dir, writing to this
    ## rule's outdir. ANNOTATION is passed as the transcriptome index and
    ## REFERENCE as the positional reference argument.
    local:
    - create_outdir: 1
    - indir: "{$self->trimmomatic_dir}"
    - HPC:
      - deps: create_analysis_dirs
      - walltime: '12:00:00'
      - mem: '60GB'
      ## Keep in sync with the -p value passed to tophat2 below.
      - cpus_per_task: 12
    ## REFERENCE is read through its accessor, like every other variable in
    ## this file, rather than by reaching into the object hash.
    process: |
      #TASK tags={$sample}
      tophat2 -o {$self->outdir} \
      --no-novel-juncs -p 12 --transcriptome-index={$self->ANNOTATION} \
      {$self->REFERENCE} \
      {$self->trimmomatic_dir}/{$sample}_read1_trimmomatic_1PE.fastq.gz \
      {$self->trimmomatic_dir}/{$sample}_read2_trimmomatic_2PE.fastq.gz
- cufflinks:
    ## Runs cufflinks once per sample on the accepted_hits.bam found in this
    ## rule's indir (presumably the tophat2 outdir - confirm), passing
    ## ANNOTATION via -G and writing to this rule's outdir.
    ## Nothing is relocated here: the results stay in place for now and the
    ## move_data rule moves them afterwards.
    ## NOTE: OUTPUT is declared but not referenced by the process template.
    local:
    - INPUT: '{$self->indir}/accepted_hits.bam'
    - OUTPUT: '{$self->analysis_dir}/BAMs/{$sample}_accepted_hits.bam'
    - HPC:
      - deps: tophat2
      - mem: '60GB'
      - walltime: '10:00:00'
      - cpus_per_task: 12
    process: |
      #TASK tags={$sample}
      cufflinks -p 12 --no-update-check -o {$self->outdir} \
      -G {$self->ANNOTATION} {$self->INPUT}
- move_data:
    ## Relocates per-sample results for downstream analysis, because a
    ## downstream tool requires its data to sit together in one directory:
    ##   accepted_hits.bam -> BAMs/{sample}_accepted_hits.bam
    ##   gene*             -> FPKMs/{sample}_gene_fpkms.txt
    ##   transcrip*        -> Transcripts/{sample}_transcripts.gtf
    ## The three mv commands are joined with ';', so each one runs even if
    ## the previous one failed.
    local:
    - tophat2_dir: 'data/analysis/{$sample}/tophat2'
    - outdir: '{$self->analysis_dir}'
    - INPUT: '{$self->tophat2_dir}/accepted_hits.bam'
    - OUTPUT: '{$self->analysis_dir}/BAMs/{$sample}_accepted_hits.bam'
    - HPC:
      - deps: cufflinks
      - mem: '4GB'
      - walltime: '10:00:00'
      - cpus_per_task: 1
      - commands_per_node: 100
    process: |
      #TASK tags={$sample}
      mv {$self->INPUT} {$self->OUTPUT}; \
      mv {$self->outdir}/gene* {$self->analysis_dir}/FPKMs/{$sample}_gene_fpkms.txt; \
      mv {$self->outdir}/transcrip* {$self->analysis_dir}/Transcripts/{$sample}_transcripts.gtf
- samtools_sort:
    ## Sorts each sample's relocated BAM with `samtools sort -n`, writing
    ## BAMs/{sample}_sorted.accepted_hits.bam next to the input.
    local:
    - outdir: '{$self->analysis_dir}'
    - INPUT: '{$self->analysis_dir}/BAMs/{$sample}_accepted_hits.bam'
    - OUTPUT: '{$self->analysis_dir}/BAMs/{$sample}_sorted.accepted_hits.bam'
    - HPC:
      - deps: move_data
      - mem: '80GB'
      - walltime: '10:00:00'
      ## Matches the -@ 6 thread option passed to samtools below.
      - cpus_per_task: 6
    process: |
      #TASK tags={$sample}
      samtools sort -n -@ 6 -O bam \
      -o {$self->OUTPUT} \
      {$self->INPUT}
- htseq_count:
    ## htseq-count, or at least this version, requires its files to follow a
    ## particular naming scheme and to live in the same directory.
    ## The process strips the "Sample_" prefix from the sample name into the
    ## shell variable `name`, runs htseq-count on the sorted BAM, and then
    ## deletes every line starting with "__" from the counts file.
    ## NOTE: $name in OUTPUT sits outside the {...} template delimiters; it is
    ## presumably passed through untouched and expanded by the shell at run
    ## time - confirm against the workflow engine.
    local:
    - INPUT: '{$self->analysis_dir}/BAMs/{$sample}_sorted.accepted_hits.bam'
    - OUTPUT: '{$self->analysis_dir}/Counts/htseq_$name.txt'
    - HPC:
      - deps: samtools_sort
      - mem: '35GB'
      - walltime: '10:00:00'
    process: |
      #TASK tags={$sample}
      name=`echo {$sample}| sed s/Sample_//` && \
      htseq-count -f bam -s no -t exon \
      -i gene_id \
      {$self->INPUT} \
      {$self->ANNOTATION} > {$self->OUTPUT} && \
      sed -i '/^__.*/d' {$self->OUTPUT}
- clean_dirs:
    ## Since we moved data around we will clean up the directories:
    ## delete every directory left empty under analysis_dir.
    ## The search is rooted at analysis_dir because that is the tree the
    ## rules in this file write to; no local data_dir override is declared,
    ## so the global reference-genome data_dir is not shadowed in this rule.
    local:
    - override_process: 1
    - HPC:
      - deps: htseq_count
      - mem: '4GB'
      - walltime: '10:00:00'
    process: |
      find {$self->analysis_dir} -type d -empty -delete