-
Notifications
You must be signed in to change notification settings - Fork 1
/
pipelineCREAM.sh
249 lines (204 loc) · 9.13 KB
/
pipelineCREAM.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
#/usr/bin/env bash
while getopts i:o:m:g:t:f:c:a:h:d:b:n:p:q: option
do
case "${option}"
in
o) OUTDIR=${OPTARG};;
m) IMGDIR=${OPTARG};;
i) InputFile=${OPTARG};;
g) GencodeFile=${OPTARG};;
t) threshold=${OPTARG};;
f) FANTOM5=${OPTARG};;
c) TADS=${OPTARG};;
a) GENOMEFA=${OPTARG};;
h) HOCOMOCOthresholds=${OPTARG};;
d) PWMDIR=${OPTARG};;
b) HOCOMOCOtoTF=${OPTARG};;
n) interact=${OPTARG};;
p) PPI=${OPTARG};;
q) Cliques=${OPTARG};;
esac
done
display_usage() {
echo -e "\nUsage: pipeline.sh -i InputDir -o OutputDir -g gencodeFile -t threshold -f Fantom5PromEnhancerInteractionsFile -c fileWithTADs -a genome.fa -h HOCOMOCOthresholds -d folderWithPWMs -b HOCOMOCOfullannotations\n"
echo -e "\nExample: pipeline.sh -i ATAC.XXX.rep1_peaks.narrowPeak -o /home/user/outputDir -g gencode.v19.annotation.gff3 -t 4.99 -f hg19_enhancer_tss_associations_FANTOM5data.bed -c allTADS.bed -a hg19.fa -h HOCOMOCOv11_core_standard_thresholds_HUMAN_mono.txt -d HOCOMOCOv11_core_pwm -b HOCOMOCOv11_full_annotation_HUMAN_mono.tsv\n"
echo "You can get all necessary HOCOMOCO files (PWMs and thresholds) here: "
echo "PWMs: http://hocomoco11.autosome.ru/final_bundle/hocomoco11/core/HUMAN/mono/HOCOMOCOv11_core_pwm_HUMAN_mono.tar.gz "
echo "Thresholds: http://hocomoco11.autosome.ru/final_bundle/hocomoco11/core/HUMAN/mono/HOCOMOCOv11_core_standard_thresholds_HUMAN_mono.txt "
echo "HOCOMOCO to TF: http://hocomoco11.autosome.ru/final_bundle/hocomoco11/full/HUMAN/mono/HOCOMOCOv11_full_annotation_HUMAN_mono.tsv "
}
SCRIPTPATH=`dirname $0`
if [ ! -f "$HOCOMOCOtoTF" ]
then
echo -e "\nERROR: You have to set the file with HOCOMOCO full annotation (e.g. hocomoco11.autosome.ru/final_bundle/hocomoco11/full/HUMAN/mono/HOCOMOCOv11_full_annotation_HUMAN_mono.tsv)"
display_usage
exit 1
fi
if [ ! -f "$HOCOMOCOthresholds" ]
then
echo -e "\nERROR: You have to set the file with HOCOMOCO thresholds (e.g. http://hocomoco11.autosome.ru/final_bundle/hocomoco11/core/HUMAN/mono/HOCOMOCOv11_core_standard_thresholds_HUMAN_mono.txt)"
display_usage
exit 1
fi
if [ ! -d "$PWMDIR" ]
then
echo -e "\nERROR: You have to set the directory with HOCOMOCO PWM (e.g. get it here: http://hocomoco11.autosome.ru/final_bundle/hocomoco11/core/HUMAN/mono/HOCOMOCOv11_core_pwm_HUMAN_mono.tar.gz)"
display_usage
exit 1
fi
if [ ! -f "$FANTOM5" ]
then
echo -e "\nERROR: You have to set the .bed file with promoter-enhancer interactions from FANTOM5"
display_usage
exit 1
fi
if [ ! -f "$GENOMEFA" ]
then
echo -e "\nERROR: You have to set the .fasta file with the genomic sequences"
display_usage
exit 1
fi
if [ ! -f "$TADS" ]
then
echo -e "\nERROR: You have to set the .bed file with TADs"
display_usage
exit 1
fi
if [ ! -f "$InputFile" ]
then
echo -e "\nERROR: You have to set the .bed file with ATAC-seq peaks (e.g. narrowPeak file from the HMCan output)"
display_usage
exit 1
fi
if [ -z "$threshold" ]
then
echo -e "\nERROR: You have to set the score threshold for the .bed file with ATAC-seq peaks"
display_usage
exit 1
fi
if [ ! -f "$GencodeFile" ]
then
echo -e "\nERROR: You have to set the file with gene annotation (.gff)"
display_usage
exit 1
fi
if [ -z "$OUTDIR" ]
then
echo -e "\nERROR: You have to set the output directory"
display_usage
exit 1
fi
echo "TFhubsFinder -- input parameters:"
echo "Input file: $InputFile"
echo "Threshold: $threshold"
echo "Output Directory: $OUTDIR"
echo "GFF file with gene information: $GencodeFile"
echo "BED file with promoter-enhancer interactions: $FANTOM5"
echo "BED file with TADs: $TADS"
echo "FASTA file with the genome: $GENOMEFA"
echo "File with HOCOMOCO thresholds: $HOCOMOCOthresholds"
echo "File with full HOCOMOCO annotations: $HOCOMOCOtoTF"
echo "Use Protein-Protein Interaction: $interact"
echo "PPI file : $PPI"
echo "Finding the best $Cliques cliques"
echo "TFhubsFinder create the output directory if it does not exist"
mkdir $OUTDIR || true
rm -rf $OUTDIR/FASTA/ || true
mkdir $OUTDIR/FASTA/ || true
rm -rf $OUTDIR/TFBS/ || true
mkdir $OUTDIR/TFBS/ || true
output_fasta=$OUTDIR/FASTA/
TFBS_output=$OUTDIR/TFBS/
mkdir $TFBS_output/promoters
mkdir $TFBS_output/FANTOM5
mkdir $TFBS_output/TADS
file=$(basename $InputFile)
prepared_output=$OUTDIR/${file}.prepared.bed
prepcream_output=$OUTDIR/${file}.prepcream.bed
fprepared_output=$OUTDIR/${file}.fprepared.bed
cores_output=$OUTDIR/${file}.cores.bed
promoters_output=$OUTDIR/${file}.promoters.bed
uniq_promoters_output=$OUTDIR/${file}.uniq_promoters.bed
enhancers_output=$OUTDIR/${file}.enhancers.bed
FANTOM5_enhancers_output=$OUTDIR/${file}.FANTOM5_enhanc.bed
uniq_FANTOM5_enhanc_output=$OUTDIR/${file}.uniq_FANTOM5_enhanc.bed
enhancers_left_output=$OUTDIR/${file}.enhanc_left.bed
combined_enhanc_tads_output=$OUTDIR/${file}.comb_enhanc_tads.bed
unfiltered_tads_enhanc_output=$OUTDIR/${file}.unfilt_enhanc_tads.bed
TADS_with_repeats_output=$OUTDIR/${file}.TADS_with_repeats
TADS_enhancers_output=$OUTDIR/${file}.TADS_enhanc.bed
promoters_fasta_output=$output_fasta/${file}.promoters.fa
FANTOM5_enhanc_fasta_output=$output_fasta/${file}.FANTOM5.fa
TADS_enhanc_fasta_output=$output_fasta/${file}.TADS.fa
graph_output=$OUTDIR/${file}.gefx
summ_output=$OUTDIR/${file}.summary.bed
rm $OUTDIR/${file}.summary.bed.idx
date
echo "0. getting transcripts..."
tail -n +8 $GencodeFile | awk '
BEGIN {OFS="\t"}{
if ($7 == "+") {print $1,$3,$4,$4+1,$9}
else if ($7 == "-") {print $1,$3,$5-1,$5,$9}
}' | grep -wE "(transcript)" | awk 'BEGIN {OFS="\t"}{print $1,$2,$3,$4,$5}' > $OUTDIR/gencode.v19.TSS.txt
echo "done!"
echo "determining promoter regions..."
cat $OUTDIR/gencode.v19.TSS.txt | awk 'BEGIN {OFS="\t"}{print $1,$3-750,$4+750,"TSS",$5}' > $OUTDIR/promoter_regions.txt.bed
echo "done!"
echo "1. filtering $file with threshold $threshold..."
cat $InputFile | awk "BEGIN {OFS=\"\t\"} {if (\$5 > $threshold) {print \$1,\$2,\$3,\"peak\",\$4,\$10}}" > $prepared_output
# }
cat $prepared_output | awk "BEGIN {OFS=\"\t\"} {print \$1,\$2,\$3}" | sort -k1,1 -k2,2n > $prepcream_output
#CREAM
Rscript --vanilla $SCRIPTPATH/creamtest.R $prepcream_output $cores_output
python3 $SCRIPTPATH/filter_cream.py $cores_output $prepared_output
LC_ALL=C sort -u $fprepared_output -o $fprepared_output
echo "2. looking for promoters..."
cat $prepared_output $OUTDIR/promoter_regions.txt.bed | sort -k1,1 -k2,2n | python3 $SCRIPTPATH/intersect.py TSS > $promoters_output
# output: chrom, start, end, peak, gene
echo "3. looking for enhancers..."
cat $promoters_output | sort -k1,1 -k2,2n -k5,5 | sort -u -k4,4 > $uniq_promoters_output
python3 $SCRIPTPATH/enhancers.py $uniq_promoters_output $prepared_output > $enhancers_output
# output: chrom, start, end, "peak", peak
echo "4. looking for fantom5 enhancers..."
cat $FANTOM5 | tail -n +3 | awk 'BEGIN {OFS="\t"} {print $1,$7-200,$7+200,"genes",$4}' > $OUTDIR/hg19_FANTOM5data.bed
cat $enhancers_output $OUTDIR/hg19_FANTOM5data.bed | sort -k1,1 -k2,2n | python3 $SCRIPTPATH/intersect.py genes > $FANTOM5_enhancers_output
cat $FANTOM5_enhancers_output | sort -k1,1 -k2,2n -k5,5 | sort -u -k4,4 > $uniq_FANTOM5_enhanc_output
echo "done!"
echo "5. looking for TADS enhancers..."
python3 $SCRIPTPATH/enhancers_left.py $FANTOM5_enhancers_output $enhancers_output > $enhancers_left_output
cat $TADS | awk 'BEGIN {OFS="\t"} {print $1,$2,$3,"TADS",$4}' > $combined_enhanc_tads_output
cat $enhancers_left_output $combined_enhanc_tads_output | sort -k1,1 -k2,2n | python3 $SCRIPTPATH/intersect.py TADS > $unfiltered_tads_enhanc_output
python3 $SCRIPTPATH/filter_tads.py $promoters_output $unfiltered_tads_enhanc_output > $TADS_with_repeats_output
python3 $SCRIPTPATH/merge_TADS.py $TADS_with_repeats_output > $TADS_enhancers_output
echo "6. making fasta of promoters and enhancers..."
bedtools getfasta -fi $GENOMEFA -bed $uniq_promoters_output -name -fo $promoters_fasta_output
bedtools getfasta -fi $GENOMEFA -bed $uniq_FANTOM5_enhanc_output -name -fo $FANTOM5_enhanc_fasta_output
bedtools getfasta -fi $GENOMEFA -bed $TADS_enhancers_output -name -fo $TADS_enhanc_fasta_output
echo "done!"
tail -n +2 $HOCOMOCOthresholds > $OUTDIR/HOCOMOCO_thresholds.txt
date
#Motif Analysis
echo "looking for TFBS in promoters..."
python3 $SCRIPTPATH/TFBS_finder.py $OUTDIR/HOCOMOCO_thresholds.txt $PWMDIR $promoters_fasta_output promoters $HOCOMOCOtoTF
echo "looking for TFBS in FANTOM5 enhancers..."
python3 $SCRIPTPATH/TFBS_finder.py $OUTDIR/HOCOMOCO_thresholds.txt $PWMDIR $FANTOM5_enhanc_fasta_output FANTOM5 $HOCOMOCOtoTF
echo "looking for TFBS in TADS enhancers..."
python3 $SCRIPTPATH/TFBS_finder.py $OUTDIR/HOCOMOCO_thresholds.txt $PWMDIR $TADS_enhanc_fasta_output TADS $HOCOMOCOtoTF
echo "7. building the network..."
date
python3 $SCRIPTPATH/crossref.py $promoters_output $FANTOM5_enhancers_output $TADS_enhancers_output $TFBS_output $interact
sort -k1,1 -k2n,2n -k4,4 $summ_output -o $summ_output
if [ "$interact" = "True" ]
then
python3 $SCRIPTPATH/networks.py $summ_output $interact $IMGDIR $Cliques $PPI
else
python3 $SCRIPTPATH/networks.py $summ_output $interact $IMGDIR $Cliques
fi
echo "done!"
for regfile in $OUTDIR/${file}.gene_list.*.txt
do
LC_ALL=C sort -k2,2rg $regfile -o $regfile
done
echo "The $Cliques different Network files $graph_output are complete, you can visualize them with Gephi"
echo "The corresponding regulated genes files ${file}.gene_list.txt are complete"
date