forked from leylabmpi/SynTracker
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfind_overlapping_regions.sh
executable file
·132 lines (97 loc) · 5.33 KB
/
find_overlapping_regions.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
#!/bin/bash
# Title
#create variables for the flags:
length="${l:=2000}"
minimal_coverage="${c:=70}"
minimal_identity="${i:=97}"
while getopts t:r:o:c:i:l: flag
do
case "${flag}" in
t) target=${OPTARG};; #metagenome assemlies/genomes - used to create the blast DB
r) references=${OPTARG};; # folder with reference genomes
o) output=${OPTARG};; #output and temp folders will be written to this path
i) minimal_identity=${OPTARG};; #minimal blast identity
c) minimal_coverage=${OPTARG};; #blast qcov_hsp_perc
l) length=${OPTARG};; #length of flanking sequences (on the retrieved traget sequence)
esac
done
output=$(readlink -f "$output") #get the full path of the output folder
# step 0: run the python script first, to find the "central_regions"
rm -rf "$output"
mkdir "$output"
mkdir "$output/tmp"
central_regions="$output/central_regions/"
python central_regions.py $references $central_regions
# input file naming convention (per-sample assembly files/genome files):
# sample name should be the first part of the file name, followed by "_" and added information, if wanted
# The sample name should not include any special characters!!!
# The file name should not include "/" or ".", other special characters are accepted.
# step 1: add sample names to the fasta headers in each genome/assembly fasta files
echo "starting merging metagenomes/genome files"
rm -f "$output/tmp/combined_genomes.fasta"
for infile in $target/*; do
insert="$(basename "${infile%.*}")"
sed "s/>/>${insert}_/" < "$infile" >> "$output/tmp/combined_genomes_long.fasta" ;
done;
# Make another file, with shorter headers (in spades format), to avoid downstream blastcmddb errors (header length is limited)
sed "s/_saliva__contig_/_/" "$output/tmp/combined_genomes_long.fasta" >> "$output/tmp/combined_genomes.fasta" ;
sed -i "s/saliva-//gi" "$output/tmp/combined_genomes.fasta" ;
#sed "s/cov/ /" "$output/tmp/combined_genomes_long.fasta" >> "$output/tmp/combined_genomes.fasta" ;
echo " Merging metagenome/genome files is complete"
# step 2: Make a blast database, from all the contigs.
# Rerunning this step overrides the previous blast DB - pay attention!
newdir="$output/blastDB"
echo "newdir: $newdir"
rm -rf "$newdir"
mkdir "$newdir"
makeblastdb -in "$output/tmp/combined_genomes.fasta" -out "$newdir/GroupsDB" -title "GroupsDB" -parse_seqids -dbtype nucl
echo "Making blast DB is complete"
# Step 3: #run blast per genome => per region
#a. make output folder
#b. for each genome, make an output folder
#c. for each region run blast
newdir1="$output/blast_output"
rm -rf "$newdir1"
mkdir "$newdir1"
#run blast
#cd "$newdir"
for folder in "$central_regions"/*; do
#echo "$folder"
#echo "$newdir1/"$(basename "${folder%.*}")""
rm -rf "$newdir1/"$(basename "${folder%.*}")""
mkdir "$newdir1/"$(basename "${folder%.*}")""
for query in "$folder"/*.fasta; do
blastn -query "$query" -db "$newdir/GroupsDB" -out "$newdir1/"$(basename "${folder%.*}")"/$(basename "${query%.*}").tab" -outfmt "6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore sstrand" -perc_identity "$minimal_identity" -qcov_hsp_perc "$minimal_coverage" -num_threads 8;
#echo "blastn -query "$query" -db "$newdir/GroupsDB" -out "$newdir1/"$(basename "${folder%.*}")"/$(basename "${query%.*}").tab" -outfmt "6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore sstrand" -perc_identity "$minimal_identity" -qcov_hsp_perc "$minimal_coverage" -num_threads 8;";
done
done
echo "BLAST search is complete"
# Step 4: delete blast output files with less than 2 hits
for folder in "$newdir1/"*; do
for seqfolder in "$folder"/*; do
#echo "$seqfolder"
find "$seqfolder" -type f -exec awk -v x=2 'NR==x{exit 1}' {} \; -exec rm -f {} \;
done
done
echo "Deleting low-hit regions is complete"
# Step 5: Use the blast output file to:
# a. check the orientation of the hit (minus or plus strand). If minus, change the order of start and end of hit.
# b. take the 2/9/10/13 columns of the output (query id/start/end/strand). add/remove x base pairs on each side of the hit (columns 9-10).
# c. pipe to xargs, execute "blastdbcmd", to fetch the sequence in each hit, with x basepairs on each side.
# d. write to file to be analyzed in R synteny pipeline
set --
newdir2="$output/blastcmddb_output"
rm -rf "$newdir2"
mkdir "$newdir2"
export blastDB="$newdir/GroupsDB"
#export length=$length
for folder in "$newdir1"/*; do
LogFile="$newdir2"/$(basename "${folder%.*}").log
echo "$LogFile"
mkdir "$newdir2/$(basename "${folder%.*}")"
for tabular in "$folder"/*.tab; do
# awk -v "len=$length" -v "outlog=$LogFile" '{if ($13 == "minus") print $2"\t"$10-len"\t"$9+len"\t"$13"\t"; else print $2"\t"$9-len"\t"$10+len"\t"$13}' "$tabular" | xargs -n 4 sh -c 'blastdbcmd -db "$blastDB" -entry "$0" -range "$1"-"$2" -strand "$3" -outfmt %f -logfile outlog;' > "$newdir2"/$(basename "${folder%.*}")/$(basename "${tabular%.*}").fasta;
awk -v "len=$length" -v "outlog=$LogFile" '{if ($13 == "minus") print $2"\t"$10-len"\t"$9+len"\t"$13"\t"; else print $2"\t"$9-len"\t"$10+len"\t"$13}' "$tabular" | xargs -n 4 sh -c 'blastdbcmd -db "$blastDB" -entry "$0" -range "$1"-"$2" -strand "$3" -outfmt %f -logfile outlog;' > "$newdir2"/$(basename "${folder%.*}")/$(basename "${tabular%.*}").fasta;
done
done
echo "Retrieval of flanking regions is complete"