-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy path6-1_positive_selection.sh
85 lines (66 loc) · 3.22 KB
/
6-1_positive_selection.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
#!/bin/bash
# find all available protein sequences in ./data/evo/00proseqs
# and align them using MUSCLE
# store the aligned sequences to ./data/evo/01proseqs_align
# link muscle command to local executable
muscle=/home/ceballos/muscle-linux-x86.v5.2
# create folder ./data/evo/01proseqs_align if not already created
mkdir -p ./data/evo/01proseqs_align
# find all protein sequences in ./data/evo/00proseqs and find what the gene name is
# find all .fasta files in ./data/evo/00proseqs
for file in ./data/evo/00proseqs/*.fasta
do
fn=$(basename $file .fasta)
printf "$fn\n"
gene=$(echo $fn | cut -d'_' -f1)
printf "Aligning $gene\n"
$muscle -align ./data/evo/00proseqs/${gene}_proseq.fasta -output ./data/evo/01proseqs_align/${gene}_proseq_outv2.fasta
printf "Done aligning $gene\n"
done
# reorder the sequences in ./data/evo/01proseqs_align and put them in ./data/evo/02proseqs_ordered
conda activate nnt
python 6-1-1_reorder_sequences.py
# create folder ./data/evo/03codalign if not already created
mkdir -p ./data/evo/03codalign
# activate conda environment with pal2nal
conda activate evo_test
# find all available protein sequences in 02proseqs and translate them to codons
for file in ./data/evo/02proseqs_ordered/*.fasta
do
fn=$(basename $file .fasta)
printf "$fn\n"
gene=$(echo $fn | cut -d'_' -f1)
printf "Translating $gene\n"
pal2nal.pl ./data/evo/02proseqs_ordered/${gene}_proseq_ordered.fasta ./data/evo/00nucseqs/${gene}_nucseq.fasta -output fasta > ./data/evo/03codalign/${gene}_codalign.fa
printf "Done translating $gene\n"
done
# fill in missing sequences in ./data/evo/03codalign
conda activate nnt
python 6-1-2_fill_missing_sequences.py
# order the sequences in ./data/evo/04codalign_filled and save in ./data/evo/05codalign_taxnames
source_directory="./data/evo/04codalign_filled"
destination_directory="./data/evo/05codalign_taxnames"
# create the destination directory if it doesn't exist
mkdir -p "$destination_directory"
# define the patterns for the species you want to keep in the header
species_patterns="hsapiens|ptroglodytes|mmulatta|mmusculus|btaurus|dnovemcinctus|sharrisii|oanatinus|ggallus|xtropicalis|drerio|ccarcharias|pmarinus"
# loop through each oxt_codalign.fasta file in the source directory
for file in "$source_directory"/*.fa; do
# get the filename without the directory path
filename=$(basename "$file")
# process the file and save it in the destination directory
awk -v species="$species_patterns" '/^>/{ match($0, species); if (RSTART) print ">" substr($0, RSTART, RLENGTH); next }1' "$file" > "$destination_directory/$filename"
echo "Processed $file and saved to $destination_directory/$filename"
done
# use Hyphy to model positive selection across a self-defined phylogenetic tree
conda activate evo_test
# create folder 06absrel_results if not already created
mkdir -p ./data/evo/06absrel_results
# find all .fa files in ./data/evo/05codalign_taxnames
for file in ./data/evo/05codalign_taxnames/*.fa
do
fn=$(basename $file .fasta)
printf "$fn\n"
gene=$(echo $fn | cut -d'_' -f1)
hyphy absrel --alignment ./data/evo/05codalign_taxnames/${gene}_codalign.fa --tree ./data/evo/timetree_bi.nwk --output ./data/evo/06absrel_results/${gene}_ABSREL.json
done