-
Notifications
You must be signed in to change notification settings - Fork 0
/
09c_vsearch_cluster95.sh
59 lines (44 loc) · 2.33 KB
/
09c_vsearch_cluster95.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
#!/bin/bash
#SBATCH -J vsearch
#SBATCH -o out_err_files/vsearch_%A.out
#SBATCH -e out_err_files/vsearch_%A.err
#SBATCH --nodes=1
#SBATCH -t 1-00:00:00
#SBATCH -p short,defq
#SBATCH --mail-type=ALL
#SBATCH [email protected]
module use /GWSPH/groups/cbi/Apps/_modulefiles
module load vsearch/2.14.1
#UC_OUTFILE="vs_us_out_size97_UC"
OUT_TAB_FILE="cluster_hits.tsv"
NSLOTS=$(nproc)
# Variables
TMP_concat_derep=$(mktemp)
TMP_FASTA1=$(mktemp)
TMP_FASTA2=$(mktemp)
#UNSORTED_CENTROIDS=$(mktemp)
FINAL_CENTROIDS="vsearch_size_centroid95.fasta"
#LOW_ABUND_THRES=1
# Dereplicate (vsearch)
vsearch --derep_fulllength DADA2_extracted_samples/concat.fasta --threads $NSLOTS --sizein --sizeout --fasta_width 0 --minuniquesize 1 --output $TMP_concat_derep
# Remove extra semicolon added by VSEARCH
#sed -i -E "s/;barcodelabel=.+;;/;/g" $TMP_concat_derep
sed -i -E "s/;.+;/;/g" $TMP_concat_derep
# Sort by size
vsearch --sortbysize $TMP_concat_derep --sizein --sizeout --output $TMP_FASTA1 --threads $NSLOTS --fasta_width 0
# VSEARCH chimera checking
vsearch --uchime_denovo $TMP_FASTA1 --sizein --sizeout --qmask none --nonchimeras $TMP_FASTA2 --threads $NSLOTS
# VSEARCH clustering
vsearch --cluster_size $TMP_FASTA2 --id 0.95 --sizein --centroids $FINAL_CENTROIDS --qmask none --threads $NSLOTS
# VSEARCH filtering and sorting
#vsearch --sortbysize $UNSORTED_CENTROIDS --minsize $LOW_ABUND_THRES --output $FINAL_CENTROIDS --sizein --threads $NSLOTS --fasta_width 0
# Shouldn't be necessary to remove size annotation in sequence headers, but check for them anyway.
sed -i -E "s/;size=.*//g" $FINAL_CENTROIDS
# Make a tab-separated file containing two columns: hit sequence, cluster centroid sequence. NOT DOING THIS RIGHT NOW. NOT USED
#grep -E "^H" $UC_OUTFILE | awk -F'\t' '{ print $9 "\t" $10 }' | sed -E 's/;size=[0-9]+;//g' >$OUT_TAB_FILE
# Map raw reads against OTU representatives concat.fasta is from LULU_prep.sh, which should be run before this.
vsearch --usearch_global DADA2_extracted_samples/concat.fasta --db $FINAL_CENTROIDS --id 0.95 --maxaccepts 0 --dbmask none --qmask none --uc FINAL_FOR_LULU_95.uc --threads $NSLOTS
# Then convert UC format to OTU table format with uc2otutab.py
python d5_py/uc2otutab.py FINAL_FOR_LULU_95.uc > FINAL_FOR_LULU_95_otutab.txt
# remove tmp files used
rm -f $TMP_concat_derep $TMP_FASTA1 $TMP_FASTA2 $UNSORTED_CENTROIDS