forked from adamnovak/sequence-graphs
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtestFMD.sh
executable file
·88 lines (65 loc) · 2.09 KB
/
testFMD.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
#!/usr/bin/env bash
# testFMD.sh: test building FMD indexes with different parameters and evaluate
# space usage.
#
# Everything is downloaded and generated inside ./testFMD, and the indexer is
# invoked from there as ../createIndex/createIndex, so launch this script from
# the directory that contains createIndex/.

# Strict mode: abort on any failing command (-e), on use of an unset variable
# (-u), and when any stage of a pipeline fails (pipefail), so a broken step
# cannot silently feed bad data into the size measurements.
set -euo pipefail

# Make a directory to work in
mkdir -p testFMD
cd testFMD
# Get hg38. The presence of the hg38 directory is the "already downloaded"
# marker, so it must only ever appear once it is completely populated.
if [ ! -d hg38 ]
then
    # Stage the extraction in a scratch directory; a failure part-way through
    # then leaves no hg38 directory behind and the next run starts over.
    rm -rf hg38.partial
    mkdir hg38.partial

    # -O pins the output name: without it, a leftover (possibly truncated)
    # tarball from an earlier run would make wget save the new copy as
    # hg38.chromFa.tar.gz.1 while tar below still read the stale file.
    wget --progress=dot:giga -O hg38.chromFa.tar.gz http://hgdownload-test.cse.ucsc.edu/goldenPath/hg38/bigZips/hg38.chromFa.tar.gz
    tar -C hg38.partial -xvzf hg38.chromFa.tar.gz

    # The archive unpacks into a chroms/ subdirectory; flatten it so the
    # FASTA files sit directly in the genome directory.
    mv hg38.partial/chroms/* hg38.partial/
    rmdir hg38.partial/chroms

    # Publish the finished directory in one step.
    mv hg38.partial hg38
fi
# Get HuRef by FTP globbing. The presence of the huRef directory is the
# "already downloaded" marker, so it is only created once every chromosome
# has been fetched and decompressed.
if [ ! -d huRef ]
then
    # Stage the download in a scratch directory so that an interrupted run
    # does not leave a half-filled huRef behind that later runs would trust.
    rm -rf huRef.partial
    mkdir huRef.partial

    # Work in a subshell so the directory change cannot leak out, even if a
    # command fails.
    (
        cd huRef.partial

        # Make all the chromosome names. See
        # <http://stackoverflow.com/a/8789815/402891> for 0-padding.
        for CHROM in $(seq -f "%02g" 1 22) MT Un X Y
        do
            # Quoted so the shell passes the * through untouched; wget does
            # the wildcard matching against the FTP listing itself.
            wget --progress=dot:giga "ftp://ftp.ncbi.nih.gov/genomes/Homo_sapiens/CHR_${CHROM}/hs_alt_HuRef_*.fa.gz"
        done

        gunzip -- *.gz
    )

    # Publish the finished directory in one step.
    mv huRef.partial huRef
fi
# Watson is too hard to find, so download the official Korean genome.
# The presence of the koRef directory is the "already downloaded" marker, so
# it is only created once the download and decompression have succeeded.
if [ ! -d koRef ]
then
    # Stage the download in a scratch directory so that an interrupted run
    # does not leave a half-filled koRef behind that later runs would trust.
    rm -rf koRef.partial
    mkdir koRef.partial

    # Work in a subshell so the directory change cannot leak out, even if a
    # command fails.
    (
        cd koRef.partial

        # Quoted so the shell passes the * through untouched; wget does the
        # wildcard matching against the FTP listing itself.
        wget --progress=dot:giga "ftp://ftp.kobic.re.kr/pub/KOBIC-KoreanGenome/fasta/*.fa.gz"

        gunzip -- *.gz
    )

    # Publish the finished directory in one step.
    mv koRef.partial koRef
fi
# Glob patterns for the FASTA files of each genome, one entry per genome.
# They are kept quoted here and only expanded once a subset has been chosen.
FASTAS=("hg38/*.fa" "huRef/*.fa" "koRef/*.fa")

# Header for the grepable TSV lines printed for each run below.
printf 'RESULTS\tNUM_GENOMES\tBWT_BYTES\tSSA_BYTES\n'

for NUM_GENOMES in 1
do
    # Pull out that many of the genomes. The expansion is deliberately left
    # unquoted so each selected pattern globs into its list of files; the
    # result is kept as a real array rather than a space-joined string.
    # shellcheck disable=SC2206
    SELECTED_FASTAS=(${FASTAS[@]:0:${NUM_GENOMES}})

    echo "Genomes: ${NUM_GENOMES}"

    # The concatenated input is cached between runs under this name.
    GENOME_FASTA="${NUM_GENOMES}genomes.fa"

    if [ ! -e "${GENOME_FASTA}" ]
    then
        # Build under a temporary name and rename only on success, so a
        # failed or interrupted cat never leaves a truncated file that a
        # later run would mistake for the finished cache. These are two
        # separate statements on purpose: joined with &&, a failing cat
        # would not trigger the script's exit-on-error behaviour.
        cat "${SELECTED_FASTAS[@]}" > "${GENOME_FASTA}.partial"
        mv "${GENOME_FASTA}.partial" "${GENOME_FASTA}"
    fi

    echo "Indexing ${GENOME_FASTA}"
    time ../createIndex/createIndex "${GENOME_FASTA}-index" "${GENOME_FASTA}" --quiet --context 20

    # Check the sizes of the index.
    BWT_BYTES=$(stat -c%s "${GENOME_FASTA}-index/index.basename.bwt")
    SSA_BYTES=$(stat -c%s "${GENOME_FASTA}-index/index.basename.ssa")

    # Dump a grepable TSV line. Values are passed as arguments, not spliced
    # into the format string, so they can never be read as format directives.
    printf 'RESULTS\t%s\t%s\t%s\n' "${NUM_GENOMES}" "${BWT_BYTES}" "${SSA_BYTES}"

    if [ -e gmon.out ]
    then
        # Make the profiling report. gmon.out is the profile data file gprof
        # reads; presumably it only exists when createIndex was built with
        # profiling enabled — TODO confirm against the createIndex build.
        gprof ../createIndex/createIndex gmon.out > "${GENOME_FASTA}.gprof.txt"
    fi
done