-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscywalker_makerefdir
executable file
·241 lines (216 loc) · 6.38 KB
/
scywalker_makerefdir
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
#!/bin/sh
# the next line restarts using wish \
exec tclsh "$0" ${1+"$@"}
#
# Copyright (c) by Peter De Rijk (VIB - University of Antwerp)
# See the file "license.txt" for information on usage and redistribution of
# this file, and for a DISCLAIMER OF ALL WARRANTIES.
#
# see where we are, add the lib dir in the original location to auto_path, load extension
set script [file join [pwd] [info script]]
set scriptname [file tail $script]
while 1 {
if {[catch {set script [file join [file dir $script] [file readlink $script]]}]} break
}
set appdir [file dir $script]
if {[lrange [file split $script] end-2 end] eq {apps cg cg.tcl}} {
set appbasedir [file dir [file dir $appdir]]
set auto_path [list $appbasedir/lib $appbasedir/lib/tcl8.5 $appbasedir/lib/tk8.5]
}
#lappend auto_path $appdir/lib
#source $appdir/lib/file.tcl
package require Extral
proc pathsep {} {
if {$::tcl_platform(platform) eq "windows"} {return \;} else {return \:}
}
proc scywalkerenv {} {
global auto_path env appdir tcl_dirtcl scywalkerdir externdir
if {![info exists appdir]} {
set appdir ${scywalkerdir}
}
if {[file dir [file dir $appdir]] eq [get tcl_dirtcl ""]} {
# we are being run from a dirtcl installation in apps/scywalker
set scywalkerdir $tcl_dirtcl
set externdir $scywalkerdir/bin
set bindir $appdir/bin
set extradir $scywalkerdir/extra
set cgdir $scywalkerdir
} else {
# we are being run from dev
set scywalkerdir $appdir
set externdir $scywalkerdir/extern
set bindir $scywalkerdir/bin
set extradir $scywalkerdir/extra
set cgdir [file dir $scywalkerdir]/genomecomb
}
# Setting LANG to "C" tells all unix tools (e.g. sort) to consider only basic ASCII characters and disable UTF-8 multibyte match
# This can improve performance substantially
set env(LANG) C
set env(LC_ALL) C
# add to paths
set env(PATH) $cgdir[pathsep]$bindir[pathsep]$externdir[pathsep]$scywalkerdir[pathsep]$extradir[pathsep]$env(PATH)
if {[info exists env(LD_LIBRARY_PATH)]} {
set env(LD_LIBRARY_PATH) $::externdir/lib:$env(LD_LIBRARY_PATH)
} else {
set env(LD_LIBRARY_PATH) $::externdir/lib
}
lappend auto_path $scywalkerdir/apps/cg/lib
return $scywalkerdir
}
array set ::gzexts {
.zst zst
.rz razip
.lz4 lz4
.gz gzip
.bgz bgzip
.bz2 bzip2
}
proc isgzext ext {
info exists ::gzexts($ext)
}
proc gzroot filename {
if {[isgzext [file extension $filename]]} {
return [file root $filename]
} else {
return $filename
}
}
proc formaterror {} {
puts stderr "scywalker_makerefdir ?options? refdir genomesequence.fasta transcripts.gtf"
puts stderr " with options: -organelles, -nolowgenecutoff"
exit 1
}
scywalkerenv
# generic help on no args
if {[llength $argv] < 1} {
formaterror
}
proc scywalker_makerefdir {args} {
set organelles {}
set transcripts {}
set ontindex 1
set pacbioindex 0
set nolowgenecutoff 200000
set groupchromosomes {}
set pos 0
foreach {key value} $args {
switch $key {
-organelles {
set organelles $value
incr pos 2
}
-nolowgenecutoff {
set nolowgenecutoff $value
incr pos 2
}
-groupchromosomes {
set groupchromosomes $value
incr pos 2
}
-ontindex {
set ontindex $value
incr pos 2
}
-pacbioindex {
set pacbioindex $value
incr pos 2
}
default break
}
}
set args [lrange $args $pos end]
if {[llength $args] > 3} {
if {[string index [lindex $args 0] 0] == "-"} {
puts stderr "error calling scywalker_makerefdir: unknown option "[lindex $args 0]", must be one of: -organelles, -transcripts"
} else {
formaterror
}
exit 1
}
foreach {refdir genomefasta transcripts} $args break
if {[file exists $refdir]} {
puts stderr "error: target refdir $refdir already exists"
exit 1
}
set refdir [file normalize $refdir]
set tail [file tail $refdir]
set build $tail
file mkdir $refdir
file mkdir $refdir/extra
# make ifas
set result $refdir/genome_$tail.ifas
puts "Making genome $result"
exec cg fas2ifas $genomefasta $result
exec samtools faidx $result
# groupchromosomes
if {$groupchromosomes ne ""} {
puts "Making genome $result.groupchromosomes"
groupchromosomes $result $groupchromosomes
}
# fullgenome
unset -nocomplain a
set rfile $refdir/extra/reg_${build}_fullgenome.tsv
puts "Making $rfile"
set data [file_read $result.fai]
set o [open $rfile.temp w]
puts $o chromosome\tbegin\tend
list_foreach {chromosome len} [lrange [split [string trim $data] \n] 0 end] {
set a($chromosome) 1
puts $o $chromosome\t0\t$len
}
close $o
file rename -force -- $rfile.temp $rfile
if {$organelles ne ""} {
puts "Writing $refdir/extra/reg_${build}_organelles.tsv"
foreach organelle $organelles {
if {![info exists a($organelle)]} {
puts stderr "error making organelles file: $organelle is not a chromosome in the given genome"
exit 1
}
}
file_write $refdir/extra/reg_${build}_organelles.tsv chromosome\n[join $organelles \n]\n
}
# for cram
exec cg fasta2cramref $result $result.forcram
# sequencedgenome
set target $refdir/extra/reg_${build}_sequencedgenome.tsv.zst
puts "Making $target"
exec cg calcsequencedgenome --stack 1 $result | cg zst > $target.temp
file rename -force -- $target.temp $target
if {$ontindex} {
# minimap2_splice index
puts "Making minimap2 ont index (can take large amount of memory)"
exec cg refseq_minimap2 $result splice
}
if {$pacbioindex} {
# minimap2_splice index
puts "Making minimap2 pacbio index (can take large amount of memory)"
exec cg refseq_minimap2 $result:hq splice
}
set target $refdir/genome_${build}.dict
exec samtools dict -o $target.temp $result
file rename $target.temp $target
# converting transcripts
set root [file root [file tail $transcripts]]
regsub ^gene_ $root {} root
set target $refdir/gene_${build}_$root.tsv
puts "Making $target"
set ext [file extension [gzroot $transcripts]]
if {$ext eq ".gtf"} {
file copy -force $transcripts [file root $target].gtf
exec cg gtf2tsv $transcripts $target
} elseif {$ext in ".gff .ggf2 .gff3"} {
file copy $transcripts [file root $target]$ext
exec cg gff2tsv $transcripts $target
} elseif {$ext eq ".tsv"} {
file copy $transcripts $target
} else {
puts stderr "format of transcripts file $transcripts not supported: must be one of: gtf, tsv, gff"
exit 1
}
set target $refdir/extra/reg_${build}_nolowgene200k.tsv.zst
puts "Making $target"
exec cg distrreg_nolowgene $refdir $nolowgenecutoff
puts "Made $refdir (can now be used as refdir for scywalker)"
}
scywalker_makerefdir {*}$argv