-
Notifications
You must be signed in to change notification settings - Fork 12
/
allhic.go
320 lines (289 loc) · 11.5 KB
/
allhic.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
/**
* Filename: /Users/htang/code/allhic/main.go
* Path: /Users/htang/code/allhic
* Created Date: Wednesday, January 3rd 2018, 11:21:45 am
* Author: htang
*
* Copyright (c) 2018 Haibao Tang
*/
package allhic
import (
"fmt"
"github.com/spf13/cobra"
"path"
"sort"
"strconv"
"strings"
)
// Logo banner (Varsity style):
// http://patorjk.com/software/taag/#p=testall&f=3D-ASCII&t=ALLHIC
var rootCmd = &cobra.Command{
Use: "allhic",
Short: "Genome scaffolding based on Hi-C data",
Long: `
_ _____ _____ ____ ____ _____ ______
/ \ |_ _| |_ _| |_ || _||_ _|.' ___ |
/ _ \ | | | | | |__| | | | / .' \_|
/ ___ \ | | _ | | _ | __ | | | | |
_/ / \ \_ _| |__/ | _| |__/ | _| | | |_ _| |_\ ` + "`" + `.___.'\
|____| |____||________||________||____||____||_____|` + "`" + `.____ .'
`,
Version: Version,
}
// Execute executes the root command.
func Execute() error {
return rootCmd.Execute()
}
// banner prints the separate steps
func banner(message string) {
message = "* " + message + " *"
log.Noticef(strings.Repeat("*", len(message)))
log.Noticef(message)
log.Noticef(strings.Repeat("*", len(message)))
}
// init adds all the sub-commands
func init() {
var RE string
var minLinks int
extractCmd := &cobra.Command{
Use: "extract bamfile fastafile",
Short: "Extract Hi-C link size distribution",
Long: `
Extract function:
Given a bamfile, the goal of the extract step is to calculate an empirical
distribution of Hi-C link size based on intra-contig links. The Extract function
also prepares for the latter steps of ALLHiC.
`,
Args: cobra.ExactArgs(2),
Run: func(cmd *cobra.Command, args []string) {
bamfile := args[0]
fastafile := args[1]
p := Extracter{Bamfile: bamfile, Fastafile: fastafile, RE: RE, MinLinks: minLinks}
p.Run()
},
}
extractCmd.Flags().StringVarP(&RE, "RE", "", DefaultRE, "Restriction site pattern, use comma to separate multiple patterns (N is considered as [ACGT]), e.g. 'GATCGATC,GANTGATC,GANTANTC,GATCANTC'")
extractCmd.Flags().IntVarP(&minLinks, "minLinks", "", MinLinks, "Minimum number of links for contig pair")
allelesCmd := &cobra.Command{
Use: "alleles genome.paf genome.counts_RE.txt",
Short: "Build alleles.table for `prune`",
Long: `
Alleles function:
Given a paf file, we could identify and classify the allelic contigs to be used
for "allhic prune". We recommend the following parameters to build the paf file:
$ minimap2 -DP -k19 -w19 -m200 -t32 genome.fasta genome.fasta > genome.paf
The PAF file contains all self-alignments, which is the basis for classification.
ALLHiC generates "alleles.table", which can then be used for later steps.
`,
Args: cobra.ExactArgs(2),
Run: func(cmd *cobra.Command, args []string) {
pafFile := args[0]
reFile := args[1]
p := Alleler{PafFile: pafFile, ReFile: reFile}
p.Run()
},
}
pruneCmd := &cobra.Command{
Use: "prune alleles.table pairs.txt",
Short: "Prune allelic, cross-allelic and weak links",
Long: `
Prune function:
Given contig pairing, the goal of the prune step is to remove all inter-allelic
links, then it is possible to reconstruct allele-separated assemblies in the following
steps. The alleles.table file contains tab-separated columns containing contigs that
are considered "allelic". For example:
Chr10 18902 tig00030660 tig00003333
Chr10 35071 tig00038687 tig00038686 tig00065419
These lines means that at certain locations in the genome (typically based on synteny
to a closely-related genome), several contigs are considered allelic. The Hi-C links
between these contigs are removed, in addition, other contigs linking to these contigs
simultaneously would only consider one single-best edge. The "pairs.txt" file is the output
of the "extract" command.
Alternatively we can use output from purge-haplotigs as source of alleles.table, the format
looks like:
tig00030660,PRIMARY -> tig00003333,HAPLOTIG
-> tig00038686,HAPLOTIG
`,
Args: cobra.ExactArgs(2),
Run: func(cmd *cobra.Command, args []string) {
allelesFile := args[0]
pairsFile := args[1]
p := Pruner{AllelesFile: allelesFile, PairsFile: pairsFile}
p.Run()
},
}
var minREs, maxLinkDensity, nonInformativeRatio int
partitionCmd := &cobra.Command{
Use: "partition counts_RE.txt pairs.txt k",
Short: "Separate contigs into k groups",
Long: `
Partition function:
Given a target k, number of partitions, the goal of the partitioning is to
separate all the contigs into separate clusters. As with all clustering
algorithm, there is an optimization goal here. The LACHESIS algorithm is
a hierarchical clustering algorithm using average links. The two input files
can be generated with the "extract" sub-command.
`,
Args: cobra.ExactArgs(3),
Run: func(cmd *cobra.Command, args []string) {
contigsfile := args[0]
pairsFile := args[1]
k, _ := strconv.Atoi(args[2])
p := Partitioner{Contigsfile: contigsfile, PairsFile: pairsFile, K: k,
MinREs: minREs, MaxLinkDensity: maxLinkDensity,
NonInformativeRatio: nonInformativeRatio}
p.Run()
},
}
partitionCmd.Flags().IntVarP(&minREs, "minREs", "", MinREs, "Minimum number of RE sites in a contig to be clustered (CLUSTER_MIN_RE_SITES in LACHESIS)")
partitionCmd.Flags().IntVarP(&maxLinkDensity, "maxLinkDensity", "", MaxLinkDensity, "Density threshold before marking contig as repetitive (CLUSTER_MAX_LINK_DENSITY in LACHESIS)")
partitionCmd.Flags().IntVarP(&nonInformativeRatio, "nonInformativeRatio", "", NonInformativeRatio, "cutoff for recovering skipped contigs back into the clusters (CLUSTER_NON-INFORMATIVE_RATIO in LACHESIS)")
var skipGA, resume bool
var seed int64
var npop, ngen int
var mutpb float64
optimizeCmd := &cobra.Command{
Use: "optimize counts_RE.txt clmfile",
Short: "Order-and-orient tigs in a group",
Long: `
Optimize function:
Given a set of Hi-C contacts between contigs, as specified in the
clmfile, reconstruct the highest scoring ordering and orientations
for these contigs. Optimize run on a specific partition in "clusters.txt"
as generated by "partition" sub-command, with the group_number matching the
order appearing in "clusters.txt". Typically, if there are k clusters, we
can start k separate "optimize" commands for parallelism (for example,
on a cluster).
`,
Args: cobra.ExactArgs(2),
Run: func(cmd *cobra.Command, args []string) {
refile := args[0]
clmfile := args[1]
p := Optimizer{REfile: refile, Clmfile: clmfile,
RunGA: !skipGA, Resume: resume,
Seed: seed, NPop: npop, NGen: ngen, MutProb: mutpb}
p.Run()
},
}
optimizeCmd.Flags().BoolVarP(&skipGA, "skipGA", "", false, "Skip GA step")
optimizeCmd.Flags().BoolVarP(&resume, "resume", "", false, "Resume from existing tour file")
optimizeCmd.Flags().Int64VarP(&seed, "seed", "", Seed, "Random seed")
optimizeCmd.Flags().IntVarP(&npop, "npop", "", Npop, "Population size")
optimizeCmd.Flags().IntVarP(&ngen, "ngen", "", Ngen, "Number of generations for convergence")
optimizeCmd.Flags().Float64VarP(&mutpb, "mutapb", "", MutaProb, "Mutation prob in GA")
buildCmd := &cobra.Command{
Use: "build tourfile1 tourfile2 ... contigs.fasta asm.chr.fasta",
Short: "Build genome release",
Long: `
Build function:
Convert the tourfile into the standard AGP file, which is then converted
into a FASTA genome release.
`,
Args: cobra.MinimumNArgs(3),
Run: func(cmd *cobra.Command, args []string) {
tourfiles := make([]string, 0)
for i := 0; i < len(args)-2; i++ {
tourfiles = append(tourfiles, args[i])
}
sort.Strings(tourfiles)
fastafile := args[len(args)-2]
outfastafile := args[len(args)-1]
p := Builder{Tourfiles: tourfiles,
Fastafile: fastafile,
OutFastafile: outfastafile}
p.Run()
},
}
plotCmd := &cobra.Command{
Use: "plot bamfile tourfile",
Short: "Extract matrix of link counts and plot heatmap",
Long: `
Anchor function:
Given a bamfile, we extract matrix of link counts and plot heatmap.
`,
Args: cobra.ExactArgs(2),
Run: func(cmd *cobra.Command, args []string) {
bamfile := args[0]
tourfile := args[1]
p := Plotter{
Anchor: &Anchorer{Bamfile: bamfile, Tourfile: tourfile},
}
p.Run()
},
}
assessCmd := &cobra.Command{
Use: "assess bamfile bedfile chr1",
Short: "Assess the orientations of contigs",
Long: `
Assess function:
Compute the posterior probability of contig orientations after scaffolding
as a quality assessment step.
`,
Args: cobra.ExactArgs(3),
Run: func(cmd *cobra.Command, args []string) {
bamfile := args[0]
bedfile := args[1]
seqid := args[2]
p := Assesser{Bamfile: bamfile, Bedfile: bedfile, Seqid: seqid}
p.Run()
},
}
pipelineCmd := &cobra.Command{
Use: "pipeline bamfile fastafile k",
Short: "Run extract-partition-optimize-build steps sequentially",
Long: `
Pipeline:
A convenience driver function. Chain the following steps sequentially.
- extract
- partion
- optimize
- build
`,
Args: cobra.ExactArgs(3),
Run: func(cmd *cobra.Command, args []string) {
bamfile := args[0]
fastafile := args[1]
k, _ := strconv.Atoi(args[2])
// Extract the contig pairs, count RE sites
banner(fmt.Sprintf("Extractor started (RE = %s)", RE))
extractor := Extracter{Bamfile: bamfile, Fastafile: fastafile, RE: RE}
extractor.Run()
// Partition into k groups
banner(fmt.Sprintf("Partition into %d groups", k))
partitioner := Partitioner{Contigsfile: extractor.OutContigsfile,
PairsFile: extractor.OutPairsfile, K: k}
partitioner.Run()
// Optimize the k groups separately
tourfiles := make([]string, 0)
for i, refile := range partitioner.OutREfiles {
banner(fmt.Sprintf("Optimize group %d", i))
optimizer := Optimizer{REfile: refile,
Clmfile: extractor.OutClmfile,
RunGA: !skipGA, Resume: resume,
Seed: seed, NPop: npop, NGen: ngen, MutProb: mutpb}
optimizer.Run()
tourfiles = append(tourfiles, optimizer.OutTourFile)
}
// Run the final build
banner("Build started (AGP and FASTA)")
outfastafile := path.Join(path.Dir(tourfiles[0]),
fmt.Sprintf("asm-g%d.chr.fasta", k))
builder := Builder{Tourfiles: tourfiles,
Fastafile: fastafile,
OutFastafile: outfastafile}
builder.Run()
},
}
pipelineCmd.Flags().StringVarP(&RE, "RE", "", DefaultRE, "Restriction site pattern, use comma to separate multiple patterns (N is considered as [ACGT]), e.g. 'GATCGATC,GANTGATC,GANTANTC,GATCANTC'")
pipelineCmd.Flags().IntVarP(&minLinks, "minLinks", "", MinLinks, "Minimum number of links for contig pair")
pipelineCmd.Flags().IntVarP(&minREs, "minREs", "", MinREs, "Minimum number of RE sites in a contig to be clustered (CLUSTER_MIN_RE_SITES in LACHESIS)")
pipelineCmd.Flags().IntVarP(&maxLinkDensity, "maxLinkDensity", "", MaxLinkDensity, "Density threshold before marking contig as repetive (CLUSTER_MAX_LINK_DENSITY in LACHESIS)")
pipelineCmd.Flags().IntVarP(&nonInformativeRatio, "nonInformativeRatio", "", NonInformativeRatio, "cutoff for recovering skipped contigs back into the clusters (CLUSTER_NON-INFORMATIVE_RATIO in LACHESIS)")
pipelineCmd.Flags().BoolVarP(&skipGA, "skipGA", "", false, "Skip GA step")
pipelineCmd.Flags().BoolVarP(&resume, "resume", "", false, "Resume from existing tour file")
pipelineCmd.Flags().Int64VarP(&seed, "seed", "", Seed, "Random seed")
pipelineCmd.Flags().IntVarP(&npop, "npop", "", Npop, "Population size")
pipelineCmd.Flags().IntVarP(&ngen, "ngen", "", Ngen, "Number of generations for convergence")
pipelineCmd.Flags().Float64VarP(&mutpb, "mutapb", "", MutaProb, "Mutation prob in GA")
rootCmd.AddCommand(extractCmd, allelesCmd, pruneCmd, partitionCmd, optimizeCmd, buildCmd, plotCmd, assessCmd, pipelineCmd)
}