-
Notifications
You must be signed in to change notification settings - Fork 18
/
miniprot.h
290 lines (254 loc) · 6.75 KB
/
miniprot.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
#ifndef MINIPROT_H
#define MINIPROT_H
#include <stdint.h>
#define MP_VERSION "0.13-r262-dirty"
#define MP_F_NO_SPLICE 0x1
#define MP_F_NO_ALIGN 0x2
#define MP_F_SHOW_UNMAP 0x4
#define MP_F_GFF 0x8
#define MP_F_NO_PAF 0x10
#define MP_F_GTF 0x20
#define MP_F_NO_PRE_CHAIN 0x40
#define MP_F_SHOW_RESIDUE 0x80
#define MP_F_SHOW_TRANS 0x100
#define MP_F_NO_CS 0x200
#define MP_FEAT_CDS 0
#define MP_FEAT_STOP 1
#define MP_BITS_PER_AA 4
#define MP_BLOCK_BONUS 2
#define MP_CODON_STD 1
#define MP_IDX_MAGIC "MPI\3"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct { uint64_t x, y; } mp128_t;
typedef struct { int32_t n, m; mp128_t *a; } mp128_v;
typedef struct { int32_t n, m; uint64_t *a; } mp64_v;
typedef struct {
int32_t bbit; // block bit
int32_t min_aa_len; // ignore ORFs shorter than this
int32_t kmer, mod_bit;
uint32_t trans_code;
} mp_idxopt_t;
typedef struct {
uint32_t flag;
int64_t mini_batch_size;
int32_t max_occ;
int32_t max_gap; // max gap on the query protein, in aa
int32_t max_intron;
int32_t min_max_intron, max_max_intron;
int32_t bw; // bandwidth, in aa
int32_t max_ext;
int32_t max_ava;
int32_t min_chn_cnt;
int32_t max_chn_max_skip;
int32_t max_chn_iter;
int32_t min_chn_sc;
float chn_coef_log;
float mask_level;
int32_t mask_len;
float pri_ratio;
float out_sim, out_cov;
int32_t best_n, out_n;
int32_t kmer2;
int32_t go, ge, io, fs; // gap open, extension, intron open, and frame-shift/stop-codon
int32_t io_end;
float ie_coef;
int32_t sp_model;
int32_t sp_null_bonus;
float sp_scale;
int32_t xdrop;
int32_t end_bonus;
int32_t asize; // size of the alphabet; always 22 in current implementation
int32_t gff_delim;
int32_t max_intron_flank;
const char *gff_prefix;
int8_t mat[484];
} mp_mapopt_t;
typedef struct {
uint32_t n, m;
uint64_t *a; // pos<<56 | score<<1 | acceptor
} mp_spsc_t;
typedef struct {
int64_t off, len;
char *name;
} mp_ctg_t;
typedef struct {
int32_t n_ctg, m_ctg;
int32_t l_name;
int64_t l_seq, m_seq;
uint8_t *seq; // TODO: separate this into multiple blocks; low priority
mp_ctg_t *ctg;
char *name;
void *h; // hash table to map sequence name to cid
mp_spsc_t *spsc; // of size n_ctg*2
} mp_ntdb_t;
typedef struct {
mp_idxopt_t opt;
uint32_t n_block;
mp_ntdb_t *nt;
int64_t n_kb, *ki;
uint32_t *bo, *kb;
} mp_idx_t;
typedef struct {
int32_t dp_score, dp_max, dp_max2;
int32_t n_cigar, m_cigar;
int32_t blen; // CDS length in alignment
int32_t n_fs; // number of frameshift events
int32_t n_stop; // number of in-frame stop codons
int32_t dist_stop; // distance in bp to the closest stop codon
int32_t dist_start; // distance in bp the the closest 'M'
int32_t n_iden, n_plus;
uint32_t cigar[];
} mp_extra_t;
typedef struct {
int64_t vs, ve;
int32_t qs, qe;
int16_t type, phase;
int32_t n_fs, n_stop;
int32_t score, n_iden, blen;
char donor[2], acceptor[2];
} mp_feat_t;
typedef struct {
int32_t off, cnt;
int32_t id, parent;
int32_t n_sub, subsc;
int32_t n_feat, m_feat, n_exon;
int32_t chn_sc;
int32_t chn_sc_ungap;
uint32_t hash;
uint32_t vid; // cid<<1 | rev
int32_t qs, qe;
int64_t vs, ve;
uint64_t *a;
mp_feat_t *feat;
mp_extra_t *p;
} mp_reg1_t;
struct mp_tbuf_s;
typedef struct mp_tbuf_s mp_tbuf_t;
extern int32_t mp_verbose, mp_dbg_flag;
extern char *ns_tab_nt_i2c, *ns_tab_aa_i2c;
extern uint8_t ns_tab_a2r[22], ns_tab_nt4[256], ns_tab_aa20[256], ns_tab_aa13[256];
extern uint8_t ns_tab_codon[64], ns_tab_codon13[64];
/**
* Set the translation table and builtin timer
*
* Run this before all other miniprot routines.
*/
void mp_start(void);
/**
* Initialize the default parameters for indexing
*
* @param io struct to initialize (out)
*/
void mp_idxopt_init(mp_idxopt_t *io);
/**
* Initialize the default parameters fr mapping
*
* @param mo struct to initialize (out)
*/
void mp_mapopt_init(mp_mapopt_t *mo);
/**
* Set frameshift and stop codon penalty
*
* This function changes the mp_mapopt_t::fs field and mp_mapopt_t::mat[]
*
* @param mo mapping options (in/out)
* @param fs frameshift and stop codon penalty (positive)
*/
void mp_mapopt_set_fs(mp_mapopt_t *mo, int32_t fs);
/**
* Set max intron length based on the genome size
*
* @param mo mapping options (in/out)
* @param gsize genome size
*/
void mp_mapopt_set_max_intron(mp_mapopt_t *mo, int64_t gsize);
/**
* Check the integrity of mapping parameters (not complete)
*
* @param mo mapping options
*
* @return 0 on success, or negative on failure
*/
int32_t mp_mapopt_check(const mp_mapopt_t *mo);
/**
* Load or build an index
*
* If _fn_ corresponds to a FASTA file, build the index using _io_ and
* _n_threads_. If _fn_ corresponds to a prebuilt index file, load the index
* and ignore _io_ and _n_threads_.
*
* @param fn FASTA or index file name
* @param io indexing options
* @param n_threads number of threads
*
* @return the index
*/
mp_idx_t *mp_idx_load(const char *fn, const mp_idxopt_t *io, int32_t n_threads);
/**
* Deallocate the index
*
* @param mi the index
*/
void mp_idx_destroy(mp_idx_t *mi);
/**
* Write an index to disk
*
* @param fn index file name
* @param mi the index
*
* @return 0 on success or negative on failure
*/
int mp_idx_dump(const char *fn, const mp_idx_t *mi);
/**
* Read an index file
*
* @param fn index file name
*
* @return the index, or NULL on failure
*/
mp_idx_t *mp_idx_restore(const char *fn);
/**
* Load a splice score file
*
* @param nt sequences in mp_idx_t::nt (in/out)
* @param fn splice score file name; can be optionally gzip'd
* @param max_sc cap splice scores at this parameter
*
* @return 0 on success, or negative on failure
*/
int32_t mp_ntseq_read_spsc(mp_ntdb_t *nt, const char *fn, int32_t max_sc);
/**
* Align a protein sequence against the index
*
* @param mi the index
* @param qlen protein length
* @param seq protein sequence
* @param n_reg number of alignments (out)
* @param b thread-local buffer; threads shouldn't share buffers (can be NULL)
* @param opt mapping parameters
* @param qname query name (used for breaking ties)
*
* @return list of alignments
*/
mp_reg1_t *mp_map(const mp_idx_t *mi, int qlen, const char *seq, int *n_reg, mp_tbuf_t *b, const mp_mapopt_t *opt, const char *qname);
/**
* Initialize a thread-local buffer
*
* @return the buffer
*/
mp_tbuf_t *mp_tbuf_init(void);
/**
* Deallocate a thread-local buffer
*
* @param b the buffer
*/
void mp_tbuf_destroy(mp_tbuf_t *b);
// ignore the following for now
void mp_idx_print_stat(const mp_idx_t *mi, int32_t max_occ);
int32_t mp_map_file(const mp_idx_t *idx, const char *fn, const mp_mapopt_t *opt, int n_threads);
#ifdef __cplusplus
}
#endif
#endif