Skip to content

Commit fb81e15

Browse files
committed
r1272: support pass-1 junction processing
NOT tested yet
1 parent d930ea9 commit fb81e15

File tree

4 files changed

+124
-58
lines changed

4 files changed

+124
-58
lines changed

index.c

Lines changed: 99 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -669,20 +669,17 @@ int mm_idx_alt_read(mm_idx_t *mi, const char *fn)
669669
return n_alt;
670670
}
671671

672-
/*******************
673-
* Known junctions *
674-
*******************/
672+
/***************
673+
* BED reading *
674+
***************/
675675

676676
#define sort_key_bed(a) ((a).st)
677677
KRADIX_SORT_INIT(bed, mm_idx_intv1_t, sort_key_bed, 4)
678678

679679
#define sort_key_end(a) ((a).en)
680680
KRADIX_SORT_INIT(end, mm_idx_intv1_t, sort_key_end, 4)
681681

682-
#define sort_key_jj(a) ((a).off)
683-
KRADIX_SORT_INIT(jj, mm_idx_jjump1_t, sort_key_jj, 4)
684-
685-
static mm_idx_intv_t *mm_idx_bed_read_core(const mm_idx_t *mi, const char *fn, int read_junc, int is_pass1)
682+
static mm_idx_intv_t *mm_idx_bed_read_core(const mm_idx_t *mi, const char *fn, int read_junc, int min_sc)
686683
{
687684
gzFile fp;
688685
kstream_t *ks;
@@ -729,7 +726,7 @@ static mm_idx_intv_t *mm_idx_bed_read_core(const mm_idx_t *mi, const char *fn, i
729726
}
730727
}
731728
if (id < 0 || t.st < 0 || t.st >= t.en) continue; // contig ID not found, or other problems
732-
if (is_pass1 && t.score < 5) continue; // for pass-1 BED, ignore junctions with weak signals; NB: paired with pass-1!
729+
if (min_sc > 0 && t.score < min_sc) continue;
733730
r = &I[id];
734731
if (i >= 11 && read_junc) { // BED12
735732
int32_t st, sz, en;
@@ -762,12 +759,12 @@ static mm_idx_intv_t *mm_idx_bed_read_core(const mm_idx_t *mi, const char *fn, i
762759
return I;
763760
}
764761

765-
static mm_idx_intv_t *mm_idx_bed_read_merged(const mm_idx_t *mi, const char *fn, int read_junc, int is_pass1)
762+
static mm_idx_intv_t *mm_idx_bed_read_merge(const mm_idx_t *mi, const char *fn, int read_junc, int min_sc)
766763
{
767764
long n = 0, n0 = 0;
768765
int32_t i;
769766
mm_idx_intv_t *I;
770-
I = mm_idx_bed_read_core(mi, fn, read_junc, is_pass1);
767+
I = mm_idx_bed_read_core(mi, fn, read_junc, min_sc);
771768
if (I == 0) return 0;
772769
for (i = 0; i < mi->n_seq; ++i) {
773770
int32_t j, j0, k;
@@ -796,48 +793,13 @@ static mm_idx_intv_t *mm_idx_bed_read_merged(const mm_idx_t *mi, const char *fn,
796793
return I;
797794
}
798795

799-
static mm_idx_jjump_t *mm_idx_bed2jjump(const mm_idx_t *mi, const mm_idx_intv_t *I)
800-
{
801-
int32_t i;
802-
mm_idx_jjump_t *J;
803-
J = CALLOC(mm_idx_jjump_t, mi->n_seq);
804-
for (i = 0; i < mi->n_seq; ++i) {
805-
int32_t j, k;
806-
const mm_idx_intv_t *intv = &I[i];
807-
mm_idx_jjump_t *jj = &J[i];
808-
jj->n = intv->n * 2;
809-
jj->a = CALLOC(mm_idx_jjump1_t, jj->n);
810-
for (j = k = 0; j < intv->n; ++j) {
811-
jj->a[k].off = intv->a[j].st, jj->a[k].off2 = intv->a[j].en, jj->a[k].cnt = intv->a[j].cnt, jj->a[k].strand = intv->a[j].strand, ++k;
812-
jj->a[k].off = intv->a[j].en, jj->a[k].off2 = intv->a[j].st, jj->a[k].cnt = intv->a[j].cnt, jj->a[k].strand = intv->a[j].strand, ++k;
813-
}
814-
radix_sort_jj(jj->a, jj->a + jj->n);
815-
}
816-
return J;
817-
}
818-
819-
int mm_idx_bed_read2(mm_idx_t *mi, const char *fn, int read_junc, int for_score, int for_jump)
796+
int mm_idx_bed_read(mm_idx_t *mi, const char *fn, int read_junc)
820797
{
821-
int32_t i;
822-
mm_idx_intv_t *I;
823798
if (mi->h == 0) mm_idx_index_name(mi);
824-
I = mm_idx_bed_read_merged(mi, fn, read_junc, 0);
825-
if (I == 0) return 0;
826-
if (for_jump)
827-
mi->J = mm_idx_bed2jjump(mi, I);
828-
if (!for_score) {
829-
for (i = 0; i < mi->n_seq; ++i)
830-
free(I[i].a);
831-
free(I);
832-
} else mi->I = I;
799+
mi->I = mm_idx_bed_read_merge(mi, fn, read_junc, -1);
833800
return 0;
834801
}
835802

836-
int mm_idx_bed_read(mm_idx_t *mi, const char *fn, int read_junc)
837-
{
838-
return mm_idx_bed_read2(mi, fn, read_junc, 1, 0);
839-
}
840-
841803
int mm_idx_bed_junc(const mm_idx_t *mi, int32_t ctg, int32_t st, int32_t en, uint8_t *s)
842804
{
843805
int32_t i, left, right;
@@ -863,6 +825,96 @@ int mm_idx_bed_junc(const mm_idx_t *mi, int32_t ctg, int32_t st, int32_t en, uin
863825
return left;
864826
}
865827

828+
/*********************************
829+
* Reading junctions for jumping *
830+
*********************************/
831+
832+
#define sort_key_jj(a) ((a).off)
833+
KRADIX_SORT_INIT(jj, mm_idx_jjump1_t, sort_key_jj, 4)
834+
835+
#define sort_key_jj2(a) ((a).off2)
836+
KRADIX_SORT_INIT(jj2, mm_idx_jjump1_t, sort_key_jj2, 4)
837+
838+
static mm_idx_jjump_t *mm_idx_bed2jjump(const mm_idx_t *mi, const mm_idx_intv_t *I, uint16_t flag)
839+
{
840+
int32_t i;
841+
mm_idx_jjump_t *J;
842+
J = CALLOC(mm_idx_jjump_t, mi->n_seq);
843+
for (i = 0; i < mi->n_seq; ++i) {
844+
int32_t j, k;
845+
const mm_idx_intv_t *intv = &I[i];
846+
mm_idx_jjump_t *jj = &J[i];
847+
jj->n = intv->n * 2;
848+
jj->a = CALLOC(mm_idx_jjump1_t, jj->n);
849+
for (j = k = 0; j < intv->n; ++j) {
850+
jj->a[k].off = intv->a[j].st, jj->a[k].off2 = intv->a[j].en, jj->a[k].cnt = intv->a[j].cnt, jj->a[k].strand = intv->a[j].strand, jj->a[k++].flag = flag;
851+
jj->a[k].off = intv->a[j].en, jj->a[k].off2 = intv->a[j].st, jj->a[k].cnt = intv->a[j].cnt, jj->a[k].strand = intv->a[j].strand, jj->a[k++].flag = flag;
852+
}
853+
radix_sort_jj(jj->a, jj->a + jj->n);
854+
}
855+
return J;
856+
}
857+
858+
static mm_idx_jjump_t *mm_idx_jjump_merge(const mm_idx_t *mi, const mm_idx_jjump_t *J0, const mm_idx_jjump_t *J1)
859+
{
860+
int32_t i;
861+
mm_idx_jjump_t *J2;
862+
J2 = CALLOC(mm_idx_jjump_t, mi->n_seq);
863+
for (i = 0; i < mi->n_seq; ++i) {
864+
int32_t j, j0, k;
865+
const mm_idx_jjump_t *jj0 = &J0[i], *jj1 = &J1[i];
866+
mm_idx_jjump_t *jj2 = &J2[i];
867+
// merge jj0 and jj1 into jj2; faster with sorted merge but the performance difference should be negligible
868+
jj2->n = jj0->n + jj1->n;
869+
jj2->a = CALLOC(mm_idx_jjump1_t, jj2->n);
870+
for (j = k = 0; j < jj0->n; ++j) jj2->a[k++] = jj0->a[j];
871+
for (j = k = 0; j < jj1->n; ++j) jj2->a[k++] = jj1->a[j];
872+
radix_sort_jj(jj2->a, jj2->a + jj2->n); // sort by a[].off
873+
// sort by a[].off and then by a[].off2 such that they can be merged later
874+
for (j0 = 0, j = 1; j <= jj2->n; ++j) {
875+
if (j == jj2->n || jj2->a[j0].off != jj2->a[j].off) {
876+
radix_sort_jj2(jj2->a + j0, jj2->a + j);
877+
j0 = j;
878+
}
879+
}
880+
// the actual merge
881+
for (j0 = 0, j = 1, k = 0; j <= jj2->n; ++j) {
882+
if (j == jj2->n || jj2->a[j0].off != jj2->a[j].off || jj2->a[j0].off2 != jj2->a[j].off2) {
883+
int32_t t, cnt = 0;
884+
uint16_t flag = 0;
885+
for (t = j0; t < j; ++t) cnt += jj2->a[t].cnt, flag |= jj2->a[t].flag;
886+
jj2->a[k] = jj2->a[j0];
887+
jj2->a[k].cnt = cnt;
888+
jj2->a[k++].flag = flag;
889+
j0 = j;
890+
}
891+
}
892+
}
893+
return J2;
894+
}
895+
896+
/*
 * Read junctions from a BED file and add them to the index's jump table.
 *
 * mi      index to update; mi->J is set on first use, or replaced by the
 *         merge of the existing table and the newly read one
 * fn      BED file name
 * flag    value stored in the flag field of every jump record read from fn
 * min_sc  forwarded to the BED reader; when positive, BED records scoring
 *         below it are skipped
 *
 * Returns 0 on success, or -1 if the BED file could not be loaded, in which
 * case mi->J is left untouched.
 */
int mm_idx_jjump_read(mm_idx_t *mi, const char *fn, int flag, int min_sc)
{
	int32_t i;
	mm_idx_intv_t *I;
	mm_idx_jjump_t *J;
	if (mi->h == 0) mm_idx_index_name(mi);
	I = mm_idx_bed_read_merge(mi, fn, 1, min_sc);
	if (I == 0) return -1; // the reader returns 0 on failure; do not dereference it below
	J = mm_idx_bed2jjump(mi, I, flag);
	for (i = 0; i < mi->n_seq; ++i) free(I[i].a);
	free(I);
	if (mi->J) {
		mm_idx_jjump_t *J2;
		J2 = mm_idx_jjump_merge(mi, mi->J, J); // merge the existing table with the one just read
		for (i = 0; i < mi->n_seq; ++i) {
			free(mi->J[i].a); free(J[i].a);
		}
		free(mi->J); free(J);
		mi->J = J2;
	} else mi->J = J;
	return 0;
}
917+
866918
static int32_t mm_idx_jump_get_core(int32_t n, const mm_idx_jjump1_t *a, int32_t x) // similar to mm_idx_find_intv()
867919
{
868920
int32_t s = 0, e = n;

main.c

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,7 @@ static ko_longopt_t long_options[] = {
8484
{ "pairing", ko_required_argument, 359 },
8585
{ "jump-min-match", ko_required_argument, 360 },
8686
{ "write-junc", ko_no_argument, 361 },
87+
{ "jump-pass1", ko_required_argument, 362 },
8788
{ "dbg-seed-occ", ko_no_argument, 501 },
8889
{ "help", ko_no_argument, 'h' },
8990
{ "max-intron-len", ko_required_argument, 'G' },
@@ -133,7 +134,7 @@ int main(int argc, char *argv[])
133134
mm_mapopt_t opt;
134135
mm_idxopt_t ipt;
135136
int i, c, n_threads = 3, n_parts, old_best_n = -1;
136-
char *fnw = 0, *rg = 0, *junc_bed = 0, *jump_bed = 0, *fn_spsc = 0, *s, *alt_list = 0;
137+
char *fnw = 0, *rg = 0, *fn_bed_junc = 0, *fn_bed_jump = 0, *fn_bed_pass1 = 0, *fn_spsc = 0, *s, *alt_list = 0;
137138
FILE *fp_help = stderr;
138139
mm_idx_reader_t *idx_rdr;
139140
mm_idx_t *mi;
@@ -195,7 +196,7 @@ int main(int argc, char *argv[])
195196
else if (c == 'R') rg = o.arg;
196197
else if (c == 'h') fp_help = stdout;
197198
else if (c == '2') opt.flag |= MM_F_2_IO_THREADS;
198-
else if (c == 'j') jump_bed = o.arg;
199+
else if (c == 'j') fn_bed_jump = o.arg;
199200
else if (c == 'J') {
200201
int t;
201202
t = atoi(o.arg);
@@ -237,7 +238,7 @@ int main(int argc, char *argv[])
237238
else if (c == 336) opt.flag |= MM_F_HARD_MLEVEL; // --hard-mask-level
238239
else if (c == 337) opt.max_sw_mat = mm_parse_num(o.arg); // --cap-sw-mat
239240
else if (c == 338) opt.max_qlen = mm_parse_num(o.arg); // --max-qlen
240-
else if (c == 340) junc_bed = o.arg; // --junc-bed
241+
else if (c == 340) fn_bed_junc = o.arg; // --junc-bed
241242
else if (c == 341) opt.junc_bonus = atoi(o.arg); // --junc-bonus
242243
else if (c == 358) opt.junc_pen = atoi(o.arg); // --junc-pen
243244
else if (c == 342) opt.flag |= MM_F_SAM_HIT_ONLY; // --sam-hit-only
@@ -257,6 +258,7 @@ int main(int argc, char *argv[])
257258
else if (c == 357) fn_spsc = o.arg; // --spsc
258259
else if (c == 360) opt.jump_min_match = mm_parse_num(o.arg); // --jump-min-match
259260
else if (c == 361) opt.flag |= MM_F_OUT_JUNC | MM_F_CIGAR; // --write-junc
261+
else if (c == 362) fn_bed_pass1 = o.arg; // --jump-pass1
260262
else if (c == 501) mm_dbg_flag |= MM_DBG_SEED_FREQ; // --dbg-seed-occ
261263
else if (c == 330) {
262264
fprintf(stderr, "[WARNING] \033[1;31m --lj-min-ratio has been deprecated.\033[0m\n");
@@ -458,16 +460,21 @@ int main(int argc, char *argv[])
458460
__func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0), mi->n_seq);
459461
if (argc != o.ind + 1) mm_mapopt_update(&opt, mi);
460462
if (mm_verbose >= 3) mm_idx_stat(mi);
461-
if (junc_bed) {
462-
mm_idx_bed_read(mi, junc_bed, 1);
463+
if (fn_bed_junc) {
464+
mm_idx_bed_read(mi, fn_bed_junc, 1);
463465
if (mi->I == 0 && mm_verbose >= 2)
464466
fprintf(stderr, "[WARNING] failed to load the junction BED file\n");
465467
}
466-
if (jump_bed) {
467-
mm_idx_bed_read2(mi, jump_bed, 1, 0, 1);
468+
if (fn_bed_jump) {
469+
mm_idx_jjump_read(mi, fn_bed_jump, MM_JUNC_ANNO, -1);
468470
if (mi->J == 0 && mm_verbose >= 2)
469471
fprintf(stderr, "[WARNING] failed to load the jump BED file\n");
470472
}
473+
if (fn_bed_pass1) {
474+
mm_idx_jjump_read(mi, fn_bed_pass1, MM_JUNC_MISC, 5);
475+
if (mi->J == 0 && mm_verbose >= 2)
476+
fprintf(stderr, "[WARNING] failed to load the pass-1 jump BED file\n");
477+
}
471478
if (fn_spsc) {
472479
mm_idx_spsc_read(mi, fn_spsc, mm_max_spsc_bonus(&opt));
473480
if (mi->spsc == 0 && mm_verbose >= 2)

minimap.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
#include <stdio.h>
66
#include <sys/types.h>
77

8-
#define MM_VERSION "2.28-r1271-dirty"
8+
#define MM_VERSION "2.28-r1272-dirty"
99

1010
#define MM_F_NO_DIAG (0x001LL) // no exact diagonal hit
1111
#define MM_F_NO_DUAL (0x002LL) // skip pairs where query name is lexicographically larger than target name

mmpriv.h

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,9 @@
2424
#define MM_SEED_SEG_SHIFT 48
2525
#define MM_SEED_SEG_MASK (0xffULL<<(MM_SEED_SEG_SHIFT))
2626

27+
#define MM_JUNC_ANNO 0x1
28+
#define MM_JUNC_MISC 0x2
29+
2730
#ifndef kroundup32
2831
#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
2932
#endif
@@ -54,8 +57,9 @@ typedef struct {
5457
} mm_seg_t;
5558

5659
typedef struct {
57-
int32_t off, off2;
58-
int32_t cnt, strand;
60+
int32_t off, off2, cnt;
61+
int16_t strand;
62+
uint16_t flag;
5963
} mm_idx_jjump1_t;
6064

6165
double cputime(void);
@@ -81,14 +85,17 @@ void mm_write_sam2(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, int se
8185
void mm_write_sam3(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, int seg_idx, int reg_idx, int n_seg, const int *n_regss, const mm_reg1_t *const* regss, void *km, int64_t opt_flag, int rep_len);
8286
void mm_write_junc(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r);
8387

88+
// indexing related in index.c
8489
void mm_idxopt_init(mm_idxopt_t *opt);
8590
const uint64_t *mm_idx_get(const mm_idx_t *mi, uint64_t minier, int *n);
8691
int32_t mm_idx_cal_max_occ(const mm_idx_t *mi, float f);
8792
int mm_idx_getseq2(const mm_idx_t *mi, int is_rev, uint32_t rid, uint32_t st, uint32_t en, uint8_t *seq);
8893
mm_reg1_t *mm_gen_regs(void *km, uint32_t hash, int qlen, int n_u, uint64_t *u, mm128_t *a, int is_qstrand);
89-
int mm_idx_bed_read2(mm_idx_t *mi, const char *fn, int read_junc, int for_score, int for_jump);
94+
int mm_idx_bed_read(mm_idx_t *mi, const char *fn, int read_junc);
95+
int mm_idx_jjump_read(mm_idx_t *mi, const char *fn, int flag, int min_sc);
9096
const mm_idx_jjump1_t *mm_idx_jump_get(const mm_idx_t *db, int32_t cid, int32_t st, int32_t en, int32_t *n);
9197

98+
// chaining in lchain.c
9299
mm128_t *mg_lchain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int max_iter, int min_cnt, int min_sc, float chn_pen_gap, float chn_pen_skip,
93100
int is_cdna, int n_segs, int64_t n, mm128_t *a, int *n_u_, uint64_t **_u, void *km);
94101
mm128_t *mg_lchain_rmq(int max_dist, int max_dist_inner, int bw, int max_chn_skip, int cap_rmq_size, int min_cnt, int min_sc, float chn_pen_gap, float chn_pen_skip,

0 commit comments

Comments
 (0)