Skip to content

Commit d930ea9

Browse files
committed
r1271: code refactoring in prep for 2-pass
1 parent a832a42 commit d930ea9

File tree

3 files changed

+41
-32
lines changed

3 files changed

+41
-32
lines changed

index.c

Lines changed: 38 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ typedef struct mm_idx_bucket_s {
3333
} mm_idx_bucket_t;
3434

3535
typedef struct {
36-
int32_t st, en, max; // max is not used for now
36+
int32_t st, en, cnt;
3737
int32_t score:30, strand:2;
3838
} mm_idx_intv1_t;
3939

@@ -682,7 +682,7 @@ KRADIX_SORT_INIT(end, mm_idx_intv1_t, sort_key_end, 4)
682682
#define sort_key_jj(a) ((a).off)
683683
KRADIX_SORT_INIT(jj, mm_idx_jjump1_t, sort_key_jj, 4)
684684

685-
mm_idx_intv_t *mm_idx_read_bed(const mm_idx_t *mi, const char *fn, int read_junc)
685+
static mm_idx_intv_t *mm_idx_bed_read_core(const mm_idx_t *mi, const char *fn, int read_junc, int is_pass1)
686686
{
687687
gzFile fp;
688688
kstream_t *ks;
@@ -691,7 +691,7 @@ mm_idx_intv_t *mm_idx_read_bed(const mm_idx_t *mi, const char *fn, int read_junc
691691

692692
fp = fn && strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r");
693693
if (fp == 0) return 0;
694-
I = (mm_idx_intv_t*)calloc(mi->n_seq, sizeof(*I));
694+
I = CALLOC(mm_idx_intv_t, mi->n_seq);
695695
ks = ks_init(fp);
696696
while (ks_getuntil(ks, KS_SEP_LINE, &str, 0) >= 0) {
697697
mm_idx_intv_t *r;
@@ -712,7 +712,7 @@ mm_idx_intv_t *mm_idx_read_bed(const mm_idx_t *mi, const char *fn, int read_junc
712712
t.en = atol(q);
713713
if (t.en < 0) break;
714714
} else if (i == 4) { // BED score
715-
t.score = atol(q);
715+
t.score = *q >= '0' && *q <= '9'? atol(q) : -1;
716716
} else if (i == 5) { // strand
717717
t.strand = *q == '+'? 1 : *q == '-'? -1 : 0;
718718
} else if (i == 9) {
@@ -728,7 +728,8 @@ mm_idx_intv_t *mm_idx_read_bed(const mm_idx_t *mi, const char *fn, int read_junc
728728
++i, q = p + 1;
729729
}
730730
}
731-
if (id < 0 || t.st < 0 || t.st >= t.en) continue;
731+
if (id < 0 || t.st < 0 || t.st >= t.en) continue; // contig ID not found, or other problems
732+
if (is_pass1 && t.score < 5) continue; // for pass-1 BED, ignore junctions with weak signals; NB: paired with pass-1!
732733
r = &I[id];
733734
if (i >= 11 && read_junc) { // BED12
734735
int32_t st, sz, en;
@@ -761,62 +762,69 @@ mm_idx_intv_t *mm_idx_read_bed(const mm_idx_t *mi, const char *fn, int read_junc
761762
return I;
762763
}
763764

764-
static mm_idx_intv_t *mm_idx_bed_read_core(const mm_idx_t *mi, const char *fn, int read_junc)
765+
static mm_idx_intv_t *mm_idx_bed_read_merged(const mm_idx_t *mi, const char *fn, int read_junc, int is_pass1)
765766
{
766767
long n = 0, n0 = 0;
767768
int32_t i;
768769
mm_idx_intv_t *I;
769-
I = mm_idx_read_bed(mi, fn, read_junc);
770+
I = mm_idx_bed_read_core(mi, fn, read_junc, is_pass1);
770771
if (I == 0) return 0;
771772
for (i = 0; i < mi->n_seq; ++i) {
772773
int32_t j, j0, k;
773774
mm_idx_intv_t *intv = &I[i];
774775
n0 += intv->n;
775-
radix_sort_bed(intv->a, intv->a + intv->n);
776-
for (j = 1, j0 = 0; j <= intv->n; ++j) {
776+
radix_sort_bed(intv->a, intv->a + intv->n); // sort by st
777+
for (j = 1, j0 = 0; j <= intv->n; ++j) { // sort by st and then by end
777778
if (j == intv->n || intv->a[j].st != intv->a[j0].st) {
778779
radix_sort_end(intv->a + j0, intv->a + j);
779780
j0 = j;
780781
}
781782
}
782-
for (j = 1, j0 = 0, k = 0; j <= intv->n; ++j) {
783+
for (j = 1, j0 = 0, k = 0; j <= intv->n; ++j) { // merge intervals with the same (st, en)
783784
if (j == intv->n || intv->a[j].st != intv->a[j0].st || intv->a[j].en != intv->a[j0].en) {
784-
intv->a[k++] = intv->a[j0];
785+
intv->a[k] = intv->a[j0];
786+
intv->a[k++].cnt = j - j0;
785787
j0 = j;
786788
}
787789
}
788-
intv->n = k;
790+
intv->a = REALLOC(mm_idx_intv1_t, intv->a, k);
791+
intv->n = intv->m = k;
789792
n += k;
790793
}
791794
if (mm_verbose >= 3)
792795
fprintf(stderr, "[%s] read %ld introns, %ld of which are non-redundant\n", __func__, n0, n);
793796
return I;
794797
}
795798

799+
static mm_idx_jjump_t *mm_idx_bed2jjump(const mm_idx_t *mi, const mm_idx_intv_t *I)
800+
{
801+
int32_t i;
802+
mm_idx_jjump_t *J;
803+
J = CALLOC(mm_idx_jjump_t, mi->n_seq);
804+
for (i = 0; i < mi->n_seq; ++i) {
805+
int32_t j, k;
806+
const mm_idx_intv_t *intv = &I[i];
807+
mm_idx_jjump_t *jj = &J[i];
808+
jj->n = intv->n * 2;
809+
jj->a = CALLOC(mm_idx_jjump1_t, jj->n);
810+
for (j = k = 0; j < intv->n; ++j) {
811+
jj->a[k].off = intv->a[j].st, jj->a[k].off2 = intv->a[j].en, jj->a[k].cnt = intv->a[j].cnt, jj->a[k].strand = intv->a[j].strand, ++k;
812+
jj->a[k].off = intv->a[j].en, jj->a[k].off2 = intv->a[j].st, jj->a[k].cnt = intv->a[j].cnt, jj->a[k].strand = intv->a[j].strand, ++k;
813+
}
814+
radix_sort_jj(jj->a, jj->a + jj->n);
815+
}
816+
return J;
817+
}
818+
796819
int mm_idx_bed_read2(mm_idx_t *mi, const char *fn, int read_junc, int for_score, int for_jump)
797820
{
798821
int32_t i;
799822
mm_idx_intv_t *I;
800823
if (mi->h == 0) mm_idx_index_name(mi);
801-
I = mm_idx_bed_read_core(mi, fn, read_junc);
824+
I = mm_idx_bed_read_merged(mi, fn, read_junc, 0);
802825
if (I == 0) return 0;
803-
if (for_jump) {
804-
mm_idx_jjump_t *J;
805-
J = CALLOC(mm_idx_jjump_t, mi->n_seq);
806-
for (i = 0; i < mi->n_seq; ++i) {
807-
int32_t j, k;
808-
mm_idx_intv_t *intv = &I[i];
809-
mm_idx_jjump_t *jj = &J[i];
810-
jj->n = intv->n * 2;
811-
jj->a = CALLOC(mm_idx_jjump1_t, jj->n);
812-
for (j = k = 0; j < intv->n; ++j) {
813-
jj->a[k].off = intv->a[j].st, jj->a[k].off2 = intv->a[j].en, jj->a[k++].strand = intv->a[j].strand;
814-
jj->a[k].off = intv->a[j].en, jj->a[k].off2 = intv->a[j].st, jj->a[k++].strand = intv->a[j].strand;
815-
}
816-
radix_sort_jj(jj->a, jj->a + jj->n);
817-
}
818-
mi->J = J;
819-
}
826+
if (for_jump)
827+
mi->J = mm_idx_bed2jjump(mi, I);
820828
if (!for_score) {
821829
for (i = 0; i < mi->n_seq; ++i)
822830
free(I[i].a);

minimap.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
#include <stdio.h>
66
#include <sys/types.h>
77

8-
#define MM_VERSION "2.28-r1270-dirty"
8+
#define MM_VERSION "2.28-r1271-dirty"
99

1010
#define MM_F_NO_DIAG (0x001LL) // no exact diagonal hit
1111
#define MM_F_NO_DUAL (0x002LL) // skip pairs where query name is lexicographically larger than target name

mmpriv.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333

3434
#define MALLOC(type, len) ((type*)malloc((len) * sizeof(type)))
3535
#define CALLOC(type, len) ((type*)calloc((len), sizeof(type)))
36+
#define REALLOC(type, ptr, cnt) ((type*)realloc((ptr), (cnt) * sizeof(type)))
3637

3738
#ifdef __cplusplus
3839
extern "C" {
@@ -54,7 +55,7 @@ typedef struct {
5455

5556
typedef struct {
5657
int32_t off, off2;
57-
int32_t strand;
58+
int32_t cnt, strand;
5859
} mm_idx_jjump1_t;
5960

6061
double cputime(void);

0 commit comments

Comments
 (0)