@@ -33,7 +33,7 @@ typedef struct mm_idx_bucket_s {
3333} mm_idx_bucket_t ;
3434
3535typedef struct {
36- int32_t st , en , max ; // max is not used for now
36+ int32_t st , en , cnt ;
3737 int32_t score :30 , strand :2 ;
3838} mm_idx_intv1_t ;
3939
@@ -682,7 +682,7 @@ KRADIX_SORT_INIT(end, mm_idx_intv1_t, sort_key_end, 4)
682682#define sort_key_jj (a ) ((a).off)
683683KRADIX_SORT_INIT (jj , mm_idx_jjump1_t , sort_key_jj , 4 )
684684
685- mm_idx_intv_t * mm_idx_read_bed (const mm_idx_t * mi , const char * fn , int read_junc )
685+ static mm_idx_intv_t * mm_idx_bed_read_core (const mm_idx_t * mi , const char * fn , int read_junc , int is_pass1 )
686686{
687687 gzFile fp ;
688688 kstream_t * ks ;
@@ -691,7 +691,7 @@ mm_idx_intv_t *mm_idx_read_bed(const mm_idx_t *mi, const char *fn, int read_junc
691691
692692 fp = fn && strcmp (fn , "-" )? gzopen (fn , "r" ) : gzdopen (fileno (stdin ), "r" );
693693 if (fp == 0 ) return 0 ;
694- I = (mm_idx_intv_t * ) calloc ( mi -> n_seq , sizeof ( * I ) );
694+ I = CALLOC (mm_idx_intv_t , mi -> n_seq );
695695 ks = ks_init (fp );
696696 while (ks_getuntil (ks , KS_SEP_LINE , & str , 0 ) >= 0 ) {
697697 mm_idx_intv_t * r ;
@@ -712,7 +712,7 @@ mm_idx_intv_t *mm_idx_read_bed(const mm_idx_t *mi, const char *fn, int read_junc
712712 t .en = atol (q );
713713 if (t .en < 0 ) break ;
714714 } else if (i == 4 ) { // BED score
715- t .score = atol (q );
715+ t .score = * q >= '0' && * q <= '9' ? atol (q ) : -1 ;
716716 } else if (i == 5 ) { // strand
717717 t .strand = * q == '+' ? 1 : * q == '-' ? -1 : 0 ;
718718 } else if (i == 9 ) {
@@ -728,7 +728,8 @@ mm_idx_intv_t *mm_idx_read_bed(const mm_idx_t *mi, const char *fn, int read_junc
728728 ++ i , q = p + 1 ;
729729 }
730730 }
731- if (id < 0 || t .st < 0 || t .st >= t .en ) continue ;
731+ if (id < 0 || t .st < 0 || t .st >= t .en ) continue ; // contig ID not found, or other problems
732+ if (is_pass1 && t .score < 5 ) continue ; // for pass-1 BED, ignore junctions with weak signals; NB: paired with pass-1!
732733 r = & I [id ];
733734 if (i >= 11 && read_junc ) { // BED12
734735 int32_t st , sz , en ;
@@ -761,62 +762,69 @@ mm_idx_intv_t *mm_idx_read_bed(const mm_idx_t *mi, const char *fn, int read_junc
761762 return I ;
762763}
763764
764- static mm_idx_intv_t * mm_idx_bed_read_core (const mm_idx_t * mi , const char * fn , int read_junc )
765+ static mm_idx_intv_t * mm_idx_bed_read_merged (const mm_idx_t * mi , const char * fn , int read_junc , int is_pass1 )
765766{
766767 long n = 0 , n0 = 0 ;
767768 int32_t i ;
768769 mm_idx_intv_t * I ;
769- I = mm_idx_read_bed (mi , fn , read_junc );
770+ I = mm_idx_bed_read_core (mi , fn , read_junc , is_pass1 );
770771 if (I == 0 ) return 0 ;
771772 for (i = 0 ; i < mi -> n_seq ; ++ i ) {
772773 int32_t j , j0 , k ;
773774 mm_idx_intv_t * intv = & I [i ];
774775 n0 += intv -> n ;
775- radix_sort_bed (intv -> a , intv -> a + intv -> n );
776- for (j = 1 , j0 = 0 ; j <= intv -> n ; ++ j ) {
776+ radix_sort_bed (intv -> a , intv -> a + intv -> n ); // sort by st
777+ for (j = 1 , j0 = 0 ; j <= intv -> n ; ++ j ) { // sort by st and then by end
777778 if (j == intv -> n || intv -> a [j ].st != intv -> a [j0 ].st ) {
778779 radix_sort_end (intv -> a + j0 , intv -> a + j );
779780 j0 = j ;
780781 }
781782 }
782- for (j = 1 , j0 = 0 , k = 0 ; j <= intv -> n ; ++ j ) {
783+ for (j = 1 , j0 = 0 , k = 0 ; j <= intv -> n ; ++ j ) { // merge intervals with the same (st, en)
783784 if (j == intv -> n || intv -> a [j ].st != intv -> a [j0 ].st || intv -> a [j ].en != intv -> a [j0 ].en ) {
784- intv -> a [k ++ ] = intv -> a [j0 ];
785+ intv -> a [k ] = intv -> a [j0 ];
786+ intv -> a [k ++ ].cnt = j - j0 ;
785787 j0 = j ;
786788 }
787789 }
788- intv -> n = k ;
790+ intv -> a = REALLOC (mm_idx_intv1_t , intv -> a , k );
791+ intv -> n = intv -> m = k ;
789792 n += k ;
790793 }
791794 if (mm_verbose >= 3 )
792795 fprintf (stderr , "[%s] read %ld introns, %ld of which are non-redundant\n" , __func__ , n0 , n );
793796 return I ;
794797}
795798
799+ static mm_idx_jjump_t * mm_idx_bed2jjump (const mm_idx_t * mi , const mm_idx_intv_t * I )
800+ {
801+ int32_t i ;
802+ mm_idx_jjump_t * J ;
803+ J = CALLOC (mm_idx_jjump_t , mi -> n_seq );
804+ for (i = 0 ; i < mi -> n_seq ; ++ i ) {
805+ int32_t j , k ;
806+ const mm_idx_intv_t * intv = & I [i ];
807+ mm_idx_jjump_t * jj = & J [i ];
808+ jj -> n = intv -> n * 2 ;
809+ jj -> a = CALLOC (mm_idx_jjump1_t , jj -> n );
810+ for (j = k = 0 ; j < intv -> n ; ++ j ) {
811+ jj -> a [k ].off = intv -> a [j ].st , jj -> a [k ].off2 = intv -> a [j ].en , jj -> a [k ].cnt = intv -> a [j ].cnt , jj -> a [k ].strand = intv -> a [j ].strand , ++ k ;
812+ jj -> a [k ].off = intv -> a [j ].en , jj -> a [k ].off2 = intv -> a [j ].st , jj -> a [k ].cnt = intv -> a [j ].cnt , jj -> a [k ].strand = intv -> a [j ].strand , ++ k ;
813+ }
814+ radix_sort_jj (jj -> a , jj -> a + jj -> n );
815+ }
816+ return J ;
817+ }
818+
796819int mm_idx_bed_read2 (mm_idx_t * mi , const char * fn , int read_junc , int for_score , int for_jump )
797820{
798821 int32_t i ;
799822 mm_idx_intv_t * I ;
800823 if (mi -> h == 0 ) mm_idx_index_name (mi );
801- I = mm_idx_bed_read_core (mi , fn , read_junc );
824+ I = mm_idx_bed_read_merged (mi , fn , read_junc , 0 );
802825 if (I == 0 ) return 0 ;
803- if (for_jump ) {
804- mm_idx_jjump_t * J ;
805- J = CALLOC (mm_idx_jjump_t , mi -> n_seq );
806- for (i = 0 ; i < mi -> n_seq ; ++ i ) {
807- int32_t j , k ;
808- mm_idx_intv_t * intv = & I [i ];
809- mm_idx_jjump_t * jj = & J [i ];
810- jj -> n = intv -> n * 2 ;
811- jj -> a = CALLOC (mm_idx_jjump1_t , jj -> n );
812- for (j = k = 0 ; j < intv -> n ; ++ j ) {
813- jj -> a [k ].off = intv -> a [j ].st , jj -> a [k ].off2 = intv -> a [j ].en , jj -> a [k ++ ].strand = intv -> a [j ].strand ;
814- jj -> a [k ].off = intv -> a [j ].en , jj -> a [k ].off2 = intv -> a [j ].st , jj -> a [k ++ ].strand = intv -> a [j ].strand ;
815- }
816- radix_sort_jj (jj -> a , jj -> a + jj -> n );
817- }
818- mi -> J = J ;
819- }
826+ if (for_jump )
827+ mi -> J = mm_idx_bed2jjump (mi , I );
820828 if (!for_score ) {
821829 for (i = 0 ; i < mi -> n_seq ; ++ i )
822830 free (I [i ].a );
0 commit comments