diff --git a/NEWS.md b/NEWS.md index 5de4ea62..b18f53fe 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,41 @@ +Release 2.25-r1173 (25 April 2023) +---------------------------------- + +Notable changes: + + * Improvement: use the miniprot splice model for RNA-seq alignment by default. + This model considers non-GT-AG splice sites and leads to slightly higher + (<0.1%) accuracy and sensitivity on real human data. + + * Change: increased the default `-I` to `8G` such that minimap2 would create a + uni-part index for a pair of mammalian genomes. This change may increase the + memory for all-vs-all read overlap alignment given large datasets. + + * New feature: output the sequences in secondary alignments with option + `--secondary-seq` (#687). + + * Bugfix: `--rmq` was not parsed correctly (#1010). + + * Bugfix: possibly incorrect coordinate when applying end bonus to the target + sequence (#1025). This is a ksw2 bug. It does not affect minimap2 as + minimap2 is not using the affected feature. + + * Improvement: incorporated several changes for better compatibility with + Windows (#1051) and for minimap2 integration at Oxford Nanopore Technologies + (#1048 and #1033). + + * Improvement: output the HD-line in SAM output (#1019). + + * Improvement: check minimap2 index file in mappy to prevent segmentation + fault for certain indices (#1008). + +For genomic sequences, minimap2 should give identical output to v2.24. +Long-read RNA-seq alignment may occasionally differ from previous versions. + +(2.25: 25 April 2023, r1173) + + + Release 2.24-r1122 (26 December 2021) ------------------------------------- diff --git a/README.md b/README.md index 8ed1ea6b..13d00c31 100644 --- a/README.md +++ b/README.md @@ -74,8 +74,8 @@ Detailed evaluations are available from the [minimap2 paper][doi] or the Minimap2 is optimized for x86-64 CPUs. 
You can acquire precompiled binaries from the [release page][release] with: ```sh -curl -L https://github.com/lh3/minimap2/releases/download/v2.24/minimap2-2.24_x64-linux.tar.bz2 | tar -jxvf - -./minimap2-2.24_x64-linux/minimap2 +curl -L https://github.com/lh3/minimap2/releases/download/v2.25/minimap2-2.25_x64-linux.tar.bz2 | tar -jxvf - +./minimap2-2.25_x64-linux/minimap2 ``` If you want to compile from the source, you need to have a C compiler, GNU make and zlib development files installed. Then type `make` in the source code diff --git a/cookbook.md b/cookbook.md index 8b3c1053..cbaed916 100644 --- a/cookbook.md +++ b/cookbook.md @@ -31,8 +31,8 @@ To acquire the data used in this cookbook and to install minimap2 and paftools, please follow the command lines below: ```sh # install minimap2 executables -curl -L https://github.com/lh3/minimap2/releases/download/v2.24/minimap2-2.24_x64-linux.tar.bz2 | tar jxf - -cp minimap2-2.24_x64-linux/{minimap2,k8,paftools.js} . # copy executables +curl -L https://github.com/lh3/minimap2/releases/download/v2.25/minimap2-2.25_x64-linux.tar.bz2 | tar jxf - +cp minimap2-2.25_x64-linux/{minimap2,k8,paftools.js} . 
# copy executables export PATH="$PATH:"`pwd` # put the current directory on PATH # download example datasets curl -L https://github.com/lh3/minimap2/releases/download/v2.10/cookbook-data.tgz | tar zxf - diff --git a/index.c b/index.c index c9bd01f6..0d8a2ae0 100644 --- a/index.c +++ b/index.c @@ -358,7 +358,7 @@ static void *worker_pipeline(void *shared, int step, void *in) for (i = 0; i < s->n_seq; ++i) { mm_bseq1_t *t = &s->seq[i]; if (t->l_seq > 0) - mm_sketch2(0, t->seq, t->l_seq, p->mi->w, p->mi->k, t->rid, p->mi->flag&MM_I_HPC, p->mi->flag&MM_I_SYNCMER, &s->a); + mm_sketch(0, t->seq, t->l_seq, p->mi->w, p->mi->k, t->rid, p->mi->flag&MM_I_HPC, &s->a); else if (mm_verbose >= 2) fprintf(stderr, "[WARNING] the length database sequence '%s' is 0\n", t->name); free(t->seq); free(t->name); @@ -446,7 +446,7 @@ mm_idx_t *mm_idx_str(int w, int k, int is_hpc, int bucket_bits, int n, const cha sum_len += p->len; if (p->len > 0) { a.n = 0; - mm_sketch2(0, s, p->len, w, k, i, is_hpc, 0, &a); // TODO: mm_idx_str() doesn't support syncmer + mm_sketch(0, s, p->len, w, k, i, is_hpc, &a); mm_idx_add(mi, a.n, a.a); } } diff --git a/main.c b/main.c index 31d874c3..7db642d1 100644 --- a/main.c +++ b/main.c @@ -120,7 +120,7 @@ static inline void yes_or_no(mm_mapopt_t *opt, int64_t flag, int long_idx, const int main(int argc, char *argv[]) { - const char *opt_str = "2aSDw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Qu:R:hF:LC:yYPo:e:U:j:J:"; + const char *opt_str = "2aSDw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Qu:R:hF:LC:yYPo:e:U:J:"; ketopt_t o = KETOPT_INIT; mm_mapopt_t opt; mm_idxopt_t ipt; @@ -152,8 +152,7 @@ int main(int argc, char *argv[]) o = KETOPT_INIT; while ((c = ketopt(&o, argc, argv, 1, opt_str, long_options)) >= 0) { - if (c == 'w') ipt.w = atoi(o.arg), ipt.flag &= ~MM_I_SYNCMER; - else if (c == 'j') ipt.w = atoi(o.arg), ipt.flag |= MM_I_SYNCMER; + if (c == 'w') ipt.w = atoi(o.arg); else if (c == 'k') ipt.k = atoi(o.arg); else if (c == 'H') ipt.flag |= 
MM_I_HPC; else if (c == 'd') fnw = o.arg; // the above are indexing related options, except -I @@ -329,7 +328,6 @@ int main(int argc, char *argv[]) fprintf(fp_help, " -H use homopolymer-compressed k-mer (preferrable for PacBio)\n"); fprintf(fp_help, " -k INT k-mer size (no larger than 28) [%d]\n", ipt.k); fprintf(fp_help, " -w INT minimizer window size [%d]\n", ipt.w); -// fprintf(fp_help, " -j INT syncmer submer size (overriding -w) []\n"); fprintf(fp_help, " -I NUM split index for every ~NUM input bases [8G]\n"); fprintf(fp_help, " -d FILE dump index to FILE []\n"); fprintf(fp_help, " Mapping:\n"); diff --git a/map.c b/map.c index 360c8896..038888fb 100644 --- a/map.c +++ b/map.c @@ -62,7 +62,7 @@ static void collect_minimizers(void *km, const mm_mapopt_t *opt, const mm_idx_t mv->n = 0; for (i = n = 0; i < n_segs; ++i) { size_t j; - mm_sketch2(km, seqs[i], qlens[i], mi->w, mi->k, i, mi->flag&MM_I_HPC, mi->flag&MM_I_SYNCMER, mv); + mm_sketch(km, seqs[i], qlens[i], mi->w, mi->k, i, mi->flag&MM_I_HPC, mv); for (j = n; j < mv->n; ++j) mv->a[j].y += sum << 1; if (opt->sdust_thres > 0) // mask low-complexity minimizers diff --git a/minimap.h b/minimap.h index 05ab62cf..f4466ac0 100644 --- a/minimap.h +++ b/minimap.h @@ -5,7 +5,7 @@ #include #include -#define MM_VERSION "2.24-r1171-dirty" +#define MM_VERSION "2.25-r1173" #define MM_F_NO_DIAG (0x001LL) // no exact diagonal hit #define MM_F_NO_DUAL (0x002LL) // skip pairs where query name is lexicographically larger than target name @@ -48,7 +48,6 @@ #define MM_I_HPC 0x1 #define MM_I_NO_SEQ 0x2 #define MM_I_NO_NAME 0x4 -#define MM_I_SYNCMER 0x8 #define MM_IDX_MAGIC "MMI\2" diff --git a/minimap2.1 b/minimap2.1 index 2bcc9533..573b77fe 100644 --- a/minimap2.1 +++ b/minimap2.1 @@ -1,4 +1,4 @@ -.TH minimap2 1 "18 December 2021" "minimap2-2.24 (r1122)" "Bioinformatics tools" +.TH minimap2 1 "25 April 2023" "minimap2-2.25 (r1173)" "Bioinformatics tools" .SH NAME .PP minimap2 - mapping and alignment between collections of DNA 
sequences diff --git a/misc/paftools.js b/misc/paftools.js index bc2f29d7..5f4cd64f 100755 --- a/misc/paftools.js +++ b/misc/paftools.js @@ -1,6 +1,6 @@ #!/usr/bin/env k8 -var paftools_version = '2.24-r1152-dirty'; +var paftools_version = '2.25-r1173'; /***************************** ***** Library functions ***** diff --git a/mmpriv.h b/mmpriv.h index 7b51b984..2f5034b7 100644 --- a/mmpriv.h +++ b/mmpriv.h @@ -60,8 +60,6 @@ void radix_sort_64(uint64_t *beg, uint64_t *end); uint32_t ks_ksmall_uint32_t(size_t n, uint32_t arr[], size_t kk); void mm_sketch(void *km, const char *str, int len, int w, int k, uint32_t rid, int is_hpc, mm128_v *p); -void mm_sketch_syncmer(void *km, const char *str, int len, int smer, int k, uint32_t rid, int is_hpc, mm128_v *p); -void mm_sketch2(void *km, const char *str, int len, int w, int k, uint32_t rid, int is_hpc, int is_syncmer, mm128_v *p); mm_seed_t *mm_collect_matches(void *km, int *_n_m, int qlen, int max_occ, int max_max_occ, int dist, const mm_idx_t *mi, const mm128_v *mv, int64_t *n_a, int *rep_len, int *n_mini_pos, uint64_t **mini_pos); void mm_seed_mz_flt(void *km, mm128_v *mv, int32_t q_occ_max, float q_occ_frac); diff --git a/python/mappy.pyx b/python/mappy.pyx index 61e80ed2..8e65d202 100644 --- a/python/mappy.pyx +++ b/python/mappy.pyx @@ -3,7 +3,7 @@ from libc.stdlib cimport free cimport cmappy import sys -__version__ = '2.24' +__version__ = '2.25' cmappy.mm_reset_timer() diff --git a/setup.py b/setup.py index ce4d79c2..e3872c9f 100644 --- a/setup.py +++ b/setup.py @@ -34,7 +34,7 @@ def compile_libminimap2(*args, **kwargs): setup( name = 'mappy', - version = '2.24', + version = '2.25', url = 'https://github.com/lh3/minimap2', description = 'Minimap2 python binding', long_description = readme(), diff --git a/sketch.c b/sketch.c index 1f6b7da4..f8306938 100644 --- a/sketch.c +++ b/sketch.c @@ -141,58 +141,3 @@ void mm_sketch(void *km, const char *str, int len, int w, int k, uint32_t rid, i if (min.x != UINT64_MAX) 
kv_push(mm128_t, km, *p, min); } - -void mm_sketch_syncmer(void *km, const char *str, int len, int smer, int k, uint32_t rid, int is_hpc, mm128_v *p) -{ - uint64_t shift1 = 2 * (k - 1), mask = (1ULL<<2*k) - 1, smask = (1ULL<<2*smer) - 1, kmer[2] = {0,0}; - int i, j, l, buf_pos, min_pos, kmer_span = 0; - tiny_queue_t tq; - - assert(len > 0 && (smer > 0 && smer <= k) && (k > 0 && k <= 28)); // 56 bits for k-mer; could use long k-mers, but 28 enough in practice - memset(&tq, 0, sizeof(tiny_queue_t)); - kv_resize(mm128_t, km, *p, p->n + len/(k - smer)); - - for (i = l = buf_pos = min_pos = 0; i < len; ++i) { - int c = seq_nt4_table[(uint8_t)str[i]]; - if (c < 4) { // not an ambiguous base - int z; - if (is_hpc) { - int skip_len = 1; - if (i + 1 < len && seq_nt4_table[(uint8_t)str[i + 1]] == c) { - for (skip_len = 2; i + skip_len < len; ++skip_len) - if (seq_nt4_table[(uint8_t)str[i + skip_len]] != c) - break; - i += skip_len - 1; // put $i at the end of the current homopolymer run - } - tq_push(&tq, skip_len); - kmer_span += skip_len; - if (tq.count > k) kmer_span -= tq_shift(&tq); - } else kmer_span = l + 1 < k? l + 1 : k; - kmer[0] = (kmer[0] << 2 | c) & mask; // forward k-mer - kmer[1] = (kmer[1] >> 2) | (3ULL^c) << shift1; // reverse k-mer - if (kmer[0] == kmer[1]) continue; // skip "symmetric k-mers" as we don't know it strand - z = kmer[0] < kmer[1]? 0 : 1; // strand - ++l; - if (l >= k && kmer_span < 256) { - uint64_t x, min = UINT64_MAX; - x = hash64(kmer[z], mask); - for (j = 0; j <= k - smer; ++j) { - uint64_t y = x >> (j + j) & smask; - min = min < y? 
min : y; - } - if ((x & smask) == min) { - mm128_t t; - t.x = x << 8 | kmer_span; - t.y = (uint64_t)rid<<32 | (uint32_t)i<<1 | z; - kv_push(mm128_t, km, *p, t); - } - } - } else l = 0, tq.count = tq.front = 0, kmer_span = 0; - } -} - -void mm_sketch2(void *km, const char *str, int len, int w, int k, uint32_t rid, int is_hpc, int is_syncmer, mm128_v *p) -{ - if (is_syncmer) mm_sketch_syncmer(km, str, len, w, k, rid, is_hpc, p); - else mm_sketch(km, str, len, w, k, rid, is_hpc, p); -}