Merge pull request #258 from DaehwanKimLab/release/2.2.1-rc

Release/2.2.1
DaehwanKimLab · Jul 24, 2020 · 4a411a9 · 4a411a9
2 parents 20f24e3 + cdc238c
commit 4a411a9
Show file tree

Hide file tree

Showing 37 changed files with 284 additions and 13,670 deletions.
diff --git a/Makefile b/Makefile
@@ -237,8 +237,6 @@ HT2LIB_PKG_SRC = \
 GENERAL_LIST = $(wildcard scripts/*.sh) \
 	$(wildcard scripts/*.pl) \
 	$(wildcard *.py) \
-	$(wildcard hisatgenotype_modules/*.py) \
-	$(wildcard hisatgenotype_scripts/*.py) \
 	$(wildcard example/index/*.ht2) \
 	$(wildcard example/reads/*.fa) \
 	example/reference/22_20-21M.fa \

diff --git a/VERSION b/VERSION
@@ -1 +1 @@
-2.2.0
+2.2.1
diff --git a/docs/_data/download-binary.yml b/docs/_data/download-binary.yml
@@ -1,5 +1,12 @@
-latest_version: 2.2.0,2.1.0
+latest_version: 2.2.1,2.2.0,2.1.0
 release:
+  - version: 2.2.1
+    date: 7/24/2020
+    name: HISAT2
+    artifacts:
+      Source: https://cloud.biohpc.swmed.edu/index.php/s/fE9QCsX3NH4QwBi/download
+      OSX_x86_64: https://cloud.biohpc.swmed.edu/index.php/s/an8KdGxxRdSRXjr/download
+      Linux_x86_64: https://cloud.biohpc.swmed.edu/index.php/s/4pMgDq4oAF9QCfA/download
   - version: 2.2.0
     date: 2/6/2020
     name: HISAT2

diff --git a/docs/_pages/hisat2.md b/docs/_pages/hisat2.md
@@ -9,6 +9,14 @@ share: false
 **HISAT2** is a fast and sensitive alignment program for mapping next-generation sequencing reads (both DNA and RNA) to a population of human genomes as well as to a single reference genome. Based on an extension of BWT for graphs ([Sir&eacute;n et al. 2014](http://dl.acm.org/citation.cfm?id=2674828)), we designed and implemented a graph FM index (GFM), an original approach and its first implementation. In addition to using one global GFM index that represents a population of human genomes, **HISAT2** uses a large set of small GFM indexes that collectively cover the whole genome. These small indexes (called local indexes), combined with several alignment strategies, enable rapid and accurate alignment of sequencing reads. This new indexing scheme is called a Hierarchical Graph FM index (HGFM).
 
 
+### HISAT 2.2.1 release 7/24/2020
+
+This patch version includes the following changes.
+* Python3 support
+* Removed the HISAT-genotype related scripts. HISAT-genotype has moved to [http://daehwankimlab.github.io/hisat-genotype/](http://daehwankimlab.github.io/hisat-genotype/)
+* Fixed bugs related to the `--read-lengths` option
+
+
 ### HISAT 2.2.0 release 2/6/2020
 
 This major version update includes a new feature to handle “repeat” reads. Based on sets of 100-bp simulated and 101-bp real reads that we tested, we found that 2.6-3.4% and 1.4-1.8% of the reads were mapped to >5 locations and >100 locations, respectively. Attempting to report all alignments would likely consume a prohibitive amount of disk space. In order to address this issue, our repeat indexing and alignment approach directly aligns reads to repeat sequences, resulting in one repeat alignment per read. HISAT2 provides application programming interfaces (API) for C++, Python, and JAVA that rapidly retrieve genomic locations from repeat alignments for use in downstream analyses.  

diff --git a/gfm.h b/gfm.h
@@ -687,12 +687,7 @@ class GFM {
 		useShmem_ = useShmem;
 		_in1Str = in + ".1." + gfm_ext;
 		_in2Str = in + ".2." + gfm_ext;
-        if(readLens != NULL) {
-            _readLens.resizeExact(readLens->size());
-            for(size_t i = 0; i < readLens->size(); i++) {
-                _readLens[i].first = _readLens[i].second = (*readLens)[i];
-            }
-        }
+
         if(skipLoading) return;
 
         if(repeatdb == NULL) {
@@ -805,37 +800,58 @@ class GFM {
         _repeat = false;
         if(repeatdb != NULL) {
             _repeat = true;
+
+			// Number of repeat groups in the index
             index_t numRepeatIndex = readIndex<index_t>(in7, this->toBe());
             assert_gt(numRepeatIndex, 0);
             EList<pair<index_t, index_t> > repeatLens; repeatLens.resizeExact(numRepeatIndex);
+
             for(size_t k = 0; k < numRepeatIndex; k++) {
                 repeatLens[k].first = readIndex<index_t>(in7, this->toBe());
                 repeatLens[k].second = readIndex<index_t>(in7, this->toBe());
             }
-            if(_readLens.empty()) {
-                _readLens = repeatLens;
-            }
-            _readIncluded.resizeExact(numRepeatIndex);
-            _readIncluded.fillZero();
-            size_t k = 0, k2 = 0;
-            while(k < numRepeatIndex && k2 < _readLens.size()) {
-                if(repeatLens[k].first >= _readLens[k2].first) {
-                    _readIncluded[k] = true;
-                    _readLens[k2] = repeatLens[k];
-                    k2++;
-                } else {
-                    k++;
-                }
-            }
-            _readLens.resize(k2);
-            repeatdb->read(in7, this->toBe(), _readIncluded);
+
+			if (readLens != NULL && !readLens->empty()) {
+				// Load subset of repeat groups.
+				size_t k = 0;
+				size_t k2 = 0;
+
+				_repeatIncluded.resizeExact(numRepeatIndex);
+				_repeatIncluded.fillZero();
+
+				while(k < numRepeatIndex && k2 < readLens->size()) {
+					if (repeatLens[k].first >= (*readLens)[k2]) {
+						_repeatIncluded[k] = true;
+						k2++;
+					} else {
+						k++;
+					}
+				}
+
+				// at least last repeat group is included
+				_repeatIncluded[numRepeatIndex - 1] = true;
+
+				_repeatLens.clear();
+				for(size_t i = 0; i < numRepeatIndex; i++) {
+					if (_repeatIncluded[i]) {
+						_repeatLens.push_back(repeatLens[i]);
+					}
+				}
+			} else {
+				// Load all repeat groups
+				_repeatLens = repeatLens;
+				_repeatIncluded.resizeExact(numRepeatIndex);
+				_repeatIncluded.fill(true);
+			}
+
+            repeatdb->read(in7, this->toBe(), _repeatIncluded);
             index_t numKmertables = readIndex<index_t>(in7, this->toBe());
             EList<streampos> filePos; filePos.resizeExact(numKmertables);
             for(size_t k = 0; k < numKmertables; k++) {
                 filePos[k] = readIndex<uint64_t>(in7, this->toBe());
             }
             for(size_t k = 0; k < numKmertables; k++) {
-                if(!_readIncluded[k])
+                if(!_repeatIncluded[k])
                     continue;
                 if(k > 0) {
                     in7.seekg(filePos[k-1]);
@@ -2115,27 +2131,27 @@ class GFM {
                         throw 1;
                     }
 
-                    _readLens.resizeExact(szs.size());
-                    for(size_t i = 0; i < _readLens.size(); i++) {
-                        _readLens[i].first = numeric_limits<index_t>::max();
-                        _readLens[i].second = 0;
+                    _repeatLens.resizeExact(szs.size());
+                    for(size_t i = 0; i < _repeatLens.size(); i++) {
+                        _repeatLens[i].first = numeric_limits<index_t>::max();
+                        _repeatLens[i].second = 0;
                     }
                     for(size_t i = 0; i < repeats.size(); i++) {
                         index_t id = repeats[i].repID;
                         index_t len = repeats[i].repLen;
-                        assert_lt(id, _readLens.size());
-                        if(_readLens[id].first > len) {
-                            _readLens[id].first = len;
+                        assert_lt(id, _repeatLens.size());
+                        if(_repeatLens[id].first > len) {
+                            _repeatLens[id].first = len;
                         }
-                        if(_readLens[id].second < len) {
-                            _readLens[id].second = len;
+                        if(_repeatLens[id].second < len) {
+                            _repeatLens[id].second = len;
                         }
                     }
 
-                    writeIndex<index_t>(fout7, _readLens.size(), this->toBe());
-                    for(size_t i = 0; i < _readLens.size(); i++) {
-                        writeIndex<index_t>(fout7, _readLens[i].first, this->toBe());
-                        writeIndex<index_t>(fout7, _readLens[i].second, this->toBe());
+                    writeIndex<index_t>(fout7, _repeatLens.size(), this->toBe());
+                    for(size_t i = 0; i < _repeatLens.size(); i++) {
+                        writeIndex<index_t>(fout7, _repeatLens[i].first, this->toBe());
+                        writeIndex<index_t>(fout7, _repeatLens[i].second, this->toBe());
                     }
                     _repeatdb.write(fout7, this->toBe());
                     writeIndex<index_t>(fout7, chr_szs.size(), this->toBe()); // number of repeat indexes
@@ -2440,7 +2456,7 @@ class GFM {
 	EList<string>& refnames()        { return _refnames; }
     bool        fw() const           { return fw_; }
     bool        repeat() const       { return _repeat; }
-    const EList<uint8_t>& getReadIncluded() const { return _readIncluded; }
+    const EList<uint8_t>& getRepeatIncluded() const { return _repeatIncluded; }
 
 #ifdef POPCNT_CAPABILITY
     bool _usePOPCNTinstruction;
@@ -4305,8 +4321,8 @@ class GFM {
     EList<RB_KmerTable>        _repeat_kmertables;
 
     bool _repeat;
-    EList<pair<index_t, index_t> > _readLens;
-    EList<uint8_t>                 _readIncluded;
+    EList<pair<index_t, index_t> > _repeatLens;
+    EList<uint8_t>                 _repeatIncluded;
 
 protected:
 

diff --git a/hisat2 b/hisat2
@@ -57,6 +57,7 @@ my $idx_ext_l     = 'ht2l';
 my $idx_ext_s     = 'ht2'; 
 my $idx_ext       = $idx_ext_s; 
 my $seq_in_args = 0;
+my $skip_read_stat = 0;
 my %signo       = ();
 my @signame     = ();
 
@@ -74,6 +75,8 @@ my @signame     = ();
 (-x "$align_prog") ||
 	Fail("Expected hisat2 to be in same directory with hisat2-align-s and hisat2-align-l:\n$script_path\n");
 
+(-x "$read_stat_prog") || ($skip_read_stat = 1);
+
 # Get description of arguments from HISAT so that we can distinguish HISAT
 # args from wrapper args
 sub getHt2Desc($) {
@@ -183,6 +186,10 @@ for(my $i = 0; $i < scalar(@ht2_args); $i++) {
 		$large_idx = 1;
 		$ht2_args[$i] = undef;
 	}
+	if($arg eq "--skip-read-lengths") {
+		$skip_read_stat = 1;
+		$ht2_args[$i] = undef
+	}
 	if($arg eq "-c") {
 		$seq_in_args = 1;
 	}
@@ -277,7 +284,9 @@ Info("  Binary args:\n[ @ht2_args ]\n");
 # check read lengths
 # if read_files have more than 1 files, use first one,
 my @read_files = (scalar(@unps) > 0) ? @unps : @mate1s;
-if ((scalar(@read_files) > 0) && ($seq_in_args == 0)) {
+if ((scalar(@read_files) > 0)
+		&& ($seq_in_args == 0)
+		&& ($skip_read_stat == 0)) {
 	Info("Check read length: $read_files[0]\n");
 	my $cmd = "'$read_stat_prog' $read_files[0]";
 	my $read_len_str = "";
@@ -287,7 +296,11 @@ if ((scalar(@read_files) > 0) && ($seq_in_args == 0)) {
 		chomp;
 		next if /^\s*$/;
 		my @ts = split(/ /);
-		$read_len_str = $ts[4];
+		if (scalar(@ts) > 4) {
+			$read_len_str = $ts[4];
+		} else {
+			$read_len_str = "";
+		}
 	}
 	close($fh);
 

diff --git a/hisat2.cpp b/hisat2.cpp
@@ -1779,7 +1779,7 @@ static void parseOption(int next_option, const char *arg) {
             EList<string> str_readLens;
             tokenize(arg, ",", str_readLens);
             for(size_t i = 0; i < str_readLens.size(); i++) {
-                int readLen = parseInt(20, "--read-lengths arg must be at least 20", str_readLens[i].c_str());
+                int readLen = parseInt(0, "--read-lengths arg must be at least 0", str_readLens[i].c_str());
                 readLens.push_back(readLen);
             }
             readLens.sort();
@@ -4052,7 +4052,7 @@ static void driver(
 
         BitPairReference* rrefs = NULL;
         if(rep_index_exists && use_repeat_index) {
-            const EList<uint8_t>& included = rgfm->getReadIncluded();
+            const EList<uint8_t>& included = rgfm->getRepeatIncluded();
             rrefs = new BitPairReference(
                                          rep_adjIdxBase,
                                          &included,

diff --git a/hisat2_extract_exons.py b/hisat2_extract_exons.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 
 #
 # Copyright 2015, Daehwan Kim <[email protected]>
@@ -19,8 +19,6 @@
 # along with HISAT 2.  If not, see <http://www.gnu.org/licenses/>.
 #
 
-from __future__ import print_function
-
 from sys import stderr, exit
 from collections import defaultdict as dd, Counter
 from argparse import ArgumentParser, FileType