Skip to content

Commit

Permalink
Merge pull request #258 from DaehwanKimLab/release/2.2.1-rc
Browse files Browse the repository at this point in the history
Release/2.2.1
  • Loading branch information
parkchanhee authored Jul 24, 2020
2 parents 20f24e3 + cdc238c commit 4a411a9
Show file tree
Hide file tree
Showing 37 changed files with 284 additions and 13,670 deletions.
2 changes: 0 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -237,8 +237,6 @@ HT2LIB_PKG_SRC = \
GENERAL_LIST = $(wildcard scripts/*.sh) \
$(wildcard scripts/*.pl) \
$(wildcard *.py) \
$(wildcard hisatgenotype_modules/*.py) \
$(wildcard hisatgenotype_scripts/*.py) \
$(wildcard example/index/*.ht2) \
$(wildcard example/reads/*.fa) \
example/reference/22_20-21M.fa \
Expand Down
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
2.2.0
2.2.1
9 changes: 8 additions & 1 deletion docs/_data/download-binary.yml
Original file line number Diff line number Diff line change
@@ -1,5 +1,12 @@
latest_version: 2.2.0,2.1.0
latest_version: 2.2.1,2.2.0,2.1.0
release:
- version: 2.2.1
date: 7/24/2020
name: HISAT2
artifacts:
Source: https://cloud.biohpc.swmed.edu/index.php/s/fE9QCsX3NH4QwBi/download
OSX_x86_64: https://cloud.biohpc.swmed.edu/index.php/s/an8KdGxxRdSRXjr/download
Linux_x86_64: https://cloud.biohpc.swmed.edu/index.php/s/4pMgDq4oAF9QCfA/download
- version: 2.2.0
date: 2/6/2020
name: HISAT2
Expand Down
8 changes: 8 additions & 0 deletions docs/_pages/hisat2.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,14 @@ share: false
**HISAT2** is a fast and sensitive alignment program for mapping next-generation sequencing reads (both DNA and RNA) to a population of human genomes as well as to a single reference genome. Based on an extension of BWT for graphs ([Sirén et al. 2014](http://dl.acm.org/citation.cfm?id=2674828)), we designed and implemented a graph FM index (GFM), an original approach and its first implementation. In addition to using one global GFM index that represents a population of human genomes, **HISAT2** uses a large set of small GFM indexes that collectively cover the whole genome. These small indexes (called local indexes), combined with several alignment strategies, enable rapid and accurate alignment of sequencing reads. This new indexing scheme is called a Hierarchical Graph FM index (HGFM).


### HISAT 2.2.1 release 7/24/2020

This patch version includes the following changes.
* Added Python 3 support
* Removed the HISAT-genotype related scripts. HISAT-genotype has moved to [http://daehwankimlab.github.io/hisat-genotype/](http://daehwankimlab.github.io/hisat-genotype/)
* Fixed bugs related to the `--read-lengths` option


### HISAT 2.2.0 release 2/6/2020

This major version update includes a new feature to handle “repeat” reads. Based on sets of 100-bp simulated and 101-bp real reads that we tested, we found that 2.6-3.4% and 1.4-1.8% of the reads were mapped to >5 locations and >100 locations, respectively. Attempting to report all alignments would likely consume a prohibitive amount of disk space. In order to address this issue, our repeat indexing and alignment approach directly aligns reads to repeat sequences, resulting in one repeat alignment per read. HISAT2 provides application programming interfaces (APIs) for C++, Python, and Java that rapidly retrieve genomic locations from repeat alignments for use in downstream analyses.
Expand Down
96 changes: 56 additions & 40 deletions gfm.h
Original file line number Diff line number Diff line change
Expand Up @@ -687,12 +687,7 @@ class GFM {
useShmem_ = useShmem;
_in1Str = in + ".1." + gfm_ext;
_in2Str = in + ".2." + gfm_ext;
if(readLens != NULL) {
_readLens.resizeExact(readLens->size());
for(size_t i = 0; i < readLens->size(); i++) {
_readLens[i].first = _readLens[i].second = (*readLens)[i];
}
}

if(skipLoading) return;

if(repeatdb == NULL) {
Expand Down Expand Up @@ -805,37 +800,58 @@ class GFM {
_repeat = false;
if(repeatdb != NULL) {
_repeat = true;

// Number of repeat groups in the index
index_t numRepeatIndex = readIndex<index_t>(in7, this->toBe());
assert_gt(numRepeatIndex, 0);
EList<pair<index_t, index_t> > repeatLens; repeatLens.resizeExact(numRepeatIndex);

for(size_t k = 0; k < numRepeatIndex; k++) {
repeatLens[k].first = readIndex<index_t>(in7, this->toBe());
repeatLens[k].second = readIndex<index_t>(in7, this->toBe());
}
if(_readLens.empty()) {
_readLens = repeatLens;
}
_readIncluded.resizeExact(numRepeatIndex);
_readIncluded.fillZero();
size_t k = 0, k2 = 0;
while(k < numRepeatIndex && k2 < _readLens.size()) {
if(repeatLens[k].first >= _readLens[k2].first) {
_readIncluded[k] = true;
_readLens[k2] = repeatLens[k];
k2++;
} else {
k++;
}
}
_readLens.resize(k2);
repeatdb->read(in7, this->toBe(), _readIncluded);

if (readLens != NULL && !readLens->empty()) {
// Load subset of repeat groups.
size_t k = 0;
size_t k2 = 0;

_repeatIncluded.resizeExact(numRepeatIndex);
_repeatIncluded.fillZero();

while(k < numRepeatIndex && k2 < readLens->size()) {
if (repeatLens[k].first >= (*readLens)[k2]) {
_repeatIncluded[k] = true;
k2++;
} else {
k++;
}
}

// at least last repeat group is included
_repeatIncluded[numRepeatIndex - 1] = true;

_repeatLens.clear();
for(size_t i = 0; i < numRepeatIndex; i++) {
if (_repeatIncluded[i]) {
_repeatLens.push_back(repeatLens[i]);
}
}
} else {
// Load all repeat groups
_repeatLens = repeatLens;
_repeatIncluded.resizeExact(numRepeatIndex);
_repeatIncluded.fill(true);
}

repeatdb->read(in7, this->toBe(), _repeatIncluded);
index_t numKmertables = readIndex<index_t>(in7, this->toBe());
EList<streampos> filePos; filePos.resizeExact(numKmertables);
for(size_t k = 0; k < numKmertables; k++) {
filePos[k] = readIndex<uint64_t>(in7, this->toBe());
}
for(size_t k = 0; k < numKmertables; k++) {
if(!_readIncluded[k])
if(!_repeatIncluded[k])
continue;
if(k > 0) {
in7.seekg(filePos[k-1]);
Expand Down Expand Up @@ -2115,27 +2131,27 @@ class GFM {
throw 1;
}

_readLens.resizeExact(szs.size());
for(size_t i = 0; i < _readLens.size(); i++) {
_readLens[i].first = numeric_limits<index_t>::max();
_readLens[i].second = 0;
_repeatLens.resizeExact(szs.size());
for(size_t i = 0; i < _repeatLens.size(); i++) {
_repeatLens[i].first = numeric_limits<index_t>::max();
_repeatLens[i].second = 0;
}
for(size_t i = 0; i < repeats.size(); i++) {
index_t id = repeats[i].repID;
index_t len = repeats[i].repLen;
assert_lt(id, _readLens.size());
if(_readLens[id].first > len) {
_readLens[id].first = len;
assert_lt(id, _repeatLens.size());
if(_repeatLens[id].first > len) {
_repeatLens[id].first = len;
}
if(_readLens[id].second < len) {
_readLens[id].second = len;
if(_repeatLens[id].second < len) {
_repeatLens[id].second = len;
}
}

writeIndex<index_t>(fout7, _readLens.size(), this->toBe());
for(size_t i = 0; i < _readLens.size(); i++) {
writeIndex<index_t>(fout7, _readLens[i].first, this->toBe());
writeIndex<index_t>(fout7, _readLens[i].second, this->toBe());
writeIndex<index_t>(fout7, _repeatLens.size(), this->toBe());
for(size_t i = 0; i < _repeatLens.size(); i++) {
writeIndex<index_t>(fout7, _repeatLens[i].first, this->toBe());
writeIndex<index_t>(fout7, _repeatLens[i].second, this->toBe());
}
_repeatdb.write(fout7, this->toBe());
writeIndex<index_t>(fout7, chr_szs.size(), this->toBe()); // number of repeat indexes
Expand Down Expand Up @@ -2440,7 +2456,7 @@ class GFM {
EList<string>& refnames() { return _refnames; }
bool fw() const { return fw_; }
bool repeat() const { return _repeat; }
const EList<uint8_t>& getReadIncluded() const { return _readIncluded; }
const EList<uint8_t>& getRepeatIncluded() const { return _repeatIncluded; }

#ifdef POPCNT_CAPABILITY
bool _usePOPCNTinstruction;
Expand Down Expand Up @@ -4305,8 +4321,8 @@ class GFM {
EList<RB_KmerTable> _repeat_kmertables;

bool _repeat;
EList<pair<index_t, index_t> > _readLens;
EList<uint8_t> _readIncluded;
EList<pair<index_t, index_t> > _repeatLens;
EList<uint8_t> _repeatIncluded;

protected:

Expand Down
17 changes: 15 additions & 2 deletions hisat2
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ my $idx_ext_l = 'ht2l';
my $idx_ext_s = 'ht2';
my $idx_ext = $idx_ext_s;
my $seq_in_args = 0;
my $skip_read_stat = 0;
my %signo = ();
my @signame = ();

Expand All @@ -74,6 +75,8 @@ my @signame = ();
(-x "$align_prog") ||
Fail("Expected hisat2 to be in same directory with hisat2-align-s and hisat2-align-l:\n$script_path\n");

(-x "$read_stat_prog") || ($skip_read_stat = 1);

# Get description of arguments from HISAT so that we can distinguish HISAT
# args from wrapper args
sub getHt2Desc($) {
Expand Down Expand Up @@ -183,6 +186,10 @@ for(my $i = 0; $i < scalar(@ht2_args); $i++) {
$large_idx = 1;
$ht2_args[$i] = undef;
}
if($arg eq "--skip-read-lengths") {
$skip_read_stat = 1;
$ht2_args[$i] = undef
}
if($arg eq "-c") {
$seq_in_args = 1;
}
Expand Down Expand Up @@ -277,7 +284,9 @@ Info(" Binary args:\n[ @ht2_args ]\n");
# check read lengths
# if read_files have more than 1 files, use first one,
my @read_files = (scalar(@unps) > 0) ? @unps : @mate1s;
if ((scalar(@read_files) > 0) && ($seq_in_args == 0)) {
if ((scalar(@read_files) > 0)
&& ($seq_in_args == 0)
&& ($skip_read_stat == 0)) {
Info("Check read length: $read_files[0]\n");
my $cmd = "'$read_stat_prog' $read_files[0]";
my $read_len_str = "";
Expand All @@ -287,7 +296,11 @@ if ((scalar(@read_files) > 0) && ($seq_in_args == 0)) {
chomp;
next if /^\s*$/;
my @ts = split(/ /);
$read_len_str = $ts[4];
if (scalar(@ts) > 4) {
$read_len_str = $ts[4];
} else {
$read_len_str = "";
}
}
close($fh);

Expand Down
4 changes: 2 additions & 2 deletions hisat2.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1779,7 +1779,7 @@ static void parseOption(int next_option, const char *arg) {
EList<string> str_readLens;
tokenize(arg, ",", str_readLens);
for(size_t i = 0; i < str_readLens.size(); i++) {
int readLen = parseInt(20, "--read-lengths arg must be at least 20", str_readLens[i].c_str());
int readLen = parseInt(0, "--read-lengths arg must be at least 0", str_readLens[i].c_str());
readLens.push_back(readLen);
}
readLens.sort();
Expand Down Expand Up @@ -4052,7 +4052,7 @@ static void driver(

BitPairReference* rrefs = NULL;
if(rep_index_exists && use_repeat_index) {
const EList<uint8_t>& included = rgfm->getReadIncluded();
const EList<uint8_t>& included = rgfm->getRepeatIncluded();
rrefs = new BitPairReference(
rep_adjIdxBase,
&included,
Expand Down
4 changes: 1 addition & 3 deletions hisat2_extract_exons.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/usr/bin/env python
#!/usr/bin/env python3

#
# Copyright 2015, Daehwan Kim <[email protected]>
Expand All @@ -19,8 +19,6 @@
# along with HISAT 2. If not, see <http://www.gnu.org/licenses/>.
#

from __future__ import print_function

from sys import stderr, exit
from collections import defaultdict as dd, Counter
from argparse import ArgumentParser, FileType
Expand Down
Loading

0 comments on commit 4a411a9

Please sign in to comment.