From dd419fb17b5e2bb9bfc259a0f9e4cb951bfa0940 Mon Sep 17 00:00:00 2001 From: Daniel Standage Date: Mon, 30 Jan 2017 09:54:49 -0800 Subject: [PATCH 1/8] Add option to sidestep NFS issues --- scripts/trim-low-abund.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/scripts/trim-low-abund.py b/scripts/trim-low-abund.py index 597162e7b8..0b3aa44632 100755 --- a/scripts/trim-low-abund.py +++ b/scripts/trim-low-abund.py @@ -136,6 +136,8 @@ def get_parser(): parser.add_argument('--tempdir', '-T', type=str, default='./', help="Set location of temporary directory for " "second pass") + parser.add_argument('--keep-temp', action='store_true', help='do not ' + 'delete temporary files or directories') add_output_compression_type(parser) parser.add_argument('--diginorm', default=False, action='store_true', @@ -439,8 +441,12 @@ def main(): if not args.output: trimfp.close() - log_info('removing temp directory & contents ({temp})', temp=tempdir) - shutil.rmtree(tempdir) + if args.keep_temp: + log_info('--keep-temp mode, not removing temp directory "{temp}"', + temp=tempdir) + else: + log_info('removing temp directory & contents ({temp})', temp=tempdir) + shutil.rmtree(tempdir) trimmed_reads = trimmer.trimmed_reads From 915069c89945ef2f6061ca7da3e64e4dc782ecf8 Mon Sep 17 00:00:00 2001 From: Daniel Standage Date: Mon, 30 Jan 2017 14:18:05 -0800 Subject: [PATCH 2/8] Updated change log --- CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 863491d76c..400cda7b9e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,9 @@ under semantic versioning, but will be in future versions of khmer. ## [Unreleased] ### Added +- New `--keep-temp` option for `trim-low-abund.py` that does not delete the + temporary directory created by the script. This sidesteps NFS-related issues + we've seen on multiple HPC file systems. - New `--no-reformat` option for `interleave-reads.py` script disables default read name correction behavior. - New `HashSet` data structure for managing collections of k-mer hashes and From 91fbfa73d23b2a79d48ac3d30606b53d3bc6daf6 Mon Sep 17 00:00:00 2001 From: Tim Head Date: Wed, 1 Feb 2017 15:00:01 +0100 Subject: [PATCH 3/8] Add close method to ReadParser --- khmer/_khmer.cc | 18 ++++++++++++++++++ lib/read_parsers.cc | 10 ++++++++++ lib/read_parsers.hh | 2 ++ scripts/trim-low-abund.py | 5 ++++- 4 files changed, 34 insertions(+), 1 deletion(-) diff --git a/khmer/_khmer.cc b/khmer/_khmer.cc index 0bfa450872..91e1f30f27 100644 --- a/khmer/_khmer.cc +++ b/khmer/_khmer.cc @@ -284,6 +284,10 @@ static bool convert_Pytablesizes_to_vector(PyListObject * sizes_list_o, } +static +read_parsers::IParser * +_PyObject_to_khmer_ReadParser(PyObject * py_object); + /***********************************************************************/ // @@ -791,6 +795,16 @@ ReadParser_iter_read_pairs(PyObject * self, PyObject * args ) } +PyObject * +ReadParser_close(PyObject * self, PyObject * args) +{ + read_parsers::IParser* rparser = _PyObject_to_khmer_ReadParser(self); + rparser->close(); + + Py_INCREF(Py_None); + return Py_None; +} + static PyMethodDef _ReadParser_methods [ ] = { { "iter_reads", (PyCFunction)ReadParser_iter_reads, @@ -800,6 +814,10 @@ static PyMethodDef _ReadParser_methods [ ] = { "iter_read_pairs", (PyCFunction)ReadParser_iter_read_pairs, METH_VARARGS, "Iterates over paired reads as pairs." }, + { + "close", (PyCFunction)ReadParser_close, + METH_NOARGS, "Close associated files." + }, { NULL, NULL, 0, NULL } // sentinel }; diff --git a/lib/read_parsers.cc b/lib/read_parsers.cc index 9ccf92de5a..d3bf7509c2 100644 --- a/lib/read_parsers.cc +++ b/lib/read_parsers.cc @@ -232,6 +232,12 @@ bool ReadParser::is_complete() return _parser->is_complete(); } +template +void ReadParser::close() +{ + _parser->close(); +} + void FastxReader::_init() { seqan::open(_stream, _filename.c_str()); @@ -286,6 +292,10 @@ size_t FastxReader::get_num_reads() return _num_reads; } +void FastxParser::close() { + seqan::close(_stream); +} + Read FastxReader::get_next_read() { Read read; diff --git a/lib/read_parsers.hh b/lib/read_parsers.hh index a92771fcaf..a094d576ef 100644 --- a/lib/read_parsers.hh +++ b/lib/read_parsers.hh @@ -167,6 +167,7 @@ public: size_t get_num_reads(); bool is_complete(); + void close(); }; // class ReadParser @@ -189,6 +190,7 @@ public: Read get_next_read(); bool is_complete(); size_t get_num_reads(); + void close(); }; // class FastxReader diff --git a/scripts/trim-low-abund.py b/scripts/trim-low-abund.py index 597162e7b8..a8178bc228 100755 --- a/scripts/trim-low-abund.py +++ b/scripts/trim-low-abund.py @@ -414,7 +414,8 @@ def main(): # so pairs will stay together if not orphaned. This is in contrast # to the first loop. Hence, force_single=True below. - paired_iter = broken_paired_reader(ReadParser(pass2filename), + read_parser = ReadParser(pass2filename) + paired_iter = broken_paired_reader(read_parser, min_length=K, force_single=True) @@ -432,6 +433,8 @@ def main(): written_reads += 1 written_bp += len(read) + read_parser.close() + log_info('removing {pass2}', pass2=pass2filename) os.unlink(pass2filename) From 791dd4da98010201a3192c6674e5ab57051c542a Mon Sep 17 00:00:00 2001 From: Tim Head Date: Thu, 2 Feb 2017 15:03:56 +0100 Subject: [PATCH 4/8] Formating and adjust to new FastxReader --- khmer/_khmer.cc | 18 +++++++++--------- lib/read_parsers.cc | 35 +++++++++++++++++------------------ lib/read_parsers.hh | 3 +-- 3 files changed, 27 insertions(+), 29 deletions(-) diff --git a/khmer/_khmer.cc b/khmer/_khmer.cc index 91e1f30f27..47b88efead 100644 --- a/khmer/_khmer.cc +++ b/khmer/_khmer.cc @@ -284,9 +284,7 @@ static bool convert_Pytablesizes_to_vector(PyListObject * sizes_list_o, } -static -read_parsers::IParser * -_PyObject_to_khmer_ReadParser(PyObject * py_object); +static FastxParserPtr& _PyObject_to_khmer_ReadParser(PyObject * py_object); /***********************************************************************/ @@ -798,7 +796,7 @@ ReadParser_iter_read_pairs(PyObject * self, PyObject * args ) PyObject * ReadParser_close(PyObject * self, PyObject * args) { - read_parsers::IParser* rparser = _PyObject_to_khmer_ReadParser(self); + FastxParserPtr& rparser = _PyObject_to_khmer_ReadParser(self); rparser->close(); Py_INCREF(Py_None); @@ -884,7 +882,8 @@ void _init_ReadParser_Type_constants() // Place pair mode constants into class dictionary. int result; - PyObject *value = PyLong_FromLong(ReadParser::PAIR_MODE_IGNORE_UNPAIRED); + PyObject *value = PyLong_FromLong( + ReadParser::PAIR_MODE_IGNORE_UNPAIRED); if (value == NULL) { Py_DECREF(cls_attrs_DICT); return; @@ -2324,7 +2323,7 @@ CPYCHECKER_TYPE_OBJECT_FOR_TYPEDEF("khmer_KHashtable_Object") = { PyVarObject_HEAD_INIT(NULL, 0) /* init & ob_size */ "_khmer.KHashtable ", /*tp_name*/ - sizeof(khmer_KHashtable_Object) , /*tp_basicsize*/ + sizeof(khmer_KHashtable_Object), /*tp_basicsize*/ 0, /*tp_itemsize*/ 0, /*tp_dealloc*/ 0, /*tp_print*/ @@ -2954,7 +2953,7 @@ labelhash_consume_fasta_and_tag_with_labels(khmer_KGraphLabels_Object * me, //Py_BEGIN_ALLOW_THREADS try { hb->consume_fasta_and_tag_with_labels(filename, total_reads, - n_consumed); + n_consumed); } catch (khmer_file_exception &exc) { exc_string = exc.what(); file_exception = exc_string.c_str(); @@ -3784,8 +3783,9 @@ static PyObject * hllcounter_consume_fasta(khmer_KHLLCounter_Object * me, unsigned long long n_consumed = 0; unsigned int total_reads = 0; try { - me->hllcounter->consume_fasta(filename, stream_records, total_reads, - n_consumed); + me->hllcounter->consume_fasta(filename, stream_records, + total_reads, + n_consumed); } catch (khmer_file_exception &exc) { PyErr_SetString(PyExc_OSError, exc.what()); return NULL; diff --git a/lib/read_parsers.cc b/lib/read_parsers.cc index d3bf7509c2..f22dbc13ee 100644 --- a/lib/read_parsers.cc +++ b/lib/read_parsers.cc @@ -209,11 +209,9 @@ ReadPair ReadParser::get_next_read_pair(uint8_t mode) { if (mode == ReadParser::PAIR_MODE_IGNORE_UNPAIRED) { return _get_next_read_pair_in_ignore_mode(); - } - else if (mode == ReadParser::PAIR_MODE_ERROR_ON_UNPAIRED) { + } else if (mode == ReadParser::PAIR_MODE_ERROR_ON_UNPAIRED) { return _get_next_read_pair_in_error_mode(); - } - else { + } else { std::ostringstream oss; oss << "Unknown pair reading mode: " << mode; throw UnknownPairReadingMode(oss.str()); @@ -254,25 +252,25 @@ void FastxReader::_init() } FastxReader::FastxReader() - : _filename("-"), _spin_lock(0), _num_reads(0), _have_qualities(false) + : _filename("-"), _spin_lock(0), _num_reads(0), _have_qualities(false) { _init(); } FastxReader::FastxReader(const std::string& infile) - : _filename(infile), - _spin_lock(0), - _num_reads(0), - _have_qualities(false) + : _filename(infile), + _spin_lock(0), + _num_reads(0), + _have_qualities(false) { _init(); } FastxReader::FastxReader(FastxReader& other) - : _filename(other._filename), - _spin_lock(other._spin_lock), - _num_reads(other._num_reads), - _have_qualities(other._have_qualities) + : _filename(other._filename), + _spin_lock(other._spin_lock), + _num_reads(other._num_reads), + _have_qualities(other._have_qualities) { _stream = std::move(other._stream); } @@ -292,7 +290,8 @@ size_t FastxReader::get_num_reads() return _num_reads; } -void FastxParser::close() { +void FastxReader::close() +{ seqan::close(_stream); } @@ -345,10 +344,10 @@ template ReadParserPtr get_parser(const std::string& filename) { return ReadParserPtr( - new ReadParser( - std::unique_ptr(new SeqIO(filename)) - ) - ); + new ReadParser( + std::unique_ptr(new SeqIO(filename)) + ) + ); } // All template instantiations used in the codebase must be declared here. diff --git a/lib/read_parsers.hh b/lib/read_parsers.hh index a094d576ef..2773b1ff67 100644 --- a/lib/read_parsers.hh +++ b/lib/read_parsers.hh @@ -93,8 +93,7 @@ struct InvalidReadPair : public khmer_value_exception { unsigned char _to_valid_dna(const unsigned char c); -struct Read -{ +struct Read { std::string name; std::string description; std::string sequence; From 2e03fe3b97ecaff0add8ae80afb6e5b3e8cac8e6 Mon Sep 17 00:00:00 2001 From: Daniel Standage Date: Mon, 6 Feb 2017 16:13:12 -0800 Subject: [PATCH 5/8] Catch OSError instead of adding a script option --- CHANGELOG.md | 3 --- scripts/trim-low-abund.py | 10 ++++------ 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b3478f8bb2..5132a476d8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,9 +8,6 @@ under semantic versioning, but will be in future versions of khmer. ## [Unreleased] ### Added -- New `--keep-temp` option for `trim-low-abund.py` that does not delete the - temporary directory created by the script. This sidesteps NFS-related issues - we've seen on multiple HPC file systems. - New `--no-reformat` option for `interleave-reads.py` script disables default read name correction behavior. - New `HashSet` data structure for managing collections of k-mer hashes and diff --git a/scripts/trim-low-abund.py b/scripts/trim-low-abund.py index 0b3aa44632..023c588c4c 100755 --- a/scripts/trim-low-abund.py +++ b/scripts/trim-low-abund.py @@ -136,8 +136,6 @@ def get_parser(): parser.add_argument('--tempdir', '-T', type=str, default='./', help="Set location of temporary directory for " "second pass") - parser.add_argument('--keep-temp', action='store_true', help='do not ' - 'delete temporary files or directories') add_output_compression_type(parser) parser.add_argument('--diginorm', default=False, action='store_true', @@ -441,12 +439,12 @@ def main(): if not args.output: trimfp.close() - if args.keep_temp: - log_info('--keep-temp mode, not removing temp directory "{temp}"', - temp=tempdir) - else: + try: log_info('removing temp directory & contents ({temp})', temp=tempdir) shutil.rmtree(tempdir) + except OSError as oe: + log_info('WARNING: unable to remove {temp} (probably an NFS issue); ' + 'please remove manually', temp=tempdir) trimmed_reads = trimmer.trimmed_reads From bef16bb981005715de6b15d344f3d4d1a6364024 Mon Sep 17 00:00:00 2001 From: Tim Head Date: Mon, 13 Feb 2017 16:14:04 +0100 Subject: [PATCH 6/8] Fix generation of coverage for c++ code --- .travis.yml | 3 ++- Makefile | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 9b1604be73..23f8ee3325 100644 --- a/.travis.yml +++ b/.travis.yml @@ -12,7 +12,7 @@ notifications: # for a PR. So we disable build-on-push for all branches # except the ones whitelisted here. branches: - only: + only: - master matrix: @@ -69,6 +69,7 @@ script: # generate all the diagnostic reports after_success: + - make clean - PYTEST_ADDOPTS=-qqq make coverage-gcovr.xml coverage.xml # Fix suggested by http://diff-cover.readthedocs.io/en/latest/#troubleshooting - git fetch origin master:refs/remotes/origin/master diff --git a/Makefile b/Makefile index b4ecc23901..3d6982c49a 100644 --- a/Makefile +++ b/Makefile @@ -136,6 +136,7 @@ clean: FORCE rm -f diff-cover.html rm -Rf build dist rm -rf __pycache__/ .eggs/ khmer.egg-info/ + -rm *.gcov debug: FORCE export CFLAGS="-pg -fprofile-arcs -D_GLIBCXX_DEBUG_PEDANTIC \ From f234a9f25ccebdf08b7b6275779a0b0c26ed6b92 Mon Sep 17 00:00:00 2001 From: Luiz Irber Date: Wed, 27 Apr 2016 10:47:19 -0700 Subject: [PATCH 7/8] Add test to trigger #1353, implement one possible fix --- lib/kmer_hash.cc | 6 ++++++ tests/test_functions.py | 1 + 2 files changed, 7 insertions(+) diff --git a/lib/kmer_hash.cc b/lib/kmer_hash.cc index fcfbc4639b..4f21e1554a 100644 --- a/lib/kmer_hash.cc +++ b/lib/kmer_hash.cc @@ -184,6 +184,12 @@ HashIntoType _hash_murmur(const std::string& kmer, h = out[0]; std::string rev = khmer::_revcomp(kmer); + if (rev == kmer) { + // self complement kmer, can't use bitwise XOR + r = out[0]; + return h; + } + MurmurHash3_x64_128((void *)rev.c_str(), rev.size(), seed, &out); r = out[0]; diff --git a/tests/test_functions.py b/tests/test_functions.py index 1cb86bb433..c537d2e465 100644 --- a/tests/test_functions.py +++ b/tests/test_functions.py @@ -153,6 +153,7 @@ def test_hash_murmur3(): assert khmer.hash_murmur3('TTTT') == 526240128537019279 assert khmer.hash_murmur3('CCCC') == 14391997331386449225 assert khmer.hash_murmur3('GGGG') == 14391997331386449225 + assert khmer.hash_murmur3('TATA') != 0 def test_hash_no_rc_murmur3(): From 9b4b03852f0afd70f50277cced09539bceef94d9 Mon Sep 17 00:00:00 2001 From: Luiz Irber Date: Wed, 27 Apr 2016 10:56:09 -0700 Subject: [PATCH 8/8] Uses testcases suggested by @dkoslicki --- lib/kmer_hash.cc | 2 +- tests/test_functions.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/lib/kmer_hash.cc b/lib/kmer_hash.cc index 4f21e1554a..fb2eca61ef 100644 --- a/lib/kmer_hash.cc +++ b/lib/kmer_hash.cc @@ -188,7 +188,7 @@ HashIntoType _hash_murmur(const std::string& kmer, // self complement kmer, can't use bitwise XOR r = out[0]; return h; - } + } MurmurHash3_x64_128((void *)rev.c_str(), rev.size(), seed, &out); r = out[0]; diff --git a/tests/test_functions.py b/tests/test_functions.py index c537d2e465..80cb6661a5 100644 --- a/tests/test_functions.py +++ b/tests/test_functions.py @@ -153,7 +153,9 @@ def test_hash_murmur3(): assert khmer.hash_murmur3('TTTT') == 526240128537019279 assert khmer.hash_murmur3('CCCC') == 14391997331386449225 assert khmer.hash_murmur3('GGGG') == 14391997331386449225 - assert khmer.hash_murmur3('TATA') != 0 + assert khmer.hash_murmur3('TATATATATATATATATATA') != 0 + assert khmer.hash_murmur3('TTTTGCAAAA') != 0 + assert khmer.hash_murmur3('GAAAATTTTC') != 0 def test_hash_no_rc_murmur3():