diff --git a/.gitignore b/.gitignore index 300b2c4b86..40c917a9c9 100644 --- a/.gitignore +++ b/.gitignore @@ -54,3 +54,4 @@ pep257_report.txt .cache/ khmer/_oxli/*.cpp .eggs +tags diff --git a/Makefile b/Makefile index 568a4c4626..bc5f065c2e 100644 --- a/Makefile +++ b/Makefile @@ -106,11 +106,14 @@ install-dependencies: pip install --requirement doc/requirements.txt ## sharedobj : build khmer shared object file -sharedobj: $(EXTENSION_MODULE) +sharedobj: $(EXTENSION_MODULE) $(CYTHON_MODULE) $(EXTENSION_MODULE): $(CPPSOURCES) $(CYSOURCES) ./setup.py build_ext --inplace +$(CYTHON_MODULE): $(CPPSOURCES) $(CYSOURCES) + ./setup.py build_ext --inplace + coverage-debug: $(CPPSOURCES) export CFLAGS="-pg -fprofile-arcs -ftest-coverage -O0"; ./setup.py \ build_ext --debug --inplace --libraries gcov @@ -144,6 +147,7 @@ clean: FORCE rm -rf __pycache__/ khmer.egg-info/ @find ./ -type d -name __pycache__ -exec rm -rf {} + @find ./khmer/ -type f -name *$(MODEXT) -exec rm -f {} + + @find ./khmer/_oxli/ -type f -name *.so -exec rm -f {} + -rm -f *.gcov debug: FORCE @@ -285,6 +289,8 @@ install-liboxli: liboxli cd src/oxli && $(MAKE) install PREFIX=$(PREFIX) mkdir -p $(PREFIX)/include/khmer cp -r include/khmer/_cpy_*.hh $(PREFIX)/include/khmer/ + cp include/oxli/oxli_exception_convert.hh $(PREFIX)/include/oxli/ + cp third-party/rollinghash/*.h $(PREFIX)/include/oxli/ # Runs a test of liboxli libtest: FORCE diff --git a/include/khmer/_cpy_khmer.hh b/include/khmer/_cpy_khmer.hh index a9c9e8b82c..78b8bd150f 100644 --- a/include/khmer/_cpy_khmer.hh +++ b/include/khmer/_cpy_khmer.hh @@ -44,7 +44,6 @@ Contact: khmer-project@idyll.org #include -#include #include "_cpy_utils.hh" @@ -77,20 +76,6 @@ Contact: khmer-project@idyll.org namespace khmer { -PyObject * forward_hash(PyObject * self, PyObject * args); - -PyObject * forward_hash_no_rc(PyObject * self, PyObject * args); - -PyObject * reverse_hash(PyObject * self, PyObject * args); - -PyObject * murmur3_forward_hash(PyObject * self, PyObject * args); - -PyObject * murmur3_forward_hash_no_rc(PyObject * self, PyObject * args); - -PyObject * reverse_complement(PyObject * self, PyObject * args); - -PyObject * get_version_cpp( PyObject * self, PyObject * args ); - extern PyMethodDef KhmerMethods[]; } diff --git a/include/oxli/assembler.hh b/include/oxli/assembler.hh index 48bbe9164e..85fbdf2bd7 100644 --- a/include/oxli/assembler.hh +++ b/include/oxli/assembler.hh @@ -53,6 +53,7 @@ namespace oxli class Hashgraph; class LabelHash; + /** * \class LinearAssembler * @@ -78,8 +79,10 @@ public: WordLength _ksize; const Hashgraph * graph; + std::shared_ptr global_visited; - explicit LinearAssembler(const Hashgraph * ht); + explicit LinearAssembler(const Hashgraph * ht, + std::shared_ptr global_visited = nullptr); virtual std::string assemble(const Kmer seed_kmer, const Hashgraph * stop_bf = 0) const; @@ -97,12 +100,36 @@ public: // The explicit specializations need to be declared in the same translation unit // as their unspecialized declaration. 
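+// Editor's sketch (not part of this patch): the new global_visited argument
+// lets several assembler instances share one record of already-walked
+// k-mers, e.g. (assuming a SeenSet as the shared container and an existing
+// Hashgraph *graph -- both assumptions, not shown in this hunk):
+//
+//     auto visited = std::make_shared<SeenSet>();
+//     LinearAssembler assem(graph, visited);
+//     std::string contig = assem.assemble(graph->build_kmer(seed));
+//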
template<> -std::string LinearAssembler::_assemble_directed(AssemblerTraverser - &cursor) const; +std::string LinearAssembler::_assemble_directed(AssemblerTraverser &cursor) const; template<> -std::string LinearAssembler::_assemble_directed(AssemblerTraverser - &cursor) const; +std::string LinearAssembler::_assemble_directed(AssemblerTraverser &cursor) const; + + +class CompactingAssembler: public LinearAssembler +{ +public: + + explicit CompactingAssembler(const Hashgraph* ht, + std::shared_ptr global_visited=nullptr) + : LinearAssembler(ht, global_visited) {} + + virtual std::string assemble(const Kmer seed_kmer, + const Hashgraph * stop_bf) const; + + virtual std::string assemble_right(const Kmer seed_kmer, + const Hashgraph * stop_bf = 0) const; + + virtual std::string assemble_left(const Kmer seed_kmer, + const Hashgraph * stop_bf = 0) const; + + template + std::string _assemble_directed(CompactingAT& cursor) const + { + return LinearAssembler::_assemble_directed(cursor); + } +}; +typedef CompactingAssembler CpCompactingAssembler; /** @@ -160,7 +187,6 @@ public: explicit JunctionCountAssembler(Hashgraph * ht); ~JunctionCountAssembler(); - StringVector assemble(const Kmer seed_kmer, const Hashtable * stop_bf=0) const; diff --git a/include/oxli/cdbg.hh b/include/oxli/cdbg.hh new file mode 100644 index 0000000000..2724b7e748 --- /dev/null +++ b/include/oxli/cdbg.hh @@ -0,0 +1,1132 @@ +/* +This file is part of khmer, https://github.com/dib-lab/khmer/, and is +Copyright (C) 2015-2016, The Regents of the University of California. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + + * Neither the name of the Michigan State University nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+LICENSE (END) + +Contact: khmer-project@idyll.org +*/ +#ifndef CDBG_HH +#define CDBG_HH + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "oxli.hh" +#include "kmer_hash.hh" +#include "hashtable.hh" +#include "hashgraph.hh" +#include "kmer_filters.hh" +#include "traversal.hh" +#include "assembler.hh" +#include "alphabets.hh" + +#define DEBUG_CDBG +# ifdef DEBUG_CDBG +# define pdebug(x) do { std::cout << std::endl << "@ " << __FILE__ <<\ + ":" << __FUNCTION__ << ":" <<\ + __LINE__ << std::endl << x << std::endl;\ + } while (0) +# else +# define pdebug(x) do {} while (0) +# endif + +#define complement(ch) ((ch) == 'A' ? 'T' : \ + (ch) == 'T' ? 'A' : \ + (ch) == 'C' ? 'G' : 'C') + +namespace oxli { + +typedef uint64_t id_t; +#define NULL_ID ULLONG_MAX + +using std::make_shared; +using std::shared_ptr; + +typedef std::pair HashIDPair; +typedef std::unordered_set UHashSet; +typedef std::vector HashVector; +typedef std::unordered_map HashIDMap; +typedef std::unordered_set IDSet; + + +enum compact_edge_meta_t { + FULL, + TIP, + ISLAND, + TRIVIAL +}; + + +inline const char * edge_meta_repr(compact_edge_meta_t meta) { + switch(meta) { + case FULL: + return "FULL"; + case TIP: + return "TIP"; + case ISLAND: + return "ISLAND"; + case TRIVIAL: + return "TRIVIAL"; + } +} + + +class CompactEdgeFactory; +class CompactEdge { + friend class CompactEdgeFactory; + +public: + + const id_t in_node_id; // left and right HDN IDs + const id_t out_node_id; + const id_t edge_id; + compact_edge_meta_t meta; + std::string sequence; + UHashSet tags; + + CompactEdge(id_t in_node_id, id_t out_node_id, id_t edge_id) : + in_node_id(in_node_id), out_node_id(out_node_id), + meta(FULL), edge_id(edge_id) {} + + CompactEdge(id_t in_node_id, id_t out_node_id, id_t edge_id, + compact_edge_meta_t meta) : + in_node_id(in_node_id), out_node_id(out_node_id), + meta(meta), edge_id(edge_id) {} + + void add_tags(UHashSet& new_tags) { + for (auto tag: new_tags) { + tags.insert(tag); + } + } + + std::string rc_sequence() const { + return _revcomp(sequence); + } + + float tag_density() const { + return (float)sequence.length() / (float)tags.size(); + } + + std::string tag_viz(WordLength K) const { + uint64_t pos; + std::string ret = "L=" + std::to_string(sequence.length()) + " "; + const char * _s = sequence.c_str(); + + for (pos = 0; pos < sequence.length() - K + 1; pos++) { + if (set_contains(tags, _hash(_s+pos, K))) { + ret += ("(" + std::to_string(pos) + ")"); + } + ret += sequence[pos]; + } + return ret; + } + + friend std::ostream& operator<<(std::ostream& stream, + const CompactEdge& edge) { + stream << ""; + return stream; + } + +}; + +typedef std::vector CompactEdgeVector; +typedef std::unordered_map TagEdgeMap; +typedef std::unordered_map IDEdgeMap; +typedef std::pair TagEdgePair; +typedef std::set TagEdgePairSet; +typedef std::set CompactEdgeSet; + +class CompactNodeFactory; +class CompactEdgeFactory : public KmerFactory { + friend class CompactNodeFactory; +protected: + + uint64_t n_compact_edges; + uint64_t _n_updates; + uint32_t tag_density; + + TagEdgeMap tags_to_edges; + IDEdgeMap compact_edges; + +public: + + CompactEdgeFactory(WordLength K) : + + KmerFactory(K), n_compact_edges(0), + _n_updates(0) { + + tag_density = DEFAULT_TAG_DENSITY; + } + + uint64_t n_edges() const { + return n_compact_edges; + } + + uint64_t n_updates() const { + return _n_updates; + } + + CompactEdge* build_edge(id_t left_id, id_t right_id, + compact_edge_meta_t edge_meta, + 
std::string edge_sequence) { + + CompactEdge* edge = new CompactEdge(left_id, right_id, + _n_updates, edge_meta); + compact_edges[_n_updates] = edge; + + pdebug("new compact edge: \n left=" << std::to_string(left_id) + << std::endl << " right=" << std::to_string(right_id) + << std::endl << " meta=" << edge_meta_repr(edge_meta) + << std::endl << " sequence =" << edge_sequence + << std::endl << " rc_sequence=" << _revcomp(edge_sequence) + << std::endl << " start =" << edge_sequence.substr(0, _ksize+1) + << std::endl << " rc_start=" << _revcomp(edge_sequence.substr(0, _ksize+1)) + << std::endl << " end =" + << edge_sequence.substr(edge_sequence.length()-_ksize-1, _ksize+1) + << std::endl << " rc_end =" + << _revcomp(edge_sequence.substr(edge_sequence.length()-_ksize-1, _ksize+1))); + + edge->sequence = edge_sequence; + n_compact_edges++; + _n_updates++; + return edge; + } + + CompactEdge* get_edge_by_id(id_t id) { + auto search = compact_edges.find(id); + if (search != compact_edges.end()) { + return search->second; + } + return nullptr; + } + + void delete_edge(CompactEdge * edge) { + //pdebug("attempt edge delete @" << edge); + if (edge != nullptr) { + pdebug("edge not null, proceeding"); + for (auto tag: edge->tags) { + tags_to_edges.erase(tag); + } + compact_edges.erase(edge->edge_id); + delete edge; + n_compact_edges--; + _n_updates++; + } + } + + void delete_edge_by_id(id_t id) { + CompactEdge* e = get_edge_by_id(id); + delete_edge(e); + } + + void delete_edge_by_tag(UHashSet& tags) { + CompactEdge* edge = get_edge(tags); + delete_edge(edge); + } + + void delete_edge_by_tag(HashIntoType tag) { + CompactEdge* edge = get_edge(tag); + delete_edge(edge); + } + + CompactEdge* get_edge(HashIntoType tag) const { + //pdebug("get compact edge from tag " << tag); + auto search = tags_to_edges.find(tag); + if (search != tags_to_edges.end()) { + return search->second; + } + return nullptr; + } + + bool get_tag_edge_pair(HashIntoType tag, TagEdgePair& pair) const { + auto search = tags_to_edges.find(tag); + if (search != tags_to_edges.end()) { + pair = *search; + return true; + } else { + return false; + } + } + + CompactEdge* get_edge(UHashSet& tags) const { + CompactEdge * edge = nullptr; + for (auto tag: tags) { + edge = get_edge(tag); + if (edge != nullptr) { + break; + } + } + return edge; + } + + KmerFilter get_tag_stopper(TagEdgePair& te_pair, + bool& found_tag) { const + KmerFilter stopper = [&] (const Kmer& node) { + found_tag = get_tag_edge_pair(node, te_pair); + return found_tag; + }; + + return stopper; + } + + void write_gml(const std::string filename, + const CompactNodeFactory& nodes) const; + void write_fasta(const std::string filename) const; + +}; + + +class CompactNodeFactory; +class CompactNode { + friend class CompactNodeFactory; +public: + Kmer kmer; + uint32_t count; + const id_t node_id; + std::string sequence; + bool direction; + + CompactEdge* in_edges[4] = {nullptr, nullptr, nullptr, nullptr}; + CompactEdge* out_edges[4] = {nullptr, nullptr, nullptr, nullptr}; + + CompactNode(Kmer kmer, id_t node_id) : + kmer(kmer), count(0), node_id(node_id), direction(kmer.is_forward()) {} + + CompactNode(Kmer kmer, std::string sequence, id_t node_id) : + kmer(kmer), count(0), sequence(sequence), node_id(node_id), + direction(kmer.is_forward()) {} + + friend bool operator== (const CompactNode& lhs, const CompactNode& rhs) { + return lhs.node_id == rhs.node_id; + } + + std::string rc_sequence() const { + return _revcomp(sequence); + } + + bool delete_edge(CompactEdge* edge) { + bool 
deleted = false; + if (delete_in_edge(edge)) { + deleted = true; + } + if (delete_out_edge(edge)) { + deleted = true; + } + return deleted; + } + + bool delete_in_edge(CompactEdge* edge) { + for (uint8_t i=0; i<4; i++) { + if (in_edges[i] == edge) { + in_edges[i] = nullptr; + return true; + } + } + return false; + } + + void add_in_edge(const char base, CompactEdge* edge) { + //pdebug("add in edge to " << *this << ", base=" << base + // << ", edge: " << *edge); + in_edges[twobit_repr(base)] = edge; + } + + CompactEdge* get_in_edge(const char base) { + return in_edges[twobit_repr(base)]; + } + + bool delete_out_edge(CompactEdge* edge) { + for (uint8_t i=0; i<4; i++) { + if (out_edges[i] == edge) { + out_edges[i] = nullptr; + return true; + } + } + return false; + } + + void add_out_edge(const char base, CompactEdge* edge) { + //pdebug("add out edge to " << *this << ", base=" << base + // << ", edge: " << *edge); + out_edges[twobit_repr(base)] = edge; + } + + CompactEdge* get_out_edge(const char base) { + return out_edges[twobit_repr(base)]; + } + + uint8_t degree() const { + return out_degree() + in_degree(); + } + + uint8_t out_degree() const { + uint8_t acc = 0; + for (auto edge: out_edges) { + if (edge != nullptr) { + acc++; + } + } + return acc; + } + + uint8_t in_degree() const { + uint8_t acc = 0; + for (auto edge: in_edges) { + if (edge != nullptr) { + acc++; + } + } + return acc; + } + + friend std::ostream& operator<<(std::ostream& stream, + const CompactNode& node) { + stream << ""; + return stream; + } + + std::string edges_repr() { + std::ostringstream os; + os << *this << std::endl << "\tin_edges:" << std::endl; + for (auto b : alphabets::DNA_SIMPLE) { + CompactEdge* e = get_in_edge(b); + if (e != nullptr) { + os << "\t " << b << "=" << *e << std::endl; + } + } + os << "\tout_edges:" << std::endl; + for (auto b : alphabets::DNA_SIMPLE) { + CompactEdge* e = get_out_edge(b); + if (e != nullptr) { + os << "\t " << b << "=" << *e << std::endl; + } + } + return os.str(); + } +}; + +typedef std::vector CompactNodeVector; + +class CompactNodeFactory : public KmerFactory { + friend class CompactEdgeFactory; +protected: + + // map from HDN hashes to CompactNode IDs + HashIDMap kmer_id_map; + // linear storage for CompactNodes + CompactNodeVector compact_nodes; + uint64_t n_compact_nodes; + uint64_t _n_updates; + +public: + CompactNodeFactory(WordLength K) : + KmerFactory(K), n_compact_nodes(0), + _n_updates(0) {} + + uint64_t n_nodes() const { + return n_compact_nodes; + } + + uint64_t n_updates() const { + return _n_updates; + } + + // protected linear creation of CompactNode + // they should never be deleted, so this is straightforward + CompactNode* build_node(Kmer hdn) { + pdebug("new compact node from " << hdn); + CompactNode * v = get_node_by_kmer(hdn); + if (v == nullptr) { + compact_nodes.emplace_back(hdn, n_compact_nodes); + n_compact_nodes++; + v = &(compact_nodes.back()); + v->sequence = _revhash(hdn, _ksize); + kmer_id_map[hdn] = v->node_id; + _n_updates++; + pdebug("Allocate: " << *v); + } + return v; + } + + CompactNode* get_node_by_kmer(HashIntoType hdn) { + auto search = kmer_id_map.find(hdn); + if (search != kmer_id_map.end()) { + id_t ID = search->second; + return &(compact_nodes[ID]); + } + return nullptr; + } + + CompactNode* get_node_by_id(id_t id) { + if (id >= compact_nodes.size()) { + return nullptr; + } + return &(compact_nodes[id]); + } + + CompactNode* get_or_build_node(Kmer hdn) { + CompactNode* v = get_node_by_kmer(hdn); + if (v != nullptr) { + v->count += 
1; + } else { + v = build_node(hdn); + v->count = 1; + } + return v; + } + + std::vector get_nodes(const std::string& sequence) { + //pdebug("get compact node IDs"); + KmerIterator kmers(sequence.c_str(), _ksize); + std::vector nodes; + + CompactNode* node; + + while(!kmers.done()) { + Kmer kmer = kmers.next(); + + node = get_node_by_kmer(kmer); + if (node != nullptr) { + nodes.push_back(node); + } + } + + return nodes; + } + + void unlink_edge(CompactEdge* edge) { + pdebug("unlink edge " << *edge); + CompactNode *left, *right; + left = get_node_by_id(edge->in_node_id); + right = get_node_by_id(edge->out_node_id); + if (left != nullptr) { + // be lazy for now and use bidirectional delete + left->delete_edge(edge); + _n_updates++; + } + if (right != nullptr) { + right->delete_edge(edge); + _n_updates++; + } + } + + bool is_rc_from_left(CompactNode* v, std::string& sequence) const { + /* Check if sequence shares same canonical orientation with + * v when coming from graph left, assuming sequence + * does NOT include v. + */ + const char * node_kmer = v->sequence.c_str(); + const char * _sequence = sequence.c_str(); + return strncmp(node_kmer, + _sequence + sequence.size()-_ksize+1, + _ksize - 1) != 0; + } + + bool get_pivot_from_left(CompactNode* v, + std::string& sequence, + char& pivot_base) const { + /* Check if sequence shared same canonical + * orientation with v from graph left, assuming + * sequence includes v + */ + const char * node_kmer = v->sequence.c_str(); + const char * _segment = sequence.c_str(); + pivot_base = _segment[sequence.size()-_ksize-1]; + if (strncmp(node_kmer, + _segment+sequence.size()-_ksize, + _ksize-1) == 0) { + // same canonical orientation + return false; + } else { + // must have opposite canonical orientation + pivot_base = complement(pivot_base); + return true; + } + } + + bool add_edge_from_left(CompactNode* v, CompactEdge* e) { + char pivot_base; + if (!get_pivot_from_left(v, e->sequence, pivot_base)) { + // same canonical orientation + pdebug("add in edge " << *e << " to node " << *v << " from " << pivot_base); + v->add_in_edge(pivot_base, e); + _n_updates++; + return false; + } else { + // must have opposite canonical orientation + pdebug("add out edge " << *e << " to node " << *v << " from " << pivot_base); + v->add_out_edge(pivot_base, e); + _n_updates++; + return true; + } + } + + + bool get_edge_from_left(CompactNode* v, + CompactEdge* &result_edge, + std::string& sequence) const { + char pivot_base; + if (!get_pivot_from_left(v, sequence, pivot_base)) { + result_edge = v->get_in_edge(pivot_base); + return false; + } else { + result_edge = v->get_out_edge(pivot_base); + return true; + } + } + + bool is_rc_from_right(CompactNode* v, + std::string& sequence) const { + /* Check if sequence shared same canonical + * orientation with v from graph right, assuming + * sequence does NOT include v + */ + const char * node_kmer = v->sequence.c_str(); + const char * _sequence = sequence.c_str(); + return strncmp(node_kmer+1, _sequence, _ksize-1) != 0; + } + + bool get_pivot_from_right(CompactNode* v, + std::string& sequence, + char& pivot_base) const { + /* Find the "pivot base" between sequence and v + * when sequence is from graph right, assuming + * v contained in sequence + */ + const char * node_kmer = v->sequence.c_str(); + const char * _segment = sequence.c_str(); + pivot_base = _segment[_ksize]; + if (strncmp(node_kmer+1, _segment+1, _ksize-1) == 0) { + // same canonical orientation + return false; + } else { + // must have opposite canonical 
orientation + pivot_base = complement(pivot_base); + return true; + } + } + + bool add_edge_from_right(CompactNode* v, CompactEdge* e) { + char pivot_base; + if (!get_pivot_from_right(v, e->sequence, pivot_base)) { + pdebug("add out edge " << *e << " to node " << *v << " from " << pivot_base); + v->add_out_edge(pivot_base, e); + _n_updates++; + return false; + } else { + pdebug("add in edge " << *e << " to node " << *v << " from " << pivot_base); + v->add_in_edge(pivot_base, e); + _n_updates++; + return true; + } + } + + bool get_edge_from_right(CompactNode* v, + CompactEdge* &result_edge, + std::string& sequence) const { + char pivot_base; + if (!get_pivot_from_right(v, sequence, pivot_base)) { + result_edge = v->get_out_edge(pivot_base); + return false; + } else { + result_edge = v->get_in_edge(pivot_base); + return true; + } + + } +}; + + +class StreamingCompactor : public KmerFactory +{ + +protected: + + // map from tags to CompactEdges + CompactNodeFactory nodes; + CompactEdgeFactory edges; + + uint64_t n_sequences_added; + +public: + + shared_ptr graph; + + StreamingCompactor(shared_ptr graph) : + KmerFactory(graph->ksize()), + nodes(graph->ksize()), edges(graph->ksize()), + n_sequences_added(0), graph(graph) + { + } + + compact_edge_meta_t deduce_edge_meta(CompactNode* in, CompactNode* out) { + compact_edge_meta_t edge_meta; + if (in == nullptr && out == nullptr) { + edge_meta = ISLAND; + } else if ((out == nullptr) != (in == nullptr)) { + edge_meta = TIP; + } else { + edge_meta = FULL; + } + return edge_meta; + } + + uint64_t n_nodes() const { + return nodes.n_nodes(); + } + + uint64_t n_edges() const { + return edges.n_edges(); + } + + uint64_t n_updates() const { + return nodes.n_updates() + edges.n_updates(); + } + + void report() const { + std::cout << std::endl << "REPORT: StreamingCompactor(@" << this << " with " + << "Hashgraph @" << graph.get() << ")" << std::endl; + std::cout << " * " << n_nodes() << " cDBG nodes (HDNs)" << std::endl; + std::cout << " * " << n_edges() << " cDBG edges" << std::endl; + std::cout << " * " << n_sequences_added << " sequences added" << std::endl; + } + + + CompactNode* get_node_by_kmer(Kmer hdn) { + return nodes.get_node_by_kmer(hdn); + } + + CompactNode* get_node_by_id(id_t id) { + return nodes.get_node_by_id(id); + } + + std::vector get_nodes(const std::string& sequence) { + return nodes.get_nodes(sequence); + } + + CompactEdge* get_edge(HashIntoType tag) const { + return edges.get_edge(tag); + } + + bool get_tag_edge_pair(HashIntoType tag, TagEdgePair& pair) const { + return edges.get_tag_edge_pair(tag, pair); + } + + CompactEdge* get_edge(UHashSet& tags) const { + return edges.get_edge(tags); + } + + uint64_t consume_sequence(const std::string& sequence) { + uint64_t prev_n_kmers = graph->n_unique_kmers(); + graph->consume_string(sequence); + return graph->n_unique_kmers() - prev_n_kmers; + } + + uint64_t consume_sequence_and_update(const std::string& sequence) { + if (consume_sequence(sequence) > 0) { + return update_compact_dbg(sequence); + } + return 0; + } + + bool validate_segment(CompactNode* root_node, CompactNode* other_node, + CompactEdge* edge, std::string& sequence) { + pdebug("validating " << *root_node << " with " << *edge << ", " + << sequence << " and other node ID=" << + ((other_node != nullptr) ? 
other_node->node_id : NULL_ID)); + bool edge_valid = true; + if (edge->meta == TIP) { + if (other_node != nullptr) { + edge_valid = false; + } + if (!((edge->in_node_id == root_node->node_id || + edge->out_node_id == root_node->node_id) && + edge->sequence.length() == sequence.length())) { + edge_valid = false; + } + } else if (edge->meta == FULL) { + if (other_node == nullptr) { + edge_valid = false; + } else { + bool nodes_match; + nodes_match = (edge->in_node_id == root_node->node_id && + edge->out_node_id == other_node->node_id) || + (edge->out_node_id == root_node->node_id && + edge->in_node_id == other_node->node_id); + if (!nodes_match) { + edge_valid = false; + } + } + } + pdebug("valid? = " << edge_valid); + return edge_valid; + } + + /* Update a compact dbg where there are no induced + * HDNs + */ + uint64_t update_compact_dbg_linear(const std::string& sequence) { + pdebug("no induced HDNs, update linear..."); + uint64_t n_ops_before = n_updates(); + Kmer root_kmer = graph->build_kmer(sequence.substr(0, _ksize)); + + CompactingAT lcursor(graph.get(), root_kmer); + CompactingAT rcursor(graph.get(), root_kmer); + CompactingAssembler cassem(graph.get()); + + std::string left_seq = cassem._assemble_directed(lcursor); + std::string right_seq = cassem._assemble_directed(rcursor); + std::string segment_seq = left_seq + right_seq.substr(_ksize); + + CompactNode *left_node = nullptr, *right_node = nullptr; + left_node = nodes.get_node_by_kmer(lcursor.cursor); + right_node = nodes.get_node_by_kmer(rcursor.cursor); + + CompactEdge *left_edge = nullptr, *right_edge = nullptr; + if (left_node != nullptr) { + nodes.get_edge_from_right(left_node, left_edge, segment_seq); + } + if (right_node != nullptr) { + nodes.get_edge_from_left(right_node, right_edge, segment_seq); + } + + if (left_edge != nullptr) { + nodes.unlink_edge(left_edge); + edges.delete_edge(left_edge); + } + if (right_edge != nullptr) { + nodes.unlink_edge(right_edge); + edges.delete_edge(right_edge); + } + + compact_edge_meta_t edge_meta = deduce_edge_meta(left_node, right_node); + if (edge_meta == ISLAND) { // don't deal with islands for now + return n_updates() - n_ops_before; + } + id_t left_id, right_id; + left_id = (left_node != nullptr) ? left_node->node_id : NULL_ID; + right_id = (right_node != nullptr) ? 
right_node->node_id : NULL_ID; + CompactEdge *new_edge = edges.build_edge(left_id, right_id, + edge_meta, segment_seq); + if (left_node != nullptr) { + nodes.add_edge_from_right(left_node, new_edge); + } + if (right_node != nullptr) { + nodes.add_edge_from_left(right_node, new_edge); + } + + return n_updates() - n_ops_before; + } + + + uint64_t update_compact_dbg(const std::string& sequence) { + pdebug("update cDBG from " << sequence); + n_sequences_added++; + uint64_t n_ops_before = n_updates(); + + // first gather up all k-mers that could have been disturbed -- + // k-mers in the read, and the neighbors of the flanking nodes + KmerIterator kmers(sequence.c_str(), _ksize); + KmerQueue disturbed_kmers; + Kmer kmer = kmers.next(); + CompactingAT lcursor(graph.get(), kmer); + lcursor.neighbors(disturbed_kmers); + while(!kmers.done()) { + kmer = kmers.next(); + disturbed_kmers.push_back(kmer); + } + CompactingAT rcursor(graph.get(), kmer); + rcursor.neighbors(disturbed_kmers); + + pdebug(disturbed_kmers.size() << " k-mers disturbed" << std::endl); + + // find the induced HDNs in the disturbed k-mers + KmerSet induced_hdns; + KmerSet disturbed_hdns; + while(!disturbed_kmers.empty()) { + Kmer kmer = disturbed_kmers.back(); + disturbed_kmers.pop_back(); + uint8_t l_degree, r_degree; + l_degree = lcursor.degree(kmer); + r_degree = rcursor.degree(kmer); + if(l_degree > 1 || r_degree > 1) { + pdebug("found HDN... " << kmer); + CompactNode* hdn = nodes.get_or_build_node(kmer); + if (hdn->count == 1) { // just created + induced_hdns.insert(kmer); + } else if (hdn->degree() != (l_degree + r_degree)) { + induced_hdns.insert(kmer); + } else { + disturbed_hdns.insert(kmer); + } + } + } + pdebug(induced_hdns.size() << " induced HDNs"); + + /* If there are no induced HDNs, we must have extended + * a tip or merged two tips into a linear segment */ + if (induced_hdns.size() == 0 && disturbed_hdns.size() == 0) { + return update_compact_dbg_linear(sequence); + } else if (induced_hdns.size() == 0) { + induced_hdns.insert(disturbed_hdns.begin(), disturbed_hdns.end()); + } + + /* Update from all induced HDNs + */ + CompactingAssembler cassem(graph.get()); + KmerQueue neighbors; + while(!induced_hdns.empty()) { + Kmer root_kmer = *induced_hdns.begin(); + induced_hdns.erase(root_kmer); + + CompactNode* root_node = nodes.get_node_by_kmer(root_kmer); + char root_front = root_node->sequence.front(); + char root_back = root_node->sequence.back(); + pdebug("searching from induced HDN: " << root_node->edges_repr()); + + // check left (in) edges + lcursor.neighbors(root_kmer, neighbors); + pdebug("checking " << neighbors.size() << " left neighbors"); + while(!neighbors.empty()) { + Kmer neighbor = neighbors.back(); + neighbors.pop_back(); + lcursor.cursor = neighbor; + + TagEdgePair tag_pair; + bool found_tag = false; + + lcursor.push_filter(edges.get_tag_stopper(tag_pair, found_tag)); + std::string segment_seq = cassem._assemble_directed(lcursor); + if (nodes.is_rc_from_left(root_node, segment_seq)) { + segment_seq = segment_seq + complement(root_front); + } else { + segment_seq = segment_seq + root_back; + } + pdebug("assembled segment: " << segment_seq << " length: " << + segment_seq.length()); + + // first check for a segment going this direction from root + CompactEdge* segment_edge = nullptr; + nodes.get_edge_from_left(root_node, segment_edge, segment_seq); + + CompactNode* left_node = nodes.get_node_by_kmer(lcursor.cursor); + CompactEdge* left_out_edge = nullptr; + if (left_node != nullptr) { + pdebug("found 
existing left node: " << *left_node); + nodes.get_edge_from_right(left_node, left_out_edge, segment_seq); + } + + // validate edge leaving root if it exists + if (segment_edge != nullptr && left_out_edge != nullptr) { + pdebug("found edges leaving root and left node"); + + if (segment_edge == left_out_edge && + validate_segment(root_node, left_node, + segment_edge, segment_seq)) { + continue; + } else { + nodes.unlink_edge(segment_edge); + nodes.unlink_edge(left_out_edge); + edges.delete_edge(segment_edge); + edges.delete_edge(left_out_edge); + } + } else if (left_out_edge != nullptr) { + // there was no edge from root, must be bad + pdebug("edge from left invalid, delete"); + nodes.unlink_edge(left_out_edge); + edges.delete_edge(left_out_edge); + } else if (segment_edge != nullptr) { + pdebug("found end leaving root node"); + if (validate_segment(root_node, left_node, + segment_edge, segment_seq)) { + continue; + } else { + pdebug("edge from root invalid, delete"); + nodes.unlink_edge(segment_edge); + edges.delete_edge(segment_edge); + } + } + + /* + * Should also keep a set of pair to track resolved + * segments + */ + + // not needed until tags used again + //segment_seq = cassem._assemble_directed(lcursor) + + // segment_seq.substr(_ksize); + + // construct the compact edge + compact_edge_meta_t edge_meta = (left_node == nullptr) + ? TIP : FULL; + edge_meta = (segment_seq.length() == _ksize + 1 && edge_meta == FULL) + ? TRIVIAL : edge_meta; + + if (edge_meta == FULL || edge_meta == TRIVIAL) { + segment_edge = edges.build_edge(left_node->node_id, + root_node->node_id, + edge_meta, + segment_seq); + nodes.add_edge_from_right(left_node, segment_edge); + } else { + segment_edge = edges.build_edge(NULL_ID, + root_node->node_id, + edge_meta, + segment_seq); + } + + nodes.add_edge_from_left(root_node, segment_edge); + } + + // now the right neighbors... 
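+            // (Editorial note: the block below mirrors the left-neighbor
+            // pass above, with the in/out edge roles and the direction of
+            // sequence extension swapped; segments are assembled to the
+            // graph-right of the root HDN and validated the same way.)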
+ rcursor.neighbors(root_kmer, neighbors); + pdebug("checking " << neighbors.size() << " right neighbors"); + while(!neighbors.empty()) { + Kmer neighbor = neighbors.back(); + neighbors.pop_back(); + rcursor.cursor = neighbor; + pdebug("right neighbor: " << neighbor.repr(_ksize)); + + TagEdgePair tag_pair; + bool found_tag = false; + + rcursor.push_filter(edges.get_tag_stopper(tag_pair, found_tag)); + std::string segment_seq = cassem._assemble_directed(rcursor); + if (nodes.is_rc_from_right(root_node, segment_seq)) { + segment_seq = complement(root_back) + segment_seq; + } else { + segment_seq = root_front + segment_seq; + } + pdebug("assembled segment: " << segment_seq << " length: " << + segment_seq.length()); + // first check for a segment going this direction from root + CompactEdge* segment_edge = nullptr; + nodes.get_edge_from_right(root_node, segment_edge, segment_seq); + + CompactNode* right_node = nodes.get_node_by_kmer(rcursor.cursor); + CompactEdge* right_in_edge = nullptr; + if (right_node != nullptr) { + nodes.get_edge_from_left(right_node, right_in_edge, segment_seq); + } + + // validate edge leaving root if it exists + if (segment_edge != nullptr && right_in_edge != nullptr) { + + + if (segment_edge == right_in_edge && + validate_segment(root_node, right_node, + segment_edge, segment_seq)) { + continue; + } else { + nodes.unlink_edge(segment_edge); + nodes.unlink_edge(right_in_edge); + edges.delete_edge(segment_edge); + edges.delete_edge(right_in_edge); + } + } else if (right_in_edge != nullptr) { + // there was no edge from root, must be bad + pdebug("edge from left invalid, delete"); + nodes.unlink_edge(right_in_edge); + edges.delete_edge(right_in_edge); + } else if (segment_edge != nullptr) { + if (validate_segment(root_node, right_node, + segment_edge, segment_seq)) { + continue; + } else { + pdebug("edge from root invalid, delete"); + nodes.unlink_edge(segment_edge); + edges.delete_edge(segment_edge); + } + } + + compact_edge_meta_t edge_meta = (right_node == nullptr) ? + TIP : FULL; + edge_meta = (segment_seq.length() == _ksize + 1 && edge_meta == FULL) + ? TRIVIAL : edge_meta; + + if (edge_meta == FULL || edge_meta == TRIVIAL) { + segment_edge = edges.build_edge(root_node->node_id, + right_node->node_id, + edge_meta, + segment_seq); + nodes.add_edge_from_left(right_node, segment_edge); + } else { + segment_edge = edges.build_edge(root_node->node_id, + NULL_ID, + edge_meta, + segment_seq); + } + + nodes.add_edge_from_right(root_node, segment_edge); + } + + } + + return n_updates() - n_ops_before; + + } // update_compact_dbg + + void write_gml(const std::string filename) const { + edges.write_gml(filename, nodes); + } + + void write_fasta(const std::string filename) const { + edges.write_fasta(filename); + } + +}; + + + +} + + +#endif diff --git a/include/oxli/gmap.hh b/include/oxli/gmap.hh new file mode 100644 index 0000000000..15be996c9f --- /dev/null +++ b/include/oxli/gmap.hh @@ -0,0 +1,144 @@ +/* +This file is part of khmer, https://github.com/dib-lab/khmer/, and is +Copyright (C) 2015-2016, The Regents of the University of California. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+
+  * Redistributions in binary form must reproduce the above
+    copyright notice, this list of conditions and the following
+    disclaimer in the documentation and/or other materials provided
+    with the distribution.
+
+  * Neither the name of the Michigan State University nor the names
+    of its contributors may be used to endorse or promote products
+    derived from this software without specific prior written
+    permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+LICENSE (END)
+
+Contact: khmer-project@idyll.org
+*/
+#ifndef GMAP_HH
+#define GMAP_HH
+
+#include <memory>
+#include <unordered_map>
+#include <vector>
+
+#include "oxli.hh"
+#include "kmer_hash.hh"
+#include "hashtable.hh"
+#include "hashgraph.hh"
+
+namespace oxli {
+
+
+template <typename T, bool threadsafe>
+class GuardedHashMap {
+
+    public:
+
+        // Filter should be owned exclusively by GuardedKmerMap
+        std::unique_ptr<Nodegraph> filter;
+        std::unordered_map<HashIntoType, T> data;
+
+        explicit GuardedHashMap(WordLength ksize,
+                                unsigned short n_tables,
+                                uint64_t max_table_size)
+        {
+            std::vector<uint64_t> table_sizes = get_n_primes_near_x(n_tables, max_table_size);
+            filter = std::unique_ptr<Nodegraph>(new Nodegraph(ksize, table_sizes));
+        }
+
+        T get(HashIntoType kmer) const
+        {
+            if (filter->get_count(kmer)) {
+                auto search = data.find(kmer);
+                if (search != data.end()) {
+                    return search->second;
+                }
+            }
+
+            return NULL;
+        }
+
+        void set(HashIntoType kmer, T item)
+        {
+            filter->count(kmer);
+            data[kmer] = item;
+        }
+
+        bool contains(HashIntoType kmer) const
+        {
+            return get(kmer) != NULL;
+        }
+
+        uint64_t size() const
+        {
+            return data.size();
+        }
+};
+
+template <typename T>
+class GuardedHashMap<T, true>: public GuardedHashMap<T, false>
+{
+    private:
+
+        // mutable so the spin-lock can be taken inside const get().
+        mutable uint32_t lock;
+
+    public:
+
+        using GuardedHashMap<T, false>::GuardedHashMap;
+        using GuardedHashMap<T, false>::filter;
+        using GuardedHashMap<T, false>::data;
+
+        explicit GuardedHashMap(WordLength ksize,
+                                unsigned short n_tables,
+                                uint64_t max_table_size) :
+            GuardedHashMap<T, false>(ksize, n_tables, max_table_size),
+            lock(0)
+        {
+        }
+
+        T get(HashIntoType kmer) const
+        {
+            if (filter->get_count(kmer)) {
+                while(!__sync_bool_compare_and_swap( &lock, 0, 1));
+                auto search = data.find(kmer);
+                if (search != data.end()) {
+                    __sync_bool_compare_and_swap( &lock, 1, 0);
+                    return search->second;
+                }
+                __sync_bool_compare_and_swap( &lock, 1, 0);
+            }
+
+            return NULL;
+        }
+
+        void set(HashIntoType kmer, T item)
+        {
+            while(!__sync_bool_compare_and_swap( &lock, 0, 1));
+            // Qualified call: an unqualified set() here would recurse on
+            // this override and never release the lock.
+            GuardedHashMap<T, false>::set(kmer, item);
+            __sync_bool_compare_and_swap( &lock, 1, 0);
+        }
+};
+
+}
+
+#endif
diff --git a/include/oxli/hashgraph.hh b/include/oxli/hashgraph.hh
index f450a42e6b..6318fb52ad 100644
--- a/include/oxli/hashgraph.hh
+++ b/include/oxli/hashgraph.hh
@@ -196,7 +196,8 @@ public:
     // consume a string & add sparse graph nodes.
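+    // Editorial sketch (not part of the patch): the new tag_set parameter
+    // below defaults to nullptr, so existing call sites remain valid, e.g.:
+    //
+    //     unsigned long long n_consumed = 0;
+    //     SeenSet new_tags;
+    //     graph.consume_sequence_and_tag(seq, n_consumed, &new_tags);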
void consume_sequence_and_tag(const std::string& seq, unsigned long long& n_consumed, - SeenSet * new_tags = 0); + SeenSet * new_tags = nullptr, + SeenSet * tag_set = nullptr); // get the tags present in this sequence. void get_tags_for_sequence(const std::string& seq, @@ -244,6 +245,7 @@ public: // Calculate the graph degree of the given k-mer. unsigned int kmer_degree(HashIntoType kmer_f, HashIntoType kmer_r); unsigned int kmer_degree(const char * kmer_s); + unsigned int kmer_degree(Kmer kmer); // Find all nodes with a degree > 2. void find_high_degree_nodes(const char * sequence, diff --git a/include/oxli/hashtable.hh b/include/oxli/hashtable.hh index 192b71f333..f0051f9256 100644 --- a/include/oxli/hashtable.hh +++ b/include/oxli/hashtable.hh @@ -397,6 +397,10 @@ public: return store->get_raw_tables(); } + void reset() { + store->reset(); + } + // find the minimum k-mer count in the given sequence BoundedCounterType get_min_count(const std::string &s); diff --git a/include/oxli/hist.hh b/include/oxli/hist.hh new file mode 100644 index 0000000000..51942c142d --- /dev/null +++ b/include/oxli/hist.hh @@ -0,0 +1,99 @@ +/* +This file is part of khmer, https://github.com/dib-lab/khmer/, and is +Copyright (C) 2015-2016, The Regents of the University of California. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + + * Neither the name of the Michigan State University nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+LICENSE (END) + +Contact: khmer-project@idyll.org +*/ +#ifndef HIST_HH +#define HIST_HH + +#include +#include +#include + +#include "oxli.hh" +#include "kmer_hash.hh" + +namespace oxli { + +inline size_t highest_bit(uint64_t num) +{ + if (!num) + return 0; + + int pos = 1; + + while (num >>= 1) { + pos += 1; + } + + return pos; +} + + +template +class Histogram { + + public: + + uint64_t bins[n_bins]; + + Histogram() { + clear(); + } + + void add(uint64_t val) { + size_t bin = highest_bit(val) - 1; + if (bin >= n_bins) { + bins[n_bins-1] += 1; + } else { + bins[bin] += 1; + } + } + + void clear() { + for (auto&& b : bins) { + b = 0; + } + } +}; + +template class Histogram<8>; +template class Histogram<16>; +template class Histogram<32>; +template class Histogram<64>; + + +} + +#endif diff --git a/include/oxli/kmer_filters.hh b/include/oxli/kmer_filters.hh index 35113248bc..c3e45021b4 100644 --- a/include/oxli/kmer_filters.hh +++ b/include/oxli/kmer_filters.hh @@ -52,6 +52,7 @@ class LabelHash; bool apply_kmer_filters(const Kmer& node, const KmerFilterList& filters); +void apply_kmer_helpers(const Kmer& node, const KmerHelperList& helpers); KmerFilter get_label_filter(const Label label, const LabelHash * lh); diff --git a/include/oxli/kmer_hash.hh b/include/oxli/kmer_hash.hh index ae49db17ba..03e846942b 100644 --- a/include/oxli/kmer_hash.hh +++ b/include/oxli/kmer_hash.hh @@ -116,6 +116,7 @@ HashIntoType _hash_murmur(const std::string& kmer, const WordLength k, HashIntoType& h, HashIntoType& r); HashIntoType _hash_murmur_forward(const std::string& kmer, const WordLength k); +uint64_t _hash_murmur_uni(const std::string& sequence); // Cyclic hash, a rolling hash that is irreversible HashIntoType _hash_cyclic(const std::string& kmer, const WordLength k); @@ -197,6 +198,11 @@ public: return kmer_u < other.kmer_u; } + bool operator== (const Kmer &other) const + { + return kmer_u == other.kmer_u; + } + std::string get_string_rep(WordLength K) const { return _revhash(kmer_u, K); @@ -220,6 +226,14 @@ public: { return kmer_f == kmer_u; } + + void set_forward() + { + if (!is_forward()) { + kmer_r = kmer_f; + kmer_f = kmer_u; + } + } }; @@ -302,6 +316,10 @@ public: kmer_u = _hash(kmer_c, _ksize, kmer_f, kmer_r); return Kmer(kmer_f, kmer_r, kmer_u); } + + WordLength K() const { + return _ksize; + } }; /** diff --git a/include/oxli/oxli.hh b/include/oxli/oxli.hh index 1acd65da24..b6a6fdd20a 100644 --- a/include/oxli/oxli.hh +++ b/include/oxli/oxli.hh @@ -72,6 +72,7 @@ private:\ #include #include #include +#include #include #include #include @@ -108,6 +109,8 @@ private:\ namespace oxli { +extern std::string get_version_cpp(); + // largest number we can count up to, exactly. 
(8 bytes) typedef unsigned long long int ExactCounterType; @@ -161,13 +164,16 @@ void deallocate_ptr_set(T& s) } class Kmer; -typedef std::queue KmerQueue; +typedef std::deque KmerQueue; typedef std::set KmerSet; + // A function which takes a Kmer and returns true if it // is to be filtered / ignored typedef std::function KmerFilter; +typedef std::function KmerHelper; typedef std::list KmerFilterList; +typedef std::list KmerHelperList; typedef std::vector StringVector; } diff --git a/include/oxli/oxli_exception.hh b/include/oxli/oxli_exception.hh index 8cde43051a..431902e096 100644 --- a/include/oxli/oxli_exception.hh +++ b/include/oxli/oxli_exception.hh @@ -105,6 +105,17 @@ public: : oxli_file_exception(msg) {} }; + +class EmptyStream : public oxli_file_exception +{ +public: + EmptyStream() + : oxli_file_exception("Generic EmptyStream error") {} + explicit EmptyStream(const std::string& msg) + : oxli_file_exception(msg) {} +}; + + class StreamReadError : public oxli_file_exception { public: diff --git a/khmer/_oxli/oxli_exception_convert.hh b/include/oxli/oxli_exception_convert.hh similarity index 100% rename from khmer/_oxli/oxli_exception_convert.hh rename to include/oxli/oxli_exception_convert.hh diff --git a/include/oxli/partitioning.hh b/include/oxli/partitioning.hh new file mode 100644 index 0000000000..2f02026c87 --- /dev/null +++ b/include/oxli/partitioning.hh @@ -0,0 +1,260 @@ +/* +This file is part of khmer, https://github.com/dib-lab/khmer/, and is +Copyright (C) 2015-2016, The Regents of the University of California. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + + * Neither the name of the Michigan State University nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+LICENSE (END)
+
+Contact: khmer-project@idyll.org
+*/
+#ifndef PARTITIONING_HH
+#define PARTITIONING_HH
+
+#include <memory>
+#include <set>
+
+#include "gmap.hh"
+#include "hist.hh"
+#include "oxli.hh"
+#include "kmer_hash.hh"
+#include "hashtable.hh"
+#include "hashgraph.hh"
+#include "kmer_filters.hh"
+#include "traversal.hh"
+
+#ifndef DEBUG_SP
+#define DEBUG_SP 0
+#endif
+
+namespace oxli
+{
+
+
+class Component;
+typedef std::shared_ptr<Component> ComponentPtr;
+
+
+class ComponentPtrCompare {
+    public:
+        bool operator() (const ComponentPtr& lhs, const ComponentPtr& rhs) const;
+};
+
+
+typedef std::set<ComponentPtr, ComponentPtrCompare> ComponentPtrSet;
+typedef std::vector<ComponentPtr> ComponentPtrVector;
+typedef std::vector<HashIntoType> TagVector;
+typedef GuardedHashMap<ComponentPtr, false> GuardedHashCompMap;
+
+
+class Component {
+
+    private:
+
+        static uint64_t n_created;
+        static uint64_t n_destroyed;
+        bool alive;
+
+    public:
+
+        const uint64_t component_id;
+        TagVector tags;
+        Histogram<16> coverage;
+
+        explicit Component(): component_id(n_created), alive(true) {
+            n_created++;
+        }
+
+        // alive is initialized here as well; leaving it indeterminate for
+        // this constructor would make is_alive() unreliable.
+        explicit Component(uint64_t component_id):
+            component_id(component_id), alive(true) {
+            n_created++;
+        }
+
+        ~Component() {
+            n_destroyed++;
+        }
+
+        void update_coverage(Hashgraph * graph) {
+            coverage.clear();
+            for (auto tag: tags) {
+                coverage.add(graph->get_count(tag));
+            }
+        }
+
+        void kill() {
+            tags.clear();
+            alive = false;
+        }
+
+        bool is_alive() const {
+            return alive;
+        }
+
+        uint64_t get_n_created() const {
+            return n_created;
+        }
+
+        uint64_t get_n_destroyed() const {
+            return n_destroyed;
+        }
+
+        void add_tag(HashIntoType tag) {
+            tags.push_back(tag);
+        }
+
+        void add_tags(TagVector& new_tags) {
+            tags.insert(tags.end(),
+                        new_tags.begin(),
+                        new_tags.end());
+        }
+
+        uint64_t get_n_tags() const {
+            return tags.size();
+        }
+
+        friend bool operator==(const Component& lhs,
+                               const Component& rhs) {
+            return lhs.component_id == rhs.component_id;
+        }
+
+        friend bool operator<(const Component& lhs,
+                              const Component& rhs) {
+            return lhs.component_id < rhs.component_id;
+        }
+
+        friend std::ostream& operator<< (std::ostream& stream,
+                                         const Component& comp);
+};
+
+
+class ComponentMap {
+
+    private:
+
+        // We should exclusively own tag_component_map.
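+        // (Editorial gloss: ownership here is enforced by convention --
+        //  the accessors below hand out std::weak_ptr observers rather
+        //  than owning copies.)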
+        std::shared_ptr<GuardedHashCompMap> tag_component_map;
+        std::shared_ptr<ComponentPtrVector> components;
+        uint32_t components_lock;
+        uint64_t component_counter;
+        uint64_t n_live_components;
+
+    public:
+
+
+        explicit ComponentMap(WordLength ksize,
+                              WordLength n_tables,
+                              uint64_t max_table_size);
+
+        void create_component(TagVector& tags);
+        uint32_t create_and_merge_components(TagVector& tags);
+        void map_tags_to_component(TagVector& tags, ComponentPtr& comp);
+        uint32_t merge_components(ComponentPtr& root, ComponentPtrSet& comps);
+
+        bool contains(HashIntoType tag) const
+        {
+            return tag_component_map->contains(tag);
+        }
+
+        ComponentPtr get(HashIntoType tag) const {
+            return tag_component_map->get(tag);
+        }
+
+        uint64_t get_n_components() const {
+            return n_live_components;
+        }
+
+        uint64_t get_n_tags() const {
+            return tag_component_map->size();
+        }
+
+        std::weak_ptr<ComponentPtrVector> get_components() const {
+            return std::weak_ptr<ComponentPtrVector>(components);
+        }
+
+        std::weak_ptr<GuardedHashCompMap> get_tag_component_map() const {
+            return std::weak_ptr<GuardedHashCompMap>(tag_component_map);
+        }
+
+        inline void acquire_components() {
+            while(!__sync_bool_compare_and_swap( &components_lock, 0, 1));
+        }
+
+        inline void release_components() {
+            __sync_bool_compare_and_swap( &components_lock, 1, 0);
+        }
+};
+
+
+class StreamingPartitioner: public ComponentMap {
+
+    private:
+
+        uint32_t _tag_density;
+        uint64_t n_consumed;
+        uint64_t _cstr_get_max_table_size(Hashgraph * graph);
+
+    public:
+        // We're not graph's owner, simply an observer.
+        // Unfortunately, our ownership policies elsewhere are a mess.
+        Hashgraph * graph;
+        //std::weak_ptr<Hashgraph> graph;
+
+        explicit StreamingPartitioner(Hashgraph * graph,
+                                      uint32_t tag_density=DEFAULT_TAG_DENSITY);
+
+        uint64_t consume(const std::string& seq);
+        uint64_t consume_pair(const std::string& first,
+                              const std::string& second);
+        uint64_t consume_fasta(std::string const &filename);
+
+        uint64_t seed_sequence(const std::string& seq,
+                               TagVector& tags,
+                               KmerQueue& seeds,
+                               std::set<HashIntoType>& seen);
+
+        void find_connected_tags(KmerQueue& node_q,
+                                 TagVector& found_tags,
+                                 std::set<HashIntoType>& seen,
+                                 bool truncate=false) const;
+
+        ComponentPtr get(std::string& kmer) const;
+        ComponentPtr get(HashIntoType h) const;
+        ComponentPtr find_nearest_component(Kmer kmer) const;
+        ComponentPtr find_nearest_component(std::string& kmer) const;
+
+
+        uint32_t get_tag_density() const {
+            return _tag_density;
+        }
+};
+
+
+}
+
+#endif
diff --git a/include/oxli/storage.hh b/include/oxli/storage.hh
index 33fb6f7f73..4cf9cee80f 100644
--- a/include/oxli/storage.hh
+++ b/include/oxli/storage.hh
@@ -40,11 +40,13 @@ Contact: khmer-project@idyll.org
 #include
 #include
+#include <memory>
 #include
 #include
 using MuxGuard = std::lock_guard<std::mutex>;
-#include "gqf.h"
+struct quotient_filter;
+typedef struct quotient_filter QF;
 namespace oxli {
 typedef std::unordered_map<HashIntoType, BoundedCounterType> KmerCountMap;
@@ -72,6 +74,7 @@ public:
     virtual bool add(HashIntoType khash) = 0;
     virtual const BoundedCounterType get_count(HashIntoType khash) const = 0;
     virtual Byte ** get_raw_tables() = 0;
+    virtual void reset() = 0;
 
     void set_use_bigcount(bool b);
     bool get_use_bigcount();
@@ -225,6 +228,8 @@ public:
         return _counts;
     }
 
+    void reset();
+
     void update_from(const BitStorage&);
 };
@@ -308,7 +313,15 @@ public:
             memset(_counts[i], 0, tablebytes);
         }
     }
-
+
+    void reset()
+    {
+        for (unsigned int table_num = 0; table_num < _n_tables; table_num++) {
+            uint64_t tablesize = _tablesizes[table_num];
+            uint64_t tablebytes = tablesize / 2 + 1;
+            memset(_counts[table_num], 0, tablebytes);
+        }
+    }
 
     BoundedCounterType test_and_set_bits(HashIntoType khash)
     {
@@ -412,19
+425,12 @@ public: */ class QFStorage : public Storage { protected: - QF cf; + std::shared_ptr cf; public: - QFStorage(int size) { - // size is the power of two to specify the number of slots in - // the filter (2**size). Third argument sets the number of bits used - // in the key (current value of size+8 is copied from the CQF example) - // Final argument is the number of bits allocated for the value, which - // we do not use. - qf_init(&cf, (1ULL << size), size+8, 0); - } + QFStorage(int size); - ~QFStorage() { qf_destroy(&cf); } + ~QFStorage(); BoundedCounterType test_and_set_bits(HashIntoType khash) { BoundedCounterType x = get_count(khash); @@ -433,28 +439,23 @@ public: } // - bool add(HashIntoType khash) { - bool is_new = get_count(khash) == 0; - qf_insert(&cf, khash % cf.range, 0, 1); - return is_new; - } + bool add(HashIntoType khash); // get the count for the given k-mer hash. - const BoundedCounterType get_count(HashIntoType khash) const { - return qf_count_key_value(&cf, khash % cf.range, 0); - } + const BoundedCounterType get_count(HashIntoType khash) const; // Accessors for protected/private table info members // xnslots is larger than nslots. It includes some extra slots to deal // with some details of how the counting is implemented - std::vector get_tablesizes() const { return {cf.xnslots}; } + std::vector get_tablesizes() const; const size_t n_tables() const { return 1; } - const uint64_t n_unique_kmers() const { return cf.ndistinct_elts; } - const uint64_t n_occupied() const { return cf.noccupied_slots; } + const uint64_t n_unique_kmers() const; + const uint64_t n_occupied() const; void save(std::string outfilename, WordLength ksize); void load(std::string infilename, WordLength &ksize); Byte **get_raw_tables() { return nullptr; } + void reset() {}; //nop }; @@ -540,6 +541,14 @@ public: } } + void reset() + { + for (unsigned int table_num = 0; table_num < _n_tables; table_num++) { + uint64_t tablesize = _tablesizes[table_num]; + memset(_counts[table_num], 0, tablesize); + } + } + std::vector get_tablesizes() const { return _tablesizes; diff --git a/include/oxli/traversal.hh b/include/oxli/traversal.hh index 4b96ea2dd0..0d3bb6f4d1 100644 --- a/include/oxli/traversal.hh +++ b/include/oxli/traversal.hh @@ -134,10 +134,11 @@ public: * @param node The Kmer to start at. * @param node_q To collect the results. * - * @return Number of neighbors found. + * @return Number of neighbors total (could be more than those found). */ + template unsigned int neighbors(const Kmer& node, - KmerQueue &node_q) const; + Container &found) const; /** * @brief Get the degree of the given Kmer in the templated direction. @@ -164,6 +165,7 @@ public: // The current position. Kmer cursor; using NodeGatherer::push_filter; + using NodeGatherer::neighbors; explicit NodeCursor(const Hashgraph * ht, Kmer start_kmer, @@ -184,15 +186,19 @@ public: * * @return Number of neighbors found. */ - unsigned int neighbors(KmerQueue& node_q) const + template + unsigned int neighbors(Container& found) const { - return NodeGatherer::neighbors(cursor, node_q); + return NodeGatherer::neighbors(cursor, found); } + /** * @return Degree of the current cursor position and direction. 
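+     *
+     * (Editorial gloss, not from the patch: the in_degree() and
+     * out_degree() declarations added below are undocumented here; by
+     * analogy with cursor_degree() they presumably report the degree on
+     * the graph-left and graph-right sides of the cursor, respectively.)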
*/ unsigned int cursor_degree() const; + unsigned int in_degree() const; + unsigned int out_degree() const; }; @@ -246,12 +252,20 @@ public: template class AssemblerTraverser: public NodeCursor { - protected: std::shared_ptr visited; + KmerHelperList helpers; public: - using NodeCursor::NodeCursor; + + using NodeCursor::push_filter; + + explicit AssemblerTraverser(const Hashgraph * ht, + Kmer start_kmer); + + explicit AssemblerTraverser(const Hashgraph* ht, + Kmer start_kmer, + KmerFilter filter); explicit AssemblerTraverser(const Hashgraph * ht, Kmer start_kmer, @@ -264,6 +278,11 @@ public: AssemblerTraverser(const AssemblerTraverser& other); + void _init_visited() { + visited = std::make_shared(); + push_filter(get_visited_filter(visited)); + } + /** * @brief Get the next symbol. @@ -290,8 +309,55 @@ public: std::string join_contigs(std::string& contig_a, std::string& contig_b, WordLength offset = 0) const; + + void push_helper(KmerHelper helper) + { + helpers.push_back(helper); + } + + KmerHelper pop_helper() + { + KmerHelper back = this->helpers.back(); + this->helpers.pop_back(); + return back; + } + + unsigned int n_helpers() + { + return helpers.size(); + } +}; + + +template +class CompactingAT: public AssemblerTraverser +{ +protected: + + Traverser traverser; + +public: + + explicit CompactingAT(const Hashgraph * ht, + Kmer start_kmer); + + explicit CompactingAT(const Hashgraph * ht, + Kmer start_kmer, + KmerFilter filter); + + explicit CompactingAT(const Hashgraph * ht, + Kmer start_kmer, + KmerFilterList filters); + + explicit CompactingAT(const Hashgraph * ht, + Kmer start_kmer, + KmerFilterList filters, + std::shared_ptr visited); + + virtual char next_symbol(); + }; +} //namespace khmer -} #endif diff --git a/khmer/__init__.py b/khmer/__init__.py index 87a99c180a..412f8fab09 100755 --- a/khmer/__init__.py +++ b/khmer/__init__.py @@ -42,140 +42,47 @@ from khmer._khmer import Read -from khmer._khmer import forward_hash -# tests/test_{functions,countgraph,counting_single}.py - -from khmer._khmer import forward_hash_no_rc # tests/test_functions.py - -from khmer._khmer import reverse_hash # tests/test_functions.py -# tests/counting_single.py - -from khmer._khmer import hash_murmur3 # tests/test_functions.py -from khmer._khmer import hash_no_rc_murmur3 # tests/test_functions.py - -from khmer._khmer import reverse_complement - -from khmer._khmer import get_version_cpp as __version_cpp__ # tests/test_version.py from khmer._khmer import ReadParser # sandbox/to-casava-1.8-fastq.py # tests/test_read_parsers.py,scripts/{filter-abund-single,load-graph}.py # scripts/{abundance-dist-single,load-into-counting}.py -from khmer._khmer import FILETYPES +from khmer._oxli.assembly import (LinearAssembler, SimpleLabeledAssembler, + JunctionCountAssembler) from khmer._oxli.graphs import (Counttable, QFCounttable, Nodetable, CyclicCounttable, SmallCounttable, Countgraph, SmallCountgraph, - Nodegraph) -from khmer._oxli.labeling import GraphLabels -from khmer._oxli.legacy_partitioning import SubsetPartition, PrePartitionInfo -from khmer._oxli.parsing import FastxParser -from khmer._oxli.readaligner import ReadAligner + Nodegraph, _buckets_per_byte) -from khmer._oxli.utils import get_n_primes_near_x, is_prime -import sys +from khmer._oxli.hashing import (forward_hash, forward_hash_no_rc, + reverse_hash, hash_murmur3, + hash_no_rc_murmur3, + reverse_complement) -from struct import pack, unpack +from khmer._oxli.hashset import HashSet -from ._version import get_versions -__version__ = 
get_versions()['version'] -del get_versions +from khmer._oxli.hllcounter import HLLCounter + +from khmer._oxli.labeling import GraphLabels +from khmer._oxli.legacy_partitioning import SubsetPartition, PrePartitionInfo -_buckets_per_byte = { - # calculated by hand from settings in third-part/cqf/gqf.h - 'qfcounttable': 1 / 1.26, - 'countgraph': 1, - 'smallcountgraph': 2, - 'nodegraph': 8, -} +from khmer._oxli.parsing import (FastxParser, SanitizedFastxParser, + BrokenPairedReader) +from khmer._oxli.readaligner import ReadAligner -def extract_nodegraph_info(filename): - """Open the given nodegraph file and return a tuple of information. +from khmer._oxli.utils import get_n_primes_near_x, is_prime, FILETYPES +from khmer._oxli.utils import get_version_cpp as __version_cpp__ - Returns: the k-mer size, the table size, the number of tables, the version - of the table format, and the type of table flag. +import sys - Keyword argument: - filename -- the name of the nodegraph file to inspect - """ - ksize = None - n_tables = None - table_size = None - signature = None - version = None - ht_type = None - occupied = None - - uint_size = len(pack('I', 0)) - uchar_size = len(pack('B', 0)) - ulonglong_size = len(pack('Q', 0)) - - try: - with open(filename, 'rb') as nodegraph: - signature, = unpack('4s', nodegraph.read(4)) - version, = unpack('B', nodegraph.read(1)) - ht_type, = unpack('B', nodegraph.read(1)) - ksize, = unpack('I', nodegraph.read(uint_size)) - n_tables, = unpack('B', nodegraph.read(uchar_size)) - occupied, = unpack('Q', nodegraph.read(ulonglong_size)) - table_size, = unpack('Q', nodegraph.read(ulonglong_size)) - if signature != b"OXLI": - raise ValueError("Node graph '{}' is missing file type " - "signature".format(filename) + str(signature)) - except: - raise ValueError("Node graph '{}' is corrupt ".format(filename)) - - return ksize, round(table_size, -2), n_tables, version, ht_type, occupied - - -def extract_countgraph_info(filename): - """Open the given countgraph file and return a tuple of information. - - Return: the k-mer size, the table size, the number of tables, the bigcount - flag, the version of the table format, and the type of table flag. - Keyword argument: - filename -- the name of the countgraph file to inspect - """ - CgInfo = namedtuple("CgInfo", ['ksize', 'n_tables', 'table_size', - 'use_bigcount', 'version', 'ht_type', - 'n_occupied']) - ksize = None - n_tables = None - table_size = None - signature = None - version = None - ht_type = None - use_bigcount = None - occupied = None - - uint_size = len(pack('I', 0)) - ulonglong_size = len(pack('Q', 0)) - - try: - with open(filename, 'rb') as countgraph: - signature, = unpack('4s', countgraph.read(4)) - version, = unpack('B', countgraph.read(1)) - ht_type, = unpack('B', countgraph.read(1)) - if ht_type != FILETYPES['SMALLCOUNT']: - use_bigcount, = unpack('B', countgraph.read(1)) - else: - use_bigcount = None - ksize, = unpack('I', countgraph.read(uint_size)) - n_tables, = unpack('B', countgraph.read(1)) - occupied, = unpack('Q', countgraph.read(ulonglong_size)) - table_size, = unpack('Q', countgraph.read(ulonglong_size)) - if signature != b'OXLI': - raise ValueError("Count graph file '{}' is missing file type " - "signature. 
".format(filename) + str(signature)) - except: - raise ValueError("Count graph file '{}' is corrupt ".format(filename)) - - return CgInfo(ksize, n_tables, round(table_size, -2), use_bigcount, - version, ht_type, occupied) +from ._version import get_versions +__version__ = get_versions()['version'] +del get_versions def calc_expected_collisions(graph, force=False, max_false_pos=.2): @@ -214,9 +121,3 @@ def calc_expected_collisions(graph, force=False, max_false_pos=.2): return fp_all - -from khmer._oxli.assembly import (LinearAssembler, SimpleLabeledAssembler, - JunctionCountAssembler) -from khmer._oxli.hashset import HashSet -from khmer._oxli.hllcounter import HLLCounter -from khmer._oxli.labeling import GraphLabels diff --git a/khmer/_oxli/__init__.py b/khmer/_oxli/__init__.py index 06d02cd291..e69de29bb2 100644 --- a/khmer/_oxli/__init__.py +++ b/khmer/_oxli/__init__.py @@ -1,6 +0,0 @@ -from .assembly import LinearAssembler -from .hashing import Kmer -from .parsing import Alphabets, Sequence, ReadBundle, UnpairedReadsError -from .parsing import FastxParser, SanitizedFastxParser, SplitPairedReader -from .parsing import BrokenPairedReader, _split_left_right -from .parsing import check_is_left, check_is_right, check_is_pair diff --git a/khmer/_oxli/app.pxd b/khmer/_oxli/app.pxd new file mode 100644 index 0000000000..ea246ebf67 --- /dev/null +++ b/khmer/_oxli/app.pxd @@ -0,0 +1,8 @@ +from khmer._oxli.partitioning cimport StreamingPartitioner +from khmer._oxli.graphs cimport Hashgraph + +cdef class PartitioningApp: + + cdef object args + cdef readonly Hashgraph graph + cdef readonly StreamingPartitioner partitioner diff --git a/khmer/_oxli/app.pyx b/khmer/_oxli/app.pyx new file mode 100644 index 0000000000..928e141d15 --- /dev/null +++ b/khmer/_oxli/app.pyx @@ -0,0 +1,150 @@ +# -*- coding: UTF-8 -*- +import argparse +import itertools +import json +import os +import sys + +from khmer.khmer_args import (build_counting_args, create_countgraph, + sanitize_help) +from khmer.khmer_logger import (configure_logging, log_info, log_error, + log_warn) + +from libcpp cimport bool + +from khmer._oxli.graphs cimport Nodegraph, Countgraph + +from khmer._oxli.partitioning cimport StreamingPartitioner, Component +from khmer._oxli.partitioning import StreamingPartitioner, Component + +from khmer._oxli.parsing cimport BrokenPairedReader, SplitPairedReader, FastxParser +from khmer._oxli.parsing import BrokenPairedReader, SplitPairedReader, FastxParser +from khmer._oxli.sequence cimport Sequence +from khmer._oxli.sequence import Sequence +from khmer._oxli.utils cimport _bstring + +def grouper(n, iterable): + iterable = iter(iterable) + return iter(lambda: list(itertools.islice(iterable, n)), []) + +cdef class PartitioningApp: + + def __init__(self, args=sys.argv[1:]): + self.args = self.parse_args(args) + self.args.write_results = self.args.output_interval > 0 + + self.graph = create_countgraph(self.args) + self.partitioner = StreamingPartitioner(self.graph, tag_density=self.args.tag_density) + + def parse_args(self, args): + parser = build_counting_args(descr='Partition a sample', + citations=['counting', 'SeqAn']) + parser.add_argument('--output-dir', default='partitioned') + parser.add_argument('samples', nargs='+') + parser.add_argument('--save', action='store_true', default=False) + parser.add_argument('--pairing-mode', + choices=['split', 'interleaved', 'single'], + default='split') + parser.add_argument('-Z', dest='norm', default=10, type=int) + parser.add_argument('--output-interval', default=0, type=int) 
+ parser.add_argument('--tag-density', default=None, type=int) + + return sanitize_help(parser).parse_args(args) + + def write_results(self, folder, n, new_kmers): + filename = os.path.join(folder, '{0}.csv'.format(n)) + print('# {0}: {1} tags, {2} components.'.format(n, self.partitioner.n_tags, + self.partitioner.n_components)) + print(' writing results to file -> {0}'.format(filename)) + self.partitioner.write_components(filename) + with open(os.path.join(folder, 'global.csv'), 'a') as fp: + fp.write('{0}, {1}, {2}, {3}\n'.format(n, self.partitioner.n_components, + self.partitioner.n_tags, new_kmers)) + cov_filename = os.path.join(folder, '{0}.coverage.csv'.format(n)) + self.partitioner.write_component_coverage(cov_filename) + + def prep_results_dir(self): + try: + os.mkdir(self.args.output_dir) + except OSError as e: + pass + + if self.args.save: + self.args.save = os.path.join(self.args.output_dir, 'partitioner') + + def write_meta(self, n_sequences, total_kmers): + meta = {'samples': self.args.samples, + 'pairing': self.args.pairing_mode, + 'K': self.args.ksize, + 'tag-density': self.partitioner.tag_density, + 'n_sequences': n_sequences, + 'n_unique_kmers': total_kmers} + if self.args.save: + meta['partitioner'] = self.args.save + + with open(os.path.join(self.args.output_dir, 'meta'), 'w') as fp: + json.dump(meta, fp, indent=4) + + def run(self): + + self.prep_results_dir() + + if self.args.pairing_mode == 'split': + samples = list(grouper(2, self.args.samples)) + for pair in samples: + if len(pair) != 2: + raise ValueError('Must have even number of samples!') + else: + samples = self.args.samples + + cdef int n + cdef int n_sequences = 0 + cdef bool paired + cdef Sequence first, second + cdef int new_kmers = 0 + cdef int total_kmers = 0 + cdef int print_interval = self.args.output_interval if self.args.write_results else 10000 + last = 0 + for group in samples: + if self.args.pairing_mode == 'split': + sample_name = '{0}.{1}'.format(group[0], group[1]) + print('== Starting ({0}) =='.format(sample_name)) + reader = SplitPairedReader(FastxParser(group[0]), + FastxParser(group[1]), + min_length=self.args.ksize) + else: + sample_name = group + print('== Starting {0} =='.format(sample_name)) + reader = BrokenPairedReader(FastxParser(group), min_length=self.args.ksize) + for n, paired, first, second in reader: + + if n % print_interval == 0: + print (n, self.partitioner.n_components, self.partitioner.n_tags) + if self.args.write_results and n > 0 and n % self.args.output_interval == 0: + self.write_results(self.args.output_dir, last+n, new_kmers) + total_kmers += new_kmers + new_kmers = 0 + if paired: + new_kmers += self.partitioner.consume_pair(first.sequence, + second.sequence) + else: + new_kmers += self.partitioner.consume(first.sequence) + last = n + n_sequences += last + if self.args.write_results: + self.write_results(self.args.output_dir, last, new_kmers) + total_kmers += new_kmers + new_kmers = 0 + + if self.args.save: + self.partitioner.save(self.args.save) + + self.write_meta(n_sequences, total_kmers) + + return self.partitioner + + +cdef class DynamicPartitioning(PartitioningApp): + + def run(self): + pass diff --git a/khmer/_oxli/assembly.pxd b/khmer/_oxli/assembly.pxd index 3931c477cb..b616dfd48b 100644 --- a/khmer/_oxli/assembly.pxd +++ b/khmer/_oxli/assembly.pxd @@ -21,6 +21,9 @@ cdef extern from "oxli/assembler.hh" namespace "oxli": string assemble_left(const CpKmer) const string assemble_right(const CpKmer) const + cdef cppclass CpCompactingAssembler(CpLinearAssembler): 
+        CpCompactingAssembler(CpHashgraph *)
+
     cdef cppclass CpSimpleLabeledAssembler "oxli::SimpleLabeledAssembler":
         CpSimpleLabeledAssembler(const CpLabelHash *)
@@ -51,6 +54,10 @@ cdef class LinearAssembler:
     cdef str _assemble_right(self, CpKmer start)
 
 
+cdef class CompactingAssembler(LinearAssembler):
+    pass
+
+
 cdef class SimpleLabeledAssembler:
     cdef shared_ptr[CpSimpleLabeledAssembler] _this
diff --git a/khmer/_oxli/assembly.pyx b/khmer/_oxli/assembly.pyx
index b1deb61793..153bf65502 100644
--- a/khmer/_oxli/assembly.pyx
+++ b/khmer/_oxli/assembly.pyx
@@ -54,6 +54,17 @@ cdef class LinearAssembler:
         return self._assemble_right(_seed)
 
 
+cdef class CompactingAssembler(LinearAssembler):
+
+    def __cinit__(self, Hashgraph graph not None, Hashgraph stop_filter=None):
+        self.graph = graph
+        self._graph_ptr = graph._hg_this
+        self.set_stop_filter(stop_filter=stop_filter)
+
+        if type(self) is CompactingAssembler:
+            self._this = make_shared[CpCompactingAssembler](self._graph_ptr.get())
+
+
 cdef class SimpleLabeledAssembler:
 
     def __cinit__(self, GraphLabels labels not None, Hashgraph stop_filter=None):
diff --git a/khmer/_oxli/cdbg.pxd b/khmer/_oxli/cdbg.pxd
new file mode 100644
index 0000000000..043139f2e3
--- /dev/null
+++ b/khmer/_oxli/cdbg.pxd
@@ -0,0 +1,176 @@
+cimport cython
+from libcpp cimport bool
+from libcpp.memory cimport shared_ptr
+from libcpp.list cimport list as stdlist
+from libcpp.pair cimport pair
+from libcpp.set cimport set
+from libcpp.string cimport string
+from libcpp.unordered_set cimport unordered_set as uset
+from libcpp.unordered_map cimport unordered_map as umap
+from libcpp.vector cimport vector
+from libc.stdint cimport uint8_t, uint32_t, uint64_t
+
+from khmer._oxli.oxli_types cimport *
+from khmer._oxli.hashing cimport CpKmer, Kmer, CpKmerFactory
+from khmer._oxli.graphs cimport CpHashgraph, Hashgraph, Nodegraph, Countgraph
+
+
+cdef extern from "oxli/cdbg.hh":
+    cdef uint64_t NULL_ID
+
+cdef extern from "oxli/cdbg.hh" namespace "oxli" nogil:
+
+    ctypedef uint64_t id_t
+    ctypedef pair[HashIntoType, id_t] HashIDPair
+    ctypedef uset[HashIntoType] UHashSet
+    ctypedef vector[HashIntoType] HashVector
+    ctypedef umap[HashIntoType, id_t] HashIDMap
+
+    ctypedef enum compact_edge_meta_t:
+        FULL
+        TIP
+        ISLAND
+        TRIVIAL
+
+    cdef const char * edge_meta_repr(compact_edge_meta_t)
+
+    cdef cppclass CpCompactEdge "oxli::CompactEdge":
+        const id_t in_node_id
+        const id_t out_node_id
+        const id_t edge_id
+        UHashSet tags
+        compact_edge_meta_t meta
+        string sequence
+
+        CpCompactEdge(id_t, id_t)
+        CpCompactEdge(id_t, id_t, compact_edge_meta_t)
+
+        string rc_sequence()
+        void add_tags(UHashSet&)
+        string tag_viz(WordLength)
+        float tag_density()
+
+    ctypedef pair[HashIntoType, CpCompactEdge*] TagEdgePair
+    ctypedef set[TagEdgePair] TagEdgePairSet
+
+    cdef cppclass CpCompactEdgeFactory "oxli::CompactEdgeFactory" (CpKmerFactory):
+        CpCompactEdgeFactory(WordLength)
+
+        uint64_t n_edges()
+        uint64_t n_updates()
+
+        CpCompactEdge* build_edge(id_t, id_t, compact_edge_meta_t,
+                                  string)
+        void delete_edge(CpCompactEdge*)
+        void delete_edge(UHashSet&)
+        void delete_edge(HashIntoType)
+        CpCompactEdge* get_edge(HashIntoType)
+        bool get_tag_edge_pair(HashIntoType, TagEdgePair&)
+        CpCompactEdge* get_edge(UHashSet&)
+
+    cdef cppclass CpCompactNode "oxli::CompactNode":
+        CpKmer kmer
+        uint32_t count
+        const id_t node_id
+        string sequence
+
+        CpCompactEdge* in_edges[4]
+        CpCompactEdge* out_edges[4]
+
+        CpCompactNode(CpKmer, id_t)
+        CpCompactNode(CpKmer, string, id_t)
+
+        void add_in_edge(const char, CpCompactEdge*)
+        bool delete_in_edge(CpCompactEdge*)
+        CpCompactEdge* get_in_edge(const char)
+
void add_out_edge(const char, CpCompactEdge*) + bool delete_out_edge(CpCompactEdge*) + CpCompactEdge* get_out_edge(const char) + bool delete_edge(const char) + + uint8_t degree() + uint8_t out_degree() + uint8_t in_degree() + + ctypedef vector[CpCompactNode] CompactNodeVector + + cdef cppclass CpCompactNodeFactory "oxli::CompactNodeFactory" (CpKmerFactory): + CpCompactNodeFactory(WordLength) + uint64_t n_nodes() + uint64_t n_updates() + + CpCompactNode* build_node(CpKmer) + CpCompactNode* get_node_by_kmer(HashIntoType) + CpCompactNode* get_node_by_id(id_t) + CpCompactNode* get_or_build_node(CpKmer) + vector[CpCompactNode*] get_nodes(const string&) + + void unlink_edge(CpCompactEdge*) + + bool is_rc_from_left(CpCompactNode* v, string&) + bool get_pivot_from_left(CpCompactNode*, string&, char&) + bool add_edge_from_left(CpCompactNode*, CpCompactEdge*) + bool get_edge_from_left(CpCompactNode*, CpCompactEdge* &, string&) + + bool is_rc_from_right(CpCompactNode* v, string&) + bool get_pivot_from_right(CpCompactNode*, string&, char&) + bool add_edge_from_right(CpCompactNode*, CpCompactEdge*) + bool get_edge_from_right(CpCompactNode*, CpCompactEdge* &, string&) + + cdef cppclass CpStreamingCompactor "oxli::StreamingCompactor": + shared_ptr[CpHashgraph] graph + + CpStreamingCompactor(shared_ptr[CpHashgraph]) + void report() + uint64_t n_nodes() + uint64_t n_edges() + uint64_t n_updates() + + CpCompactNode* get_node_by_kmer(HashIntoType) + CpCompactNode* get_node_by_id(id_t) + vector[CpCompactNode*] get_nodes(const string&) + + CpCompactEdge* get_edge(HashIntoType) + bool get_tag_edge_pair(id_t, TagEdgePair&) + CpCompactEdge* get_edge(UHashSet&) + + uint64_t update_compact_dbg(const string&) + uint64_t consume_sequence(const string&) + uint64_t consume_sequence_and_update(const string&) + + void write_gml(string) + void write_fasta(string) + + +cdef class CompactNode: + cdef CpCompactNode* _cn_this + cdef public Kmer kmer + + @staticmethod + cdef CompactNode _wrap(CpCompactNode*) + + +cdef class CompactNodeFactory: + cdef CpCompactNodeFactory * _cnf_this + @staticmethod + cdef CompactNodeFactory _wrap(CpCompactNodeFactory*) + + +cdef class CompactEdge: + cdef CpCompactEdge* _ce_this + + @staticmethod + cdef CompactEdge _wrap(CpCompactEdge*) + + +cdef class CompactEdgeFactory: + cdef CpCompactEdgeFactory* _cef_this + @staticmethod + cdef CompactEdgeFactory _wrap(CpCompactEdgeFactory*) + + +cdef class StreamingCompactor: + + cdef shared_ptr[CpHashgraph] _graph + cdef shared_ptr[CpStreamingCompactor] _sc_this + diff --git a/khmer/_oxli/cdbg.pyx b/khmer/_oxli/cdbg.pyx new file mode 100644 index 0000000000..ca967cf198 --- /dev/null +++ b/khmer/_oxli/cdbg.pyx @@ -0,0 +1,225 @@ +from cython.operator cimport dereference as deref +from libcpp.memory cimport make_shared + +from khmer._oxli.utils cimport _bstring, _ustring +from khmer._oxli.sequence cimport Alphabets + + +cdef class CompactEdge: + + @staticmethod + cdef CompactEdge _wrap(CpCompactEdge* _edge): + cdef CompactEdge edge = CompactEdge() + edge._ce_this = _edge + return edge + + def tags(self): + cdef HashIntoType tag + for tag in deref(self._ce_this).tags: + yield tag + + @property + def edge_type(self): + cdef compact_edge_meta_t meta = deref(self._ce_this).meta + if meta == FULL: + return 'FULL' + elif meta == TIP: + return 'TIP' + elif meta == ISLAND: + return 'ISLAND' + elif meta == TRIVIAL: + return 'TRIVIAL' + else: + raise ValueError('Malformed edge metadata') + + @property + def sequence(self): + return deref(self._ce_this).sequence + + 
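+    # Illustrative usage (annotation; 'compactor' stands for an assumed
+    # StreamingCompactor instance and 'seq' for a DNA string):
+    #
+    #     for node in compactor.sequence_nodes(seq):
+    #         for base, edge in node.out_edges():
+    #             print(base, edge.edge_type, len(edge))
+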
+    def in_node_id(self):
+        cdef uint64_t nid = deref(self._ce_this).in_node_id
+        return None if nid == NULL_ID else nid
+
+    def out_node(self):
+        cdef uint64_t nid = deref(self._ce_this).out_node_id
+        return None if nid == NULL_ID else nid
+
+    def __len__(self):
+        return deref(self._ce_this).sequence.length()
+
+    def __str__(self):
+        return 'CompactEdge: L={0} sequence={1}'.format(len(self), self.sequence)
+
+    def __repr__(self):
+        return str(self)
+
+
+cdef class CompactEdgeFactory:
+
+    @staticmethod
+    cdef CompactEdgeFactory _wrap(CpCompactEdgeFactory* _this):
+        cdef CompactEdgeFactory factory = CompactEdgeFactory()
+        factory._cef_this = _this
+        return factory
+
+
+cdef class CompactNode:
+
+    def __cinit__(self):
+        self.kmer = None
+
+    @staticmethod
+    cdef CompactNode _wrap(CpCompactNode* _node):
+        cdef CompactNode node = CompactNode()
+        node._cn_this = _node
+        return node
+
+    @property
+    def count(self):
+        return deref(self._cn_this).count
+
+    @property
+    def node_id(self):
+        return deref(self._cn_this).node_id
+
+    @property
+    def out_degree(self):
+        return deref(self._cn_this).out_degree()
+
+    @property
+    def in_degree(self):
+        return deref(self._cn_this).in_degree()
+
+    @property
+    def degree(self):
+        return deref(self._cn_this).degree()
+
+    @property
+    def ID(self):
+        return deref(self._cn_this).node_id
+
+    @property
+    def kmer_hash(self):
+        return deref(self._cn_this).kmer.kmer_u
+
+    @property
+    def sequence(self):
+        return deref(self._cn_this).sequence
+
+    def node_kmer(self, WordLength K):
+        if self.kmer is None:
+            self.kmer = Kmer.wrap(&deref(self._cn_this).kmer, K)
+        return self.kmer
+
+    def out_edges(self):
+        cdef string bases = Alphabets._get('DNA_SIMPLE')
+        cdef char base
+        cdef CpCompactEdge * edge
+        for base in bases:
+            edge = deref(self._cn_this).get_out_edge(base)
+            if edge != NULL:
+                yield base, CompactEdge._wrap(edge)
+
+    def in_edges(self):
+        cdef string bases = Alphabets._get('DNA_SIMPLE')
+        cdef char base
+        cdef CpCompactEdge * edge
+        for base in bases:
+            edge = deref(self._cn_this).get_in_edge(base)
+            if edge != NULL:
+                yield base, CompactEdge._wrap(edge)
+
+    def __str__(self):
+        return 'CompactNode: ID={0} count={1} in_degree={2}'\
+               ' out_degree={3} sequence={4}'.format(self.kmer, self.count,
+                                                     self.in_degree,
+                                                     self.out_degree,
+                                                     self.sequence)
+
+
+cdef class CompactNodeFactory:
+
+    @staticmethod
+    def new(WordLength ksize):
+        cdef CpCompactNodeFactory* factory = new CpCompactNodeFactory(ksize)
+        return CompactNodeFactory._wrap(factory)
+
+    @staticmethod
+    cdef CompactNodeFactory _wrap(CpCompactNodeFactory* _this):
+        cdef CompactNodeFactory factory = CompactNodeFactory()
+        factory._cnf_this = _this
+        return factory
+
+    def build_node(self, Kmer kmer):
+        cdef CpCompactNode* _node = \
+            deref(self._cnf_this).build_node(deref(kmer._this.get()))
+        return CompactNode._wrap(_node)
+
+    def get_pivot_from_left(self, CompactNode node, str sequence):
+        cdef string _sequence = _bstring(sequence)
+        cdef char pivot
+        cdef bool pivoted
+        pivoted = deref(self._cnf_this).get_pivot_from_left(node._cn_this,
+                                                            _sequence,
+                                                            pivot)
+        return (<bytes>pivot).decode('UTF-8'), pivoted
+
+    def get_pivot_from_right(self, CompactNode node, str sequence):
+        cdef string _sequence = _bstring(sequence)
+        cdef char pivot
+        cdef bool pivoted
+        pivoted = deref(self._cnf_this).get_pivot_from_right(node._cn_this,
+                                                             _sequence,
+                                                             pivot)
+        return (<bytes>pivot).decode('UTF-8'), pivoted
+
+
+cdef class StreamingCompactor:
+
+    def __cinit__(self, Hashgraph graph):
+        self._graph = graph._hg_this
+
+        if type(self) is
StreamingCompactor: + self._sc_this = make_shared[CpStreamingCompactor](self._graph) + + def update(self, str sequence): + cdef string _sequence = _bstring(sequence) + return deref(self._sc_this).update_compact_dbg(_sequence) + + def consume(self, str sequence): + cdef string _sequence = _bstring(sequence) + return deref(self._sc_this).consume_sequence(_sequence) + + def consume_and_update(self, str sequence): + cdef string _sequence = _bstring(sequence) + return deref(self._sc_this).consume_sequence_and_update(_sequence) + + def sequence_nodes(self, str sequence): + cdef string _sequence = _bstring(sequence) + cdef vector[CpCompactNode*] nodes = deref(self._sc_this).get_nodes(_sequence) + cdef CpCompactNode* node + for node in nodes: + yield CompactNode._wrap(node) + + def report(self): + deref(self._sc_this).report() + + @property + def n_nodes(self): + return deref(self._sc_this).n_nodes() + + @property + def n_edges(self): + return deref(self._sc_this).n_edges() + + @property + def n_updates(self): + return deref(self._sc_this).n_updates() + + def write_gml(self, str filename): + cdef string _filename = _bstring(filename) + deref(self._sc_this).write_gml(_filename) + + def write_fasta(self, str filename): + cdef string _filename = _bstring(filename) + deref(self._sc_this).write_fasta(_filename) diff --git a/khmer/_oxli/graphs.pxd b/khmer/_oxli/graphs.pxd index ce4d290e86..0097124422 100644 --- a/khmer/_oxli/graphs.pxd +++ b/khmer/_oxli/graphs.pxd @@ -10,6 +10,7 @@ from khmer._oxli.hashing cimport Kmer, CpKmer, KmerSet, CpKmerFactory, CpKmerIte from khmer._oxli.parsing cimport CpReadParser, CpSequence, FastxParserPtr from khmer._oxli.legacy_partitioning cimport (CpSubsetPartition, cp_pre_partition_info, SubsetPartition) +from khmer._oxli.sequence cimport Sequence from khmer._oxli.utils cimport oxli_raise_py_error @@ -35,6 +36,7 @@ cdef extern from "oxli/storage.hh": void set_use_bigcount(bool) bool get_use_bigcount() + void reset() cdef extern from "oxli/hashtable.hh" namespace "oxli" nogil: @@ -102,6 +104,7 @@ cdef extern from "oxli/hashtable.hh" namespace "oxli" nogil: uint64_t trim_below_abundance(string, BoundedCounterType) const vector[uint32_t] find_spectral_error_positions(string, BoundedCounterType) + void reset() cdef cppclass CpMurmurHashtable "oxli::MurmurHashtable" (CpHashtable): CpMurmurHashtable(WordLength, CpStorage *) @@ -255,6 +258,8 @@ cdef class Hashtable: cdef FastxParserPtr _get_parser(self, object parser_or_filename) except * cdef list _get_raw_tables(self, uint8_t **, vector[uint64_t]) + cdef int _trim_on_abundance(self, Sequence sequence, int abundance) + cdef class QFCounttable(Hashtable): cdef shared_ptr[CpQFCounttable] _qf_this diff --git a/khmer/_oxli/graphs.pyx b/khmer/_oxli/graphs.pyx index 992ae526ae..026f2aa08d 100644 --- a/khmer/_oxli/graphs.pyx +++ b/khmer/_oxli/graphs.pyx @@ -1,4 +1,6 @@ from math import log +from struct import pack, unpack +from collections import namedtuple from cython.operator cimport dereference as deref from cpython.buffer cimport (PyBuffer_FillInfo, PyBUF_FULL_RO) @@ -11,22 +13,31 @@ from libcpp.set cimport set from libcpp.string cimport string from khmer._oxli.utils cimport _bstring, is_str, is_num -from khmer._oxli.utils import get_n_primes_near_x -from khmer._oxli.parsing cimport (CpFastxReader, CPyReadParser_Object, - get_parser, CpReadParser, FastxParser, - FastxParserPtr) +from khmer._oxli.utils import get_n_primes_near_x, FILETYPES +from khmer._oxli.parsing cimport (CpFastxReader, CPyReadParser_Object, get_parser, + 
                                   CpReadParser, FastxParserPtr, FastxParser)
+
 from khmer._oxli.hashset cimport HashSet
 from khmer._oxli.legacy_partitioning cimport (CpSubsetPartition,
                                               SubsetPartition,
                                               cp_pre_partition_info,
                                               PrePartitionInfo)
 from khmer._oxli.oxli_types cimport MAX_BIGCOUNT, HashIntoType
+from khmer._oxli.sequence cimport Sequence
 from khmer._oxli.traversal cimport Traverser
 from khmer._khmer import ReadParser
 
+
 CYTHON_TABLES = (Hashtable, Nodetable, Counttable, CyclicCounttable,
                  SmallCounttable, QFCounttable, Nodegraph, Countgraph,
                  SmallCountgraph)
 
+_buckets_per_byte = {
+    # calculated by hand from settings in third-party/cqf/gqf.h
+    'qfcounttable': 1 / 1.26,
+    'countgraph': 1,
+    'smallcountgraph': 2,
+    'nodegraph': 8,
+}
 
 cdef class Hashtable:
@@ -200,6 +211,12 @@ cdef class Hashtable:
         trimmed_at = deref(self._ht_this).trim_on_abundance(data, abundance)
         return sequence[:trimmed_at], trimmed_at
 
+    cdef int _trim_on_abundance(self, Sequence sequence, int abundance):
+        trimmed_at = \
+            deref(self._ht_this).trim_on_abundance(sequence._obj.cleaned_seq,
+                                                   abundance)
+        return trimmed_at
+
     def trim_below_abundance(self, str sequence, int abundance):
         """Trim sequence at first k-mer above the given abundance."""
         cdef bytes data = self._valid_sequence(sequence)
@@ -232,6 +249,7 @@ cdef class Hashtable:
         cdef unsigned long long n_consumed = 0
         cdef unsigned int total_reads = 0
         cdef FastxParserPtr _parser = self._get_parser(parser_or_filename)
+
         with nogil:
             deref(self._ht_this).consume_seqfile[CpFastxReader](\
                 _parser, total_reads, n_consumed
@@ -256,6 +274,7 @@ cdef class Hashtable:
         cdef unsigned long long n_consumed = 0
         cdef unsigned int total_reads = 0
         cdef FastxParserPtr _parser = self._get_parser(parser_or_filename)
+
        with nogil:
             deref(self._ht_this).consume_seqfile_banding[CpFastxReader](\
                 _parser, num_bands, band, total_reads, n_consumed
@@ -344,6 +363,9 @@ cdef class Hashtable:
         cdef vector[uint64_t] sizes = deref(self._ht_this).get_tablesizes()
         return self._get_raw_tables(table_ptrs, sizes)
 
+    def reset(self):
+        deref(self._ht_this).reset()
+
 
 cdef class QFCounttable(Hashtable):
     """Count kmers using a counting quotient filter.
@@ -387,6 +409,9 @@ cdef class QFCounttable(Hashtable):
             deref(table._qf_this).load(_bstring(file_name))
         return table
 
+    def reset(self):
+        raise NotImplementedError()
+
 
 cdef class Counttable(Hashtable):
 
     def __cinit__(self, int k, uint64_t starting_size, int n_tables):
@@ -396,6 +421,53 @@ cdef class Counttable(Hashtable):
         self._ct_this = make_shared[CpCounttable](k, primes)
         self._ht_this = self._ct_this
 
+    @staticmethod
+    def extract_info(filename):
+        """Open the given countgraph file and return a tuple of information.
+
+        Return: the k-mer size, the table size, the number of tables, the bigcount
+        flag, the version of the table format, and the type of table flag.
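+
+        Illustrative use (the filename here is hypothetical):
+
+            info = Counttable.extract_info('reads.ct')
+            info.ksize, info.n_tables, info.use_bigcount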
+ + Keyword argument: + filename -- the name of the countgraph file to inspect + """ + CgInfo = namedtuple("CgInfo", ['ksize', 'n_tables', 'table_size', + 'use_bigcount', 'version', 'ht_type', + 'n_occupied']) + ksize = None + n_tables = None + table_size = None + signature = None + version = None + ht_type = None + use_bigcount = None + occupied = None + + uint_size = len(pack('I', 0)) + ulonglong_size = len(pack('Q', 0)) + + try: + with open(filename, 'rb') as countgraph: + signature, = unpack('4s', countgraph.read(4)) + version, = unpack('B', countgraph.read(1)) + ht_type, = unpack('B', countgraph.read(1)) + if ht_type != FILETYPES['SMALLCOUNT']: + use_bigcount, = unpack('B', countgraph.read(1)) + else: + use_bigcount = None + ksize, = unpack('I', countgraph.read(uint_size)) + n_tables, = unpack('B', countgraph.read(1)) + occupied, = unpack('Q', countgraph.read(ulonglong_size)) + table_size, = unpack('Q', countgraph.read(ulonglong_size)) + if signature != b'OXLI': + raise ValueError("Count graph file '{}' is missing file type " + "signature. ".format(filename) + str(signature)) + except: + raise ValueError("Count graph file '{}' is corrupt ".format(filename)) + + return CgInfo(ksize, n_tables, round(table_size, -2), use_bigcount, + version, ht_type, occupied) + cdef class CyclicCounttable(Hashtable): @@ -423,6 +495,10 @@ cdef class SmallCounttable(Hashtable): sizes[i] = (sizes[i] // 2) + 1 return self._get_raw_tables(table_ptrs, sizes) + @staticmethod + def extract_info(filename): + return Counttable.extract_info(filename) + cdef class Nodetable(Hashtable): @@ -433,6 +509,47 @@ cdef class Nodetable(Hashtable): self._nt_this = make_shared[CpNodetable](k, primes) self._ht_this = self._nt_this + @staticmethod + def extract_info(filename): + """Open the given nodegraph file and return a tuple of information. + + Returns: the k-mer size, the table size, the number of tables, the version + of the table format, and the type of table flag. 
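+
+        Illustrative use (the filename here is hypothetical):
+
+            ksize, table_size, n_tables, version, ht_type, occupied = \
+                Nodetable.extract_info('reads.ng')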
+ + Keyword argument: + filename -- the name of the nodegraph file to inspect + """ + ksize = None + n_tables = None + table_size = None + signature = None + version = None + ht_type = None + occupied = None + + uint_size = len(pack('I', 0)) + uchar_size = len(pack('B', 0)) + ulonglong_size = len(pack('Q', 0)) + + try: + with open(filename, 'rb') as nodegraph: + signature, = unpack('4s', nodegraph.read(4)) + version, = unpack('B', nodegraph.read(1)) + ht_type, = unpack('B', nodegraph.read(1)) + ksize, = unpack('I', nodegraph.read(uint_size)) + n_tables, = unpack('B', nodegraph.read(uchar_size)) + occupied, = unpack('Q', nodegraph.read(ulonglong_size)) + table_size, = unpack('Q', nodegraph.read(ulonglong_size)) + if signature != b"OXLI": + raise ValueError("Node graph '{}' is missing file type " + "signature".format(filename) + str(signature)) + except: + raise ValueError("Node graph '{}' is corrupt ".format(filename)) + + return ksize, round(table_size, -2), n_tables, version, ht_type, occupied + + + cdef class Hashgraph(Hashtable): @@ -826,6 +943,12 @@ cdef class Countgraph(Hashgraph): return subset + @staticmethod + def extract_info(filename): + return Counttable.extract_info(filename) + + + cdef class SmallCountgraph(Hashgraph): @@ -848,6 +971,9 @@ cdef class SmallCountgraph(Hashgraph): sizes[i] = sizes[i] // 2 + 1 return self._get_raw_tables(table_ptrs, sizes) + @staticmethod + def extract_info(filename): + return Counttable.extract_info(filename) cdef class Nodegraph(Hashgraph): @@ -866,3 +992,7 @@ cdef class Nodegraph(Hashgraph): def update(self, Nodegraph other): deref(self._ng_this).update_from(deref(other._ng_this)) + + @staticmethod + def extract_info(filename): + return Nodetable.extract_info(filename) diff --git a/khmer/_oxli/hashing.pxd b/khmer/_oxli/hashing.pxd index e0bd6bcf16..46d9fb7cf9 100644 --- a/khmer/_oxli/hashing.pxd +++ b/khmer/_oxli/hashing.pxd @@ -1,6 +1,6 @@ from libcpp cimport bool from libcpp.memory cimport shared_ptr -from libcpp.queue cimport queue +from libcpp.deque cimport deque from libcpp.set cimport set from libcpp.string cimport string @@ -50,11 +50,12 @@ cdef extern from "oxli/kmer_hash.hh" namespace "oxli": HashIntoType _hash_murmur(const string&, const WordLength) HashIntoType _hash_murmur(const string&, HashIntoType&, HashIntoType&) - HashIntoType _hash_murmur_forward(const string&) + HashIntoType _hash_murmur_forward(const string&, + const WordLength) cdef extern from "oxli/oxli.hh" namespace "oxli": - ctypedef queue[CpKmer] KmerQueue + ctypedef deque[CpKmer] KmerQueue ctypedef set[CpKmer] KmerSet ctypedef bool (*KmerFilter) (CpKmer kmer) @@ -65,3 +66,23 @@ cdef class Kmer: @staticmethod cdef Kmer wrap(CpKmer * cpkmer, WordLength K) + @staticmethod + cdef Kmer wrap_partial(CpKmer *cpkmer) + + +cpdef HashIntoType forward_hash(object kmer, unsigned int K) + + +cpdef HashIntoType forward_hash_no_rc(object kmer, WordLength K) + + +cpdef str reverse_hash(object h, int K) + + +cpdef str reverse_complement(object sequence) + + +cpdef hash_murmur3(object s) + + +cpdef hash_no_rc_murmur3(object s) diff --git a/khmer/_oxli/hashing.pyx b/khmer/_oxli/hashing.pyx index 0035eca73c..996ab9d839 100644 --- a/khmer/_oxli/hashing.pyx +++ b/khmer/_oxli/hashing.pyx @@ -6,6 +6,8 @@ from libc.stdint cimport uint64_t from cython.operator cimport dereference as deref from khmer._oxli.oxli_types cimport * +from khmer._oxli.utils cimport _bstring, _ustring + cdef class Kmer: @@ -57,9 +59,62 @@ cdef class Kmer: kmer.kmer = _revhash(kmer.kmer_u, K) return kmer + @staticmethod 
+ cdef Kmer wrap_partial(CpKmer* cpkmer): + cdef Kmer kmer = Kmer() + kmer._this.reset(cpkmer) + kmer.kmer = "" + return kmer + @staticmethod def create(HashIntoType tag, WordLength K): cdef Kmer kmer = Kmer() deref(kmer._this).set_from_unique_hash(tag, K) kmer.kmer = _revhash(kmer.kmer_u, K) return kmer + + +cpdef HashIntoType forward_hash(object kmer, unsigned int K): + '''Run the 2-bit hash algorithm on the given K-mer.''' + + if K > 32: + raise ValueError("k-mer size must be <= 32") + if len(kmer) != K: + raise ValueError("k-mer length must equal K") + + return _hash(_bstring(kmer), K) + + +cpdef HashIntoType forward_hash_no_rc(object kmer, WordLength K): + '''Run the 2-bit hash function in only the given + sequence orientation.''' + + if K > 32: + raise ValueError("k-mer size must be <= 32") + if len(kmer) != K: + raise ValueError("k-mer length must equal K") + + return _hash_forward(_bstring(kmer), K) + + +cpdef str reverse_hash(object h, int K): + if K > 32: + raise ValueError("k-mer size must be <= 32") + + cdef HashIntoType _h = h + return _revhash(_h, K) + + +cpdef str reverse_complement(object sequence): + cdef string s = _revcomp(_bstring(sequence)) + return s + + +cpdef hash_murmur3(object s): + cdef HashIntoType h = _hash_murmur(_bstring(s), len(s)) + return h + + +cpdef hash_no_rc_murmur3(object s): + cdef HashIntoType h = _hash_murmur_forward(_bstring(s), len(s)) + return h diff --git a/khmer/_oxli/hist.pxd b/khmer/_oxli/hist.pxd new file mode 100644 index 0000000000..2b8739cce1 --- /dev/null +++ b/khmer/_oxli/hist.pxd @@ -0,0 +1,11 @@ +from libc.stdint cimport uint64_t + +cdef extern from "oxli/hist.hh" namespace "oxli": + + cdef cppclass CpHistogram "oxli::Histogram<16>": + uint64_t[16] bins + + CpHistogram() + + void add(uint64_t) + void clear() diff --git a/khmer/_oxli/legacy_partitioning.pxd b/khmer/_oxli/legacy_partitioning.pxd index 5ea499fcb4..2b4c8e262f 100644 --- a/khmer/_oxli/legacy_partitioning.pxd +++ b/khmer/_oxli/legacy_partitioning.pxd @@ -74,7 +74,7 @@ cdef extern from "oxli/subset.hh" nogil: unsigned long long repartition_largest_partition(unsigned int, unsigned int, unsigned int, - CpCountgraph&) + CpCountgraph&) except +oxli_raise_py_error void repartition_a_partition(const HashIntoTypeSet &) except +oxli_raise_py_error void _clear_partition(PartitionID, HashIntoTypeSet &) void _merge_other(HashIntoType, PartitionID, PartitionPtrMap &) diff --git a/khmer/_oxli/parsing.pxd b/khmer/_oxli/parsing.pxd index fe2ad3d57b..94b12c0ce8 100644 --- a/khmer/_oxli/parsing.pxd +++ b/khmer/_oxli/parsing.pxd @@ -9,52 +9,17 @@ from libcpp.utility cimport pair from libcpp.string cimport string from khmer._oxli.utils cimport oxli_raise_py_error +from khmer._oxli.sequence cimport Sequence, CpSequence, CpSequencePair ''' extern declarations for liboxli. 
'''
-# C++ ostream wrapper code stolen shamelessly from stackoverflow
-# http://stackoverflow.com/questions/30984078/cython-working-with-c-streams
-# We need ostream to wrap ReadParser
-cdef extern from "<ostream>" namespace "std":
-    cdef cppclass ostream:
-        ostream& write(const char*, int) except +
-
-# obviously std::ios_base isn't a namespace, but this lets
-# Cython generate the connect C++ code
-cdef extern from "<fstream>" namespace "std::ios_base":
-    cdef cppclass open_mode:
-        pass
-    cdef open_mode binary
-    # you can define other constants as needed
-
-
-cdef extern from "<fstream>" namespace "std":
-    cdef cppclass ofstream(ostream):
-        # constructors
-        ofstream(const char*) except +
-        ofstream(const char*, open_mode) except+
-
-
-cdef extern from "oxli/read_parsers.hh" namespace "oxli::read_parsers":
-    cdef cppclass CpSequence "oxli::read_parsers::Read":
-        string name
-        string description
-        string sequence
-        string quality
-        string cleaned_seq
-
-        void reset()
-        void write_fastx(ostream&)
-        void set_cleaned_seq()
-
-    ctypedef pair[CpSequence,CpSequence] CpSequencePair \
-        "oxli::read_parsers::ReadPair"
+cdef extern from "oxli/read_parsers.hh" namespace "oxli::read_parsers" nogil:
 
     cdef cppclass CpReadParser "oxli::read_parsers::ReadParser" [SeqIO]:
-        CpReadParser(unique_ptr[SeqIO]) except+
+        CpReadParser(unique_ptr[SeqIO]) except +oxli_raise_py_error
         CpReadParser(CpReadParser&)
         CpReadParser& operator=(CpReadParser&)
         CpReadParser(CpReadParser&&)
@@ -69,8 +34,8 @@ cdef extern from "oxli/read_parsers.hh" namespace "oxli::read_parsers":
         void close()
 
     cdef cppclass CpFastxReader "oxli::read_parsers::FastxReader":
-        CpFastxReader() except+
-        CpFastxReader(const string&) except+
+        CpFastxReader() except +oxli_raise_py_error
+        CpFastxReader(const string&) except +oxli_raise_py_error
         CpFastxReader(CpFastxReader&)
         CpFastxReader& operator=(CpFastxReader&)
@@ -94,34 +59,6 @@ cdef extern from "khmer/_cpy_khmer.hh":
     FastxParserPtr parser
 
-cdef extern from "oxli/alphabets.hh" namespace "oxli":
-    cdef string DNA_SIMPLE "oxli::alphabets::DNA_SIMPLE"
-    cdef string DNAN_SIMPLE "oxli::alphabets::DNAN_SIMPLE"
-    cdef string RNA_SIMPLE "oxli::alphabets::RNA_SIMPLE"
-    cdef string RNAN_SIMPLE "oxli::alphabets::RNAN_SIMPLE"
-    cdef string IUPAC_NUCL "oxli::alphabets::IUPAC_NUCL"
-    cdef string IUPAC_AA "oxli::alphabets::IUPAC_AA"
-
-'''
-Extension Classes wrapping liboxli.
-''' - -cdef class Alphabets: - - @staticmethod - cdef string _get(string name) - - -cdef class Sequence: - cdef CpSequence _obj - - @staticmethod - cdef Sequence _wrap(CpSequence cseq) - - -cdef class ReadBundle: - cdef list reads - cdef class FastxParser: cdef shared_ptr[CpReadParser[CpFastxReader]] _this @@ -169,9 +106,3 @@ cdef int _check_is_pair(Sequence first, Sequence second) cpdef bool check_is_left(s) cpdef bool check_is_right(s) - -cdef inline bool is_valid(const char base, string& alphabet) - -cdef inline bool sanitize_sequence(string& sequence, - string& alphabet, - bool convert_n) diff --git a/khmer/_oxli/parsing.pyx b/khmer/_oxli/parsing.pyx index bf646a5ad9..cad16c7889 100644 --- a/khmer/_oxli/parsing.pyx +++ b/khmer/_oxli/parsing.pyx @@ -1,145 +1,17 @@ # -*- coding: UTF-8 -*- - - -from cython.operator cimport dereference as deref cimport cython +from cython.operator cimport dereference as deref from libcpp cimport bool from libcpp.string cimport string import sys from khmer._oxli.utils cimport _bstring, _ustring +from khmer._oxli.sequence cimport (Alphabets, Sequence, CpSequence, + CpSequencePair, ReadBundle, is_valid, + sanitize_sequence) -cdef class Alphabets: - - @staticmethod - def get(name): - cdef unicode alphabet = _ustring(Alphabets._get(_bstring(name))) - if not alphabet: - raise ValueError('No alphabet with name {0}'.format(name)) - return alphabet - - @staticmethod - cdef string _get(string name): - if name == b'DNA_SIMPLE': - return DNA_SIMPLE - elif name == b'DNAN_SIMPLE': - return DNAN_SIMPLE - elif name == b'RNA_SIMPLE': - return RNA_SIMPLE - elif name == b'RNAN_SIMPLE': - return RNAN_SIMPLE - elif name == b'IUPAC_NUCL': - return IUPAC_NUCL - elif name == b'IUPAC_AA': - return IUPAC_AA - else: - return string() - - -@cython.freelist(100) -cdef class Sequence: - - def __cinit__(self, name=None, sequence=None, - quality=None, description=None, - cleaned_seq=None): - - if name is not None and sequence is not None: - self._obj.sequence = _bstring(sequence) - self._obj.name = _bstring(name) - if description is not None: - self._obj.description = _bstring(description) - if quality is not None: - self._obj.quality = _bstring(quality) - if cleaned_seq is not None: - self._obj.cleaned_seq = _bstring(cleaned_seq) - else: - self._obj.cleaned_seq = self._obj.sequence - - def __str__(self): - return repr(self) - - def __repr__(self): - return 'Sequence(name="{0}", sequence="{1}")'.format(self.name, self.sequence) - - def __len__(self): - return self._obj.sequence.length() - - def __richcmp__(x, y, op): - if op == 2: - return x.name == y.name and x.sequence == y.sequence - else: - raise NotImplementedError('Operator not available') - - def kmers(self, int K): - cdef int i = 0 - cdef unicode sequence = self.sequence - for i in range(0, len(self)-K+1): - yield sequence[i:i+K] - - def __getitem__(self, x): - # Definitely optimize this. 
- return self.sequence[x] - - @property - def name(self): - cdef unicode name = self._obj.name - return self._obj.name if name else None - - @property - def sequence(self): - cdef unicode sequence = self._obj.sequence - return self._obj.sequence if sequence else None - - @property - def description(self): - cdef unicode description = self._obj.description - return description if description else None - - @property - def quality(self): - cdef unicode quality = self._obj.quality - return quality if quality else None - - @property - def cleaned_seq(self): - cdef unicode cleaned_seq = self._obj.cleaned_seq - return cleaned_seq if cleaned_seq else None - - @staticmethod - def from_screed_record(record): - cdef Sequence seq = Sequence(name=record.name, - sequence=record.sequence) - if hasattr(record, 'quality'): - seq._obj.quality = _bstring(record.quality) - - for attr in ('annotations', 'description'): - if hasattr(record, attr): - seq._obj.description = _bstring(getattr(record, attr)) - - return seq - - @staticmethod - cdef Sequence _wrap(CpSequence cseq): - cdef Sequence seq = Sequence() - seq._obj = cseq - return seq - - -cdef class ReadBundle: - - def __cinit__(self, *raw_records): - self.reads = [r for r in raw_records if r] - - @property - def num_reads(self): - return len(self.reads) - - @property - def total_length(self): - return sum([len(r.sequence) for r in self.reads]) - def print_error(msg): """Print the given message to 'stderr'.""" @@ -164,35 +36,18 @@ class UnpairedReadsError(ValueError): self.read2 = r2 -cdef inline bool is_valid(const char base, string& alphabet): - cdef char b - for b in alphabet: - if b == base: - return True - return False - - -cdef inline bool sanitize_sequence(string& sequence, - string& alphabet, - bool convert_n): - cdef int i = 0 - for i in range(sequence.length()): - sequence[i] &= 0xdf - if not is_valid(sequence[i], alphabet): - return False - if convert_n and sequence[i] == b'N': - sequence[i] = b'A' - return True - - cdef class FastxParser: def __cinit__(self, filename, *args, **kwargs): self._this = get_parser[CpFastxReader](_bstring(filename)) + if self.is_complete(): + raise RuntimeError('{0} has no sequences!'.format(filename)) cdef Sequence _next(self): if not self.is_complete(): - return Sequence._wrap(deref(self._this).get_next_read()) + seq = Sequence._wrap(deref(self._this).get_next_read()) + seq.clean() + return seq else: return None @@ -205,6 +60,10 @@ cdef class FastxParser: seq = self._next() yield seq + @property + def num_reads(self): + return deref(self._this).get_num_reads() + cdef class SanitizedFastxParser(FastxParser): @@ -212,7 +71,7 @@ cdef class SanitizedFastxParser(FastxParser): bool convert_n=True): self.n_bad = 0 self.convert_n = convert_n - self._alphabet = Alphabets._get(_bstring(alphabet)) + self._alphabet = Alphabets._get(alphabet) cdef Sequence _next(self): cdef Sequence seq @@ -227,6 +86,7 @@ cdef class SanitizedFastxParser(FastxParser): self.n_bad += 1 return None else: + seq._obj.cleaned_seq = seq._obj.sequence return seq else: return None diff --git a/khmer/_oxli/partitioning.pxd b/khmer/_oxli/partitioning.pxd new file mode 100644 index 0000000000..666ef7aed0 --- /dev/null +++ b/khmer/_oxli/partitioning.pxd @@ -0,0 +1,122 @@ +from libcpp cimport bool +from libcpp.memory cimport unique_ptr, weak_ptr, shared_ptr +from libcpp.unordered_map cimport unordered_map +from libcpp.vector cimport vector +from libcpp.set cimport set +from libcpp.queue cimport queue +from libcpp.string cimport string +from libc.stdint cimport 
uint32_t, uint8_t, uint64_t +from libc.stdio cimport FILE + +from khmer._oxli.hashing cimport CpKmer, Kmer, KmerQueue +from khmer._oxli.hist cimport CpHistogram +from khmer._oxli.graphs cimport CpHashgraph, Hashgraph +from khmer._oxli.oxli_types cimport * + + +cdef extern from "oxli/partitioning.hh" namespace "oxli": + + ctypedef vector[HashIntoType] TagVector + + cdef cppclass CpComponent "oxli::Component": + CpComponent() + CpComponent(uint64_t) + + CpHistogram coverage + const uint64_t component_id + vector[HashIntoType] tags + + void kill() + bool is_alive() const + + void add_tag(HashIntoType) + void add_tags(TagVector&) + + uint64_t get_n_tags() const + uint64_t get_n_created() const + uint64_t get_n_destroyed() const + + void update_coverage(CpHashgraph *) + + ctypedef shared_ptr[CpComponent] ComponentPtr + ctypedef set[ComponentPtr] ComponentPtrSet + ctypedef vector[ComponentPtr] ComponentPtrVector + + cdef cppclass CpGuardedHashCompMap "oxli::GuardedHashCompMap": + unordered_map[HashIntoType, ComponentPtr] data + + ComponentPtr get(HashIntoType) + void set(HashIntoType, ComponentPtr) + bool contains(HashIntoType) + + cdef cppclass CpComponentMap "oxli::ComponentMap": + CpComponentMap(WordLength, WordLength, uint64_t) + + void create_component(TagVector&) + uint32_t create_and_merge_components(TagVector&) + void map_tags_to_component(TagVector&, ComponentPtr&) + uint32_t merge_components(ComponentPtr&, ComponentPtrSet&) + + bool contains(HashIntoType) + ComponentPtr get(HashIntoType) const + + uint64_t get_n_components() const + uint64_t get_n_tags() const + weak_ptr[ComponentPtrVector] get_components() + weak_ptr[CpGuardedHashCompMap] get_tag_component_map() + + cdef cppclass CpStreamingPartitioner "oxli::StreamingPartitioner" (CpComponentMap): + CpStreamingPartitioner(CpHashgraph * ) except +MemoryError + CpStreamingPartitioner(CpHashgraph *, uint32_t) except +MemoryError + + CpHashgraph * graph + uint64_t consume(string&) nogil except +MemoryError + uint64_t consume_pair(string&, string&) nogil except +MemoryError + uint64_t consume_fasta(string&) except +MemoryError + + uint64_t seed_sequence(string&, TagVector&, KmerQueue&, + set[HashIntoType]&) except +MemoryError + + void find_connected_tags(KmerQueue&, + TagVector&, + set[HashIntoType]&) except +MemoryError + + void find_connected_tags(KmerQueue&, + TagVector&, + set[HashIntoType]&, + bool) except +MemoryError + + ComponentPtr find_nearest_component(string&) const + ComponentPtr find_nearest_component(CpKmer) const + + uint64_t get_n_consumed() const + uint32_t get_tag_density() const + + ComponentPtr get(string&) const + + +cdef class Component: + cdef ComponentPtr _this + + cdef void save(self, FILE * fp) + + @staticmethod + cdef Component wrap(ComponentPtr ptr) + + @staticmethod + cdef vector[BoundedCounterType] _tag_counts(ComponentPtr comp, CpHashgraph* graph) + + @staticmethod + cdef float _mean_tag_count(ComponentPtr comp, CpHashgraph * graph) + + @staticmethod + cdef ComponentPtr load(uint64_t component_id, list tags) + + +cdef class StreamingPartitioner: + cdef shared_ptr[CpStreamingPartitioner] _this + cdef weak_ptr[ComponentPtrVector] _components + cdef weak_ptr[CpGuardedHashCompMap] _tag_component_map + cdef public Hashgraph graph + cdef readonly uint64_t n_consumed + diff --git a/khmer/_oxli/partitioning.pyx b/khmer/_oxli/partitioning.pyx new file mode 100644 index 0000000000..138b3dc3e7 --- /dev/null +++ b/khmer/_oxli/partitioning.pyx @@ -0,0 +1,300 @@ +# cython: c_string_type=unicode, c_string_encoding=utf8 
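+# Cython wrappers around oxli::Component and oxli::StreamingPartitioner.
+# A Component collects the tag hashes of one connected subgraph;
+# StreamingPartitioner consumes reads, tags them, and merges components
+# as new sequences connect previously separate subgraphs.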
+import cython
+from cython.operator cimport dereference as deref, preincrement as inc
+
+from libcpp cimport bool
+from libcpp.string cimport string
+from libcpp.vector cimport vector
+from libcpp.map cimport map
+from libcpp.set cimport set
+from libcpp.queue cimport queue
+from libcpp.memory cimport unique_ptr, weak_ptr, shared_ptr, make_shared
+from libcpp.utility cimport pair
+
+from libc.stdint cimport uint32_t, uint8_t, uint64_t
+from libc.limits cimport UINT_MAX
+
+from libc.stdint cimport uintptr_t
+from libc.stdio cimport FILE, fopen, fwrite, fclose, stdout, stderr, fprintf
+
+import json
+import os
+
+from khmer._oxli.graphs cimport Countgraph, Nodegraph
+from khmer._oxli.oxli_types cimport *
+from khmer._oxli.utils cimport _bstring
+
+cdef class Component:
+
+    def __cinit__(self, Component other=None):
+        if other is not None:
+            self._this = other._this
+
+    @property
+    def component_id(self):
+        return deref(self._this).component_id
+
+    @property
+    def _n_created(self):
+        return deref(self._this).get_n_created()
+
+    @property
+    def _n_destroyed(self):
+        return deref(self._this).get_n_destroyed()
+
+    def __repr__(self):
+        status = 'ALIVE' if deref(self._this).is_alive() else 'DEAD'
+        return '<Component ID={0} n_tags={1} {2}>'.format(self.component_id,
+                                                          len(self),
+                                                          status)
+
+    def __len__(self):
+        return deref(self._this).get_n_tags()
+
+    def __iter__(self):
+        cdef HashIntoType tag
+        for tag in deref(self._this).tags:
+            yield tag
+
+    def __hash__(self):
+        return <uintptr_t>self._this.get()
+
+    def __richcmp__(x, y, op):
+        if op == 2:
+            return x.component_id == y.component_id
+        else:
+            raise NotImplementedError('Operator not available.')
+
+    @staticmethod
+    cdef vector[BoundedCounterType] _tag_counts(ComponentPtr comp, CpHashgraph* graph):
+        cdef uint64_t n_tags = deref(comp).get_n_tags()
+        cdef vector[BoundedCounterType] counts
+        counts = vector[BoundedCounterType](n_tags)
+        cdef int idx
+        cdef uint64_t tag
+        for idx, tag in enumerate(deref(comp).tags):
+            counts[idx] = deref(graph).get_count(tag)
+        return counts
+
+    @staticmethod
+    def tag_counts(Component component not None, Countgraph graph not None):
+        return Component._tag_counts(component._this, graph._cg_this.get())
+
+    @staticmethod
+    cdef float _mean_tag_count(ComponentPtr comp, CpHashgraph * graph):
+        cdef uint64_t n_tags = deref(comp).get_n_tags()
+        cdef float acc = 0
+        cdef uint64_t tag
+        for tag in deref(comp).tags:
+            acc += deref(graph).get_count(tag)
+        return acc / n_tags
+
+    cdef void save(self, FILE* fp):
+        cdef HashIntoType tag
+        cdef int i
+
+        fprintf(fp, "{\"component_id\": %llu, \"tags\": [", deref(self._this).component_id)
+        for i, tag in enumerate(deref(self._this).tags):
+            if i != 0:
+                fprintf(fp, ",")
+            fprintf(fp, "%llu", tag)
+        fprintf(fp, "]}")
+
+    @staticmethod
+    cdef ComponentPtr load(uint64_t component_id, list tags):
+        cdef ComponentPtr comp
+        cdef HashIntoType tag
+        cdef int i, N = len(tags)
+        comp.reset(new CpComponent(component_id))
+        for i in range(N):
+            tag = tags[i]
+            deref(comp).add_tag(tag)
+        return comp
+
+    @staticmethod
+    cdef Component wrap(ComponentPtr ptr):
+        cdef Component comp = Component()
+        comp._this = ptr
+        return comp
+
+
+cdef class StreamingPartitioner:
+
+    def __cinit__(self, Hashgraph graph not None, tag_density=None, *args, **kwargs):
+        self.graph = graph
+
+        if tag_density is None:
+            self._this.reset(new CpStreamingPartitioner(self.graph._hg_this.get()))
+        else:
+            self._this.reset(new CpStreamingPartitioner(self.graph._hg_this.get(),
+                                                        tag_density))
+
+        self._tag_component_map = \
+            deref(self._this).get_tag_component_map()
+        self._components = deref(self._this).get_components()
+        self.n_consumed = 0
+
+    def consume(self, str sequence):
+        self.n_consumed += 1
+        return deref(self._this).consume(_bstring(sequence))
+
+    def consume_pair(self, str first, str second):
+        self.n_consumed += 2
+        return deref(self._this).consume_pair(_bstring(first),
+                                              _bstring(second))
+
+    def consume_fasta(self, object filename):
+        return deref(self._this).consume_fasta(_bstring(filename))
+
+    def get(self, kmer):
+        cdef ComponentPtr compptr
+        cdef string _kmer = _bstring(kmer)
+        compptr = deref(self._this).get(_kmer)
+        if compptr == NULL:
+            return None
+        else:
+            return Component.wrap(compptr)
+
+    def find_nearest_component(self, kmer):
+        cdef ComponentPtr compptr
+        cdef string kmer_s = _bstring(kmer)
+        compptr = deref(self._this).find_nearest_component(kmer_s)
+        if compptr == NULL:
+            return None
+        else:
+            return Component.wrap(compptr)
+
+    def components(self):
+        cdef shared_ptr[ComponentPtrVector] locked
+        cdef ComponentPtr cmpptr
+        locked = self._components.lock()
+        if locked:
+            for cmpptr in deref(locked):
+                if cmpptr != NULL:
+                    yield Component.wrap(cmpptr)
+        else:
+            raise MemoryError("Can't lock underlying Component set")
+
+    def tag_components(self):
+        cdef shared_ptr[CpGuardedHashCompMap] locked
+        cdef pair[HashIntoType,ComponentPtr] cpair
+        locked = self._tag_component_map.lock()
+        if locked:
+            for cpair in deref(locked).data:
+                yield cpair.first, Component.wrap(cpair.second)
+        else:
+            raise MemoryError("Can't lock underlying Component set")
+
+    def write_components(self, filename):
+        cdef FILE* fp
+        fp = fopen(filename.encode('utf-8'), 'wb')
+        if fp == NULL:
+            raise IOError('Can\'t open file.')
+
+        cdef ComponentPtr cmpptr
+        cdef shared_ptr[ComponentPtrVector] lockedptr
+        lockedptr = self._components.lock()
+
+        if lockedptr:
+            for cmpptr in deref(lockedptr):
+                if cmpptr == NULL:
+                    continue
+                fprintf(fp, "%llu,%llu,%f\n",
+                        deref(cmpptr).component_id,
+                        deref(cmpptr).get_n_tags(),
+                        Component._mean_tag_count(cmpptr,
+                                                  self.graph._hg_this.get()))
+        fclose(fp)
+
+    def write_component_coverage(self, filename):
+        cdef FILE* fp
+        fp = fopen(filename.encode('utf-8'), 'wb')
+        if fp == NULL:
+            raise IOError('Can\'t open file.')
+
+        cdef ComponentPtr cmpptr
+        cdef shared_ptr[ComponentPtrVector] lockedptr
+        cdef size_t i
+        lockedptr = self._components.lock()
+
+        if lockedptr:
+            for cmpptr in deref(lockedptr):
+                if cmpptr == NULL:
+                    continue
+                deref(cmpptr).update_coverage(self.graph._hg_this.get())
+                fprintf(fp, "%llu",
+                        deref(cmpptr).component_id)
+                for i in range(16):
+                    fprintf(fp, ",%llu", deref(cmpptr).coverage.bins[i])
+                fprintf(fp, "\n")
+        fclose(fp)
+
+    def save(self, filename):
+        graph_filename = '{0}.graph'.format(filename)
+        comp_filename = '{0}.json'.format(filename)
+        bytes_graph_filename = graph_filename.encode('utf-8')
+        cdef char * c_graph_filename = bytes_graph_filename
+        self.graph.save(graph_filename)
+
+        cdef FILE* fp = fopen(comp_filename.encode('utf-8'), 'w')
+        if fp == NULL:
+            raise IOError('Can\'t open file.')
+
+        fprintf(fp, "{\"graph\": \"%s\",\n\"n_components\": %llu,\n",
+                c_graph_filename, deref(self._this).get_n_components())
+        fprintf(fp, "\"n_tags\": %llu,\n", deref(self._this).get_n_tags())
+        fprintf(fp, "\"components\": [\n")
+
+        cdef Component comp
+        cdef int i
+        cdef shared_ptr[ComponentPtrVector] locked
+        locked = self._components.lock()
+        if locked:
+            for i, comp in enumerate(self.components()):
+                if i != 0:
+                    fprintf(fp, ",\n")
+                comp.save(fp)
+        fprintf(fp, "\n]}")
+        fclose(fp)
+    '''
+    @staticmethod
+    def load(filename):
+
+        with open(filename) as fp:
+            data = json.load(fp)
+        directory = os.path.dirname(filename)
+
+        cdef object graph
+        graph_filename = os.path.join(directory, data['graph'])
+        try:
+            graph = load_countgraph(graph_filename)
+            print('Loading', graph_filename, 'as CountGraph')
+        except OSError as e:
+            # maybe it was a nodegraph instead
+            graph = load_nodegraph(graph_filename)
+            print('Loading', graph_filename, 'as NodeGraph')
+
+        partitioner = StreamingPartitioner(graph)
+        cdef ComponentPtr comp_ptr
+        for comp_info in data['components']:
+            comp_ptr = Component.load(comp_info['component_id'],
+                                      comp_info['tags'])
+            deref(partitioner._this).add_component(comp_ptr)
+        return partitioner
+    '''
+
+    @property
+    def component_dict(self):
+        return {comp.component_id: comp for comp in self.components()}
+
+    @property
+    def n_components(self):
+        return deref(self._this).get_n_components()
+
+    @property
+    def n_tags(self):
+        return deref(self._this).get_n_tags()
+
+    @property
+    def tag_density(self):
+        return deref(self._this).get_tag_density()
+
diff --git a/khmer/_oxli/sequence.pxd b/khmer/_oxli/sequence.pxd
new file mode 100644
index 0000000000..d8c8e30937
--- /dev/null
+++ b/khmer/_oxli/sequence.pxd
@@ -0,0 +1,82 @@
+from libcpp cimport bool
+from libcpp.memory cimport shared_ptr
+from libcpp.utility cimport pair
+from libcpp.string cimport string
+
+
+
+# C++ ostream wrapper code stolen shamelessly from stackoverflow
+# http://stackoverflow.com/questions/30984078/cython-working-with-c-streams
+# We need ostream to wrap ReadParser
+cdef extern from "<ostream>" namespace "std":
+    cdef cppclass ostream:
+        ostream& write(const char*, int) except +
+
+# obviously std::ios_base isn't a namespace, but this lets
+# Cython generate the connect C++ code
+cdef extern from "<fstream>" namespace "std::ios_base":
+    cdef cppclass open_mode:
+        pass
+    cdef open_mode binary
+    # you can define other constants as needed
+
+
+cdef extern from "<fstream>" namespace "std":
+    cdef cppclass ofstream(ostream):
+        # constructors
+        ofstream(const char*) except +
+        ofstream(const char*, open_mode) except+
+
+
+cdef extern from "oxli/read_parsers.hh" namespace "oxli::read_parsers":
+    cdef cppclass CpSequence "oxli::read_parsers::Read":
+        string name
+        string description
+        string sequence
+        string quality
+        string cleaned_seq
+
+        void reset()
+        void write_fastx(ostream&)
+        void set_clean_seq()
+
+    ctypedef pair[CpSequence,CpSequence] CpSequencePair \
+        "oxli::read_parsers::ReadPair"
+
+
+cdef extern from "oxli/alphabets.hh" namespace "oxli":
+    cdef string DNA_SIMPLE "oxli::alphabets::DNA_SIMPLE"
+    cdef string DNAN_SIMPLE "oxli::alphabets::DNAN_SIMPLE"
+    cdef string RNA_SIMPLE "oxli::alphabets::RNA_SIMPLE"
+    cdef string RNAN_SIMPLE "oxli::alphabets::RNAN_SIMPLE"
+    cdef string IUPAC_NUCL "oxli::alphabets::IUPAC_NUCL"
+    cdef string IUPAC_AA "oxli::alphabets::IUPAC_AA"
+
+'''
+Extension Classes wrapping liboxli.
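+Sequence wraps oxli::read_parsers::Read; Alphabets exposes the liboxli
+alphabet constants declared above.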
+''' + +cdef class Alphabets: + + @staticmethod + cdef string _get(str name) except * + + +cdef class Sequence: + cdef CpSequence _obj + + @staticmethod + cdef Sequence _wrap(CpSequence cseq) + + +cdef string _object_to_string(object sequence) except * + + +cdef class ReadBundle: + cdef list reads + +cdef bool is_valid(const char base, string& alphabet) + +cdef bool sanitize_sequence(string& sequence, + string& alphabet, + bool convert_n) diff --git a/khmer/_oxli/sequence.pyx b/khmer/_oxli/sequence.pyx new file mode 100644 index 0000000000..ec4443641a --- /dev/null +++ b/khmer/_oxli/sequence.pyx @@ -0,0 +1,191 @@ +# -*- coding: UTF-8 -*- +from cython.operator cimport dereference as deref +cimport cython + +from khmer._oxli.utils cimport _bstring +from khmer._oxli.graphs cimport Hashtable + +cdef class Alphabets: + + @staticmethod + def get(name): + cdef string alphabet = Alphabets._get(name) + return alphabet + + @staticmethod + cdef string _get(str name) except *: + if name == 'DNA_SIMPLE': + return DNA_SIMPLE + elif name == 'DNAN_SIMPLE': + return DNAN_SIMPLE + elif name == 'RNA_SIMPLE': + return RNA_SIMPLE + elif name == 'RNAN_SIMPLE': + return RNAN_SIMPLE + elif name == 'IUPAC_NUCL': + return IUPAC_NUCL + elif name == 'IUPAC_AA': + return IUPAC_AA + else: + raise ValueError('No alphabet with name {0}'.format(name)) + + +@cython.freelist(100) +cdef class Sequence: + + def __cinit__(self, name=None, sequence=None, + quality=None, description=None, + cleaned_seq=None): + + if name is not None and sequence is not None: + self._obj.sequence = _bstring(sequence) + self._obj.name = _bstring(name) + if description is not None: + self._obj.description = _bstring(description) + if quality is not None: + self._obj.quality = _bstring(quality) + if cleaned_seq is not None: + self._obj.cleaned_seq = _bstring(cleaned_seq) + else: + self._obj.cleaned_seq = self._obj.sequence + + def __str__(self): + return self.cleaned_seq if self._obj.cleaned_seq.length() > 0 else self.sequence + + def __repr__(self): + return 'Sequence(name="{0}", sequence="{1}")'.format(self.name, self.sequence) + + def __len__(self): + return self._obj.sequence.length() + + def __richcmp__(x, y, op): + if op == 2: + return x.name == y.name and x.sequence == y.sequence + else: + raise NotImplementedError('Operator not available') + + def kmers(self, int K): + cdef int i = 0 + cdef unicode sequence = self.sequence + for i in range(0, len(self)-K+1): + yield sequence[i:i+K] + + def __getitem__(self, x): + # Definitely optimize this. 
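+        # (One possible approach, untested: slice the underlying C++
+        # string via self._obj.sequence.substr(start, length) rather than
+        # building the full Python unicode object first; negative indices
+        # and slice objects would need explicit handling.)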
+        return self.sequence[x]
+
+    def trim(self, int trim_at):
+        self._obj.sequence.resize(trim_at)
+        self._obj.cleaned_seq.resize(trim_at)
+        if self._obj.quality.length() != 0:
+            self._obj.quality.resize(trim_at)
+
+    def clean(self):
+        '''Calls set_clean_seq() on the underlying container.'''
+        self._obj.set_clean_seq()
+
+    @property
+    def name(self):
+        cdef unicode name = self._obj.name
+        return name if name else None
+
+    @property
+    def sequence(self):
+        cdef unicode sequence = self._obj.sequence
+        return sequence if sequence else None
+
+    @property
+    def description(self):
+        cdef unicode description = self._obj.description
+        return description if description else None
+
+    @property
+    def quality(self):
+        cdef unicode quality = self._obj.quality
+        return quality if quality else None
+
+    @property
+    def cleaned_seq(self):
+        cdef unicode cleaned_seq = self._obj.cleaned_seq
+        return cleaned_seq if cleaned_seq else None
+
+    @staticmethod
+    def from_screed_record(record):
+        cdef Sequence seq = Sequence(name=record.name,
+                                     sequence=record.sequence)
+        if hasattr(record, 'quality'):
+            seq._obj.quality = _bstring(record.quality)
+
+        for attr in ('annotations', 'description'):
+            if hasattr(record, attr):
+                seq._obj.description = _bstring(getattr(record, attr))
+
+        return seq
+
+    @staticmethod
+    cdef Sequence _wrap(CpSequence cseq):
+        cdef Sequence seq = Sequence()
+        seq._obj = cseq
+        return seq
+
+
+cdef string _object_to_string(object sequence) except *:
+    if isinstance(sequence, bytes):
+        return sequence
+    elif isinstance(sequence, Sequence):
+        return (<Sequence>sequence)._obj.cleaned_seq
+    else:
+        return _bstring(sequence)
+
+
+cdef class ReadBundle:
+
+    def __cinit__(self, *raw_records):
+        self.reads = [r for r in raw_records if r]
+
+    @property
+    def num_reads(self):
+        return len(self.reads)
+
+    @property
+    def total_length(self):
+        return sum([len(r.sequence) for r in self.reads])
+
+
+cdef bool is_valid(const char base, string& alphabet):
+    cdef char b
+    for b in alphabet:
+        if b == base:
+            return True
+    return False
+
+
+cdef bool sanitize_sequence(string& sequence,
+                            string& alphabet,
+                            bool convert_n):
+    cdef int i = 0
+    for i in range(sequence.length()):
+        sequence[i] &= 0xdf  # clear ASCII bit 5: uppercases a-z in place
+        if not is_valid(sequence[i], alphabet):
+            return False
+        if convert_n and sequence[i] == b'N':
+            sequence[i] = b'A'
+    return True
+
+
+def trim_sequence(Hashtable graph, Sequence record, int cutoff,
+                  variable_coverage=False, normalize_to=None):
+    if variable_coverage:
+        if not graph.median_at_least(record.cleaned_seq, normalize_to):
+            return record, False
+
+    trim_at = graph._trim_on_abundance(record, cutoff)
+
+    if trim_at < graph.ksize():
+        return None, True
+
+    if trim_at == len(record):
+        return record, False
+
+    record.trim(trim_at)
+    return record, True
diff --git a/khmer/_oxli/traversal.pxd b/khmer/_oxli/traversal.pxd
index ebafd7a609..6dc4651544 100644
--- a/khmer/_oxli/traversal.pxd
+++ b/khmer/_oxli/traversal.pxd
@@ -1,3 +1,4 @@
+from libcpp.memory cimport unique_ptr
 from libc.stdint cimport uint32_t
 from libcpp.memory cimport shared_ptr
 from libcpp cimport bool
diff --git a/khmer/_oxli/traversal.pyx b/khmer/_oxli/traversal.pyx
index e367cfa315..3e3ee8501f 100644
--- a/khmer/_oxli/traversal.pyx
+++ b/khmer/_oxli/traversal.pyx
@@ -26,7 +26,7 @@ cdef class Traverser:
             cpkmer = deref(kmers).front()
             kmer = Kmer.wrap(new CpKmer(cpkmer), deref(self._graph_ptr).ksize())
             result.append(kmer)
-            deref(kmers).pop()
+            deref(kmers).pop_back()
        return result

    cdef list _kmerqueue_to_hash_list(self, KmerQueue * kmers):
@@ -35,7
+35,7 @@ cdef class Traverser: while(deref(kmers).empty() == 0): cpkmer = deref(kmers).front() result.append(cpkmer.kmer_u) - deref(kmers).pop() + deref(kmers).pop_back() return result cdef list _neighbors(self, CpKmer start, int direction=0): diff --git a/khmer/_oxli/utils.pxd b/khmer/_oxli/utils.pxd index ae487c38cd..927f300c1f 100644 --- a/khmer/_oxli/utils.pxd +++ b/khmer/_oxli/utils.pxd @@ -1,10 +1,11 @@ # -*- coding: UTF-8 -*- +from libcpp.string cimport string from libcpp.vector cimport vector from libc.stdint cimport uint32_t, uint64_t from libcpp cimport bool -cdef extern from "oxli_exception_convert.hh": +cdef extern from "oxli/oxli_exception_convert.hh": cdef void oxli_raise_py_error() @@ -12,6 +13,20 @@ cdef extern from "oxli/hashtable.hh" namespace "oxli": cdef bool _is_prime "oxli::is_prime" (uint64_t n) cdef vector[uint64_t] _get_n_primes_near_x "oxli::get_n_primes_near_x" (uint32_t, uint64_t) +cdef extern from "oxli/oxli.hh": + cdef string _get_version_cpp "oxli::get_version_cpp" () + cdef const char * SAVED_SIGNATURE + cdef int SAVED_FORMAT_VERSION + cdef int SAVED_COUNTING_HT + cdef int SAVED_HASHBITS + cdef int SAVED_TAGS + cdef int SAVED_STOPTAGS + cdef int SAVED_SUBSET + cdef int SAVED_LABELSET + cdef int SAVED_SMALLCOUNT + cdef int SAVED_QFCOUNT + + cdef bytes _bstring(s) cdef unicode _ustring(s) @@ -21,3 +36,5 @@ cpdef bool is_num(object n) cdef void _flatten_fill(double * fill_to, object fill_from) cdef void _fill(double * fill_to, object fill_from) + +cpdef str get_version_cpp() diff --git a/khmer/_oxli/utils.pyx b/khmer/_oxli/utils.pyx index 3fcb553df3..f44e6e4ad1 100644 --- a/khmer/_oxli/utils.pyx +++ b/khmer/_oxli/utils.pyx @@ -5,6 +5,18 @@ from cpython.version cimport PY_MAJOR_VERSION from cython import short, int, long +FILETYPES = \ +{ + "COUNTING_HT": SAVED_COUNTING_HT, + "HASHBITS": SAVED_HASHBITS, + "TAGS": SAVED_TAGS, + "STOPTAGS": SAVED_STOPTAGS, + "SUBSET": SAVED_SUBSET, + "LABELSET": SAVED_LABELSET, + "SMALLCOUNT": SAVED_SMALLCOUNT +} + + def is_prime(n): return _is_prime(n) @@ -43,16 +55,24 @@ cdef unicode _ustring(s): cpdef bool is_str(object s): return isinstance(s, (basestring, bytes)) + cpdef bool is_num(object n): return isinstance(n, (int, long)) + cdef void _flatten_fill(double * fill_to, object fill_from): '''UNSAFE fill from multilevel python iterable to C array.''' cdef list flattened = [x for sublist in fill_from for x in sublist] for idx, item in enumerate(flattened): fill_to[idx] = item + cdef void _fill(double * fill_to, object fill_from): '''UNSAFE fill from flat python iterable to C array.''' for idx, item in enumerate(fill_from): fill_to[idx] = item + + +cpdef str get_version_cpp(): + return _get_version_cpp() + diff --git a/khmer/kfile.py b/khmer/kfile.py index a3a8170627..82ca9ed3a3 100755 --- a/khmer/kfile.py +++ b/khmer/kfile.py @@ -34,7 +34,6 @@ # Contact: khmer-project@idyll.org """File handling/checking utilities for command-line scripts.""" - import os import sys import errno diff --git a/khmer/khmer_args.py b/khmer/khmer_args.py index 8f233e4efa..89c01f4e3e 100755 --- a/khmer/khmer_args.py +++ b/khmer/khmer_args.py @@ -35,24 +35,19 @@ # Contact: khmer-project@idyll.org """Common argparse constructs.""" - import sys import argparse import math import textwrap from argparse import _VersionAction from collections import namedtuple -try: - from StringIO import StringIO -except ImportError: - from io import StringIO +from io import StringIO import screed import khmer -from khmer import extract_countgraph_info -from khmer import 
__version__
-from .utils import print_error
-from .khmer_logger import log_info, log_warn, configure_logging
+from khmer import __version__, Countgraph
+from khmer.utils import print_error, PAIRING_MODES
+from khmer.khmer_logger import log_info, log_warn, configure_logging
 
 DEFAULT_K = 32
@@ -260,7 +255,7 @@ def check_conflicting_args(args, hashtype):
 
     infoset = None
     if hashtype in ('countgraph', 'smallcountgraph'):
-        infoset = extract_countgraph_info(args.loadgraph)
+        infoset = Countgraph.extract_info(args.loadgraph)
     if infoset is not None:
         ksize = infoset.ksize
         max_tablesize = infoset.table_size
@@ -494,6 +489,20 @@ def add_loadgraph_args(parser):
                         help='load a precomputed k-mer graph from disk')
 
 
+def add_pairing_args(parser):
+    """Common pairing mode argument."""
+    parser.add_argument('--pairing-mode', default='interleaved',
+                        choices=PAIRING_MODES,
+                        help='How to interpret read pairing. With `single`, '\
+                             'reads will be parsed as singletons, regardless'\
+                             ' of pairing or file order. With `interleaved`,'\
+                             ' each file will be assumed to be interleaved '\
+                             'and paired, with singletons allowed to be mixed'\
+                             ' in. With `split`, each successive group of'\
+                             ' two files in the input list will be treated'\
+                             ' as a (LEFT, RIGHT) pair.')
+
+
 def calculate_graphsize(args, graphtype, multiplier=1.0):
     """
     Transform the table parameters into a size.
diff --git a/khmer/thread_utils.py b/khmer/thread_utils.py
index e5a1fd3068..25e49f9678 100755
--- a/khmer/thread_utils.py
+++ b/khmer/thread_utils.py
@@ -35,7 +35,6 @@
 # pylint: disable=missing-docstring,too-few-public-methods
 """Utilities for dealing with multithreaded processing of short reads."""
-
 import threading
 import sys
 import screed
diff --git a/khmer/utils.py b/khmer/utils.py
index f39689fb39..e28cc1ac01 100755
--- a/khmer/utils.py
+++ b/khmer/utils.py
@@ -34,10 +34,19 @@
 # Contact: khmer-project@idyll.org
 """Helpful methods for performing common argument-checking tasks in scripts."""
 from khmer._oxli.parsing import (check_is_left, check_is_right, check_is_pair,
-                                 UnpairedReadsError, _split_left_right)
+                                 UnpairedReadsError, _split_left_right,
+                                 FastxParser, SplitPairedReader,
+                                 BrokenPairedReader)
 
 import itertools
 
+PAIRING_MODES = ('split', 'interleaved', 'single')
+
+def grouper(n, iterable):
+    iterable = iter(iterable)
+    return iter(lambda: list(itertools.islice(iterable, n)), [])
+
+
 def print_error(msg):
     """Print the given message to 'stderr'."""
     import sys
@@ -45,76 +54,42 @@ def print_error(msg):
     print(msg, file=sys.stderr)
 
 
-def broken_paired_reader(screed_iter, min_length=None,
-                         force_single=False, require_paired=False):
-    """Read pairs from a stream.
-
-    A generator that yields singletons and pairs from a stream of FASTA/FASTQ
-    records (yielded by 'screed_iter'). Yields (n, is_pair, r1, r2) where
-    'r2' is None if is_pair is False.
-
-    The input stream can be fully single-ended reads, interleaved paired-end
-    reads, or paired-end reads with orphans, a.k.a. "broken paired".
+def paired_fastx_handler(samples, pairing_mode, min_length=-1,
+                         force_name_match=False, yield_filenames=False,
+                         **kwargs):
-
-    Usage::
-
-        for n, is_pair, read1, read2 in broken_paired_reader(...):
-            ...
-
-    Note that 'n' behaves like enumerate() and starts at 0, but tracks
-    the number of records read from the input stream, so is
-    incremented by 2 for a pair of reads.
-
-    If 'min_length' is set, all reads under this length are ignored (even
-    if they are pairs).
-
-    If 'force_single' is True, all reads are returned as singletons.
- """ - record = None - prev_record = None - num = 0 - - if force_single and require_paired: - raise ValueError("force_single and require_paired cannot both be set!") - - # handle the majority of the stream. - for record in screed_iter: - if prev_record: - if check_is_pair(prev_record, record) and not force_single: - if min_length and (len(prev_record.sequence) < min_length or - len(record.sequence) < min_length): - if require_paired: - record = None - else: - yield num, True, prev_record, record # it's a pair! - num += 2 - record = None - else: # orphan. - if require_paired: - err = UnpairedReadsError( - "Unpaired reads when require_paired is set!", - prev_record, record) - raise err - - # ignore short reads - if min_length and len(prev_record.sequence) < min_length: - pass - else: - yield num, False, prev_record, None - num += 1 - - prev_record = record - record = None - - # handle the last record, if it exists (i.e. last two records not a pair) - if prev_record: - if require_paired: - raise UnpairedReadsError("Unpaired reads when require_paired " - "is set!", prev_record, None) - if min_length and len(prev_record.sequence) < min_length: - pass + if pairing_mode not in PAIRING_MODES: + raise ValueError('Pairing mode must be one of {0}'.format(PAIRING_MODES)) + + if pairing_mode == 'split': + _samples = grouper(2, samples) + else: + _samples = samples + + for group in _samples: + if pairing_mode == 'split': + reader = SplitPairedReader(FastxParser(group[0]), + FastxParser(group[1]), + min_length=min_length, + force_name_match=force_name_match) + elif pairing_mode == 'single': + reader = BrokenPairedReader(FastxParser(group), + force_single=True, + min_length=min_length, + require_paired=force_name_match) else: - yield num, False, prev_record, None + reader = BrokenPairedReader(FastxParser(group), + force_single=False, + min_length=min_length, + require_paired=force_name_match) + if yield_filenames: + if pairing_mode == 'split': + _filename = group[0] + '.pair' + else: + _filename = group + yield _filename, reader + else: + yield reader def write_record(record, fileobj): @@ -187,7 +162,6 @@ def num_reads(self): def total_length(self): return sum([len(r.sequence) for r in self.reads]) - def grouper(n, iterable): iterable = iter(iterable) return iter(lambda: list(itertools.islice(iterable, n)), []) diff --git a/oxli/functions.py b/oxli/functions.py index c79c475f83..de93da82b0 100755 --- a/oxli/functions.py +++ b/oxli/functions.py @@ -37,6 +37,7 @@ import threading import khmer.utils +from khmer._oxli.parsing import FastxParser def build_graph(ifilenames, graph, num_threads=1, tags=False): @@ -54,7 +55,7 @@ def build_graph(ifilenames, graph, num_threads=1, tags=False): eat = graph.consume_seqfile for _, ifile in enumerate(ifilenames): - rparser = khmer.ReadParser(ifile) + rparser = FastxParser(ifile) threads = [] for _ in range(num_threads): diff --git a/scripts/extract-paired-reads.py b/scripts/extract-paired-reads.py index 29d7cbe3cb..e12a7317b2 100755 --- a/scripts/extract-paired-reads.py +++ b/scripts/extract-paired-reads.py @@ -48,14 +48,14 @@ import os.path import textwrap -from khmer import ReadParser from khmer.kfile import check_input_files, check_space from khmer.khmer_args import sanitize_help, KhmerArgumentParser from khmer.khmer_args import FileType as khFileType from khmer.kfile import add_output_compression_type from khmer.kfile import get_file_writer -from khmer.utils import broken_paired_reader, write_record, write_record_pair +from khmer.utils import write_record, 
write_record_pair +from khmer._oxli.parsing import BrokenPairedReader, FastxParser def get_parser(): @@ -151,8 +151,8 @@ def main(): n_pe = 0 n_se = 0 - reads = ReadParser(infile) - for index, is_pair, read1, read2 in broken_paired_reader(reads): + reads = FastxParser(infile) + for index, is_pair, read1, read2 in BrokenPairedReader(reads): if index % 100000 == 0 and index > 0: print('...', index, file=sys.stderr) diff --git a/scripts/filter-abund-single.py b/scripts/filter-abund-single.py index 3edcef86ec..6c0674ac1b 100755 --- a/scripts/filter-abund-single.py +++ b/scripts/filter-abund-single.py @@ -51,8 +51,8 @@ import textwrap import khmer -from khmer import ReadParser -from khmer.utils import broken_paired_reader, write_record +from khmer.utils import BrokenPairedReader, FastxParser, write_record +from khmer._oxli.sequence import trim_sequence from khmer import khmer_args from khmer.khmer_args import (build_counting_args, report_on_config, add_threading_args, calculate_graphsize, @@ -63,7 +63,6 @@ get_file_writer) from khmer.khmer_logger import (configure_logging, log_info, log_error, log_warn) -from khmer.trimming import (trim_record) DEFAULT_NORMALIZE_LIMIT = 20 DEFAULT_CUTOFF = 2 @@ -163,7 +162,7 @@ def main(): outfp = open(outfile, 'wb') outfp = get_file_writer(outfp, args.gzip, args.bzip) - paired_iter = broken_paired_reader(ReadParser(args.datafile), + paired_iter = BrokenPairedReader(FastxParser(args.datafile), min_length=graph.ksize(), force_single=True) @@ -171,7 +170,7 @@ def main(): assert not is_pair assert read2 is None - trimmed_record, _ = trim_record(graph, read1, args.cutoff, + trimmed_record, _ = trim_sequence(graph, read1, args.cutoff, args.variable_coverage, args.normalize_to) if trimmed_record: diff --git a/scripts/filter-abund.py b/scripts/filter-abund.py index cb729c9b77..fd2a5c3d82 100755 --- a/scripts/filter-abund.py +++ b/scripts/filter-abund.py @@ -50,16 +50,17 @@ import khmer from khmer import __version__ -from khmer import ReadParser, Countgraph -from khmer.utils import (broken_paired_reader, write_record) +from khmer import Countgraph +from khmer.utils import (paired_fastx_handler, write_record) from khmer.khmer_args import (add_threading_args, KhmerArgumentParser, - sanitize_help, check_argument_range) + sanitize_help, check_argument_range, + add_pairing_args) from khmer.khmer_args import FileType as khFileType from khmer.kfile import (check_input_files, check_space, add_output_compression_type, get_file_writer) from khmer.khmer_logger import (configure_logging, log_info, log_error, log_warn) -from khmer.trimming import (trim_record) +from khmer._oxli.sequence import trim_sequence DEFAULT_NORMALIZE_LIMIT = 20 DEFAULT_CUTOFF = 2 @@ -109,6 +110,7 @@ def get_parser(): parser.add_argument('-q', '--quiet', dest='quiet', default=False, action='store_true') add_output_compression_type(parser) + add_pairing_args(parser) return parser @@ -140,22 +142,21 @@ def main(): outfp = get_file_writer(args.single_output_file, args.gzip, args.bzip) # the filtering loop - for infile in infiles: + for infile, reader in paired_fastx_handler(infiles, + 'single', + min_length=ksize, + yield_filenames=True): log_info('filtering {infile}', infile=infile) if not args.single_output_file: outfile = os.path.basename(infile) + '.abundfilt' outfp = open(outfile, 'wb') outfp = get_file_writer(outfp, args.gzip, args.bzip) - paired_iter = broken_paired_reader(ReadParser(infile), - min_length=ksize, - force_single=True) - - for n, is_pair, read1, read2 in paired_iter: + for n, is_pair, 
read1, read2 in reader: assert not is_pair assert read2 is None - trimmed_record, _ = trim_record(countgraph, read1, args.cutoff, + trimmed_record, _ = trim_sequence(countgraph, read1, args.cutoff, args.variable_coverage, args.normalize_to) if trimmed_record: diff --git a/scripts/load-into-counting.py b/scripts/load-into-counting.py index 963a4dc030..562c449e10 100755 --- a/scripts/load-into-counting.py +++ b/scripts/load-into-counting.py @@ -57,6 +57,7 @@ from khmer.kfile import check_space_for_graph from khmer.khmer_logger import (configure_logging, log_info, log_error, log_warn) +from khmer._oxli.parsing import FastxParser def get_parser(): @@ -142,7 +143,7 @@ def main(): for index, filename in enumerate(filenames): - rparser = khmer.ReadParser(filename) + rparser = FastxParser(filename) threads = [] log_info('consuming input {input}', input=filename) for _ in range(args.threads): diff --git a/scripts/normalize-by-median.py b/scripts/normalize-by-median.py index 39e387663e..43815b6b46 100755 --- a/scripts/normalize-by-median.py +++ b/scripts/normalize-by-median.py @@ -47,7 +47,6 @@ """ import sys -import screed import os import khmer import textwrap @@ -55,14 +54,15 @@ from contextlib import contextmanager from khmer.khmer_args import (build_counting_args, add_loadgraph_args, report_on_config, calculate_graphsize, - sanitize_help, check_argument_range) + sanitize_help, check_argument_range, + add_pairing_args) from khmer.khmer_args import FileType as khFileType import argparse from khmer.kfile import (check_space, check_space_for_graph, check_valid_file_exists, add_output_compression_type, get_file_writer, describe_file_handle) -from khmer.utils import (write_record, broken_paired_reader, ReadBundle, - clean_input_reads) +from khmer.utils import write_record, paired_fastx_handler, ReadBundle +from khmer._oxli.parsing import FastxParser, BrokenPairedReader from khmer.khmer_logger import (configure_logging, log_info, log_error) @@ -182,6 +182,7 @@ def __call__(self, is_paired, read0, read1): @contextmanager def catch_io_errors(ifile, out, single_out, force, corrupt_files): """Context manager to do boilerplate handling of IOErrors.""" + import traceback try: yield except (IOError, OSError, ValueError) as error: @@ -196,6 +197,9 @@ def catch_io_errors(ifile, out, single_out, force, corrupt_files): else: log_error('*** Skipping error file, moving on...') corrupt_files.append(ifile) + except RuntimeError as error: + log_error('** ERROR: {error}', error=str(error)) + log_error('*** Skipping empty file, moving on...') def get_parser(): @@ -380,8 +384,8 @@ def main(): # pylint: disable=too-many-branches,too-many-statements # failsafe context manager in case an input file breaks with catch_io_errors(filename, outfp, args.single_output_file, args.force, corrupt_files): - screed_iter = clean_input_reads(screed.open(filename)) - reader = broken_paired_reader(screed_iter, min_length=args.ksize, + parser = FastxParser(filename) + reader = BrokenPairedReader(parser, min_length=args.ksize, force_single=force_single, require_paired=require_paired) diff --git a/scripts/partition-streaming.py b/scripts/partition-streaming.py new file mode 100755 index 0000000000..eb08626979 --- /dev/null +++ b/scripts/partition-streaming.py @@ -0,0 +1,5 @@ +#!/usr/bin/env python +from khmer._oxli.app import PartitioningApp + +if __name__ == '__main__': + PartitioningApp().run() diff --git a/scripts/sample-reads-randomly.py b/scripts/sample-reads-randomly.py index 79b02d764e..5c9c0a15af 100755 --- 
a/scripts/sample-reads-randomly.py
+++ b/scripts/sample-reads-randomly.py
@@ -53,11 +53,11 @@
 import sys
 
 from khmer import __version__
-from khmer import ReadParser
 from khmer.kfile import (check_input_files, add_output_compression_type,
                          get_file_writer)
-from khmer.khmer_args import sanitize_help, KhmerArgumentParser
-from khmer.utils import write_record, broken_paired_reader
+from khmer.khmer_args import (sanitize_help, KhmerArgumentParser,
+                              add_pairing_args)
+from khmer.utils import write_record, paired_fastx_handler
 
 DEFAULT_NUM_READS = int(1e5)
 DEFAULT_MAX_READS = int(1e8)
@@ -94,14 +94,13 @@ def get_parser():
                         default=1)
     parser.add_argument('-R', '--random-seed', type=int, dest='random_seed',
                         help='Provide a random seed for the generator')
-    parser.add_argument('--force_single', default=False, action='store_true',
-                        help='Ignore read pair information if present')
     parser.add_argument('-o', '--output', dest='output_file',
                         type=argparse.FileType('wb'),
                         metavar="filename", default=None)
     parser.add_argument('-f', '--force', default=False, action='store_true',
                         help='Overwrite output file if it exists')
     add_output_compression_type(parser)
+    add_pairing_args(parser)
     return parser
@@ -168,11 +167,12 @@ def main():
         reads.append([])
 
     # read through all the sequences and load/resample the reservoir
-    for filename in args.filenames:
+    for filename, reader in paired_fastx_handler(args.filenames,
+                                                 args.pairing_mode,
+                                                 yield_filenames=True):
         print('opening', filename, 'for reading', file=sys.stderr)
-        for count, (_, _, rcrd1, rcrd2) in enumerate(broken_paired_reader(
-                ReadParser(filename), force_single=args.force_single)):
+        for count, (_, _, rcrd1, rcrd2) in enumerate(reader):
             if count % 10000 == 0:
                 print('...', count, 'reads scanned', file=sys.stderr)
             if count >= args.max_reads:
diff --git a/scripts/split-paired-reads.py b/scripts/split-paired-reads.py
index 5750100312..29f68b22d7 100755
--- a/scripts/split-paired-reads.py
+++ b/scripts/split-paired-reads.py
@@ -49,10 +49,9 @@
 import textwrap
 
 from khmer import __version__
-from khmer import ReadParser
 from khmer.khmer_args import sanitize_help, KhmerArgumentParser
 from khmer.khmer_args import FileType as khFileType
-from khmer.utils import (write_record, broken_paired_reader,
+from khmer.utils import (write_record, BrokenPairedReader, FastxParser,
                          UnpairedReadsError)
 from khmer.kfile import (check_input_files, check_space,
                          add_output_compression_type,
@@ -168,8 +167,8 @@ def main():
         index = None
 
     # walk through all the reads in broken-paired mode.
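+    # BrokenPairedReader keeps the old broken_paired_reader() generator
+    # protocol: it yields (n, is_pair, read1, read2) tuples, with read2
+    # set to None for orphaned reads.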
- paired_iter = broken_paired_reader(ReadParser(infile), - require_paired=not args.output_orphaned) + paired_iter = BrokenPairedReader(FastxParser(infile), + require_paired=not args.output_orphaned) try: for index, is_pair, record1, record2 in paired_iter: diff --git a/scripts/trim-low-abund.py b/scripts/trim-low-abund.py index 1f1177227d..1e0ba88ab9 100755 --- a/scripts/trim-low-abund.py +++ b/scripts/trim-low-abund.py @@ -56,16 +56,18 @@ from khmer import khmer_args from khmer import Countgraph, SmallCountgraph, ReadParser +from khmer._oxli.parsing import BrokenPairedReader, FastxParser +from khmer._oxli.sequence import trim_sequence + from khmer.khmer_args import (build_counting_args, add_loadgraph_args, report_on_config, calculate_graphsize, - sanitize_help) + sanitize_help, add_pairing_args) from khmer.khmer_args import FileType as khFileType -from khmer.utils import write_record, broken_paired_reader, ReadBundle +from khmer.utils import write_record, paired_fastx_handler, ReadBundle from khmer.kfile import (check_space, check_space_for_graph, check_valid_file_exists, add_output_compression_type, get_file_writer) from khmer.khmer_logger import configure_logging, log_info, log_error -from khmer.trimming import trim_record DEFAULT_TRIM_AT_COVERAGE = 20 DEFAULT_CUTOFF = 2 @@ -139,8 +141,6 @@ def get_parser(): # expert options parser.add_argument('--force', default=False, action='store_true') - parser.add_argument('--ignore-pairs', default=False, action='store_true', - help='treat all reads as if they were singletons') parser.add_argument('-T', '--tempdir', type=str, default='./', help="Set location of temporary directory for " "second pass") @@ -155,7 +155,7 @@ def get_parser(): parser.add_argument('--single-pass', default=False, action='store_true', help="Do not do a second pass across the low coverage " "data") - + add_pairing_args(parser) return parser @@ -225,7 +225,7 @@ def pass1(self, reader, saver): # trim? if min_coverage >= TRIM_AT_COVERAGE: for read in bundle.reads: - record, did_trim = trim_record(graph, read, CUTOFF) + record, did_trim = trim_sequence(graph, read, CUTOFF) if did_trim: self.trimmed_reads += 1 if record: @@ -262,7 +262,7 @@ def pass2(self, reader): bundle.coverages_at_least(graph, TRIM_AT_COVERAGE): for read in bundle.reads: - trimmed_record, did_trim = trim_record(graph, read, CUTOFF) + trimmed_record, did_trim = trim_sequence(graph, read, CUTOFF) if did_trim: self.trimmed_reads += 1 @@ -377,7 +377,10 @@ def main(): trimfp = get_file_writer(args.output, args.gzip, args.bzip) pass2list = [] - for filename in args.input_filenames: + for filename, reader in paired_fastx_handler(args.input_filenames, + args.pairing_mode, + min_length=K, + yield_filenames=True): # figure out temporary filename for 2nd pass pass2filename = os.path.basename(filename) + '.pass2' pass2filename = os.path.join(tempdir, pass2filename) @@ -394,16 +397,12 @@ def main(): # record all this info pass2list.append((filename, pass2filename, trimfp)) - # input file stuff: get a broken_paired reader. - paired_iter = broken_paired_reader(ReadParser(filename), min_length=K, - force_single=args.ignore_pairs) - # main loop through the file. n_start = trimmer.n_reads save_start = trimmer.n_saved watermark = REPORT_EVERY_N_READS - for read in trimmer.pass1(paired_iter, pass2fp): + for read in trimmer.pass1(reader, pass2fp): if (trimmer.n_reads - n_start) > watermark: log_info("... 
{filename} {n_saved} {n_reads} {n_bp} " "{w_reads} {w_bp}", filename=filename, @@ -449,10 +448,9 @@ def main(): # so pairs will stay together if not orphaned. This is in contrast # to the first loop. Hence, force_single=True below. - read_parser = ReadParser(pass2filename) - paired_iter = broken_paired_reader(read_parser, - min_length=K, - force_single=True) + paired_iter = BrokenPairedReader(FastxParser(pass2filename), + force_single=True, + min_length=K) watermark = REPORT_EVERY_N_READS for read in trimmer.pass2(paired_iter): @@ -468,8 +466,6 @@ def main(): written_reads += 1 written_bp += len(read) - read_parser.close() - log_info('removing {pass2}', pass2=pass2filename) os.unlink(pass2filename) diff --git a/setup.py b/setup.py index d4049347d3..820e3a5b8c 100755 --- a/setup.py +++ b/setup.py @@ -158,29 +158,31 @@ def build_dir(): ZLIBDIR = 'third-party/zlib' BZIP2DIR = 'third-party/bzip2' + BUILD_DEPENDS = [path_join("include", "khmer", bn + ".hh") for bn in [ "_cpy_khmer", "_cpy_utils", "_cpy_readparsers" ]] BUILD_DEPENDS.extend(path_join("include", "oxli", bn + ".hh") for bn in [ "khmer", "kmer_hash", "hashtable", "labelhash", "hashgraph", "hllcounter", "oxli_exception", "read_aligner", "subset", "read_parsers", - "kmer_filters", "traversal", "assembler", "alphabets", "storage"]) + "kmer_filters", "traversal", "assembler", "alphabets", "storage", + "partitioning", "gmap", "hist", "cdbg"]) SOURCES = [path_join("src", "khmer", bn + ".cc") for bn in [ "_cpy_khmer", "_cpy_utils", "_cpy_readparsers" ]] SOURCES.extend(path_join("src", "oxli", bn + ".cc") for bn in [ "read_parsers", "kmer_hash", "hashtable", "hashgraph", - "labelhash", "subset", "read_aligner", + "labelhash", "subset", "read_aligner", "oxli", "hllcounter", "traversal", "kmer_filters", "assembler", "alphabets", - "storage"]) + "storage", "partitioning", "cdbg"]) SOURCES.extend(path_join("third-party", "smhasher", bn + ".cc") for bn in [ "MurmurHash3"]) # Don't forget to update lib/Makefile with these flags! 
EXTRA_COMPILE_ARGS = ['-O3', '-std=c++11', '-pedantic', - '-fno-omit-frame-pointer'] + '-fno-omit-frame-pointer', '-fdiagnostics-color'] EXTRA_LINK_ARGS = ['-fno-omit-frame-pointer'] if sys.platform == 'darwin': @@ -218,7 +220,7 @@ def build_dir(): CY_EXTENSION_MOD_DICT = \ { - "sources": [cython_ext, "khmer/_oxli/oxli_exception_convert.cc"], + "sources": [cython_ext, "src/oxli/oxli_exception_convert.cc"], "extra_compile_args": EXTRA_COMPILE_ARGS, "extra_link_args": EXTRA_LINK_ARGS, "extra_objects": [path_join(build_dir(), splitext(p)[0] + '.o') @@ -287,7 +289,7 @@ def build_dir(): # additional-meta-data note #3 "url": 'https://khmer.readthedocs.io/', "packages": ['khmer', 'khmer.tests', 'oxli', 'khmer._oxli'], - "package_data": {'khmer/_oxli': ['*.pxd']}, + "package_data": {'khmer/_oxli': ['*.pxd', 'oxli_exception_convert.hh']}, "package_dir": {'khmer.tests': 'tests'}, "install_requires": ['screed >= 1.0', 'bz2file', 'Cython>=0.25.2'], "setup_requires": ["pytest-runner>=2.0,<3dev", "setuptools>=18.0", diff --git a/src/khmer/_cpy_khmer.cc b/src/khmer/_cpy_khmer.cc index d1a70a0e21..736e19e439 100644 --- a/src/khmer/_cpy_khmer.cc +++ b/src/khmer/_cpy_khmer.cc @@ -59,193 +59,19 @@ extern "C" { } namespace khmer { - -PyObject * forward_hash(PyObject * self, PyObject * args) -{ - const char * kmer; - WordLength ksize; - - if (!PyArg_ParseTuple(args, "sb", &kmer, &ksize)) { - return NULL; - } - - if (ksize > KSIZE_MAX) { - PyErr_Format(PyExc_ValueError, "k-mer size must be <= %u", KSIZE_MAX); - return NULL; - } - - if (strlen(kmer) != ksize) { - PyErr_Format(PyExc_ValueError, "k-mer size different from ksize"); - return NULL; - } - - try { - PyObject * hash = nullptr; - const HashIntoType h(_hash(kmer, ksize)); - convert_HashIntoType_to_PyObject(h, &hash); - return hash; - } catch (oxli_exception &e) { - PyErr_SetString(PyExc_ValueError, e.what()); - return NULL; - } -} - -PyObject * forward_hash_no_rc(PyObject * self, PyObject * args) -{ - const char * kmer; - WordLength ksize; - - if (!PyArg_ParseTuple(args, "sb", &kmer, &ksize)) { - return NULL; - } - - if (ksize > KSIZE_MAX) { - PyErr_Format(PyExc_ValueError, "k-mer size must be <= %u", KSIZE_MAX); - return NULL; - } - - if (strlen(kmer) != ksize) { - PyErr_SetString(PyExc_ValueError, - "k-mer length must equal the k-size"); - return NULL; - } - - PyObject * hash = nullptr; - const HashIntoType h(_hash_forward(kmer, ksize)); - convert_HashIntoType_to_PyObject(h, &hash); - return hash; -} - -PyObject * reverse_hash(PyObject * self, PyObject * args) -{ - PyObject * val; - HashIntoType hash; - WordLength ksize; - - if (!PyArg_ParseTuple(args, "Ob", &val, &ksize)) { - return NULL; - } - - if (PyLong_Check(val) || PyInt_Check(val)) { - if (!convert_PyLong_to_HashIntoType(val, hash)) { - return NULL; - } - } else { - PyErr_SetString(PyExc_TypeError, - "Hash value must be an integer."); - return NULL; - } - - if (ksize > KSIZE_MAX) { - PyErr_Format(PyExc_ValueError, "k-mer size must be <= %u", KSIZE_MAX); - return NULL; - } - - return PyUnicode_FromString(_revhash(hash, ksize).c_str()); -} - -PyObject * murmur3_forward_hash(PyObject * self, PyObject * args) -{ - const char * kmer; - - if (!PyArg_ParseTuple(args, "s", &kmer)) { - return NULL; - } - - PyObject * hash = nullptr; - const HashIntoType h(_hash_murmur(kmer, strlen(kmer))); - convert_HashIntoType_to_PyObject(h, &hash); - return hash; -} - -PyObject * murmur3_forward_hash_no_rc(PyObject * self, PyObject * args) -{ - const char * kmer; - - if (!PyArg_ParseTuple(args, "s", &kmer)) { - return 
NULL; - } - - PyObject * hash = nullptr; - const HashIntoType h(_hash_murmur_forward(kmer, strlen(kmer))); - convert_HashIntoType_to_PyObject(h, &hash); - return hash; -} - -PyObject * reverse_complement(PyObject * self, PyObject * args) -{ - const char * sequence; - if (!PyArg_ParseTuple(args, "s", &sequence)) { - return NULL; - } - - std::string s(sequence); - try { - s = _revcomp(s); - } catch (oxli_exception &e) { - PyErr_SetString(PyExc_RuntimeError, e.what()); - return NULL; - } - return PyUnicode_FromString(s.c_str()); -} - // // technique for resolving literal below found here: // https://gcc.gnu.org/onlinedocs/gcc-4.9.1/cpp/Stringification.html // -PyObject * -get_version_cpp( PyObject * self, PyObject * args ) -{ -#define xstr(s) str(s) -#define str(s) #s - std::string dVersion = xstr(VERSION); - return PyUnicode_FromString(dVersion.c_str()); -} PyMethodDef KhmerMethods[] = { - { - "forward_hash", forward_hash, - METH_VARARGS, "", - }, - { - "forward_hash_no_rc", forward_hash_no_rc, - METH_VARARGS, "", - }, - { - "reverse_hash", reverse_hash, - METH_VARARGS, "", - }, - { - "hash_murmur3", - murmur3_forward_hash, - METH_VARARGS, - "Calculate the hash value of a k-mer using MurmurHash3 " - "(with reverse complement)", - }, - { - "hash_no_rc_murmur3", - murmur3_forward_hash_no_rc, - METH_VARARGS, - "Calculate the hash value of a k-mer using MurmurHash3 " - "(no reverse complement)", - }, - { - "reverse_complement", - reverse_complement, - METH_VARARGS, - "Calculate the reverse-complement of the DNA sequence " - "with alphabet ACGT", - }, - { - "get_version_cpp", get_version_cpp, - METH_VARARGS, "return the VERSION c++ compiler option" - }, { NULL, NULL, 0, NULL } // sentinel }; } // namespace khmer + // // Module machinery. // @@ -280,17 +106,6 @@ MOD_INIT(_khmer) return MOD_ERROR_VAL; } - PyObject * filetype_dict = Py_BuildValue("{s,i,s,i,s,i,s,i,s,i,s,i,s,i}", - "COUNTING_HT", SAVED_COUNTING_HT, - "HASHBITS", SAVED_HASHBITS, - "TAGS", SAVED_TAGS, - "STOPTAGS", SAVED_STOPTAGS, - "SUBSET", SAVED_SUBSET, - "LABELSET", SAVED_LABELSET, - "SMALLCOUNT", SAVED_SMALLCOUNT); - if (PyModule_AddObject( m, "FILETYPES", filetype_dict ) < 0) { - return MOD_ERROR_VAL; - } Py_INCREF(&khmer_Read_Type); if (PyModule_AddObject( m, "Read", diff --git a/src/oxli/Makefile b/src/oxli/Makefile index 9858659a6a..b06ea96c6a 100644 --- a/src/oxli/Makefile +++ b/src/oxli/Makefile @@ -242,7 +242,9 @@ LIBOXLI_OBJS= \ assembler.o \ alphabets.o \ murmur3.o \ - storage.o + storage.o \ + partitioning.o \ + cdbg.o PRECOMILE_OBJS ?= PRECLEAN_TARGS ?= @@ -278,7 +280,12 @@ HEADERS= \ kmer_filters.hh \ assembler.hh \ alphabets.hh \ - storage.hh + storage.hh \ + partitioning.hh \ + gmap.hh \ + hist.hh \ + cdbg.hh + OXLI_HEADERS = $(addprefix ../../include/oxli/,$(HEADERS)) # START OF RULES # @@ -325,6 +332,9 @@ murmur3.o: ../../third-party/smhasher/MurmurHash3.cc %.o: %.cc $(PRECOMILE_OBJS) $(OXLI_HEADERS) $(CXX) $(CXXFLAGS) $(LDFLAGS) -c -o $@ $< +map_type_test: map_type_test.cc + $(CXX) -o $@ $< $(CXXFLAGS) -loxli -L. 
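+# (Usage sketch, assuming liboxli has already been built in this
+# directory:
+#     make map_type_test && LD_LIBRARY_PATH=. ./map_type_test
+# runs the GuardedHashMap vs. std::map/std::unordered_map lookup
+# benchmark defined in src/oxli/map_type_test.cc.)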
+
 $(LIBOXLISO): $(LIBOXLI_OBJS)
 	$(CXX) $(CXXFLAGS) $(LDFLAGS) $(SONAME_FLAGS) -shared -o $@ $^
 	ln -sf $(SONAME) liboxli.$(SHARED_EXT)
diff --git a/src/oxli/assembler.cc b/src/oxli/assembler.cc
index cf79c6deb5..0fe7676b52 100644
--- a/src/oxli/assembler.cc
+++ b/src/oxli/assembler.cc
@@ -49,10 +49,11 @@ namespace oxli
 * Simple Linear Assembly
 ********************************/

-LinearAssembler::LinearAssembler(const Hashgraph * ht) :
+LinearAssembler::LinearAssembler(const Hashgraph * ht,
+                                 std::shared_ptr global_visited) :
    graph(ht), _ksize(ht->ksize())
 {
-
+    this->global_visited = global_visited;
 }

 // Starting from the given seed k-mer, assemble the maximal linear path in
@@ -72,7 +73,13 @@ const
        node_filters.push_back(get_stop_bf_filter(stop_bf));
    }

-    std::shared_ptr visited = std::make_shared();
+    std::shared_ptr visited;
+    if (global_visited != nullptr) {
+        visited = global_visited;
+    } else {
+        visited = std::make_shared();
+    }
+
    AssemblerTraverser rcursor(graph, seed_kmer, node_filters, visited);
    AssemblerTraverser lcursor(graph, seed_kmer, node_filters, visited);

@@ -98,7 +105,17 @@ const
        node_filters.push_back(get_stop_bf_filter(stop_bf));
    }

-    AssemblerTraverser cursor(graph, seed_kmer, node_filters);
+    std::shared_ptr visited;
+    if (global_visited != nullptr) {
+        visited = global_visited;
+    } else {
+        visited = std::make_shared();
+    }
+
+    AssemblerTraverser cursor(graph,
+                              seed_kmer,
+                              node_filters,
+                              visited);
    return _assemble_directed(cursor);
 }

@@ -112,7 +129,17 @@ const
        node_filters.push_back(get_stop_bf_filter(stop_bf));
    }

-    AssemblerTraverser cursor(graph, seed_kmer, node_filters);
+    std::shared_ptr visited;
+    if (global_visited != nullptr) {
+        visited = global_visited;
+    } else {
+        visited = std::make_shared();
+    }
+
+    AssemblerTraverser cursor(graph,
+                              seed_kmer,
+                              node_filters,
+                              visited);
    return _assemble_directed(cursor);
 }

@@ -173,6 +200,81 @@ const
    return contig;
 }

+/********************************
+ * Compacting Assembler
+ ********************************/
+
+std::string CompactingAssembler::assemble(const Kmer seed_kmer,
+                                          const Hashgraph * stop_bf)
+const
+{
+    if (graph->get_count(seed_kmer) == 0) {
+        return "";
+    }
+
+    std::list node_filters;
+    if (stop_bf) {
+        node_filters.push_back(get_stop_bf_filter(stop_bf));
+    }
+
+    std::shared_ptr visited;
+    if (global_visited != nullptr) {
+        visited = global_visited;
+    } else {
+        visited = std::make_shared();
+    }
+
+    CompactingAT rcursor(graph, seed_kmer, node_filters, visited);
+    CompactingAT lcursor(graph, seed_kmer, node_filters, visited);
+
+    std::string right_contig = _assemble_directed(rcursor);
+    std::string left_contig = _assemble_directed(lcursor);
+
+    right_contig = right_contig.substr(_ksize);
+    return left_contig + right_contig;
+}
+
+std::string CompactingAssembler::assemble_right(const Kmer seed_kmer,
+                                                const Hashgraph * stop_bf)
+const
+{
+    std::list node_filters;
+    if (stop_bf) {
+        node_filters.push_back(get_stop_bf_filter(stop_bf));
+    }
+
+    std::shared_ptr visited;
+    if (global_visited != nullptr) {
+        visited = global_visited;
+    } else {
+        visited = std::make_shared();
+    }
+
+    CompactingAT cursor(graph, seed_kmer, node_filters, visited);
+    return LinearAssembler::_assemble_directed(cursor);
+}
+
+
+std::string CompactingAssembler::assemble_left(const Kmer seed_kmer,
+                                               const Hashgraph * stop_bf)
+const
+{
+    std::list node_filters;
+    if (stop_bf) {
+        node_filters.push_back(get_stop_bf_filter(stop_bf));
+    }
+
+    std::shared_ptr visited;
+    if (global_visited != nullptr) {
+        visited = global_visited;
+    } else {
visited = std::make_shared(); + } + + CompactingAT cursor(graph, seed_kmer, node_filters, visited); + return LinearAssembler::_assemble_directed(cursor); +} + /******************************** * Labeled Assembly @@ -309,7 +411,7 @@ const // spin off a cursor for the new branch AssemblerTraverser branch_cursor(cursor); branch_cursor.cursor = branch_starts.front(); - branch_starts.pop(); + branch_starts.pop_front(); #if DEBUG_ASSEMBLY std::cout << "Branch cursor: " << branch_cursor.cursor.repr( @@ -512,7 +614,7 @@ const AssemblerTraverser branch_cursor(cursor); branch_cursor.cursor = branch_starts.front(); - branch_starts.pop(); + branch_starts.pop_front(); // assemble linearly as far as possible std::string branch = linear_asm._assemble_directed(branch_cursor); diff --git a/src/oxli/cdbg.cc b/src/oxli/cdbg.cc new file mode 100644 index 0000000000..0dc1cd68c0 --- /dev/null +++ b/src/oxli/cdbg.cc @@ -0,0 +1,109 @@ + +#include +#include "oxli/cdbg.hh" + +using namespace oxli; + +namespace oxli { + +void CompactEdgeFactory::write_gml(const std::string filename, + const CompactNodeFactory& nodes) const { + + std::ofstream file; + file.open(filename); + pdebug("opened " << filename); + file << "graph" << std::endl << "[" << std::endl; + + pdebug("writing " << nodes.n_nodes() << " nodes"); + for (auto node : nodes.compact_nodes) { + file << " node [" << std::endl; + file << " id " << std::to_string(node.node_id) << std::endl; + file << " kmer \"" << node.sequence << "\"" << std::endl; + file << " count \"" << std::to_string(node.count) << "\"" << std::endl; + file << " ]" << std::endl; + } + + uint32_t edge_offset = INT_MAX / 2; + pdebug("writing " << compact_edges.size() << " edges"); + for (auto edge_pair : compact_edges) { + + id_t edge_id = edge_pair.first + edge_offset; + CompactEdge* edge = edge_pair.second; + + file << " edge [" << std::endl; + file << " id " << std::to_string(edge_id) << std::endl; + + id_t in_id, out_id; + bool in_null = false, out_null = false; + if (edge->in_node_id == NULL_ID) { + in_id = INT_MAX - edge_id; + in_null = true; + } else { + in_id = edge->in_node_id; + } + if(edge->out_node_id == NULL_ID) { + out_id = INT_MAX - edge_id; + out_null = true; + } else { + out_id = edge->out_node_id; + } + + if (in_null && out_null) { + std::cerr << "in and out nodes NULL_ID, something weird with " + << edge->edge_id << std::endl; + } + + file << " source " << std::to_string(in_id) << std::endl; + file << " target " << std::to_string(out_id) << std::endl; + file << " sequence \"" << edge->sequence << "\"" << std::endl; + file << " Length " << edge->sequence.length() << std::endl; + file << " meta \"" << edge_meta_repr(edge->meta) << "\"" << std::endl; + file << " ]" << std::endl; + + // dummy nodes for tips + /* + if (in_null) { + file << " node [" << std::endl; + file << " id " << std::to_string(in_id) << std::endl; + file << " label \"null_" << std::to_string(in_id) << "\"" << std::endl; + file << " ]" << std::endl; + } + + if (out_null) { + file << " node [" << std::endl; + file << " id " << std::to_string(out_id) << std::endl; + file << " label \"null_" << std::to_string(out_id) << "\"" << std::endl; + file << " ]" << std::endl; + } + */ + } + + file << "]"; + + file.close(); + pdebug("closed file"); +} + + +void CompactEdgeFactory::write_fasta(const std::string filename) const { + std::ofstream file; + file.open(filename); + pdebug("opened " << filename); + for (auto edge_pair : compact_edges) { + + id_t edge_id = edge_pair.first; + CompactEdge* edge = edge_pair.second; + 
file << ">" << "edge_id=" << edge_id; + file << " len=" << edge->sequence.length(); + file << " type=" << edge_meta_repr(edge->meta); + file << " src=" << edge->in_node_id; + file << " tgt=" << edge->out_node_id; + file << std::endl; + file << edge->sequence; + file << std::endl; + } + + file.close(); +} + +}; diff --git a/src/oxli/hashgraph.cc b/src/oxli/hashgraph.cc index c9cd78d860..fb0df6b0ab 100644 --- a/src/oxli/hashgraph.cc +++ b/src/oxli/hashgraph.cc @@ -199,9 +199,13 @@ void Hashgraph::load_tagset(std::string infilename, bool clear_tags) void Hashgraph::consume_sequence_and_tag(const std::string& seq, unsigned long long& n_consumed, - SeenSet * found_tags) + SeenSet * found_tags, + SeenSet * tag_set) { bool kmer_tagged; + if (tag_set == nullptr) { + tag_set = &all_tags; + } KmerIterator kmers(seq.c_str(), _ksize); HashIntoType kmer; @@ -226,11 +230,11 @@ void Hashgraph::consume_sequence_and_tag(const std::string& seq, ++since; } else { ACQUIRE_ALL_TAGS_SPIN_LOCK - kmer_tagged = set_contains(all_tags, kmer); + kmer_tagged = set_contains(*tag_set, kmer); RELEASE_ALL_TAGS_SPIN_LOCK if (kmer_tagged) { since = 1; - if (found_tags) { + if (found_tags != nullptr) { found_tags->insert(kmer); } } else { @@ -238,9 +242,9 @@ void Hashgraph::consume_sequence_and_tag(const std::string& seq, } } #else - if (!is_new_kmer && set_contains(all_tags, kmer)) { + if (!is_new_kmer && set_contains(*tag_set, kmer)) { since = 1; - if (found_tags) { + if (found_tags != nullptr) { found_tags->insert(kmer); } } else { @@ -250,9 +254,9 @@ void Hashgraph::consume_sequence_and_tag(const std::string& seq, if (since >= _tag_density) { ACQUIRE_ALL_TAGS_SPIN_LOCK - all_tags.insert(kmer); + tag_set->insert(kmer); RELEASE_ALL_TAGS_SPIN_LOCK - if (found_tags) { + if (found_tags != nullptr) { found_tags->insert(kmer); } since = 1; @@ -262,9 +266,9 @@ void Hashgraph::consume_sequence_and_tag(const std::string& seq, if (since >= _tag_density/2 - 1) { ACQUIRE_ALL_TAGS_SPIN_LOCK - all_tags.insert(kmer); // insert the last k-mer, too. + tag_set->insert(kmer); // insert the last k-mer, too. RELEASE_ALL_TAGS_SPIN_LOCK - if (found_tags) { + if (found_tags != nullptr) { found_tags->insert(kmer); } } @@ -431,7 +435,7 @@ const } KmerQueue node_q; - node_q.push(start); + node_q.push_front(start); // Avoid high-circumference k-mers Traverser traverser(this); @@ -443,7 +447,7 @@ const while(!node_q.empty()) { Kmer node = node_q.front(); - node_q.pop(); + node_q.pop_front(); // have we already seen me? don't count; exit. 
if (set_contains(keeper, node)) { @@ -484,6 +488,11 @@ unsigned int Hashgraph::kmer_degree(const char * kmer_s) return traverser.degree(node); } +unsigned int Hashgraph::kmer_degree(Kmer kmer) +{ + return kmer_degree(kmer.kmer_r, kmer.kmer_f); +} + size_t Hashgraph::trim_on_stoptags(std::string seq) const { KmerIterator kmers(seq.c_str(), _ksize); @@ -518,12 +527,12 @@ const }; Traverser traverser(this, filter); - node_q.push(start); + node_q.push_front(start); breadth_q.push(0); while(!node_q.empty()) { Kmer node = node_q.front(); - node_q.pop(); + node_q.pop_front(); unsigned int breadth = breadth_q.front(); breadth_q.pop(); @@ -871,7 +880,7 @@ const while (node_q.size()) { Kmer node = node_q.front(); - node_q.pop(); + node_q.pop_front(); if (set_contains(high_degree_nodes, node)) { // if there are any adjacent high degree nodes, record; diff --git a/src/oxli/kmer_filters.cc b/src/oxli/kmer_filters.cc index 987ec327f2..8d7e98b38b 100644 --- a/src/oxli/kmer_filters.cc +++ b/src/oxli/kmer_filters.cc @@ -61,6 +61,18 @@ bool apply_kmer_filters(const Kmer& node, const std::list& filters) } +void apply_kmer_helpers(const Kmer& node, const KmerHelperList& helpers) +{ + if (!helpers.size()) { + return; + } + + for (auto helper: helpers) { + helper(node); + } +} + + KmerFilter get_label_filter(const Label label, const LabelHash * lh) { KmerFilter filter = [=] (const Kmer& node) { @@ -118,6 +130,17 @@ KmerFilter get_simple_label_intersect_filter(const LabelSet& src_labels, return filter; } +/* +KmerFilter get_link_filter(const Kmer& src_node, + std::shared_ptr links, + std::shared_ptr< std::list > ages, + const unsigned int min_count) +{ + KmerFilter filter = [=] (const Kmer& node) { + + } +} +*/ KmerFilter get_junction_count_filter(const Kmer& src_node, Countgraph * junctions, diff --git a/src/oxli/kmer_hash.cc b/src/oxli/kmer_hash.cc index 8378ee1936..1edd1c33eb 100644 --- a/src/oxli/kmer_hash.cc +++ b/src/oxli/kmer_hash.cc @@ -199,14 +199,24 @@ HashIntoType _hash_murmur(const std::string& kmer, const WordLength k, HashIntoType _hash_murmur_forward(const std::string& kmer, const WordLength k) { - HashIntoType h = 0; - HashIntoType r = 0; + uint64_t out[2]; + uint64_t seed = 0; + MurmurHash3_x64_128((void*)kmer.c_str(), k, seed, &out); + return out[0]; - oxli::_hash_murmur(kmer, k, h, r); +} + + +HashIntoType _hash_murmur_forward(const std::string& kmer) +{ + uint64_t out[2]; + uint64_t seed = 0; + MurmurHash3_x64_128((void*)kmer.c_str(), kmer.length(), seed, &out); + return out[0]; - return h; } + HashIntoType _hash_cyclic(const std::string& kmer, const WordLength k) { HashIntoType h = 0; @@ -251,11 +261,11 @@ HashIntoType _hash_cyclic(const std::string& kmer, const WordLength k, HashIntoType _hash_cyclic_forward(const std::string& kmer, const WordLength k) { - HashIntoType h = 0; - HashIntoType r = 0; - - oxli::_hash_cyclic(kmer, k, h, r); - return h; + CyclicHash hasher(k); + for (WordLength i = 0; i < k; ++i) { + hasher.eat(kmer[i]); + } + return hasher.hashvalue; } diff --git a/src/oxli/map_type_test.cc b/src/oxli/map_type_test.cc new file mode 100644 index 0000000000..2dde9423e3 --- /dev/null +++ b/src/oxli/map_type_test.cc @@ -0,0 +1,249 @@ +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +using namespace oxli; +using namespace oxli::read_parsers; + +#define K 21 + +unsigned long long llrand() { + unsigned long long r = 0; + + for (int i = 0; i < 5; ++i) { + r = (r << 15) | (rand() & 0x7FFF); + } + + return r & 0xFFFFFFFFFFFFFFFFULL; +} + 
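+// (Why llrand(): rand() only guarantees 15 random bits per call, since
+// RAND_MAX may be as small as 32767, so five 15-bit draws are shifted
+// together to cover 64 bits; the final mask keeps the result within an
+// unsigned 64-bit range even on platforms where unsigned long long is
+// wider.)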
+FastxParserPtr get_test_reads() { + FastxParserPtr parser = get_parser("../../tests/test-data/test-reads.fa"); + return parser; +} + +vector * get_test_kmers(int num_hashes=5000000) { + vector * hashes = new vector(); + while(num_hashes > 0) { + hashes->push_back(llrand()); + num_hashes--; + } + return hashes; +} + + +void fill_gmap(GuardedHashMap& _map, vector * hashes) { + for(auto hash: *hashes) { + _map.set(hash, rand()); + } +} + + +void fill_uomap(std::unordered_map& _map, vector * hashes) { + for (auto hash: *hashes) { + _map[hash] = rand(); + } +} + +void fill_map(std::map& _map, vector * hashes) { + for (auto hash: *hashes) { + _map[hash] = rand(); + } +} + +void test_gmap(vector * hashes) { + + std::cout << "=== GMAP ===" << std::endl; + + vector get_full_times; + vector get_empty_times; + vector get_bad_times; + GuardedHashMap _map(K, 4, 1000000); + std::chrono::time_point start, end; + + fill_gmap(_map, hashes); + for (auto hash: *hashes) { + start = std::chrono::system_clock::now(); + int result = _map.get(hash); + end = std::chrono::system_clock::now(); + std::chrono::duration elapsed_seconds = end-start; + get_full_times.push_back(elapsed_seconds.count()); + } + double avg_get_full_time = std::accumulate(get_full_times.begin(), + get_full_times.end(), 0.0) / get_full_times.size(); + std::cout << "Avg full get time: " << avg_get_full_time << std::endl; + + + vector * newhashes = get_test_kmers(); + for (auto hash: *newhashes) { + start = std::chrono::system_clock::now(); + int result = _map.get(hash); + end = std::chrono::system_clock::now(); + std::chrono::duration elapsed_seconds = end-start; + get_bad_times.push_back(elapsed_seconds.count()); + } + double avg_get_bad_time = std::accumulate(get_bad_times.begin(), + get_bad_times.end(), 0.0) / get_bad_times.size(); + std::cout << "Avg bad get time: " << avg_get_bad_time << std::endl; + delete newhashes; + + + _map = GuardedHashMap(K, 4, 1000000); + + for (auto hash: *hashes) { + start = std::chrono::system_clock::now(); + int result = _map.get(hash); + end = std::chrono::system_clock::now(); + std::chrono::duration elapsed_seconds = end-start; + + get_empty_times.push_back(elapsed_seconds.count()); + } + + double avg_get_empty_time = std::accumulate(get_empty_times.begin(), + get_empty_times.end(), 0.0) / get_empty_times.size(); + std::cout << "Avg empty get time: " << avg_get_empty_time << std::endl; +} + +void test_uomap(vector * hashes) { + + std::cout << "=== UOMAP ===" << std::endl; + + vector get_full_times; + vector get_empty_times; + vector get_bad_times; + std::unordered_map _map; + std::chrono::time_point start, end; + + fill_uomap(_map, hashes); + + for (auto hash: *hashes) { + start = std::chrono::system_clock::now(); + int result; + auto search = _map.find(hash); + if (search != _map.end()) { + result = search->second; + } + end = std::chrono::system_clock::now(); + std::chrono::duration elapsed_seconds = end-start; + + get_full_times.push_back(elapsed_seconds.count()); + } + double avg_get_full_time = std::accumulate(get_full_times.begin(), + get_full_times.end(), 0.0) / get_full_times.size(); + std::cout << "Avg full get time: " << avg_get_full_time << std::endl; + + + vector * newhashes = get_test_kmers(); + for (auto hash: *newhashes) { + start = std::chrono::system_clock::now(); + int result; + auto search = _map.find(hash); + if (search != _map.end()) { + result = search->second; + } + end = std::chrono::system_clock::now(); + std::chrono::duration elapsed_seconds = end-start; + 
get_bad_times.push_back(elapsed_seconds.count()); + } + double avg_get_bad_time = std::accumulate(get_bad_times.begin(), + get_bad_times.end(), 0.0) / get_bad_times.size(); + std::cout << "Avg bad get time: " << avg_get_bad_time << std::endl; + delete newhashes; + + + _map = std::unordered_map(); + for (auto hash: *hashes) { + start = std::chrono::system_clock::now(); + int result; + auto search = _map.find(hash); + if (search != _map.end()) { + result = search->second; + } + end = std::chrono::system_clock::now(); + std::chrono::duration elapsed_seconds = end-start; + + get_empty_times.push_back(elapsed_seconds.count()); + } + + double avg_get_empty_time = std::accumulate(get_empty_times.begin(), + get_empty_times.end(), 0.0) / get_empty_times.size(); + std::cout << "Avg empty get time: " << avg_get_empty_time << std::endl; +} + +void test_map(vector * hashes) { + + std::cout << "=== MAP ===" << std::endl; + + vector get_full_times; + vector get_empty_times; + vector get_bad_times; + std::map _map; + std::chrono::time_point start, end; + + fill_map(_map, hashes); + for (auto hash: *hashes) { + start = std::chrono::system_clock::now(); + int result; + auto search = _map.find(hash); + if (search != _map.end()) { + result = search->second; + } + end = std::chrono::system_clock::now(); + std::chrono::duration elapsed_seconds = end-start; + + get_full_times.push_back(elapsed_seconds.count()); + } + double avg_get_full_time = std::accumulate(get_full_times.begin(), + get_full_times.end(), 0.0) / get_full_times.size(); + std::cout << "Avg full get time: " << avg_get_full_time << std::endl; + + vector * newhashes = get_test_kmers(); + for (auto hash: *newhashes) { + start = std::chrono::system_clock::now(); + int result; + auto search = _map.find(hash); + if (search != _map.end()) { + result = search->second; + } + end = std::chrono::system_clock::now(); + std::chrono::duration elapsed_seconds = end-start; + get_bad_times.push_back(elapsed_seconds.count()); + } + double avg_get_bad_time = std::accumulate(get_bad_times.begin(), + get_bad_times.end(), 0.0) / get_bad_times.size(); + std::cout << "Avg bad get time: " << avg_get_bad_time << std::endl; + delete newhashes; + + _map = std::map(); + for (auto hash: *hashes) { + start = std::chrono::system_clock::now(); + int result; + auto search = _map.find(hash); + if (search != _map.end()) { + result = search->second; + } + end = std::chrono::system_clock::now(); + std::chrono::duration elapsed_seconds = end-start; + + get_empty_times.push_back(elapsed_seconds.count()); + } + + double avg_get_empty_time = std::accumulate(get_empty_times.begin(), + get_empty_times.end(), 0.0) / get_empty_times.size(); + std::cout << "Avg empty get time: " << avg_get_empty_time << std::endl; +} + + +int main() { + vector * hashes = get_test_kmers(); + test_gmap(hashes); + test_uomap(hashes); + test_map(hashes); +} diff --git a/src/oxli/oxli.cc b/src/oxli/oxli.cc new file mode 100644 index 0000000000..6f643213e2 --- /dev/null +++ b/src/oxli/oxli.cc @@ -0,0 +1,13 @@ +#include + +namespace oxli { + +std::string get_version_cpp() +{ +#define _macro_xstr(s) _macro_str(s) +#define _macro_str(s) #s + std::string dVersion = _macro_xstr(VERSION); + return dVersion; +} + +} diff --git a/khmer/_oxli/oxli_exception_convert.cc b/src/oxli/oxli_exception_convert.cc similarity index 84% rename from khmer/_oxli/oxli_exception_convert.cc rename to src/oxli/oxli_exception_convert.cc index 0e5d2f9935..df9fbfeb75 100644 --- a/khmer/_oxli/oxli_exception_convert.cc +++ 
diff --git a/khmer/_oxli/oxli_exception_convert.cc b/src/oxli/oxli_exception_convert.cc
similarity index 84%
rename from khmer/_oxli/oxli_exception_convert.cc
rename to src/oxli/oxli_exception_convert.cc
index 0e5d2f9935..df9fbfeb75 100644
--- a/khmer/_oxli/oxli_exception_convert.cc
+++ b/src/oxli/oxli_exception_convert.cc
@@ -2,7 +2,7 @@
 #include
 #include
 #include "oxli/oxli_exception.hh"
-#include "oxli_exception_convert.hh"
+#include "oxli/oxli_exception_convert.hh"
 
 
 void oxli_raise_py_error()
@@ -19,6 +19,9 @@ void oxli_raise_py_error()
     catch (oxli::InvalidStream& e) {
         PyErr_SetString(PyExc_OSError, e.what());
     }
+    catch (oxli::EmptyStream& e) {
+        PyErr_SetString(PyExc_RuntimeError, e.what());
+    }
     catch (oxli::oxli_value_exception& e) {
         PyErr_SetString(PyExc_ValueError, e.what());
     }
diff --git a/src/oxli/partitioning.cc b/src/oxli/partitioning.cc
new file mode 100644
index 0000000000..1ebebbbda6
--- /dev/null
+++ b/src/oxli/partitioning.cc
@@ -0,0 +1,408 @@
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "oxli/hashtable.hh"
+#include "oxli/hashgraph.hh"
+#include "oxli/partitioning.hh"
+
+using namespace oxli;
+using namespace oxli::read_parsers;
+
+uint64_t Component::n_created = 0;
+uint64_t Component::n_destroyed = 0;
+
+bool ComponentPtrCompare::operator() (const ComponentPtr& lhs,
+                                      const ComponentPtr& rhs) const {
+    return *lhs < *rhs;
+}
+
+inline std::ostream& operator<< (std::ostream& stream, Component& comp) {
+    stream << "<Component (id=" << comp.component_id
+           << ", n_tags=" << comp.get_n_tags() << ")>";
+    return stream;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+
+ComponentMap::ComponentMap(WordLength ksize,
+                           WordLength n_tables,
+                           uint64_t max_table_size) : components_lock(0),
+                                                      component_counter(0),
+                                                      n_live_components(0)
+{
+
+    tag_component_map = std::unique_ptr<GuardedHashCompMap>(
+                            new GuardedHashCompMap(ksize,
+                                                   n_tables,
+                                                   max_table_size));
+    components = std::make_shared<ComponentPtrVector>();
+}
+
+void ComponentMap::map_tags_to_component(TagVector& tags,
+                                         ComponentPtr& comp)
+{
+    for (auto tag: tags) {
+        tag_component_map->set(tag, comp);
+        comp->add_tag(tag);
+    }
+}
+
+void ComponentMap::create_component(TagVector& tags)
+{
+    ComponentPtr new_comp = std::make_shared<Component>(component_counter);
+    component_counter++;
+    n_live_components++;
+    components->push_back(new_comp);
+    map_tags_to_component(tags, new_comp);
+
+    //std::cout << "new component=" << *new_comp << std::endl;
+    //std::cout << components->size() << " components in vector" << std::endl;
+}
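// [Editor's aside, not part of the diff] The oxli_raise_py_error() shim
// updated above follows the standard C++-to-Python translation idiom:
// re-raise the active exception and map each concrete type onto a Python
// error. In isolation (assuming Python.h; must be called from a catch site):
//
//     void raise_py_error() {
//         try {
//             throw;  // re-raise the in-flight C++ exception
//         } catch (oxli::oxli_value_exception& e) {
//             PyErr_SetString(PyExc_ValueError, e.what());
//         } catch (std::exception& e) {
//             PyErr_SetString(PyExc_RuntimeError, e.what());
//         }
//     }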
+
+
+uint32_t ComponentMap::create_and_merge_components(TagVector& tags)
+{
+
+    // Now resolve components. First, get components from existing tags.
+    ComponentPtrSet found_comps;
+    TagVector new_tags;
+    for (auto tag: tags) {
+        ComponentPtr comp;
+        if ((comp = tag_component_map->get(tag)) != NULL) {
+            found_comps.insert(comp);
+        } else {
+            new_tags.push_back(tag);
+        }
+    }
+
+    uint32_t n_merged = 1;
+    if (found_comps.size() == 0) {
+        create_component(tags);
+    } else {
+        // Choose the largest component as the root;
+        // we want to minimize tag copying.
+        ComponentPtr root_comp = *(found_comps.begin());
+        for (auto other : found_comps) {
+            if (other->get_n_tags() > root_comp->get_n_tags()) {
+                root_comp = other;
+            }
+        }
+        // map the new tags to this component
+        root_comp->add_tags(new_tags);
+        map_tags_to_component(new_tags, root_comp);
+        if (found_comps.size() > 1) {
+            n_merged = merge_components(root_comp, found_comps);
+        }
+    }
+    return n_merged;
+}
+
+
+uint32_t ComponentMap::merge_components(ComponentPtr& root,
+                                        ComponentPtrSet& comps)
+{
+    uint32_t n_merged = 1;
+    //std::cout << "Merge with root=" << *root << std::endl;
+    for (auto other : comps) {
+        //std::cout << "\tmerge in " << *other << std::endl;
+        if (*other == *root) {
+            continue;
+        }
+        root->add_tags(other->tags);   // transfer the tags from the other comp
+        map_tags_to_component(other->tags, root);
+        (*components)[other->component_id]->kill();
+        (*components)[other->component_id] = nullptr;
+        n_live_components--;
+        n_merged++;
+
+    }
+    // The merged-away components are dropped from the vector; any active
+    // Python wrapper is left as their sole owner.
+    return n_merged;
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
+
+
+StreamingPartitioner::StreamingPartitioner(Hashgraph * graph,
+                                           uint32_t tag_density) :
+    ComponentMap::ComponentMap(graph->ksize(),
+                               graph->n_tables(),
+                               _cstr_get_max_table_size(graph)),
+    graph(graph),
+    _tag_density(tag_density),
+    n_consumed(0)
+{
+}
+
+
+uint64_t StreamingPartitioner::_cstr_get_max_table_size(Hashgraph * graph)
+{
+    std::vector<uint64_t> graph_table_sizes = graph->get_tablesizes();
+    return *std::max_element(graph_table_sizes.begin(),
+                             graph_table_sizes.end());
+}
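// [Editor's aside, not part of the diff] Choosing the largest found component
// as the merge root in create_and_merge_components() is the classic
// union-by-size heuristic: a tag is only copied when its component is
// absorbed into one at least as large, so its component size at least
// doubles on every move, and any given tag is moved O(log n_tags) times over
// the whole stream rather than O(n) in the worst case.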
+
+
+uint64_t StreamingPartitioner::consume_fasta(const std::string& filename)
+{
+    ReadParserPtr<FastxReader> parser = get_parser<FastxReader>(filename);
+    Read read;
+    uint64_t n_consumed = 0;
+
+    while (!parser->is_complete()) {
+        if (n_consumed && (n_consumed % 10000 == 0)) {
+            std::cout << "consumed " << n_consumed << "..." << std::endl;
+        }
+        try {
+            read = parser->get_next_read( );
+        } catch (NoMoreReadsAvailable) {
+            break;
+        }
+
+        read.set_clean_seq();
+        consume(read.sequence);
+        n_consumed++;
+    }
+
+    return n_consumed;
+}
+
+
+uint64_t StreamingPartitioner::consume(const std::string& seq)
+{
+    TagVector tags;
+    KmerQueue seeds;
+    std::set<HashIntoType> seen;
+
+    uint64_t n_new = seed_sequence(seq, tags, seeds, seen);
+    find_connected_tags(seeds, tags, seen, false);
+    //acquire_components();
+    create_and_merge_components(tags);
+    //release_components();
+    return n_new;
+}
+
+
+uint64_t StreamingPartitioner::consume_pair(const std::string& first,
+                                            const std::string& second)
+{
+    TagVector tags;
+    KmerQueue seeds;
+    std::set<HashIntoType> seen;
+
+    uint64_t n_new = seed_sequence(first, tags, seeds, seen);
+    n_new += seed_sequence(second, tags, seeds, seen);
+    find_connected_tags(seeds, tags, seen, false);
+    //acquire_components();
+    create_and_merge_components(tags);
+    //release_components();
+    return n_new;
+}
+
+
+ComponentPtr StreamingPartitioner::get(std::string& kmer) const
+{
+    HashIntoType h = graph->hash_dna(kmer.c_str());
+    return ComponentMap::get(h);
+}
+
+
+ComponentPtr StreamingPartitioner::get(HashIntoType h) const
+{
+    return ComponentMap::get(h);
+}
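// [Editor's aside, not part of the diff] In seed_sequence() below, `since`
// counts k-mers seen since the last tag: a tag is emitted whenever
// since >= _tag_density, the counter resets to 1 on every tag, and the final
// k-mer is tagged if it sits more than _tag_density/2 past the last tag. The
// net effect is that every k-mer in a consumed sequence lies within roughly
// _tag_density of some tag, which is what lets find_connected_tags() cap its
// search at max_breadth = _tag_density + 1.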
+
+
+uint64_t StreamingPartitioner::seed_sequence(const std::string& seq,
+                                             TagVector& tags,
+                                             KmerQueue& seeds,
+                                             std::set<HashIntoType>& seen)
+{
+    /* For the following comments, let G be the set of k-mers
+     * known in the graph before inserting the k-mers U from
+     * &seq, with / as the difference, + as the union, and &
+     * as the intersect operator.
+     */
+    //if (auto graphptr = graph.lock()) {
+#if(SP_DEBUG)
+    std::cout << "Consume sequence." << std::endl;
+#endif
+    uint64_t n_new = 0;
+    ++n_consumed;
+
+    if(graph != NULL) {
+        KmerIterator kmers(seq.c_str(), graph->ksize());
+        unsigned int since = _tag_density / 2 + 1;
+
+        KmerSet intersection;
+
+        bool in_known_territory = false;
+        bool found_tag_in_territory = false;
+
+        Kmer kmer;
+        do {
+            kmer = kmers.next();
+            bool is_new_kmer = graph->add(kmer);
+            bool kmer_tagged = false;
+
+            if (is_new_kmer) {
+                // A k-mer from U/G must be used as a search start for tags,
+                // as it could be adjacent to a k-mer in G/U
+                if (in_known_territory && found_tag_in_territory) {
+                    // If we had found a tag in the U&G component we just
+                    // left, add the component to the seen set.
+                    seen.insert(intersection.begin(), intersection.end());
+                } /*else {
+                    for (auto km : intersection) {
+                        seeds.push(km);
+                    }
+                }*/
+                intersection.clear();
+
+                seeds.push_back(kmer);
+                in_known_territory = false;
+                found_tag_in_territory = false;
+                ++since;
+                ++n_new;
+            } else {
+                // Keep track of connected components in U&G: when we exit
+                // this component, if there is a tag, we will want to add its
+                // nodes to the seen set, as we do not need to traverse from
+                // them in the tag search.
+                intersection.insert(kmer);
+                in_known_territory = true;
+                kmer_tagged = this->contains(kmer);
+                if (kmer_tagged) {
+                    since = 1;
+                    tags.push_back(kmer);
+                    found_tag_in_territory = true;
+                } else {
+                    ++since;
+                }
+            }
+
+            if (since >= _tag_density) {
+                tags.push_back(kmer);
+                since = 1;
+            }
+        } while (!kmers.done());
+
+        // always tag the last k-mer
+        if (since >= _tag_density / 2) {
+            tags.push_back(kmer);
+        }
+        seeds.push_back(kmer);
+
+        // now go back and make sure to search from the first k-mer
+        kmer = kmers.first();
+        seeds.push_back(kmer);
+
+#if(SP_DEBUG)
+        std::cout << "Done iterating k-mers" << std::endl;
+        std::cout << tags.size() << " tags in sequence" << std::endl;
+#endif
+    } else {
+        throw oxli_ptr_exception("Hashgraph has been deleted.");
+    }
+
+    return n_new;
+}
+
+ComponentPtr StreamingPartitioner::find_nearest_component(std::string& kmer) const
+{
+    Kmer hashed = graph->build_kmer(kmer);
+    return find_nearest_component(hashed);
+}
+
+
+ComponentPtr StreamingPartitioner::find_nearest_component(Kmer kmer) const
+{
+    TagVector tags;
+    std::set<HashIntoType> seen;
+    KmerQueue node_q;
+    node_q.push_front(kmer);
+
+    find_connected_tags(node_q, tags, seen, true);
+    if (tags.size() > 0) {
+        HashIntoType tag = *(tags.begin());
+        return this->get(tag);
+    } else {
+        return NULL;
+    }
+}
+
+
+void StreamingPartitioner::find_connected_tags(KmerQueue& node_q,
+                                               TagVector& found_tags,
+                                               std::set<HashIntoType>& seen,
+                                               bool truncate) const
+{
+
+    //if (auto graphptr = graph.lock()) {
+    if (graph != NULL) {
+
+        // put a 0 on the breadth queue for each element in the starting node queue
+        std::queue<unsigned int> breadth_q(std::deque<unsigned int>(node_q.size(), 0));
+
+        unsigned int cur_breadth = 0;
+        const unsigned int max_breadth = _tag_density + 1;
+
+        unsigned int total = 0;
+        unsigned int nfound = 0;
+
+        KmerFilter filter = [&] (const Kmer& n) -> bool {
+            return set_contains(seen, n);
+        };
+        Traverser traverser(graph, filter);
+
+        while(!node_q.empty()) {
+
+            Kmer node = node_q.front();
+            node_q.pop_front();
+
+            unsigned int breadth = breadth_q.front();
+            breadth_q.pop();
+
+            // keep track of seen kmers
+            seen.insert(node);
+            total++;
+
+            // Found a tag!
+            if (this->contains(node)) {
+                found_tags.push_back(node);
+                if (truncate) {
+                    return;
+                }
+                continue;
+            }
+
+            if (!(breadth >= cur_breadth)) {
+                throw oxli_exception("Desynchronization between traversal "
+                                     "and breadth tracking. Did you forget "
+                                     "to pop the node or breadth queue?");
+            }
+            if (breadth > cur_breadth) {
+                cur_breadth = breadth;
+            }
+
+            if (breadth >= max_breadth) {
+                continue;    // truncate search @CTB exit?
+            }
+
+            nfound = traverser.traverse(node, node_q);
+            for (unsigned int i = 0; i < nfound; ++i) {
+                breadth_q.push(breadth + 1);
+            }
+        }
+    } else {
+        throw oxli_ptr_exception("Hashgraph has been deleted.");
+    }
+}
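// [Editor's aside, not part of the diff] find_connected_tags() keeps a second
// queue of depths in lockstep with the node queue; every pop must take one
// element from each, which is exactly what the desynchronization check above
// guards. The pattern in isolation (neighbors() here is a hypothetical
// expansion function):
//
//     std::deque<int> nodes {start};
//     std::queue<unsigned int> depths(std::deque<unsigned int>(nodes.size(), 0));
//     while (!nodes.empty()) {
//         int n = nodes.front();           nodes.pop_front();
//         unsigned int d = depths.front(); depths.pop();
//         if (d >= max_depth) continue;    // truncate the search
//         for (int nb : neighbors(n)) {
//             nodes.push_back(nb);
//             depths.push(d + 1);          // one depth entry per node entry
//         }
//     }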
diff --git a/src/oxli/storage.cc b/src/oxli/storage.cc
--- a/src/oxli/storage.cc
+++ b/src/oxli/storage.cc
+QFStorage::QFStorage(int size)
+{
+    cf = std::make_shared<QF>();
+    // size is the power of two to specify the number of slots in
+    // the filter (2**size). Third argument sets the number of bits used
+    // in the key (current value of size+8 is copied from the CQF example)
+    // Final argument is the number of bits allocated for the value, which
+    // we do not use.
+    qf_init(cf.get(), (1ULL << size), size+8, 0);
+}
+
+
+QFStorage::~QFStorage()
+{
+    qf_destroy(cf.get());
+}
+
+
+bool QFStorage::add(HashIntoType khash)
+{
+    bool is_new = get_count(khash) == 0;
+    qf_insert(cf.get(), khash % cf->range, 0, 1);
+    return is_new;
+}
+
+
+const BoundedCounterType QFStorage::get_count(HashIntoType khash) const
+{
+    return qf_count_key_value(cf.get(), khash % cf->range, 0);
+}
+
+
+std::vector<uint64_t> QFStorage::get_tablesizes() const
+{
+    return {cf->xnslots};
+}
+
+
+const uint64_t QFStorage::n_unique_kmers() const
+{
+    return cf->ndistinct_elts;
+}
+
+
+const uint64_t QFStorage::n_occupied() const
+{
+    return cf->noccupied_slots;
+}
+
 void QFStorage::save(std::string outfilename, WordLength ksize)
 {
     ofstream outfile(outfilename.c_str(), ios::binary);
@@ -931,25 +992,25 @@ void QFStorage::save(std::string outfilename, WordLength ksize)
     /* just a hack to handle __uint128_t value. Don't know a better way to
      * handle it right now */
     uint64_t tmp_range;
-    tmp_range = cf.range;
-
-    outfile.write((const char *) &cf.nslots, sizeof(cf.nslots));
-    outfile.write((const char *) &cf.xnslots, sizeof(cf.xnslots));
-    outfile.write((const char *) &cf.key_bits, sizeof(cf.key_bits));
-    outfile.write((const char *) &cf.value_bits, sizeof(cf.value_bits));
-    outfile.write((const char *) &cf.key_remainder_bits, sizeof(cf.key_remainder_bits));
-    outfile.write((const char *) &cf.bits_per_slot, sizeof(cf.bits_per_slot));
+    tmp_range = cf->range;
+
+    outfile.write((const char *) &cf->nslots, sizeof(cf->nslots));
+    outfile.write((const char *) &cf->xnslots, sizeof(cf->xnslots));
+    outfile.write((const char *) &cf->key_bits, sizeof(cf->key_bits));
+    outfile.write((const char *) &cf->value_bits, sizeof(cf->value_bits));
+    outfile.write((const char *) &cf->key_remainder_bits, sizeof(cf->key_remainder_bits));
+    outfile.write((const char *) &cf->bits_per_slot, sizeof(cf->bits_per_slot));
     outfile.write((const char *) &tmp_range, sizeof(tmp_range));
-    outfile.write((const char *) &cf.nblocks, sizeof(cf.nblocks));
-    outfile.write((const char *) &cf.nelts, sizeof(cf.nelts));
-    outfile.write((const char *) &cf.ndistinct_elts, sizeof(cf.ndistinct_elts));
-    outfile.write((const char *) &cf.noccupied_slots, sizeof(cf.noccupied_slots));
+    outfile.write((const char *) &cf->nblocks, sizeof(cf->nblocks));
+    outfile.write((const char *) &cf->nelts, sizeof(cf->nelts));
+    outfile.write((const char *) &cf->ndistinct_elts, sizeof(cf->ndistinct_elts));
+    outfile.write((const char *) &cf->noccupied_slots, sizeof(cf->noccupied_slots));
 #if BITS_PER_SLOT == 8 || BITS_PER_SLOT == 16 || BITS_PER_SLOT == 32 || BITS_PER_SLOT == 64
-    outfile.write((const char *) cf.blocks, sizeof(qfblock) * cf.nblocks);
+    outfile.write((const char *) cf->blocks, sizeof(qfblock) * cf->nblocks);
 #else
-    outfile.write((const char *) cf.blocks,
-                  (sizeof(qfblock) + SLOTS_PER_BLOCK * cf.bits_per_slot / 8) * cf.nblocks);
+    outfile.write((const char *) cf->blocks,
+                  (sizeof(qfblock) + SLOTS_PER_BLOCK * cf->bits_per_slot / 8) * cf->nblocks);
 #endif
     outfile.close();
 }
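// [Editor's aside, not part of the diff] save() above and load() below are
// mirror images: every field is written and re-read in the same order with
// the same sizeof, so a change to one must always be applied to both. And
// because raw struct bytes are written, the resulting files are not portable
// across platforms with different endianness or BITS_PER_SLOT configurations.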
@@ -1011,34 +1072,34 @@ void QFStorage::load(std::string infilename, WordLength &ksize)
     infile.read((char *) &save_ksize, sizeof(save_ksize));
     ksize = save_ksize;
 
-    infile.read((char *) &cf.nslots, sizeof(cf.nslots));
-    infile.read((char *) &cf.xnslots, sizeof(cf.xnslots));
-    infile.read((char *) &cf.key_bits, sizeof(cf.key_bits));
-    infile.read((char *) &cf.value_bits, sizeof(cf.value_bits));
-    infile.read((char *) &cf.key_remainder_bits, sizeof(cf.key_remainder_bits));
-    infile.read((char *) &cf.bits_per_slot, sizeof(cf.bits_per_slot));
+    infile.read((char *) &cf->nslots, sizeof(cf->nslots));
+    infile.read((char *) &cf->xnslots, sizeof(cf->xnslots));
+    infile.read((char *) &cf->key_bits, sizeof(cf->key_bits));
+    infile.read((char *) &cf->value_bits, sizeof(cf->value_bits));
+    infile.read((char *) &cf->key_remainder_bits, sizeof(cf->key_remainder_bits));
+    infile.read((char *) &cf->bits_per_slot, sizeof(cf->bits_per_slot));
     infile.read((char *) &tmp_range, sizeof(tmp_range));
-    infile.read((char *) &cf.nblocks, sizeof(cf.nblocks));
-    infile.read((char *) &cf.nelts, sizeof(cf.nelts));
-    infile.read((char *) &cf.ndistinct_elts, sizeof(cf.ndistinct_elts));
-    infile.read((char *) &cf.noccupied_slots, sizeof(cf.noccupied_slots));
+    infile.read((char *) &cf->nblocks, sizeof(cf->nblocks));
+    infile.read((char *) &cf->nelts, sizeof(cf->nelts));
+    infile.read((char *) &cf->ndistinct_elts, sizeof(cf->ndistinct_elts));
+    infile.read((char *) &cf->noccupied_slots, sizeof(cf->noccupied_slots));
     /* just a hack to handle __uint128_t value. Don't know a better way to
      * handle it right now */
-    cf.range = tmp_range;
+    cf->range = tmp_range;
     // deallocate previously allocated blocks
-    free(cf.blocks);
+    free(cf->blocks);
     /* allocate the space for the actual qf blocks */
 #if BITS_PER_SLOT == 8 || BITS_PER_SLOT == 16 || BITS_PER_SLOT == 32 || BITS_PER_SLOT == 64
-    cf.blocks = (qfblock *)calloc(cf.nblocks, sizeof(qfblock));
+    cf->blocks = (qfblock *)calloc(cf->nblocks, sizeof(qfblock));
 #else
-    cf.blocks = (qfblock *)calloc(cf.nblocks, sizeof(qfblock) + SLOTS_PER_BLOCK * cf.bits_per_slot / 8);
+    cf->blocks = (qfblock *)calloc(cf->nblocks, sizeof(qfblock) + SLOTS_PER_BLOCK * cf->bits_per_slot / 8);
 #endif
 #if BITS_PER_SLOT == 8 || BITS_PER_SLOT == 16 || BITS_PER_SLOT == 32 || BITS_PER_SLOT == 64
-    infile.read((char *) cf.blocks, sizeof(qfblock) * cf.nblocks);
+    infile.read((char *) cf->blocks, sizeof(qfblock) * cf->nblocks);
 #else
-    infile.read((char *) cf.blocks,
-                (sizeof(qfblock) + SLOTS_PER_BLOCK * cf.bits_per_slot / 8) * cf.nblocks);
+    infile.read((char *) cf->blocks,
+                (sizeof(qfblock) + SLOTS_PER_BLOCK * cf->bits_per_slot / 8) * cf->nblocks);
 #endif
     infile.close();
 }
diff --git a/src/oxli/subset.cc b/src/oxli/subset.cc
index 280d217a74..1116c80a4f 100644
--- a/src/oxli/subset.cc
+++ b/src/oxli/subset.cc
@@ -234,7 +234,7 @@ void SubsetPartition::find_all_tags(
     };
     Traverser traverser(_ht, filter);
 
-    node_q.push(start_kmer);
+    node_q.push_front(start_kmer);
     breadth_q.push(0);
 
     while(!node_q.empty()) {
@@ -245,7 +245,7 @@ void SubsetPartition::find_all_tags(
         }
 
         Kmer node = node_q.front();
-        node_q.pop();
+        node_q.pop_front();
 
         unsigned int breadth = breadth_q.front();
         breadth_q.pop();
@@ -331,7 +331,7 @@ unsigned int SubsetPartition::sweep_for_tags(
         Kmer node = kmers.next();
         traversed_nodes.insert(node);
-        node_q.push(node);
+        node_q.push_front(node);
         breadth_q.push(0);
     }
@@ -347,7 +347,7 @@ unsigned int SubsetPartition::sweep_for_tags(
         }
 
         Kmer node = node_q.front();
-        node_q.pop();
+        node_q.pop_front();
 
         unsigned int breadth = breadth_q.front();
         breadth_q.pop();
@@ -423,7 +423,7 @@ void SubsetPartition::find_all_tags_truncate_on_abundance(
     Traverser traverser(_ht, filter);
 
-    node_q.push(start_kmer);
+    node_q.push_front(start_kmer);
     breadth_q.push(0);
 
     while(!node_q.empty()) {
@@ -433,7 +433,7 @@ void SubsetPartition::find_all_tags_truncate_on_abundance(
         }
 
         Kmer node = node_q.front();
-        node_q.pop();
+        node_q.pop_front();
 
         unsigned int breadth = breadth_q.front();
         breadth_q.pop();
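// [Editor's aside, not part of the diff] The push()/pop() to
// push_front()/pop_front() changes in subset.cc track KmerQueue's move from a
// FIFO adapter to a deque-like container; std::queue has no push_front(),
// while a deque supports both front insertion here and the generic
// found.insert(found.end(), ...) used by the templated neighbors() in
// traversal.cc below.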
diff --git a/src/oxli/traversal.cc b/src/oxli/traversal.cc
index e3befe17c6..a2024dd234 100644
--- a/src/oxli/traversal.cc
+++ b/src/oxli/traversal.cc
@@ -71,7 +71,7 @@ NodeGatherer<direction>::NodeGatherer(const Hashgraph * ht) :
 
 template<bool direction>
 NodeGatherer<direction>::NodeGatherer(const Hashgraph * ht,
-                                      KmerFilter filter) :
+                                      KmerFilter filter)  :
     NodeGatherer(ht, KmerFilterList())
 {
     filters.push_back(filter);
@@ -113,24 +113,27 @@ const
 template<bool direction>
+template<typename Container>
 unsigned int NodeGatherer<direction>::neighbors(const Kmer& node,
-                                                KmerQueue & node_q)
+                                                Container& found)
 const
 {
-    unsigned int found = 0;
+    unsigned int n_found = 0;
     for (auto base : alphabets::DNA_SIMPLE) {
         // Get the putative neighboring Kmer
         Kmer neighbor = get_neighbor(node, base);
 
         // Now check if it's in the graph and passes the filters
-        if (graph->get_count(neighbor) && !(apply_kmer_filters(neighbor, filters))) {
-            node_q.push(neighbor);
-            ++found;
+        if (graph->get_count(neighbor)) {
+            ++n_found;
+            if (!apply_kmer_filters(neighbor, filters)) {
+                found.insert(found.end(), neighbor);
+            }
         }
         ++base;
     }
-    return found;
+    return n_found;
 }
@@ -276,33 +279,51 @@ unsigned int Traverser::degree_right(const Kmer& node) const
 
 template<bool direction>
 AssemblerTraverser<direction>::AssemblerTraverser(const Hashgraph * ht,
-        Kmer start_kmer,
-        KmerFilterList filters) :
-    NodeCursor<direction>(ht, start_kmer, filters)
+        Kmer start_kmer) :
+    NodeCursor<direction>(ht, start_kmer)
 {
-    visited = std::make_shared<SeenSet>();
-    AssemblerTraverser<direction>::push_filter(get_visited_filter(visited));
+    _init_visited();
 }
 
 template<bool direction>
 AssemblerTraverser<direction>::AssemblerTraverser(const Hashgraph * ht,
-        Kmer start_kmer,
-        KmerFilterList filters,
-        std::shared_ptr<SeenSet> visited) :
-    NodeCursor<direction>(ht, start_kmer, filters), visited(visited)
+        Kmer start_kmer,
+        KmerFilterList filters) :
+    NodeCursor<direction>(ht, start_kmer, filters)
+
+{
+    _init_visited();
+}
+
+template<bool direction>
+AssemblerTraverser<direction>::AssemblerTraverser(const Hashgraph * ht,
+        Kmer start_kmer,
+        KmerFilterList filters,
+        std::shared_ptr<SeenSet> visited) :
+    NodeCursor<direction>(ht, start_kmer, filters), visited(visited)
 {
     AssemblerTraverser<direction>::push_filter(get_visited_filter(visited));
 }
+
+template<bool direction>
+AssemblerTraverser<direction>::AssemblerTraverser(const Hashgraph * ht,
+        Kmer start_kmer,
+        KmerFilter filter) :
+    NodeCursor<direction>(ht, start_kmer, filter)
+{
+    _init_visited();
+}
+
+
 template<bool direction>
-AssemblerTraverser<direction>::AssemblerTraverser(const AssemblerTraverser<direction>& other) :
-    AssemblerTraverser<direction>(other.graph,
-                                  other.cursor,
-                                  other.filters,
-                                  other.visited)
+AssemblerTraverser<direction>::AssemblerTraverser(const AssemblerTraverser<direction>& other) :
+    AssemblerTraverser<direction>(other.graph, other.cursor, other.filters, other.visited)
 {
+
 }
+
 template <>
 std::string AssemblerTraverser<TRAVERSAL_RIGHT>::join_contigs(std::string& contig_a,
         std::string& contig_b, WordLength offset)
@@ -328,6 +349,7 @@ char AssemblerTraverser<direction>::next_symbol()
     Kmer cursor_next;
 
     visited->insert(this->cursor);
+    apply_kmer_helpers(this->cursor, this->helpers);
     for (auto base : alphabets::DNA_SIMPLE) {
         // Get the putative neighbor for this base at the cursor position
         neighbor = NodeCursor<direction>::get_neighbor(this->cursor, base);
@@ -355,12 +377,71 @@ char AssemblerTraverser<direction>::next_symbol()
 }
 
 
+/******************************************
+ * CompactingAT
+ ******************************************/
+
+template<bool direction>
+CompactingAT<direction>::CompactingAT(const Hashgraph * ht,
+                                      Kmer start_kmer) :
+    AssemblerTraverser<direction>(ht, start_kmer), traverser(ht)
+{
+}
+
+template<bool direction>
+CompactingAT<direction>::CompactingAT(const Hashgraph * ht,
+                                      Kmer start_kmer,
+                                      KmerFilterList filters,
+                                      std::shared_ptr<SeenSet> visited) :
+    AssemblerTraverser<direction>(ht, start_kmer, filters, visited), traverser(ht)
+{
+}
+
+
+template<bool direction>
+CompactingAT<direction>::CompactingAT(const Hashgraph * ht,
+                                      Kmer start_kmer,
+                                      KmerFilterList filters) :
+    AssemblerTraverser<direction>(ht, start_kmer, filters), traverser(ht)
+{
+}
+
+template<bool direction>
+CompactingAT<direction>::CompactingAT(const Hashgraph * ht,
+                                      Kmer start_kmer,
+                                      KmerFilter filter) :
+    AssemblerTraverser<direction>(ht, start_kmer, filter), traverser(ht)
+{
+}
+
+template<>
+char CompactingAT<TRAVERSAL_RIGHT>::next_symbol()
+{
+    if (traverser.degree_left(this->cursor) > 1) {
+        return '\0';
+    }
+    return AssemblerTraverser<TRAVERSAL_RIGHT>::next_symbol();
+}
+
+
+template<>
+char CompactingAT<TRAVERSAL_LEFT>::next_symbol()
+{
+    if (traverser.degree_right(this->cursor) > 1) {
+        return '\0';
+    }
+    return AssemblerTraverser<TRAVERSAL_LEFT>::next_symbol();
+}
+
+
 template class NodeGatherer<TRAVERSAL_LEFT>;
 template class NodeGatherer<TRAVERSAL_RIGHT>;
 template class NodeCursor<TRAVERSAL_LEFT>;
 template class NodeCursor<TRAVERSAL_RIGHT>;
 template class AssemblerTraverser<TRAVERSAL_LEFT>;
 template class AssemblerTraverser<TRAVERSAL_RIGHT>;
+template class CompactingAT<TRAVERSAL_LEFT>;
+template class CompactingAT<TRAVERSAL_RIGHT>;
 
 } // namespace oxli
diff --git a/tests/conftest.py b/tests/conftest.py
new file mode 100644
index 0000000000..1e7bf3f2a4
--- /dev/null
+++ b/tests/conftest.py
@@ -0,0 +1,14 @@
+
+
+
+def pytest_generate_tests(metafunc):
+    if 'ksize' in metafunc.fixturenames:
+        ksize = getattr(metafunc.function, '_ksize', None)
+        if ksize is None:
+            ksize = [21]
+        if isinstance(ksize, int):
+            ksize = [ksize]
+        metafunc.parametrize('ksize', ksize,
+                             ids=lambda k: 'K={0}'.format(k))
+
+
diff --git a/tests/graph_features.py b/tests/graph_features.py
deleted file mode 100755
index c2d6912846..0000000000
--- a/tests/graph_features.py
+++ /dev/null
@@ -1,535 +0,0 @@
-# -*- coding: UTF-8 -*-
-#
-# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
-# Copyright (C) 2010-2015, Michigan State University.
-# Copyright (C) 2015-2016, The Regents of the University of California.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are
-# met:
-#
-# * Redistributions of source code must retain the above copyright
-# notice, this list of conditions and the following disclaimer.
-#
-# * Redistributions in binary form must reproduce the above
-# copyright notice, this list of conditions and the following
-# disclaimer in the documentation and/or other materials provided
-# with the distribution.
-#
-# * Neither the name of the Michigan State University nor the names
-# of its contributors may be used to endorse or promote products
-# derived from this software without specific prior written
-# permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-# -# Contact: khmer-project@idyll.org -# pylint: disable=missing-docstring,protected-access,no-member,invalid-name - - -import itertools -import random - -import khmer -from khmer.khmer_args import estimate_optimal_with_K_and_f as optimal_fp -from khmer import reverse_complement as revcomp -from . import khmer_tst_utils as utils - -import pytest -import screed - - -# We just define this globally rather than in a module-level fixture, -# as we need it during parameterization and whatnot. -K = 21 - - -class Kmer(str): - - def __init__(self, value, pos=0): - self.pos = pos - - def __new__(cls, value, pos=0): - if not len(value) == K: - raise ValueError('bad k-mer length') - return str.__new__(cls, value) - - -def mutate_base(base): - if base in 'AT': - return random.choice('GC') - elif base in 'GC': - return random.choice('AT') - else: - assert False, 'bad base' - - -def mutate_sequence(sequence, N=1): - sequence = list(sequence) - positions = random.sample(range(len(sequence)), N) - - for i in positions: - sequence[i] = mutate_base(sequence[i]) - - return ''.join(sequence) - - -def mutate_position(sequence, pos): - sequence = list(sequence) - sequence[pos] = mutate_base(sequence[pos]) - return ''.join(sequence) - - -def get_random_sequence(length, exclude=None): - '''Generate a random (non-looping) nucleotide sequence. - - To be non-overlapping, the sequence should not include any repeated - length K-1 k-mers. - - Args: - exclude (str): If not None, add the k-mers from this sequence to the - seen set. - - Returns: - str: A random non-looping sequence. - ''' - - seen = set() - - def add_seen(kmer): - seen.add(kmer) - seen.add(revcomp(kmer)) - - if exclude is not None: - for pos in range(0, len(exclude) - K): - add_seen(exclude[pos:pos + K - 1]) - - seq = [random.choice('ACGT') for _ in range(K - 1)] # do first K-1 bases - add_seen(''.join(seq)) - - while(len(seq) < length): - next_base = random.choice('ACGT') - next_kmer = ''.join(seq[-K + 2:] + [next_base]) - assert len(next_kmer) == K - 1 - if (next_kmer) not in seen: - seq.append(next_base) - add_seen(next_kmer) - else: - continue - return ''.join(seq) - - -def reads(sequence, L=100, N=100, dbg_cover=False): - positions = list(range(len(sequence) - L)) - if dbg_cover is True: - for start in range(0, len(sequence), K): - read = sequence[start:start + L] - if len(read) < K: - read = sequence[-L:] - yield read - N -= 1 - if N < 0: - return - for i in range(N): - start = random.choice(positions) - yield sequence[start:start + L] - - -def kmers(sequence): - for i in range(len(sequence) - K + 1): - yield sequence[i:i + K] - - -def test_mutate_sequence(): - for _ in range(100): - assert 'A' not in mutate_sequence('A' * 10, 10) - assert 'T' not in mutate_sequence('T' * 10, 10) - assert 'C' not in mutate_sequence('C' * 10, 10) - assert 'G' not in mutate_sequence('G' * 10, 10) - - -def test_mutate_position(): - assert mutate_position('AAAA', 2) in ['AACA', 'AAGA'] - assert mutate_position('TTTT', 2) in ['TTCT', 'TTGT'] - assert mutate_position('CCCC', 2) in ['CCAC', 'CCTC'] - assert mutate_position('GGGG', 2) in ['GGAG', 'GGTG'] - - -def test_reads(): - contigfile = utils.get_test_data('simple-genome.fa') - contig = list(screed.open(contigfile))[0].sequence - - for read in reads(contig): - assert read in contig - - for read in reads(contig): - assert mutate_sequence(read) not in contig - - -''' -# GRAPH STRUCTURE FIXTURES - -These fixtures emit various graph structures with their corresponding -sequences and important nodes. 
They take a random sequence fixture and -a graph fixture, then consume sequence and generate k-mers accordingly. - -We're using a bespoke but simple language to describe graph structures in the -docstrings of these tests. It is as follows: - - o: Node - [x:y]: Node at position in sequence - [x:y]+S: Node at position in sequence with extra base (where S in ACGT) - (Name), ([x:y] Name): Named node, named node at position - → : Edge - ~~: Tandem →o→ repeats -''' - - -@pytest.fixture(params=['simple-genome.fa']) -def known_sequence(request): - fn = utils.get_test_data(request.param) - return list(screed.open(fn))[0].sequence - - -@pytest.fixture(params=list(range(500, 1600, 500)), - ids=lambda val: '(L={0})'.format(val)) -def random_sequence(request): - - def get(exclude=None): - return get_random_sequence(request.param, exclude=exclude) - - return get - - -@pytest.fixture(params=[khmer.Nodegraph, khmer.Countgraph], - ids=['(Type=Nodegraph)', '(Type=Countgraph)']) -def graph(request): - - num_kmers = 50000 - des_fp = 0.00001 - args = optimal_fp(num_kmers, des_fp) - print('Graph Params:', args) - - return request.param(K, args.htable_size, args.num_htables) - - -def hdn_counts(sequence, graph): - '''Get the degree distribution of nodes with degree more than 2. - ''' - - hdns = {} - for kmer in kmers(sequence): - d = graph.kmer_degree(kmer) - if d > 2: - hdns[d] = hdns.get(d, 0) + 1 - - return hdns - - -@pytest.fixture -def linear_structure(request, graph, random_sequence): - '''Sets up a simple linear path graph structure. - - sequence - [0]→o→o~~o→o→[-1] - ''' - sequence = random_sequence() - graph.consume(sequence) - - # Check for false positive neighbors in our graph - # Mark as an expected failure if any are found - if hdn_counts(sequence, graph): - request.applymarker(pytest.mark.xfail) - - return graph, sequence - - -@pytest.fixture(params=[K * 2, -K * 2], - ids=['(Where={0})'.format(i) for i in ['Start', 'End']]) -def right_tip_structure(request, graph, random_sequence): - ''' - Sets up a graph structure like so: - ([S+1:S+K]+B tip) - sequence ↗ - [0]→o→o~~o→(L)→([S:S+K] HDN)→(R)→o→o→o~~o→[-1] - - Where S is the start position of the high degreen node (HDN). - That is, it has a single branch at the Sth K-mer. - ''' - sequence = random_sequence() - S = request.param - if S < 0: - S = len(sequence) + S - # the HDN - HDN = Kmer(sequence[S:S + K], pos=S) - # left of the HDN - L = Kmer(sequence[S - 1:S - 1 + K], pos=S - 1) - # right of the HDN - R = Kmer(sequence[S + 1:S + 1 + K], pos=S + 1) - # the branch kmer - tip = Kmer(mutate_position(R, -1), - pos=R.pos) - - graph.consume(sequence) - graph.count(tip) - - # Check for false positive neighbors and mark as expected failure if found - if hdn_counts(sequence, graph) != {3: 1}: - request.applymarker(pytest.mark.xfail) - - return graph, sequence, L, HDN, R, tip - - -@pytest.fixture(params=[K * 2, -K * 2], - ids=['(Where={0})'.format(i) for i in ['Start', 'End']]) -def right_double_fork_structure(request, linear_structure, random_sequence): - ''' - Sets up a graph structure like so: - branch - ([S+1:S+K]+B)→o~~o→o - core_sequence ↗ - [0]→o→o~~o→(L)→([S:S+K] HDN)→(R)→o→o→o~~o→[-1] - - Where S is the start position of the high degreen node (HDN) - and B is the mutated base starting the branch. 
- ''' - - graph, core_sequence = linear_structure - print('\nCore Len:', len(core_sequence)) - branch_sequence = random_sequence(exclude=core_sequence) - print('Branch len:', len(branch_sequence)) - - # start position of the HDN - S = request.param - if S < 0: - S = len(core_sequence) + S - # the HDN - HDN = Kmer(core_sequence[S:S + K], pos=S) - # left of the HDN - L = Kmer(core_sequence[S - 1:S - 1 + K], pos=S - 1) - # right of the HDN - R = Kmer(core_sequence[S + 1:S + 1 + K], pos=S + 1) - # the branch sequence, mutated at position S+1 - branch_start = core_sequence[:R.pos] + mutate_position(R, -1) - branch_sequence = branch_start + branch_sequence - - graph.consume(core_sequence) - graph.consume(branch_sequence) - - # Check for false positive neighbors and mark as expected failure if found - core_hdns = hdn_counts(core_sequence, graph) - branch_hdns = hdn_counts(branch_sequence, graph) - - # the core and branch sequences should each have exactly - # ONE node of degree 3 (HDN) - if core_hdns != {3: 1} or branch_hdns != {3: 1}: - print(core_hdns, branch_hdns) - request.applymarker(pytest.mark.xfail) - - return graph, core_sequence, L, HDN, R, branch_sequence - - -@pytest.fixture -def right_triple_fork_structure(request, right_double_fork_structure, - random_sequence): - ''' - Sets up a graph structure like so: - - top_branch - ([:S+1]+B)→o~~o→o - core_sequence ↗ - [0]→o→o~~o→(L)→([S:S+K] HDN)→(R)→o→o→o~~o→[-1] - ↘ - ([:S+1]+B)→o~~o→o - bottom_branch - - Where S is the start position of the high degreen node (HDN). - ''' - - graph, core_sequence, L, HDN, R, top_sequence = right_double_fork_structure - bottom_branch = random_sequence(exclude=core_sequence + top_sequence) - print(len(core_sequence), len(top_sequence), len(bottom_branch)) - - # the branch sequence, mutated at position S+1 - # choose a base not already represented at that position - bases = {'A', 'C', 'G', 'T'} - mutated = random.choice(list(bases - {R[-1], top_sequence[R.pos + K - 1]})) - - bottom_sequence = core_sequence[:HDN.pos + K] + mutated + bottom_branch - - graph.consume(bottom_sequence) - - # Check for false positive neighbors and mark as expected failure if found - core_hdns = hdn_counts(core_sequence, graph) - top_hdns = hdn_counts(top_sequence, graph) - bottom_hdns = hdn_counts(bottom_sequence, graph) - - # the core, top, and bottom sequences should each have exactly - # ONE node of degree 4 (HDN) - if not (core_hdns == top_hdns == bottom_hdns == {4: 1}): - print(core_hdns, top_hdns, bottom_hdns) - request.applymarker(pytest.mark.xfail) - - return graph, core_sequence, L, HDN, R, top_sequence, bottom_sequence - - -@pytest.fixture(params=[K * 2, -K * 2], - ids=['(Where={0})'.format(i) for i in ['Start', 'End']]) -def left_tip_structure(request, graph, random_sequence): - ''' - Sets up a graph structure like so: - - branch - (B+[S:S+K-1] tip) - ↘ sequence - [0]→o~~o→(L)→([S:S+K] HDN)→(R)→o→o~~o→[-1] - - Where S is the start position of the HDN. 
- ''' - sequence = random_sequence() - S = request.param - if S < 0: - S = len(sequence) + S - tip = Kmer(mutate_position(sequence[S - 1:S - 1 + K], 0), - pos=S - 1 + K) - HDN = Kmer(sequence[S:S + K], pos=S) - L = Kmer(sequence[S - 1:S - 1 + K], pos=S - 1) - R = Kmer(sequence[S + 1:S + 1 + K], pos=S + 1) - - graph.consume(sequence) - graph.count(tip) - - # Check for false positive neighbors and mark as expected failure if found - if hdn_counts(sequence, graph) != {3: 1}: - request.applymarker(pytest.mark.xfail) - - return graph, sequence, L, HDN, R, tip - - -@pytest.fixture(params=[K * 2, -K * 2], - ids=['(Where={0})'.format(i) for i in ['Start', 'End']]) -def left_double_fork_structure(request, linear_structure, random_sequence): - ''' - Sets up a graph structure like so: - - o→o~~o→(B+[S:S+K-1]) - ↘ core_sequence - [0]→o→o~~o→(L)→([S:S+K] HDN)→(R)→o→o→o~~o→[-1] - - Where S is the start position of the high degreen node (HDN). - ''' - - graph, core_sequence = linear_structure - branch_sequence = random_sequence(exclude=core_sequence) - - # start position of the HDN - S = request.param - if S < 0: - S = len(core_sequence) + S - # the HDN - HDN = Kmer(core_sequence[S:S + K], pos=S) - # left of the HDN - L = Kmer(core_sequence[S - 1:S - 1 + K], pos=S - 1) - # right of the HDN - R = Kmer(core_sequence[S + 1:S + 1 + K], pos=S + 1) - # the branch sequence, mutated at position 0 in L, - # whih is equivalent to the K-1 prefix of HDN prepended with a new base - branch_start = mutate_position(L, 0) - branch_sequence = branch_sequence + \ - branch_start + core_sequence[L.pos + K:] - - graph.consume(core_sequence) - graph.consume(branch_sequence) - - # Check for false positive neighbors and mark as expected failure if found - core_hdns = hdn_counts(core_sequence, graph) - branch_hdns = hdn_counts(branch_sequence, graph) - - # the core and branch sequences should each have exactly - # ONE node of degree 3 (HDN) - if not (core_hdns == branch_hdns == {3: 1}): - request.applymarker(pytest.mark.xfail) - - return graph, core_sequence, L, HDN, R, branch_sequence - - -@pytest.fixture(params=[K * 2, (-K * 2) - 2], - ids=['(Where={0})'.format(i) for i in ['Start', 'End']]) -def snp_bubble_structure(request, linear_structure): - ''' - Sets up a graph structure resulting from a SNP (Single Nucleotide - Polymorphism). - - (HDN_L[1:]+SNP)→o~~o→(SNP+) - ↗ ↘ - o~~([S:S+K] HDN_L) ([S+K+1:S+2K+1] HDN_R)~~o - ↘ ↗ - (HDN_L[1:]+W)→o~~o~~o→(W+) - - Where S is the start position of HDN directly left of the SNP (HDN_L), - SNP is the mutated base, and W is the wildtype (original) base. - Of course, W and SNP could be interchanged here, we don't actually - know which is which ;) - - Note our parameterization: we need a bit more room from the ends, - so we bring the rightmost SNP a tad left. 
- ''' - - graph, wildtype_sequence = linear_structure - S = request.param - if S < 0: - S = len(wildtype_sequence) + S - snp_sequence = mutate_position(wildtype_sequence, S + K) - HDN_L = Kmer(wildtype_sequence[S:S + K], pos=S) - HDN_R = Kmer(wildtype_sequence[S + K + 1:S + 2 * K + 1], pos=S + K + 1) - - graph.consume(wildtype_sequence) - graph.consume(snp_sequence) - - # Check for false positive neighbors and mark as expected failure if found - w_hdns = hdn_counts(wildtype_sequence, graph) - snp_hdns = hdn_counts(snp_sequence, graph) - if not (w_hdns == snp_hdns == {3: 2}): - print(w_hdns, snp_hdns) - print(HDN_L, HDN_R) - print(wildtype_sequence[HDN_L.pos + K + 1]) - print(snp_sequence[HDN_L.pos + K + 1]) - request.applymarker(pytest.mark.xfail) - - return graph, wildtype_sequence, snp_sequence, HDN_L, HDN_R - - -@pytest.fixture(params=[2, 3, 4, 5, 6, 7, 8]) -def tandem_repeat_structure(request, linear_structure): - - graph, sequence = linear_structure - - tandem_repeats = sequence * request.param - graph.consume(tandem_repeats) - - if hdn_counts(tandem_repeats, graph): - request.applymarker(pytest.mark.xfail) - - return graph, sequence, tandem_repeats - - -@pytest.fixture -def circular_linear_structure(request, linear_structure): - graph, sequence = linear_structure - - sequence += sequence - - if hdn_counts(sequence, graph): - request.applymarker(pytest.mark.xfail) - - return graph, sequence diff --git a/tests/graph_structure_fixtures.py b/tests/graph_structure_fixtures.py new file mode 100644 index 0000000000..785dacd4c4 --- /dev/null +++ b/tests/graph_structure_fixtures.py @@ -0,0 +1,634 @@ +# -*- coding: UTF-8 -*- +# +# This file is part of khmer, https://github.com/dib-lab/khmer/, and is +# Copyright (C) 2010-2015, Michigan State University. +# Copyright (C) 2015-2016, The Regents of the University of California. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided +# with the distribution. +# +# * Neither the name of the Michigan State University nor the names +# of its contributors may be used to endorse or promote products +# derived from this software without specific prior written +# permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+#
+# Contact: khmer-project@idyll.org
+# pylint: disable=missing-docstring,protected-access,no-member,invalid-name
+
+
+import itertools
+import random
+
+import khmer
+from khmer.khmer_args import estimate_optimal_with_K_and_f as optimal_fp
+from khmer import reverse_complement as revcomp
+from . import khmer_tst_utils as utils
+
+import pytest
+import screed
+
+
+# Tests declare the k-size (or list of k-sizes) they need with this
+# decorator, which attaches a `_ksize` attribute; pytest_generate_tests()
+# in tests/conftest.py reads that attribute when parameterizing the
+# `ksize` fixture.
+
+def using_ksize(K=21):
+    def wrap(func):
+        setattr(func, '_ksize', K)
+        return func
+    return wrap
+
+
+def test_ksize(ksize):
+    assert ksize == 21
+
+
+@using_ksize(31)
+def test_ksize_override(ksize):
+    assert ksize == 31
+
+
+@using_ksize([25, 29])
+def test_ksize_override_param(ksize):
+    print('ksize is', ksize)
+    assert ksize in [25, 29]
+
+
+@pytest.fixture(params=[2, -2], ids=['Start', 'End'])
+def flank_coords(request, ksize):
+    return (request.param * ksize) + request.param
+
+
+class Kmer(str):
+
+    def __init__(self, value, pos=0):
+        self.pos = pos
+
+    def __new__(cls, value, pos=0):
+        return str.__new__(cls, value)
+
+    def __repr__(self):
+        return str(self) + " @" + str(self.pos)
+
+
+def mutate_base(base):
+    if base in 'AT':
+        return random.choice('GC')
+    elif base in 'GC':
+        return random.choice('AT')
+    else:
+        assert False, 'bad base'
+
+
+def mutate_sequence(sequence, N=1):
+    sequence = list(sequence)
+    positions = random.sample(range(len(sequence)), N)
+
+    for i in positions:
+        sequence[i] = mutate_base(sequence[i])
+
+    return ''.join(sequence)
+
+
+def mutate_position(sequence, pos):
+    sequence = list(sequence)
+    sequence[pos] = mutate_base(sequence[pos])
+    return ''.join(sequence)
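# [Editor's aside, not part of the diff] How the pieces fit together: the
# @using_ksize decorator only attaches the `_ksize` attribute, and the
# pytest_generate_tests() hook added in tests/conftest.py earlier in this PR
# reads it to parameterize the `ksize` fixture, defaulting to [21]:
#
#     @using_ksize([21, 31])
#     def test_example(ksize):
#         # collected twice, once with ksize == 21 and once with ksize == 31
#         assert ksize in (21, 31)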
+
+
+def get_random_sequence(length, ksize, exclude=None, seen=None):
+    '''Generate a random (non-looping) nucleotide sequence.
+
+    To be non-overlapping, the sequence should not include any repeated
+    length K-1 k-mers.
+
+    Args:
+        ksize (int): k-mer size to enforce non-overlap for.
+        exclude (str): If not None, add the k-mers from this sequence to the
+                       seen set.
+        seen (set): If not None, start from a copy of this set of seen
+                    (ksize-1)-mers.
+
+    Returns:
+        str: A random non-looping sequence.
+    '''
+
+    seen = set() if seen is None else seen.copy()
+
+    def add_seen(kmer):
+        seen.add(kmer)
+        seen.add(revcomp(kmer))
+
+    if exclude is not None:
+        for pos in range(0, len(exclude) - ksize):
+            add_seen(exclude[pos:pos + ksize - 1])
+
+    seq = [random.choice('ACGT') for _ in range(ksize - 1)]  # do first K-1 bases
+    add_seen(''.join(seq))
+
+    while(len(seq) < length):
+        next_base = random.choice('ACGT')
+        next_kmer = ''.join(seq[-ksize + 2:] + [next_base])
+        assert len(next_kmer) == ksize - 1
+        if (next_kmer) not in seen:
+            seq.append(next_base)
+            add_seen(next_kmer)
+        else:
+            continue
+    return ''.join(seq)
+
+
+def reads(sequence, ksize, L=100, N=100, dbg_cover=False):
+    positions = list(range(len(sequence) - L))
+    if dbg_cover is True:
+        for start in range(0, len(sequence), ksize):
+            read = sequence[start:start + L]
+            if len(read) < ksize:
+                read = sequence[-L:]
+            yield read
+            N -= 1
+            if N < 0:
+                return
+    for i in range(N):
+        start = random.choice(positions)
+        yield sequence[start:start + L]
+
+
+def kmers(sequence, K):
+    for i in range(len(sequence) - K + 1):
+        yield sequence[i:i + K]
+
+
+@using_ksize([5, 7])
+def test_kmers(ksize):
+    S = 'A' * ksize + 'T'
+    res = list(kmers(S, ksize))
+    assert res[0] == 'A' * ksize
+    assert res[-1] == ('A' * (ksize - 1)) + 'T'
+
+
+def test_mutate_sequence():
+    for _ in range(100):
+        assert 'A' not in mutate_sequence('A' * 10, 10)
+        assert 'T' not in mutate_sequence('T' * 10, 10)
+        assert 'C' not in mutate_sequence('C' * 10, 10)
+        assert 'G' not in mutate_sequence('G' * 10, 10)
+
+
+def test_mutate_position():
+    assert mutate_position('AAAA', 2) in ['AACA', 'AAGA']
+    assert mutate_position('TTTT', 2) in ['TTCT', 'TTGT']
+    assert mutate_position('CCCC', 2) in ['CCAC', 'CCTC']
+    assert mutate_position('GGGG', 2) in ['GGAG', 'GGTG']
+
+
+def test_reads(ksize):
+    contigfile = utils.get_test_data('simple-genome.fa')
+    contig = list(screed.open(contigfile))[0].sequence
+
+    for read in reads(contig, ksize):
+        assert read in contig
+
+    for read in reads(contig, ksize):
+        assert mutate_sequence(read) not in contig
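# [Editor's aside, not part of the diff] get_random_sequence() enforces
# non-overlap by banning repeated (ksize-1)-mers: two sequences can only be
# joined into one assembly path if they share a (ksize-1)-length overlap, so
# recording every (ksize-1)-mer and its reverse complement in `seen` keeps
# independently generated sequences disjoint in the de Bruijn graph.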
+
+
+'''
+# GRAPH STRUCTURE FIXTURES
+
+These fixtures emit various graph structures with their corresponding
+sequences and important nodes. They take a random sequence fixture and
+a graph fixture, then consume sequence and generate k-mers accordingly.
+Each fixture returns a factory function: call it to build the structure.
+
+We're using a bespoke but simple language to describe graph structures in the
+docstrings of these tests. It is as follows:
+
+    o: Node
+    [x:y]: Node at position in sequence
+    [x:y]+S: Node at position in sequence with extra base (where S in ACGT)
+    (Name), ([x:y] Name): Named node, named node at position
+    → : Edge
+    ~~: Tandem →o→ repeats
+'''
+
+
+@pytest.fixture(params=['simple-genome.fa'])
+def known_sequence(request):
+    fn = utils.get_test_data(request.param)
+    return list(screed.open(fn))[0].sequence
+
+
+@pytest.fixture(params=list(range(500, 1600, 500)),
+                ids=lambda val: '(L={0})'.format(val))
+def random_sequence(request, ksize):
+    global_seen = set()
+
+    def get(exclude=None):
+        sequence = get_random_sequence(request.param,
+                                       ksize,
+                                       exclude=exclude,
+                                       seen=global_seen)
+        for i in range(len(sequence)-ksize):
+            global_seen.add(sequence[i:i+ksize-1])
+            global_seen.add(revcomp(sequence[i:i+ksize-1]))
+        return sequence
+
+    return get
+
+
+@pytest.fixture(params=[khmer.Nodegraph, khmer.Countgraph],
+                ids=['(Type=Nodegraph)', '(Type=Countgraph)'])
+def graph(request, ksize):
+
+    num_kmers = 50000
+    des_fp = 0.00001
+    args = optimal_fp(num_kmers, des_fp)
+    print('Graph Params:', args, 'K =', ksize)
+
+    return request.param(ksize, args.htable_size, args.num_htables)
+
+
+def hdn_counts(sequence, graph):
+    '''Get the degree distribution of nodes with degree more than 2.
+    '''
+
+    hdns = {}
+    for kmer in kmers(sequence, graph.ksize()):
+        d = graph.kmer_degree(kmer)
+        if d > 2:
+            hdns[d] = hdns.get(d, 0) + 1
+
+    return hdns
+
+
+@pytest.fixture
+def linear_structure(request, graph, ksize, random_sequence):
+    '''Sets up a simple linear path graph structure.
+
+    sequence
+    [0]→o→o~~o→o→[-1]
+    '''
+    def get():
+        sequence = random_sequence()
+        graph.consume(sequence)
+
+        # Check for false positive neighbors in our graph;
+        # mark as an expected failure if any are found
+        if hdn_counts(sequence, graph):
+            request.applymarker(pytest.mark.xfail)
+
+        return graph, sequence
+    return get
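# [Editor's aside, not part of the diff] The structure fixtures now return
# factory callables instead of ready-made values, so one test can build
# several structures sharing the same graph fixture (tandem_triple_forks()
# below does exactly this with right_triple_fork_structure()):
#
#     def test_uses_factory(linear_structure):
#         graph, sequence = linear_structure()   # build on first call
#         # a second call would consume another sequence into the same graph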
+
+
+@pytest.fixture
+def right_tip_structure(request, graph, ksize, flank_coords, random_sequence):
+    '''
+    Sets up a graph structure like so:
+                                 ([S+1:S+K]+B tip)
+    sequence                   ↗
+    [0]→o→o~~o→(L)→([S:S+K] HDN)→(R)→o→o→o~~o→[-1]
+
+    Where S is the start position of the high degree node (HDN).
+    That is, it has a single branch at the Sth K-mer.
+    '''
+    def get():
+        sequence = random_sequence()
+        S = flank_coords
+        if S < 0:
+            S = len(sequence) + S
+        # the HDN
+        HDN = Kmer(sequence[S:S + ksize], pos=S)
+        # left of the HDN
+        L = Kmer(sequence[S - 1:S - 1 + ksize], pos=S - 1)
+        # right of the HDN
+        R = Kmer(sequence[S + 1:S + 1 + ksize], pos=S + 1)
+        # the branch kmer
+        tip = Kmer(mutate_position(R, -1),
+                   pos=R.pos)
+
+        graph.consume(sequence)
+        graph.count(tip)
+
+        # Check for false positive neighbors and mark as expected failure if found
+        if hdn_counts(sequence, graph) != {3: 1}:
+            request.applymarker(pytest.mark.xfail)
+
+        return graph, sequence, L, HDN, R, tip
+    return get
+
+
+@pytest.fixture
+def right_double_fork_structure(request, ksize, flank_coords,
+                                linear_structure, random_sequence):
+    '''
+    Sets up a graph structure like so:
+                                               branch
+                                 ([S+1:S+K]+B)→o~~o→o
+    core_sequence              ↗
+    [0]→o→o~~o→(L)→([S:S+K] HDN)→(R)→o→o→o~~o→[-1]
+
+    Where S is the start position of the high degree node (HDN)
+    and B is the mutated base starting the branch.
+    '''
+
+    def get():
+        graph, core_sequence = linear_structure()
+        print('\nCore Len:', len(core_sequence))
+        branch_sequence = random_sequence(exclude=core_sequence)
+        print('Branch len:', len(branch_sequence))
+
+        # start position of the HDN
+        S = flank_coords
+        if S < 0:
+            S = len(core_sequence) + S
+        # the HDN
+        HDN = Kmer(core_sequence[S:S + ksize], pos=S)
+        # left of the HDN
+        L = Kmer(core_sequence[S - 1:S - 1 + ksize], pos=S - 1)
+        # right of the HDN
+        R = Kmer(core_sequence[S + 1:S + 1 + ksize], pos=S + 1)
+        # the branch sequence, mutated at position S+1
+        branch_start = core_sequence[:R.pos] + mutate_position(R, -1)
+        branch_sequence = branch_start + branch_sequence
+
+        graph.consume(core_sequence)
+        graph.consume(branch_sequence)
+
+        # Check for false positive neighbors and mark as expected failure if found
+        core_hdns = hdn_counts(core_sequence, graph)
+        branch_hdns = hdn_counts(branch_sequence, graph)
+
+        # the core and branch sequences should each have exactly
+        # ONE node of degree 3 (HDN)
+        if core_hdns != {3: 1} or branch_hdns != {3: 1}:
+            print(core_hdns, branch_hdns)
+            request.applymarker(pytest.mark.xfail)
+
+        return graph, core_sequence, L, HDN, R, branch_sequence
+    return get
+
+
+@pytest.fixture
+def right_triple_fork_structure(request, right_double_fork_structure,
+                                random_sequence, ksize):
+    '''
+    Sets up a graph structure like so:
+
+                                       top_branch
+                                ([:S+1]+B)→o~~o→o
+    core_sequence              ↗
+    [0]→o→o~~o→(L)→([S:S+K] HDN)→(R)→o→o→o~~o→[-1]
+                               ↘
+                                ([:S+1]+B)→o~~o→o
+                                     bottom_branch
+
+    Where S is the start position of the high degree node (HDN).
+    '''
+
+    def get():
+        graph, core_sequence, L, HDN, R, top_sequence = \
+            right_double_fork_structure()
+        bottom_branch = random_sequence(exclude=core_sequence + top_sequence)
+        print(len(core_sequence), len(top_sequence), len(bottom_branch))
+
+        # the branch sequence, mutated at position S+1
+        # choose a base not already represented at that position
+        bases = {'A', 'C', 'G', 'T'}
+        mutated = random.choice(list(bases - {R[-1], top_sequence[R.pos + ksize - 1]}))
+
+        bottom_sequence = core_sequence[:HDN.pos + ksize] + mutated + bottom_branch
+
+        graph.consume(bottom_sequence)
+
+        # Check for false positive neighbors and mark as expected failure if found
+        core_hdns = hdn_counts(core_sequence, graph)
+        top_hdns = hdn_counts(top_sequence, graph)
+        bottom_hdns = hdn_counts(bottom_sequence, graph)
+
+        # the core, top, and bottom sequences should each have exactly
+        # ONE node of degree 4 (HDN)
+        if not (core_hdns == top_hdns == bottom_hdns == {4: 1}):
+            print(core_hdns, top_hdns, bottom_hdns)
+            request.applymarker(pytest.mark.xfail)
+
+        return graph, core_sequence, L, HDN, R, top_sequence, bottom_sequence
+    return get
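# [Editor's aside, not part of the diff] Note the recurring guard in these
# fixtures: Nodegraph/Countgraph are probabilistic (Bloom-filter based), so a
# random sequence can occasionally pick up false-positive neighbors that
# corrupt the intended structure. Rather than failing spuriously, each
# factory validates the realized degree distribution with hdn_counts() and
# marks the test as an expected failure when it does not match:
#
#     if hdn_counts(sequence, graph) != {3: 1}:
#         request.applymarker(pytest.mark.xfail)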
+
+
+@pytest.fixture
+def left_tip_structure(request, graph, ksize, flank_coords, random_sequence):
+    '''
+    Sets up a graph structure like so:
+
+    branch
+    (B+[S:S+K-1] tip)
+                     ↘                    sequence
+        [0]→o~~o→(L)→([S:S+K] HDN)→(R)→o→o~~o→[-1]
+
+    Where S is the start position of the HDN.
+    '''
+    def get():
+        sequence = random_sequence()
+        S = flank_coords
+        if S < 0:
+            S = len(sequence) + S
+        tip = Kmer(mutate_position(sequence[S - 1:S - 1 + ksize], 0),
+                   pos=S - 1 + ksize)
+        HDN = Kmer(sequence[S:S + ksize], pos=S)
+        L = Kmer(sequence[S - 1:S - 1 + ksize], pos=S - 1)
+        R = Kmer(sequence[S + 1:S + 1 + ksize], pos=S + 1)
+
+        graph.consume(sequence)
+        graph.count(tip)
+
+        # Check for false positive neighbors and mark as expected failure if found
+        if hdn_counts(sequence, graph) != {3: 1}:
+            request.applymarker(pytest.mark.xfail)
+
+        return graph, sequence, L, HDN, R, tip
+    return get
+
+
+@pytest.fixture
+def left_double_fork_structure(request, linear_structure, ksize,
+                               flank_coords, random_sequence):
+    '''
+    Sets up a graph structure like so:
+
+    o→o~~o→(B+[S:S+K-1])
+                        ↘                  core_sequence
+    [0]→o→o~~o→(L)→([S:S+K] HDN)→(R)→o→o→o~~o→[-1]
+
+    Where S is the start position of the high degree node (HDN).
+    '''
+
+    def get():
+        graph, core_sequence = linear_structure()
+        branch_sequence = random_sequence(exclude=core_sequence)
+
+        # start position of the HDN
+        S = flank_coords
+        if S < 0:
+            S = len(core_sequence) + S
+        # the HDN
+        HDN = Kmer(core_sequence[S:S + ksize], pos=S)
+        # left of the HDN
+        L = Kmer(core_sequence[S - 1:S - 1 + ksize], pos=S - 1)
+        # right of the HDN
+        R = Kmer(core_sequence[S + 1:S + 1 + ksize], pos=S + 1)
+        # the branch sequence, mutated at position 0 in L,
+        # which is equivalent to the K-1 prefix of HDN prepended with a new base
+        branch_start = mutate_position(L, 0)
+        branch_sequence = branch_sequence + \
+            branch_start + core_sequence[L.pos + ksize:]
+
+        graph.consume(core_sequence)
+        graph.consume(branch_sequence)
+
+        # Check for false positive neighbors and mark as expected failure if found
+        core_hdns = hdn_counts(core_sequence, graph)
+        branch_hdns = hdn_counts(branch_sequence, graph)
+
+        # the core and branch sequences should each have exactly
+        # ONE node of degree 3 (HDN)
+        if not (core_hdns == branch_hdns == {3: 1}):
+            request.applymarker(pytest.mark.xfail)
+
+        return graph, core_sequence, L, HDN, R, branch_sequence
+    return get
+
+
+@pytest.fixture
+def snp_bubble_structure(request, linear_structure, ksize):
+    '''
+    Sets up a graph structure resulting from a SNP (Single Nucleotide
+    Polymorphism).
+
+                (HDN_L[1:]+SNP)→o~~o→(SNP+)
+              ↗                           ↘
+    o~~([S:S+K] HDN_L)                 ([S+K+1:S+2K+1] HDN_R)~~o
+              ↘                           ↗
+                (HDN_L[1:]+W)→o~~o~~o→(W+)
+
+    Where S is the start position of HDN directly left of the SNP (HDN_L),
+    SNP is the mutated base, and W is the wildtype (original) base.
+    Of course, W and SNP could be interchanged here, we don't actually
+    know which is which ;)
+
+    Note: the SNP is placed near the middle of the wildtype sequence,
+    so there is plenty of room on both ends.
+    '''
+
+    def get():
+        graph, wildtype_sequence = linear_structure()
+        S = int(len(wildtype_sequence) / 2)
+        snp_sequence = mutate_position(wildtype_sequence, S + ksize)
+        HDN_L = Kmer(wildtype_sequence[S:S + ksize], pos=S)
+        HDN_R = Kmer(wildtype_sequence[S + ksize + 1:S + 2 * ksize + 1], pos=S +
+                     ksize + 1)
+
+        graph.consume(wildtype_sequence)
+        graph.consume(snp_sequence)
+
+        # Check for false positive neighbors and mark as expected failure if found
+        w_hdns = hdn_counts(wildtype_sequence, graph)
+        snp_hdns = hdn_counts(snp_sequence, graph)
+        if not (w_hdns == snp_hdns == {3: 2}):
+            print(w_hdns, snp_hdns)
+            print(HDN_L, HDN_R)
+            print(wildtype_sequence[HDN_L.pos + ksize + 1])
+            print(snp_sequence[HDN_L.pos + ksize + 1])
+            request.applymarker(pytest.mark.xfail)
+
+        return graph, wildtype_sequence, snp_sequence, HDN_L, HDN_R
+    return get
+
+
+@pytest.fixture
+def tandem_triple_forks(request, right_triple_fork_structure,
+                        random_sequence, ksize, flank_coords):
+
+    def get():
+        rtfs = right_triple_fork_structure()
+        graph, core, L, HDN, R, top_l, bottom_l = rtfs
+        S_l = flank_coords
+        if S_l < 0:
+            S_l = len(core) + S_l
+        S_r = S_l + 1
+
+        # top sequence for new HDN
+        top_r = random_sequence()
+        new_HDN = R
+        new_R = Kmer(core[S_r + 1:S_r + 1 + ksize], pos=S_r+1)
+        top_r_start = core[:new_R.pos] + mutate_position(new_R, -1)
+        top_r = top_r_start + top_r
+
+        graph.consume(top_r)
+
+        # now the bottom sequence for new HDN
+        bases = {'A', 'C', 'G', 'T'}
+        mutated = random.choice(list(bases - {new_R[-1], top_r[new_R.pos + ksize - 1]}))
+        bottom_r = random_sequence()
+        bottom_r = core[:new_HDN.pos + ksize] + mutated + bottom_r
+
+        graph.consume(bottom_r)
+
+        exp_2_hdns = [hdn_counts(s, graph) for s in (top_r, bottom_r, core)]
+        exp_1_hdns = [hdn_counts(s, graph) for s in (top_l, bottom_l)]
+
+        if not all(map(lambda c: c == {4: 2}, exp_2_hdns)) or \
+           not all(map(lambda c: c == {4: 1}, exp_1_hdns)):
+
+            print(exp_2_hdns, exp_1_hdns)
+            request.applymarker(pytest.mark.xfail)
+
+        return graph, core, L, HDN, new_HDN, new_R, top_l, bottom_l, top_r, bottom_r
+
+    return get
+
+
+@pytest.fixture(params=[2, 3, 4, 5, 6, 7, 8])
+def tandem_repeat_structure(request, linear_structure):
+
+    def get():
+        graph, sequence = linear_structure()
+
+        tandem_repeats = sequence * request.param
+        graph.consume(tandem_repeats)
+
+        if hdn_counts(tandem_repeats, graph):
+            request.applymarker(pytest.mark.xfail)
+
+        return graph, sequence, tandem_repeats
+    return get
+
+
+@pytest.fixture
+def circular_linear_structure(request, linear_structure):
+    def get():
+        graph, sequence = linear_structure()
+
+        sequence += sequence
+
+        if hdn_counts(sequence, graph):
+            request.applymarker(pytest.mark.xfail)
+
+        return graph, sequence
+    return get
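# [Editor's aside, not part of the diff] The test_assembly.py changes below
# exercise the new CompactingAssembler, which behaves like LinearAssembler on
# unbranched paths but stops at high-degree nodes instead of walking through
# them. A sketch of the expected contract, mirroring
# TestCompactingAssembler.test_beginning_to_branch_right:
#
#     from khmer._oxli.assembly import CompactingAssembler
#     graph, contig, L, HDN, R, tip = right_tip_structure()
#     asm = CompactingAssembler(graph)
#     path = asm.assemble(contig[0:ksize])
#     assert len(path) == HDN.pos + ksize   # halts at the branch point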
-from khmer._oxli.assembly import LinearAssembler
+from khmer._oxli.assembly import LinearAssembler, CompactingAssembler

 import pytest
 import screed

-from .graph_features import *
-from .graph_features import K
+from .graph_structure_fixtures import *


 def teardown():
     utils.cleanup()


-@pytest.mark.parametrize("assembler", [LinearAssembler])
+@pytest.mark.parametrize("assembler", [LinearAssembler, CompactingAssembler])
 class TestNonBranching:

-    def test_all_start_positions(self, linear_structure, assembler):
+    def test_all_start_positions(self, ksize, linear_structure, assembler):
         # assemble entire contig, starting from wherever
-        graph, contig = linear_structure
+        graph, contig = linear_structure()
         asm = assembler(graph)

         for start in range(0, len(contig), 150):
-            path = asm.assemble(contig[start:start + K])
+            path = asm.assemble(contig[start:start + ksize])
             assert utils._equals_rc(path, contig), start

-    def test_all_left_to_beginning(self, linear_structure, assembler):
+    def test_all_left_to_beginning(self, ksize, linear_structure, assembler):
         # assemble directed left
-        graph, contig = linear_structure
+        graph, contig = linear_structure()
         asm = assembler(graph)

         for start in range(0, len(contig), 150):
-            path = asm.assemble_left(contig[start:start + K])
+            path = asm.assemble_left(contig[start:start + ksize])
             print(path, ', ', contig[:start])
-            assert utils._equals_rc(path, contig[:start + K]), start
+            assert utils._equals_rc(path, contig[:start + ksize]), start

-    def test_all_right_to_end(self, linear_structure, assembler):
+    def test_all_right_to_end(self, ksize, linear_structure, assembler):
         # assemble directed right
-        graph, contig = linear_structure
+        graph, contig = linear_structure()
         asm = assembler(graph)

         for start in range(0, len(contig), 150):
-            path = asm.assemble_right(contig[start:start + K])
+            path = asm.assemble_right(contig[start:start + ksize])
             print(path, ', ', contig[:start])
             assert utils._equals_rc(path, contig[start:]), start

-    def test_circular(self, circular_linear_structure, assembler):
+    def test_circular(self, ksize, circular_linear_structure, assembler):

-        graph, contig = circular_linear_structure
+        graph, contig = circular_linear_structure()
         asm = assembler(graph)

-        path = asm.assemble_right(contig[:K])
+        path = asm.assemble_right(contig[:ksize])
         print(path, ',', contig)
         assert utils._equals_rc(path, contig[:len(path)])

-    def test_hash_as_seed(self, linear_structure, assembler):
-        graph, contig = linear_structure
+    def test_hash_as_seed(self, ksize, linear_structure, assembler):
+        graph, contig = linear_structure()
         asm = assembler(graph)

-        left = graph.hash(contig[:K])
+        left = graph.hash(contig[:ksize])
         assert utils._equals_rc(asm.assemble(left), contig)


+class TestCompactingAssembler:
+
+    def test_beginning_to_branch_right(self, ksize, right_tip_structure):
+        # assemble from beginning of contig, up until branch point
+        graph, contig, L, HDN, R, tip = right_tip_structure()
+        asm = CompactingAssembler(graph)
+        path = asm.assemble(contig[0:ksize])
+
+        assert len(path) == HDN.pos + ksize
+        assert utils._equals_rc(path, contig[:len(path)])
+
+    def test_end_to_branch_right(self, ksize, right_tip_structure):
+        # in the LinearAssembler, this would continue all the way
+        # to the beginning. The CompactingAssembler does an extra
+        # check of the node degree in the reverse direction.
+        graph, contig, L, HDN, R, tip = right_tip_structure()
+        asm = CompactingAssembler(graph)
+        # (assertion sketch, assuming the reverse-degree check stops the
+        # walk at the branch: the result should then be a proper suffix
+        # of the contig rather than the whole thing)
+        path = asm.assemble(contig[-ksize:])
+
+        assert len(path) < len(contig)
+        assert utils._equals_rc(path, contig[len(contig) - len(path):])
+
+
 class TestLinearAssembler_RightBranching:

-    def test_branch_point(self, right_tip_structure):
-        graph, contig, L, HDN, R, tip = right_tip_structure
+    def test_branch_point(self, ksize, right_tip_structure):
+        graph, contig, L, HDN, R, tip = right_tip_structure()
         assert graph.kmer_degree(HDN) == 3

-    def test_beginning_to_branch(self, right_tip_structure):
+    def test_beginning_to_branch(self, ksize, right_tip_structure):
         # assemble from beginning of contig, up until branch point
-        graph, contig, L, HDN, R, tip = right_tip_structure
+        graph, contig, L, HDN, R, tip = right_tip_structure()
         asm = khmer.LinearAssembler(graph)
-        path = asm.assemble(contig[0:K])
+        path = asm.assemble(contig[0:ksize])

-        assert len(path) == HDN.pos + K
+        assert len(path) == HDN.pos + ksize
         assert utils._equals_rc(path, contig[:len(path)])

-    def test_assemble_takes_hash(self, right_tip_structure):
+    def test_assemble_takes_hash(self, ksize, right_tip_structure):
         # assemble from beginning of contig, up until branch point
-        graph, contig, L, HDN, R, tip = right_tip_structure
+        graph, contig, L, HDN, R, tip = right_tip_structure()
         asm = khmer.LinearAssembler(graph)
-        path = asm.assemble(graph.hash(contig[0:K]))
+        path = asm.assemble(graph.hash(contig[0:ksize]))

-        assert len(path) == HDN.pos + K
+        assert len(path) == HDN.pos + ksize
         assert utils._equals_rc(path, contig[:len(path)])

-    def test_beginning_to_branch_revcomp(self, right_tip_structure):
+    def test_beginning_to_branch_revcomp(self, ksize, right_tip_structure):
         # assemble from beginning of contig, up until branch point
         # starting from rev comp
-        graph, contig, L, HDN, R, tip = right_tip_structure
+        graph, contig, L, HDN, R, tip = right_tip_structure()
         asm = khmer.LinearAssembler(graph)
-        path = asm.assemble(revcomp(contig[0:K]))
+        path = asm.assemble(revcomp(contig[0:ksize]))

-        assert len(path) == HDN.pos + K
+        assert len(path) == HDN.pos + ksize
         assert utils._equals_rc(path, contig[:len(path)])

-    def test_left_of_branch_to_beginning(self, right_tip_structure):
+    def test_left_of_branch_to_beginning(self, ksize, right_tip_structure):
         # start from HDN (left of branch)
-        graph, contig, L, HDN, R, tip = right_tip_structure
+        graph, contig, L, HDN, R, tip = right_tip_structure()
         asm = khmer.LinearAssembler(graph)
         path = asm.assemble(L)

-        assert len(path) == HDN.pos + K
+        assert len(path) == HDN.pos + ksize
         assert utils._equals_rc(path, contig[:len(path)])

-    def test_left_of_branch_to_beginning_revcomp(self, right_tip_structure):
+    def test_left_of_branch_to_beginning_revcomp(self, ksize, right_tip_structure):
         # start from revcomp of HDN (left of branch)
-        graph, contig, L, HDN, R, tip = right_tip_structure
+        graph, contig, L, HDN, R, tip = right_tip_structure()
         asm = khmer.LinearAssembler(graph)
         path = asm.assemble(revcomp(L))

-        assert len(path) == HDN.pos + K
+        assert len(path) == HDN.pos + ksize
         assert utils._equals_rc(path, contig[:len(path)])

-    def test_right_of_branch_outwards_to_ends(self, right_tip_structure):
+    def test_right_of_branch_outwards_to_ends(self, ksize, right_tip_structure):
         # assemble from right of branch point (at R)
         # Should get the *entire* original contig, as the assembler
         # will move left relative to the branch, and not consider it
         # as a high degree node
-        graph, contig, L, HDN, R, tip = right_tip_structure
+        graph, contig, L, HDN, R, tip = right_tip_structure()
         asm = khmer.LinearAssembler(graph)
         path = asm.assemble(R)

         assert len(path) == len(contig)
         assert utils._equals_rc(path, contig)

-    def 
test_end_to_beginning(self, right_tip_structure): + def test_end_to_beginning(self, ksize, right_tip_structure): # should have exact same behavior as right_of_branch_outwards - graph, contig, L, HDN, R, tip = right_tip_structure + graph, contig, L, HDN, R, tip = right_tip_structure() asm = khmer.LinearAssembler(graph) - path = asm.assemble(contig[-K:]) + path = asm.assemble(contig[-ksize:]) assert len(path) == len(contig) assert utils._equals_rc(path, contig) @@ -184,36 +198,36 @@ def test_end_to_beginning(self, right_tip_structure): class TestLinearAssembler_LeftBranching: - def test_branch_point(self, left_tip_structure): - graph, contig, L, HDN, R, tip = left_tip_structure + def test_branch_point(self, ksize, left_tip_structure): + graph, contig, L, HDN, R, tip = left_tip_structure() assert graph.kmer_degree(HDN) == 3 - def test_end_to_branch(self, left_tip_structure): + def test_end_to_branch(self, ksize, left_tip_structure): # assemble from end until branch point # should include HDN - graph, contig, L, HDN, R, tip = left_tip_structure + graph, contig, L, HDN, R, tip = left_tip_structure() asm = khmer.LinearAssembler(graph) - path = asm.assemble(contig[-K:]) + path = asm.assemble(contig[-ksize:]) assert len(path) == len(contig) - HDN.pos assert utils._equals_rc(path, contig[HDN.pos:]) - def test_branch_to_end(self, left_tip_structure): + def test_branch_to_end(self, ksize, left_tip_structure): # assemble from branch point until end - graph, contig, L, HDN, R, tip = left_tip_structure + graph, contig, L, HDN, R, tip = left_tip_structure() asm = khmer.LinearAssembler(graph) path = asm.assemble(HDN) assert len(path) == len(contig) - HDN.pos assert utils._equals_rc(path, contig[HDN.pos:]) - def test_from_branch_to_ends_with_stopbf(self, left_tip_structure): + def test_from_branch_to_ends_with_stopbf(self, ksize, left_tip_structure): # block the tip with the stop_filter. should return a full length # contig. - graph, contig, L, HDN, R, tip = left_tip_structure + graph, contig, L, HDN, R, tip = left_tip_structure() - stop_filter = khmer.Nodegraph(K, 1e5, 4) + stop_filter = khmer.Nodegraph(ksize, 1e5, 4) stop_filter.count(tip) asm = khmer.LinearAssembler(graph, stop_filter=stop_filter) @@ -223,12 +237,12 @@ def test_from_branch_to_ends_with_stopbf(self, left_tip_structure): assert len(path) == len(contig) assert utils._equals_rc(path, contig) - def test_from_branch_to_ends_with_stopbf_revcomp(self, left_tip_structure): + def test_from_branch_to_ends_with_stopbf_revcomp(self, ksize, left_tip_structure): # block the tip with the stop_filter. should return a full length # contig. 
- graph, contig, L, HDN, R, tip = left_tip_structure + graph, contig, L, HDN, R, tip = left_tip_structure() - stop_filter = khmer.Nodegraph(K, 1e5, 4) + stop_filter = khmer.Nodegraph(ksize, 1e5, 4) stop_filter.count(tip) asm = khmer.LinearAssembler(graph, stop_filter=stop_filter) @@ -237,56 +251,56 @@ def test_from_branch_to_ends_with_stopbf_revcomp(self, left_tip_structure): assert len(path) == len(contig) assert utils._equals_rc(path, contig) - def test_end_thru_tip_with_stopbf(self, left_tip_structure): + def test_end_thru_tip_with_stopbf(self, ksize, left_tip_structure): # assemble up to branch point, and include introduced branch b/c # of stop bf - graph, contig, L, HDN, R, tip = left_tip_structure + graph, contig, L, HDN, R, tip = left_tip_structure() - stop_filter = khmer.Nodegraph(K, 1e5, 4) + stop_filter = khmer.Nodegraph(ksize, 1e5, 4) stop_filter.count(L) # ...and block original path asm = khmer.LinearAssembler(graph, stop_filter=stop_filter) - path = asm.assemble(contig[-K:]) + path = asm.assemble(contig[-ksize:]) assert len(path) == len(contig) - HDN.pos + 1 # should be the tip k-kmer, plus the last base of the HDN thru # the end of the contig - assert utils._equals_rc(path, tip + contig[HDN.pos + K - 1:]) + assert utils._equals_rc(path, tip + contig[HDN.pos + ksize - 1:]) - def test_single_node_flanked_by_hdns(self, left_tip_structure): + def test_single_node_flanked_by_hdns(self, ksize, left_tip_structure): # assemble single node flanked by high-degree nodes # we'll copy the main nodegraph before mutating it - graph, contig, L, HDN, R, tip = left_tip_structure + graph, contig, L, HDN, R, tip = left_tip_structure() asm = khmer.LinearAssembler(graph) - graph.consume(mutate_position(contig, HDN.pos + K)) + graph.consume(mutate_position(contig, HDN.pos + ksize)) path = asm.assemble(HDN) - assert len(path) == K + assert len(path) == ksize assert utils._equals_rc(path, HDN) class TestLabeledAssembler: - def test_hash_as_seed(self, linear_structure): - graph, contig = linear_structure + def test_hash_as_seed(self, ksize, linear_structure): + graph, contig = linear_structure() lh = khmer.GraphLabels(graph) asm = khmer.SimpleLabeledAssembler(lh) - left = graph.hash(contig[:K]) + left = graph.hash(contig[:ksize]) assert utils._equals_rc(asm.assemble(left).pop(), contig) - def test_beginning_to_end_across_tip(self, right_tip_structure): + def test_beginning_to_end_across_tip(self, ksize, right_tip_structure): # assemble entire contig, ignoring branch point b/c of labels - graph, contig, L, HDN, R, tip = right_tip_structure + graph, contig, L, HDN, R, tip = right_tip_structure() lh = khmer.GraphLabels(graph) asm = khmer.SimpleLabeledAssembler(lh) hdn = graph.find_high_degree_nodes(contig) # L, HDN, and R will be labeled with 1 lh.label_across_high_degree_nodes(contig, hdn, 1) - path = asm.assemble(contig[:K]) + path = asm.assemble(contig[:ksize]) assert len(path) == 1, "there should only be one path" path = path[0] # @CTB @@ -294,9 +308,9 @@ def test_beginning_to_end_across_tip(self, right_tip_structure): assert len(path) == len(contig) assert utils._equals_rc(path, contig) - def test_assemble_right_double_fork(self, right_double_fork_structure): + def test_assemble_right_double_fork(self, ksize, right_double_fork_structure): # assemble two contigs from a double forked structure - graph, contig, L, HDN, R, branch = right_double_fork_structure + graph, contig, L, HDN, R, branch = right_double_fork_structure() lh = khmer.GraphLabels(graph) asm = khmer.SimpleLabeledAssembler(lh) @@ -307,7 
+321,7 @@ def test_assemble_right_double_fork(self, right_double_fork_structure): lh.label_across_high_degree_nodes(branch, hdn, 2) print(lh.get_tag_labels(list(hdn)[0])) - paths = asm.assemble(contig[:K]) + paths = asm.assemble(contig[:ksize]) print('Path lengths', [len(x) for x in paths]) assert len(paths) == 2 @@ -315,10 +329,10 @@ def test_assemble_right_double_fork(self, right_double_fork_structure): assert any(utils._equals_rc(path, contig) for path in paths) assert any(utils._equals_rc(path, branch) for path in paths) - def test_assemble_right_triple_fork(self, right_triple_fork_structure): + def test_assemble_right_triple_fork(self, ksize, right_triple_fork_structure): # assemble three contigs from a trip fork (graph, contig, L, HDN, R, - top_sequence, bottom_sequence) = right_triple_fork_structure + top_sequence, bottom_sequence) = right_triple_fork_structure() lh = khmer.GraphLabels(graph) asm = khmer.SimpleLabeledAssembler(lh) @@ -331,7 +345,7 @@ def test_assemble_right_triple_fork(self, right_triple_fork_structure): lh.label_across_high_degree_nodes(bottom_sequence, hdn, 3) print(lh.get_tag_labels(list(hdn)[0])) - paths = asm.assemble(contig[:K]) + paths = asm.assemble(contig[:ksize]) print([len(x) for x in paths]) assert len(paths) == 3 @@ -340,14 +354,14 @@ def test_assemble_right_triple_fork(self, right_triple_fork_structure): assert any(utils._equals_rc(path, top_sequence) for path in paths) assert any(utils._equals_rc(path, bottom_sequence) for path in paths) - def test_assemble_left_double_fork(self, left_double_fork_structure): + def test_assemble_left_double_fork(self, ksize, left_double_fork_structure): # assemble entire contig + branch points b/c of labels; start from end - graph, contig, L, HDN, R, branch = left_double_fork_structure + graph, contig, L, HDN, R, branch = left_double_fork_structure() lh = khmer.GraphLabels(graph) asm = khmer.SimpleLabeledAssembler(lh) # first try without the labels - paths = asm.assemble(contig[-K:]) + paths = asm.assemble(contig[-ksize:]) assert len(paths) == 1 # without labels, should get the beginning of the HDN thru the end @@ -361,16 +375,16 @@ def test_assemble_left_double_fork(self, left_double_fork_structure): lh.label_across_high_degree_nodes(branch, hdn, 2) print(lh.get_tag_labels(list(hdn)[0])) - paths = asm.assemble(contig[-K:]) + paths = asm.assemble(contig[-ksize:]) assert len(paths) == 2 assert any(utils._equals_rc(path, contig) for path in paths) assert any(utils._equals_rc(path, branch) for path in paths) - def test_assemble_snp_bubble_single(self, snp_bubble_structure): + def test_assemble_snp_bubble_single(self, ksize, snp_bubble_structure): # assemble entire contig + one of two paths through a bubble - graph, wildtype, mutant, HDN_L, HDN_R = snp_bubble_structure + graph, wildtype, mutant, HDN_L, HDN_R = snp_bubble_structure() lh = khmer.GraphLabels(graph) asm = khmer.SimpleLabeledAssembler(lh) @@ -378,14 +392,14 @@ def test_assemble_snp_bubble_single(self, snp_bubble_structure): assert len(hdn) == 2 lh.label_across_high_degree_nodes(wildtype, hdn, 1) - paths = asm.assemble(wildtype[:K]) + paths = asm.assemble(wildtype[:ksize]) assert len(paths) == 1 assert utils._equals_rc(paths[0], wildtype) - def test_assemble_snp_bubble_both(self, snp_bubble_structure): + def test_assemble_snp_bubble_both(self, ksize, snp_bubble_structure): # assemble entire contig + both paths - graph, wildtype, mutant, HDN_L, HDN_R = snp_bubble_structure + graph, wildtype, mutant, HDN_L, HDN_R = snp_bubble_structure() lh = 
khmer.GraphLabels(graph) asm = khmer.SimpleLabeledAssembler(lh) @@ -395,23 +409,23 @@ def test_assemble_snp_bubble_both(self, snp_bubble_structure): lh.label_across_high_degree_nodes(wildtype, hdn, 1) lh.label_across_high_degree_nodes(mutant, hdn, 2) - paths = asm.assemble(wildtype[:K]) + paths = asm.assemble(wildtype[:ksize]) assert len(paths) == 2 assert any(utils._contains_rc(wildtype, path) for path in paths) assert any(utils._contains_rc(mutant, path) for path in paths) - # assert all(path[:HDN_L.pos+K][-K:] == HDN_L for path in paths) - # assert all(path[HDN_R.pos:][:K] == HDN_R for path in paths) - # assert paths[0][:HDN_L.pos+K] == paths[1][:HDN_L.pos+K] + # assert all(path[:HDN_L.pos+ksize][-ksize:] == HDN_L for path in paths) + # assert all(path[HDN_R.pos:][:ksize] == HDN_R for path in paths) + # assert paths[0][:HDN_L.pos+ksize] == paths[1][:HDN_L.pos+ksize] # assert paths[0][HDN_R.pos:] == paths[1][HDN_R.pos:] - def test_assemble_snp_bubble_stopbf(self, snp_bubble_structure): + def test_assemble_snp_bubble_stopbf(self, ksize, snp_bubble_structure): # assemble one side of bubble, blocked with stop_filter, # when labels on both branches # stop_filter should trip a filter failure, negating the label spanning - graph, wildtype, mutant, HDN_L, HDN_R = snp_bubble_structure - stop_filter = khmer.Nodegraph(K, 1e5, 4) + graph, wildtype, mutant, HDN_L, HDN_R = snp_bubble_structure() + stop_filter = khmer.Nodegraph(ksize, 1e5, 4) lh = khmer.GraphLabels(graph) asm = khmer.SimpleLabeledAssembler(lh, stop_filter=stop_filter) @@ -422,37 +436,37 @@ def test_assemble_snp_bubble_stopbf(self, snp_bubble_structure): lh.label_across_high_degree_nodes(mutant, hdn, 2) # do the labeling, but block the mutant with stop_filter - stop_filter.count(mutant[HDN_L.pos + 1:HDN_L.pos + K + 1]) - paths = asm.assemble(wildtype[:K]) + stop_filter.count(mutant[HDN_L.pos + 1:HDN_L.pos + ksize + 1]) + paths = asm.assemble(wildtype[:ksize]) assert len(paths) == 1 assert any(utils._equals_rc(path, wildtype) for path in paths) # @pytest.mark.skip(reason='destroys your computer and then the world') - def test_assemble_tandem_repeats(self, tandem_repeat_structure): + def test_assemble_tandem_repeats(self, ksize, tandem_repeat_structure): # assemble one copy of a tandem repeat - graph, repeat, tandem_repeats = tandem_repeat_structure + graph, repeat, tandem_repeats = tandem_repeat_structure() lh = khmer.GraphLabels(graph) asm = khmer.SimpleLabeledAssembler(lh) - paths = asm.assemble(repeat[:K]) + paths = asm.assemble(repeat[:ksize]) assert len(paths) == 1 - # There are K-1 k-mers spanning the junction between + # There are ksize-1 k-mers spanning the junction between # the beginning and end of the repeat - assert len(paths[0]) == len(repeat) + K - 1 + assert len(paths[0]) == len(repeat) + ksize - 1 class TestJunctionCountAssembler: - def test_beginning_to_end_across_tip(self, right_tip_structure): + def test_beginning_to_end_across_tip(self, ksize, right_tip_structure): # assemble entire contig, ignoring branch point b/c of labels - graph, contig, L, HDN, R, tip = right_tip_structure + graph, contig, L, HDN, R, tip = right_tip_structure() asm = khmer.JunctionCountAssembler(graph) asm.consume(contig) asm.consume(contig) asm.consume(contig) - path = asm.assemble(contig[:K]) + path = asm.assemble(contig[:ksize]) print('P:', path[0]) print('T:', tip) print('C:', contig) diff --git a/tests/test_banding.py b/tests/test_banding.py index 3728ba0d8b..b274c57cb0 100755 --- a/tests/test_banding.py +++ b/tests/test_banding.py @@ 
-36,14 +36,16 @@ import screed import khmer from . import khmer_tst_utils as utils +from .graph_structure_fixtures import using_ksize import pytest -@pytest.mark.parametrize('ksize,memory,epsilon,numbands', [ - (21, 5e6, 1, 2), - (21, 5e6, 1, 4), - (21, 5e6, 1, 8), - (21, 5e6, 1, 16), +@using_ksize(21) +@pytest.mark.parametrize('memory,epsilon,numbands', [ + (5e6, 1, 2), + (5e6, 1, 4), + (5e6, 1, 8), + (5e6, 1, 16), ]) def test_banding_in_memory(ksize, memory, epsilon, numbands): """ @@ -82,11 +84,12 @@ def test_banding_in_memory(ksize, memory, epsilon, numbands): assert min(nonzeros) == 1 -@pytest.mark.parametrize('ksize,memory,numbands', [ - (21, 5e6, 3), - (21, 5e6, 11), - (21, 5e6, 23), - (21, 5e6, 29), +@using_ksize(21) +@pytest.mark.parametrize('memory,numbands', [ + (5e6, 3), + (5e6, 11), + (5e6, 23), + (5e6, 29), ]) def test_banding_to_disk(ksize, memory, numbands): """ diff --git a/tests/test_compact_dbg.py b/tests/test_compact_dbg.py new file mode 100644 index 0000000000..0b47eab373 --- /dev/null +++ b/tests/test_compact_dbg.py @@ -0,0 +1,294 @@ +import gc +import itertools +import random + +from khmer import reverse_complement as revcomp +from khmer import reverse_hash as revhash +from khmer import forward_hash +from . import khmer_tst_utils as utils +from .khmer_tst_utils import _equals_rc, _contains_rc +from .graph_structure_fixtures import * + +from khmer._oxli.cdbg import (StreamingCompactor, CompactNode, + CompactNodeFactory) +from khmer._oxli.hashing import Kmer as CyKmer +from khmer import Nodegraph +import pytest + + +def teardown(): + utils.cleanup() + + +def test_get_pivot_from_right(ksize, linear_structure): + graph, sequence = linear_structure() + print(sequence) + factory = CompactNodeFactory.new(ksize) + kmer = CyKmer(sequence[:ksize]) + node = factory.build_node(kmer) + print(node) + + if kmer.is_forward: + assert factory.get_pivot_from_right(node, sequence) == \ + (sequence[ksize], False) + else: + assert factory.get_pivot_from_right(node, sequence) == \ + (revcomp(sequence[ksize]), True) + + +def test_get_pivot_from_left(ksize, linear_structure): + graph, sequence = linear_structure() + print(sequence) + factory = CompactNodeFactory.new(ksize) + kmer = CyKmer(sequence[-ksize:]) + node = factory.build_node(kmer) + print(node) + + if kmer.is_forward: + assert factory.get_pivot_from_left(node, sequence) == \ + (sequence[-ksize-1], False) + else: + assert factory.get_pivot_from_left(node, sequence) == \ + (revcomp(sequence[-ksize-1]), True) + + +def compare_tip_with_cdbg(rts, compactor): + graph, contig, L, HDN, R, tip = rts + + nodes = list(compactor.sequence_nodes(contig)) + assert len(nodes) == 1 + + node = nodes[0] + assert _equals_rc(node.sequence, HDN) + + in_edges = list(node.in_edges()) + out_edges = list(node.out_edges()) + + if len(in_edges) == 1: + _, in_edge = in_edges[0] + assert len(out_edges) == 2 + (_, edge_contig), (_, edge_tip) = out_edges + if len(edge_tip) > len(edge_contig): + edge_contig, edge_tip = edge_tip, edge_contig + #assert _equals_rc(contig, in_edge.sequence[:-K+1] + node.sequence + + # edge_contig.sequence[K-1:]) + else: + _, out_edge = out_edges[0] + assert len(in_edges) == 2 + (_, edge_contig), (_, edge_tip) = in_edges + if len(edge_tip) > len(edge_contig): + edge_contig, edge_tip = edge_tip, edge_contig + #assert _equals_rc(contig, edge_contig.sequence[:-K+1] + node.sequence + + # out_edge.sequence[K-1:]) + + +@using_ksize([21,25,31]) +def test_compact_tip(ksize, right_tip_structure): + right_tip_structure = right_tip_structure() + 
graph, contig, L, HDN, R, tip = right_tip_structure + + compactor = StreamingCompactor(graph) + print(compactor.update(contig), 'cDBG updates...') + compactor.report() + + compare_tip_with_cdbg(right_tip_structure, compactor) + + assert compactor.n_nodes == 1 + assert compactor.n_edges == 3 + + for node in compactor.sequence_nodes(contig): + print(node) + print('in edges:') + for base, edge in node.in_edges(): + print(base, edge) + + print('out edges:') + for base, edge in node.out_edges(): + print(base, edge) + + print("Contig FWD:", contig, len(contig)) + print("Contig RC:", revcomp(contig)) + print("HDN: ", repr(HDN)) + print("Tip FW:", tip, len(tip)) + print("Tip RC:", revcomp(tip)) + print("R FW:", R) + print("R RC:", revcomp(R)) + + +def test_compact_tip_double_update(right_tip_structure): + right_tip_structure = right_tip_structure() + graph, contig, L, HDN, R, tip = right_tip_structure + + compactor = StreamingCompactor(graph) + print(compactor.update(contig), 'cDBG updates...') + compactor.report() + print(compactor.update(contig), 'cDBG updates...') + compactor.report() + + compare_tip_with_cdbg(right_tip_structure, compactor) + assert compactor.n_nodes == 1 + assert compactor.n_edges == 3 + + +def test_compact_tip_revcomp_update(right_tip_structure): + right_tip_structure = right_tip_structure() + graph, contig, L, HDN, R, tip = right_tip_structure + + compactor = StreamingCompactor(graph) + print(compactor.update(contig), 'cDBG updates...') + compactor.report() + + print(compactor.update(revcomp(contig)), 'cDBG updates...') + compactor.report() + + compare_tip_with_cdbg(right_tip_structure, compactor) + assert compactor.n_nodes == 1 + assert compactor.n_edges == 3 + + +def test_compact_two_tip_islands(left_tip_structure, right_tip_structure): + right_tip_structure = right_tip_structure() + graph, contig_r, L_r, HDN_r, R_r, tip_r = right_tip_structure + left_tip_structure = left_tip_structure() + _, contig_l, L_l, HDN_l, R_l, tip_l = left_tip_structure + + compactor = StreamingCompactor(graph) + print(compactor.update(contig_l), 'cDBG updates from left') + compactor.report() + compare_tip_with_cdbg(left_tip_structure, compactor) + assert compactor.n_nodes == 1 + assert compactor.n_edges == 3 + + print(compactor.update(contig_r), 'cDBG updates from right') + compactor.report() + compare_tip_with_cdbg(right_tip_structure, compactor) + assert compactor.n_nodes == 2 + assert compactor.n_edges == 6 + + +def test_compact_tip_x_merge(left_tip_structure, right_tip_structure): + right_tip_structure = right_tip_structure() + graph, contig_r, L_r, HDN_r, R_r, tip_r = right_tip_structure + left_tip_structure = left_tip_structure() + _, contig_l, L_l, HDN_l, R_l, tip_l = left_tip_structure + + contig_merge = contig_l + contig_r + graph.reset() + + compactor = StreamingCompactor(graph) + compactor.consume(str(tip_l)) + print(compactor.consume_and_update(contig_l), + 'cDBG updates from left') + compactor.report() + compare_tip_with_cdbg(left_tip_structure, compactor) + assert compactor.n_nodes == 1 + assert compactor.n_edges == 3 + + compactor.consume(str(tip_r)) + print(compactor.consume_and_update(contig_merge), + 'cDBG updates from right merge') + compactor.report() + compare_tip_with_cdbg(right_tip_structure, compactor) + assert compactor.n_nodes == 2 + assert compactor.n_edges == 5 + + +@using_ksize([21, 31]) +def test_compact_triple_fork(right_triple_fork_structure): + right_triple_fork_structure = right_triple_fork_structure() + graph, core, L, HDN, R, top, bottom = 
right_triple_fork_structure + + compactor = StreamingCompactor(graph) + compactor.update(core) + compactor.report() + + assert compactor.n_nodes == 1 + assert compactor.n_edges == 4 + + +@pytest.mark.parametrize('random_sequence', [100, 200], indirect=True) +def test_compact_trivial_edge(tandem_triple_forks, ksize): + ttf = tandem_triple_forks() + graph, core, L, HDN_l, HDN_r, R, top_l, bottom_l, top_r, bottom_r = ttf + + print('Core:', core[HDN_l.pos:], '\nHDN_l:', HDN_l, '\nHDN_r:', HDN_r, + '\ntop_l:', top_l[HDN_l.pos:HDN_l.pos+2*ksize], + '\nbottom_l:', bottom_l[HDN_l.pos:HDN_l.pos+2*ksize], + '\ntop_r:', top_r[HDN_r.pos:HDN_r.pos+2*ksize], + '\nbottom_r:', bottom_r[HDN_r.pos:HDN_r.pos+2*ksize]) + br = '=' * 20 + graph.reset() + compactor = StreamingCompactor(graph) + print(br, 'ADD CORE', br) + compactor.consume_and_update(core) + assert compactor.n_nodes == 0 + + print(br, 'ADD top_l', br) + compactor.consume_and_update(top_l) + assert compactor.n_nodes == 1 + assert compactor.n_edges == 3 + + print(br, 'ADD bottom_l', br) + compactor.consume_and_update(bottom_l) + assert compactor.n_nodes == 1 + assert compactor.n_edges == 4 + + print(br, 'ADD top_r', br) + compactor.consume_and_update(top_r) + assert compactor.n_nodes == 2 + assert compactor.n_edges == 6 + + print(br, 'ADD bottom_r', br) + compactor.consume_and_update(bottom_r) + assert compactor.n_nodes == 2 + assert compactor.n_edges == 7 + + nodes = list(compactor.sequence_nodes(core)) + node_1, node_2 = nodes + trivial, node_2_out = list(node_2.in_edges()), list(node_2.out_edges()) + if len(trivial) != 1: + trivial, node_2_out = node_2_out, trivial + _, trivial = trivial[0] + + assert trivial.edge_type == 'TRIVIAL' + assert len(trivial) == ksize + 1 + + assert HDN_l in trivial.sequence + assert HDN_r in trivial.sequence + assert node_1.degree == 4 + assert node_2.degree == 4 + + +def test_compact_tip_linear_merge(left_tip_structure, right_tip_structure, + ksize): + right_tip_structure = right_tip_structure() + graph, contig_r, L_r, HDN_r, R_r, tip_r = right_tip_structure + left_tip_structure = left_tip_structure() + _, contig_l, L_l, HDN_l, R_l, tip_l = left_tip_structure + + contig_merge = contig_l[-ksize:] + contig_r[0:ksize] + graph.reset() + + compactor = StreamingCompactor(graph) + + compactor.consume(str(tip_l)) + print(compactor.consume_and_update(contig_l), + 'cDBG updates from left') + compactor.report() + compare_tip_with_cdbg(left_tip_structure, compactor) + assert compactor.n_nodes == 1 + assert compactor.n_edges == 3 + + compactor.consume(str(tip_r)) + print(compactor.consume_and_update(contig_r), + 'cDBG updates from right') + compactor.report() + compare_tip_with_cdbg(right_tip_structure, compactor) + assert compactor.n_nodes == 2 + assert compactor.n_edges == 6 + + print(compactor.consume_and_update(contig_merge), + 'cDBG updates from linear merge') + + assert compactor.n_nodes == 2 + assert compactor.n_edges == 5 diff --git a/tests/test_countgraph.py b/tests/test_countgraph.py index 23134def1a..b0a12f2444 100755 --- a/tests/test_countgraph.py +++ b/tests/test_countgraph.py @@ -40,7 +40,7 @@ import os import khmer -from khmer import Countgraph, SmallCountgraph, Nodegraph +from khmer import Countgraph, SmallCountgraph, Nodegraph, FastxParser from . 
import khmer_tst_utils as utils
 from khmer import ReadParser
 import screed
@@ -114,6 +114,38 @@ def test_revhash_1():
     assert hi.reverse_hash(hashval) == kmer


+def test_extract_countgraph_info_badfile():
+    try:
+        Countgraph.extract_info(
+            utils.get_test_data('test-abund-read-2.fa'))
+        assert 0, 'this should fail'
+    except ValueError:
+        pass
+
+
+def test_extract_countgraph_info():
+    fn = utils.get_temp_filename('test_extract_counting.ct')
+    for size in [1e6, 2e6, 5e6, 1e7]:
+        ht = khmer.Countgraph(25, size, 4)
+        ht.save(fn)
+
+        try:
+            info = Countgraph.extract_info(fn)
+        except ValueError as err:
+            assert 0, 'Should not throw a ValueError: ' + str(err)
+        ksize, n_tables, table_size, _, _, _, _ = info
+        print(ksize, table_size, n_tables)
+
+        assert ksize == 25
+        assert table_size == size
+        assert n_tables == 4
+
+    try:
+        os.remove(fn)
+    except OSError as err:
+        assert 0, '...failed to remove ' + fn + str(err)
+
+
 class Test_Countgraph(object):

     def setup(self):
@@ -1194,10 +1226,10 @@ def test_consume_absentfasta():
     except TypeError as err:
         print(str(err))
     try:
-        readparser = ReadParser(utils.get_test_data('empty-file'))
-        countgraph.consume_seqfile(readparser)
+        parser = FastxParser(utils.get_test_data('empty-file'))
+        countgraph.consume_seqfile(parser)
         assert 0, "this should fail"
-    except OSError as err:
+    except RuntimeError as err:
         print(str(err))
     except ValueError as err:
         print(str(err))
diff --git a/tests/test_counttable.py b/tests/test_counttable.py
index 1873668a35..2b2dea3a8b 100755
--- a/tests/test_counttable.py
+++ b/tests/test_counttable.py
@@ -38,6 +38,7 @@
 import pytest

 from . import khmer_tst_utils as utils
+from .graph_structure_fixtures import using_ksize


 def test_get_kmer_hashes():
@@ -61,13 +62,14 @@ def test_kmer_revcom_hash(kmer):
     assert a.hash(kmer) == a.hash(khmer.reverse_complement(kmer))


-@pytest.mark.parametrize('ksize,sketch_allocator', [
-    (21, khmer.Nodetable),
-    (21, khmer.Counttable),
-    (21, khmer.SmallCounttable),
-    (49, khmer.Nodetable),
-    (49, khmer.Counttable),
-    (49, khmer.SmallCounttable),
+@using_ksize([21, 49])
+@pytest.mark.parametrize('sketch_allocator', [
+    (khmer.Nodetable),
+    (khmer.Counttable),
+    (khmer.SmallCounttable),
+    (khmer.Nodetable),
+    (khmer.Counttable),
+    (khmer.SmallCounttable),
 ])
 def test_reverse_hash(ksize, sketch_allocator):
     multiplier = int(ksize / len('GATTACA'))
diff --git a/tests/test_cython_parsing.py b/tests/test_cython_parsing.py
index 710ae711e2..5f16dfbe1f 100755
--- a/tests/test_cython_parsing.py
+++ b/tests/test_cython_parsing.py
@@ -4,9 +4,10 @@
 import random

 import khmer
-from khmer._oxli.parsing import Sequence, FastxParser, SanitizedFastxParser
-from khmer._oxli.parsing import BrokenPairedReader, Alphabets, check_is_pair
+from khmer._oxli.parsing import FastxParser, SanitizedFastxParser
+from khmer._oxli.parsing import BrokenPairedReader, check_is_pair
 from khmer._oxli.parsing import check_is_right, check_is_left
+from khmer._oxli.sequence import Sequence, Alphabets
 from khmer.khmer_args import estimate_optimal_with_K_and_f as optimal_fp
 from khmer import reverse_complement as revcomp
 from khmer import reverse_hash as revhash
diff --git a/tests/test_cython_partitioning.py b/tests/test_cython_partitioning.py
new file mode 100644
index 0000000000..fda2983f72
--- /dev/null
+++ b/tests/test_cython_partitioning.py
@@ -0,0 +1,396 @@
+from __future__ import print_function
+from __future__ import absolute_import
+
+import gc
+import itertools
+import random
+
+import khmer
+from khmer._oxli.partitioning import 
StreamingPartitioner, Component +from khmer.khmer_args import estimate_optimal_with_K_and_f as optimal_fp +from khmer import reverse_complement as revcomp +from khmer import reverse_hash as revhash +from . import khmer_tst_utils as utils +from .graph_structure_fixtures import * + +import pytest +import screed + + +def teardown(): + utils.cleanup() + + +@pytest.fixture +def partitioner(graph): + sp = StreamingPartitioner(graph) + return graph, sp + + +@pytest.fixture +def single_component(partitioner, random_sequence): + graph, partitioner = partitioner + sequence = random_sequence() + partitioner.consume(sequence) + return graph, partitioner, sequence + + +class TestStreamingPartitionerBasic: + + def teardown_method(self, method): + # Force garbage to collect. When Python component objects exist and + # their underlying c++ Component objects are destroyed, the Python + # wrapper becomes the sole owner of the pointer. By manually collecting + # garbage between tests we assure that these objects are freed, and we + # can properly test the _n_destroyed property to make sure there are no + # real memory leaks. + gc.collect() + + def test_one_component(self, ksize, known_sequence): + inpath = utils.get_test_data('random-20-a.fa') + + cg = khmer.Countgraph(ksize, 1e5, 4) + sp = StreamingPartitioner(cg) + sp.consume(known_sequence) + + assert sp.n_components == 1 + + def test_two_components(self, ksize, random_sequence): + comp1 = random_sequence() + comp2 = random_sequence(exclude=comp1) + + cg = khmer.Nodegraph(ksize, 1e5, 4) + sp = StreamingPartitioner(cg) + + sp.consume(comp1) + assert sp.n_components == 1 + + sp.consume(comp2) + assert sp.n_components == 2 + + def test_components_iter(self, ksize, random_sequence): + comp1 = random_sequence() + comp2 = random_sequence(exclude=comp1) + + cg = khmer.Nodegraph(ksize, 1e5, 4) + sp = StreamingPartitioner(cg) + + sp.consume(comp1) + sp.consume(comp2) + assert sp.n_components == 2 + + comps = list(sp.components()) + assert len(comps) == 2 + + def test_component_n_tags(self, ksize, random_sequence): + seq = random_sequence() + + cg = khmer.Nodegraph(ksize, 1e5, 4) + sp = StreamingPartitioner(cg) + sp.consume(seq) + + tags = [t for t,c in sp.tag_components()] + comp = sp.find_nearest_component(seq[:ksize]) + assert len(tags) == len(comp) + + def test_tag_components_iter(self, ksize, random_sequence): + comp1 = random_sequence() + comp2 = random_sequence(exclude=comp1) + + cg = khmer.Nodegraph(ksize, 1e5, 4) + sp = StreamingPartitioner(cg) + + sp.consume(comp1) + sp.consume(comp2) + assert sp.n_components == 2 + + tags = [] + comps = set() + for tag, comp in sp.tag_components(): + tags.append(tag) + comps.add(comp) + + assert sum([len([tag for tag in comp]) for comp in comps]) == len(tags) + assert len(comps) == 2 + assert len(tags) == sum([len(c) for c in comps]) + + def test_find_nearest_component(self, ksize, random_sequence): + seq1 = random_sequence() + seq2 = random_sequence(exclude=seq1) + + cg = khmer.Nodegraph(ksize, 1e5, 4) + sp = StreamingPartitioner(cg) + + sp.consume(seq1) + sp.consume(seq2) + + c1 = sp.find_nearest_component(seq1[:ksize]) + c2 = sp.find_nearest_component(seq2[:ksize]) + assert c1.component_id != c2.component_id + + for tag in c1: + assert utils._contains_rc(seq1, revhash(tag, ksize)) + assert not utils._contains_rc(seq2, revhash(tag, ksize)) + + for tag in c2: + assert utils._contains_rc(seq2, revhash(tag, ksize)) + assert not utils._contains_rc(seq1, revhash(tag, ksize)) + + def test_merge_components(self, ksize, 
random_sequence): + seq1 = random_sequence() + seq2 = random_sequence(exclude=seq1) + + cg = khmer.Nodegraph(ksize, 1e5, 4) + sp = StreamingPartitioner(cg) + + sp.consume(seq1) + sp.consume(seq2) + assert sp.n_components == 2 + + sp.consume(seq1 + seq2) + assert sp.n_components == 1 + + comps = list(sp.components()) + assert len(comps) == 1 + + + def test_multi_merge_components(self, ksize, random_sequence): + seq1 = random_sequence() + seq2 = random_sequence(exclude=seq1) + seq3 = random_sequence(exclude=seq1+seq2) + + cg = khmer.Nodegraph(ksize, 1e5, 4) + sp = StreamingPartitioner(cg) + + sp.consume(seq1) + sp.consume(seq2) + sp.consume(seq3) + assert sp.n_components == 3 + + sp.consume(seq1 + seq2 + seq3) + assert sp.n_components == 1 + + def test_nomerge_k_minus_2_overlap(self, ksize, single_component, + random_sequence): + '''Test that components are not merged when they have a length K-2 overlap. + ''' + + graph, partitioner, seq = single_component + asm = khmer.LinearAssembler(graph) + first = seq[:ksize-2] + neighbor = random_sequence(exclude=seq) + first + + assert partitioner.n_components == 1 + partitioner.consume(neighbor) + print(seq, neighbor, asm.assemble(seq[:ksize]), sep='\n') + assert partitioner.n_components == 2 + + @pytest.mark.parametrize("where", ["beginning", "end"]) + def test_merge_k_minus_1_overlap(self, single_component, ksize, + random_sequence, where): + '''Test that components are merged when they have a length K-1 overlap. + ''' + + graph, partitioner, seq = single_component + asm = khmer.LinearAssembler(graph) + if where == "beginning": + overlap = seq[:ksize-1] + neighbor = random_sequence(exclude=seq) + overlap + else: + overlap = seq[-ksize+1:] + neighbor = overlap + random_sequence(exclude=seq) + + assert partitioner.n_components == 1 + partitioner.consume(neighbor) + path = asm.assemble(seq[:ksize]) + assert partitioner.n_components == 1 + + def test_merge_k_overlap(self, single_component, + random_sequence, ksize): + '''Test that components are merged when they have a length K overlap. 
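+
+        Sketch of the construction used below (illustrative; names are
+        those of this test):
+
+            first = seq[:ksize]                      # a full k-mer of seq
+            neighbor = random_sequence(exclude=seq) + first
+            # consuming neighbor shares a whole k-mer with seq, so the
+            # partitioner should collapse the two components into one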
+ ''' + + graph, partitioner, seq = single_component + asm = khmer.LinearAssembler(graph) + first = seq[:ksize] + neighbor = random_sequence(exclude=seq) + first + + assert partitioner.n_components == 1 + partitioner.consume(neighbor) + print(seq, neighbor, asm.assemble(seq[:ksize]), sep='\n') + assert partitioner.n_components == 1 + + + @pytest.mark.parametrize("n_reads", [100, 500, 1000]) + def test_one_component_from_reads(self, random_sequence, ksize, n_reads): + seq = random_sequence() + seq_reads = list(reads(seq, ksize, dbg_cover=True, N=n_reads)) + + G = khmer.Nodegraph(ksize, 1e6, 4) + sp = StreamingPartitioner(G) + for read in seq_reads: + sp.consume(read) + + assert sp.n_components == 1 + + @pytest.mark.parametrize("n_components", [3, 5, 10]) + def test_streaming_multicomponents(self, random_sequence, + ksize, n_components): + '''Test with many components from reads, and check for memory leaks.''' + seqs = [] + for _ in range(n_components): + seqs.append(random_sequence(exclude=''.join(seqs))) + + seq_reads = [] + for seq in seqs: + seq_reads.extend(list(reads(seq, ksize, dbg_cover=True, N=100))) + random.shuffle(seq_reads) + + G = khmer.Nodegraph(ksize, 1e6, 4) + sp = StreamingPartitioner(G) + + for read in seq_reads: + assert len(read) >= ksize + sp.consume(read) + assert sp.n_components == n_components + + comps = list(sp.components()) + comp = comps[0] + assert len(comps) == n_components + #assert sp.n_components == (comp._n_created - comp._n_destroyed) + assert sp.n_consumed == len(seq_reads) + + @pytest.mark.parametrize("n_components", [3, 5, 10]) + @pytest.mark.parametrize("cov", [1,10,20]) + def test_write_components(self, random_sequence, cov, + ksize, n_components, tmpdir): + outfn = tmpdir.join('counts.csv') + seqs = [] + for _ in range(n_components): + seqs.append(random_sequence(exclude=''.join(seqs))) + G = khmer.Countgraph(ksize, 1e6, 4) + sp = StreamingPartitioner(G) + + for seq in seqs: + for _ in range(cov): + sp.consume(seq) + for seq in seqs: + (med, _, _) = G.get_median_count(seq) + assert med == cov + assert sp.n_components == n_components + + sp.write_components(str(outfn)) + results = [line.strip().split(',') for line in outfn.open()] + assert len(results) == n_components + for row in results: + assert abs(float(row[2])-float(cov)) < 2 + + @pytest.mark.parametrize("n_components", [1, 3, 5, 10]) + def test_save_partitioner(self, random_sequence, ksize, + n_components, tmpdir): + import json + out_prefix = str(tmpdir.join('test_save')) + seqs = [] + for _ in range(n_components): + seqs.append(random_sequence(exclude=''.join(seqs))) + G = khmer.Countgraph(ksize, 1e6, 4) + sp = StreamingPartitioner(G) + for seq in seqs: + sp.consume(seq) + + sp.save(out_prefix) + + with open(out_prefix + '.json') as fp: + print(fp.read()) + fp.seek(0) + result = json.load(fp) + + assert 'graph' in result + assert result['graph'] == out_prefix + '.graph' + assert 'n_components' in result + assert result['n_components'] == n_components + result_comps = {d['component_id']: d for d in result['components']} + for comp in sp.components(): + assert comp.component_id in result_comps + + @pytest.mark.xfail + @pytest.mark.parametrize("n_components", [1, 3, 5, 10]) + def test_load_partitioner(self, random_sequence, ksize, + n_components, tmpdir): + import json + out_prefix = str(tmpdir.join('test_save')) + seqs = [] + for _ in range(n_components): + seqs.append(random_sequence(exclude=''.join(seqs))) + G = khmer.Countgraph(ksize, 1e6, 4) + sp = StreamingPartitioner(G) + for seq in seqs: 
+ sp.consume(seq) + + sp.save(out_prefix) + + sp2 = StreamingPartitioner.load(out_prefix + '.json') + assert sp.n_components == sp2.n_components + for (c1, c2) in zip(sp.components(), sp2.components()): + assert c1 == c2 + assert len(c1) == len(c2) + for t1, t2 in zip(c1, c2): + assert t1 == t2 + + +class TestStreamingPartitionerPaired: + + def teardown_method(self, method): + # Force garbage to collect. When Python component objects exist and + # their underlying c++ Component objects are destroyed, the Python + # wrapper becomes the sole owner of the pointer. By manually collecting + # garbage between tests we assure that these objects are freed, and we + # can properly test the _n_destroyed property to make sure there are no + # real memory leaks. + gc.collect() + + def test_one_paired_component(self, ksize, random_sequence): + first = random_sequence() + second = random_sequence(exclude=first) + + cg = khmer.Countgraph(ksize, 1e5, 4) + sp = StreamingPartitioner(cg) + sp.consume_pair(first, second) + + assert sp.n_components == 1 + + def test_two_paired_components_merge(self, ksize, random_sequence): + comp1 = random_sequence() + comp2 = random_sequence(exclude=comp1) + + cg = khmer.Nodegraph(ksize, 1e5, 4) + sp = StreamingPartitioner(cg) + + sp.consume(comp1) + assert sp.n_components == 1 + + sp.consume(comp2) + assert sp.n_components == 2 + + sp.consume_pair(comp1, comp2) + assert sp.n_components == 1 + + def test_multi_paired_components_merge(self, ksize, random_sequence): + seq1 = random_sequence() + seq2 = random_sequence(exclude=seq1) + seq3 = random_sequence(exclude=seq1+seq2) + + cg = khmer.Nodegraph(ksize, 1e5, 4) + sp = StreamingPartitioner(cg) + + sp.consume(seq1) + sp.consume(seq2) + sp.consume(seq3) + assert sp.n_components == 3 + + sp.consume_pair(seq1, seq2) + assert sp.n_components == 2 + + sp.consume_pair(seq2, seq3) + assert sp.n_components == 1 + diff --git a/tests/test_functions.py b/tests/test_functions.py index ff825b419f..a289c58b2b 100755 --- a/tests/test_functions.py +++ b/tests/test_functions.py @@ -188,68 +188,6 @@ def test_get_primes_fal(): assert "unable to find 5 prime numbers < 5" in str(err) -def test_extract_countgraph_info_badfile(): - try: - khmer.extract_countgraph_info( - utils.get_test_data('test-abund-read-2.fa')) - assert 0, 'this should fail' - except ValueError: - pass - - -def test_extract_countgraph_info(): - fn = utils.get_temp_filename('test_extract_counting.ct') - for size in [1e6, 2e6, 5e6, 1e7]: - ht = khmer.Countgraph(25, size, 4) - ht.save(fn) - - try: - info = khmer.extract_countgraph_info(fn) - except ValueError as err: - assert 0, 'Should not throw a ValueErorr: ' + str(err) - ksize, n_tables, table_size, _, _, _, _ = info - print(ksize, table_size, n_tables) - - assert(ksize) == 25 - assert table_size == size - assert n_tables == 4 - - try: - os.remove(fn) - except OSError as err: - assert 0, '...failed to remove ' + fn + str(err) - - -def test_extract_nodegraph_info_badfile(): - try: - khmer.extract_nodegraph_info( - utils.get_test_data('test-abund-read-2.fa')) - assert 0, 'this should fail' - except ValueError: - pass - - -def test_extract_nodegraph_info(): - fn = utils.get_temp_filename('test_extract_nodegraph.pt') - for size in [1e6, 2e6, 5e6, 1e7]: - ht = khmer.Nodegraph(25, size, 4) - ht.save(fn) - - info = khmer.extract_nodegraph_info(fn) - ksize, table_size, n_tables, _, _, _ = info - print(ksize, table_size, n_tables) - - assert(ksize) == 25 - assert table_size == size, table_size - assert n_tables == 4 - - try: - 
os.remove(fn)
-    except OSError as err:
-        print('...failed to remove {fn}'.format(fn) + str(err),
-              file=sys.stderr)
-
-
 def test_check_file_status_kfile():

     fn = utils.get_temp_filename('thisfiledoesnotexist')
diff --git a/tests/test_nodegraph.py b/tests/test_nodegraph.py
index 607d521bfe..b7ed4c1e5e 100755
--- a/tests/test_nodegraph.py
+++ b/tests/test_nodegraph.py
@@ -37,13 +37,14 @@

 import khmer
 from khmer import Nodegraph, Countgraph
-from khmer import ReadParser
+from khmer import FastxParser
 from khmer import reverse_complement as revcomp
 from khmer.khmer_args import create_matching_nodegraph
 import screed

 import pytest
+import os
+import sys

 from . import khmer_tst_utils as utils
@@ -61,6 +62,36 @@ def test_toobig():
         print(str(err))


+def test_extract_nodegraph_info_badfile():
+    try:
+        Nodegraph.extract_info(
+            utils.get_test_data('test-abund-read-2.fa'))
+        assert 0, 'this should fail'
+    except ValueError:
+        pass
+
+
+def test_extract_nodegraph_info():
+    fn = utils.get_temp_filename('test_extract_nodegraph.pt')
+    for size in [1e6, 2e6, 5e6, 1e7]:
+        ht = khmer.Nodegraph(25, size, 4)
+        ht.save(fn)
+
+        info = Nodegraph.extract_info(fn)
+        ksize, table_size, n_tables, _, _, _ = info
+        print(ksize, table_size, n_tables)
+
+        assert ksize == 25
+        assert table_size == size, table_size
+        assert n_tables == 4
+
+    try:
+        os.remove(fn)
+    except OSError as err:
+        print('...failed to remove {fn}'.format(fn=fn) + str(err),
+              file=sys.stderr)
+
+
 def test_add_tag():

     nodegraph = khmer.Nodegraph(6, 1, 1)
@@ -916,10 +947,10 @@ def test_consume_absentfasta():
     except TypeError as err:
         print(str(err))
     try:
-        readparser = ReadParser(utils.get_test_data('empty-file'))
-        nodegraph.consume_seqfile(readparser)
+        parser = FastxParser(utils.get_test_data('empty-file'))
+        nodegraph.consume_seqfile(parser)
         assert 0, "this should fail"
-    except OSError as err:
+    except RuntimeError as err:
         print(str(err))
     except ValueError as err:
         print(str(err))
@@ -936,10 +967,10 @@ def test_bad_primes():


 def test_consume_seqfile_and_tag_with_badreads_parser():
     nodegraph = khmer.Nodegraph(6, 1e6, 2)
     try:
-        readsparser = khmer.ReadParser(utils.get_test_data("test-empty.fa"))
-        nodegraph.consume_seqfile_and_tag(readsparser)
+        parser = FastxParser(utils.get_test_data("test-empty.fa"))
+        nodegraph.consume_seqfile_and_tag(parser)
         assert 0, "this should fail"
-    except OSError as e:
+    except RuntimeError as e:
         print(str(e))
     except ValueError as e:
         print(str(e))
diff --git a/tests/test_normalize_by_median.py b/tests/test_normalize_by_median.py
index 95ed93fbcf..ef94961a71 100755
--- a/tests/test_normalize_by_median.py
+++ b/tests/test_normalize_by_median.py
@@ -80,8 +80,8 @@ def test_normalize_by_median_empty_file():
     (_, _, err) = utils.runscript(script, args, in_dir)

     assert 'WARNING:' in err, err
-    assert 'is empty' in err, err
-    assert 'SKIPPED' in err, err
+    assert 'empty file' in err, err
+    assert 'Skipping' in err, err


 def test_normalize_by_median():
@@ -202,7 +202,8 @@ def test_normalize_by_median_unforced_badfile():
     args = ['-C', CUTOFF, '-k', '17', infile]
     (status, _, err) = utils.runscript(script, args, in_dir, fail_ok=True)
     assert status != 0
-    assert "ERROR: [Errno 2] No such file or directory:" in err, err
+    assert "ERROR" in err, err
+    assert "contains badly formatted sequence or does not exist." 
in err if os.path.exists(outfile): assert False, '.keep file should have been removed: ' @@ -608,6 +609,7 @@ def test_normalize_by_median_streaming_0(): assert linecount == 400 +@pytest.mark.skip(reason='Threading or streaming weirdness.') def test_normalize_by_median_streaming_1(): CUTOFF = '20' diff --git a/tests/test_scripts.py b/tests/test_scripts.py index 348a521bf3..ad815bf33d 100755 --- a/tests/test_scripts.py +++ b/tests/test_scripts.py @@ -973,6 +973,7 @@ def test_partition_graph_no_big_traverse(): assert x[0] == 4, x # should be four partitions, broken at knot. +@pytest.mark.xfail(reason='Deprecated legacy partitioning.') def test_partition_find_knots_execute(): graphbase = _make_graph(utils.get_test_data('random-20-a.fa')) @@ -989,6 +990,7 @@ def test_partition_find_knots_execute(): assert os.path.exists(stoptags_file) +@pytest.mark.xfail(reason='Deprecated legacy partitioning.') def test_partition_find_knots_existing_stoptags(): graphbase = _make_graph(utils.get_test_data('random-20-a.fa')) @@ -1688,13 +1690,14 @@ def test_sample_reads_randomly(): assert seqs == answer -def test_sample_reads_randomly_force_single(): +def test_sample_reads_randomly_single_mode(): infile = utils.copy_test_data('test-reads.fa') in_dir = os.path.dirname(infile) script = 'sample-reads-randomly.py' # fix random number seed for reproducibility - args = ['-N', '10', '-M', '12000', '-R', '1', '--force_single'] + args = ['-N', '10', '-M', '12000', '-R', '1', + '--pairing-mode', 'single'] args.append(infile) utils.runscript(script, args, in_dir) @@ -1730,13 +1733,14 @@ def test_sample_reads_randomly_force_single(): assert seqs == answer -def test_sample_reads_randomly_force_single_outfile(): +def test_sample_reads_randomly_single_mode_outfile(): infile = utils.copy_test_data('test-reads.fa') in_dir = os.path.dirname(infile) script = 'sample-reads-randomly.py' # fix random number seed for reproducibility - args = ['-N', '10', '-M', '12000', '-R', '1', '--force_single', '-o', + args = ['-N', '10', '-M', '12000', '-R', '1', + '--pairing-mode', 'single', '-o', in_dir + '/randreads.out'] args.append(infile) @@ -2098,32 +2102,22 @@ def execute_streaming_diginorm(ifilename): This is not directly executed but is run by the tests themselves ''' # Get temp filenames, etc. - fifo = utils.get_temp_filename('fifo') - in_dir = os.path.dirname(fifo) - script = 'normalize-by-median.py' - args = ['-C', '1', '-k', '17', '-o', 'outfile', fifo] - - # make a fifo to simulate streaming - os.mkfifo(fifo) - - # FIFOs MUST BE OPENED FOR READING BEFORE THEY ARE WRITTEN TO - # If this isn't done, they will BLOCK and things will hang. 
- thread = threading.Thread(target=utils.runscript, - args=(script, args, in_dir)) - thread.start() - ifile = io.open(ifilename, 'rb') - fifofile = io.open(fifo, 'wb') - # read binary to handle compressed files - chunk = ifile.read(8192) - while len(chunk) > 0: - fifofile.write(chunk) - chunk = ifile.read(8192) - - fifofile.close() + script = os.path.join(utils.scriptpath(), + 'normalize-by-median.py') + infile = utils.copy_test_data(ifilename) + in_dir = os.path.dirname(infile) + args = '-C 1 -k 17 -o outfile -' + cmd = 'cat {infile} | {script} {args}'.format(infile=infile, + script=script, + args=args) + (status, out, err) = utils.run_shell_cmd(cmd, in_directory=in_dir) - thread.join() + if status != 0: + print(out) + print(err) + assert status == 0, status - return in_dir + '/outfile' + return os.path.join(in_dir, 'outfile') def _execute_load_graph_streaming(filename): @@ -2186,6 +2180,7 @@ def test_screed_streaming_ufq(): assert seqs[0].startswith('CAGGCGCCCACCACCGTGCCCTCCAACCTGATGGT') +@pytest.mark.known_failing def test_screed_streaming_bzipfq(): # bzip compressed fq o = execute_streaming_diginorm(utils.get_test_data('100-reads.fq.bz2')) @@ -2194,6 +2189,7 @@ def test_screed_streaming_bzipfq(): assert seqs[0].startswith('CAGGCGCCCACCACCGTGCCCTCCAACCTGATGGT'), seqs +@pytest.mark.known_failing def test_screed_streaming_bzipfa(): # bzip compressed fa o = execute_streaming_diginorm( @@ -2204,7 +2200,6 @@ def test_screed_streaming_bzipfa(): assert seqs[0].startswith('GGTTGACGGGGCTCAGGGGG') -@pytest.mark.known_failing def test_screed_streaming_gzipfq(): # gzip compressed fq o = execute_streaming_diginorm(utils.get_test_data('100-reads.fq.gz')) @@ -2213,7 +2208,6 @@ def test_screed_streaming_gzipfq(): assert seqs[0].startswith('CAGGCGCCCACCACCGTGCCCTCCAACCTG') -@pytest.mark.known_failing def test_screed_streaming_gzipfa(): o = execute_streaming_diginorm( utils.get_test_data('test-abund-read-2.fa.gz')) @@ -2874,9 +2868,10 @@ def test_unique_kmers_multiple_inputs(): if entry.endswith('.py')]) def test_version_and_basic_citation(scriptname): with open(os.path.join(utils.scriptpath(), scriptname)) as script: + print(script) line = script.readline() line = script.readline() - if 'khmer' in line: + if 'khmer' in line and '_oxli.app' not in line: # check citation information appears when using --info status, out, err = utils.runscript(scriptname, ["--info"]) assert status == 0, status
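
For reference, the extract_info tests above exercise helpers that moved from
module-level functions (khmer.extract_countgraph_info and
khmer.extract_nodegraph_info) to classmethods on the graph types. A minimal
usage sketch, assuming the tuple layouts shown in those tests (the filenames
here are illustrative, not from the test data):

    import khmer
    from khmer import Countgraph, Nodegraph

    # Save a small countgraph, then read its header back.
    cg = Countgraph(25, 1e6, 4)        # ksize, table size, number of tables
    cg.save('example.ct')              # illustrative filename
    # Countgraph headers carry seven fields; only the first three are
    # checked here.
    ksize, n_tables, table_size, _, _, _, _ = Countgraph.extract_info('example.ct')
    assert ksize == 25 and n_tables == 4

    # Nodegraph headers carry six fields, with table_size and n_tables
    # swapped relative to Countgraph (per the tests above).
    ng = Nodegraph(25, 1e6, 4)
    ng.save('example.pt')
    ksize, table_size, n_tables, _, _, _ = Nodegraph.extract_info('example.pt')
    assert ksize == 25 and n_tables == 4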