diff --git a/.gitignore b/.gitignore index fedcf4a..c7ffdd2 100644 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,4 @@ kcollections.egg-info/ *.so *.pyc *.dylib +*.fa diff --git a/CMakeLists.txt b/CMakeLists.txt index 900c01b..c712d2c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -24,7 +24,7 @@ project(kcollections) #set(BOOST_REQUESTED_VERSION 1.72.0) #set(BOOST_ROOT_DIR ${CMAKE_SOURCE_DIR}/libs/boost-${BOOST_REQUESTED_VERSION}) #list(APPEND CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/cmake) -find_package(Boost ${BOOST_REQUESTED_VERSION} REQUIRED COMPONENTS serialization) +find_package(Boost 1.65.1 ${BOOST_REQUESTED_VERSION} REQUIRED COMPONENTS serialization) message("boost libs: ${Boost_LIBRARIES} ${Boost_INCLUDE_DIRS}") # The version number. @@ -128,7 +128,12 @@ else() add_executable(setsavetest "${SOURCE_DIR}/TestSetSave.cpp" $) target_link_libraries(setsavetest Threads::Threads Kset) set_target_properties(setsavetest PROPERTIES COMPILE_DEFINITIONS "KSET") - + + # Testing Kdict save + add_executable(dictsavetest "${SOURCE_DIR}/TestDictSave.cpp" $) + target_link_libraries(dictsavetest Threads::Threads Kset) + set_target_properties(dictsavetest PROPERTIES COMPILE_DEFINITIONS "KDICT") + # Testing Kcounter save add_executable(countersavetest "${SOURCE_DIR}/TestCounterSave.cpp" $) target_link_libraries(countersavetest Threads::Threads Kset) diff --git a/inc/Kcontainer.h b/inc/Kcontainer.h index 2df7bc5..a507fc9 100644 --- a/inc/Kcontainer.h +++ b/inc/Kcontainer.h @@ -11,13 +11,16 @@ #include #include #include -#include "Vertex.h" -#include "helper.h" //#include #include #include +#include "globals.h" +#include "helper.h" +#include "Vertex.h" + #if defined(PYTHON) +#include namespace py = pybind11; #endif @@ -25,6 +28,7 @@ namespace py = pybind11; template #endif + struct ThreadGlobals { #if defined(KSET) std::vector>>* kmers; diff --git a/inc/Kcounter.h b/inc/Kcounter.h index caa166e..8e0d5af 100644 --- a/inc/Kcounter.h +++ b/inc/Kcounter.h @@ -5,6 +5,7 @@ #include #include +#include "globals.h" #include "Kcontainer.h" class Kcounter diff --git a/inc/Kdict.h b/inc/Kdict.h index 6ddc4d2..70d141b 100644 --- a/inc/Kdict.h +++ b/inc/Kdict.h @@ -5,7 +5,9 @@ #include #include +#include "globals.h" #include "Kcontainer.h" +#include "opaque.h" template class Kdict @@ -70,12 +72,12 @@ class Kdict this->merge_func = merge_func; } - void add(char* kmer, T& obj) { + void add(const char* kmer, T obj) { CHECK_KMER_LENGTH(kmer, m_k, "Kdict"); kc->kcontainer_add(kmer, obj, overwrite_merge_func); } - bool contains(char* kmer) { + bool contains(const char* kmer) { CHECK_KMER_LENGTH(kmer, m_k, "Kdict"); return kc->kcontainer_contains(kmer); } @@ -89,12 +91,12 @@ class Kdict return kc->kcontainer_size(); } - void remove(char* kmer) { + void remove(const char* kmer) { CHECK_KMER_LENGTH(kmer, m_k, "Kdict"); kc->kcontainer_remove(kmer); } - T get(char* kmer) { + T& get(const char* kmer) { CHECK_KMER_LENGTH(kmer, m_k, "Kdict"); return kc->kcontainer_get(kmer); } @@ -144,7 +146,7 @@ class Kdict kc->parallel_kcontainer_add_init(threads, merge_func); } - void parallel_add(const char* kmer, T& value) { + void parallel_add(const char* kmer, T value) { kc->parallel_kcontainer_add(kmer, value); } diff --git a/inc/Kset.h b/inc/Kset.h index 4e46736..a8dbc3f 100644 --- a/inc/Kset.h +++ b/inc/Kset.h @@ -7,10 +7,10 @@ #include #include +#include "globals.h" #include "uint256_t.h" #include "Kcontainer.h" -#include "globals.h" #include "helper.h" class Kset diff --git a/inc/UContainer.h b/inc/UContainer.h index a6856a9..eb64dc3 100644 --- a/inc/UContainer.h +++ b/inc/UContainer.h @@ -91,6 +91,7 @@ class UC { } #else ar & objs; + suffixes = (uint8_t*) calloc(objs.size() * CDEPTH, sizeof(uint8_t)); for(size_t i = 0; i < objs.size() * CDEPTH; i++) { ar & suffixes[i]; diff --git a/inc/Vertex.h b/inc/Vertex.h index dffd555..fedf5eb 100644 --- a/inc/Vertex.h +++ b/inc/Vertex.h @@ -7,9 +7,9 @@ #include #include +#include "globals.h" #include "UContainer.h" -#include "globals.h" //#include #include "uint256_t.h" #include "uint128_t.h" diff --git a/inc/globals.h b/inc/globals.h index ba9623b..1b5e307 100644 --- a/inc/globals.h +++ b/inc/globals.h @@ -1,7 +1,7 @@ #pragma once #include - +//#include #define CAPACITY 4096 #define NHASHES 12 #define HASHSIZE 512 // HASHSIZE % 32 must be 0 @@ -11,3 +11,5 @@ typedef int count_dtype; #define MAXCOUNT UINT16_MAX extern int CDEPTH; + + diff --git a/inc/opaque.h b/inc/opaque.h new file mode 100644 index 0000000..0a4d350 --- /dev/null +++ b/inc/opaque.h @@ -0,0 +1,40 @@ +#if defined(KDICT) && defined(PYTHON) +#include +#include + +#include +PYBIND11_MAKE_OPAQUE(std::vector); +PYBIND11_MAKE_OPAQUE(std::vector); +PYBIND11_MAKE_OPAQUE(std::vector); +PYBIND11_MAKE_OPAQUE(std::vector); +PYBIND11_MAKE_OPAQUE(std::vector); + +PYBIND11_MAKE_OPAQUE(std::vector>); +PYBIND11_MAKE_OPAQUE(std::vector>); +PYBIND11_MAKE_OPAQUE(std::vector>); +PYBIND11_MAKE_OPAQUE(std::vector>); +PYBIND11_MAKE_OPAQUE(std::vector>); + +/* +PYBIND11_MAKE_OPAQUE(std::list); +PYBIND11_MAKE_OPAQUE(std::list); +PYBIND11_MAKE_OPAQUE(std::list); +PYBIND11_MAKE_OPAQUE(std::list); +PYBIND11_MAKE_OPAQUE(std::list); + +PYBIND11_MAKE_OPAQUE(std::list>); +PYBIND11_MAKE_OPAQUE(std::list>); +PYBIND11_MAKE_OPAQUE(std::list>); +PYBIND11_MAKE_OPAQUE(std::list>); +PYBIND11_MAKE_OPAQUE(std::list>); +*/ +/* +PYBIND11_MAKE_OPAQUE(std::set); +PYBIND11_MAKE_OPAQUE(std::set); +PYBIND11_MAKE_OPAQUE(std::set); +PYBIND11_MAKE_OPAQUE(std::set); +//PYBIND11_MAKE_OPAQUE(std::set); +*/ + +#endif + diff --git a/kcollections/__init__.py b/kcollections/__init__.py index cb69a4f..e718ebd 100644 --- a/kcollections/__init__.py +++ b/kcollections/__init__.py @@ -5,8 +5,11 @@ def create_kdict(base): class tkdict(base): - def __init__(self, k=0): + def __init__(self, k=0, caster=None, seq_caster=None, rcaster=None): super(tkdict, self).__init__(k) + self.caster = caster + self.rcaster = rcaster + self.seq_caster = seq_caster def __str__( self ): res = [] @@ -14,14 +17,52 @@ def __str__( self ): res.append( key + ':' + str( val ) ) return '{' + ','.join( res ) + '}' + ''' + def __getitem__(self, key): + if key in self: + val = super(tkdict, self).__getitem__(key) + print(self.rcaster) + try: + return self.rcaster(val) + except: + return val + else: + raise KeyError("kmer {} not in kdict".format(key)) + ''' + def __setitem__(self, key, val): + #print('setitem') + if self.caster is not None: + #print('casting', val) + val = self.caster(val) + #print(type(val)) + super(tkdict, self).__setitem__(key, val) + def __repr__( self ): return self.__str__() def items( self ): - return self.__iter__() - + for kmer, val in self.__iter__(): + try: + yield kmer, self.rcaster(val) + except: + yield kmer, val + + def parallel_add_seq(self, seq, values): + if self.seq_caster: + values = self.seq_caster(map(self.caster, values)) + super(tkdict, self).parallel_add_seq(seq, values) + + def add_seq(self, seq, values): + if self.seq_caster: + values = self.seq_caster(map(self.caster, values)) + super(tkdict, self).add_seq(seq, values) + def iteritems( self ): - return self.__iter__() + for kmer, val in self.__iter__(): + try: + yield kmer, self.rcaster(val) + except: + yield kmer, val def keys( self ): for kmer, val in self.__iter__(): @@ -29,7 +70,10 @@ def keys( self ): def values( self ): for kmer, val in self.__iter__(): - yield val + try: + yield self.rcaster(val) + except: + yield val def copy( self ): new_kdict = Kdict( self.k ) @@ -39,13 +83,21 @@ def copy( self ): def get( self, key, value = None ): if key in self: - return self[ key ] + val = self[key] + try: + return self.rcaster(val) + except: + return val else: return value def popitem( self ): kmer, item = next( self.items() ) del self[ kmer ] + try: + item = self.rcaster(item) + except: + pass return ( kmer, item ) def setdefault( self, key, value = None ): @@ -53,13 +105,20 @@ def setdefault( self, key, value = None ): self[ key ] = value return value else: - return self[ key ] + value = self[ key ] + try: + return self.rcaster(value) + except: + return value def pop( self, key, *default ): if key in self: value = self[ key ] del self[ key ] - return value + try: + return self.rcaster(value) + except: + return value else: if len( default ) > 0: return default @@ -74,7 +133,32 @@ def update( self, *others ): except: key = item val = other[ key ] - self[ key ] = val + try: + self[ key ] = self.caster(val) + except: + self[key] = val + + ''' + def add_seq(self, seq, values): + if self.caster: + caster = self.caster.__name__ + split = caster.find('_') + caster = eval(caster[:split] + '_vector' + caster[split:]) + values = caster(values) + super(tkdict, self).add_seq(seq, values) + + def parallel_add_seq(self, seq, values): + if self.caster: + print('casting') + caster = self.caster.__name__ + split = caster.find('_') + caster = eval(caster[:split] + '_vector' + caster[split:]) + print(caster) + values = caster(values) + print(type(values)) + super(tkdict, self).parallel_add_seq(seq, values) +''' + return tkdict @@ -83,9 +167,19 @@ def Kdict(val_type, k): iter(val_type) except: type_name = 'Kdict_' + val_type.__name__ + caster = None + rcaster = None + seq_caster = None + kd = create_kdict(eval(type_name))(k) else: type_name = 'Kdict_' + '_'.join([x.__name__ if x != list else 'vector' for x in val_type]) - return create_kdict(eval(type_name))(k) + # NOTE: for now, we just store everything as a list + caster = 'o' + '_'.join([x.__name__ if x != list else 'vector' for x in val_type]) + seq_caster = 'ovector_' + '_'.join([x.__name__ if x != list else 'vector' for x in val_type]) + # NOTE: this does not allow for nexted collections + rcaster = val_type[0] + kd = create_kdict(eval(type_name))(k, eval(caster), eval(seq_caster), rcaster) + return kd class Kset(KsetParent): diff --git a/kcollections/src/Kcollections.cc b/kcollections/src/Kcollections.cc index e76eb6a..80c88ed 100644 --- a/kcollections/src/Kcollections.cc +++ b/kcollections/src/Kcollections.cc @@ -1,15 +1,23 @@ -#if KDICT +#if defined(KDICT) && defined(PYTHON) #include #include +#include #include #include +#include #include "Kdict.h" +#include "globals.h" + + + template void declare_kdict_member(py::module &m, const std::string &typestr) { using CClass = Kdict; using VClass = Vertex; + + std::string pyclass_name = std::string("Kdict_") + typestr; m.doc() = R"pbdoc( @@ -23,6 +31,8 @@ void declare_kdict_member(py::module &m, const std::string &typestr) { Kdict )pbdoc"; + + py::class_(m, pyclass_name.c_str(), py::buffer_protocol(), py::dynamic_attr()) .def(py::init()) @@ -33,7 +43,7 @@ void declare_kdict_member(py::module &m, const std::string &typestr) { Takes two arguments, the kmer represented as a string and the object to set it to. )pbdoc") - .def("__getitem__", &CClass::get, R"pbdoc()pbdoc") + .def("__getitem__", &CClass::get, py::return_value_policy::reference) .def("__iter__", [](CClass& v) { return py::make_iterator(v.begin(), v.end()); }) .def("__contains__", &CClass::contains, R"pbdoc( Checks if a kmer is in Kdict @@ -61,9 +71,16 @@ void declare_kdict_member(py::module &m, const std::string &typestr) { .def("get_child_suffix", &CClass::get_child_suffix ) .def_property_readonly("k", &CClass::get_k) .def("parallel_add_init", &CClass::parallel_add_init, py::call_guard()) + .def("parallel_add_seq", [](CClass& kd, const char* seq, py::iterable& iter) { + kd.parallel_add_seq(seq, iter); + } + ) + .def("add_seq", [](CClass& kd, const char* seq, py::iterable& iter) { + kd.add_seq(seq, iter); + } + ) .def("parallel_add", &CClass::parallel_add) - .def("parallel_add_seq", &CClass::parallel_add_seq) - .def("add_seq", &CClass::add_seq) + //.def("add_seq", &CClass::add_seq) .def("parallel_add_join", &CClass::parallel_add_join, py::call_guard()) .def("set_merge_func", &CClass::set_merge_func); @@ -72,26 +89,44 @@ void declare_kdict_member(py::module &m, const std::string &typestr) { .def( "uc", &VClass::get_uc ); } + +template +void make_opaque(py::module& m, const std::string& name) { + std::string tname = "ovector_" + name; + py::bind_vector>(m, tname.c_str(), py::module_local()); +} + template void declare_kdict(py::module& m, const std::string& name) { - declare_kdict_member(m, name); declare_kdict_member>(m, std::string("vector_") + name); declare_kdict_member>(m, std::string("set_") + name); declare_kdict_member>(m, std::string("list_") + name); } - -//PYBIND11_MAKE_OPAQUE(std::vector); - PYBIND11_MODULE( _Kdict, m ) { + make_opaque(m, "int"); + make_opaque(m, "float"); + // NOTE: we use char because std::vector does not return references to items + // using char instead is a hack, should we change this? + make_opaque(m, "bool"); + make_opaque(m, "str"); + //make_opaque(m, "object"); + + make_opaque>(m, "vector_int"); + make_opaque>(m, "vector_float"); + make_opaque>(m, "vector_bool"); + make_opaque>(m, "vector_str"); + //make_opaque>(m, "vector_object"); + + declare_kdict(m, "int"); declare_kdict(m, "float"); // NOTE: we use char because std::vector does not return references to items // using char instead is a hack, should we change this? declare_kdict(m, "bool"); - declare_kdict(m, "string"); - declare_kdict(m, "object"); + declare_kdict(m, "str"); + //declare_kdict(m, "object"); //declare_kdict_member(m, "pylist"); declare_kdict_member>>(m, "list_list"); diff --git a/kcollections/src/TestDictSave.cpp b/kcollections/src/TestDictSave.cpp new file mode 100644 index 0000000..7fcb055 --- /dev/null +++ b/kcollections/src/TestDictSave.cpp @@ -0,0 +1,118 @@ +#include +#include "Kdict.h" +#include +#include +#include +#include + +using namespace std; + +int main(int argc, char* argv[]) { + int k = atoi(argv[1]); + char* file_path = argv[2]; + + string line; + string seq; + ifstream fh(file_path); + Kdict>* kc = new Kdict>(k); + std::set* s = new std::set(); + //kc->parallel_add_init(4); + + int c = 0; + if(fh.is_open()) { + while(getline(fh, line)) { + //std::cout << "inserting: " << line << std::endl; + //kc->add(line.c_str()); + if(line.c_str()[0] == '>') { + continue; + } + seq.append(line); + } + + int value = 0; + for(size_t i = 0; i < seq.size() - k + 1; i++) { + s->insert(string(seq.substr(i, k))); + kc->add(string(seq.substr(i, k)).c_str(), std::vector {i}); + } + + fh.close(); + std::cout << "seq size: " << seq.size() << std::endl; + //kc->add_seq(seq.c_str()); + + } else { + std::cout << "Could not open file: " << file_path << std::endl; + } + + //kc->parallel_add_join(); + + std::cout << "Kmer set contains " << kc->size() << " kmers" << std::endl; + + int before_count = 0; + int before_total = kc->size(); + + for(auto& it : *kc) { + //std::cout << it << std::endl; + before_count += 1; + } + + if(before_count != kc->size()) { + std::cout << "ERROR: iterating and traversed kmer counts are not equal" << std::endl; + } + + + std::cout << "\n\nSaving kset using boost..." << std::endl; + kc->write("testsave.bs"); + std::cout << "Done using boost. Deleting kset..." << std::endl; + delete kc; + std::cout << "\tDone!" << std::endl; + + + std::cout << "\n\nLoading kset using boost..." << std::endl; + kc = new Kdict>(); + kc->read("testsave.bs"); + std::cout << "\tDone!" << std::endl; + std::cout << "\n\nVerifying loaded kset..." << std::endl; + std::cout << "\tk: " << kc->get_k() << std::endl; + if(before_total == kc->size()){ + std::cout << "\tSUCCESS: current size matches previous size"; + } else { + std::cout << "\tERROR: current size does not match previous size"; + } + std::cout << " (current: " << kc->size() << ", previous: " << before_total << ")" << std::endl; + std::cout << "\tDone!" << std::endl; + + kc->get("TTTTCATTCTGACTGCAACGGGC").push_back(0); + for(auto& it : kc->get("TTTTCATTCTGACTGCAACGGGC")) { + std::cout << it << std::endl; + } + + int after_count = 0; + for(auto& it : *kc) { + //std::cout << it.first << "\t" << *it.second<< std::endl; + if(s->find(string(it.first)) == s->end()) { + std::cout << after_count << " could not find " << it.first << std::endl; + return -1; + } + after_count += 1; + } + + std::cout << "\n\nTesting kmer iteration" << std::endl; + if(after_count == before_count) { + std::cout << "\tSUCCESS: kmer iteration mounts match"; + } else { + std::cout << "\tERROR: kmer iteration counts failed"; + } + + std::cout << " (before: " << before_count << ", after: " << after_count << ")" << std::endl; + + /*fh.open(file_path); + if(fh.is_open()) { + while(getline(fh, line)) { + kc->remove(line.c_str()); + //std::cout << "kcet size: " << kc->size() << std::endl; + } + }*/ + + + return 0; +} diff --git a/setup.py b/setup.py index efb7f56..1ef0c4b 100755 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ from setuptools import setup, Extension, find_packages from setuptools.command.build_ext import build_ext -__version__ = '1.0.4' +__version__ = '1.0.5' class CMakeExtension(Extension): def __init__(self, name, sourcedir=''): diff --git a/travis/build_wheels.sh b/travis/build_wheels.sh index a602796..c4fbd8d 100755 --- a/travis/build_wheels.sh +++ b/travis/build_wheels.sh @@ -54,4 +54,4 @@ done # Install packages and test "${PYBIN}/pip" install kcollections --no-index -f /io/wheelhouse -(cd "$HOME"; "${PYBIN}/python" -c 'from kcollections import Kset, Kcounter, Kdict_int, Kdict_float, Kdict_string') +(cd "$HOME"; "${PYBIN}/python" -c 'from kcollections import Kset, Kcounter, Kdict_int, Kdict_float, Kdict_str')