diff --git a/libraries/triedent/CMakeLists.txt b/libraries/triedent/CMakeLists.txt index 69ea367d4..0ffef1bc9 100644 --- a/libraries/triedent/CMakeLists.txt +++ b/libraries/triedent/CMakeLists.txt @@ -5,12 +5,16 @@ find_package(Threads REQUIRED) add_library(triedent src/database.cpp src/mapping.cpp - src/gc_queue.cpp - src/ring_allocator.cpp - src/region_allocator.cpp - src/cache_allocator.cpp) + src/seg_allocator.cpp +# src/gc_queue.cpp +# src/ring_allocator.cpp +# src/region_allocator.cpp +# src/cache_allocator.cpp + ) target_include_directories(triedent PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include ${Boost_INCLUDE_DIRS}) target_link_libraries(triedent PUBLIC Threads::Threads) +#target_compile_options(triedent PUBLIC -fsanitize=thread ) +#target_link_options(triedent PUBLIC -fsanitize=thread ) if (CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(X86_64)|(amd64)|(AMD64)") if( NOT APPLE ) @@ -22,9 +26,6 @@ if (CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(X86_64)|(amd64)|(AMD64)") endif() endif() -add_executable(mermaid src/mermaid.cpp) -target_link_libraries(mermaid PUBLIC Boost::program_options triedent) -target_include_directories(mermaid PUBLIC ${Boost_INCLUDE_DIRS} ${CMAKE_CURRENT_SOURCE_DIR}/include) -set_target_properties(mermaid PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${ROOT_BINARY_DIR}) +add_subdirectory(programs) add_subdirectory(test) diff --git a/libraries/triedent/include/triedent/block_allocator.hpp b/libraries/triedent/include/triedent/block_allocator.hpp new file mode 100644 index 000000000..4434f6316 --- /dev/null +++ b/libraries/triedent/include/triedent/block_allocator.hpp @@ -0,0 +1,174 @@ +#pragma once +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include + +#include +#include + +#include + +namespace triedent +{ + + class block_allocator + { + public: + using id = uint32_t; + + block_allocator(std::filesystem::path file, + uint64_t block_size, + uint32_t max_blocks, + bool read_write = true) + : _filename(file), 
_block_size(block_size) + { + _max_blocks = max_blocks; + _block_mapping = new char_ptr[max_blocks]; + + int flags = O_CLOEXEC; + int flock_operation; + if (read_write) + { + flags |= O_RDWR; + flags |= O_CREAT; + flock_operation = LOCK_EX; + } + else + { + flags |= O_RDONLY; + flock_operation = LOCK_SH; + } + + _fd = ::open(file.native().c_str(), flags, 0644); + if (_fd == -1) { + std::cerr <<"opening " << file.native() <<"\n"; + throw std::runtime_error("unable to open block file"); + } + + if (::flock(_fd, flock_operation | LOCK_NB) != 0) + { + ::close(_fd); + throw std::system_error{errno, std::generic_category()}; + } + struct stat statbuf[1]; + if (::fstat(_fd, statbuf) != 0) + { + ::close(_fd); + throw std::system_error{errno, std::generic_category()}; + } + _file_size = statbuf->st_size; + if (_file_size % block_size != 0) + { + ::close(_fd); + throw std::runtime_error("block file isn't a multiple of block size"); + } + if (_file_size) + { + auto prot = PROT_READ | PROT_WRITE; //get_prot(_mode); + if (auto addr = ::mmap(nullptr, _file_size, prot, MAP_SHARED, _fd, 0); + addr != MAP_FAILED) + { + char* data = (char*)addr; + auto end = data + _file_size; + while (data != end) + { + _block_mapping[ _num_blocks.fetch_add(1) ] = data; + //_block_mapping.push_back(data); + data += _block_size; + } + // try_pin(&_pinned, addr, _size); + // std::cerr<<"madvise random " << int64_t(addr) <<" " << _size << " \n"; + // madvise(addr, _size, MADV_RANDOM ); + } + else + { + ::close(_fd); + throw std::system_error{errno, std::generic_category()}; + } + } + } + ~block_allocator() + { + if (_fd) + { + for( uint32_t i = 0; i < _num_blocks.load(); ++i ) + ::munmap(_block_mapping[i], _block_size); + ::close(_fd); + } + } + + uint64_t block_size() const { return _block_size; } + uint64_t num_blocks()const { return _num_blocks.load( std::memory_order_relaxed ); } + + /** + * This method brute forces syncing all blocks which likely + * flushes more than needed. 
+ */ + void sync(sync_type st) { + if (_fd and sync_type::none != st ) + { + uint64_t nb = num_blocks(); + for( uint32_t i = 0; i < nb; ++i ) + ::msync(_block_mapping[i], _block_size, msync_flag(st) ); + } + } + + // return the base pointer for the mapped segment + inline void* get(id i) { + assert( i < _num_blocks.load(std::memory_order_relaxed) ); + // this is safe because block mapping reserved capacity so + // resize should never move the data + return _block_mapping[i]; + } + + id alloc() + { + std::lock_guard l{_resize_mutex}; + + auto new_size = _file_size + _block_size; + if (::ftruncate(_fd, new_size) < 0) + { + throw std::system_error(errno, std::generic_category()); + } + + auto prot = PROT_READ | PROT_WRITE; //get_prot(_mode); + if (auto addr = ::mmap(nullptr, _block_size, prot, MAP_SHARED, _fd, _file_size); + addr != MAP_FAILED) + { + auto nb = _num_blocks.load( std::memory_order_relaxed ); + if( nb == _max_blocks ) + throw std::runtime_error("maximum block number reached"); + + _block_mapping[_num_blocks.load(std::memory_order_relaxed)] = (char*)addr; + _file_size = new_size; + return _num_blocks.fetch_add(1, std::memory_order_release); + } + if (::ftruncate(_fd, _file_size) < 0) + { + throw std::system_error(errno, std::generic_category()); + } + throw std::runtime_error("unable to mmap new block"); + } + + private: + std::filesystem::path _filename; + uint64_t _block_size; + uint64_t _max_blocks; + uint64_t _file_size; + int _fd; + std::atomic _num_blocks; + // std::vector _block_mapping; + using char_ptr = char*; + char_ptr* _block_mapping; + mutable std::mutex _resize_mutex; + }; +} // namespace triedent diff --git a/libraries/triedent/include/triedent/cache_allocator.hpp b/libraries/triedent/include/triedent/cache_allocator.hpp deleted file mode 100644 index 5641a0dc3..000000000 --- a/libraries/triedent/include/triedent/cache_allocator.hpp +++ /dev/null @@ -1,190 +0,0 @@ -#pragma once - -#include -#include -#include -#include - -#include 
-#include -#include -#include -#include -#include - -namespace triedent -{ - // Cache allocator manages all storage for the database. - // - // It maintains multiple buffers and moves accessed data to the hot - // buffer. Objects that are not accessed will be moved to successively - // lower buffers over time. - // - // Objects may be moved at any time. All data - // reads must be protected by a session lock which ensures that - // existing pointers remain valid. All writes must be protected - // by a location_lock, which prevents the data from being moved. - class cache_allocator - { - public: - using id = object_id; - - // cold_bytes can grow - // hot/warm/cool are fixed - // hot/warm/cool/cold MUST be more than twice the - // maximum allocation size. - struct config - { - uint64_t hot_bytes = 1000 * 1000ull; - uint64_t warm_bytes = 1000 * 1000ull; - uint64_t cool_bytes = 1000 * 1000ull; - uint64_t cold_bytes = 1000 * 1000ull; - }; - - cache_allocator(const std::filesystem::path& path, - const config& cfg, - access_mode mode, - bool allow_gc = false); - ~cache_allocator(); - - auto start_session() { return gc_queue::session{_gc}; } - - bool bump_count(object_id id) { return _obj_ids.bump_count(id); } - location_lock lock(object_id id) { return _obj_ids.lock(id); } - - // WARNING: alloc temporarily unlocks the session, which invalidates - // all existing pointers to allocated objects - // - // WARNING: alloc is blocking. It should not be called while - // holding any locks other than the session. It should also - // not be called by the swap thread. - std::pair alloc(std::unique_lock& session, - std::size_t num_bytes, - node_type type); - - std::pair release(session_lock_ref<>, id i); - - // The returned pointer will remain valid until the session lock is released - // get_cache is non-blocking. 
- template - std::tuple get_cache(session_lock_ref<> session, id i); - - std::uint16_t ref(id i) { return _obj_ids.ref(i); } - - static std::uint32_t object_size(void* ptr) - { - return reinterpret_cast(ptr)[-1].size; - } - - bool is_slow() const { return !_obj_ids.pinned() || !hot().pinned() || !warm().pinned(); } - - std::array, 5> span() const - { - return {_obj_ids.span(), hot().span(), warm().span(), cool().span(), cold().span()}; - } - - bool gc_retain(object_id i) { return _obj_ids.gc_retain(i); } - void gc_start() { _obj_ids.gc_start(); } - void gc_finish() { _obj_ids.gc_finish(); } - - void validate(id i) { _obj_ids.validate(i); } - - void print_stats(std::ostream& os, bool detail); - - private: - bool swap(gc_session&); - void* try_move_object(session_lock_ref<> session, - ring_allocator& to, - const location_lock& lock, - void* data, - std::uint32_t size); - - void swap_loop(); - - ring_allocator& hot() { return _levels[hot_cache]; } - ring_allocator& warm() { return _levels[warm_cache]; } - ring_allocator& cool() { return _levels[cool_cache]; } - region_allocator& cold() { return _cold; } - - const ring_allocator& hot() const { return _levels[hot_cache]; } - const ring_allocator& warm() const { return _levels[warm_cache]; } - const ring_allocator& cool() const { return _levels[cool_cache]; } - const region_allocator& cold() const { return _cold; } - - object_header* get_object(object_location loc) - { - if (loc.cache == cold_cache) - return _cold.get_object(loc.offset); - return _levels[loc.cache].get_object(loc.offset); - } - - gc_queue _gc; - object_db _obj_ids; - ring_allocator _levels[3]; - region_allocator _cold; - - std::atomic _done{false}; - std::thread _swap_thread; - std::thread _gc_thread; - }; - - inline std::pair cache_allocator::alloc( // - std::unique_lock& session, - std::size_t num_bytes, - node_type type) - { - if (num_bytes > 0xffffff - 8) [[unlikely]] - throw std::runtime_error("obj too big"); - - object_id i = 
_obj_ids.alloc(session, type); - hot().allocate(session, i, num_bytes, - [&](void*, object_location loc) { _obj_ids.init(i, loc); }); - - auto lock = _obj_ids.lock(i); - return {std::move(lock), get_object(_obj_ids.get(i))->data()}; - } - - inline std::pair cache_allocator::release(session_lock_ref<>, id i) - { - auto l = _obj_ids.release(i); - if (l.ref == 0 && l.cache == cold_cache) - { - cold().deallocate(l); - } - return {(l.ref > 0 ? nullptr : (char*)get_object(l)->data()), {l.type()}}; - } - - // The returned pointer will remain valid until the session lock is released - template - std::tuple cache_allocator::get_cache(session_lock_ref<> session, - id i) - { - auto loc = _obj_ids.get(i); - auto obj = get_object(loc); - - if constexpr (CopyToHot) - { - if (loc.cache != hot_cache && obj->size <= 4096) - { - // MUST NOT wait for free memory while holding a location lock - if (auto copy = - try_move_object(session, hot(), _obj_ids.lock(i), obj->data(), obj->size)) - { - if constexpr (debug_cache) - { - // std::osyncstream(std::cout) - // << "copied to hot: " << loc.cache << ":" << loc.offset() << std::endl; - } - return {copy, {loc.type()}, static_cast(loc.ref)}; - } - } - } - - if constexpr (debug_cache) - { - // std::osyncstream(std::cout) << "read: " << loc.cache << ":" << loc.offset() << std::endl; - } - return {obj->data(), {loc.type()}, static_cast(loc.ref)}; - } - -} // namespace triedent diff --git a/libraries/triedent/include/triedent/database.hpp b/libraries/triedent/include/triedent/database.hpp index 73ad2917b..5b8f88915 100644 --- a/libraries/triedent/include/triedent/database.hpp +++ b/libraries/triedent/include/triedent/database.hpp @@ -4,6 +4,7 @@ #include #include #include +#include #include namespace triedent @@ -20,14 +21,9 @@ namespace triedent struct write_access; struct read_access; - template - struct deref; - template struct mutable_deref; - inline key_type from_key6(const key_view sixb); - // Write thread usage notes: // * To create a new 
tree, default-initialize a shared_ptr // * To get the upper-most root, use write_session::get_top_root @@ -133,17 +129,18 @@ namespace triedent using string_view = std::string_view; using id = object_id; + auto lock() const { return _session.lock(); } + protected: - using swap_guard = std::lock_guard; - explicit session_base(cache_allocator& a); - operator gc_session&() const { return _session; } + explicit session_base(seg_allocator& a); + operator seg_allocator::session&() const { return _session; } public: key_view to_key6(key_view v) const; private: - mutable gc_session _session; - mutable key_type key_buf; + mutable seg_allocator::session _session; // or read_lock...? + mutable key_type key_buf; }; /** @@ -164,25 +161,51 @@ namespace triedent bool get(const std::shared_ptr& r, std::span key, std::vector* result_bytes, - std::vector>* result_roots) const; + std::vector>* result_roots = nullptr) const; std::optional> get(const std::shared_ptr& r, std::span key) const; + /// Assume keys a-z + /// + /// key = m + /// + /// greater_equal is m, or if m isn't present then it is n + /// less_than is l + /// max is z + /// next = m+1 or n, if keys are strings then next is 'ma' + /** + * TODO: verify these docs + * finds the first key greater than or equal to key, this can be used to find + * the first element by using an empty key() + * + * ie. 
lower_bound + */ bool get_greater_equal(const std::shared_ptr& r, std::span key, std::vector* result_key, - std::vector* result_bytes, - std::vector>* result_roots) const; + std::vector* result_bytes = nullptr, + std::vector>* result_roots = nullptr) const; + + /** + * TODO: verify these docs + * finds the largest key less than key + */ bool get_less_than(const std::shared_ptr& r, std::span key, std::vector* result_key, - std::vector* result_bytes, - std::vector>* result_roots) const; + std::vector* result_bytes = nullptr, + std::vector>* result_roots = nullptr) const; + /** + * TODO: verify these docs + * + * finds the largest key with the given prefix, this can be used to find + * the last key by using an empty prefix. + */ bool get_max(const std::shared_ptr& r, std::span prefix, std::vector* result_key, - std::vector* result_bytes, - std::vector>* result_roots) const; + std::vector* result_bytes = nullptr, + std::vector>* result_roots = nullptr) const; void print(const std::shared_ptr& r); void validate(const std::shared_ptr& r); @@ -194,12 +217,12 @@ namespace triedent session(const session&) = delete; inline object_id get_id(const std::shared_ptr& r) const; - void validate(session_lock_ref<> l, id); + void validate(session_rlock& l, id); void print(id n, string_view prefix = "", std::string k = ""); - inline deref get_by_id(session_lock_ref<> l, object_id i) const; - inline deref get_by_id(session_lock_ref<> l, object_id i, bool& unique) const; + inline deref get_by_id(session_rlock& l, object_id i) const; + inline deref get_by_id(session_rlock& l, object_id i, bool& unique) const; - bool unguarded_get(session_lock_ref<> l, + bool unguarded_get(session_rlock& l, const std::shared_ptr& ancestor, object_id root, std::string_view key, @@ -213,39 +236,41 @@ namespace triedent std::vector>* result_roots) const; bool unguarded_get_greater_equal( - session_lock_ref<> l, + session_rlock& l, const std::shared_ptr& ancestor, object_id root, std::string_view key, - 
std::vector& result_key, + temp_key6& result_key, std::vector* result_bytes, std::vector>* result_roots) const; bool unguarded_get_less_than( - session_lock_ref<> l, + session_rlock& l, const std::shared_ptr& ancestor, object_id root, std::optional key, - std::vector& result_key, + temp_key6& result_key, std::vector* result_bytes, std::vector>* result_roots) const; - bool unguarded_get_max(session_lock_ref<> l, + bool unguarded_get_max(session_rlock& l, const std::shared_ptr& ancestor, object_id root, std::string_view prefix_min, std::string_view prefix_max, - std::vector& result_key, + temp_key6& result_key, std::vector* result_bytes, std::vector>* result_roots) const; - inline id retain(std::unique_lock&, id); - inline void release(session_lock_ref<> l, id); + inline id retain(session_rlock&, id); // bump or copy + inline void release(session_rlock&, id); // polymorphic release node friend class database; std::shared_ptr _db; - cache_allocator& ring() const; + seg_allocator& sega() const; + + void cache(auto& objref) const; }; using read_session = session; @@ -255,7 +280,7 @@ namespace triedent write_session(std::shared_ptr db) : read_session(db) {} std::shared_ptr get_top_root(); - void set_top_root(const std::shared_ptr& r); + void set_top_root(const std::shared_ptr& r, bool sync = false); int upsert(std::shared_ptr& r, std::span key, std::span val); @@ -283,85 +308,134 @@ namespace triedent void end_collect_garbage(); ///@} + void validate() + { + auto tr = get_id(get_top_root()); + auto state = session_base::lock(); + validate_node(state, tr); + } + private: inline bool get_unique(std::shared_ptr& r); - inline void update_root(session_lock_ref<> l, std::shared_ptr& r, object_id id); - - void recursive_retain(session_lock_ref<> l, object_id id); - - mutable_deref make_value(std::unique_lock& session, - node_type type, - string_view k, - string_view v); - mutable_deref clone_value(std::unique_lock& session, - object_id origin, - node_type type, - string_view 
key, - std::uint32_t key_offset, - string_view val); - - mutable_deref clone_value(std::unique_lock& session, - object_id origin, - node_type type, - const std::string& key, - string_view val); - inline mutable_deref make_inner(std::unique_lock& session, - string_view pre, - id val, - uint64_t branches); - inline mutable_deref clone_inner(std::unique_lock& session, - object_id id, - const inner_node& cpy, - string_view pre, - std::uint32_t offset, - object_id val, - uint64_t branches); - inline mutable_deref clone_inner(std::unique_lock& session, - object_id id, - const inner_node& cpy, - const std::string& pre, - object_id val, - uint64_t branches); + inline void update_root(session_rlock& l, std::shared_ptr& r, object_id id); + + void recursive_retain(session_rlock& l, object_id id); + + mutable_deref make_value(session_rlock& state, + node_type type, + string_view k, + string_view v); + + inline object_id make_value_id(session_rlock& state, + node_type type, + string_view k, + string_view v); + + mutable_deref clone_value(session_rlock& state, + object_id origin, + node_type type, + string_view key, + std::uint32_t key_offset, + string_view val); + + // like clone_value but doesn't construct a mutable_deref which does + // unnecessary locking + inline object_id clone_value_id(session_rlock& state, + object_id origin, + node_type type, + string_view key, + std::uint32_t key_offset, + string_view val); + + inline mutable_deref clone_value(session_rlock& state, + object_id origin, + node_type type, + const std::string& key, + string_view val); + + inline object_id clone_value_id(session_rlock& state, + object_id origin, + node_type type, + const std::string& key, + string_view val); + + inline mutable_deref make_inner(session_rlock& state, + string_view pre, + id val, + uint64_t branches); + + inline object_id make_inner_id(session_rlock& state, + string_view pre, + id val, + uint64_t branches); + + inline mutable_deref clone_inner(session_rlock& state, + object_id 
id, + const inner_node& cpy, + string_view pre, + std::uint32_t offset, + object_id val, + uint64_t branches); + inline mutable_deref clone_inner(session_rlock& state, + object_id id, + const inner_node& cpy, + const std::string& pre, + object_id val, + uint64_t branches); + + inline object_id clone_inner_id(session_rlock& state, + object_id id, + const inner_node& cpy, + string_view pre, + std::uint32_t offset, + object_id val, + uint64_t branches); + inline object_id clone_inner_id(session_rlock& state, + object_id id, + const inner_node& cpy, + const std::string& pre, + object_id val, + uint64_t branches); template inline mutable_deref lock(const deref& obj); - inline id add_child(std::unique_lock& session, - id root, - bool unique, - node_type type, - string_view key, - string_view val, - int& old_size); - inline id remove_child(std::unique_lock& session, - id root, - bool unique, - string_view key, - int& removed_size); - - inline void modify_value(session_lock_ref<> l, + inline id add_child(session_rlock& state, + id root, + bool unique, + node_type type, + string_view key, + string_view val, + int& old_size); + inline id remove_child(session_rlock& state, + id root, + bool unique, + string_view key, + int& removed_size); + + inline void modify_value(session_rlock& state, mutable_deref mut, string_view val); - inline id set_value(std::unique_lock& session, - deref n, - bool unique, - node_type type, - string_view key, - string_view val); - inline id set_inner_value(std::unique_lock& session, - deref n, - bool unique, - node_type type, - string_view val); - inline id combine_value_nodes(std::unique_lock& session, - node_type t1, - string_view k1, - string_view v1, - object_id origin1, - node_type t2, - string_view k2, - string_view v2, - object_id origin2); + inline id set_value(session_rlock& state, + deref n, + bool unique, + node_type type, + string_view key, + string_view val); + inline id set_inner_value(session_rlock& state, + deref n, + bool unique, + 
node_type type, + string_view val); + inline id combine_value_nodes(session_rlock& state, + node_type t1, + string_view k1, + string_view v1, + object_id origin1, + node_type t2, + string_view k2, + string_view v2, + object_id origin2); }; class database : public std::enable_shared_from_this @@ -374,7 +448,47 @@ namespace triedent friend root; public: - using config = cache_allocator::config; + struct config + { + /** + * Read threads can move the accessed data into + * a warm cache to improve cache locality and separate + * infrequently used data from frequently used data. + * + * If used with anything other than sync_type::none, this + * will produce write amplification somewhat less than + * the total data read because on sync() the moved cache + * values must be flushed to disk. + */ + bool cache_on_read = false; + + /** + * By default triedent starts a background thread which + * will compact data ones a segment + */ + bool run_compact_thread = true; + + /** + * The max amount of a segment that is allowed to be empty + * before the compactor thread will move the remaining contents + * to a new segment. + * + * Lower values save space, but produce more write amplification when + * using sync_type other than none. Lower values improve cache + * locality and reduce page misses by keeping the data denser. 
+ */ + int compact_empty_threshold_percent = 20; + + /** + * Triedent will discourage the OS from swapping out + * the most recently used segments by using mlock(), + * may want a higher compaction threshold if using mlock() + * + */ + uint64_t max_pinnable_segments = 64; + + sync_type sync_mode = sync_type::none; + }; static constexpr auto read_write = access_mode::read_write; static constexpr auto read_only = access_mode::read_only; @@ -388,6 +502,9 @@ namespace triedent database(const std::filesystem::path& dir, access_mode mode, bool allow_gc = false); ~database(); + void start_compact_thread() { _sega.start_compact_thread(); } + bool compact_next_segment() { return _sega.compact_next_segment(); } + static void create(std::filesystem::path dir, config); std::shared_ptr start_write_session(); @@ -395,11 +512,12 @@ namespace triedent void print_stats(std::ostream& os, bool detail = false); - bool is_slow() const { return _ring.is_slow(); } - auto span() const { return _ring.span(); } + // bool is_slow() const { return _ring.is_slow(); } + // auto span() const { return _ring.span(); } + private: - inline void release(session_lock_ref<> l, id); + inline void release(session_rlock& l, id); struct database_memory { @@ -414,7 +532,7 @@ namespace triedent std::atomic top_root; }; - cache_allocator _ring; + seg_allocator _sega; mapping _file; database_memory* _dbm; @@ -423,6 +541,7 @@ namespace triedent std::mutex _root_release_session_mutex; session_base _root_release_session; + config _config; }; inline root::~root() @@ -434,99 +553,22 @@ namespace triedent if (db && id && !ancestor) { std::lock_guard lock(db->_root_release_session_mutex); - session_base::swap_guard guard(db->_root_release_session); - db->release(guard, id); + auto state = db->_root_release_session.lock(); + db->release(state, id); } } - template - struct deref - { - using id = object_id; - - deref(std::pair p, node_type t) - : _id(p.first), ptr((char*)p.second), _type(t) - { - } - deref(std::pair p) 
- : _id(p.first), ptr((char*)p.second), _type(node_type::inner) - { - } - template - deref(deref p) : _id(p._id), ptr((char*)p.ptr), _type(p._type) - { - } - deref(id i, void* p, node_type t) : _id(i), ptr(p), _type(t) {} - - explicit inline operator bool() const { return bool(_id); } - inline operator id() const { return _id; } - - auto type() const { return _type; } - bool is_leaf_node() const { return _type != node_type::inner; } - inline auto& as_value_node() const { return *reinterpret_cast(ptr); } - inline auto& as_inner_node() const { return *reinterpret_cast(ptr); } - - inline const T* operator->() const { return reinterpret_cast(ptr); } - inline const T& operator*() const { return *reinterpret_cast(ptr); } - - int64_t as_id() const { return _id.id; } - - // Allocation invalidates pointers. reload will make the deref object - // valid again after an allocation. - void reload(cache_allocator& a, session_lock_ref<> session) - { - auto [p, type, ref] = a.get_cache(session, _id); - ptr = p; - } - - protected: - template - friend class deref; - - id _id; - void* ptr; - node_type _type; - }; // deref - - template - struct mutable_deref : deref - { - mutable_deref() = default; - mutable_deref(std::pair p, node_type type) - : deref{{p.first.get_id(), p.second}, type}, lock{std::move(p.first)} - { - } - mutable_deref(std::pair p) - : deref{{p.first.get_id(), p.second}}, lock{std::move(p.first)} - { - } - mutable_deref(location_lock lock, const deref& src) : lock{std::move(lock)}, deref{src} - { - } - - inline auto& as_value_node() const { return *reinterpret_cast(this->ptr); } - inline auto& as_inner_node() const { return *reinterpret_cast(this->ptr); } - - inline T* operator->() const { return reinterpret_cast(this->ptr); } - inline T& operator*() const { return *reinterpret_cast(this->ptr); } - - auto get_id() { return lock.get_id(); } - - private: - location_lock lock; - }; // mutable_deref - - inline session_base::session_base(cache_allocator& a) : 
_session(a.start_session()) {} + inline session_base::session_base(seg_allocator& a) : _session(a.start_session()) {} template - inline cache_allocator& session::ring() const + inline seg_allocator& session::sega() const { - return _db->_ring; + return _db->_sega; } template session::session(std::shared_ptr db) - : session_base{db->_ring}, _db(std::move(db)) + : session_base{db->_sega}, _db(std::move(db)) { } template @@ -545,18 +587,17 @@ namespace triedent } template - inline deref session::get_by_id(session_lock_ref<> l, id i) const + inline deref session::get_by_id(session_rlock& state, id i) const { - auto [ptr, type, ref] = ring().template get_cache(l, i); - return {i, ptr, type}; + return deref(state.get(i)); // TODO: cache } template - inline deref session::get_by_id(session_lock_ref<> l, id i, bool& unique) const + inline deref session::get_by_id(session_rlock& state, id i, bool& unique) const { - auto [ptr, type, ref] = ring().template get_cache(l, i); - unique &= ref == 1; - return {i, ptr, type}; + auto ob = state.get(i); + unique &= (ob.ref_count() == 1); + return deref(ob); } template @@ -569,29 +610,30 @@ namespace triedent << std::endl; if (r.use_count() == 1 && r->db && !r->ancestor && r->id) { - auto id = r->id; - r->id = {}; - swap_guard g(*this); - release(g, id); + auto id = r->id; + r->id = {}; + auto state = lock(); + release(state, id); } r = {}; } template - inline void session::release(session_lock_ref<> l, id obj) + inline void session::release(session_rlock& state, id obj) { - _db->release(l, obj); + _db->release(state, obj); } - inline void database::release(session_lock_ref<> l, id obj) + + inline void database::release(session_rlock& state, id obj) { - release_node(l, _ring, obj); + release_node(state, obj); } template - inline database::id session::retain(std::unique_lock& session, id obj) + inline database::id session::retain(session_rlock& state, id obj) { - return bump_refcount_or_copy(ring(), session, obj); + return 
bump_refcount_or_copy(state, obj); } // This always returns a view into the first argument @@ -617,12 +659,12 @@ namespace triedent return result; } - std::unique_lock l(*this); - id = retain(l, {id}).id; + auto state = session_base::lock(); + id = retain(state, {id}).id; return std::make_shared(root{_db, nullptr, {id}}); } - inline void write_session::set_top_root(const std::shared_ptr& r) + inline void write_session::set_top_root(const std::shared_ptr& r, bool sync) { std::lock_guard lock(_db->_root_change_mutex); auto current = _db->_dbm->top_root.load(); @@ -634,12 +676,23 @@ namespace triedent return; } - std::unique_lock l(*this); + auto state = session_base::lock(); + /* + if( id.id and not validate_node( state, id ) ) { + throw std::runtime_error( "invalid node!" ); + } + */ + if constexpr (debug_roots) std::cout << id.id << ": set_top_root: old=" << current << std::endl; - id = retain(l, id); + id = retain(state, id); _db->_dbm->top_root.store(id.id); - release(l, {current}); + if (_db->_config.sync_mode != sync_type::none) + { + _db->_sega.sync(_db->_config.sync_mode); // data backing it is written here + _db->_file.sync(_db->_config.sync_mode); // top root is written here + } + release(state, {current}); } inline bool write_session::get_unique(std::shared_ptr& r) @@ -648,9 +701,7 @@ namespace triedent return r && r->db && !r->ancestor && r.use_count() == 1; } - inline void write_session::update_root(session_lock_ref<> l, - std::shared_ptr& r, - object_id id) + inline void write_session::update_root(session_rlock& l, std::shared_ptr& r, object_id id) { if (r && r->db && r->id == id) { @@ -668,12 +719,15 @@ namespace triedent // bumped. 
if constexpr (debug_roots) std::cout << id.id << ": update_root replacing:" << r->id.id << std::endl; + release(l, r->id); + r->id = id; } else { - if constexpr (debug_roots) { + if constexpr (debug_roots) + { if (r == nullptr) { std::cout << id.id << ": update_root original was nullptr" << std::endl; @@ -688,87 +742,138 @@ namespace triedent } } - inline mutable_deref write_session::make_value(std::unique_lock& session, - node_type type, - string_view key, - string_view val) + inline mutable_deref write_session::make_value(session_rlock& state, + node_type type, + string_view key, + string_view val) { - return {value_node::make(ring(), session, key, val, type), type}; + return {value_node::make(state, key, val, type)}; } - inline mutable_deref write_session::clone_value( - std::unique_lock& session, - object_id origin, - node_type type, - string_view key, - std::uint32_t key_offset, - string_view val) + inline object_id write_session::make_value_id(session_rlock& state, + node_type type, + string_view key, + string_view val) { - return {value_node::clone(ring(), session, origin, key, key_offset, val, type), type}; + auto obr = value_node::make(state, key, val, type); + obr.obj()->update_checksum(); + return obr.id(); } - inline mutable_deref write_session::clone_value( - std::unique_lock& session, - object_id origin, - node_type type, - const std::string& key, - string_view val) + inline object_id write_session::clone_value_id(session_rlock& state, + object_id origin, + node_type type, + string_view key, + std::uint32_t key_offset, + string_view val) { - return {value_node::clone(ring(), session, origin, key, -1, val, type), type}; + auto obr = value_node::clone(state, origin, key, key_offset, val, type); + obr.obj()->update_checksum(); + return obr.id(); } - inline mutable_deref write_session::make_inner(std::unique_lock& session, - string_view pre, - id val, - uint64_t branches) + inline mutable_deref write_session::clone_value(session_rlock& state, + object_id 
origin, + node_type type, + const std::string& key, + string_view val) { - return inner_node::make(ring(), session, pre, val, branches); + return {value_node::clone(state, origin, key, -1, val, type)}; + } + inline object_id write_session::clone_value_id(session_rlock& state, + object_id origin, + node_type type, + const std::string& key, + string_view val) + { + auto obr = value_node::clone(state, origin, key, -1, val, type); + obr.obj()->update_checksum(); + return obr.id(); } - inline mutable_deref write_session::clone_inner( - std::unique_lock& session, - object_id id, - const inner_node& cpy, - string_view pre, - std::uint32_t offset, - object_id val, - uint64_t branches) + inline mutable_deref write_session::make_inner(session_rlock& state, + string_view pre, + id val, + uint64_t branches) { - return inner_node::clone(ring(), session, id, &cpy, pre, offset, val, branches); + return inner_node::make(state, pre, val, branches); + } + inline object_id write_session::make_inner_id(session_rlock& state, + string_view pre, + id val, + uint64_t branches) + { + auto obr = inner_node::make(state, pre, val, branches); + obr.obj()->update_checksum(); + return obr.id(); } - inline mutable_deref write_session::clone_inner( - std::unique_lock& session, - object_id id, - const inner_node& cpy, - const std::string& pre, - object_id val, - uint64_t branches) + inline mutable_deref write_session::clone_inner(session_rlock& state, + object_id id, + const inner_node& cpy, + string_view pre, + std::uint32_t offset, + object_id val, + uint64_t branches) { - return inner_node::clone(ring(), session, id, &cpy, pre, -1, val, branches); + return inner_node::clone(state, id, &cpy, pre, offset, val, branches); + } + + inline object_id write_session::clone_inner_id(session_rlock& state, + object_id id, + const inner_node& cpy, + string_view pre, + std::uint32_t offset, + object_id val, + uint64_t branches) + { + auto obr = inner_node::clone(state, id, &cpy, pre, offset, val, branches); + 
obr.obj()->update_checksum(); + return obr.id(); + } + + inline mutable_deref write_session::clone_inner(session_rlock& state, + object_id id, + const inner_node& cpy, + const std::string& pre, + object_id val, + uint64_t branches) + { + return inner_node::clone(state, id, &cpy, pre, -1, val, branches); + } + inline object_id write_session::clone_inner_id(session_rlock& state, + object_id id, + const inner_node& cpy, + const std::string& pre, + object_id val, + uint64_t branches) + { + auto obr = inner_node::clone(state, id, &cpy, pre, -1, val, branches); + obr.obj()->update_checksum(); + return obr.id(); } template inline mutable_deref write_session::lock(const deref& obj) { - return {ring().lock(obj), obj}; + return {obj}; } /** * Given an existing value node and a new key/value to insert */ - database::id write_session::combine_value_nodes(std::unique_lock& session, - node_type t1, - string_view k1, - string_view v1, - object_id origin1, - node_type t2, - string_view k2, - string_view v2, - object_id origin2) + database::id write_session::combine_value_nodes(session_rlock& state, + node_type t1, + string_view k1, + string_view v1, + object_id origin1, + node_type t2, + string_view k2, + string_view v2, + object_id origin2) { if (k1.size() > k2.size()) - return combine_value_nodes(session, t2, k2, v2, origin2, t1, k1, v1, origin1); + return combine_value_nodes(state, t2, k2, v2, origin2, t1, k1, v1, origin1); //std::cerr << __func__ << ":" << __LINE__ << "\n"; auto cpre = common_prefix(k1, k2); @@ -800,33 +905,39 @@ namespace triedent if (cpre == k1) { auto [inner_id, branch_id] = build_children( - [&] { return clone_value(session, origin1, t1, k1, k1.size(), v1); }, - [&] { return clone_value(session, origin2, t2, k2, cpre.size() + 1, v2); }); + [&] { return clone_value_id(state, origin1, t1, k1, k1.size(), v1); }, + [&] { return clone_value_id(state, origin2, t2, k2, cpre.size() + 1, v2); }); + + // this usesthe non-locking deref because no alloc before return 
+ auto in = inner_node::make(state, cpre, id(), 1ull << b2); - auto in = make_inner(session, cpre, id(), 1ull << b2); // Set value separately, because we don't want to increment its refcount in->set_value(inner_id); in->branch(b2) = branch_id; - return in; + in.obj()->update_checksum(); + + return in.id(); } else { auto b1sfx = k1.substr(cpre.size()); auto b1 = b1sfx.front(); auto [b1id, b2id] = build_children( - [&] { return clone_value(session, origin1, t1, k1, cpre.size() + 1, v1); }, - [&] { return clone_value(session, origin2, t2, k2, cpre.size() + 1, v2); }); + [&] { return clone_value_id(state, origin1, t1, k1, cpre.size() + 1, v1); }, + [&] { return clone_value_id(state, origin2, t2, k2, cpre.size() + 1, v2); }); - auto in = make_inner(session, cpre, id(), inner_node::branches(b1, b2)); + // this usesthe non-locking deref because there are no alloc before return + auto in = inner_node::make(state, cpre, id(), inner_node::branches(b1, b2)); in->branch(b1) = b1id; in->branch(b2) = b2id; - return in; + in.obj()->update_checksum(); + return in.id(); } } - void write_session::modify_value(session_lock_ref<> l, + void write_session::modify_value(session_rlock& l, mutable_deref mut, string_view val) { @@ -834,7 +945,7 @@ namespace triedent { if constexpr (debug_roots) { - std::cout << mut.get_id().id << ": modify_value; old:"; + std::cout << mut.id().id << ": modify_value; old:"; for (unsigned i = 0; i < mut->num_roots(); ++i) std::cout << " " << mut->roots()[i].id; std::cout << std::endl; @@ -852,7 +963,7 @@ namespace triedent if constexpr (debug_roots) { - std::cout << mut.get_id().id << ": modify_value; new:"; + std::cout << mut.id().id << ": modify_value; new:"; for (unsigned i = 0; i < mut->num_roots(); ++i) std::cout << " " << mut->roots()[i].id; std::cout << std::endl; @@ -862,63 +973,68 @@ namespace triedent memcpy(mut->data_ptr(), val.data(), val.size()); } - database::id write_session::set_value(std::unique_lock& session, - deref n, - bool unique, - 
node_type type, - string_view key, - string_view val) + database::id write_session::set_value(session_rlock& state, + deref n, + bool unique, + node_type type, + string_view key, + string_view val) { if (!n || !unique || type != n.type()) - return make_value(session, type, key, val); + return make_value_id(state, type, key, val); assert(n.is_leaf_node()); auto& vn = n.as_value_node(); if (vn.data_size() == val.size()) { - modify_value(session, lock(deref(n)), val); - return n; + modify_value(state, deref(n), val); + assert(n.obj()->validate_checksum()); + return n.id(); } - return make_value(session, type, key, val); + return make_value_id(state, type, key, val); } - database::id write_session::set_inner_value(std::unique_lock& session, - deref n, - bool unique, - node_type type, - string_view val) + database::id write_session::set_inner_value(session_rlock& state, + deref n, + bool unique, + node_type type, + string_view val) { if (unique) { if (auto old_value = n->value()) { - auto v = get_by_id(session, old_value); + auto v = state.get(old_value); // TODO copy to cache? 
auto& vn = v.as_value_node(); - if (v.type() == type && vn.data_size() == val.size() && ring().ref(old_value) == 1) + if (v.type() == type && vn.data_size() == val.size() && v.ref_count() == 1) { - modify_value(session, lock(deref(v)), val); - return n; + modify_value(state, deref(v), val); + assert(v.obj()->validate_checksum()); + return n.id(); } else { - ring().release(session, old_value); + v.release(); } } - object_id val_id = make_value(session, type, string_view(), val); - n.reload(ring(), session); - auto locked = lock(n); - locked->set_value(val_id); - return n; + object_id val_id = make_value_id(state, type, string_view(), val); + // This lock is necessary because we alloc above and n was deref + // before + lock(n)->set_value(val_id); + assert(n.obj()->validate_checksum()); + return n.id(); } else { - object_id new_val = make_value(session, type, string_view(), val); - n.reload(ring(), session); - auto result = clone_inner(session, n, *n, n->key(), 0, object_id{}, n->branches()); + object_id new_val = make_value_id(state, type, string_view(), val); + + auto result = + inner_node::clone(state, n.id(), &*n, n->key(), 0, object_id{}, n->branches()); result->set_value(new_val); - return result; + result.obj()->update_checksum(); + return result.id(); } } @@ -926,28 +1042,28 @@ namespace triedent * Given an existing tree node (root) add a new key/value under it and return the id * of the new node if a new node had to be allocated. 
*/ - inline database::id write_session::add_child(std::unique_lock& session, - id root, - bool unique, - node_type type, - string_view key, - string_view val, - int& old_size) + inline database::id write_session::add_child(session_rlock& state, + id root, + bool unique, + node_type type, + string_view key, + string_view val, + int& old_size) { if (not root) // empty case - return make_value(session, type, key, val); + return make_value_id(state, type, key, val); - auto n = get_by_id(session, root, unique); + auto n = get_by_id(state, root, unique); if (n.is_leaf_node()) // current root is value { auto& vn = n.as_value_node(); if (vn.key() != key) - return combine_value_nodes(session, n.type(), vn.key(), vn.data(), root, type, key, val, + return combine_value_nodes(state, n.type(), vn.key(), vn.data(), root, type, key, val, object_id{}); else { old_size = vn.data_size(); - return set_value(session, n, unique, type, key, val); + return set_value(state, n, unique, type, key, val); } } @@ -957,8 +1073,8 @@ namespace triedent if (in_key == key) // whose prefix is same as key, therefore set the value { if (in->value()) - old_size = get_by_id(session, in->value()).as_value_node().data_size(); - return set_inner_value(session, n, unique, type, val); + old_size = state.get(in->value()).as_value_node().data_size(); + return set_inner_value(state, n, unique, type, val); } // key should be the first argument, because (unlike in_key) @@ -972,29 +1088,32 @@ namespace triedent { object_id cur_b = in->has_branch(b) ? 
in->branch(b) : object_id{}; auto new_b = - add_child(session, cur_b, false, type, key.substr(cpre.size() + 1), val, old_size); - in.reload(ring(), session); - auto new_in = clone_inner(session, root, *in, in->key(), 0, in->value(), - in->branches() | 1ull << b); + add_child(state, cur_b, false, type, key.substr(cpre.size() + 1), val, old_size); + + auto new_in = inner_node::clone(state, root, &*in, in->key(), 0, in->value(), + in->branches() | 1ull << b); if (new_b != cur_b) { new_in->branch(b) = new_b; - release(session, cur_b); + release(state, cur_b); } - return new_in; + new_in.obj()->update_checksum(); + return new_in.id(); } // else modify in place auto cur_b = in->branch(b); auto new_b = - add_child(session, cur_b, unique, type, key.substr(cpre.size() + 1), val, old_size); + add_child(state, cur_b, unique, type, key.substr(cpre.size() + 1), val, old_size); if (new_b != cur_b) { - in.reload(ring(), session); - lock(in)->branch(b) = new_b; - release(session, cur_b); + { + auto li = lock(in); + li->branch(b) = new_b; + } + release(state, cur_b); } return root; } @@ -1004,15 +1123,17 @@ namespace triedent { auto b1 = in_key[cpre.size()]; // MUST convert to id to release the location_lock - id b1val = - clone_inner(session, in, *in, in_key, cpre.size() + 1, in->value(), in->branches()); - id b0val = make_value(session, type, string_view(), val); + id b1val = clone_inner_id(state, in.id(), *in, in_key, cpre.size() + 1, in->value(), + in->branches()); + id b0val = make_value_id(state, type, string_view(), val); - auto nin = make_inner(session, cpre, object_id{}, inner_node::branches(b1)); + auto nin = inner_node::make(state, cpre, object_id{}, inner_node::branches(b1)); // Set separately because we don't need to inc ref - nin->set_value(b0val); - nin->branch(b1) = b1val; - return nin; + auto& ninr = *nin; + ninr.set_value(b0val); + ninr.branch(b1) = b1val; + nin.obj()->update_checksum(); + return nin.id(); } else // there are two branches { @@ -1021,17 +1142,19 @@ 
namespace triedent auto b1key = key.substr(cpre.size() + 1); // Handle sub first, because b2key is invalidated by allocation. // cpre and b1key are safe because they point into key, which is externally owned - id sub = - clone_inner(session, in, *in, in_key, cpre.size() + 1, in->value(), in->branches()); - id b1val = make_value(session, type, b1key, val); - auto nin = make_inner(session, cpre, id(), inner_node::branches(b1, b2)); + id sub = clone_inner_id(state, in.id(), *in, in_key, cpre.size() + 1, in->value(), + in->branches()); + id b1val = make_value_id(state, type, b1key, val); + + auto nin = inner_node::make(state, cpre, id(), inner_node::branches(b1, b2)); assert(not nin->branch(b1)); nin->branch(b1) = b1val; assert(not nin->branch(b2)); nin->branch(b2) = sub; + nin.obj()->update_checksum(); - return nin; + return nin.id(); } } } // write_session::add_child @@ -1040,14 +1163,15 @@ namespace triedent std::span key, std::span val) { - std::unique_lock l(*this); + auto state = session_base::lock(); int old_size = -1; auto new_root = - add_child(l, get_id(r), get_unique(r), node_type::bytes, + add_child(state, get_id(r), false & get_unique(r), node_type::bytes, to_key6({key.data(), key.size()}), {val.data(), val.size()}, old_size); assert(new_root.id); - update_root(l, r, new_root); + assert(state.get(new_root).obj()->validate_checksum()); + update_root(state, r, new_root); return old_size; } @@ -1055,19 +1179,19 @@ namespace triedent std::span key, std::span> roots) { - std::unique_lock l(*this); + auto state = session_base::lock(); std::vector ids; ids.reserve(roots.size()); for (auto& r : roots) - ids.push_back(retain(l, get_id(r))); + ids.push_back(retain(state, get_id(r))); int old_size = -1; auto new_root = add_child( - l, get_id(r), get_unique(r), node_type::roots, to_key6({key.data(), key.size()}), + state, get_id(r), get_unique(r), node_type::roots, to_key6({key.data(), key.size()}), {reinterpret_cast(ids.data()), ids.size() * sizeof(object_id)}, 
old_size); assert(new_root.id); - update_root(l, r, new_root); + update_root(state, r, new_root); return old_size; } @@ -1087,14 +1211,14 @@ namespace triedent std::vector* result_bytes, std::vector>* result_roots) const { - swap_guard g(*this); - return unguarded_get(g, r, get_id(r), to_key6({key.data(), key.size()}), result_bytes, + auto state = session_base::lock(); + return unguarded_get(state, r, get_id(r), to_key6({key.data(), key.size()}), result_bytes, result_roots); } template bool session::unguarded_get( - session_lock_ref<> l, + session_rlock& l, const std::shared_ptr& ancestor, object_id root, std::string_view key, @@ -1183,32 +1307,32 @@ namespace triedent std::vector* result_bytes, std::vector>* result_roots) const { - swap_guard g(*this); - std::vector result_key6; - if (!unguarded_get_greater_equal(g, r, get_id(r), to_key6({key.data(), key.size()}), + auto state = session_base::lock(); + temp_key6 result_key6; + if (!unguarded_get_greater_equal(state, r, get_id(r), to_key6({key.data(), key.size()}), result_key6, result_bytes, result_roots)) return false; if (result_key) { - auto s = from_key6({result_key6.data(), result_key6.size()}); - result_key->assign(s.begin(), s.end()); + from_key6({result_key6.data(), result_key6.size()}, *result_key); } return true; } template bool session::unguarded_get_greater_equal( - session_lock_ref<> l, + session_rlock& state, const std::shared_ptr& ancestor, object_id root, std::string_view key, - std::vector& result_key, + temp_key6& result_key, std::vector* result_bytes, std::vector>* result_roots) const { if (!root) return false; - auto n = get_by_id(l, root); + auto n = state.get(root); + cache(n); if (n.is_leaf_node()) { auto& vn = n.as_value_node(); @@ -1236,7 +1360,7 @@ namespace triedent } else if (in.value()) { - auto v = get_by_id(l, in.value()); + auto v = state.get(in.value()); //get_by_id(l, in.value()); auto& vn = v.as_value_node(); return fill_result(ancestor, vn, v.type(), result_bytes, result_roots); 
} @@ -1249,8 +1373,8 @@ namespace triedent return false; auto rk = result_key.size(); result_key.push_back(b); - if (unguarded_get_greater_equal(l, ancestor, in.branch(b), key, result_key, result_bytes, - result_roots)) + if (unguarded_get_greater_equal(state, ancestor, in.branch(b), key, result_key, + result_bytes, result_roots)) return true; result_key.resize(rk); b = in.lower_bound(b + 1); @@ -1265,11 +1389,13 @@ namespace triedent std::vector* result_bytes, std::vector>* result_roots) const { - swap_guard g(*this); - std::vector result_key6; - if (!unguarded_get_less_than(g, r, get_id(r), to_key6({key.data(), key.size()}), result_key6, - result_bytes, result_roots)) - return false; + temp_key6 result_key6; + { // scope the lock as narrow as possible + auto state = session_base::lock(); + if (!unguarded_get_less_than(state, r, get_id(r), to_key6({key.data(), key.size()}), + result_key6, result_bytes, result_roots)) + return false; + } if (result_key) { auto s = from_key6({result_key6.data(), result_key6.size()}); @@ -1280,24 +1406,26 @@ namespace triedent template bool session::unguarded_get_less_than( - session_lock_ref<> l, + session_rlock& l, const std::shared_ptr& ancestor, object_id root, std::optional key, - std::vector& result_key, + temp_key6& result_key, std::vector* result_bytes, std::vector>* result_roots) const { if (!root) return false; auto n = get_by_id(l, root); + cache(n); if (n.is_leaf_node()) { auto& vn = n.as_value_node(); auto vn_key = vn.key(); if (key && vn_key >= *key) return false; - result_key.insert(result_key.end(), vn_key.begin(), vn_key.end()); + //result_key.insert(result_key.end(), vn_key.begin(), vn_key.end()); + result_key.append(vn_key.begin(), vn_key.end()); return fill_result(ancestor, vn, n.type(), result_bytes, result_roots); } auto& in = n.as_inner_node(); @@ -1316,7 +1444,8 @@ namespace triedent else key = std::nullopt; } - result_key.insert(result_key.end(), in_key.begin(), in_key.end()); + 
//result_key.insert(result_key.end(), in_key.begin(), in_key.end()); + result_key.append(in_key.begin(), in_key.end()); auto b = in.reverse_lower_bound(last_b); if (b < last_b) key = std::nullopt; @@ -1349,16 +1478,19 @@ namespace triedent std::vector* result_bytes, std::vector>* result_roots) const { - swap_guard g(*this); - auto prefix_min = to_key6({prefix.data(), prefix.size()}); - auto extra_bits = prefix_min.size() * 6 - prefix.size() * 8; - auto prefix_max = (std::string)prefix_min; + auto prefix_min = to_key6({prefix.data(), prefix.size()}); + auto extra_bits = prefix_min.size() * 6 - prefix.size() * 8; + auto prefix_max = (std::string)prefix_min; if (!prefix_max.empty()) prefix_max.back() |= (1 << extra_bits) - 1; - std::vector result_key6; - if (!unguarded_get_max(g, r, get_id(r), prefix_min, prefix_max, result_key6, result_bytes, - result_roots)) - return false; + temp_key6 result_key6; + + { + auto state = session_base::lock(); + if (!unguarded_get_max(state, r, get_id(r), prefix_min, prefix_max, result_key6, + result_bytes, result_roots)) + return false; + } if (result_key) { auto s = from_key6({result_key6.data(), result_key6.size()}); @@ -1369,12 +1501,12 @@ namespace triedent template bool session::unguarded_get_max( - session_lock_ref<> l, + session_rlock& l, const std::shared_ptr& ancestor, object_id root, std::string_view prefix_min, std::string_view prefix_max, - std::vector& result_key, + temp_key6& result_key, std::vector* result_bytes, std::vector>* result_roots) const { @@ -1384,6 +1516,7 @@ namespace triedent while (true) { auto n = get_by_id(l, root); + cache(n); if (n.is_leaf_node()) { auto& vn = n.as_value_node(); @@ -1427,25 +1560,24 @@ namespace triedent inline int write_session::remove(std::shared_ptr& r, std::span key) { - std::unique_lock l(*this); - int removed_size = -1; - auto new_root = remove_child(l, get_id(r), get_unique(r), to_key6({key.data(), key.size()}), - removed_size); - update_root(l, r, new_root); + auto state = 
session_base::lock(); + auto new_root = remove_child(state, get_id(r), get_unique(r), + to_key6({key.data(), key.size()}), removed_size); + update_root(state, r, new_root); return removed_size; } - inline database::id write_session::remove_child(std::unique_lock& session, - id root, - bool unique, - string_view key, - int& removed_size) + inline database::id write_session::remove_child(session_rlock& state, + id root, + bool unique, + string_view key, + int& removed_size) { if (not root) return root; - auto n = get_by_id(session, root, unique); + auto n = get_by_id(state, root, unique); if (n.is_leaf_node()) // current root is value { auto& vn = n.as_value_node(); @@ -1468,12 +1600,12 @@ namespace triedent auto iv = in->value(); if (not iv) return root; - removed_size = get_by_id(session, iv).as_value_node().data_size(); + removed_size = get_by_id(state, iv).as_value_node().data_size(); if (in->num_branches() == 1) { char b = std::countr_zero(in->branches()); - auto bn = get_by_id(session, *in->children()); + auto bn = get_by_id(state, *in->children()); std::string new_key; new_key += in_key; new_key += b; @@ -1483,26 +1615,29 @@ namespace triedent auto& vn = bn.as_value_node(); new_key += vn.key(); // TRIEDENT_DEBUG( "clone value" ); - return clone_value(session, bn, bn.type(), new_key, vn.data()); + return clone_value_id(state, bn.id(), bn.type(), new_key, vn.data()); } else { auto& bin = bn.as_inner_node(); new_key += bin.key(); // TRIEDENT_DEBUG( "clone inner " ); - return clone_inner(session, bn, bin, new_key, bin.value(), bin.branches()); + return clone_inner_id(state, bn.id(), bin, new_key, bin.value(), bin.branches()); } } if (unique) { auto prev = in->value(); - lock(in)->set_value(id()); - release(session, prev); + { + auto lin = lock(in); + lin->set_value(id()); + } + release(state, prev); return root; } else - return clone_inner(session, in, *in, key, 0, id(), in->branches()); + return clone_inner_id(state, in.id(), *in, key, 0, id(), in->branches()); } 
auto cpre = common_prefix(in_key, key); @@ -1515,25 +1650,26 @@ namespace triedent object_id cur_b = in->branch(b); - auto new_b = - remove_child(session, cur_b, unique, key.substr(in_key.size() + 1), removed_size); + auto new_b = remove_child(state, cur_b, unique, key.substr(in_key.size() + 1), removed_size); if (new_b != cur_b) { - in.reload(ring(), session); if (new_b and unique) { - lock(in)->branch(b) = new_b; - release(session, cur_b); + { + auto lin = lock(in); + lin->branch(b) = new_b; + } + release(state, cur_b); return root; } if (new_b) // update branch { auto new_root = - clone_inner(session, in, *in, in->key(), 0, in->value(), in->branches()); + inner_node::clone(state, in.id(), &*in, in->key(), 0, in->value(), in->branches()); auto& new_br = new_root->branch(b); - release(session, new_br); + release(state, new_br); new_br = new_b; - return new_root; + return new_root.id(); } else // remove branch { @@ -1541,7 +1677,7 @@ namespace triedent if (std::popcount(new_branches) + bool(in->value()) > 1) { // multiple branches remain, nothing to merge up, just realloc without branch // TRIEDENT_WARN( "clone without branch" ); - return clone_inner(session, in, *in, in->key(), 0, in->value(), new_branches); + return clone_inner_id(state, in.id(), *in, in->key(), 0, in->value(), new_branches); } if (not new_branches) { @@ -1551,11 +1687,11 @@ namespace triedent // in this case, not branches means it must have a value assert(in->value() and "expected value because we removed a branch"); - auto cur_v = get_by_id(session, in->value()); + auto cur_v = state.get(in->value()); //get_by_id(state, in->value()); auto& cv = cur_v.as_value_node(); // make a copy because key and data come from different objects, which clone doesn't handle. 
std::string new_key{in->key()}; - return clone_value(session, cur_v, cur_v.type(), new_key, cv.data()); + return clone_value_id(state, cur_v.id(), cur_v.type(), new_key, cv.data()); } else { // there must be only 1 branch left @@ -1564,7 +1700,7 @@ namespace triedent auto lb = std::countr_zero(in->branches() ^ inner_node::branches(b)); auto& last_branch = in->branch(lb); // the one branch is either a value or a inner node - auto cur_v = get_by_id(session, last_branch); + auto cur_v = get_by_id(state, last_branch); if (cur_v.is_leaf_node()) { auto& cv = cur_v.as_value_node(); @@ -1572,7 +1708,7 @@ namespace triedent new_key += in->key(); new_key += char(lb); new_key += cv.key(); - return clone_value(session, cur_v, cur_v.type(), new_key, cv.data()); + return clone_value_id(state, cur_v.id(), cur_v.type(), new_key, cv.data()); } else { @@ -1581,7 +1717,7 @@ namespace triedent new_key += in->key(); new_key += char(lb); new_key += cv.key(); - return clone_inner(session, cur_v, cv, new_key, cv.value(), cv.branches()); + return clone_inner_id(state, cur_v.id(), cv, new_key, cv.value(), cv.branches()); } } } @@ -1598,8 +1734,8 @@ namespace triedent template void session::validate(const std::shared_ptr& r) { - swap_guard l{*this}; - validate(l, get_id(r)); + auto state = session_base::lock(); + validate(state, get_id(r)); } template @@ -1661,26 +1797,26 @@ namespace triedent std::lock_guard lock(_db->_root_change_mutex); id = {_db->_dbm->top_root.load()}; } - swap_guard l{*this}; - recursive_retain(l, id); + auto state = session_base::lock(); + recursive_retain(state, id); } - inline void write_session::recursive_retain(session_lock_ref<> l, id r) + inline void write_session::recursive_retain(session_rlock& state, id r) { if (not r) return; - if (!ring().gc_retain(r)) - return; // retaining this node indirectly retains all children + auto dr = state.get(r); + if (not dr.retain()) + return; - auto dr = get_by_id(l, r); if (dr.type() == node_type::inner) { auto& in = 
dr.as_inner_node(); - recursive_retain(l, in.value()); + recursive_retain(state, in.value()); for (auto child : std::span{in.children(), in.num_branches()}) { - recursive_retain(l, child); + recursive_retain(state, child); } } else if (dr.type() == node_type::roots) @@ -1688,18 +1824,20 @@ namespace triedent auto& rt = dr.as_value_node(); for (auto child : std::span{rt.roots(), rt.num_roots()}) { - recursive_retain(l, child); + recursive_retain(state, child); } } } inline void write_session::start_collect_garbage() { - ring().gc_start(); + throw std::runtime_error("not impl yet"); + //ring().gc_start(); } inline void write_session::end_collect_garbage() { - ring().gc_finish(); + throw std::runtime_error("not impl yet"); + //ring().gc_finish(); } template @@ -1713,111 +1851,49 @@ namespace triedent } template - void session::validate(session_lock_ref<> l, id r) + void session::validate(session_rlock& state, id r) { if (not r) return; auto validate_id = [&](auto i) { - ring().validate(r); - if (0 == ring().ref(r)) + auto rv = state.validate(r); + if (0 == rv.ref_count()) throw std::runtime_error("found reference to object with 0 ref count: " + std::to_string(r.id)); }; validate_id(r); - auto dr = get_by_id(l, r); + auto dr = state.get(r); //get_by_id(state, r); if (not dr.is_leaf_node()) { auto& in = dr.as_inner_node(); - validate(l, in.value()); + validate(state, in.value()); auto* c = in.children(); auto* e = c + in.num_branches(); while (c != e) { - validate(l, *c); + validate(state, *c); ++c; } } } - inline key_type from_key6(const key_view sixb) + inline key_view session_base::to_key6(key_view v) const { - std::string out; - out.resize((sixb.size() * 6) / 8); - - const uint8_t* pos6 = (uint8_t*)sixb.data(); - const uint8_t* pos6_end = (uint8_t*)sixb.data() + sixb.size(); - uint8_t* pos8 = (uint8_t*)out.data(); - - while (pos6_end - pos6 >= 4) - { - pos8[0] = (pos6[0] << 2) | (pos6[1] >> 4); // 6 + 2t - pos8[1] = (pos6[1] << 4) | (pos6[2] >> 2); // 4b + 4t - 
pos8[2] = (pos6[2] << 6) | pos6[3]; // 2b + 6 - pos6 += 4; - pos8 += 3; - } - switch (pos6_end - pos6) - { - case 3: - pos8[0] = (pos6[0] << 2) | (pos6[1] >> 4); // 6 + 2t - pos8[1] = (pos6[1] << 4) | (pos6[2] >> 2); // 4b + 4t - // pos8[2] = (pos6[2] << 6); // 2b + 6-0 - break; - case 2: - pos8[0] = (pos6[0] << 2) | (pos6[1] >> 4); // 6 + 2t - // pos8[1] = (pos6[1] << 4); // 4b + 4-0 - break; - case 1: - pos8[0] = (pos6[0] << 2); // 6 + 2-0 - break; - } - return out; + return triedent::to_key6(key_buf, v); } - inline key_view to_key6(key_type& key_buf, key_view v) - { - uint32_t bits = v.size() * 8; - uint32_t byte6 = (bits + 5) / 6; - - key_buf.resize(byte6); - - uint8_t* pos6 = (uint8_t*)key_buf.data(); - const uint8_t* pos8 = (uint8_t*)v.data(); - const uint8_t* pos8_end = (uint8_t*)v.data() + v.size(); - while (pos8_end - pos8 >= 3) - { - pos6[0] = pos8[0] >> 2; - pos6[1] = (pos8[0] & 0x3) << 4 | pos8[1] >> 4; - pos6[2] = (pos8[1] & 0xf) << 2 | (pos8[2] >> 6); - pos6[3] = pos8[2] & 0x3f; - pos8 += 3; - pos6 += 4; - } - - switch (pos8_end - pos8) + template + void session::cache(auto& objref) const + { + if (_db->_config.cache_on_read) { - case 2: - pos6[0] = pos8[0] >> 2; - pos6[1] = (pos8[0] & 0x3) << 4 | pos8[1] >> 4; - pos6[2] = (pos8[1] & 0xf) << 2; - break; - case 1: - pos6[0] = pos8[0] >> 2; - pos6[1] = (pos8[0] & 0x3) << 4; - break; - default: - break; + objref.cache_object(); } - return {key_buf.data(), key_buf.size()}; - } - inline key_view session_base::to_key6(key_view v) const - { - return triedent::to_key6(key_buf, v); } } // namespace triedent diff --git a/libraries/triedent/include/triedent/db.hpp b/libraries/triedent/include/triedent/db.hpp new file mode 100644 index 000000000..fc85a312f --- /dev/null +++ b/libraries/triedent/include/triedent/db.hpp @@ -0,0 +1,249 @@ +#pragma once +#include +#include +#include +#include + +namespace triedent +{ + + struct Status + { + bool ok = true; + }; + + /** + * This is the high-level interface through 
which the + * database should be accessed if you don't want to maintain + * multiple persistent snapshots. This interface is designed to + * operate with low-latency syncing between when a write transaction + * commits and the first read sees the change. + */ + class DB + { + public: + struct Options + { + bool create_if_missing = false; + bool error_if_exists = false; + database::config config; + }; + typedef std::shared_ptr root_ptr; + + /** + * Thread-local read session, used to start read transactions which + * all occur from the same state snapshot. + */ + class ReadSession + { + public: + class Transaction + { + public: + /** + * Span is any type that has a data() and size() method. + * e.g. std::string, std::vector, std::span + */ + template + Status get(const Span& key, std::vector* value) + { + return Status{.ok = _rs._rs->get(_root, {key.data(), key.size()}, value, nullptr)}; + } + + template + bool get_greater_equal(const Span& key, + std::vector* result_key, + std::vector* result_val = nullptr) + { + return _rs._rs->get_greater_equal(_root, {key.data(), key.size()}, result_key, + result_val); + } + + ~Transaction() {} + + private: + friend class ReadSession; + Transaction(ReadSession& s) : _rs(s), _root(s._db.getRoot()) {} + + ReadSession& _rs; + root_ptr _root; + }; // Transaction + + //auto startTransaction() { return std::make_shared(std::ref(*this)); } + auto startTransaction() { return std::shared_ptr(new Transaction(*this)); } + + ReadSession(DB& d) : _db(d) { _rs = _db._db->start_read_session(); } + + private: + friend class Transaction; + + std::shared_ptr _rs; + DB& _db; + + }; // ReadSession + + /** + * Only one write session can exist and it may only be called by a + * single thread. Writes are batched in WriteSession::Transactions and + * can be aborted before any reads see it. 
+ */ + class WriteSession + { + public: + class Transaction + { + public: + Status get(std::span key, std::vector& value); + Status put(std::span key, std::span value); + Status remove(std::span key); + + Status commit() + { + if (_root) + { + //_ws._db._root = _root; + _ws.setRoot(std::move(_root)); + return {}; + } + return {.ok = false}; + } + + Status abort() + { + _root.reset(); + return {}; + } + + ~Transaction() { commit(); } + + // KeySpan and ValueSpan can be any type that has a .data() and .size() method + // @return the old size if a key was replaced, otherwise 0 + template + int put(const KeySpan& key, const ValueSpan& value) + { + return _ws._ws->upsert(_root, {key.data(), key.size()}, + {value.data(), value.size()}); + } + + private: + friend class WriteSession; + Transaction(WriteSession& s) : _ws(s), _root(s._db._root) {} + + std::shared_ptr _root; + WriteSession& _ws; + }; // WriteSession::Transaction + + auto startTransaction() { return new Transaction(*this); } + + WriteSession(DB& d) : _db(d) + { + _ws = _db._db->start_write_session(); + _db._root = _ws->get_top_root(); + } + + void validate() { _ws->validate(); } + + private: + friend class Transaction; + friend class DB; + + void setRoot(std::shared_ptr r) + { + _ws->set_top_root(r); + _db.setRoot(std::move(r)); + } + + DB& _db; + std::shared_ptr _ws; + }; // WriteSession + + static std::shared_ptr open(Options opt, std::filesystem::path dir) + { + return std::make_shared(std::make_shared(dir.c_str(), opt.config, database::read_write)); + } + + DB(std::shared_ptr d) : _db(std::move(d)), _ws(*this) + { + _root = _ws._ws->get_top_root(); + _release_thread = std::thread([this]() { release_loop(); }); + } + + auto createReadSession() { return std::make_shared(std::ref(*this)); } + WriteSession& writeSession() { return _ws; } + + root_ptr getRoot() const + { + root_ptr tmp; + { + std::shared_lock m(_root_mutex); + tmp = _root; + } + return tmp; + } + ~DB() + { + _done = true; + 
_release_thread.join(); + _db->print_stats(std::cout, true); + } + + void print() { _db->print_stats(std::cout, true); } + bool compact() { return _db->compact_next_segment(); } + + private: // DB + void setRoot(root_ptr p) + { + { + std::unique_lock l(_root_mutex); + root_ptr tmp = _root; // delay release until unlock + _root = std::move(p); + std::unique_lock l2(_release_mutex); + _release_queue.push_back(std::move(tmp)); + // TODO: notify release thread + } + } + + std::shared_ptr _db; + WriteSession _ws; + + void release_loop() + { + while (not _done) + { + bool rest = false; + { + root_ptr tmp; + { + std::unique_lock l(_release_mutex); + if (not _release_queue.empty()) + { + tmp = _release_queue.front(); + _release_queue.pop_front(); + } + else + { + rest = true; + } + } + } + if (rest) + { + // TODO: wait conditiopn + using namespace std::chrono_literals; + std::this_thread::sleep_for(30ms); + } + } + // clean up + std::unique_lock l(_root_mutex); + _release_queue.clear(); + } + + std::atomic _done; + std::thread _release_thread; + std::list _release_queue; + mutable std::shared_mutex _root_mutex; + mutable std::shared_mutex _release_mutex; + root_ptr _root; + }; + +} // namespace triedent diff --git a/libraries/triedent/include/triedent/debug.hpp b/libraries/triedent/include/triedent/debug.hpp index f7cdcbe1d..8d975c834 100644 --- a/libraries/triedent/include/triedent/debug.hpp +++ b/libraries/triedent/include/triedent/debug.hpp @@ -4,11 +4,14 @@ #include #include //#include +// #undef NDEBUG +#include namespace triedent { static constexpr bool debug_cache = false; static constexpr bool debug_gc = false; + static constexpr bool debug_invariant = true; struct scope { @@ -22,7 +25,7 @@ namespace triedent } }; - inline const char* thread_name(const char* n = "default") + inline const char* thread_name(const char* n = nullptr) { static thread_local const char* thread_name = n; if (n) @@ -42,6 +45,7 @@ namespace triedent } inline auto set_current_thread_name( 
const char* name ) { + thread_name(name); #ifdef __APPLE__ return pthread_setname_np(name); #else diff --git a/libraries/triedent/include/triedent/gc_queue.hpp b/libraries/triedent/include/triedent/gc_queue.hpp index 0bcf26e9a..f57e499f0 100644 --- a/libraries/triedent/include/triedent/gc_queue.hpp +++ b/libraries/triedent/include/triedent/gc_queue.hpp @@ -103,14 +103,14 @@ namespace triedent // notify _queue_cond. static constexpr size_type wait_bit = ~(npos >> 1); friend class session; - std::mutex _session_mutex; - std::vector _sessions; - std::mutex _queue_mutex; - std::condition_variable _queue_cond; - std::atomic _end; - std::size_t _size; - std::vector> _queue; - bool _waiting; + alignas(64) std::mutex _session_mutex; + std::vector _sessions; + alignas(64) std::mutex _queue_mutex; + alignas(64) std::condition_variable _queue_cond; + alignas(64) std::atomic _end; + std::size_t _size; + std::vector> _queue; + bool _waiting; }; using gc_session = gc_queue::session; diff --git a/libraries/triedent/include/triedent/id_allocator.hpp b/libraries/triedent/include/triedent/id_allocator.hpp new file mode 100644 index 000000000..f18ce7d7b --- /dev/null +++ b/libraries/triedent/include/triedent/id_allocator.hpp @@ -0,0 +1,236 @@ +#pragma once +#include +#include +#include +#include + +#include + +namespace triedent +{ + + inline constexpr uint64_t obj_val(node_type type, uint16_t ref) + { + object_info result{0}; + // This is distinct from any valid offset + result._location = (1ull << object_info::location_rshift) - 1; + result._ref = ref; + result._type = static_cast(type); + return result.to_int(); + } + inline constexpr uint64_t free_val(uint64_t loc) + { + object_info result{0}; + // This is distinct from any valid offset + result._location = loc; + result._ref = 0; + result._type = static_cast(node_type::undefined); + return result.to_int(); + } + + /** + * Allocates object ids across multiple threads with + * minimal locking by simulating a hash table that grows 
+ * when collision rate gets too high. + * + * - alloc and free are thread safe and non-blocking except + * alloc will block if the load reaches 80% in order to grow + * the backing file. + * + * - free is constant time two atomic operations with no memory ordering requirments + * - alloc typically requires fetching 1 cache line and doing + * less than 8 loads and 1 C&S and 1 fetch add, but occassionally (0.1-.01%) + * may take 3 or 4 times as long. + * + * There are no memory ordering requirments because the object's value + * speaks for itself and is not gaurding other memory. + */ + class id_allocator + { + public: + static const uint32_t id_block_size = 1024 * 1024 * 128; + static_assert(id_block_size % 64 == 0, "should be divisible by cacheline"); + + inline static constexpr uint64_t extract_next_ptr(uint64_t x) { return (x >> 19); } + inline static constexpr uint64_t create_next_ptr(uint64_t x) { return (x << 19); } + /* + inline static constexpr uint64_t extract_next_ptr(uint64_t x) + { + // assert((x >> 15 & 3) == uint64_t(node_type::undefined)); + return (x & object_info::location_mask) >> object_info::location_rshift; + //return (x >> object_info::location_rshift) & object_info::location_mask; + } + inline static constexpr uint64_t create_next_ptr(uint64_t x) + { + auto r = (x << object_info::location_rshift) | (uint64_t(node_type::undefined) << 15); + assert( extract_next_ptr(r) == x ); + return r; + } + */ + + id_allocator(std::filesystem::path id_file) + : _data_dir(id_file), + _block_alloc(id_file, id_block_size, 8192 /*1TB*/), + _ids_header_file(id_file.native() + ".header", access_mode::read_write) + { + if (_ids_header_file.size() == 0) + { + _ids_header_file.resize(round_to_page(sizeof(ids_header))); + auto idh = new (_ids_header_file.data()) ids_header(); + idh->_next_alloc.store(1); + idh->_end_id.store(0); + idh->_first_free.store(object_info(node_type::undefined, 0).to_int()); + } + _idheader = reinterpret_cast(_ids_header_file.data()); + } + 
+ uint64_t get_capacity() const { return _idheader->_end_id.load(std::memory_order_relaxed); } + + std::atomic& get(object_id id) + { + auto abs_pos = id.id * sizeof(uint64_t); + auto block_num = abs_pos / id_block_size; + auto index_in_block = uint64_t(abs_pos) & uint64_t(id_block_size - 1); + auto ptr = ((char*)_block_alloc.get(block_num)) + index_in_block; + return reinterpret_cast&>(*ptr); + } + + /** + * The value stored at the returned object_id is equal to + * alloc_session::default_id_value which indicates undefined type with + * a reference count of 1. If you store 0 at this location the allocator + * will think it is free and invariants about load capacity will be broken. + */ + std::pair&, object_id> get_new_id() + { + // std::cerr << "get new id...\n"; + // std::cerr << " pre alloc free list: "; + // print_free_list(); + + auto brand_new = [&]() + { + object_id id{_idheader->_next_alloc.fetch_add(1, std::memory_order_relaxed)}; + grow(id); // ensure that there should be new id + + auto& atom = get(id); + atom.store(obj_val(node_type::undefined, 1), std::memory_order_relaxed); + + // std::cerr << " brand new id: " << id.id << "\n"; + return std::pair&, object_id>(atom, id); + }; + //auto r = brand_new(); + //std::cerr << "get new id: " << r.second.id << "\n"; + + std::unique_lock l{_alloc_mutex}; + uint64_t ff = _idheader->_first_free.load(std::memory_order_acquire); + do + { + if (extract_next_ptr(ff) == 0) + { + // std::cerr << "alloc brand new! 
\n"; + _alloc_mutex.unlock(); + l.release(); + return brand_new(); + } + } while (not _idheader->_first_free.compare_exchange_strong( + ff, get({extract_next_ptr(ff)}).load(std::memory_order_relaxed))); + + ff = extract_next_ptr(ff); + // std::cerr << " reused id: " << ff << "\n"; + auto& ffa = get({ff}); + // store 1 = ref count 1 prevents object as being interpreted as unalloc + ffa.store(obj_val(node_type::undefined, 1), std::memory_order_relaxed); + + // std::cerr << " post alloc free list: "; + // print_free_list(); + return {ffa, {ff}}; + } + + void print_free_list() + { + uint64_t id = extract_next_ptr(_idheader->_first_free.load()); + std::cerr << id; + while (id) + { + id = extract_next_ptr(get({id})); + std::cerr << ", " << id; + } + std::cerr << " END\n"; + } + + void free_id(object_id id) + { + auto& head_free_list = _idheader->_first_free; + auto& next_free = get(id); + auto new_head = object_info(node_type::undefined, id.id).to_int(); + + uint64_t cur_head = _idheader->_first_free.load(std::memory_order_acquire); + assert(not(cur_head & object_info::ref_mask)); + assert(not(next_free & object_info::ref_mask)); + do + { + next_free.store(cur_head, std::memory_order_release); + } while (not head_free_list.compare_exchange_weak(cur_head, new_head, std::memory_order_release)); + //print_free_list(); + } + + auto& get_mutex( object_id id ) { + return _locks[id.id&(8192-1)]; + } + + private: + friend class alloc_session; + + void grow(object_id id) + { + // optimistic... 
+ if ( id.id < + _idheader->_end_id.load(std::memory_order_relaxed)) + return; + + void* ptr; + { + std::lock_guard l{_grow_mutex}; + if (id.id < _idheader->_end_id.load()) + return; // no need to grow, another thread grew first + + // std::cerr << "growing obj id db\n"; + ptr = _block_alloc.get(_block_alloc.alloc()); + _idheader->_end_id.store(_block_alloc.num_blocks() * _block_alloc.block_size() / 8, std::memory_order_release); + } // don't hold lock while doing mlock + + if (::mlock(ptr, id_block_size)) + { + std::cerr << "WARNING: unable to mlock ID lookups\n"; + ::madvise(ptr, id_block_size, MADV_RANDOM); + } + } + + std::mutex _alloc_mutex; + std::mutex _grow_mutex; + std::filesystem::path _data_dir; + block_allocator _block_alloc; + + /** + * Mapped from disk to track meta data associated with the IDs + */ + struct ids_header + { + uint64_t _magic = 0; + uint64_t _block_size = id_block_size; + + std::atomic _next_alloc; /// the next new ID to be allocated + std::atomic _end_id; /// the first ID beyond the end of file + + // the lower 15 bits represent the alloc_session number of the last write + // the upper bits represent the index of the first free ID, the value + // stored at that index is the index of the next free ID or 0 if there + // are no unused ids available. 
+ std::atomic _first_free; /// index of an ID that has the index of the next ID + }; + + ids_header* _idheader; + mapping _ids_header_file; + std::mutex _locks[8192]; + }; +}; // namespace triedent diff --git a/libraries/triedent/include/triedent/key6.hpp b/libraries/triedent/include/triedent/key6.hpp new file mode 100644 index 000000000..39313aa22 --- /dev/null +++ b/libraries/triedent/include/triedent/key6.hpp @@ -0,0 +1,156 @@ +#pragma once +#include + +namespace triedent { + + using key_view = std::string_view; + using value_view = std::string_view; + using key_type = std::string; + using value_type = key_type; + + inline key_type from_key6(const key_view sixb); + + template + inline void from_key6(const key_view sixb, KeyType& out); + + // used to avoid malloc, because keys can be at most 256, + // this one change produced 13% improvement with 12 threads + struct temp_key6 + { + uint32_t _size = 0; + char _buffer[128]; + + uint32_t size() const { return _size; } + const char* begin() const { return _buffer; } + const char* end() const { return _buffer + _size; } + char* begin() { return _buffer; } + char* end() { return _buffer + _size; } + + void append(const char* p, const char* e) + { + int s = e - p; + if (_size + s > sizeof(_buffer)) + throw std::runtime_error("key length overflow"); + memcpy(end(), p, s); + _size += s; + } + void push_back(char c) + { + if (_size < sizeof(_buffer)) + { + *end() = c; + ++_size; + } + else + { + throw std::runtime_error("key length overflow"); + } + } + // Sets the logical size without touching the buffer contents. Any s up to and + // including sizeof(_buffer) fits, matching what append() and push_back() allow; + // larger values throw std::runtime_error. + void resize(uint32_t s) + { + if (s <= sizeof(_buffer)) + { + _size = s; + } + else + { + throw std::runtime_error("key length overflow"); + } + } + const char* data() const { return begin(); } + char* data() { return begin(); } + + void insert(char* pos, const char* begin, const char* end) + { + assert(pos >= _buffer and pos < _buffer + sizeof(_buffer)); + // assert((const char*)pos + end - begin < _buffer + sizeof(_buffer)); + memcpy(pos, begin, end - begin); + _size += end 
- begin; + } + + temp_key6() : _size(0) {} + + private: + temp_key6(const temp_key6&) = delete; // should not be copied + }; + + inline key_type from_key6(const key_view sixb) + { + key_type tmp; + from_key6(sixb, tmp); + return tmp; + } + + template + inline void from_key6(const key_view sixb, KeyType& out) + { + out.resize((sixb.size() * 6) / 8); + + const uint8_t* pos6 = (uint8_t*)sixb.data(); + const uint8_t* pos6_end = (uint8_t*)sixb.data() + sixb.size(); + uint8_t* pos8 = (uint8_t*)out.data(); + + while (pos6_end - pos6 >= 4) + { + pos8[0] = (pos6[0] << 2) | (pos6[1] >> 4); // 6 + 2t + pos8[1] = (pos6[1] << 4) | (pos6[2] >> 2); // 4b + 4t + pos8[2] = (pos6[2] << 6) | pos6[3]; // 2b + 6 + pos6 += 4; + pos8 += 3; + } + switch (pos6_end - pos6) + { + case 3: + pos8[0] = (pos6[0] << 2) | (pos6[1] >> 4); // 6 + 2t + pos8[1] = (pos6[1] << 4) | (pos6[2] >> 2); // 4b + 4t + // pos8[2] = (pos6[2] << 6); // 2b + 6-0 + break; + case 2: + pos8[0] = (pos6[0] << 2) | (pos6[1] >> 4); // 6 + 2t + // pos8[1] = (pos6[1] << 4); // 4b + 4-0 + break; + case 1: + pos8[0] = (pos6[0] << 2); // 6 + 2-0 + break; + } + } + inline key_view to_key6(key_type& key_buf, key_view v) + { + uint32_t bits = v.size() * 8; + uint32_t byte6 = (bits + 5) / 6; + + key_buf.resize(byte6); + + uint8_t* pos6 = (uint8_t*)key_buf.data(); + const uint8_t* pos8 = (uint8_t*)v.data(); + const uint8_t* pos8_end = (uint8_t*)v.data() + v.size(); + + while (pos8_end - pos8 >= 3) + { + pos6[0] = pos8[0] >> 2; + pos6[1] = (pos8[0] & 0x3) << 4 | pos8[1] >> 4; + pos6[2] = (pos8[1] & 0xf) << 2 | (pos8[2] >> 6); + pos6[3] = pos8[2] & 0x3f; + pos8 += 3; + pos6 += 4; + } + + switch (pos8_end - pos8) + { + case 2: + pos6[0] = pos8[0] >> 2; + pos6[1] = (pos8[0] & 0x3) << 4 | pos8[1] >> 4; + pos6[2] = (pos8[1] & 0xf) << 2; + break; + case 1: + pos6[0] = pos8[0] >> 2; + pos6[1] = (pos8[0] & 0x3) << 4; + break; + default: + break; + } + return {key_buf.data(), key_buf.size()}; + } + + +} diff --git 
a/libraries/triedent/include/triedent/lehmer64.h b/libraries/triedent/include/triedent/lehmer64.h new file mode 100644 index 000000000..401472082 --- /dev/null +++ b/libraries/triedent/include/triedent/lehmer64.h @@ -0,0 +1,88 @@ +#ifndef LEHMER64_H +#define LEHMER64_H + + +/** +* D. H. Lehmer, Mathematical methods in large-scale computing units. +* Proceedings of a Second Symposium on Large Scale Digital Calculating +* Machinery; +* Annals of the Computation Laboratory, Harvard Univ. 26 (1951), pp. 141-146. +* +* P L'Ecuyer, Tables of linear congruential generators of different sizes and +* good lattice structure. Mathematics of Computation of the American +* Mathematical +* Society 68.225 (1999): 249-260. +*/ +struct lehmer64_rng +{ + lehmer64_rng(uint64_t seed) + { + _lehmer64_state = + (((__uint128_t)splitmix64_stateless(seed, 0)) << 64) + splitmix64_stateless(seed, 1); + } + + uint64_t next() + { + _lehmer64_state *= 0xda942042e4dd58b5ull; + auto r = _lehmer64_state >> 64; + + _lehmer64_state = + (((__uint128_t)splitmix64_stateless(r, 0)) << 64) + splitmix64_stateless(r, 1); + + return r; + } + + private: + __uint128_t _lehmer64_state; + + // state for splitmix64 + uint64_t splitmix64_x; /* The state can be seeded with any value. */ + + // call this one before calling splitmix64 + inline void splitmix64_seed(uint64_t seed) { splitmix64_x = seed; } + + // floor( ( (1+sqrt(5))/2 ) * 2**64 MOD 2**64) + static const uint64_t golden_gamma = 0x9E3779B97F4A7C15ull; + + // returns random number, modifies seed[0] + // compared with D. 
Lemire against + // http://grepcode.com/file/repository.grepcode.com/java/root/jdk/openjdk/8-b132/java/util/SplittableRandom.java#SplittableRandom.0gamma + inline uint64_t splitmix64_r(uint64_t* seed) + { + uint64_t z = (*seed += golden_gamma); + // David Stafford's Mix13 for MurmurHash3's 64-bit finalizer + z = (z ^ (z >> 30)) * 0xBF58476D1CE4E5B9ull; + z = (z ^ (z >> 27)) * 0x94D049BB133111EBull; + return z ^ (z >> 31); + } + + // returns random number, modifies splitmix64_x + inline uint64_t splitmix64(void) { return splitmix64_r(&splitmix64_x); } + + // returns the 32 least significant bits of a call to splitmix64 + // this is a simple (inlined) function call followed by a cast + inline uint32_t splitmix64_cast32(void) { return (uint32_t)splitmix64(); } + + // returns the value of splitmix64 "offset" steps from seed + inline uint64_t splitmix64_stateless(uint64_t seed, uint64_t offset) + { + seed += offset * golden_gamma; + return splitmix64_r(&seed); + } +}; + +/* +static inline void lehmer64_seed(uint64_t seed) +{ + g_lehmer64_state = + (((__uint128_t)splitmix64_stateless(seed, 0)) << 64) + splitmix64_stateless(seed, 1); +} + +static inline uint64_t lehmer64() +{ + g_lehmer64_state *= UINT64_C(0xda942042e4dd58b5); + return g_lehmer64_state >> 64; +} +*/ + +#endif diff --git a/libraries/triedent/include/triedent/location_lock.hpp b/libraries/triedent/include/triedent/location_lock.hpp index b830985b0..d9403f6d0 100644 --- a/libraries/triedent/include/triedent/location_lock.hpp +++ b/libraries/triedent/include/triedent/location_lock.hpp @@ -107,7 +107,9 @@ namespace triedent (64 - sizeof(_waiting) - 2 * sizeof(_mutex)) / sizeof(object_id); object_id _locked_ids[max_locks]; }; - static_assert(sizeof(location_mutex) == 64); + // TODO: Why do we care about the size, moving the atomics to alignas(64) prevents + // false cacheline sharing... 
+ //static_assert(sizeof(location_mutex) == 64); class location_lock { diff --git a/libraries/triedent/include/triedent/mapping.hpp b/libraries/triedent/include/triedent/mapping.hpp index dd4a01484..36a416d66 100644 --- a/libraries/triedent/include/triedent/mapping.hpp +++ b/libraries/triedent/include/triedent/mapping.hpp @@ -4,6 +4,7 @@ #include #include #include +#include namespace triedent { @@ -14,6 +15,39 @@ namespace triedent read_write = 1 }; + /** + * For ACID **Durability** requirements this configures + * how aggressive triedent will be in flushing data to disk. + * + * 0. none - msync() will not be called and data will be + * lost if the computer crashes. So long as the OS + * doesn't crash your data is safe even if your + * program crashes. + * 1. async - msync(MS_ASYNC) will be used which will tell + * the OS to write as soon as possible without blocking + * the caller. This will write data frequently, and + * churn the SSD more than none. + * 2. sync - msync(MS_SYNC) will be used to block caller + * when they update the top-root. In this mode the + * database is "guaranteed" to be recoverable assuming + * the OS didn't silently buffer and the disks didn't + * silently buffer contrary to the implied behavior + * of msync() + * + */ + enum sync_type + { + none = 0, // on program close or as OS chooses + async = 1, // nonblocking, but write soon + sync = 2 // block until changes are committed to disk + }; + // none is implemented by specifying MS_ASYNC and MS_SYNC which will + // cause msync to fail if not checked. 
+ inline int msync_flag(sync_type st ) { + static int flags[] = { MS_ASYNC|MS_SYNC, MS_ASYNC, MS_SYNC }; + return flags[(int)st]; + }; + // Thread safety: // // The file must not be resized by another process @@ -48,6 +82,12 @@ namespace triedent std::size_t size() const { return _size; } bool pinned() const { return _pinned; } access_mode mode() const { return _mode; } + // Flushes the mapped range with msync() according to st; throws std::runtime_error + // when msync() reports failure. sync_type::none is a no-op: it has to be tested by + // value because msync_flag(none) is the non-zero MS_ASYNC|MS_SYNC combination. + void sync( sync_type st = sync_type::sync) { + if( st == sync_type::none ) return; + if( msync( data(), size(), msync_flag(st) ) ) { + throw std::runtime_error( "mapping.hpp: msync returned -1" ); + } + } private: std::atomic _data; diff --git a/libraries/triedent/include/triedent/node.hpp b/libraries/triedent/include/triedent/node.hpp index 01af6bf56..595072b76 100644 --- a/libraries/triedent/include/triedent/node.hpp +++ b/libraries/triedent/include/triedent/node.hpp @@ -1,6 +1,6 @@ #pragma once -#include #include +#include #include @@ -13,17 +13,13 @@ namespace triedent using key_type = std::string; using value_type = key_type; - object_id bump_refcount_or_copy(cache_allocator& ra, - std::unique_lock&, - object_id id); + using session_rlock = seg_allocator::session::read_lock; + template + using object_ref = session_rlock::object_ref; - class node - { - public: - inline uint8_t key_size() const { return (*reinterpret_cast(this)); } - }; + object_id bump_refcount_or_copy(session_rlock& state, object_id id); - class value_node : public node + class value_node { public: inline uint32_t key_size() const { return _key_size; } @@ -51,61 +47,56 @@ namespace triedent inline value_view data() const { return value_view(data_ptr(), data_size()); } inline key_view key() const { return key_view(key_ptr(), key_size()); } - inline static std::pair make( - cache_allocator& a, - std::unique_lock& session, - key_view key, - value_view val, - node_type type) + inline static object_ref make(session_rlock& state, + key_view key, + value_view val, + node_type type) { assert(val.size() < 0xffffff - key.size() - 
sizeof(value_node)); uint32_t alloc_size = sizeof(value_node) + key.size() + val.size(); - auto r = a.alloc(session, alloc_size, type); + auto r = state.alloc(alloc_size, type); if constexpr (debug_nodes) - std::cout << r.first.get_id().id << ": construct value_node: type=" << (int)type - << std::endl; - return std::make_pair(std::move(r.first), new (r.second) value_node(a, key, val)); + std::cout << r.id().id << ": construct value_node: type=" << (int)type + << " ref = " << r.ref_count() << std::endl; + new (r.data()) value_node(key, val); + return r; } // If id is non-null, it must refer to a source object that is being copied // Otherwise, key and value must be pointers to external memory - inline static std::pair clone( - cache_allocator& a, - std::unique_lock& session, - object_id id, - key_view key, - std::uint32_t key_offset, - value_view val, - node_type type) + inline static object_ref clone(session_rlock& state, + object_id id, + key_view key, + std::uint32_t key_offset, + value_view val, + node_type type) { if (id && type == node_type::roots) { - return clone_roots(a, session, id, key, key_offset, val, type); + return clone_roots(state, id, key, key_offset, val, type); } else { - return clone_bytes(a, session, id, key, key_offset, val, type); + return clone_bytes(state, id, key, key_offset, val, type); } } - inline static std::pair clone_bytes( - cache_allocator& a, - std::unique_lock& session, - object_id id, - key_view key, - std::uint32_t key_offset, - value_view val, - node_type type) + inline static object_ref clone_bytes(session_rlock& state, + object_id id, + key_view key, + std::uint32_t key_offset, + value_view val, + node_type type) { if (key_offset != std::uint32_t(-1)) key = key.substr(key_offset); assert(val.size() < 0xffffff - key.size() - sizeof(value_node)); uint32_t alloc_size = sizeof(value_node) + key.size() + val.size(); // alloc invalidates key and val - auto r = a.alloc(session, alloc_size, type); + auto r = state.alloc(alloc_size, 
type); if (id) { - auto ptr = get(a, session, id); + auto ptr = state.get(id).as(); if (key_offset != std::uint32_t(-1)) { key = ptr->key().substr(key_offset); @@ -113,18 +104,20 @@ namespace triedent val = ptr->data(); } if constexpr (debug_nodes) - std::cout << r.first.get_id().id << ": construct value_node: type=" << (int)type - << std::endl; - return std::make_pair(std::move(r.first), new (r.second) value_node(a, key, val)); + std::cout << r.id().id << ": construct value_node: type=" << (int)type << std::endl; + new (r.data()) value_node(key, val); + // r.obj()->update_checksum(); + return r; } - inline static std::pair clone_roots( - cache_allocator& a, - std::unique_lock& session, - object_id id, - key_view key, - std::uint32_t key_offset, - value_view val, - node_type type) + + // TODO: all clone functions should take object_ref instead of id to avoid + // having to look up the object twice! + inline static object_ref clone_roots(session_rlock& state, + object_id id, + key_view key, + std::uint32_t key_offset, + value_view val, + node_type type) { const std::size_t value_size = val.size(); if (key_offset != std::uint32_t(-1)) @@ -137,30 +130,32 @@ namespace triedent // copy_node or alloc invalidates key and val for (std::size_t i = 0; i < n; ++i) { - roots[i] = bump_refcount_or_copy(a, session, roots[i]); + roots[i] = bump_refcount_or_copy(state, roots[i]); } - auto r = a.alloc(session, alloc_size, type); + auto r = state.alloc(alloc_size, type); { if (key_offset != std::uint32_t(-1)) { - auto ptr = get(a, session, id); - key = ptr->key().substr(key_offset); + auto& in = state.get(id).as_value_node(); + key = in.key().substr(key_offset); } val = {reinterpret_cast(&roots[0]), value_size}; } if constexpr (debug_nodes) - std::cout << r.first.get_id().id << ": construct value_node: type=" << (int)type - << std::endl; - return std::make_pair(std::move(r.first), new (r.second) value_node(a, key, val)); + std::cout << r.id().id << ": construct value_node: type=" << 
(int)type << std::endl; + // construct the node in the freshly allocated storage, as make() and + // clone_bytes() do; without this the returned object is left uninitialized + new (r.data()) value_node(key, val); + return r; } private: - static value_node* get(cache_allocator& a, session_lock_ref<> session, object_id id) +#if 0 // this shouldn't be needed any more + static value_node* get(session_rlock& state, object_id id) { - auto [ptr, type, ref] = a.get_cache(session, id); - return reinterpret_cast(ptr); + //auto [ptr, type, ref] = a.get_cache(session, id); + auto val = state.get(id, false /* NO COPY */); + return reinterpret_cast(val.obj()); } - value_node(cache_allocator& ra, key_view key, value_view val) +#endif + value_node(key_view key, value_view val) { _key_size = key.size(); if (!key.empty()) @@ -173,7 +168,7 @@ namespace triedent }; static_assert(sizeof(value_node) == 1, "unexpected padding"); - class inner_node : public node + class inner_node { public: inline object_id& branch(uint8_t b); @@ -196,25 +191,22 @@ namespace triedent inline int8_t reverse_lower_bound(uint8_t b) const; inline uint8_t upper_bound(uint8_t b) const; - inline static std::pair clone( - cache_allocator& a, - std::unique_lock& session, - object_id id, - const inner_node* in, - key_view key, - std::uint32_t key_offset, - object_id value, - std::uint64_t branches); - - inline static std::pair make( - cache_allocator& a, - std::unique_lock& session, - key_view prefix, - object_id val, - uint64_t branches); + inline static object_ref clone(session_rlock& state, + object_id id, + const inner_node* in, + key_view key, + std::uint32_t key_offset, + object_id value, + std::uint64_t branches); + + inline static object_ref make(session_rlock& state, + key_view prefix, + object_id val, + uint64_t branches); inline bool has_branch(uint32_t b) const { return _present_bits & (1ull << b); } + inline uint8_t key_size() const { return _prefix_length; } inline key_view key() const { return key_view(key_ptr(), key_size()); } inline int32_t branch_index(uint32_t branch) const; @@ -227,11 +219,14 @@ namespace triedent } private: - static inner_node* get(cache_allocator& a, 
session_lock_ref<> session, object_id id) +#if 0 + static inner_node* get(session_rlock& state, object_id id) { - auto [ptr, type, ref] = a.get_cache(session, id); - return reinterpret_cast(ptr); + auto ptr = a.get(id, false); // TODO: why not copy here? + return reinterpret_cast(ptr.obj()); } +#endif + inner_node(object_id id, key_view prefix, object_id val, @@ -242,20 +237,19 @@ namespace triedent uint8_t _prefix_length = 0; // mirrors value nodes to signal type and prefix length uint8_t _reserved_a = 0; // future use uint8_t _reserved_b = 0; // future use + uint8_t _reserved_c = 0; // future use object_id _value; // this is 5 bytes uint64_t _present_bits = 0; // keep this 8 byte aligned for popcount instructions } __attribute__((packed)); - static_assert(sizeof(inner_node) == 3 + 5 + 8, "unexpected padding"); - - inline std::pair inner_node::clone( - cache_allocator& a, - std::unique_lock& session, - object_id id, - const inner_node* in, - key_view key, - std::uint32_t key_offset, - object_id value, - std::uint64_t branches) + static_assert(sizeof(inner_node) == 4 + sizeof(object_id) + 8, "unexpected padding"); + + inline object_ref inner_node::clone(session_rlock& state, + object_id id, + const inner_node* in, + key_view key, + std::uint32_t key_offset, + object_id value, + std::uint64_t branches) { if (key_offset != std::uint32_t(-1)) key = key.substr(key_offset); @@ -264,7 +258,7 @@ namespace triedent object_id children[n + 1]; if (in->_present_bits == branches) { - std::memcpy(&children[0], in->children(), sizeof(children)); + std::memcpy(&children[0], in->children(), n * sizeof(object_id)); } else { @@ -281,40 +275,42 @@ namespace triedent } } // invalidates in and prefix - value = bump_refcount_or_copy(a, session, value); + value = bump_refcount_or_copy(state, value); for (std::size_t i = 0; i < n; ++i) { - children[i] = bump_refcount_or_copy(a, session, children[i]); + children[i] = bump_refcount_or_copy(state, children[i]); } - auto p = a.alloc(session, 
alloc_size, node_type::inner); + auto p = state.alloc(alloc_size, node_type::inner); if (key_offset != std::uint32_t(-1)) { - in = get(a, session, id); + in = state.get(id).as(); + // TODO: cache? key = in->key().substr(key_offset); } - auto newid = p.first.get_id(); + auto newid = p.id(); if constexpr (debug_nodes) - std::cout << newid.id << ": construct inner_node" << std::endl; - return std::make_pair(std::move(p.first), - new (p.second) inner_node(newid, key, value, branches, children)); + std::cout << newid.id << ": construct inner_node " << std::endl; + + new (p.data()) inner_node(newid, key, value, branches, children); + return p; } - inline std::pair inner_node::make( - cache_allocator& a, - std::unique_lock& session, - key_view prefix, - object_id val, - uint64_t branches) + inline object_ref inner_node::make(session_rlock& state, + key_view prefix, + object_id val, + uint64_t branches) { uint32_t alloc_size = sizeof(inner_node) + prefix.size() + std::popcount(branches) * sizeof(object_id); - auto p = a.alloc(session, alloc_size, node_type::inner); - auto id = p.first.get_id(); + auto p = state.alloc(alloc_size, node_type::inner); + auto id = p.id(); if constexpr (debug_nodes) - std::cout << id.id << ": construct inner_node" << std::endl; - return std::make_pair(std::move(p.first), - new (p.second) inner_node(id, prefix, val, branches)); + std::cout << p.id().id << ": construct inner_node val=" << val.id + << " ref: " << p.ref_count() << std::endl; + + new (p.data()) inner_node(id, prefix, val, branches); + return p; } inline inner_node::inner_node(object_id id, key_view prefix, object_id val, uint64_t branches) @@ -395,17 +391,45 @@ namespace triedent return b >= 63 ? 
64 : std::countr_zero(_present_bits & mask); } - inline void release_node(session_lock_ref<> l, cache_allocator& ra, object_id obj) + // makes sure all nodes are reachable with a ref-count of 1+ + // TODO make sure all object hashes check out + inline bool validate_node(session_rlock& state, object_id obj, int depth = 0) { - if (!obj) - return; - auto [ptr, type] = ra.release(l, obj); - if (ptr && type == node_type::inner) + if (not obj.id) + { + return true; + } + auto oref = state.get(obj); // don't try to cache, we are releasing! + // + if (not oref.ref_count()) + { + throw std::runtime_error("0 ref count!"); + } + + if( not oref.obj()->validate_checksum() ) { + std::cout << obj.id <<": validate checkusm failed "<check <<" != " << oref.obj()->calculate_checksum() <<"\n"; + return false; + } + auto ctype = oref.type(); + auto& in = oref.as_inner_node(); + + bool error = false; + auto oj = oref.obj(); + if (oj->get_type() != oref.type()) + { + std::cerr << "obj: " << obj.id << " invariant violation id.type (" << (int)oref.type() + << ") and obj->type (" << (int)oj->get_type() << ") are not equal!\n"; + std::cerr << " obj->size: " << oj->size <<" id: " << oj->id<<" "; + std::cerr << " refc: " << oref.ref_count() <<" check: " << oj->check <<"\n"; + error = true; + } + + if (ctype == node_type::inner) { - auto& in = *reinterpret_cast(ptr); if constexpr (debug_nodes) - std::cout << obj.id << ": destroying; release value " << in.value().id << std::endl; - release_node(l, ra, in.value()); + std::cout << obj.id << ": validating; inner value " << in.value().id << std::endl; + if (not validate_node(state, in.value(), depth+1)) + return false; auto nb = in.num_branches(); auto pos = in.children(); auto end = pos + nb; @@ -413,57 +437,109 @@ namespace triedent { assert(*pos); if constexpr (debug_nodes) - std::cout << obj.id << ": destroying; release child " << pos->id << std::endl; - release_node(l, ra, *pos); + std::cout << obj.id << ": validating; inner child child " << 
pos->id << std::endl; + if (not validate_node(state, *pos, depth+1)) + return false; ++pos; } } - if (ptr && type == node_type::roots) + else if (ctype == node_type::roots) { - auto& vn = *reinterpret_cast(ptr); + auto& vn = reinterpret_cast(in); auto n = vn.num_roots(); auto roots = vn.roots(); while (n--) { if constexpr (debug_nodes) std::cout << obj.id << ": destroying; release root " << roots->id << std::endl; - release_node(l, ra, *roots++); + if (not validate_node(state, *roots++, depth+1)) + return false; + } + } + else if (ctype == node_type::bytes) + { + auto& vn = reinterpret_cast(in); + if( error ) + TRIEDENT_WARN( "value.key_size(): ", vn.key_size(), " data size: " , + vn.data_size(), " depth: ", depth ); + } + else + { + throw std::runtime_error("validating unknown node type"); + return false; + } + return true; + } + inline void release_node(session_rlock& state, object_id obj) + { + if (!obj) + return; + auto oref = state.get(obj); // don't try to cache, we are releasing! 
+ auto ctype = oref.type(); + + // std::cerr << "before release node: " << obj.id <<" type: " << (int)oref.type() <<" loc: " << oref.location()._offset <<" ref: " << oref.ref_count()<<"\n"; + + // save the pointer in advance, because if released oref will return null + // the in pointer is still valid for the duration of state + auto& in = oref.as_inner_node(); + if (oref.release()) + { + if (ctype == node_type::inner) + { + if constexpr (debug_nodes) + std::cout << obj.id << ": destroying; release value " << in.value().id << std::endl; + release_node(state, in.value()); + auto nb = in.num_branches(); + auto pos = in.children(); + auto end = pos + nb; + while (pos != end) + { + assert(*pos); + if constexpr (debug_nodes) + std::cout << obj.id << ": destroying; release child " << pos->id << std::endl; + release_node(state, *pos); + ++pos; + } + } + else if (ctype == node_type::roots) + { + auto& vn = reinterpret_cast(in); //oref.as_value_node(); + auto n = vn.num_roots(); + auto roots = vn.roots(); + while (n--) + { + if constexpr (debug_nodes) + std::cout << obj.id << ": destroying; release root " << roots->id << std::endl; + release_node(state, *roots++); + } } } } - inline location_lock copy_node(cache_allocator& ra, - std::unique_lock& session, - object_id id, - void* ptr, - node_type type) + inline object_ref copy_node(session_rlock& state, object_ref oref) { - if (type != node_type::inner) + if (oref.type() != node_type::inner) // value or roots { - auto src = reinterpret_cast(ptr); - auto [lock, dest] = value_node::clone(ra, session, id, src->key(), 0, src->data(), type); - return std::move(lock); + auto& src = oref.as_value_node(); + return value_node::clone(state, oref.id(), src.key(), 0, src.data(), oref.type()); } else { - auto src = reinterpret_cast(ptr); - auto [lock, dest] = - inner_node::clone(ra, session, id, src, src->key(), 0, src->value(), src->branches()); - return std::move(lock); + auto& src = oref.as_inner_node(); + return 
inner_node::clone(state, oref.id(), &src, src.key(), 0, src.value(), + src.branches()); } } - inline object_id bump_refcount_or_copy(cache_allocator& ra, - std::unique_lock& session, - object_id id) + inline object_id bump_refcount_or_copy(session_rlock& state, object_id id) { if (!id) return id; if constexpr (debug_nodes) std::cout << id.id << ": bump_refcount_or_copy" << std::endl; - if (ra.bump_count(id)) - return id; - auto [ptr, type, ref] = ra.get_cache(session, id); - return copy_node(ra, session, id, ptr, type).get_id(); + auto oref = state.get(id); // TODO cache? + if (oref.retain()) + return oref.id(); + return copy_node(state, oref).id(); } } // namespace triedent diff --git a/libraries/triedent/include/triedent/object_db.hpp b/libraries/triedent/include/triedent/object_db.hpp index 76c33e92e..120bf2bc9 100644 --- a/libraries/triedent/include/triedent/object_db.hpp +++ b/libraries/triedent/include/triedent/object_db.hpp @@ -10,7 +10,6 @@ #include #include #include -#include #include #include @@ -32,7 +31,6 @@ namespace triedent std::uint64_t offset() const { return _offset * 8; } constexpr object_info& set_location(object_location loc) { - cache = loc.cache; _offset = loc.offset / 8; return *this; } @@ -40,21 +38,9 @@ namespace triedent { return ref | (_type << 15) | (cache << 17) | (_offset << 19); } - constexpr operator object_location() const { return {.offset = _offset * 8, .cache = cache}; } + constexpr operator object_location() const { return {.offset = _offset * 8}; } }; - struct mutex_group - { - static constexpr std::size_t count = 64; - static constexpr std::size_t align = 64; - explicit mutex_group() : _items(new location_mutex[count]) {} - location_mutex& operator()(void* base, void* ptr) const - { - auto diff = reinterpret_cast(ptr) - reinterpret_cast(base); - return _items[(diff / align) % count]; - } - std::unique_ptr _items; - }; /** * Assignes unique ids to objects, tracks their reference counts, @@ -62,7 +48,7 @@ namespace triedent */ 
class object_db { - friend location_lock; + // friend location_lock; public: using object_id = triedent::object_id; @@ -92,34 +78,8 @@ namespace triedent return true; } - // A thread which holds a location_lock may: - // * Move the object to another location - // * Modify the object if it's not already exposed to reader threads - - // Only acquire the lock if id points to loc - location_lock lock(object_id id, object_location loc) - { - auto* h = header(); - auto& atomic = h->objects[id.id]; - // If the object has already been moved, don't bother locking - if (object_info info{atomic.load()}; info.ref != 0 && info == loc) - { - location_lock l{_location_mutexes(h, &atomic), id}; - if (object_info info{atomic.load()}; info.ref != 0 && info == loc) - { - return l; - } - } - return location_lock{}; - } - location_lock lock(object_id id) - { - auto* h = header(); - auto& atomic = h->objects[id.id]; - return location_lock{_location_mutexes(h, &atomic), id}; - } - void move(const location_lock& lock, object_location loc) + void move( object_location loc) { auto& atomic = header()->objects[lock.get_id().id]; auto obj = atomic.load(); @@ -130,6 +90,7 @@ namespace triedent debug(lock.get_id().id, "move"); } + /* bool compare_and_move(const location_lock& lock, object_location expected, object_location loc) @@ -149,6 +110,7 @@ namespace triedent } } } + */ // The id must not be accessible to any thread // besides the creator. 
@@ -161,7 +123,7 @@ namespace triedent atomic.store(info.to_int()); } - object_id alloc(std::unique_lock&, node_type type); + object_id alloc(node_type type); object_info release(object_id id); @@ -183,11 +145,14 @@ namespace triedent void gc_finish(); bool pinned() const { return _region.pinned(); } + + /* std::span span() const { std::lock_guard l{_region_mutex}; return {reinterpret_cast(_region.data()), _region.size()}; } + */ private: static constexpr uint64_t ref_count_mask = (1ull << 15) - 1; @@ -198,8 +163,8 @@ namespace triedent // 19-63 offset or next_ptr // clang-format off - static uint64_t extract_next_ptr(uint64_t x) { return x >> 15; } - static uint64_t create_next_ptr(uint64_t x) { return x << 15; } + static inline uint64_t extract_next_ptr(uint64_t x) { return x >> 15; } + static inline uint64_t create_next_ptr(uint64_t x) { return x << 15; } // clang-format on static uint64_t obj_val(node_type type, uint16_t ref) @@ -228,7 +193,7 @@ namespace triedent gc_queue& _gc; mapping _region; mutable std::mutex _region_mutex; - mutex_group _location_mutexes; + //mutex_group _location_mutexes; object_db_header* header() { return reinterpret_cast(_region.data()); } @@ -282,7 +247,7 @@ namespace triedent idfile.native()); } - inline object_id object_db::alloc(std::unique_lock& session, node_type type) + inline object_id object_db::alloc(node_type type) { std::lock_guard l{_region_mutex}; auto _header = header(); diff --git a/libraries/triedent/include/triedent/object_fwd.hpp b/libraries/triedent/include/triedent/object_fwd.hpp index aa62b5920..268a0c2f6 100644 --- a/libraries/triedent/include/triedent/object_fwd.hpp +++ b/libraries/triedent/include/triedent/object_fwd.hpp @@ -1,29 +1,183 @@ #pragma once +#include #include +#include + +#define XXH_INLINE_ALL +#include namespace triedent { + using segment_offset = uint32_t; /// offset pointer from base of segment + using segment_number = uint64_t; /// segment_offset / segment_size + + class node; + class 
value_node; + class inner_node; + + // must be a power of 2 + // size of the data segments data is allocated in + // the smaller this value, the more overhead there is in + // searching for segments to manage and the free list + // each thread will have a segment this size, so larger values + // may use more memory than necessary for idle threads + // max value: 4 GB due to type of segment_offset + static const uint64_t segment_size = 1024 * 1024 * 128; // 256mb + + /// object pointers can only address 48 bits + /// 128 TB limit on database size with 47 bits, this saves us + /// 8MB of memory relative to 48 bits in cases with less than 128 TB + static const uint64_t max_segment_count = (1ull << 47) / segment_size; + + /** + * An offset/8 from object_db_header::alloc_segments encoded + * as 5 bytes. This allows addressing of 8TB worth of object IDs which + * is way beyond what will fit in RAM of most computers, 32 bits would + * have only supported 32GB of object IDs which clearly fits within the + * RAM of many laptops. 
8 TB + */ struct object_id { - std::uint64_t id : 40 = 0; // obj id - explicit operator bool() const { return id != 0; } - friend bool operator==(object_id a, object_id b) = default; + uint64_t id : 40 = 0; // obj id + explicit operator bool() const { return id != 0; } + friend bool operator==(object_id a, object_id b) = default; } __attribute__((packed)) __attribute__((aligned(1))); static_assert(sizeof(object_id) == 5, "unexpected padding"); static_assert(alignof(object_id) == 1, "unexpected alignment"); enum class node_type : std::uint8_t { - inner, - bytes, - roots, + inner = 0, + bytes = 1, + roots = 2, + undefined = 3 }; + class object_info; struct object_location { - std::uint64_t offset : 48; - std::uint64_t cache : 2; - friend bool operator==(const object_location&, const object_location&) = default; + uint32_t segment() const { return _offset / segment_size; } + uint32_t index() const { return _offset & (segment_size - 1); } + + friend bool operator==(const object_location&, const object_location&) = default; + + friend class object_info; + uint64_t _offset : 48; + }; + + /** future replacement for object info, designed to + * get rid of the bit fields and unnecessary shifting/setting on construction + * so that this type can be used everywhere rather than manually twiddling bits + * all over the code that could get out of sync with the header + * + struct object_meta { + public: + explicit object_meta( uint64_t v = 0 ):_value(v){}; + object_meta& set_location( uint64_t loc ) { + assert( not loc & 0x7 ); + assert( (loc >> 3) == (loc / 8) ); + loc << (location_rshift-3); + value = (value & ~location_mask) | loc; + return *this; + } + object_meta& set_type( node_type type ) { + value = (value & ~type_mask ) | (uint64_t(type) << type_lshift); + } + uint32_t ref() { return _value & ref_mask; } + node_type type(){ return node_type( (_value & type_mask) >> type_lshift); } + uint64_t& data(){ return _value; } + private: + uint64_t _value; + }; + */ + + class 
object_info + { + public: + static const uint64_t ref_mask = 0x7fff; + static const uint64_t max_ref_count = ref_mask - 64; // allow some overflow bits for retain + static const uint64_t read_mask = 3 << 17; + static const uint64_t type_mask = 3 << 15; + static const uint64_t location_mask = ~(type_mask | read_mask | ref_mask); + static const uint32_t location_lshift = 45; + static const uint32_t location_rshift = 64 - location_lshift; + + explicit constexpr object_info(uint64_t x) + : _location(x >> location_rshift), + _read((x >> 17) & 3), + _type((x >> 15) & 3), + _ref(x & ref_mask) + { + } + object_info(node_type t, uint64_t loc = -1) : _type((int)t) + { + _ref = 0; + _read = 0; + _location = loc; + }; + + uint8_t read() const { return _read; } + uint32_t ref() const { return _ref; } + node_type type() const { return static_cast(_type); } + auto location() const { return object_location{_location * 8}; } + + void set_type(node_type t) { _type = (int)t; } + + // pre set location + constexpr object_info& set_location(const object_location& loc) + { + _location = loc._offset / 8; + return *this; + } + + constexpr uint64_t to_int() const + { + return _ref | (_type << 15) | (_read << 17) | (_location << 19); + } + constexpr operator object_location() const + { + return object_location{._offset = _location * 8}; + } + + //private: + friend class object_location; + uint64_t _ref : 15; + uint64_t _type : 2; + uint64_t _read : 2; + uint64_t _location : 45; }; + static_assert(sizeof(object_info) == sizeof(uint64_t), "unexpected padding"); + + struct object_header + { + uint32_t check = 0; // xxhash checksum of thre next size bytes + uint32_t type: 4; + uint32_t size: 28; + // size might not be a multiple of 8, next object is at data() + (size+7)&-8 + uint64_t unused: 24; // bytes of data, not including header + uint64_t id : 40; + + node_type get_type()const { return (node_type)type; } + void set_type( node_type t ) { type = (uint8_t) t; } + void set_id( object_id d ) { 
id = d.id; } + object_id get_id()const { return {id}; } + inline uint64_t data_size() const { return size; } + inline uint32_t data_capacity() const { return (size + 7) & -8; } + inline char* data() const { return (char*)(this + 1); } + + uint32_t calculate_checksum() { + return XXH3_64bits( &check+1, size + sizeof(object_header) - sizeof(check) ); + } + void update_checksum() { check = calculate_checksum(); } + bool validate_checksum() { return check == calculate_checksum(); } + + + // returns the end of data_capacity() cast as another object_header + inline object_header* next() const { return (object_header*)(((char*)this) + object_size()); } + + // capacity + sizeof(object_header) + inline uint32_t object_size() const { return data_capacity() + sizeof(object_header); } + }__attribute__((packed)) __attribute__((aligned(8))); + } // namespace triedent diff --git a/libraries/triedent/include/triedent/region_allocator.hpp b/libraries/triedent/include/triedent/region_allocator.hpp index f17cc4d26..f3ce153f8 100644 --- a/libraries/triedent/include/triedent/region_allocator.hpp +++ b/libraries/triedent/include/triedent/region_allocator.hpp @@ -28,6 +28,9 @@ namespace triedent access_mode mode, std::uint64_t initial_size = 64 * 1024 * 1024); ~region_allocator(); + + + // TODO: how is this a try? This is a do or hang/die trying? 
void* try_allocate(std::unique_lock& session, object_id id, std::uint32_t size, diff --git a/libraries/triedent/include/triedent/ring_allocator.hpp b/libraries/triedent/include/triedent/ring_allocator.hpp index 619da8dc9..4e04c3acd 100644 --- a/libraries/triedent/include/triedent/ring_allocator.hpp +++ b/libraries/triedent/include/triedent/ring_allocator.hpp @@ -16,16 +16,6 @@ namespace triedent { - struct object_header - { - // size might not be a multiple of 8, next object is at data() + (size+7)&-8 - uint64_t size : 24; // bytes of data, not including header - uint64_t id : 40; - - inline uint64_t data_size() const { return size; } - inline uint32_t data_capacity() const { return (size + 7) & -8; } - inline void* data() const { return (char*)(this + 1); } - }; // ring_allocator allocates memory from a single circular buffer. // The buffer is divided into three regions @@ -297,15 +287,12 @@ namespace triedent { uint64_t used_size = alloc_size(size); - std::unique_lock l{_free_mutex}; - if (check_contiguous_free_space(used_size)) + std::unique_lock l(_free_mutex,std::defer_lock); + if( l.try_lock() and check_contiguous_free_space(used_size) ) { return allocate_impl(size, used_size, id, init); } - else - { - return nullptr; - } + return nullptr; } template diff --git a/libraries/triedent/include/triedent/seg_allocator.hpp b/libraries/triedent/include/triedent/seg_allocator.hpp new file mode 100644 index 000000000..27bdcd0a5 --- /dev/null +++ b/libraries/triedent/include/triedent/seg_allocator.hpp @@ -0,0 +1,781 @@ +#pragma once +#include +#include +#include +#include + +/** + * @file seg_allocator.hpp + * + * Responsible for allocating large segments of memory (256MB), each + * segment in turn stores objects pointed to from the id_allocator. + * + * 1. Each thread has its own session and allocates to its own + * segment in an append-only manner. + * 2. 
Once an object has been written to a segment and its location + * exposed to the id_allocator it is considered immutable by + * the segment allocator; apps may still mutate it if they independently + * verify that only one thread is reading it and they lock the id while + * modifying it so that the gc thread doesn't try to compact it. + * - this should be unlikely because all modify in place operations + * occur with temporary, uncommitted data which will likely be in + * the active allocation segment where it won't be moved anyway + * 3. Once a segment is full, it is marked as read-only to the seg_allocator until + * its data is no longer referenced and the segment can be + * recycled. I.e. no new allocation will write over it. + * 4. When data is read, it is copied to the current alloc segment unless + * another thread locked it first. Once copied the + * item's location in the object db is updated and it is unlocked. + * + * No threads need to wait for the copy because the data in the old location and new location + * are identical and the reader already has a "lock" on the old location + * + * 5. A garbage-collector (GC) thread finds the most empty segment and moves + * all of the objects that remain to its own segment, then makes the + * segment available for reuse by other threads (once all threads have + * released the implied write lock) + * 6. the Object ID allocation system can be made thread safe by giving + * each "writing session" a "segment" of the object id space. Writers would + * only have to synchronize id allocation requests when their segments are full. + * + * Theory: + * a. data will be organized in the order it tends to be accessed in + * b. infrequently accessed data will be grouped together by GC + * c. the most-recent N segments can have their memory pinned + * d. madvise can be efficiently used to mark alloc segments for + * SEQ and/or pin them to memory. It can also mark segments as + * RANDOM or UNNEEDED to improve OS cache management.
+ * + * + */ + +namespace triedent +{ + /// index into meta[free_segment_index]._free_segment_number + using free_segment_index = uint64_t; + + // types that are memory mapped + namespace mapped_memory + { + + // meta data about each segment, + // stored in an array in allocator_header indexed by segment number + struct segment_meta + { + // returns the free space in bytes, and number of objects freed + std::pair get_free_space_and_objs() const + { + uint64_t v = _free_space_and_obj.load(std::memory_order_relaxed); + return std::make_pair(v >> 32, v & 0xffffffff); + } + + // notes that an object of size was freed + void free_object(uint32_t size) + { + uint64_t so = size; + so <<= 32; + so += 1; + _free_space_and_obj.fetch_add(so, std::memory_order_relaxed); + } + + // doesn't increment object count + void free(uint32_t size) + { + uint64_t so = size; + so <<= 32; + _free_space_and_obj.fetch_add(so, std::memory_order_relaxed); + } + + void clear() + { + _free_space_and_obj.store(0, std::memory_order_relaxed); + _last_sync_pos.store(segment_size, std::memory_order_relaxed); + } + + /// the total number of bytes freed by swap + /// or by being moved to other segments. 
+ std::atomic _free_space_and_obj; + std::atomic _last_sync_pos; // position of alloc pointer when last synced + }; + + /// should align on a page boundary + struct segment_header + { + // the next position to allocate data, only + // used by the thread that owns this segment and + // set to uint64_t max when this segment is ready + // to be marked read only to the seg_allocator + std::atomic _alloc_pos = 16; // sizeof(segment_header) + uint32_t + _age; // every time a segment is allocated it is assigned an age which aids in reconstruction + // used to calculate object density of segment header, + // to establish madvise + uint32_t _num_objects = 0; // inc on alloc + uint32_t _checksum = 0; // TODO + }; + static_assert(sizeof(segment_header) == 16); + + struct allocator_header + { + // when no segments are available for reuse, advance by segment_size + alignas(64) std::atomic alloc_ptr; // A below + alignas(64) std::atomic end_ptr; // E below + + // set to 0 just before exit, set to 1 when opening database + std::atomic clean_exit_flag; + std::atomic next_alloc_age = 0; + + // meta data associated with each segment, indexed by segment number + segment_meta seg_meta[max_segment_count]; + + // circular buffer described, big enough to hold every + // potentially allocated segment which is subseuently freed. + // + // |-------A----R1--R2---E-------------| max_segment_count + // + // A = alloc_ptr where recycled segments are used + // R* = session_ptrs last known recycled segment by each session + // E = end_ptr where the next freed segment is posted to be recycled + // Initial condition A = R* = E = 0 + // Invariant A <= R* <= E unless R* == -1 + // + // If A == min(R*) then we must ask block_alloc to create a new segment + // + // A, R*, and E are 64 bit numbers that count to infinity, the + // index in the buffer is A % max_segment_count which should be + // a simple bitwise & operation if max_segment_count is a power of 2. 
+ // The values between [A-E) point to recyclable segments assuming no R* + // is present. Values before A or E and after point to no valid segments + segment_number free_seg_buffer[max_segment_count]; + }; + + /// crash recovery: + /// 1. scan all segments to find those that were mid-allocation: + /// if a lot of free space, then swap them and push to free seg buffer + /// 2. Update reference counts on all objects in database + /// 3. ? pray ? + + } // namespace mapped_memory + + class seg_allocator + { + public: + // only 64 bits in bitfield used to allocate sessions + // only really require 1 per thread + static const uint32_t max_session_count = 64; + + seg_allocator(std::filesystem::path dir); + ~seg_allocator(); + + void dump(); + void sync(sync_type st = sync_type::sync); + void start_compact_thread(); + bool compact_next_segment(); + + class session + { + public: + /** + * Ensures the read-lock is released so segments can be recycled + * and ensures that all data access flows through a read_lock. 
+ * + * note: no mutexes are involved with this lock + */ + class read_lock + { + public: + template + class object_ref + { + public: + template + friend class object_ref; + + template + object_ref(object_ref p) + : _rlock(p._rlock), _id(p._id), _atom_loc(p._atom_loc), _cached(p._cached) + // _ptr(p._ptr) + { + // assert(_ptr == nullptr or (_ptr and (_ptr->id == _id.id))); + } + + object_id id() const { return _id; } + uint32_t ref_count() const { return _cached.ref(); } + node_type type() const { return _cached.type(); } + auto read() const { return _cached.read(); } + object_location location() const { return _cached.location(); } + + // return false if ref count overflow + bool retain(); + // return true if object is deleted + bool release(); + const object_header* obj() const; // TODO: rename header() + object_header* obj(); // TODO: rename header() + + char* data() + { + assert(obj()); + return obj()->data(); + } + + template + Type* as() + { + return reinterpret_cast(obj()->data()); + }; + template + const Type* as() const + { + return reinterpret_cast(obj()->data()); + }; + + explicit inline operator bool() const { return bool(id()); } + bool is_leaf_node() const { return type() != node_type::inner; } + inline auto& as_value_node() const { return *this->template as(); } + inline auto& as_inner_node() const { return *this->template as(); } + + inline const T* operator->() const { return this->template as(); } + inline T* operator->() { return this->template as(); } + inline const T& operator*() const { return *(this->template as()); } + inline T& operator*() { return *(this->template as()); } + + int64_t as_id() const { return _id.id; } + + auto loc() const { return _cached.location(); } + + auto& get_mutex() const { return _rlock._session._sega._id_alloc.get_mutex(_id); } + + // return false if object is released while atempting to move + bool move(object_location expect, object_location loc); + + bool cache_object(); + + void refresh() { _cached = 
object_info(_atom_loc.load(std::memory_order_acquire)); } + + protected: + friend class seg_allocator; + friend class seg_allocator::session; + + object_ref(seg_allocator::session::read_lock& rlock, + object_id id, + std::atomic& atom_loc) + : _rlock(rlock), + _atom_loc(atom_loc), + _cached(atom_loc.load(std::memory_order_acquire)), + _id(id) + { + // assert(_ptr == nullptr or (_ptr and (_ptr->id == _id.id))); + } + + seg_allocator::session::read_lock& _rlock; + std::atomic& _atom_loc; + object_info _cached; // cached read of atomic _atom_loc + object_id _id; + }; + + object_ref alloc(uint32_t size, node_type type); + + template + object_ref get(object_id id) + { + return object_ref(*this, id, _session._sega._id_alloc.get(id)); + } + + object_ref get(object_header*); + + // checks known invariants: + // id < max_id of id_allocator + // id points to obj that points back to it + // ref_count > 0 + // node_type is known and defined + // ptr is in a valid range + // others? + object_ref validate(object_id id) const + { + throw std::runtime_error("read_lock::validate not impl"); + } + + ~read_lock() { _session.release_read_lock(); } + + private: + friend class session; + template + friend class object_ref; + + object_header* get_object_pointer(object_location); + + read_lock(session& s) : _session(s) { _session.retain_read_lock(); } + session& _session; + }; + + // before any objects can be read, the session must note the + // current state of the free segment queue so that no segments that + // could be read while the return value of this method is in scope can + // be reused. 
+ read_lock lock() { return read_lock(*this); } + + ~session() + { + if (_session_num == -1) + return; + if (_alloc_seg_ptr) // not moved + { + if (segment_size - _alloc_seg_ptr->_alloc_pos >= sizeof(object_header)) + { + memset(((char*)_alloc_seg_ptr) + _alloc_seg_ptr->_alloc_pos, 0, + sizeof(object_header)); // mark last object + } + _sega._header->seg_meta[_alloc_seg_num].free(segment_size - + _alloc_seg_ptr->_alloc_pos); + _alloc_seg_ptr->_alloc_pos = uint32_t(-1); + _alloc_seg_num = -1ull; + } + _sega.release_session_num(_session_num); + } + + session(session&& mv) + : _sega(mv._sega), + _session_num(mv._session_num), + _alloc_seg_num(mv._alloc_seg_num), + _alloc_seg_ptr(mv._alloc_seg_ptr) + { + mv._session_num = -1; + } + + private: + friend class lock; + friend class seg_allocator; + + // copy E to R* + void retain_read_lock() + { + auto pt = _sega._session_ptrs[_session_num].load(std::memory_order_acquire); + if (pt == -1ull) + _sega._session_ptrs[_session_num].store( + _sega._header->end_ptr.load(std::memory_order_acquire), + std::memory_order_relaxed); + else // TODO: this may be ok, but if so then + throw std::runtime_error("attempt to double-lock"); + } + + // R* goes to inifinity and beyond + void release_read_lock() + { + assert(_sega._session_ptrs[_session_num] != -1ull); + _sega._session_ptrs[_session_num] = -1ull; + } + + session(seg_allocator& a, uint32_t ses_num) + : _session_num(ses_num), _alloc_seg_num(-1ull), _alloc_seg_ptr(nullptr), _sega(a) + { + } + + session() = delete; + session(const session&) = delete; + + /** + * alloc_data + * + */ + std::pair alloc_data(uint32_t size, object_id id, node_type t) + { + assert(size < segment_size - 16); + // A - if no segment get a new segment + if (not _alloc_seg_ptr or + _alloc_seg_ptr->_alloc_pos.load(std::memory_order_relaxed) > segment_size) + { + auto [num, ptr] = _sega.get_new_segment(); + _alloc_seg_num = num; + _alloc_seg_ptr = ptr; + 
_sega._header->seg_meta[_alloc_seg_num]._last_sync_pos.store( + 0, std::memory_order_relaxed); + } + + auto* sh = _alloc_seg_ptr; + auto rounded_size = (size + 7) & -8; + + auto cur_apos = sh->_alloc_pos.load(std::memory_order_relaxed); + auto spec_pos = uint64_t(cur_apos) + rounded_size; + auto free_space = segment_size - cur_apos; + + // B - if there isn't enough space, notify compactor go to A + if (spec_pos > (segment_size - sizeof(object_header))) + { + if (free_space >= sizeof(object_header)) + { + assert(cur_apos + sizeof(uint64_t) <= segment_size); + memset(((char*)sh) + cur_apos, 0, sizeof(object_header)); + } + _sega._header->seg_meta[_alloc_seg_num].free(segment_size - sh->_alloc_pos); + sh->_alloc_pos.store(uint32_t(-1), std::memory_order_release); + _alloc_seg_ptr = nullptr; + _alloc_seg_num = -1ull; + + return alloc_data(size, id, t); // recurse + } + + auto obj = ((char*)sh) + sh->_alloc_pos.load(std::memory_order_relaxed); + auto head = (object_header*)obj; + head->size = size - sizeof(object_header); + head->id = id.id; + head->set_type(t); + + auto new_alloc_pos = + rounded_size + sh->_alloc_pos.fetch_add(rounded_size, std::memory_order_relaxed); + sh->_num_objects++; + + auto loc = _alloc_seg_num * segment_size + cur_apos; + + return {object_location{loc}, obj}; + } + + uint32_t _session_num; // index into _sega's active sessions list + + segment_number _alloc_seg_num = -1ull; + mapped_memory::segment_header* _alloc_seg_ptr = nullptr; + + seg_allocator& _sega; + }; + + session start_session() { return session(*this, alloc_session_num()); } + + private: + friend class session; + std::optional cses; + + mapped_memory::segment_header* get_segment(segment_number seg) + { + return static_cast(_block_alloc.get(seg)); + } + + uint32_t alloc_session_num() + { + auto fs_bits = _free_sessions.load(std::memory_order_relaxed); + if (fs_bits == 0) + { + throw std::runtime_error("max of 64 sessions can be in use"); + } + auto fs = std::countr_zero(fs_bits); + 
auto new_fs_bits = fs_bits & ~(1 << fs); + + while (not _free_sessions.compare_exchange_weak(fs_bits, new_fs_bits)) + { + if (fs_bits == 0) + { + throw std::runtime_error("max of 64 sessions can be in use"); + } + fs = std::countr_zero(fs_bits); + new_fs_bits = fs_bits & ~(1 << fs); + } + // std::cerr << " alloc session bits: " << fs << " " <(new_fs_bits) << "\n"; + // std::cerr << " new fs bits: " << std::bitset<64>(new_fs_bits) << "\n"; + // _free_sessions.store(new_fs_bits); + return fs; + } + void release_session_num(uint32_t sn) { _free_sessions.fetch_or(uint64_t(1) << sn); } + + std::pair get_new_segment(); + + void compact_loop(); + void compact_segment(session& ses, uint64_t seg_num); + + /** + * This must be called via a session because the session is responsible + * for documenting what regions could be read + * + * All objects are const because they cannot be modified after being + * written. + */ + const object_header* get_object(object_location loc) const; + const object_header* get_object(object_id oid) const; + + /** + * After all writes are complete, and there is not enough space + * to allocate the next object the alloc_ptr gets set to MAX and + * the page gets + */ + void finalize_segment(segment_number); + + /** + * After all data has been removed from a segment + * - madvise free/don't need + * - add the segment number to the free segments at allocator_header::end_ptr + * - increment allocator_header::end_ptr + */ + void release_segment(segment_number); + + /** + * finds the most empty segment that is at least 25% empty + * - marks it for sequential access + * - scans it for remaining objects, moving them to a new region + * - releases segment + * - marks it as unneeded + * + * and moves its contents to + * a new segment owned by the gc thread th + */ + std::thread _compact_thread; + + // maps ids to locations + id_allocator _id_alloc; + + // allocates new segments + block_allocator _block_alloc; + + /** + * This is the highest the alloc_ptr is 
allowed to + * advance and equal to min value of thread_ptrs. + * + * Do not read directly, read via get_min_read_ptr() + */ + std::atomic _min_read_ptr = -1ull; // min(R*) + uint64_t get_min_read_ptr(); + + /** + * At the start of each access to the DB, + * a read thread must copy the end_ptr and store + * it in this array indexed by the thread number. When + * the thread is done accessing the data it will reset + * the pointer to max_int64. Each read pos is an index + * into _free_segments + * + * TODO: perhaps these need to be on their own cache line + * since different threads are writing to them, if so then + * we can store other session-local data on that cache line + * for free. + */ + std::atomic _session_ptrs[64]; // R* above + + // to allocate a new session in thread-safe way you + // load, find first non-zero bit, and attempt to set it via C&S, + // the index of the bit is the session id. + // Reverse the process to free a session + std::atomic _free_sessions = -1ull; + + std::atomic _done; + + mapping _header_file; + mapped_memory::allocator_header* _header; + }; + + template + inline object_header* seg_allocator::session::read_lock::object_ref::obj() + { + auto val = _atom_loc.load(std::memory_order_acquire); + + if( (val & object_info::ref_mask) == 0 ) { + return nullptr; + } + + object_location loc{._offset = 8 * ( val >> object_info::location_rshift)}; + auto ptr = _rlock.get_object_pointer(loc); + return ptr; + } + + template + inline const object_header* seg_allocator::session::read_lock::object_ref::obj() const + { + auto val = _atom_loc.load(std::memory_order_acquire); + + if( (val & object_info::ref_mask) == 0 ) { + return nullptr; + } + + object_location loc{._offset = 8 * ( val >> object_info::location_rshift)}; + auto ptr = _rlock.get_object_pointer(loc); + return ptr; + } + + template + using deref = seg_allocator::session::read_lock::object_ref; + + /** + * Holds a unique_lock that toggles a bit and prevents the + * underlying object from 
being moved or released while the lock
+    * is held.
+    */
+   template
+   struct mutable_deref : public deref
+   {
+      // Locks src.get_mutex() for the whole lifetime of this object.
+      mutable_deref(const deref& src) : deref(src), lock(src.get_mutex()) {}
+
+      /*
+      mutable_deref(std::unique_lock& m, const deref& src)
+          : deref(src), lock(m)
+      {
+      }
+      */
+
+      inline auto& as_value_node() const { return *this->template as(); }
+      inline auto& as_inner_node() const { return *this->template as(); }
+
+      // Mutable access: constness is cast away from what as() returns.
+      inline T* operator->() const { return const_cast(this->template as()); }
+      inline T& operator*() const { return const_cast(*this->template as()); }
+
+      // The destructor body runs before the `lock` member is destroyed, so the
+      // checksum is refreshed while the mutex is still held.
+      ~mutable_deref() { this->obj()->update_checksum(); }
+
+     private:
+      std::unique_lock lock;
+   };  // mutable_deref
+
+   /**
+    *  Atomically repoints this object at a new location.
+    *
+    *  @param expect_loc - the current location the caller thinks the object is at
+    *  @param loc - the new location the caller wants it to point at if and only
+    *    if the expected location hasn't changed.
+    *  @return true if the swap was made and the object still has a positive ref count
+    *
+    *  Note: _cached is overwritten with the candidate value on every attempt,
+    *  before the compare-exchange is known to have succeeded.
+    */
+   template
+   bool seg_allocator::session::read_lock::object_ref::move(object_location expect_loc,
+                                                            object_location loc)
+   {
+      uint64_t expected = _atom_loc.load(std::memory_order_acquire);
+      do
+      {
+         object_info ex(expected);
+         // Someone else already moved it, or it has been released: give up.
+         if (ex.location() != expect_loc or ex.ref() == 0)
+            return false;
+         _cached = ex.set_location(loc);
+         // On failure `expected` is reloaded by compare_exchange_weak and the
+         // checks above are repeated against the fresh value.
+      } while (not _atom_loc.compare_exchange_weak(expected, _cached.to_int(),
+                                                   std::memory_order_release));
+      return true;
+   }
+
+   /**
+    *  Adds one reference.
+    *
+    *  @return false (after undoing the increment) if the count observed before
+    *  the increment was already at or above object_info::max_ref_count,
+    *  true otherwise.
+    */
+   template
+   bool seg_allocator::session::read_lock::object_ref::retain()
+   {
+      auto prior = _atom_loc.fetch_add(1, std::memory_order_relaxed);
+      if ((prior & object_info::ref_mask) >= object_info::max_ref_count) [[unlikely]]
+      {
+         _atom_loc.fetch_sub(1, std::memory_order_relaxed);
+         return false;
+      }
+      // Retaining an object whose count was zero is a caller bug.
+      assert(prior & object_info::ref_mask);
+      return true;
+   }
+
+   /**
+    *  Drops one reference.
+    *
+    *  @return true if this call removed the last reference, in which case the
+    *  object header has been invalidated, the id returned to the id allocator
+    *  and the space reported to the segment metadata; false otherwise.
+    */
+   template
+   bool seg_allocator::session::read_lock::object_ref::release()
+   {
+      assert(ref_count() != 0);
+      assert(type() != node_type::undefined);
+
+      // acq_rel rather than relaxed: the release half orders this thread's
+      // earlier accesses to the object before the decrement, and the acquire
+      // half makes every other owner's accesses visible to whichever thread
+      // sees the count reach zero and goes on to invalidate the object below.
+      auto prior = _atom_loc.fetch_sub(1, std::memory_order_acq_rel);
+      if ((prior & object_info::ref_mask) > 1)
+         return false;
+
+      _cached  = object_info(prior - 1);
+      auto loc = _cached.location();
+      auto seg = loc.segment();
+
+      auto obj_ptr =
+          (object_header*)((char*)_rlock._session._sega._block_alloc.get(seg) + loc.index());
+      obj_ptr->set_type(node_type::undefined);
+
+      // signal to compactor that this data is no longer valid before
+      // we allow the ID to be reused.
+
+      // by touching this we are forcing pages to be written that were previously constant,
+      // but with recent changes to move() this check is almost redundant
+      obj_ptr->check = -1;  //TODO: does this prevent false invalid checksum in validate
+
+      // This ID can be reused almost immediately after calling this method
+      // which means this objref object is worthless to the caller
+      _rlock._session._sega._id_alloc.free_id(_id);
+      // NOTE(review): this credits data_capacity() while cache_object() credits
+      // object_size() for the same kind of event -- confirm which is intended.
+      _rlock._session._sega._header->seg_meta[seg].free_object(obj_ptr->data_capacity());
+
+      return true;
+   }
+
+   template
+   using object_ref = seg_allocator::session::read_lock::object_ref;
+
+   /**
+    *  Allocates a new id plus room for `size` bytes of payload and an
+    *  object_header, and returns a reference whose count starts at 1.
+    */
+   inline object_ref seg_allocator::session::read_lock::alloc(uint32_t size, node_type type)
+   {
+      assert(type != node_type::undefined);
+
+      auto [atom, id] = _session._sega._id_alloc.get_new_id();
+      auto [loc, ptr] = _session.alloc_data(size + sizeof(object_header), id, type);
+
+      // TODO: this could break if object_info changes
+      // Packs ref count 1, the type shifted by 15 and offset/8 shifted by 19;
+      // presumably mirrors object_info's bit layout -- confirm before changing either.
+      atom.store(1 | (uint64_t(type) << 15) | ((loc._offset / 8) << 19), std::memory_order_relaxed);
+
+      assert(object_ref(*this, id, atom).type() != node_type::undefined);
+      return object_ref(*this, id, atom);
+   }
+
+   /*
+   inline object_ref seg_allocator::session::read_lock::get(object_header* oh)
+   {
+      object_id oid(oh->id);
+      return object_ref(*this, oid, _session._sega._id_alloc.get(oid));
+   }
+   */
+
+   /**
+    *  Translates an object location into a pointer into the mapped segment.
+    */
+   inline object_header* seg_allocator::session::read_lock::get_object_pointer(object_location loc)
+   {
+      auto segment = (mapped_memory::segment_header*)_session._sega._block_alloc.get(loc.segment());
+      // 0 means we are accessing a swapped object on a segment that hasn't started new allocs
+      // if alloc_pos > loc.index() then we haven't overwritten this object yet, we are accessing
+      // data behind the alloc pointer which should be safe
+      // to access data we had to get the location from obj id database and we should read
+      // with memory_order_acquire, when updating an object_info we need to write with
+      // memory_order_release otherwise the data written may not be visible yet to the reader coming
+      // along behind
+      assert(segment->_alloc_pos == 0 or segment->_alloc_pos > loc.index());
+      // Reuse the pointer resolved above instead of looking the block up again;
+      // this also keeps `segment` used when asserts are compiled out.
+      return (object_header*)((char*)segment + loc.index());
+   }
+
+   /**
+    *  Given obj, if it isn't already located in the allocation segment of
+    *  this thread or in the allocation segment of another thread then
+    *  move it to the allocation segment of the current thread.
+    *
+    *  - do not wait for a write lock, if we can't get the write lock
+    *  then we will just let another thread move it
+    *
+    *  @return true if the object was moved
+    */
+   template
+   bool seg_allocator::session::read_lock::object_ref::cache_object()
+   {
+      std::unique_lock ul(get_mutex(), std::try_to_lock);
+
+      if (ul.owns_lock())
+      {
+         // Take one snapshot of the location and use it both as the copy
+         // source and as the expected value handed to move(), so the two can
+         // never refer to different places.
+         const auto expect_loc = location();
+         auto       cur_loc    = expect_loc._offset;
+
+         assert(ref_count());
+         assert(cur_loc);
+         assert(cur_loc & (segment_size - 1));
+
+         auto           cur_seg     = cur_loc / segment_size;
+         auto           cur_seg_ptr = _rlock._session._sega.get_segment(cur_seg);
+         object_header* cur_obj_ptr =
+             (object_header*)(((char*)cur_seg_ptr) + (cur_loc & (segment_size - 1)));
+
+         assert(0 != cur_seg_ptr->_alloc_pos);  // this would be on a freed segment
+
+         // any other value would mean it is currently located in a segment an
+         // allocating thread is still using; while we could re-alloc, it is
+         // probably already hot because a writer, reader, or compactor has
+         // just recently copied it.
+         // (presumably uint32_t(-1) marks a segment that is no longer being
+         // allocated from -- confirm)
+         if (cur_seg_ptr->_alloc_pos.load(std::memory_order_relaxed) != uint32_t(-1))
+            return false;
+
+         auto obj_size   = cur_obj_ptr->object_size();
+         auto [loc, ptr] = _rlock._session.alloc_data(obj_size, _id, cur_obj_ptr->get_type());
+         memcpy(ptr, cur_obj_ptr, obj_size);
+         if (move(expect_loc, loc))
+         {
+            // note that this item has been freed from the segment so the space
+            // can be recovered by the compactor
+            _rlock._session._sega._header->seg_meta[cur_seg].free_object(obj_size);
+            return true;
+         }
+         // NOTE(review): when move() fails the copy just written at `loc` is
+         // orphaned and nothing here reports that space as free -- confirm
+         // whether the compactor accounts for it elsewhere.
+      }
+      return false;
+   }
+
+}  // namespace triedent
diff --git a/libraries/triedent/include/triedent/xxhash.h b/libraries/triedent/include/triedent/xxhash.h
new file mode 100644
index 000000000..d11f0f633
--- /dev/null
+++ b/libraries/triedent/include/triedent/xxhash.h
@@ -0,0 +1,7048 @@
+/*
+ * xxHash - Extremely Fast Hash algorithm
+ * Header File
+ * Copyright (C) 2012-2023 Yann Collet
+ *
+ * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php)
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ *    * Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *    * Redistributions in binary form must reproduce the above
+ *      copyright notice, this list of conditions and the following disclaimer
+ *      in the documentation and/or other materials provided with the
+ *      distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * You can contact the author at: + * - xxHash homepage: https://www.xxhash.com + * - xxHash source repository: https://github.com/Cyan4973/xxHash + */ + +/*! + * @mainpage xxHash + * + * xxHash is an extremely fast non-cryptographic hash algorithm, working at RAM speed + * limits. + * + * It is proposed in four flavors, in three families: + * 1. @ref XXH32_family + * - Classic 32-bit hash function. Simple, compact, and runs on almost all + * 32-bit and 64-bit systems. + * 2. @ref XXH64_family + * - Classic 64-bit adaptation of XXH32. Just as simple, and runs well on most + * 64-bit systems (but _not_ 32-bit systems). + * 3. @ref XXH3_family + * - Modern 64-bit and 128-bit hash function family which features improved + * strength and performance across the board, especially on smaller data. + * It benefits greatly from SIMD and 64-bit without requiring it. + * + * Benchmarks + * --- + * The reference system uses an Intel i7-9700K CPU, and runs Ubuntu x64 20.04. + * The open source benchmark program is compiled with clang v10.0 using -O3 flag. 
+ * + * | Hash Name | ISA ext | Width | Large Data Speed | Small Data Velocity | + * | -------------------- | ------- | ----: | ---------------: | ------------------: | + * | XXH3_64bits() | @b AVX2 | 64 | 59.4 GB/s | 133.1 | + * | MeowHash | AES-NI | 128 | 58.2 GB/s | 52.5 | + * | XXH3_128bits() | @b AVX2 | 128 | 57.9 GB/s | 118.1 | + * | CLHash | PCLMUL | 64 | 37.1 GB/s | 58.1 | + * | XXH3_64bits() | @b SSE2 | 64 | 31.5 GB/s | 133.1 | + * | XXH3_128bits() | @b SSE2 | 128 | 29.6 GB/s | 118.1 | + * | RAM sequential read | | N/A | 28.0 GB/s | N/A | + * | ahash | AES-NI | 64 | 22.5 GB/s | 107.2 | + * | City64 | | 64 | 22.0 GB/s | 76.6 | + * | T1ha2 | | 64 | 22.0 GB/s | 99.0 | + * | City128 | | 128 | 21.7 GB/s | 57.7 | + * | FarmHash | AES-NI | 64 | 21.3 GB/s | 71.9 | + * | XXH64() | | 64 | 19.4 GB/s | 71.0 | + * | SpookyHash | | 64 | 19.3 GB/s | 53.2 | + * | Mum | | 64 | 18.0 GB/s | 67.0 | + * | CRC32C | SSE4.2 | 32 | 13.0 GB/s | 57.9 | + * | XXH32() | | 32 | 9.7 GB/s | 71.9 | + * | City32 | | 32 | 9.1 GB/s | 66.0 | + * | Blake3* | @b AVX2 | 256 | 4.4 GB/s | 8.1 | + * | Murmur3 | | 32 | 3.9 GB/s | 56.1 | + * | SipHash* | | 64 | 3.0 GB/s | 43.2 | + * | Blake3* | @b SSE2 | 256 | 2.4 GB/s | 8.1 | + * | HighwayHash | | 64 | 1.4 GB/s | 6.0 | + * | FNV64 | | 64 | 1.2 GB/s | 62.7 | + * | Blake2* | | 256 | 1.1 GB/s | 5.1 | + * | SHA1* | | 160 | 0.8 GB/s | 5.6 | + * | MD5* | | 128 | 0.6 GB/s | 7.8 | + * @note + * - Hashes which require a specific ISA extension are noted. SSE2 is also noted, + * even though it is mandatory on x64. + * - Hashes with an asterisk are cryptographic. Note that MD5 is non-cryptographic + * by modern standards. + * - Small data velocity is a rough average of algorithm's efficiency for small + * data. For more accurate information, see the wiki. + * - More benchmarks and strength tests are found on the wiki: + * https://github.com/Cyan4973/xxHash/wiki + * + * Usage + * ------ + * All xxHash variants use a similar API. 
Changing the algorithm is a trivial + * substitution. + * + * @pre + * For functions which take an input and length parameter, the following + * requirements are assumed: + * - The range from [`input`, `input + length`) is valid, readable memory. + * - The only exception is if the `length` is `0`, `input` may be `NULL`. + * - For C++, the objects must have the *TriviallyCopyable* property, as the + * functions access bytes directly as if it was an array of `unsigned char`. + * + * @anchor single_shot_example + * **Single Shot** + * + * These functions are stateless functions which hash a contiguous block of memory, + * immediately returning the result. They are the easiest and usually the fastest + * option. + * + * XXH32(), XXH64(), XXH3_64bits(), XXH3_128bits() + * + * @code{.c} + * #include + * #include "xxhash.h" + * + * // Example for a function which hashes a null terminated string with XXH32(). + * XXH32_hash_t hash_string(const char* string, XXH32_hash_t seed) + * { + * // NULL pointers are only valid if the length is zero + * size_t length = (string == NULL) ? 0 : strlen(string); + * return XXH32(string, length, seed); + * } + * @endcode + * + * + * @anchor streaming_example + * **Streaming** + * + * These groups of functions allow incremental hashing of unknown size, even + * more than what would fit in a size_t. + * + * XXH32_reset(), XXH64_reset(), XXH3_64bits_reset(), XXH3_128bits_reset() + * + * @code{.c} + * #include + * #include + * #include "xxhash.h" + * // Example for a function which hashes a FILE incrementally with XXH3_64bits(). + * XXH64_hash_t hashFile(FILE* f) + * { + * // Allocate a state struct. Do not just use malloc() or new. + * XXH3_state_t* state = XXH3_createState(); + * assert(state != NULL && "Out of memory!"); + * // Reset the state to start a new hashing session. 
+ * XXH3_64bits_reset(state); + * char buffer[4096]; + * size_t count; + * // Read the file in chunks + * while ((count = fread(buffer, 1, sizeof(buffer), f)) != 0) { + * // Run update() as many times as necessary to process the data + * XXH3_64bits_update(state, buffer, count); + * } + * // Retrieve the finalized hash. This will not change the state. + * XXH64_hash_t result = XXH3_64bits_digest(state); + * // Free the state. Do not use free(). + * XXH3_freeState(state); + * return result; + * } + * @endcode + * + * Streaming functions generate the xxHash value from an incremental input. + * This method is slower than single-call functions, due to state management. + * For small inputs, prefer `XXH32()` and `XXH64()`, which are better optimized. + * + * An XXH state must first be allocated using `XXH*_createState()`. + * + * Start a new hash by initializing the state with a seed using `XXH*_reset()`. + * + * Then, feed the hash state by calling `XXH*_update()` as many times as necessary. + * + * The function returns an error code, with 0 meaning OK, and any other value + * meaning there is an error. + * + * Finally, a hash value can be produced anytime, by using `XXH*_digest()`. + * This function returns the nn-bits hash as an int or long long. + * + * It's still possible to continue inserting input into the hash state after a + * digest, and generate new hash values later on by invoking `XXH*_digest()`. + * + * When done, release the state using `XXH*_freeState()`. + * + * + * @anchor canonical_representation_example + * **Canonical Representation** + * + * The default return values from XXH functions are unsigned 32, 64 and 128 bit + * integers. + * This the simplest and fastest format for further post-processing. + * + * However, this leaves open the question of what is the order on the byte level, + * since little and big endian conventions will store the same number differently. 
+ * + * The canonical representation settles this issue by mandating big-endian + * convention, the same convention as human-readable numbers (large digits first). + * + * When writing hash values to storage, sending them over a network, or printing + * them, it's highly recommended to use the canonical representation to ensure + * portability across a wider range of systems, present and future. + * + * The following functions allow transformation of hash values to and from + * canonical format. + * + * XXH32_canonicalFromHash(), XXH32_hashFromCanonical(), + * XXH64_canonicalFromHash(), XXH64_hashFromCanonical(), + * XXH128_canonicalFromHash(), XXH128_hashFromCanonical(), + * + * @code{.c} + * #include + * #include "xxhash.h" + * + * // Example for a function which prints XXH32_hash_t in human readable format + * void printXxh32(XXH32_hash_t hash) + * { + * XXH32_canonical_t cano; + * XXH32_canonicalFromHash(&cano, hash); + * size_t i; + * for(i = 0; i < sizeof(cano.digest); ++i) { + * printf("%02x", cano.digest[i]); + * } + * printf("\n"); + * } + * + * // Example for a function which converts XXH32_canonical_t to XXH32_hash_t + * XXH32_hash_t convertCanonicalToXxh32(XXH32_canonical_t cano) + * { + * XXH32_hash_t hash = XXH32_hashFromCanonical(&cano); + * return hash; + * } + * @endcode + * + * + * @file xxhash.h + * xxHash prototypes and implementation + */ + +#if defined (__cplusplus) +extern "C" { +#endif + +/* **************************** + * INLINE mode + ******************************/ +/*! + * @defgroup public Public API + * Contains details on the public xxHash functions. + * @{ + */ +#ifdef XXH_DOXYGEN +/*! + * @brief Gives access to internal state declaration, required for static allocation. + * + * Incompatible with dynamic linking, due to risks of ABI changes. 
+ * + * Usage: + * @code{.c} + * #define XXH_STATIC_LINKING_ONLY + * #include "xxhash.h" + * @endcode + */ +# define XXH_STATIC_LINKING_ONLY +/* Do not undef XXH_STATIC_LINKING_ONLY for Doxygen */ + +/*! + * @brief Gives access to internal definitions. + * + * Usage: + * @code{.c} + * #define XXH_STATIC_LINKING_ONLY + * #define XXH_IMPLEMENTATION + * #include "xxhash.h" + * @endcode + */ +# define XXH_IMPLEMENTATION +/* Do not undef XXH_IMPLEMENTATION for Doxygen */ + +/*! + * @brief Exposes the implementation and marks all functions as `inline`. + * + * Use these build macros to inline xxhash into the target unit. + * Inlining improves performance on small inputs, especially when the length is + * expressed as a compile-time constant: + * + * https://fastcompression.blogspot.com/2018/03/xxhash-for-small-keys-impressive-power.html + * + * It also keeps xxHash symbols private to the unit, so they are not exported. + * + * Usage: + * @code{.c} + * #define XXH_INLINE_ALL + * #include "xxhash.h" + * @endcode + * Do not compile and link xxhash.o as a separate object, as it is not useful. + */ +# define XXH_INLINE_ALL +# undef XXH_INLINE_ALL +/*! + * @brief Exposes the implementation without marking functions as inline. + */ +# define XXH_PRIVATE_API +# undef XXH_PRIVATE_API +/*! + * @brief Emulate a namespace by transparently prefixing all symbols. + * + * If you want to include _and expose_ xxHash functions from within your own + * library, but also want to avoid symbol collisions with other libraries which + * may also include xxHash, you can use @ref XXH_NAMESPACE to automatically prefix + * any public symbol from xxhash library with the value of @ref XXH_NAMESPACE + * (therefore, avoid empty or numeric values). + * + * Note that no change is required within the calling program as long as it + * includes `xxhash.h`: Regular symbol names will be automatically translated + * by this header. 
+ */ +# define XXH_NAMESPACE /* YOUR NAME HERE */ +# undef XXH_NAMESPACE +#endif + +#if (defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)) \ + && !defined(XXH_INLINE_ALL_31684351384) + /* this section should be traversed only once */ +# define XXH_INLINE_ALL_31684351384 + /* give access to the advanced API, required to compile implementations */ +# undef XXH_STATIC_LINKING_ONLY /* avoid macro redef */ +# define XXH_STATIC_LINKING_ONLY + /* make all functions private */ +# undef XXH_PUBLIC_API +# if defined(__GNUC__) +# define XXH_PUBLIC_API static __inline __attribute__((unused)) +# elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) +# define XXH_PUBLIC_API static inline +# elif defined(_MSC_VER) +# define XXH_PUBLIC_API static __inline +# else + /* note: this version may generate warnings for unused static functions */ +# define XXH_PUBLIC_API static +# endif + + /* + * This part deals with the special case where a unit wants to inline xxHash, + * but "xxhash.h" has previously been included without XXH_INLINE_ALL, + * such as part of some previously included *.h header file. + * Without further action, the new include would just be ignored, + * and functions would effectively _not_ be inlined (silent failure). + * The following macros solve this situation by prefixing all inlined names, + * avoiding naming collision with previous inclusions. + */ + /* Before that, we unconditionally #undef all symbols, + * in case they were already defined with XXH_NAMESPACE. 
+ * They will then be redefined for XXH_INLINE_ALL + */ +# undef XXH_versionNumber + /* XXH32 */ +# undef XXH32 +# undef XXH32_createState +# undef XXH32_freeState +# undef XXH32_reset +# undef XXH32_update +# undef XXH32_digest +# undef XXH32_copyState +# undef XXH32_canonicalFromHash +# undef XXH32_hashFromCanonical + /* XXH64 */ +# undef XXH64 +# undef XXH64_createState +# undef XXH64_freeState +# undef XXH64_reset +# undef XXH64_update +# undef XXH64_digest +# undef XXH64_copyState +# undef XXH64_canonicalFromHash +# undef XXH64_hashFromCanonical + /* XXH3_64bits */ +# undef XXH3_64bits +# undef XXH3_64bits_withSecret +# undef XXH3_64bits_withSeed +# undef XXH3_64bits_withSecretandSeed +# undef XXH3_createState +# undef XXH3_freeState +# undef XXH3_copyState +# undef XXH3_64bits_reset +# undef XXH3_64bits_reset_withSeed +# undef XXH3_64bits_reset_withSecret +# undef XXH3_64bits_update +# undef XXH3_64bits_digest +# undef XXH3_generateSecret + /* XXH3_128bits */ +# undef XXH128 +# undef XXH3_128bits +# undef XXH3_128bits_withSeed +# undef XXH3_128bits_withSecret +# undef XXH3_128bits_reset +# undef XXH3_128bits_reset_withSeed +# undef XXH3_128bits_reset_withSecret +# undef XXH3_128bits_reset_withSecretandSeed +# undef XXH3_128bits_update +# undef XXH3_128bits_digest +# undef XXH128_isEqual +# undef XXH128_cmp +# undef XXH128_canonicalFromHash +# undef XXH128_hashFromCanonical + /* Finally, free the namespace itself */ +# undef XXH_NAMESPACE + + /* employ the namespace for XXH_INLINE_ALL */ +# define XXH_NAMESPACE XXH_INLINE_ + /* + * Some identifiers (enums, type names) are not symbols, + * but they must nonetheless be renamed to avoid redeclaration. + * Alternative solution: do not redeclare them. + * However, this requires some #ifdefs, and has a more dispersed impact. + * Meanwhile, renaming can be achieved in a single place. 
+ */ +# define XXH_IPREF(Id) XXH_NAMESPACE ## Id +# define XXH_OK XXH_IPREF(XXH_OK) +# define XXH_ERROR XXH_IPREF(XXH_ERROR) +# define XXH_errorcode XXH_IPREF(XXH_errorcode) +# define XXH32_canonical_t XXH_IPREF(XXH32_canonical_t) +# define XXH64_canonical_t XXH_IPREF(XXH64_canonical_t) +# define XXH128_canonical_t XXH_IPREF(XXH128_canonical_t) +# define XXH32_state_s XXH_IPREF(XXH32_state_s) +# define XXH32_state_t XXH_IPREF(XXH32_state_t) +# define XXH64_state_s XXH_IPREF(XXH64_state_s) +# define XXH64_state_t XXH_IPREF(XXH64_state_t) +# define XXH3_state_s XXH_IPREF(XXH3_state_s) +# define XXH3_state_t XXH_IPREF(XXH3_state_t) +# define XXH128_hash_t XXH_IPREF(XXH128_hash_t) + /* Ensure the header is parsed again, even if it was previously included */ +# undef XXHASH_H_5627135585666179 +# undef XXHASH_H_STATIC_13879238742 +#endif /* XXH_INLINE_ALL || XXH_PRIVATE_API */ + +/* **************************************************************** + * Stable API + *****************************************************************/ +#ifndef XXHASH_H_5627135585666179 +#define XXHASH_H_5627135585666179 1 + +/*! @brief Marks a global symbol. 
*/ +#if !defined(XXH_INLINE_ALL) && !defined(XXH_PRIVATE_API) +# if defined(WIN32) && defined(_MSC_VER) && (defined(XXH_IMPORT) || defined(XXH_EXPORT)) +# ifdef XXH_EXPORT +# define XXH_PUBLIC_API __declspec(dllexport) +# elif XXH_IMPORT +# define XXH_PUBLIC_API __declspec(dllimport) +# endif +# else +# define XXH_PUBLIC_API /* do nothing */ +# endif +#endif + +#ifdef XXH_NAMESPACE +# define XXH_CAT(A,B) A##B +# define XXH_NAME2(A,B) XXH_CAT(A,B) +# define XXH_versionNumber XXH_NAME2(XXH_NAMESPACE, XXH_versionNumber) +/* XXH32 */ +# define XXH32 XXH_NAME2(XXH_NAMESPACE, XXH32) +# define XXH32_createState XXH_NAME2(XXH_NAMESPACE, XXH32_createState) +# define XXH32_freeState XXH_NAME2(XXH_NAMESPACE, XXH32_freeState) +# define XXH32_reset XXH_NAME2(XXH_NAMESPACE, XXH32_reset) +# define XXH32_update XXH_NAME2(XXH_NAMESPACE, XXH32_update) +# define XXH32_digest XXH_NAME2(XXH_NAMESPACE, XXH32_digest) +# define XXH32_copyState XXH_NAME2(XXH_NAMESPACE, XXH32_copyState) +# define XXH32_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH32_canonicalFromHash) +# define XXH32_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH32_hashFromCanonical) +/* XXH64 */ +# define XXH64 XXH_NAME2(XXH_NAMESPACE, XXH64) +# define XXH64_createState XXH_NAME2(XXH_NAMESPACE, XXH64_createState) +# define XXH64_freeState XXH_NAME2(XXH_NAMESPACE, XXH64_freeState) +# define XXH64_reset XXH_NAME2(XXH_NAMESPACE, XXH64_reset) +# define XXH64_update XXH_NAME2(XXH_NAMESPACE, XXH64_update) +# define XXH64_digest XXH_NAME2(XXH_NAMESPACE, XXH64_digest) +# define XXH64_copyState XXH_NAME2(XXH_NAMESPACE, XXH64_copyState) +# define XXH64_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH64_canonicalFromHash) +# define XXH64_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH64_hashFromCanonical) +/* XXH3_64bits */ +# define XXH3_64bits XXH_NAME2(XXH_NAMESPACE, XXH3_64bits) +# define XXH3_64bits_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSecret) +# define XXH3_64bits_withSeed XXH_NAME2(XXH_NAMESPACE, 
XXH3_64bits_withSeed) +# define XXH3_64bits_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSecretandSeed) +# define XXH3_createState XXH_NAME2(XXH_NAMESPACE, XXH3_createState) +# define XXH3_freeState XXH_NAME2(XXH_NAMESPACE, XXH3_freeState) +# define XXH3_copyState XXH_NAME2(XXH_NAMESPACE, XXH3_copyState) +# define XXH3_64bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset) +# define XXH3_64bits_reset_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSeed) +# define XXH3_64bits_reset_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSecret) +# define XXH3_64bits_reset_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSecretandSeed) +# define XXH3_64bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_update) +# define XXH3_64bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_digest) +# define XXH3_generateSecret XXH_NAME2(XXH_NAMESPACE, XXH3_generateSecret) +# define XXH3_generateSecret_fromSeed XXH_NAME2(XXH_NAMESPACE, XXH3_generateSecret_fromSeed) +/* XXH3_128bits */ +# define XXH128 XXH_NAME2(XXH_NAMESPACE, XXH128) +# define XXH3_128bits XXH_NAME2(XXH_NAMESPACE, XXH3_128bits) +# define XXH3_128bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSeed) +# define XXH3_128bits_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSecret) +# define XXH3_128bits_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSecretandSeed) +# define XXH3_128bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset) +# define XXH3_128bits_reset_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSeed) +# define XXH3_128bits_reset_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSecret) +# define XXH3_128bits_reset_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSecretandSeed) +# define XXH3_128bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_update) +# define XXH3_128bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_digest) +# define XXH128_isEqual XXH_NAME2(XXH_NAMESPACE, XXH128_isEqual) +# define 
XXH128_cmp XXH_NAME2(XXH_NAMESPACE, XXH128_cmp) +# define XXH128_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH128_canonicalFromHash) +# define XXH128_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH128_hashFromCanonical) +#endif + + +/* ************************************* +* Compiler specifics +***************************************/ + +/* specific declaration modes for Windows */ +#if !defined(XXH_INLINE_ALL) && !defined(XXH_PRIVATE_API) +# if defined(WIN32) && defined(_MSC_VER) && (defined(XXH_IMPORT) || defined(XXH_EXPORT)) +# ifdef XXH_EXPORT +# define XXH_PUBLIC_API __declspec(dllexport) +# elif XXH_IMPORT +# define XXH_PUBLIC_API __declspec(dllimport) +# endif +# else +# define XXH_PUBLIC_API /* do nothing */ +# endif +#endif + +#if defined (__GNUC__) +# define XXH_CONSTF __attribute__((const)) +# define XXH_PUREF __attribute__((pure)) +# define XXH_MALLOCF __attribute__((malloc)) +#else +# define XXH_CONSTF /* disable */ +# define XXH_PUREF +# define XXH_MALLOCF +#endif + +/* ************************************* +* Version +***************************************/ +#define XXH_VERSION_MAJOR 0 +#define XXH_VERSION_MINOR 8 +#define XXH_VERSION_RELEASE 2 +/*! @brief Version number, encoded as two digits each */ +#define XXH_VERSION_NUMBER (XXH_VERSION_MAJOR *100*100 + XXH_VERSION_MINOR *100 + XXH_VERSION_RELEASE) + +/*! + * @brief Obtains the xxHash version. + * + * This is mostly useful when xxHash is compiled as a shared library, + * since the returned value comes from the library, as opposed to header file. + * + * @return @ref XXH_VERSION_NUMBER of the invoked library. + */ +XXH_PUBLIC_API XXH_CONSTF unsigned XXH_versionNumber (void); + + +/* **************************** +* Common basic types +******************************/ +#include /* size_t */ +/*! + * @brief Exit code for the streaming API. 
+ */ +typedef enum { + XXH_OK = 0, /*!< OK */ + XXH_ERROR /*!< Error */ +} XXH_errorcode; + + +/*-********************************************************************** +* 32-bit hash +************************************************************************/ +#if defined(XXH_DOXYGEN) /* Don't show include */ +/*! + * @brief An unsigned 32-bit integer. + * + * Not necessarily defined to `uint32_t` but functionally equivalent. + */ +typedef uint32_t XXH32_hash_t; + +#elif !defined (__VMS) \ + && (defined (__cplusplus) \ + || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) ) +# include + typedef uint32_t XXH32_hash_t; + +#else +# include +# if UINT_MAX == 0xFFFFFFFFUL + typedef unsigned int XXH32_hash_t; +# elif ULONG_MAX == 0xFFFFFFFFUL + typedef unsigned long XXH32_hash_t; +# else +# error "unsupported platform: need a 32-bit type" +# endif +#endif + +/*! + * @} + * + * @defgroup XXH32_family XXH32 family + * @ingroup public + * Contains functions used in the classic 32-bit xxHash algorithm. + * + * @note + * XXH32 is useful for older platforms, with no or poor 64-bit performance. + * Note that the @ref XXH3_family provides competitive speed for both 32-bit + * and 64-bit systems, and offers true 64/128 bit hash results. + * + * @see @ref XXH64_family, @ref XXH3_family : Other xxHash families + * @see @ref XXH32_impl for implementation details + * @{ + */ + +/*! + * @brief Calculates the 32-bit hash of @p input using xxHash32. + * + * @param input The block of data to be hashed, at least @p length bytes in size. + * @param length The length of @p input, in bytes. + * @param seed The 32-bit seed to alter the hash's output predictably. + * + * @pre + * The memory between @p input and @p input + @p length must be valid, + * readable, contiguous memory. However, if @p length is `0`, @p input may be + * `NULL`. In C++, this also must be *TriviallyCopyable*. + * + * @return The calculated 32-bit xxHash32 value. 
+ * + * @see @ref single_shot_example "Single Shot Example" for an example. + */ +XXH_PUBLIC_API XXH_PUREF XXH32_hash_t XXH32 (const void* input, size_t length, XXH32_hash_t seed); + +#ifndef XXH_NO_STREAM +/*! + * @typedef struct XXH32_state_s XXH32_state_t + * @brief The opaque state struct for the XXH32 streaming API. + * + * @see XXH32_state_s for details. + * @see @ref streaming_example "Streaming Example" + */ +typedef struct XXH32_state_s XXH32_state_t; + +/*! + * @brief Allocates an @ref XXH32_state_t. + * + * @return An allocated pointer of @ref XXH32_state_t on success. + * @return `NULL` on failure. + * + * @note Must be freed with XXH32_freeState(). + * + * @see @ref streaming_example "Streaming Example" + */ +XXH_PUBLIC_API XXH_MALLOCF XXH32_state_t* XXH32_createState(void); +/*! + * @brief Frees an @ref XXH32_state_t. + * + * @param statePtr A pointer to an @ref XXH32_state_t allocated with @ref XXH32_createState(). + * + * @return @ref XXH_OK. + * + * @note @p statePtr must be allocated with XXH32_createState(). + * + * @see @ref streaming_example "Streaming Example" + * + */ +XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr); +/*! + * @brief Copies one @ref XXH32_state_t to another. + * + * @param dst_state The state to copy to. + * @param src_state The state to copy from. + * @pre + * @p dst_state and @p src_state must not be `NULL` and must not overlap. + */ +XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dst_state, const XXH32_state_t* src_state); + +/*! + * @brief Resets an @ref XXH32_state_t to begin a new hash. + * + * @param statePtr The state struct to reset. + * @param seed The 32-bit seed to alter the hash result predictably. + * + * @pre + * @p statePtr must not be `NULL`. + * + * @return @ref XXH_OK on success. + * @return @ref XXH_ERROR on failure. + * + * @note This function resets and seeds a state. Call it before @ref XXH32_update(). 
+ * + * @see @ref streaming_example "Streaming Example" + */ +XXH_PUBLIC_API XXH_errorcode XXH32_reset (XXH32_state_t* statePtr, XXH32_hash_t seed); + +/*! + * @brief Consumes a block of @p input to an @ref XXH32_state_t. + * + * @param statePtr The state struct to update. + * @param input The block of data to be hashed, at least @p length bytes in size. + * @param length The length of @p input, in bytes. + * + * @pre + * @p statePtr must not be `NULL`. + * @pre + * The memory between @p input and @p input + @p length must be valid, + * readable, contiguous memory. However, if @p length is `0`, @p input may be + * `NULL`. In C++, this also must be *TriviallyCopyable*. + * + * @return @ref XXH_OK on success. + * @return @ref XXH_ERROR on failure. + * + * @note Call this to incrementally consume blocks of data. + * + * @see @ref streaming_example "Streaming Example" + */ +XXH_PUBLIC_API XXH_errorcode XXH32_update (XXH32_state_t* statePtr, const void* input, size_t length); + +/*! + * @brief Returns the calculated hash value from an @ref XXH32_state_t. + * + * @param statePtr The state struct to calculate the hash from. + * + * @pre + * @p statePtr must not be `NULL`. + * + * @return The calculated 32-bit xxHash32 value from that state. + * + * @note + * Calling XXH32_digest() will not affect @p statePtr, so you can update, + * digest, and update again. + * + * @see @ref streaming_example "Streaming Example" + */ +XXH_PUBLIC_API XXH_PUREF XXH32_hash_t XXH32_digest (const XXH32_state_t* statePtr); +#endif /* !XXH_NO_STREAM */ + +/******* Canonical representation *******/ + +/*! + * @brief Canonical (big endian) representation of @ref XXH32_hash_t. + */ +typedef struct { + unsigned char digest[4]; /*!< Hash bytes, big endian */ +} XXH32_canonical_t; + +/*! + * @brief Converts an @ref XXH32_hash_t to a big endian @ref XXH32_canonical_t. + * + * @param dst The @ref XXH32_canonical_t pointer to be stored to. + * @param hash The @ref XXH32_hash_t to be converted. 
+ * + * @pre + * @p dst must not be `NULL`. + * + * @see @ref canonical_representation_example "Canonical Representation Example" + */ +XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash); + +/*! + * @brief Converts an @ref XXH32_canonical_t to a native @ref XXH32_hash_t. + * + * @param src The @ref XXH32_canonical_t to convert. + * + * @pre + * @p src must not be `NULL`. + * + * @return The converted hash. + * + * @see @ref canonical_representation_example "Canonical Representation Example" + */ +XXH_PUBLIC_API XXH_PUREF XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src); + + +/*! @cond Doxygen ignores this part */ +#ifdef __has_attribute +# define XXH_HAS_ATTRIBUTE(x) __has_attribute(x) +#else +# define XXH_HAS_ATTRIBUTE(x) 0 +#endif +/*! @endcond */ + +/*! @cond Doxygen ignores this part */ +/* + * C23 __STDC_VERSION__ number hasn't been specified yet. For now + * leave as `201711L` (C17 + 1). + * TODO: Update to correct value when its been specified. + */ +#define XXH_C23_VN 201711L +/*! @endcond */ + +/*! @cond Doxygen ignores this part */ +/* C-language Attributes are added in C23. */ +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= XXH_C23_VN) && defined(__has_c_attribute) +# define XXH_HAS_C_ATTRIBUTE(x) __has_c_attribute(x) +#else +# define XXH_HAS_C_ATTRIBUTE(x) 0 +#endif +/*! @endcond */ + +/*! @cond Doxygen ignores this part */ +#if defined(__cplusplus) && defined(__has_cpp_attribute) +# define XXH_HAS_CPP_ATTRIBUTE(x) __has_cpp_attribute(x) +#else +# define XXH_HAS_CPP_ATTRIBUTE(x) 0 +#endif +/*! @endcond */ + +/*! @cond Doxygen ignores this part */ +/* + * Define XXH_FALLTHROUGH macro for annotating switch case with the 'fallthrough' attribute + * introduced in CPP17 and C23. 
+ * CPP17 : https://en.cppreference.com/w/cpp/language/attributes/fallthrough + * C23 : https://en.cppreference.com/w/c/language/attributes/fallthrough + */ +#if XXH_HAS_C_ATTRIBUTE(fallthrough) || XXH_HAS_CPP_ATTRIBUTE(fallthrough) +# define XXH_FALLTHROUGH [[fallthrough]] +#elif XXH_HAS_ATTRIBUTE(__fallthrough__) +# define XXH_FALLTHROUGH __attribute__ ((__fallthrough__)) +#else +# define XXH_FALLTHROUGH /* fallthrough */ +#endif +/*! @endcond */ + +/*! @cond Doxygen ignores this part */ +/* + * Define XXH_NOESCAPE for annotated pointers in public API. + * https://clang.llvm.org/docs/AttributeReference.html#noescape + * As of writing this, only supported by clang. + */ +#if XXH_HAS_ATTRIBUTE(noescape) +# define XXH_NOESCAPE __attribute__((noescape)) +#else +# define XXH_NOESCAPE +#endif +/*! @endcond */ + + +/*! + * @} + * @ingroup public + * @{ + */ + +#ifndef XXH_NO_LONG_LONG +/*-********************************************************************** +* 64-bit hash +************************************************************************/ +#if defined(XXH_DOXYGEN) /* don't include <stdint.h> */ +/*! + * @brief An unsigned 64-bit integer. + * + * Not necessarily defined to `uint64_t` but functionally equivalent. + */ +typedef uint64_t XXH64_hash_t; +#elif !defined (__VMS) \ + && (defined (__cplusplus) \ + || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) ) +# include <stdint.h> /* provides uint64_t */ + typedef uint64_t XXH64_hash_t; +#else +# include <limits.h> /* provides ULONG_MAX */ +# if defined(__LP64__) && ULONG_MAX == 0xFFFFFFFFFFFFFFFFULL + /* LP64 ABI says uint64_t is unsigned long */ + typedef unsigned long XXH64_hash_t; +# else + /* the following type must have a width of 64-bit */ + typedef unsigned long long XXH64_hash_t; +# endif +#endif + +/*! + * @} + * + * @defgroup XXH64_family XXH64 family + * @ingroup public + * @{ + * Contains functions used in the classic 64-bit xxHash algorithm. 
+ * + * @note + * XXH3 provides competitive speed for both 32-bit and 64-bit systems, + * and offers true 64/128 bit hash results. + * It provides better speed for systems with vector processing capabilities. + */ + +/*! + * @brief Calculates the 64-bit hash of @p input using xxHash64. + * + * @param input The block of data to be hashed, at least @p length bytes in size. + * @param length The length of @p input, in bytes. + * @param seed The 64-bit seed to alter the hash's output predictably. + * + * @pre + * The memory between @p input and @p input + @p length must be valid, + * readable, contiguous memory. However, if @p length is `0`, @p input may be + * `NULL`. In C++, this also must be *TriviallyCopyable*. + * + * @return The calculated 64-bit xxHash64 value. + * + * @see @ref single_shot_example "Single Shot Example" for an example. + */ +XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64(XXH_NOESCAPE const void* input, size_t length, XXH64_hash_t seed); + +/******* Streaming *******/ +#ifndef XXH_NO_STREAM +/*! + * @brief The opaque state struct for the XXH64 streaming API. + * + * @see XXH64_state_s for details. + * @see @ref streaming_example "Streaming Example" + */ +typedef struct XXH64_state_s XXH64_state_t; /* incomplete type */ + +/*! + * @brief Allocates an @ref XXH64_state_t. + * + * @return An allocated pointer of @ref XXH64_state_t on success. + * @return `NULL` on failure. + * + * @note Must be freed with XXH64_freeState(). + * + * @see @ref streaming_example "Streaming Example" + */ +XXH_PUBLIC_API XXH_MALLOCF XXH64_state_t* XXH64_createState(void); + +/*! + * @brief Frees an @ref XXH64_state_t. + * + * @param statePtr A pointer to an @ref XXH64_state_t allocated with @ref XXH64_createState(). + * + * @return @ref XXH_OK. + * + * @note @p statePtr must be allocated with XXH64_createState(). + * + * @see @ref streaming_example "Streaming Example" + */ +XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr); + +/*! 
+ * @brief Copies one @ref XXH64_state_t to another. + * + * @param dst_state The state to copy to. + * @param src_state The state to copy from. + * @pre + * @p dst_state and @p src_state must not be `NULL` and must not overlap. + */ +XXH_PUBLIC_API void XXH64_copyState(XXH_NOESCAPE XXH64_state_t* dst_state, const XXH64_state_t* src_state); + +/*! + * @brief Resets an @ref XXH64_state_t to begin a new hash. + * + * @param statePtr The state struct to reset. + * @param seed The 64-bit seed to alter the hash result predictably. + * + * @pre + * @p statePtr must not be `NULL`. + * + * @return @ref XXH_OK on success. + * @return @ref XXH_ERROR on failure. + * + * @note This function resets and seeds a state. Call it before @ref XXH64_update(). + * + * @see @ref streaming_example "Streaming Example" + */ +XXH_PUBLIC_API XXH_errorcode XXH64_reset (XXH_NOESCAPE XXH64_state_t* statePtr, XXH64_hash_t seed); + +/*! + * @brief Consumes a block of @p input to an @ref XXH64_state_t. + * + * @param statePtr The state struct to update. + * @param input The block of data to be hashed, at least @p length bytes in size. + * @param length The length of @p input, in bytes. + * + * @pre + * @p statePtr must not be `NULL`. + * @pre + * The memory between @p input and @p input + @p length must be valid, + * readable, contiguous memory. However, if @p length is `0`, @p input may be + * `NULL`. In C++, this also must be *TriviallyCopyable*. + * + * @return @ref XXH_OK on success. + * @return @ref XXH_ERROR on failure. + * + * @note Call this to incrementally consume blocks of data. + * + * @see @ref streaming_example "Streaming Example" + */ +XXH_PUBLIC_API XXH_errorcode XXH64_update (XXH_NOESCAPE XXH64_state_t* statePtr, XXH_NOESCAPE const void* input, size_t length); + +/*! + * @brief Returns the calculated hash value from an @ref XXH64_state_t. + * + * @param statePtr The state struct to calculate the hash from. + * + * @pre + * @p statePtr must not be `NULL`. 
+ * + * @return The calculated 64-bit xxHash64 value from that state. + * + * @note + * Calling XXH64_digest() will not affect @p statePtr, so you can update, + * digest, and update again. + * + * @see @ref streaming_example "Streaming Example" + */ +XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64_digest (XXH_NOESCAPE const XXH64_state_t* statePtr); +#endif /* !XXH_NO_STREAM */ +/******* Canonical representation *******/ + +/*! + * @brief Canonical (big endian) representation of @ref XXH64_hash_t. + */ +typedef struct { unsigned char digest[sizeof(XXH64_hash_t)]; } XXH64_canonical_t; + +/*! + * @brief Converts an @ref XXH64_hash_t to a big endian @ref XXH64_canonical_t. + * + * @param dst The @ref XXH64_canonical_t pointer to be stored to. + * @param hash The @ref XXH64_hash_t to be converted. + * + * @pre + * @p dst must not be `NULL`. + * + * @see @ref canonical_representation_example "Canonical Representation Example" + */ +XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH_NOESCAPE XXH64_canonical_t* dst, XXH64_hash_t hash); + +/*! + * @brief Converts an @ref XXH64_canonical_t to a native @ref XXH64_hash_t. + * + * @param src The @ref XXH64_canonical_t to convert. + * + * @pre + * @p src must not be `NULL`. + * + * @return The converted hash. + * + * @see @ref canonical_representation_example "Canonical Representation Example" + */ +XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64_hashFromCanonical(XXH_NOESCAPE const XXH64_canonical_t* src); + +#ifndef XXH_NO_XXH3 + +/*! 
+ * @} + * ************************************************************************ + * @defgroup XXH3_family XXH3 family + * @ingroup public + * @{ + * + * XXH3 is a more recent hash algorithm featuring: + * - Improved speed for both small and large inputs + * - True 64-bit and 128-bit outputs + * - SIMD acceleration + * - Improved 32-bit viability + * + * Speed analysis methodology is explained here: + * + * https://fastcompression.blogspot.com/2019/03/presenting-xxh3.html + * + * Compared to XXH64, expect XXH3 to run approximately + * ~2x faster on large inputs and >3x faster on small ones, + * exact differences vary depending on platform. + * + * XXH3's speed benefits greatly from SIMD and 64-bit arithmetic, + * but does not require it. + * Most 32-bit and 64-bit targets that can run XXH32 smoothly can run XXH3 + * at competitive speeds, even without vector support. Further details are + * explained in the implementation. + * + * XXH3 has a fast scalar implementation, but it also includes accelerated SIMD + * implementations for many common platforms: + * - AVX512 + * - AVX2 + * - SSE2 + * - ARM NEON + * - WebAssembly SIMD128 + * - POWER8 VSX + * - s390x ZVector + * This can be controlled via the @ref XXH_VECTOR macro, but it automatically + * selects the best version according to predefined macros. For the x86 family, an + * automatic runtime dispatcher is included separately in @ref xxh_x86dispatch.c. + * + * XXH3 implementation is portable: + * it has a generic C90 formulation that can be compiled on any platform, + * all implementations generate exactly the same hash value on all platforms. + * Starting from v0.8.0, it's also labelled "stable", meaning that + * any future version will also generate the same hash value. + * + * XXH3 offers 2 variants, _64bits and _128bits. + * + * When only 64 bits are needed, prefer invoking the _64bits variant, as it + * reduces the amount of mixing, resulting in faster speed on small inputs. 
+ * It's also generally simpler to manipulate a scalar return type than a struct. + * + * The API supports one-shot hashing, streaming mode, and custom secrets. + */ +/*-********************************************************************** +* XXH3 64-bit variant +************************************************************************/ + +/*! + * @brief Calculates 64-bit unseeded variant of XXH3 hash of @p input. + * + * @param input The block of data to be hashed, at least @p length bytes in size. + * @param length The length of @p input, in bytes. + * + * @pre + * The memory between @p input and @p input + @p length must be valid, + * readable, contiguous memory. However, if @p length is `0`, @p input may be + * `NULL`. In C++, this also must be *TriviallyCopyable*. + * + * @return The calculated 64-bit XXH3 hash value. + * + * @note + * This is equivalent to @ref XXH3_64bits_withSeed() with a seed of `0`, however + * it may have slightly better performance due to constant propagation of the + * defaults. + * + * @see + * XXH3_64bits_withSeed(), XXH3_64bits_withSecret(): other seeding variants + * @see @ref single_shot_example "Single Shot Example" for an example. + */ +XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits(XXH_NOESCAPE const void* input, size_t length); + +/*! + * @brief Calculates 64-bit seeded variant of XXH3 hash of @p input. + * + * @param input The block of data to be hashed, at least @p length bytes in size. + * @param length The length of @p input, in bytes. + * @param seed The 64-bit seed to alter the hash result predictably. + * + * @pre + * The memory between @p input and @p input + @p length must be valid, + * readable, contiguous memory. However, if @p length is `0`, @p input may be + * `NULL`. In C++, this also must be *TriviallyCopyable*. + * + * @return The calculated 64-bit XXH3 hash value. + * + * @note + * seed == 0 produces the same results as @ref XXH3_64bits(). 
+ * + * This variant generates a custom secret on the fly based on default secret + * altered using the @p seed value. + * + * While this operation is decently fast, note that it's not completely free. + * + * @see @ref single_shot_example "Single Shot Example" for an example. + */ +XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits_withSeed(XXH_NOESCAPE const void* input, size_t length, XXH64_hash_t seed); + +/*! + * @brief The bare minimum size for a custom secret, in bytes. + * + * @see + * XXH3_64bits_withSecret(), XXH3_64bits_reset_withSecret(), + * XXH3_128bits_withSecret(), XXH3_128bits_reset_withSecret(). + */ +#define XXH3_SECRET_SIZE_MIN 136 + +/*! + * @brief Calculates 64-bit variant of XXH3 with a custom "secret". + * + * @param data The block of data to be hashed, at least @p len bytes in size. + * @param len The length of @p data, in bytes. + * @param secret The secret data. + * @param secretSize The length of @p secret, in bytes. + * + * @return The calculated 64-bit XXH3 hash value. + * + * @pre + * The memory between @p data and @p data + @p len must be valid, + * readable, contiguous memory. However, if @p len is `0`, @p data may be + * `NULL`. In C++, this also must be *TriviallyCopyable*. + * + * It's possible to provide any blob of bytes as a "secret" to generate the hash. + * This makes it more difficult for an external actor to prepare an intentional collision. + * The main condition is that @p secretSize *must* be large enough (>= @ref XXH3_SECRET_SIZE_MIN). + * However, the quality of the secret impacts the dispersion of the hash algorithm. + * Therefore, the secret _must_ look like a bunch of random bytes. + * Avoid "trivial" or structured data such as repeated sequences or a text document. + * Whenever in doubt about the "randomness" of the blob of bytes, + * consider employing @ref XXH3_generateSecret() instead (see below). + * It will generate a proper high entropy secret derived from the blob of bytes. 
+ * Another advantage of using XXH3_generateSecret() is that + * it guarantees that all bits within the initial blob of bytes + * will impact every bit of the output. + * This is not necessarily the case when using the blob of bytes directly + * because, when hashing _small_ inputs, only a portion of the secret is employed. + * + * @see @ref single_shot_example "Single Shot Example" for an example. + */ +XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits_withSecret(XXH_NOESCAPE const void* data, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize); + + +/******* Streaming *******/ +#ifndef XXH_NO_STREAM +/* + * Streaming requires state maintenance. + * This operation costs memory and CPU. + * As a consequence, streaming is slower than one-shot hashing. + * For better performance, prefer one-shot functions whenever applicable. + */ + +/*! + * @brief The opaque state struct for the XXH3 streaming API. + * + * @see XXH3_state_s for details. + * @see @ref streaming_example "Streaming Example" + */ +typedef struct XXH3_state_s XXH3_state_t; +XXH_PUBLIC_API XXH_MALLOCF XXH3_state_t* XXH3_createState(void); +XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr); + +/*! + * @brief Copies one @ref XXH3_state_t to another. + * + * @param dst_state The state to copy to. + * @param src_state The state to copy from. + * @pre + * @p dst_state and @p src_state must not be `NULL` and must not overlap. + */ +XXH_PUBLIC_API void XXH3_copyState(XXH_NOESCAPE XXH3_state_t* dst_state, XXH_NOESCAPE const XXH3_state_t* src_state); + +/*! + * @brief Resets an @ref XXH3_state_t to begin a new hash. + * + * @param statePtr The state struct to reset. + * + * @pre + * @p statePtr must not be `NULL`. + * + * @return @ref XXH_OK on success. + * @return @ref XXH_ERROR on failure. + * + * @note + * - This function resets `statePtr` and generate a secret with default parameters. + * - Call this function before @ref XXH3_64bits_update(). 
+ * - Digest will be equivalent to `XXH3_64bits()`. + * + * @see @ref streaming_example "Streaming Example" + * + */ +XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset(XXH_NOESCAPE XXH3_state_t* statePtr); + +/*! + * @brief Resets an @ref XXH3_state_t with 64-bit seed to begin a new hash. + * + * @param statePtr The state struct to reset. + * @param seed The 64-bit seed to alter the hash result predictably. + * + * @pre + * @p statePtr must not be `NULL`. + * + * @return @ref XXH_OK on success. + * @return @ref XXH_ERROR on failure. + * + * @note + * - This function resets `statePtr` and generate a secret from `seed`. + * - Call this function before @ref XXH3_64bits_update(). + * - Digest will be equivalent to `XXH3_64bits_withSeed()`. + * + * @see @ref streaming_example "Streaming Example" + * + */ +XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH64_hash_t seed); + +/*! + * @brief Resets an @ref XXH3_state_t with secret data to begin a new hash. + * + * @param statePtr The state struct to reset. + * @param secret The secret data. + * @param secretSize The length of @p secret, in bytes. + * + * @pre + * @p statePtr must not be `NULL`. + * + * @return @ref XXH_OK on success. + * @return @ref XXH_ERROR on failure. + * + * @note + * `secret` is referenced, it _must outlive_ the hash streaming session. + * + * Similar to one-shot API, `secretSize` must be >= @ref XXH3_SECRET_SIZE_MIN, + * and the quality of produced hash values depends on secret's entropy + * (secret's content should look like a bunch of random bytes). + * When in doubt about the randomness of a candidate `secret`, + * consider employing `XXH3_generateSecret()` instead (see below). + * + * @see @ref streaming_example "Streaming Example" + */ +XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSecret(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize); + +/*! + * @brief Consumes a block of @p input to an @ref XXH3_state_t. 
+ * + * @param statePtr The state struct to update. + * @param input The block of data to be hashed, at least @p length bytes in size. + * @param length The length of @p input, in bytes. + * + * @pre + * @p statePtr must not be `NULL`. + * @pre + * The memory between @p input and @p input + @p length must be valid, + * readable, contiguous memory. However, if @p length is `0`, @p input may be + * `NULL`. In C++, this also must be *TriviallyCopyable*. + * + * @return @ref XXH_OK on success. + * @return @ref XXH_ERROR on failure. + * + * @note Call this to incrementally consume blocks of data. + * + * @see @ref streaming_example "Streaming Example" + */ +XXH_PUBLIC_API XXH_errorcode XXH3_64bits_update (XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* input, size_t length); + +/*! + * @brief Returns the calculated XXH3 64-bit hash value from an @ref XXH3_state_t. + * + * @param statePtr The state struct to calculate the hash from. + * + * @pre + * @p statePtr must not be `NULL`. + * + * @return The calculated XXH3 64-bit hash value from that state. + * + * @note + * Calling XXH3_64bits_digest() will not affect @p statePtr, so you can update, + * digest, and update again. + * + * @see @ref streaming_example "Streaming Example" + */ +XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits_digest (XXH_NOESCAPE const XXH3_state_t* statePtr); +#endif /* !XXH_NO_STREAM */ + +/* note : canonical representation of XXH3 is the same as XXH64 + * since they both produce XXH64_hash_t values */ + + +/*-********************************************************************** +* XXH3 128-bit variant +************************************************************************/ + +/*! + * @brief The return value from 128-bit hashes. + * + * Stored in little endian order, although the fields themselves are in native + * endianness. + */ +typedef struct { + XXH64_hash_t low64; /*!< `value & 0xFFFFFFFFFFFFFFFF` */ + XXH64_hash_t high64; /*!< `value >> 64` */ +} XXH128_hash_t; + +/*! 
+ * @brief Calculates 128-bit unseeded variant of XXH3 of @p data. + * + * @param data The block of data to be hashed, at least @p len bytes in size. + * @param len The length of @p data, in bytes. + * + * @return The calculated 128-bit variant of XXH3 value. + * + * The 128-bit variant of XXH3 has more strength, but it has a bit of overhead + * for shorter inputs. + * + * This is equivalent to @ref XXH3_128bits_withSeed() with a seed of `0`, however + * it may have slightly better performance due to constant propagation of the + * defaults. + * + * @see XXH3_128bits_withSeed(), XXH3_128bits_withSecret(): other seeding variants + * @see @ref single_shot_example "Single Shot Example" for an example. + */ +XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits(XXH_NOESCAPE const void* data, size_t len); +/*! @brief Calculates 128-bit seeded variant of XXH3 hash of @p data. + * + * @param data The block of data to be hashed, at least @p len bytes in size. + * @param len The length of @p data, in bytes. + * @param seed The 64-bit seed to alter the hash result predictably. + * + * @return The calculated 128-bit variant of XXH3 value. + * + * @note + * seed == 0 produces the same results as @ref XXH3_128bits(). + * + * This variant generates a custom secret on the fly based on default secret + * altered using the @p seed value. + * + * While this operation is decently fast, note that it's not completely free. + * + * @see XXH3_128bits(), XXH3_128bits_withSecret(): other seeding variants + * @see @ref single_shot_example "Single Shot Example" for an example. + */ +XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_withSeed(XXH_NOESCAPE const void* data, size_t len, XXH64_hash_t seed); +/*! + * @brief Calculates 128-bit variant of XXH3 with a custom "secret". + * + * @param data The block of data to be hashed, at least @p len bytes in size. + * @param len The length of @p data, in bytes. + * @param secret The secret data. 
+ * @param secretSize The length of @p secret, in bytes. + * + * @return The calculated 128-bit variant of XXH3 value. + * + * It's possible to provide any blob of bytes as a "secret" to generate the hash. + * This makes it more difficult for an external actor to prepare an intentional collision. + * The main condition is that @p secretSize *must* be large enough (>= @ref XXH3_SECRET_SIZE_MIN). + * However, the quality of the secret impacts the dispersion of the hash algorithm. + * Therefore, the secret _must_ look like a bunch of random bytes. + * Avoid "trivial" or structured data such as repeated sequences or a text document. + * Whenever in doubt about the "randomness" of the blob of bytes, + * consider employing @ref XXH3_generateSecret() instead (see below). + * It will generate a proper high entropy secret derived from the blob of bytes. + * Another advantage of using XXH3_generateSecret() is that + * it guarantees that all bits within the initial blob of bytes + * will impact every bit of the output. + * This is not necessarily the case when using the blob of bytes directly + * because, when hashing _small_ inputs, only a portion of the secret is employed. + * + * @see @ref single_shot_example "Single Shot Example" for an example. + */ +XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_withSecret(XXH_NOESCAPE const void* data, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize); + +/******* Streaming *******/ +#ifndef XXH_NO_STREAM +/* + * Streaming requires state maintenance. + * This operation costs memory and CPU. + * As a consequence, streaming is slower than one-shot hashing. + * For better performance, prefer one-shot functions whenever applicable. + * + * XXH3_128bits uses the same XXH3_state_t as XXH3_64bits(). + * Use already declared XXH3_createState() and XXH3_freeState(). + * + * All reset and streaming functions have same meaning as their 64-bit counterpart. + */ + +/*! + * @brief Resets an @ref XXH3_state_t to begin a new hash. 
+ * + * @param statePtr The state struct to reset. + * + * @pre + * @p statePtr must not be `NULL`. + * + * @return @ref XXH_OK on success. + * @return @ref XXH_ERROR on failure. + * + * @note + * - This function resets `statePtr` and generate a secret with default parameters. + * - Call it before @ref XXH3_128bits_update(). + * - Digest will be equivalent to `XXH3_128bits()`. + * + * @see @ref streaming_example "Streaming Example" + */ +XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset(XXH_NOESCAPE XXH3_state_t* statePtr); + +/*! + * @brief Resets an @ref XXH3_state_t with 64-bit seed to begin a new hash. + * + * @param statePtr The state struct to reset. + * @param seed The 64-bit seed to alter the hash result predictably. + * + * @pre + * @p statePtr must not be `NULL`. + * + * @return @ref XXH_OK on success. + * @return @ref XXH_ERROR on failure. + * + * @note + * - This function resets `statePtr` and generate a secret from `seed`. + * - Call it before @ref XXH3_128bits_update(). + * - Digest will be equivalent to `XXH3_128bits_withSeed()`. + * + * @see @ref streaming_example "Streaming Example" + */ +XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH64_hash_t seed); +/*! + * @brief Resets an @ref XXH3_state_t with secret data to begin a new hash. + * + * @param statePtr The state struct to reset. + * @param secret The secret data. + * @param secretSize The length of @p secret, in bytes. + * + * @pre + * @p statePtr must not be `NULL`. + * + * @return @ref XXH_OK on success. + * @return @ref XXH_ERROR on failure. + * + * `secret` is referenced, it _must outlive_ the hash streaming session. + * Similar to one-shot API, `secretSize` must be >= @ref XXH3_SECRET_SIZE_MIN, + * and the quality of produced hash values depends on secret's entropy + * (secret's content should look like a bunch of random bytes). 
+ * When in doubt about the randomness of a candidate `secret`, + * consider employing `XXH3_generateSecret()` instead (see below). + * + * @see @ref streaming_example "Streaming Example" + */ +XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSecret(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize); + +/*! + * @brief Consumes a block of @p input to an @ref XXH3_state_t. + * + * Call this to incrementally consume blocks of data. + * + * @param statePtr The state struct to update. + * @param input The block of data to be hashed, at least @p length bytes in size. + * @param length The length of @p input, in bytes. + * + * @pre + * @p statePtr must not be `NULL`. + * + * @return @ref XXH_OK on success. + * @return @ref XXH_ERROR on failure. + * + * @note + * The memory between @p input and @p input + @p length must be valid, + * readable, contiguous memory. However, if @p length is `0`, @p input may be + * `NULL`. In C++, this also must be *TriviallyCopyable*. + * + */ +XXH_PUBLIC_API XXH_errorcode XXH3_128bits_update (XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* input, size_t length); + +/*! + * @brief Returns the calculated XXH3 128-bit hash value from an @ref XXH3_state_t. + * + * @param statePtr The state struct to calculate the hash from. + * + * @pre + * @p statePtr must not be `NULL`. + * + * @return The calculated XXH3 128-bit hash value from that state. + * + * @note + * Calling XXH3_128bits_digest() will not affect @p statePtr, so you can update, + * digest, and update again. + * + */ +XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_digest (XXH_NOESCAPE const XXH3_state_t* statePtr); +#endif /* !XXH_NO_STREAM */ + +/* The following helper functions make it possible to compare XXH128_hash_t values. + * Since XXH128_hash_t is a structure, this capability is not offered by the language. + * Note: For better performance, these functions can be inlined using XXH_INLINE_ALL */ + +/*! 
+ * @brief Check equality of two XXH128_hash_t values + * + * @param h1 The 128-bit hash value. + * @param h2 Another 128-bit hash value. + * + * @return `1` if `h1` and `h2` are equal. + * @return `0` if they are not. + */ +XXH_PUBLIC_API XXH_PUREF int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2); + +/*! + * @brief Compares two @ref XXH128_hash_t + * + * This comparator is compatible with stdlib's `qsort()`/`bsearch()`. + * + * @param h128_1 Left-hand side value + * @param h128_2 Right-hand side value + * + * @return >0 if @p h128_1 > @p h128_2 + * @return =0 if @p h128_1 == @p h128_2 + * @return <0 if @p h128_1 < @p h128_2 + */ +XXH_PUBLIC_API XXH_PUREF int XXH128_cmp(XXH_NOESCAPE const void* h128_1, XXH_NOESCAPE const void* h128_2); + + +/******* Canonical representation *******/ +typedef struct { unsigned char digest[sizeof(XXH128_hash_t)]; } XXH128_canonical_t; + + +/*! + * @brief Converts an @ref XXH128_hash_t to a big endian @ref XXH128_canonical_t. + * + * @param dst The @ref XXH128_canonical_t pointer to be stored to. + * @param hash The @ref XXH128_hash_t to be converted. + * + * @pre + * @p dst must not be `NULL`. + * @see @ref canonical_representation_example "Canonical Representation Example" + */ +XXH_PUBLIC_API void XXH128_canonicalFromHash(XXH_NOESCAPE XXH128_canonical_t* dst, XXH128_hash_t hash); + +/*! + * @brief Converts an @ref XXH128_canonical_t to a native @ref XXH128_hash_t. + * + * @param src The @ref XXH128_canonical_t to convert. + * + * @pre + * @p src must not be `NULL`. + * + * @return The converted hash. + * @see @ref canonical_representation_example "Canonical Representation Example" + */ +XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH128_hashFromCanonical(XXH_NOESCAPE const XXH128_canonical_t* src); + + +#endif /* !XXH_NO_XXH3 */ +#endif /* XXH_NO_LONG_LONG */ + +/*! 
+ * @} + */ +#endif /* XXHASH_H_5627135585666179 */ + + + +#if defined(XXH_STATIC_LINKING_ONLY) && !defined(XXHASH_H_STATIC_13879238742) +#define XXHASH_H_STATIC_13879238742 +/* **************************************************************************** + * This section contains declarations which are not guaranteed to remain stable. + * They may change in future versions, becoming incompatible with a different + * version of the library. + * These declarations should only be used with static linking. + * Never use them in association with dynamic linking! + ***************************************************************************** */ + +/* + * These definitions are only present to allow static allocation + * of XXH states, on stack or in a struct, for example. + * Never **ever** access their members directly. + */ + +/*! + * @internal + * @brief Structure for XXH32 streaming API. + * + * @note This is only defined when @ref XXH_STATIC_LINKING_ONLY, + * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined. Otherwise it is + * an opaque type. This allows fields to safely be changed. + * + * Typedef'd to @ref XXH32_state_t. + * Do not access the members of this struct directly. + * @see XXH64_state_s, XXH3_state_s + */ +struct XXH32_state_s { + XXH32_hash_t total_len_32; /*!< Total length hashed, modulo 2^32 */ + XXH32_hash_t large_len; /*!< Whether the hash is >= 16 (handles @ref total_len_32 overflow) */ + XXH32_hash_t v[4]; /*!< Accumulator lanes */ + XXH32_hash_t mem32[4]; /*!< Internal buffer for partial reads. Treated as unsigned char[16]. */ + XXH32_hash_t memsize; /*!< Amount of data in @ref mem32 */ + XXH32_hash_t reserved; /*!< Reserved field. Do not read nor write to it. */ +}; /* typedef'd to XXH32_state_t */ + + +#ifndef XXH_NO_LONG_LONG /* defined when there is no 64-bit support */ + +/*! + * @internal + * @brief Structure for XXH64 streaming API. 
+ * + * @note This is only defined when @ref XXH_STATIC_LINKING_ONLY, + * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined. Otherwise it is + * an opaque type. This allows fields to safely be changed. + * + * Typedef'd to @ref XXH64_state_t. + * Do not access the members of this struct directly. + * @see XXH32_state_s, XXH3_state_s + */ +struct XXH64_state_s { + XXH64_hash_t total_len; /*!< Total length hashed. This is always 64-bit. */ + XXH64_hash_t v[4]; /*!< Accumulator lanes */ + XXH64_hash_t mem64[4]; /*!< Internal buffer for partial reads. Treated as unsigned char[32]. */ + XXH32_hash_t memsize; /*!< Amount of data in @ref mem64 */ + XXH32_hash_t reserved32; /*!< Reserved field, needed for padding anyway */ + XXH64_hash_t reserved64; /*!< Reserved field. Do not read or write to it. */ +}; /* typedef'd to XXH64_state_t */ + +#ifndef XXH_NO_XXH3 + +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) /* >= C11 */ +# include <stdalign.h> /* provides the alignas macro in C11 */ +# define XXH_ALIGN(n) alignas(n) +#elif defined(__cplusplus) && (__cplusplus >= 201103L) /* >= C++11 */ +/* In C++ alignas() is a keyword */ +# define XXH_ALIGN(n) alignas(n) +#elif defined(__GNUC__) +# define XXH_ALIGN(n) __attribute__ ((aligned(n))) +#elif defined(_MSC_VER) +# define XXH_ALIGN(n) __declspec(align(n)) +#else +# define XXH_ALIGN(n) /* disabled */ +#endif + +/* Old GCC versions only accept the attribute after the type in structures. */ +#if !(defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)) /* C11+ */ \ + && ! (defined(__cplusplus) && (__cplusplus >= 201103L)) /* >= C++11 */ \ + && defined(__GNUC__) +# define XXH_ALIGN_MEMBER(align, type) type XXH_ALIGN(align) +#else +# define XXH_ALIGN_MEMBER(align, type) XXH_ALIGN(align) type +#endif + +/*! + * @brief The size of the internal XXH3 buffer. + * + * This is the optimal update size for incremental hashing. + * + * @see XXH3_64bits_update(), XXH3_128bits_update(). + */ +#define XXH3_INTERNALBUFFER_SIZE 256 + +/*! 
+ * @internal + * @brief Default size of the secret buffer (and @ref XXH3_kSecret). + * + * This is the size used in @ref XXH3_kSecret and the seeded functions. + * + * Not to be confused with @ref XXH3_SECRET_SIZE_MIN. + */ +#define XXH3_SECRET_DEFAULT_SIZE 192 + +/*! + * @internal + * @brief Structure for XXH3 streaming API. + * + * @note This is only defined when @ref XXH_STATIC_LINKING_ONLY, + * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined. + * Otherwise it is an opaque type. + * Never use this definition in combination with a dynamic library. + * This allows fields to safely be changed in the future. + * + * @note ** This structure has a strict alignment requirement of 64 bytes!! ** + * Do not allocate this with `malloc()` or `new`, + * it will not be sufficiently aligned. + * Use @ref XXH3_createState() and @ref XXH3_freeState(), or stack allocation. + * + * Typedef'd to @ref XXH3_state_t. + * Never access the members of this struct directly. + * + * @see XXH3_INITSTATE() for stack initialization. + * @see XXH3_createState(), XXH3_freeState(). + * @see XXH32_state_s, XXH64_state_s + */ +struct XXH3_state_s { + XXH_ALIGN_MEMBER(64, XXH64_hash_t acc[8]); + /*!< The 8 accumulators. See @ref XXH32_state_s::v and @ref XXH64_state_s::v */ + XXH_ALIGN_MEMBER(64, unsigned char customSecret[XXH3_SECRET_DEFAULT_SIZE]); + /*!< Used to store a custom secret generated from a seed. */ + XXH_ALIGN_MEMBER(64, unsigned char buffer[XXH3_INTERNALBUFFER_SIZE]); + /*!< The internal buffer. @see XXH32_state_s::mem32 */ + XXH32_hash_t bufferedSize; + /*!< The amount of memory in @ref buffer, @see XXH32_state_s::memsize */ + XXH32_hash_t useSeed; + /*!< Reserved field. Needed for padding on 64-bit. */ + size_t nbStripesSoFar; + /*!< Number of stripes processed. */ + XXH64_hash_t totalLen; + /*!< Total length hashed. 64-bit even on 32-bit targets. */ + size_t nbStripesPerBlock; + /*!< Number of stripes per block.
*/ + size_t secretLimit; + /*!< Size of @ref customSecret or @ref extSecret */ + XXH64_hash_t seed; + /*!< Seed for _withSeed variants. Must be zero otherwise, @see XXH3_INITSTATE() */ + XXH64_hash_t reserved64; + /*!< Reserved field. */ + const unsigned char* extSecret; + /*!< Reference to an external secret for the _withSecret variants, NULL + * for other variants. */ + /* note: there may be some padding at the end due to alignment on 64 bytes */ +}; /* typedef'd to XXH3_state_t */ + +#undef XXH_ALIGN_MEMBER + +/*! + * @brief Initializes a stack-allocated `XXH3_state_s`. + * + * When the @ref XXH3_state_t structure is merely emplaced on stack, + * it should be initialized with XXH3_INITSTATE() or a memset() + * in case its first reset uses XXH3_NNbits_reset_withSeed(). + * This init can be omitted if the first reset uses default or _withSecret mode. + * This operation isn't necessary when the state is created with XXH3_createState(). + * Note that this doesn't prepare the state for a streaming operation, + * it's still necessary to use XXH3_NNbits_reset*() afterwards. + */ +#define XXH3_INITSTATE(XXH3_state_ptr) \ + do { \ + XXH3_state_t* tmp_xxh3_state_ptr = (XXH3_state_ptr); \ + tmp_xxh3_state_ptr->seed = 0; \ + tmp_xxh3_state_ptr->extSecret = NULL; \ + } while(0) + + +/*! + * @brief Calculates the 128-bit hash of @p data using XXH3. + * + * @param data The block of data to be hashed, at least @p len bytes in size. + * @param len The length of @p data, in bytes. + * @param seed The 64-bit seed to alter the hash's output predictably. + * + * @pre + * The memory between @p data and @p data + @p len must be valid, + * readable, contiguous memory. However, if @p len is `0`, @p data may be + * `NULL`. In C++, this also must be *TriviallyCopyable*. + * + * @return The calculated 128-bit XXH3 value. + * + * @see @ref single_shot_example "Single Shot Example" for an example. 
+ */ +XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH128(XXH_NOESCAPE const void* data, size_t len, XXH64_hash_t seed); + + +/* === Experimental API === */ +/* Symbols defined below must be considered tied to a specific library version. */ + +/*! + * @brief Derive a high-entropy secret from any user-defined content, named customSeed. + * + * @param secretBuffer A writable buffer for derived high-entropy secret data. + * @param secretSize Size of secretBuffer, in bytes. Must be >= @ref XXH3_SECRET_SIZE_MIN. + * @param customSeed A user-defined content. + * @param customSeedSize Size of customSeed, in bytes. + * + * @return @ref XXH_OK on success. + * @return @ref XXH_ERROR on failure. + * + * The generated secret can be used in combination with `*_withSecret()` functions. + * The `_withSecret()` variants are useful to provide a higher level of protection + * than 64-bit seed, as it becomes much more difficult for an external actor to + * guess how to impact the calculation logic. + * + * The function accepts as input a custom seed of any length and any content, + * and derives from it a high-entropy secret of length @p secretSize into an + * already allocated buffer @p secretBuffer. + * + * The generated secret can then be used with any `*_withSecret()` variant. + * The functions @ref XXH3_128bits_withSecret(), @ref XXH3_64bits_withSecret(), + * @ref XXH3_128bits_reset_withSecret() and @ref XXH3_64bits_reset_withSecret() + * are part of this list. They all accept a `secret` parameter + * which must be large enough for implementation reasons (>= @ref XXH3_SECRET_SIZE_MIN) + * _and_ feature very high entropy (consist of random-looking bytes). + * These conditions can be a high bar to meet, so @ref XXH3_generateSecret() can + * be employed to ensure proper quality. + * + * @p customSeed can be anything. It can have any size, even small ones, + * and its content can be anything, even "poor entropy" sources such as a bunch + * of zeroes.
The resulting `secret` will nonetheless provide all required qualities. + * + * @pre + * - @p secretSize must be >= @ref XXH3_SECRET_SIZE_MIN + * - When @p customSeedSize > 0, supplying NULL as customSeed is undefined behavior. + * + * Example code: + * @code{.c} + * #include <stdio.h> + * #include <stdlib.h> + * #include <string.h> + * #define XXH_STATIC_LINKING_ONLY // expose unstable API + * #include "xxhash.h" + * // Hashes argv[2] using the entropy from argv[1]. + * int main(int argc, char* argv[]) + * { + * char secret[XXH3_SECRET_SIZE_MIN]; + * if (argc != 3) { return 1; } + * XXH3_generateSecret(secret, sizeof(secret), argv[1], strlen(argv[1])); + * XXH64_hash_t h = XXH3_64bits_withSecret( + * argv[2], strlen(argv[2]), + * secret, sizeof(secret) + * ); + * printf("%016llx\n", (unsigned long long) h); + * } + * @endcode + */ +XXH_PUBLIC_API XXH_errorcode XXH3_generateSecret(XXH_NOESCAPE void* secretBuffer, size_t secretSize, XXH_NOESCAPE const void* customSeed, size_t customSeedSize); + +/*! + * @brief Generate the same secret as the _withSeed() variants. + * + * @param secretBuffer A writable buffer of @ref XXH3_SECRET_SIZE_MIN bytes + * (NOTE(review): @ref XXH3_SECRET_DEFAULT_SIZE is documented as the size used by + * the seeded functions; confirm that a XXH3_SECRET_SIZE_MIN-byte buffer is large + * enough here.) + * @param seed The 64-bit seed to alter the hash result predictably. + * + * The generated secret can be used in combination with + *`*_withSecret()` and `_withSecretandSeed()` variants. + * + * Example C++ `std::string` hash class: + * @code{.cpp} + * #include <string> + * #define XXH_STATIC_LINKING_ONLY // expose unstable API + * #include "xxhash.h" + * // Slow, seeds each time + * class HashSlow { + * XXH64_hash_t seed; + * public: + * HashSlow(XXH64_hash_t s) : seed{s} {} + * size_t operator()(const std::string& x) const { + * return size_t{XXH3_64bits_withSeed(x.c_str(), x.length(), seed)}; + * } + * }; + * // Fast, caches the seeded secret for future uses.
+ * class HashFast { + * unsigned char secret[XXH3_SECRET_SIZE_MIN]; + * public: + * HashFast(XXH64_hash_t s) { + * XXH3_generateSecret_fromSeed(secret, s); + * } + * size_t operator()(const std::string& x) const { + * return size_t{ + * XXH3_64bits_withSecret(x.c_str(), x.length(), secret, sizeof(secret)) + * }; + * } + * }; + * @endcode + */ +XXH_PUBLIC_API void XXH3_generateSecret_fromSeed(XXH_NOESCAPE void* secretBuffer, XXH64_hash_t seed); + +/*! + * @brief Calculates 64/128-bit seeded variant of XXH3 hash of @p data. + * + * @param data The block of data to be hashed, at least @p len bytes in size. + * @param len The length of @p data, in bytes. + * @param secret The secret data. + * @param secretSize The length of @p secret, in bytes. + * @param seed The 64-bit seed to alter the hash result predictably. + * + * These variants generate hash values using either + * @p seed for "short" keys (< @ref XXH3_MIDSIZE_MAX = 240 bytes) + * or @p secret for "large" keys (>= @ref XXH3_MIDSIZE_MAX). + * + * This generally benefits speed, compared to `_withSeed()` or `_withSecret()`. + * `_withSeed()` has to generate the secret on the fly for "large" keys. + * It's fast, but can be perceptible for "not so large" keys (< 1 KB). + * `_withSecret()` has to generate the masks on the fly for "small" keys, + * which requires more instructions than _withSeed() variants. + * Therefore, _withSecretandSeed variant combines the best of both worlds. + * + * When @p secret has been generated by XXH3_generateSecret_fromSeed(), + * this variant produces *exactly* the same results as `_withSeed()` variant, + * hence offering only a pure speed benefit on "large" input, + * by skipping the need to regenerate the secret for every large input. + * + * Another usage scenario is to hash the secret to a 64-bit hash value, + * for example with XXH3_64bits(), which then becomes the seed, + * and then employ both the seed and the secret in _withSecretandSeed().
+ * On top of speed, an added benefit is that each bit in the secret + * has a 50% chance to swap each bit in the output, via its impact to the seed. + * + * This is not guaranteed when using the secret directly in "small data" scenarios, + * because only portions of the secret are employed for small data. + */ +XXH_PUBLIC_API XXH_PUREF XXH64_hash_t +XXH3_64bits_withSecretandSeed(XXH_NOESCAPE const void* data, size_t len, + XXH_NOESCAPE const void* secret, size_t secretSize, + XXH64_hash_t seed); +/*! + * @brief Calculates 128-bit seeded variant of XXH3 hash of @p input. + * + * @param input The block of data to be hashed, at least @p length bytes in size. + * @param length The length of @p input, in bytes. + * @param secret The secret data. + * @param secretSize The length of @p secret, in bytes. + * @param seed64 The 64-bit seed to alter the hash result predictably. + * + * @return The calculated 128-bit XXH3 hash value. + * + * @see XXH3_64bits_withSecretandSeed() + */ +XXH_PUBLIC_API XXH_PUREF XXH128_hash_t +XXH3_128bits_withSecretandSeed(XXH_NOESCAPE const void* input, size_t length, + XXH_NOESCAPE const void* secret, size_t secretSize, + XXH64_hash_t seed64); +#ifndef XXH_NO_STREAM +/*! + * @brief Resets an @ref XXH3_state_t with secret data to begin a new hash.
+ * + * @param statePtr A pointer to an @ref XXH3_state_t allocated with @ref XXH3_createState(). + * @param secret The secret data. + * @param secretSize The length of @p secret, in bytes. + * @param seed64 The 64-bit seed to alter the hash result predictably. + * + * @return @ref XXH_OK on success. + * @return @ref XXH_ERROR on failure. + * + * @see XXH3_64bits_withSecretandSeed() + */ +XXH_PUBLIC_API XXH_errorcode +XXH3_128bits_reset_withSecretandSeed(XXH_NOESCAPE XXH3_state_t* statePtr, + XXH_NOESCAPE const void* secret, size_t secretSize, + XXH64_hash_t seed64); +#endif /* !XXH_NO_STREAM */ + +#endif /* !XXH_NO_XXH3 */ +#endif /* XXH_NO_LONG_LONG */ +#if defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API) +# define XXH_IMPLEMENTATION +#endif + +#endif /* defined(XXH_STATIC_LINKING_ONLY) && !defined(XXHASH_H_STATIC_13879238742) */ + + +/* ======================================================================== */ +/* ======================================================================== */ +/* ======================================================================== */ + + +/*-********************************************************************** + * xxHash implementation + *-********************************************************************** + * xxHash's implementation used to be hosted inside xxhash.c. + * + * However, inlining requires implementation to be visible to the compiler, + * hence be included alongside the header. + * Previously, implementation was hosted inside xxhash.c, + * which was then #included when inlining was activated. + * This construction created issues with a few build and install systems, + * as it required xxhash.c to be stored in /include directory. + * + * xxHash implementation is now directly integrated within xxhash.h. + * As a consequence, xxhash.c is no longer needed in /include. + * + * xxhash.c is still available and is still useful. 
+ * In a "normal" setup, when xxhash is not inlined, + * xxhash.h only exposes the prototypes and public symbols, + * while xxhash.c can be built into an object file xxhash.o + * which can then be linked into the final binary. + ************************************************************************/ + +#if ( defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API) \ + || defined(XXH_IMPLEMENTATION) ) && !defined(XXH_IMPLEM_13a8737387) +# define XXH_IMPLEM_13a8737387 + +/* ************************************* +* Tuning parameters +***************************************/ + +/*! + * @defgroup tuning Tuning parameters + * @{ + * + * Various macros to control xxHash's behavior. + */ +#ifdef XXH_DOXYGEN +/*! + * @brief Define this to disable 64-bit code. + * + * Useful if only using the @ref XXH32_family and you have a strict C90 compiler. + */ +# define XXH_NO_LONG_LONG +# undef XXH_NO_LONG_LONG /* don't actually */ +/*! + * @brief Controls how unaligned memory is accessed. + * + * By default, access to unaligned memory is controlled by `memcpy()`, which is + * safe and portable. + * + * Unfortunately, on some target/compiler combinations, the generated assembly + * is sub-optimal. + * + * The switch below allows selection of a different access method + * in the search for improved performance. + * + * @par Possible options: + * + * - `XXH_FORCE_MEMORY_ACCESS=0` (default, unless this header auto-selects + * method 1 for GCC-compatible compilers): `memcpy` + * @par + * Use `memcpy()`. Safe and portable. Note that most modern compilers will + * eliminate the function call and treat it as an unaligned access. + * + * - `XXH_FORCE_MEMORY_ACCESS=1`: `__attribute__((aligned(1)))` + * @par + * Depends on compiler extensions and is therefore not portable. + * This method is safe _if_ your compiler supports it, + * and *generally* as fast or faster than `memcpy`. + * + * - `XXH_FORCE_MEMORY_ACCESS=2`: Direct cast + * @par + * Casts directly and dereferences.
This method doesn't depend on the + * compiler, but it violates the C standard as it directly dereferences an + * unaligned pointer. It can generate buggy code on targets which do not + * support unaligned memory accesses, but in some circumstances, it's the + * only known way to get the most performance. + * + * - `XXH_FORCE_MEMORY_ACCESS=3`: Byteshift + * @par + * Also portable. This can generate the best code on old compilers which don't + * inline small `memcpy()` calls, and it might also be faster on big-endian + * systems which lack a native byteswap instruction. However, some compilers + * will emit literal byteshifts even if the target supports unaligned access. + * + * + * @warning + * Methods 1 and 2 rely on implementation-defined behavior. Use these with + * care, as what works on one compiler/platform/optimization level may cause + * another to read garbage data or even crash. + * + * See https://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html for details. + * + * Prefer these methods in priority order (0 > 3 > 1 > 2) + */ +# define XXH_FORCE_MEMORY_ACCESS 0 + +/*! + * @def XXH_SIZE_OPT + * @brief Controls how much xxHash optimizes for size. + * + * xxHash, when compiled, tends to result in a rather large binary size. This + * is mostly due to heavy use of forced inlining and constant folding of the + * @ref XXH3_family to increase performance. + * + * However, some developers prefer size over speed. This option can + * significantly reduce the size of the generated code. When using the `-Os` + * or `-Oz` options on GCC or Clang, this is defined to 1 by default, + * otherwise it is defined to 0. + * + * Most of these size optimizations can be controlled manually. + * + * This is a number from 0-2. + * - `XXH_SIZE_OPT` == 0: Default. xxHash makes no size optimizations. Speed + * comes first. + * - `XXH_SIZE_OPT` == 1: Default for `-Os` and `-Oz`. xxHash is more + * conservative and disables hacks that increase code size.
It implies the + * options @ref XXH_NO_INLINE_HINTS == 1, @ref XXH_FORCE_ALIGN_CHECK == 0, + * and @ref XXH3_NEON_LANES == 8 if they are not already defined. + * - `XXH_SIZE_OPT` == 2: xxHash tries to make itself as small as possible. + * Performance may cry. For example, the single shot functions just use the + * streaming API. + */ +# define XXH_SIZE_OPT 0 + +/*! + * @def XXH_FORCE_ALIGN_CHECK + * @brief If defined to non-zero, adds a special path for aligned inputs (XXH32() + * and XXH64() only). + * + * This is an important performance trick for architectures without decent + * unaligned memory access performance. + * + * It checks for input alignment, and when conditions are met, uses a "fast + * path" employing direct 32-bit/64-bit reads, resulting in _dramatically + * faster_ read speed. + * + * The check costs one initial branch per hash, which is generally negligible, + * but not zero. + * + * Moreover, it's not useful to generate an additional code path if memory + * access uses the same instruction for both aligned and unaligned + * addresses (e.g. x86 and aarch64). + * + * In these cases, the alignment check can be removed by setting this macro to 0. + * Then the code will always use unaligned memory access. + * Align check is automatically disabled on x86, x64, ARM64, and some ARM chips + * which are platforms known to offer good unaligned memory accesses performance. + * + * It is also disabled by default when @ref XXH_SIZE_OPT >= 1. + * + * This option does not affect XXH3 (only XXH32 and XXH64). + */ +# define XXH_FORCE_ALIGN_CHECK 0 + +/*! + * @def XXH_NO_INLINE_HINTS + * @brief When non-zero, sets all functions to `static`. + * + * By default, xxHash tries to force the compiler to inline almost all internal + * functions. + * + * This can usually improve performance due to reduced jumping and improved + * constant folding, but significantly increases the size of the binary which + * might not be favorable. 
+ * + * Additionally, sometimes the forced inlining can be detrimental to performance, + * depending on the architecture. + * + * XXH_NO_INLINE_HINTS marks all internal functions as static, giving the + * compiler full control on whether to inline or not. + * + * When not optimizing (-O0), using `-fno-inline` with GCC or Clang, or if + * @ref XXH_SIZE_OPT >= 1, this will automatically be defined. + */ +# define XXH_NO_INLINE_HINTS 0 + +/*! + * @def XXH3_INLINE_SECRET + * @brief Determines whether to inline the XXH3 withSecret code. + * + * When the secret size is known, the compiler can improve the performance + * of XXH3_64bits_withSecret() and XXH3_128bits_withSecret(). + * + * However, if the secret size is not known, it doesn't have any benefit. This + * happens when xxHash is compiled into a global symbol. Therefore, if + * @ref XXH_INLINE_ALL is *not* defined, this will be defined to 0. + * + * Additionally, this defaults to 0 on GCC 12+, which has an issue with function pointers + * that are *sometimes* force inlined on -Og, and it is impossible to automatically + * detect this optimization level. + */ +# define XXH3_INLINE_SECRET 0 + +/*! + * @def XXH32_ENDJMP + * @brief Whether to use a jump for `XXH32_finalize`. + * + * For performance, `XXH32_finalize` uses multiple branches in the finalizer. + * This is generally preferable for performance, + * but depending on exact architecture, a jmp may be preferable. + * + * This setting can only make a difference for very small inputs. + */ +# define XXH32_ENDJMP 0 + +/*! + * @internal + * @brief Redefines old internal names. + * + * For compatibility with code that uses xxHash's internals before the names + * were changed to improve namespacing. There is no other reason to use this. + */ +# define XXH_OLD_NAMES +# undef XXH_OLD_NAMES /* don't actually use, it is ugly. */ + +/*! + * @def XXH_NO_STREAM + * @brief Disables the streaming API.
+ * + * When xxHash is not inlined and the streaming functions are not used, disabling + * the streaming functions can improve code size significantly, especially with + * the @ref XXH3_family which tends to make constant folded copies of itself. + */ +# define XXH_NO_STREAM +# undef XXH_NO_STREAM /* don't actually */ +#endif /* XXH_DOXYGEN */ +/*! + * @} + */ + +#ifndef XXH_FORCE_MEMORY_ACCESS /* can be defined externally, on command line for example */ + /* prefer __packed__ structures (method 1) for GCC + * < ARMv7 with unaligned access (e.g. Raspbian armhf) still uses byte shifting, so we use memcpy + * which for some reason does unaligned loads. */ +# if defined(__GNUC__) && !(defined(__ARM_ARCH) && __ARM_ARCH < 7 && defined(__ARM_FEATURE_UNALIGNED)) +# define XXH_FORCE_MEMORY_ACCESS 1 +# endif +#endif + +#ifndef XXH_SIZE_OPT + /* default to 1 for -Os or -Oz */ +# if (defined(__GNUC__) || defined(__clang__)) && defined(__OPTIMIZE_SIZE__) +# define XXH_SIZE_OPT 1 +# else +# define XXH_SIZE_OPT 0 +# endif +#endif + +#ifndef XXH_FORCE_ALIGN_CHECK /* can be defined externally */ + /* don't check on sizeopt, x86, aarch64, or arm when unaligned access is available */ +# if XXH_SIZE_OPT >= 1 || \ + defined(__i386) || defined(__x86_64__) || defined(__aarch64__) || defined(__ARM_FEATURE_UNALIGNED) \ + || defined(_M_IX86) || defined(_M_X64) || defined(_M_ARM64) || defined(_M_ARM) /* visual */ +# define XXH_FORCE_ALIGN_CHECK 0 +# else +# define XXH_FORCE_ALIGN_CHECK 1 +# endif +#endif + +#ifndef XXH_NO_INLINE_HINTS +# if XXH_SIZE_OPT >= 1 || defined(__NO_INLINE__) /* -O0, -fno-inline */ +# define XXH_NO_INLINE_HINTS 1 +# else +# define XXH_NO_INLINE_HINTS 0 +# endif +#endif + +#ifndef XXH3_INLINE_SECRET +# if (defined(__GNUC__) && !defined(__clang__) && __GNUC__ >= 12) \ + || !defined(XXH_INLINE_ALL) +# define XXH3_INLINE_SECRET 0 +# else +# define XXH3_INLINE_SECRET 1 +# endif +#endif + +#ifndef XXH32_ENDJMP +/* generally preferable for performance */ +# define 
XXH32_ENDJMP 0 +#endif + +/*! + * @defgroup impl Implementation + * @{ + */ + + +/* ************************************* +* Includes & Memory related functions +***************************************/ +#if defined(XXH_NO_STREAM) +/* nothing */ +#elif defined(XXH_NO_STDLIB) + +/* When requesting to disable any mention of stdlib, + * the library loses the ability to invoke malloc / free. + * In practice, it means that functions like `XXH*_createState()` + * will always fail, and return NULL. + * This flag is useful in situations where + * xxhash.h is integrated into some kernel, embedded or limited environment + * without access to dynamic allocation. + */ + +static XXH_CONSTF void* XXH_malloc(size_t s) { (void)s; return NULL; } +static void XXH_free(void* p) { (void)p; } + +#else + +/* + * Modify the local functions below should you wish to use + * different memory routines for malloc() and free() + */ +#include <stdlib.h> + +/*! + * @internal + * @brief Modify this function to use a different routine than malloc(). + */ +static XXH_MALLOCF void* XXH_malloc(size_t s) { return malloc(s); } + +/*! + * @internal + * @brief Modify this function to use a different routine than free(). + */ +static void XXH_free(void* p) { free(p); } + +#endif /* XXH_NO_STDLIB */ + +#include <string.h> + +/*! + * @internal + * @brief Modify this function to use a different routine than memcpy().
+ */ +static void* XXH_memcpy(void* dest, const void* src, size_t size) +{ + return memcpy(dest,src,size); +} + +#include <limits.h> /* ULLONG_MAX */ + + +/* ************************************* +* Compiler Specific Options +***************************************/ +#ifdef _MSC_VER /* Visual Studio warning fix */ +# pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */ +#endif + +#if XXH_NO_INLINE_HINTS /* disable inlining hints */ +# if defined(__GNUC__) || defined(__clang__) +# define XXH_FORCE_INLINE static __attribute__((unused)) +# else +# define XXH_FORCE_INLINE static +# endif +# define XXH_NO_INLINE static +/* enable inlining hints */ +#elif defined(__GNUC__) || defined(__clang__) +# define XXH_FORCE_INLINE static __inline__ __attribute__((always_inline, unused)) +# define XXH_NO_INLINE static __attribute__((noinline)) +#elif defined(_MSC_VER) /* Visual Studio */ +# define XXH_FORCE_INLINE static __forceinline +# define XXH_NO_INLINE static __declspec(noinline) +#elif defined (__cplusplus) \ + || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)) /* C99 */ +# define XXH_FORCE_INLINE static inline +# define XXH_NO_INLINE static +#else +# define XXH_FORCE_INLINE static +# define XXH_NO_INLINE static +#endif + +#if XXH3_INLINE_SECRET +# define XXH3_WITH_SECRET_INLINE XXH_FORCE_INLINE +#else +# define XXH3_WITH_SECRET_INLINE XXH_NO_INLINE +#endif + + +/* ************************************* +* Debug +***************************************/ +/*! + * @ingroup tuning + * @def XXH_DEBUGLEVEL + * @brief Sets the debugging level. + * + * XXH_DEBUGLEVEL is expected to be defined externally, typically via the + * compiler's command line options. The value must be a number.
+ */ +#ifndef XXH_DEBUGLEVEL +# ifdef DEBUGLEVEL /* backwards compat */ +# define XXH_DEBUGLEVEL DEBUGLEVEL +# else +# define XXH_DEBUGLEVEL 0 +# endif +#endif + +#if (XXH_DEBUGLEVEL>=1) +# include <assert.h> /* note: can still be disabled with NDEBUG */ +# define XXH_ASSERT(c) assert(c) +#else +# if defined(__INTEL_COMPILER) +# define XXH_ASSERT(c) XXH_ASSUME((unsigned char) (c)) +# else +# define XXH_ASSERT(c) XXH_ASSUME(c) +# endif +#endif + +/* note: use after variable declarations */ +#ifndef XXH_STATIC_ASSERT +# if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) /* C11 */ +# define XXH_STATIC_ASSERT_WITH_MESSAGE(c,m) do { _Static_assert((c),m); } while(0) +# elif defined(__cplusplus) && (__cplusplus >= 201103L) /* C++11 */ +# define XXH_STATIC_ASSERT_WITH_MESSAGE(c,m) do { static_assert((c),m); } while(0) +# else +# define XXH_STATIC_ASSERT_WITH_MESSAGE(c,m) do { struct xxh_sa { char x[(c) ? 1 : -1]; }; } while(0) +# endif +# define XXH_STATIC_ASSERT(c) XXH_STATIC_ASSERT_WITH_MESSAGE((c),#c) +#endif + +/*! + * @internal + * @def XXH_COMPILER_GUARD(var) + * @brief Used to prevent unwanted optimizations for @p var. + * + * It uses an empty GCC inline assembly statement with a register constraint + * which forces @p var into a general purpose register (eg eax, ebx, ecx + * on x86) and marks it as modified. + * + * This is used in a few places to avoid unwanted autovectorization (e.g. + * XXH32_round()). All vectorization we want is explicit via intrinsics, + * and _usually_ isn't wanted elsewhere. + * + * We also use it to prevent unwanted constant folding for AArch64 in + * XXH3_initCustomSecret_scalar(). + */ +#if defined(__GNUC__) || defined(__clang__) +# define XXH_COMPILER_GUARD(var) __asm__("" : "+r" (var)) +#else +# define XXH_COMPILER_GUARD(var) ((void)0) +#endif + +/* Specifically for NEON vectors which use the "w" constraint, on + * Clang.
*/ +#if defined(__clang__) && defined(__ARM_ARCH) && !defined(__wasm__) +# define XXH_COMPILER_GUARD_CLANG_NEON(var) __asm__("" : "+w" (var)) +#else +# define XXH_COMPILER_GUARD_CLANG_NEON(var) ((void)0) +#endif + +/* ************************************* +* Basic Types +***************************************/ +#if !defined (__VMS) \ + && (defined (__cplusplus) \ + || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) ) +# include <stdint.h> + typedef uint8_t xxh_u8; +#else + typedef unsigned char xxh_u8; +#endif +typedef XXH32_hash_t xxh_u32; + +#ifdef XXH_OLD_NAMES +# warning "XXH_OLD_NAMES is planned to be removed starting v0.9. If the program depends on it, consider moving away from it by employing newer type names directly" +# define BYTE xxh_u8 +# define U8 xxh_u8 +# define U32 xxh_u32 +#endif + +/* *** Memory access *** */ + +/*! + * @internal + * @fn xxh_u32 XXH_read32(const void* ptr) + * @brief Reads an unaligned 32-bit integer from @p ptr in native endianness. + * + * Affected by @ref XXH_FORCE_MEMORY_ACCESS. + * + * @param ptr The pointer to read from. + * @return The 32-bit native endian integer from the bytes at @p ptr. + */ + +/*! + * @internal + * @fn xxh_u32 XXH_readLE32(const void* ptr) + * @brief Reads an unaligned 32-bit little endian integer from @p ptr. + * + * Affected by @ref XXH_FORCE_MEMORY_ACCESS. + * + * @param ptr The pointer to read from. + * @return The 32-bit little endian integer from the bytes at @p ptr. + */ + +/*! + * @internal + * @fn xxh_u32 XXH_readBE32(const void* ptr) + * @brief Reads an unaligned 32-bit big endian integer from @p ptr. + * + * Affected by @ref XXH_FORCE_MEMORY_ACCESS. + * + * @param ptr The pointer to read from. + * @return The 32-bit big endian integer from the bytes at @p ptr. + */ + +/*! + * @internal + * @fn xxh_u32 XXH_readLE32_align(const void* ptr, XXH_alignment align) + * @brief Like @ref XXH_readLE32(), but has an option for aligned reads. + * + * Affected by @ref XXH_FORCE_MEMORY_ACCESS.
+ * Note that when @ref XXH_FORCE_ALIGN_CHECK == 0, the @p align parameter is + * always @ref XXH_alignment::XXH_unaligned. + * + * @param ptr The pointer to read from. + * @param align Whether @p ptr is aligned. + * @pre + * If @p align == @ref XXH_alignment::XXH_aligned, @p ptr must be 4 byte + * aligned. + * @return The 32-bit little endian integer from the bytes at @p ptr. + */ + +#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3)) +/* + * Manual byteshift. Best for old compilers which don't inline memcpy. + * We actually directly use XXH_readLE32 and XXH_readBE32. + */ +#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2)) + +/* + * Force direct memory access. Only works on CPUs which support unaligned memory + * access in hardware. + */ +static xxh_u32 XXH_read32(const void* memPtr) { return *(const xxh_u32*) memPtr; } + +#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1)) + +/* + * __attribute__((aligned(1))) is supported by gcc and clang. Originally the + * documentation claimed that it only increased the alignment, but actually it + * can decrease it on gcc, clang, and icc: + * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=69502, + * https://gcc.godbolt.org/z/xYez1j67Y. + */ +#ifdef XXH_OLD_NAMES +typedef union { xxh_u32 u32; } __attribute__((packed)) unalign; +#endif +static xxh_u32 XXH_read32(const void* ptr) +{ + typedef __attribute__((aligned(1))) xxh_u32 xxh_unalign32; + return *((const xxh_unalign32*)ptr); +} + +#else + +/* + * Portable and safe solution. Generally efficient. + * see: https://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html + */ +static xxh_u32 XXH_read32(const void* memPtr) +{ + xxh_u32 val; + XXH_memcpy(&val, memPtr, sizeof(val)); + return val; +} + +#endif /* XXH_FORCE_MEMORY_ACCESS */ + + +/* *** Endianness *** */ + +/*! + * @ingroup tuning + * @def XXH_CPU_LITTLE_ENDIAN + * @brief Whether the target is little endian.
+ *
+ * Defined to 1 if the target is little endian, or 0 if it is big endian.
+ * It can be defined externally, for example on the compiler command line.
+ *
+ * If it is not defined,
+ * a runtime check (which is usually constant folded) is used instead.
+ *
+ * @note
+ *   This is not necessarily defined to an integer constant.
+ *
+ * @see XXH_isLittleEndian() for the runtime check.
+ */
+#ifndef XXH_CPU_LITTLE_ENDIAN
+/*
+ * Try to detect endianness automatically, to avoid the nonstandard behavior
+ * in `XXH_isLittleEndian()`
+ */
+#  if defined(_WIN32) /* Windows is always little endian */ \
+     || defined(__LITTLE_ENDIAN__) \
+     || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
+#    define XXH_CPU_LITTLE_ENDIAN 1
+#  elif defined(__BIG_ENDIAN__) \
+     || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+#    define XXH_CPU_LITTLE_ENDIAN 0
+#  else
+/*!
+ * @internal
+ * @brief Runtime check for @ref XXH_CPU_LITTLE_ENDIAN.
+ *
+ * Most compilers will constant fold this.
+ *
+ * @return The first byte in memory of a 32-bit integer holding 1: nonzero
+ *         when the low-order byte is stored first, 0 otherwise.
+ */
+static int XXH_isLittleEndian(void)
+{
+    /*
+     * Portable and well-defined behavior.
+     * Don't use static: it is detrimental to performance.
+     */
+    const union { xxh_u32 u; xxh_u8 c[4]; } one = { 1 };   /* initializes member u to 1 */
+    return one.c[0];
+}
+#   define XXH_CPU_LITTLE_ENDIAN   XXH_isLittleEndian()
+#  endif
+#endif
+
+
+
+
+/* ****************************************
+*  Compiler-specific Functions and Macros
+******************************************/
+#define XXH_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
+
+#ifdef __has_builtin
+#  define XXH_HAS_BUILTIN(x) __has_builtin(x)
+#else
+#  define XXH_HAS_BUILTIN(x) 0   /* no __has_builtin: report every builtin as unavailable */
+#endif
+
+
+
+/*
+ * C23 and future versions have standard "unreachable()".
+ * Once it has been implemented reliably we can add it as an
+ * additional case:
+ *
+ * ```
+ * #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= XXH_C23_VN)
+ * #  include <stddef.h>
+ * #  ifdef unreachable
+ * #    define XXH_UNREACHABLE() unreachable()
+ * #  endif
+ * #endif
+ * ```
+ *
+ * Note C++23 also has std::unreachable() which can be detected
+ * as follows:
+ * ```
+ * #if defined(__cpp_lib_unreachable) && (__cpp_lib_unreachable >= 202202L)
+ * #  include <utility>
+ * #  define XXH_UNREACHABLE() std::unreachable()
+ * #endif
+ * ```
+ * NB: `__cpp_lib_unreachable` is defined in the `<version>` header.
+ * We don't use that as including `<utility>` in `extern "C"` blocks
+ * doesn't work on GCC12
+ */
+
+#if XXH_HAS_BUILTIN(__builtin_unreachable)
+#  define XXH_UNREACHABLE() __builtin_unreachable()
+
+#elif defined(_MSC_VER)
+#  define XXH_UNREACHABLE() __assume(0)
+
+#else   /* no known intrinsic: XXH_UNREACHABLE() expands to nothing */
+#  define XXH_UNREACHABLE()
+#endif
+
+#if XXH_HAS_BUILTIN(__builtin_assume)
+#  define XXH_ASSUME(c) __builtin_assume(c)
+#else   /* fallback: mark the path on which c is false as unreachable */
+#  define XXH_ASSUME(c) if (!(c)) { XXH_UNREACHABLE(); }
+#endif
+
+/*!
+ * @internal
+ * @def XXH_rotl32(x,r)
+ * @brief 32-bit rotate left.
+ *
+ * XXH_rotl64 is defined next to it in every branch below. In the portable
+ * fallback it shifts right by (64 - r), just as XXH_rotl32 shifts right by
+ * (32 - r).
+ *
+ * @param x The 32-bit integer to be rotated.
+ * @param r The number of bits to rotate.
+ * @pre
+ *   @p r > 0 && @p r < 32
+ * @note
+ *   @p x and @p r may be evaluated multiple times.
+ * @return The rotated result.
+ */
+#if !defined(NO_CLANG_BUILTIN) && XXH_HAS_BUILTIN(__builtin_rotateleft32) \
+                               && XXH_HAS_BUILTIN(__builtin_rotateleft64)
+#  define XXH_rotl32 __builtin_rotateleft32
+#  define XXH_rotl64 __builtin_rotateleft64
+/* Note: although _rotl exists for minGW (GCC under windows), performance seems poor */
+#elif defined(_MSC_VER)
+#  define XXH_rotl32(x,r) _rotl(x,r)
+#  define XXH_rotl64(x,r) _rotl64(x,r)
+#else
+#  define XXH_rotl32(x,r) (((x) << (r)) | ((x) >> (32 - (r))))
+#  define XXH_rotl64(x,r) (((x) << (r)) | ((x) >> (64 - (r))))
+#endif
+
+/*!
+ * @internal
+ * @fn xxh_u32 XXH_swap32(xxh_u32 x)
+ * @brief A 32-bit byteswap.
+ *
+ * @param x The 32-bit integer to byteswap.
+ * @return @p x, byteswapped.
+ */
+#if defined(_MSC_VER)     /* Visual Studio */
+#  define XXH_swap32 _byteswap_ulong
+#elif XXH_GCC_VERSION >= 403
+#  define XXH_swap32 __builtin_bswap32
+#else
+static xxh_u32 XXH_swap32 (xxh_u32 x)
+{
+    return  ((x << 24) & 0xff000000 ) |   /* byte 0 -> byte 3 */
+            ((x <<  8) & 0x00ff0000 ) |   /* byte 1 -> byte 2 */
+            ((x >>  8) & 0x0000ff00 ) |   /* byte 2 -> byte 1 */
+            ((x >> 24) & 0x000000ff );    /* byte 3 -> byte 0 */
+}
+#endif
+
+
+/* ***************************
+*  Memory reads
+*****************************/
+
+/*!
+ * @internal
+ * @brief Enum to indicate whether a pointer is aligned.
+ */
+typedef enum {
+    XXH_aligned,  /*!< Aligned */
+    XXH_unaligned /*!< Possibly unaligned */
+} XXH_alignment;
+
+/*
+ * XXH_FORCE_MEMORY_ACCESS==3 is an endian-independent byteshift load.
+ *
+ * This is ideal for older compilers which don't inline memcpy.
+ */
+#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3))
+
+XXH_FORCE_INLINE xxh_u32 XXH_readLE32(const void* memPtr)
+{
+    const xxh_u8* bytePtr = (const xxh_u8 *)memPtr;
+    return bytePtr[0]                        /* lowest address supplies the least significant byte */
+         | ((xxh_u32)bytePtr[1] << 8)
+         | ((xxh_u32)bytePtr[2] << 16)
+         | ((xxh_u32)bytePtr[3] << 24);
+}
+
+XXH_FORCE_INLINE xxh_u32 XXH_readBE32(const void* memPtr)
+{
+    const xxh_u8* bytePtr = (const xxh_u8 *)memPtr;
+    return bytePtr[3]                        /* highest address supplies the least significant byte */
+         | ((xxh_u32)bytePtr[2] << 8)
+         | ((xxh_u32)bytePtr[1] << 16)
+         | ((xxh_u32)bytePtr[0] << 24);
+}
+
+#else
+XXH_FORCE_INLINE xxh_u32 XXH_readLE32(const void* ptr)
+{
+    return XXH_CPU_LITTLE_ENDIAN ? XXH_read32(ptr) : XXH_swap32(XXH_read32(ptr));
+}
+
+static xxh_u32 XXH_readBE32(const void* ptr)
+{
+    return XXH_CPU_LITTLE_ENDIAN ? XXH_swap32(XXH_read32(ptr)) : XXH_read32(ptr);
+}
+#endif
+
+XXH_FORCE_INLINE xxh_u32
+XXH_readLE32_align(const void* ptr, XXH_alignment align)
+{
+    if (align==XXH_unaligned) {
+        return XXH_readLE32(ptr);
+    } else {
+        /* XXH_aligned: dereference ptr directly as xxh_u32, swapping on big-endian targets */
+        return XXH_CPU_LITTLE_ENDIAN ? *(const xxh_u32*)ptr : XXH_swap32(*(const xxh_u32*)ptr);
+    }
+}
+
+
+/* *************************************
+*  Misc
+***************************************/
+/*!
+ * @ingroup public
+ * Returns the value of XXH_VERSION_NUMBER this code was compiled with.
+ */
+XXH_PUBLIC_API unsigned XXH_versionNumber (void) { return XXH_VERSION_NUMBER; }
+
+
+/* *******************************************************************
+*  32-bit hash functions
+*********************************************************************/
+/*!
+ * @}
+ * @defgroup XXH32_impl XXH32 implementation
+ * @ingroup impl
+ *
+ * Details on the XXH32 implementation.
+ * @{
+ */
+ /* #define instead of static const, to be used as initializers */
+#define XXH_PRIME32_1  0x9E3779B1U  /*!< 0b10011110001101110111100110110001 */
+#define XXH_PRIME32_2  0x85EBCA77U  /*!< 0b10000101111010111100101001110111 */
+#define XXH_PRIME32_3  0xC2B2AE3DU  /*!< 0b11000010101100101010111000111101 */
+#define XXH_PRIME32_4  0x27D4EB2FU  /*!< 0b00100111110101001110101100101111 */
+#define XXH_PRIME32_5  0x165667B1U  /*!< 0b00010110010101100110011110110001 */
+
+#ifdef XXH_OLD_NAMES
+#  define PRIME32_1 XXH_PRIME32_1
+#  define PRIME32_2 XXH_PRIME32_2
+#  define PRIME32_3 XXH_PRIME32_3
+#  define PRIME32_4 XXH_PRIME32_4
+#  define PRIME32_5 XXH_PRIME32_5
+#endif
+
+/*!
+ * @internal
+ * @brief Normal stripe processing routine.
+ *
+ * This shuffles the bits so that any bit from @p input impacts several bits in
+ * @p acc.
+ *
+ * @param acc The accumulator lane.
+ * @param input The stripe of input to mix.
+ * @return The mixed accumulator lane.
+ */
+static xxh_u32 XXH32_round(xxh_u32 acc, xxh_u32 input)
+{
+    acc += input * XXH_PRIME32_2;
+    acc  = XXH_rotl32(acc, 13);
+    acc *= XXH_PRIME32_1;
+#if (defined(__SSE4_1__) || defined(__aarch64__) || defined(__wasm_simd128__)) && !defined(XXH_ENABLE_AUTOVECTORIZE)
+    /*
+     * UGLY HACK:
+     * A compiler fence is the only thing that prevents GCC and Clang from
+     * autovectorizing the XXH32 loop (pragmas and attributes don't work for some
+     * reason) without globally disabling SSE4.1.
+     *
+     * The reason we want to avoid vectorization is because despite working on
+     * 4 integers at a time, there are multiple factors slowing XXH32 down on
+     * SSE4:
+     * - There's a ridiculous amount of lag from pmulld (10 cycles of latency on
+     *   newer chips!) making it slightly slower to multiply four integers at
+     *   once compared to four integers independently. Even when pmulld was
+     *   fastest, Sandy/Ivy Bridge, it is still not worth it to go into SSE
+     *   just to multiply unless doing a long operation.
+     *
+     * - Four instructions are required to rotate,
+     *      movdqa tmp,  v // not required with VEX encoding
+     *      pslld  tmp, 13 // tmp <<= 13
+     *      psrld  v,   19 // x >>= 19
+     *      por    v,  tmp // x |= tmp
+     *   compared to one for scalar:
+     *      roll   v, 13    // reliably fast across the board
+     *      shldl  v, v, 13 // Sandy Bridge and later prefer this for some reason
+     *
+     * - Instruction level parallelism is actually more beneficial here because
+     *   the SIMD actually serializes this operation: While v1 is rotating, v2
+     *   can load data, while v3 can multiply. SSE forces them to operate
+     *   together.
+     *
+     * This is also enabled on AArch64, as Clang is *very aggressive* in vectorizing
+     * the loop. NEON is only faster on the A53, and with the newer cores, it is less
+     * than half the speed.
+     *
+     * Additionally, this is used on WASM SIMD128 because it JITs to the same
+     * SIMD instructions and has the same issue.
+     */
+    XXH_COMPILER_GUARD(acc);   /* the fence described above; it sits after the final multiply */
+#endif
+    return acc;
+}
+
+/*!
+ * @internal + * @brief Mixes all bits to finalize the hash. + * + * The final mix ensures that all input bits have a chance to impact any bit in + * the output digest, resulting in an unbiased distribution. + * + * @param hash The hash to avalanche. + * @return The avalanched hash. + */ +static xxh_u32 XXH32_avalanche(xxh_u32 hash) +{ + hash ^= hash >> 15; + hash *= XXH_PRIME32_2; + hash ^= hash >> 13; + hash *= XXH_PRIME32_3; + hash ^= hash >> 16; + return hash; +} + +#define XXH_get32bits(p) XXH_readLE32_align(p, align) + +/*! + * @internal + * @brief Processes the last 0-15 bytes of @p ptr. + * + * There may be up to 15 bytes remaining to consume from the input. + * This final stage will digest them to ensure that all input bytes are present + * in the final mix. + * + * @param hash The hash to finalize. + * @param ptr The pointer to the remaining input. + * @param len The remaining length, modulo 16. + * @param align Whether @p ptr is aligned. + * @return The finalized hash. + * @see XXH64_finalize(). 
+ */
+static XXH_PUREF xxh_u32
+XXH32_finalize(xxh_u32 hash, const xxh_u8* ptr, size_t len, XXH_alignment align)
+{
+#define XXH_PROCESS1 do {                             \
+    hash += (*ptr++) * XXH_PRIME32_5;                \
+    hash = XXH_rotl32(hash, 11) * XXH_PRIME32_1;     \
+} while (0)
+
+#define XXH_PROCESS4 do {                             \
+    hash += XXH_get32bits(ptr) * XXH_PRIME32_3;      \
+    ptr += 4;                                        \
+    hash  = XXH_rotl32(hash, 17) * XXH_PRIME32_4;    \
+} while (0)
+
+    if (ptr==NULL) XXH_ASSERT(len == 0);
+
+    /* Compact rerolled version; generally faster */
+    if (!XXH32_ENDJMP) {
+        len &= 15;                      /* only the tail shorter than 16 bytes is consumed here */
+        while (len >= 4) {              /* whole 32-bit words first */
+            XXH_PROCESS4;
+            len -= 4;
+        }
+        while (len > 0) {               /* then the remaining 0-3 single bytes */
+            XXH_PROCESS1;
+            --len;
+        }
+        return XXH32_avalanche(hash);
+    } else {
+         /* Unrolled variant: each case handles (len&15)/4 words, then (len&15)%4 bytes. */
+         switch(len&15) /* or switch(bEnd - p) */ {
+           case 12:      XXH_PROCESS4;
+                         XXH_FALLTHROUGH;  /* fallthrough */
+           case 8:       XXH_PROCESS4;
+                         XXH_FALLTHROUGH;  /* fallthrough */
+           case 4:       XXH_PROCESS4;
+                         return XXH32_avalanche(hash);
+
+           case 13:      XXH_PROCESS4;
+                         XXH_FALLTHROUGH;  /* fallthrough */
+           case 9:       XXH_PROCESS4;
+                         XXH_FALLTHROUGH;  /* fallthrough */
+           case 5:       XXH_PROCESS4;
+                         XXH_PROCESS1;
+                         return XXH32_avalanche(hash);
+
+           case 14:      XXH_PROCESS4;
+                         XXH_FALLTHROUGH;  /* fallthrough */
+           case 10:      XXH_PROCESS4;
+                         XXH_FALLTHROUGH;  /* fallthrough */
+           case 6:       XXH_PROCESS4;
+                         XXH_PROCESS1;
+                         XXH_PROCESS1;
+                         return XXH32_avalanche(hash);
+
+           case 15:      XXH_PROCESS4;
+                         XXH_FALLTHROUGH;  /* fallthrough */
+           case 11:      XXH_PROCESS4;
+                         XXH_FALLTHROUGH;  /* fallthrough */
+           case 7:       XXH_PROCESS4;
+                         XXH_FALLTHROUGH;  /* fallthrough */
+           case 3:       XXH_PROCESS1;
+                         XXH_FALLTHROUGH;  /* fallthrough */
+           case 2:       XXH_PROCESS1;
+                         XXH_FALLTHROUGH;  /* fallthrough */
+           case 1:       XXH_PROCESS1;
+                         XXH_FALLTHROUGH;  /* fallthrough */
+           case 0:       return XXH32_avalanche(hash);
+        }
+        XXH_ASSERT(0);
+        return hash;   /* reaching this point is deemed impossible */
+    }
+}
+
+#ifdef XXH_OLD_NAMES
+#  define PROCESS1 XXH_PROCESS1
+#  define PROCESS4 XXH_PROCESS4
+#else
+#  undef XXH_PROCESS1
+#  undef XXH_PROCESS4
+#endif
+
+/*!
+ * @internal
+ * @brief The implementation for @ref XXH32().
+ *
+ * @param input , len , seed Directly passed from @ref XXH32().
+ * @param align Whether @p input is aligned.
+ * @return The calculated hash.
+ */
+XXH_FORCE_INLINE XXH_PUREF xxh_u32
+XXH32_endian_align(const xxh_u8* input, size_t len, xxh_u32 seed, XXH_alignment align)
+{
+    xxh_u32 h32;
+
+    if (input==NULL) XXH_ASSERT(len == 0);
+
+    if (len>=16) {
+        const xxh_u8* const bEnd = input + len;
+        const xxh_u8* const limit = bEnd - 15;   /* loop runs while a full 16-byte stripe remains */
+        xxh_u32 v1 = seed + XXH_PRIME32_1 + XXH_PRIME32_2;
+        xxh_u32 v2 = seed + XXH_PRIME32_2;
+        xxh_u32 v3 = seed + 0;
+        xxh_u32 v4 = seed - XXH_PRIME32_1;
+
+        do {   /* one 16-byte stripe per iteration, 4 bytes into each lane */
+            v1 = XXH32_round(v1, XXH_get32bits(input)); input += 4;
+            v2 = XXH32_round(v2, XXH_get32bits(input)); input += 4;
+            v3 = XXH32_round(v3, XXH_get32bits(input)); input += 4;
+            v4 = XXH32_round(v4, XXH_get32bits(input)); input += 4;
+        } while (input < limit);
+
+        h32 = XXH_rotl32(v1, 1)  + XXH_rotl32(v2, 7)
+            + XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18);
+    } else {
+        h32  = seed + XXH_PRIME32_5;   /* short input: no lanes */
+    }
+
+    h32 += (xxh_u32)len;
+
+    return XXH32_finalize(h32, input, len&15, align);   /* input now points at the unconsumed tail */
+}
+
+/*!
+ * @ingroup XXH32_family
+ * One-shot 32-bit hash of @p len bytes at @p input with @p seed.
+ * When XXH_NO_STREAM is not defined and XXH_SIZE_OPT >= 2 it is computed
+ * through the streaming API. Otherwise it calls XXH32_endian_align(), passing
+ * XXH_aligned only if XXH_FORCE_ALIGN_CHECK is set and @p input is 4-byte aligned.
+ */
+XXH_PUBLIC_API XXH32_hash_t XXH32 (const void* input, size_t len, XXH32_hash_t seed)
+{
+#if !defined(XXH_NO_STREAM) && XXH_SIZE_OPT >= 2
+    /* Simple version, good for code maintenance, but unfortunately slow for small inputs */
+    XXH32_state_t state;
+    XXH32_reset(&state, seed);
+    XXH32_update(&state, (const xxh_u8*)input, len);
+    return XXH32_digest(&state);
+#else
+    if (XXH_FORCE_ALIGN_CHECK) {
+        if ((((size_t)input) & 3) == 0) {   /* Input is 4-bytes aligned, leverage the speed benefit */
+            return XXH32_endian_align((const xxh_u8*)input, len, seed, XXH_aligned);
+    }   }
+
+    return XXH32_endian_align((const xxh_u8*)input, len, seed, XXH_unaligned);
+#endif
+}
+
+
+
+/*******   Hash streaming   *******/
+#ifndef XXH_NO_STREAM
+/*!
@ingroup XXH32_family */
+XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void)
+{
+    return (XXH32_state_t*)XXH_malloc(sizeof(XXH32_state_t));   /* XXH_malloc's result is returned unchecked */
+}
+/*!
+ * @ingroup XXH32_family
+ * Passes @p statePtr to XXH_free() and always returns XXH_OK.
+ */
+XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr)
+{
+    XXH_free(statePtr);
+    return XXH_OK;
+}
+
+/*!
+ * @ingroup XXH32_family
+ * Copies the whole state object from @p srcState into @p dstState.
+ */
+XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dstState, const XXH32_state_t* srcState)
+{
+    XXH_memcpy(dstState, srcState, sizeof(*dstState));
+}
+
+/*!
+ * @ingroup XXH32_family
+ * Zeroes the state, then derives the four accumulators from @p seed.
+ * Always returns XXH_OK.
+ */
+XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t* statePtr, XXH32_hash_t seed)
+{
+    XXH_ASSERT(statePtr != NULL);
+    memset(statePtr, 0, sizeof(*statePtr));
+    statePtr->v[0] = seed + XXH_PRIME32_1 + XXH_PRIME32_2;
+    statePtr->v[1] = seed + XXH_PRIME32_2;
+    statePtr->v[2] = seed + 0;   /* XXH32_digest() reads the seed back from this slot for short inputs */
+    statePtr->v[3] = seed - XXH_PRIME32_1;
+    return XXH_OK;
+}
+
+
+/*!
+ * @ingroup XXH32_family
+ * Feeds @p len bytes at @p input into @p state. Complete 16-byte stripes are
+ * mixed into the accumulators; any remainder is kept in state->mem32 for the
+ * next call. A NULL @p input is a no-op. Always returns XXH_OK.
+ */
+XXH_PUBLIC_API XXH_errorcode
+XXH32_update(XXH32_state_t* state, const void* input, size_t len)
+{
+    if (input==NULL) {
+        XXH_ASSERT(len == 0);
+        return XXH_OK;
+    }
+
+    {   const xxh_u8* p = (const xxh_u8*)input;
+        const xxh_u8* const bEnd = p + len;
+
+        state->total_len_32 += (XXH32_hash_t)len;
+        state->large_len |= (XXH32_hash_t)((len>=16) | (state->total_len_32>=16));
+
+        if (state->memsize + len < 16)  {   /* fill in tmp buffer */
+            XXH_memcpy((xxh_u8*)(state->mem32) + state->memsize, input, len);
+            state->memsize += (XXH32_hash_t)len;
+            return XXH_OK;
+        }
+
+        if (state->memsize) {   /* some data left from previous update */
+            XXH_memcpy((xxh_u8*)(state->mem32) + state->memsize, input, 16-state->memsize);
+            {   const xxh_u32* p32 = state->mem32;
+                state->v[0] = XXH32_round(state->v[0], XXH_readLE32(p32)); p32++;
+                state->v[1] = XXH32_round(state->v[1], XXH_readLE32(p32)); p32++;
+                state->v[2] = XXH32_round(state->v[2], XXH_readLE32(p32)); p32++;
+                state->v[3] = XXH32_round(state->v[3], XXH_readLE32(p32));
+            }
+            p += 16-state->memsize;   /* skip the input bytes just used to complete the buffered stripe */
+            state->memsize = 0;
+        }
+
+        if (p <= bEnd-16) {
+            const xxh_u8* const limit = bEnd - 16;
+
+            do {   /* consume full stripes straight from the caller's buffer */
+                state->v[0] = XXH32_round(state->v[0], XXH_readLE32(p)); p+=4;
+                state->v[1] = XXH32_round(state->v[1], XXH_readLE32(p)); p+=4;
+                state->v[2] = XXH32_round(state->v[2], XXH_readLE32(p)); p+=4;
+                state->v[3] = XXH32_round(state->v[3], XXH_readLE32(p)); p+=4;
+            } while (p<=limit);
+
+        }
+
+        if (p < bEnd) {   /* stash the leftover bytes for the next update or digest */
+            XXH_memcpy(state->mem32, p, (size_t)(bEnd-p));
+            state->memsize = (unsigned)(bEnd-p);
+        }
+    }
+
+    return XXH_OK;
+}
+
+
+/*!
+ * @ingroup XXH32_family
+ * Computes the hash of everything fed so far. @p state is taken by pointer
+ * to const and is only read.
+ */
+XXH_PUBLIC_API XXH32_hash_t XXH32_digest(const XXH32_state_t* state)
+{
+    xxh_u32 h32;
+
+    if (state->large_len) {
+        h32 = XXH_rotl32(state->v[0], 1)
+            + XXH_rotl32(state->v[1], 7)
+            + XXH_rotl32(state->v[2], 12)
+            + XXH_rotl32(state->v[3], 18);
+    } else {
+        h32 = state->v[2] /* == seed */ + XXH_PRIME32_5;
+    }
+
+    h32 += state->total_len_32;
+
+    return XXH32_finalize(h32, (const xxh_u8*)state->mem32, state->memsize, XXH_aligned);
+}
+#endif /* !XXH_NO_STREAM */
+
+/*******   Canonical representation   *******/
+
+/*!
+ * @ingroup XXH32_family
+ * Stores @p hash into @p dst, byte-swapping first on little-endian targets.
+ */
+XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash)
+{
+    XXH_STATIC_ASSERT(sizeof(XXH32_canonical_t) == sizeof(XXH32_hash_t));
+    if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap32(hash);
+    XXH_memcpy(dst, &hash, sizeof(*dst));
+}
+/*!
+ * @ingroup XXH32_family
+ * Reads the hash back from @p src as a big-endian 32-bit value.
+ */
+XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src)
+{
+    return XXH_readBE32(src);
+}
+
+
+#ifndef XXH_NO_LONG_LONG
+
+/* *******************************************************************
+*  64-bit hash functions
+*********************************************************************/
+/*!
+ * @}
+ * @ingroup impl
+ * @{
+ */
+/*******   Memory access   *******/
+
+typedef XXH64_hash_t xxh_u64;   /* internal shorthand for the 64-bit hash type */
+
+#ifdef XXH_OLD_NAMES
+#  define U64 xxh_u64
+#endif
+
+#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3))
+/*
+ * Manual byteshift. Best for old compilers which don't inline memcpy.
+ * We actually directly use XXH_readLE64 and XXH_readBE64.
+ */
+#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2))
+
+/* Force direct memory access. Only works on CPU which support unaligned memory access in hardware */
+static xxh_u64 XXH_read64(const void* memPtr)
+{
+    return *(const xxh_u64*) memPtr;
+}
+
+#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1))
+
+/*
+ * __attribute__((aligned(1))) is supported by gcc and clang. Originally the
+ * documentation claimed that it only increased the alignment, but actually it
+ * can decrease it on gcc, clang, and icc:
+ * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=69502,
+ * https://gcc.godbolt.org/z/xYez1j67Y.
+ */
+#ifdef XXH_OLD_NAMES
+typedef union { xxh_u32 u32; xxh_u64 u64; } __attribute__((packed)) unalign64;
+#endif
+static xxh_u64 XXH_read64(const void* ptr)
+{
+    typedef __attribute__((aligned(1))) xxh_u64 xxh_unalign64;   /* 64-bit type declared with 1-byte alignment */
+    return *((const xxh_unalign64*)ptr);
+}
+
+#else
+
+/*
+ * Portable and safe solution. Generally efficient.
+ * see: https://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html
+ */
+static xxh_u64 XXH_read64(const void* memPtr)
+{
+    xxh_u64 val;
+    XXH_memcpy(&val, memPtr, sizeof(val));   /* copy the bytes into a typed local instead of dereferencing memPtr */
+    return val;
+}
+
+#endif   /* XXH_FORCE_MEMORY_ACCESS */
+
+#if defined(_MSC_VER)     /* Visual Studio */
+#  define XXH_swap64 _byteswap_uint64
+#elif XXH_GCC_VERSION >= 403
+#  define XXH_swap64 __builtin_bswap64
+#else
+static xxh_u64 XXH_swap64(xxh_u64 x)
+{
+    return  ((x << 56) & 0xff00000000000000ULL) |   /* byte 0 -> byte 7 */
+            ((x << 40) & 0x00ff000000000000ULL) |
+            ((x << 24) & 0x0000ff0000000000ULL) |
+            ((x << 8)  & 0x000000ff00000000ULL) |
+            ((x >> 8)  & 0x00000000ff000000ULL) |
+            ((x >> 24) & 0x0000000000ff0000ULL) |
+            ((x >> 40) & 0x000000000000ff00ULL) |
+            ((x >> 56) & 0x00000000000000ffULL);    /* byte 7 -> byte 0 */
+}
+#endif
+
+
+/* XXH_FORCE_MEMORY_ACCESS==3 is an endian-independent byteshift load.
*/
+#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3))
+
+XXH_FORCE_INLINE xxh_u64 XXH_readLE64(const void* memPtr)
+{
+    const xxh_u8* bytePtr = (const xxh_u8 *)memPtr;
+    return bytePtr[0]                        /* lowest address supplies the least significant byte */
+         | ((xxh_u64)bytePtr[1] << 8)
+         | ((xxh_u64)bytePtr[2] << 16)
+         | ((xxh_u64)bytePtr[3] << 24)
+         | ((xxh_u64)bytePtr[4] << 32)
+         | ((xxh_u64)bytePtr[5] << 40)
+         | ((xxh_u64)bytePtr[6] << 48)
+         | ((xxh_u64)bytePtr[7] << 56);
+}
+
+XXH_FORCE_INLINE xxh_u64 XXH_readBE64(const void* memPtr)
+{
+    const xxh_u8* bytePtr = (const xxh_u8 *)memPtr;
+    return bytePtr[7]                        /* highest address supplies the least significant byte */
+         | ((xxh_u64)bytePtr[6] << 8)
+         | ((xxh_u64)bytePtr[5] << 16)
+         | ((xxh_u64)bytePtr[4] << 24)
+         | ((xxh_u64)bytePtr[3] << 32)
+         | ((xxh_u64)bytePtr[2] << 40)
+         | ((xxh_u64)bytePtr[1] << 48)
+         | ((xxh_u64)bytePtr[0] << 56);
+}
+
+#else
+XXH_FORCE_INLINE xxh_u64 XXH_readLE64(const void* ptr)
+{
+    return XXH_CPU_LITTLE_ENDIAN ? XXH_read64(ptr) : XXH_swap64(XXH_read64(ptr));
+}
+
+static xxh_u64 XXH_readBE64(const void* ptr)
+{
+    return XXH_CPU_LITTLE_ENDIAN ? XXH_swap64(XXH_read64(ptr)) : XXH_read64(ptr);
+}
+#endif
+
+XXH_FORCE_INLINE xxh_u64
+XXH_readLE64_align(const void* ptr, XXH_alignment align)
+{
+    if (align==XXH_unaligned)
+        return XXH_readLE64(ptr);
+    else   /* XXH_aligned: dereference ptr directly as xxh_u64, swapping on big-endian targets */
+        return XXH_CPU_LITTLE_ENDIAN ? *(const xxh_u64*)ptr : XXH_swap64(*(const xxh_u64*)ptr);
+}
+
+
+/*******   xxh64   *******/
+/*!
+ * @}
+ * @defgroup XXH64_impl XXH64 implementation
+ * @ingroup impl
+ *
+ * Details on the XXH64 implementation.
+ * @{ + */ +/* #define rather that static const, to be used as initializers */ +#define XXH_PRIME64_1 0x9E3779B185EBCA87ULL /*!< 0b1001111000110111011110011011000110000101111010111100101010000111 */ +#define XXH_PRIME64_2 0xC2B2AE3D27D4EB4FULL /*!< 0b1100001010110010101011100011110100100111110101001110101101001111 */ +#define XXH_PRIME64_3 0x165667B19E3779F9ULL /*!< 0b0001011001010110011001111011000110011110001101110111100111111001 */ +#define XXH_PRIME64_4 0x85EBCA77C2B2AE63ULL /*!< 0b1000010111101011110010100111011111000010101100101010111001100011 */ +#define XXH_PRIME64_5 0x27D4EB2F165667C5ULL /*!< 0b0010011111010100111010110010111100010110010101100110011111000101 */ + +#ifdef XXH_OLD_NAMES +# define PRIME64_1 XXH_PRIME64_1 +# define PRIME64_2 XXH_PRIME64_2 +# define PRIME64_3 XXH_PRIME64_3 +# define PRIME64_4 XXH_PRIME64_4 +# define PRIME64_5 XXH_PRIME64_5 +#endif + +/*! @copydoc XXH32_round */ +static xxh_u64 XXH64_round(xxh_u64 acc, xxh_u64 input) +{ + acc += input * XXH_PRIME64_2; + acc = XXH_rotl64(acc, 31); + acc *= XXH_PRIME64_1; + return acc; +} + +static xxh_u64 XXH64_mergeRound(xxh_u64 acc, xxh_u64 val) +{ + val = XXH64_round(0, val); + acc ^= val; + acc = acc * XXH_PRIME64_1 + XXH_PRIME64_4; + return acc; +} + +/*! @copydoc XXH32_avalanche */ +static xxh_u64 XXH64_avalanche(xxh_u64 hash) +{ + hash ^= hash >> 33; + hash *= XXH_PRIME64_2; + hash ^= hash >> 29; + hash *= XXH_PRIME64_3; + hash ^= hash >> 32; + return hash; +} + + +#define XXH_get64bits(p) XXH_readLE64_align(p, align) + +/*! + * @internal + * @brief Processes the last 0-31 bytes of @p ptr. + * + * There may be up to 31 bytes remaining to consume from the input. + * This final stage will digest them to ensure that all input bytes are present + * in the final mix. + * + * @param hash The hash to finalize. + * @param ptr The pointer to the remaining input. + * @param len The remaining length, modulo 32. + * @param align Whether @p ptr is aligned. 
+ * @return The finalized hash
+ * @see XXH32_finalize().
+ */
+static XXH_PUREF xxh_u64
+XXH64_finalize(xxh_u64 hash, const xxh_u8* ptr, size_t len, XXH_alignment align)
+{
+    if (ptr==NULL) XXH_ASSERT(len == 0);
+    len &= 31;                          /* only the tail shorter than 32 bytes is consumed here */
+    while (len >= 8) {                  /* whole 64-bit words first */
+        xxh_u64 const k1 = XXH64_round(0, XXH_get64bits(ptr));
+        ptr += 8;
+        hash ^= k1;
+        hash  = XXH_rotl64(hash,27) * XXH_PRIME64_1 + XXH_PRIME64_4;
+        len -= 8;
+    }
+    if (len >= 4) {                     /* then at most one 32-bit word */
+        hash ^= (xxh_u64)(XXH_get32bits(ptr)) * XXH_PRIME64_1;
+        ptr += 4;
+        hash = XXH_rotl64(hash, 23) * XXH_PRIME64_2 + XXH_PRIME64_3;
+        len -= 4;
+    }
+    while (len > 0) {                   /* then the remaining 0-3 single bytes */
+        hash ^= (*ptr++) * XXH_PRIME64_5;
+        hash = XXH_rotl64(hash, 11) * XXH_PRIME64_1;
+        --len;
+    }
+    return  XXH64_avalanche(hash);
+}
+
+#ifdef XXH_OLD_NAMES
+#  define PROCESS1_64 XXH_PROCESS1_64
+#  define PROCESS4_64 XXH_PROCESS4_64
+#  define PROCESS8_64 XXH_PROCESS8_64
+#else
+#  undef XXH_PROCESS1_64
+#  undef XXH_PROCESS4_64
+#  undef XXH_PROCESS8_64
+#endif
+
+/*!
+ * @internal
+ * @brief The implementation for @ref XXH64().
+ *
+ * @param input , len , seed Directly passed from @ref XXH64().
+ * @param align Whether @p input is aligned.
+ * @return The calculated hash.
+ */ +XXH_FORCE_INLINE XXH_PUREF xxh_u64 +XXH64_endian_align(const xxh_u8* input, size_t len, xxh_u64 seed, XXH_alignment align) +{ + xxh_u64 h64; + if (input==NULL) XXH_ASSERT(len == 0); + + if (len>=32) { + const xxh_u8* const bEnd = input + len; + const xxh_u8* const limit = bEnd - 31; + xxh_u64 v1 = seed + XXH_PRIME64_1 + XXH_PRIME64_2; + xxh_u64 v2 = seed + XXH_PRIME64_2; + xxh_u64 v3 = seed + 0; + xxh_u64 v4 = seed - XXH_PRIME64_1; + + do { + v1 = XXH64_round(v1, XXH_get64bits(input)); input+=8; + v2 = XXH64_round(v2, XXH_get64bits(input)); input+=8; + v3 = XXH64_round(v3, XXH_get64bits(input)); input+=8; + v4 = XXH64_round(v4, XXH_get64bits(input)); input+=8; + } while (input= 2 + /* Simple version, good for code maintenance, but unfortunately slow for small inputs */ + XXH64_state_t state; + XXH64_reset(&state, seed); + XXH64_update(&state, (const xxh_u8*)input, len); + return XXH64_digest(&state); +#else + if (XXH_FORCE_ALIGN_CHECK) { + if ((((size_t)input) & 7)==0) { /* Input is aligned, let's leverage the speed advantage */ + return XXH64_endian_align((const xxh_u8*)input, len, seed, XXH_aligned); + } } + + return XXH64_endian_align((const xxh_u8*)input, len, seed, XXH_unaligned); + +#endif +} + +/******* Hash Streaming *******/ +#ifndef XXH_NO_STREAM +/*! @ingroup XXH64_family*/ +XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void) +{ + return (XXH64_state_t*)XXH_malloc(sizeof(XXH64_state_t)); +} +/*! @ingroup XXH64_family */ +XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr) +{ + XXH_free(statePtr); + return XXH_OK; +} + +/*! @ingroup XXH64_family */ +XXH_PUBLIC_API void XXH64_copyState(XXH_NOESCAPE XXH64_state_t* dstState, const XXH64_state_t* srcState) +{ + XXH_memcpy(dstState, srcState, sizeof(*dstState)); +} + +/*! 
@ingroup XXH64_family */
+XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH_NOESCAPE XXH64_state_t* statePtr, XXH64_hash_t seed)
+{
+    XXH_ASSERT(statePtr != NULL);
+    memset(statePtr, 0, sizeof(*statePtr));   /* clear the whole state before seeding the accumulators */
+    statePtr->v[0] = seed + XXH_PRIME64_1 + XXH_PRIME64_2;
+    statePtr->v[1] = seed + XXH_PRIME64_2;
+    statePtr->v[2] = seed + 0;   /* the seed itself is kept in this slot */
+    statePtr->v[3] = seed - XXH_PRIME64_1;
+    return XXH_OK;
+}
+
+/*!
+ * @ingroup XXH64_family
+ * Feeds @p len bytes at @p input into @p state. Complete 32-byte stripes are
+ * mixed into the accumulators; any remainder is kept in state->mem64 for the
+ * next call. A NULL @p input is a no-op. Always returns XXH_OK.
+ */
+XXH_PUBLIC_API XXH_errorcode
+XXH64_update (XXH_NOESCAPE XXH64_state_t* state, XXH_NOESCAPE const void* input, size_t len)
+{
+    if (input==NULL) {
+        XXH_ASSERT(len == 0);
+        return XXH_OK;
+    }
+
+    {   const xxh_u8* p = (const xxh_u8*)input;
+        const xxh_u8* const bEnd = p + len;
+
+        state->total_len += len;
+
+        if (state->memsize + len < 32) {  /* fill in tmp buffer */
+            XXH_memcpy(((xxh_u8*)state->mem64) + state->memsize, input, len);
+            state->memsize += (xxh_u32)len;
+            return XXH_OK;
+        }
+
+        if (state->memsize) {   /* bytes left over from a previous update: top the buffer up to 32 and consume it */
+            XXH_memcpy(((xxh_u8*)state->mem64) + state->memsize, input, 32-state->memsize);
+            state->v[0] = XXH64_round(state->v[0], XXH_readLE64(state->mem64+0));
+            state->v[1] = XXH64_round(state->v[1], XXH_readLE64(state->mem64+1));
+            state->v[2] = XXH64_round(state->v[2], XXH_readLE64(state->mem64+2));
+            state->v[3] = XXH64_round(state->v[3], XXH_readLE64(state->mem64+3));
+            p += 32 - state->memsize;   /* skip the input bytes just used to complete the buffered stripe */
+            state->memsize = 0;
+        }
+
+        if (p+32 <= bEnd) {
+            const xxh_u8* const limit = bEnd - 32;
+
+            do {   /* consume full stripes straight from the caller's buffer */
+                state->v[0] = XXH64_round(state->v[0], XXH_readLE64(p)); p+=8;
+                state->v[1] = XXH64_round(state->v[1], XXH_readLE64(p)); p+=8;
+                state->v[2] = XXH64_round(state->v[2], XXH_readLE64(p)); p+=8;
+                state->v[3] = XXH64_round(state->v[3], XXH_readLE64(p)); p+=8;
+            } while (p<=limit);
+
+        }
+
+        if (p < bEnd) {   /* stash the leftover bytes for the next update or digest */
+            XXH_memcpy(state->mem64, p, (size_t)(bEnd-p));
+            state->memsize = (unsigned)(bEnd-p);
+        }
+    }
+
+    return XXH_OK;
+}
+
+
+/*!
@ingroup XXH64_family */ +XXH_PUBLIC_API XXH64_hash_t XXH64_digest(XXH_NOESCAPE const XXH64_state_t* state) +{ + xxh_u64 h64; + + if (state->total_len >= 32) { /* at least one stripe went through the lanes: fold the four of them together */ + h64 = XXH_rotl64(state->v[0], 1) + XXH_rotl64(state->v[1], 7) + XXH_rotl64(state->v[2], 12) + XXH_rotl64(state->v[3], 18); + h64 = XXH64_mergeRound(h64, state->v[0]); + h64 = XXH64_mergeRound(h64, state->v[1]); + h64 = XXH64_mergeRound(h64, state->v[2]); + h64 = XXH64_mergeRound(h64, state->v[3]); + } else { /* short input: the lanes were never updated, so v[2] is still the seed stored by XXH64_reset() */ + h64 = state->v[2] /*seed*/ + XXH_PRIME64_5; + } + + h64 += (xxh_u64) state->total_len; + + /* the state is const and is only read here; the buffered bytes in mem64 are passed on to XXH64_finalize() */ + return XXH64_finalize(h64, (const xxh_u8*)state->mem64, (size_t)state->total_len, XXH_aligned); +} +#endif /* !XXH_NO_STREAM */ + +/******* Canonical representation *******/ + +/*! @ingroup XXH64_family + * @brief Writes @p hash into @p dst in big-endian byte order. + * + * On little-endian CPUs the value is byte-swapped first, then copied as raw bytes. + */ +XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH_NOESCAPE XXH64_canonical_t* dst, XXH64_hash_t hash) +{ + XXH_STATIC_ASSERT(sizeof(XXH64_canonical_t) == sizeof(XXH64_hash_t)); + if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap64(hash); + XXH_memcpy(dst, &hash, sizeof(*dst)); +} + +/*! @ingroup XXH64_family + * @brief Reads a hash back from its canonical form with a big-endian 64-bit load of @p src. + */ +XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(XXH_NOESCAPE const XXH64_canonical_t* src) +{ + return XXH_readBE64(src); +} + +#ifndef XXH_NO_XXH3 + +/* ********************************************************************* +* XXH3 +* New generation hash designed for speed on small keys and vectorization +************************************************************************ */ +/*! + * @} + * @defgroup XXH3_impl XXH3 implementation + * @ingroup impl + * @{ + */ + +/* === Compiler specifics === */ + +#if ((defined(sun) || defined(__sun)) && __cplusplus) /* Solaris includes __STDC_VERSION__ with C++. 
Tested with GCC 5.5 */ +# define XXH_RESTRICT /* disable */ +#elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* >= C99 */ +# define XXH_RESTRICT restrict +#elif (defined (__GNUC__) && ((__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))) \ + || (defined (__clang__)) \ + || (defined (_MSC_VER) && (_MSC_VER >= 1400)) \ + || (defined (__INTEL_COMPILER) && (__INTEL_COMPILER >= 1300)) +/* + * There are a LOT more compilers that recognize __restrict but this + * covers the major ones. + */ +# define XXH_RESTRICT __restrict +#else +# define XXH_RESTRICT /* disable */ +#endif + +/* Branch-prediction hints; they expand to the bare expression where __builtin_expect is not available. */ +#if (defined(__GNUC__) && (__GNUC__ >= 3)) \ + || (defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 800)) \ + || defined(__clang__) +# define XXH_likely(x) __builtin_expect(x, 1) +# define XXH_unlikely(x) __builtin_expect(x, 0) +#else +# define XXH_likely(x) (x) +# define XXH_unlikely(x) (x) +#endif + +#ifndef XXH_HAS_INCLUDE +# ifdef __has_include +/* + * Not defined as XXH_HAS_INCLUDE(x) (function-like) because + * this causes segfaults in Apple Clang 4.2 (on Mac OS X 10.7 Lion) + */ +# define XXH_HAS_INCLUDE __has_include +# else +# define XXH_HAS_INCLUDE(x) 0 +# endif +#endif + +/* Pull in the intrinsics header matching the SIMD extension the compiler is targeting. */ +#if defined(__GNUC__) || defined(__clang__) +# if defined(__ARM_FEATURE_SVE) +# include <arm_sve.h> /* SVE intrinsics (svld1_u64, ...) */ +# endif +# if defined(__ARM_NEON__) || defined(__ARM_NEON) \ + || (defined(_M_ARM) && _M_ARM >= 7) \ + || defined(_M_ARM64) || defined(_M_ARM64EC) \ + || (defined(__wasm_simd128__) && XXH_HAS_INCLUDE(<arm_neon.h>)) /* WASM SIMD128 via SIMDe */ +# define inline __inline__ /* circumvent a clang bug */ +# include <arm_neon.h> /* NEON intrinsics (vld1q_u8, vmlal_u32, ...) */ +# undef inline +# elif defined(__AVX2__) +# include <immintrin.h> /* AVX2 intrinsics */ +# elif defined(__SSE2__) +# include <emmintrin.h> /* SSE2 intrinsics */ +# endif +#endif + +#if defined(_MSC_VER) +# include <intrin.h> /* MSVC intrinsics (_umul128, __umulh, ...) */ +#endif + +/* + * One goal of XXH3 is to make it fast on both 32-bit and 64-bit, while + * remaining a true 64-bit/128-bit hash function. 
+ * + * This is done by prioritizing a subset of 64-bit operations that can be + * emulated without too many steps on the average 32-bit machine. + * + * For example, these two lines seem similar, and run equally fast on 64-bit: + * + * xxh_u64 x; + * x ^= (x >> 47); // good + * x ^= (x >> 13); // bad + * + * However, to a 32-bit machine, there is a major difference. + * + * x ^= (x >> 47) looks like this: + * + * x.lo ^= (x.hi >> (47 - 32)); + * + * while x ^= (x >> 13) looks like this: + * + * // note: funnel shifts are not usually cheap. + * x.lo ^= (x.lo >> 13) | (x.hi << (32 - 13)); + * x.hi ^= (x.hi >> 13); + * + * The first one is significantly faster than the second, simply because the + * shift is larger than 32. This means: + * - All the bits we need are in the upper 32 bits, so we can ignore the lower + * 32 bits in the shift. + * - The shift result will always fit in the lower 32 bits, and therefore, + * we can ignore the upper 32 bits in the xor. + * + * Thanks to this optimization, XXH3 only requires these features to be efficient: + * + * - Usable unaligned access + * - A 32-bit or 64-bit ALU + * - If 32-bit, a decent ADC instruction + * - A 32 or 64-bit multiply with a 64-bit result + * - For the 128-bit variant, a decent byteswap helps short inputs. + * + * The first two are already required by XXH32, and almost all 32-bit and 64-bit + * platforms which can run XXH32 can run XXH3 efficiently. + * + * Thumb-1, the classic 16-bit only subset of ARM's instruction set, is one + * notable exception. + * + * First of all, Thumb-1 lacks support for the UMULL instruction which + * performs the important long multiply. This means numerous __aeabi_lmul + * calls. + * + * Second of all, the 8 functional registers are just not enough. + * Setup for __aeabi_lmul, byteshift loads, pointers, and all arithmetic need + * Lo registers, and this shuffling results in thousands more MOVs than A32. + * + * A32 and T32 don't have this limitation. 
They can access all 14 registers, + * do a 32->64 multiply with UMULL, and the flexible operand allowing free + * shifts is helpful, too. + * + * Therefore, we do a quick sanity check. + * + * If compiling Thumb-1 for a target which supports ARM instructions, we will + * emit a warning, as it is not a "sane" platform to compile for. + * + * Usually, if this happens, it is because of an accident and you probably need + * to specify -march, as you likely meant to compile for a newer architecture. + * + * Credit: large sections of the vectorial and asm source code paths + * have been contributed by @easyaspi314 + */ +#if defined(__thumb__) && !defined(__thumb2__) && defined(__ARM_ARCH_ISA_ARM) +# warning "XXH3 is highly inefficient without ARM or Thumb-2." +#endif + +/* ========================================== + * Vectorization detection + * ========================================== */ + +#ifdef XXH_DOXYGEN +/*! + * @ingroup tuning + * @brief Overrides the vectorization implementation chosen for XXH3. + * + * Can be defined to 0 to disable SIMD or any of the values mentioned in + * @ref XXH_VECTOR_TYPE. + * + * If this is not defined, it uses predefined macros to determine the best + * implementation. + */ +# define XXH_VECTOR XXH_SCALAR +/*! + * @ingroup tuning + * @brief Possible values for @ref XXH_VECTOR. + * + * Note that these are actually implemented as macros. + * + * If this is not defined, it is detected automatically. + * internal macro XXH_X86DISPATCH overrides this. + */ +enum XXH_VECTOR_TYPE /* fake enum */ { + XXH_SCALAR = 0, /*!< Portable scalar version */ + XXH_SSE2 = 1, /*!< + * SSE2 for Pentium 4, Opteron, all x86_64. + * + * @note SSE2 is also guaranteed on Windows 10, macOS, and + * Android x86. 
+ */ + XXH_AVX2 = 2, /*!< AVX2 for Haswell and Bulldozer */ + XXH_AVX512 = 3, /*!< AVX512 for Skylake and Icelake */ + XXH_NEON = 4, /*!< + * NEON for most ARMv7-A, all AArch64, and WASM SIMD128 + * via the SIMDeverywhere polyfill provided with the + * Emscripten SDK. + */ + XXH_VSX = 5, /*!< VSX and ZVector for POWER8/z13 (64-bit) */ + XXH_SVE = 6, /*!< SVE for some ARMv8-A and ARMv9-A */ +}; +/*! + * @ingroup tuning + * @brief Selects the minimum alignment for XXH3's accumulators. + * + * When using SIMD, this should match the alignment required for said vector + * type, so, for example, 32 for AVX2. + * + * Default: Auto detected. + */ +# define XXH_ACC_ALIGN 8 +#endif + +/* Actual definition */ +#ifndef XXH_DOXYGEN +# define XXH_SCALAR 0 +# define XXH_SSE2 1 +# define XXH_AVX2 2 +# define XXH_AVX512 3 +# define XXH_NEON 4 +# define XXH_VSX 5 +# define XXH_SVE 6 +#endif + +#ifndef XXH_VECTOR /* can be defined on command line; otherwise probed in the order SVE, NEON (little endian only), AVX512, AVX2, SSE2, VSX, scalar */ +# if defined(__ARM_FEATURE_SVE) +# define XXH_VECTOR XXH_SVE +# elif ( \ + defined(__ARM_NEON__) || defined(__ARM_NEON) /* gcc */ \ + || defined(_M_ARM) || defined(_M_ARM64) || defined(_M_ARM64EC) /* msvc */ \ + || (defined(__wasm_simd128__) && XXH_HAS_INCLUDE(<arm_neon.h>)) /* wasm simd128 via SIMDe */ \ + ) && ( \ + defined(_WIN32) || defined(__LITTLE_ENDIAN__) /* little endian only */ \ + || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) \ + ) +# define XXH_VECTOR XXH_NEON +# elif defined(__AVX512F__) +# define XXH_VECTOR XXH_AVX512 +# elif defined(__AVX2__) +# define XXH_VECTOR XXH_AVX2 +# elif defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP == 2)) +# define XXH_VECTOR XXH_SSE2 +# elif (defined(__PPC64__) && defined(__POWER8_VECTOR__)) \ + || (defined(__s390x__) && defined(__VEC__)) \ + && defined(__GNUC__) /* TODO: IBM XL */ +# define XXH_VECTOR XXH_VSX +# else +# define XXH_VECTOR XXH_SCALAR +# endif +#endif + +/* __ARM_FEATURE_SVE is only supported by GCC & Clang. 
*/ +#if (XXH_VECTOR == XXH_SVE) && !defined(__ARM_FEATURE_SVE) +# ifdef _MSC_VER +# pragma warning(once : 4606) +# else +# warning "__ARM_FEATURE_SVE isn't supported. Use SCALAR instead." +# endif +# undef XXH_VECTOR +# define XXH_VECTOR XXH_SCALAR +#endif + +/* + * Controls the alignment of the accumulator, + * for compatibility with aligned vector loads, which are usually faster. + */ +#ifndef XXH_ACC_ALIGN +# if defined(XXH_X86DISPATCH) +# define XXH_ACC_ALIGN 64 /* for compatibility with avx512 */ +# elif XXH_VECTOR == XXH_SCALAR /* scalar */ +# define XXH_ACC_ALIGN 8 +# elif XXH_VECTOR == XXH_SSE2 /* sse2 */ +# define XXH_ACC_ALIGN 16 +# elif XXH_VECTOR == XXH_AVX2 /* avx2 */ +# define XXH_ACC_ALIGN 32 +# elif XXH_VECTOR == XXH_NEON /* neon */ +# define XXH_ACC_ALIGN 16 +# elif XXH_VECTOR == XXH_VSX /* vsx */ +# define XXH_ACC_ALIGN 16 +# elif XXH_VECTOR == XXH_AVX512 /* avx512 */ +# define XXH_ACC_ALIGN 64 +# elif XXH_VECTOR == XXH_SVE /* sve */ +# define XXH_ACC_ALIGN 64 +# endif +#endif + +#if defined(XXH_X86DISPATCH) || XXH_VECTOR == XXH_SSE2 \ + || XXH_VECTOR == XXH_AVX2 || XXH_VECTOR == XXH_AVX512 +# define XXH_SEC_ALIGN XXH_ACC_ALIGN +#elif XXH_VECTOR == XXH_SVE +# define XXH_SEC_ALIGN XXH_ACC_ALIGN +#else +# define XXH_SEC_ALIGN 8 +#endif + +#if defined(__GNUC__) || defined(__clang__) +# define XXH_ALIASING __attribute__((may_alias)) +#else +# define XXH_ALIASING /* nothing */ +#endif + +/* + * UGLY HACK: + * GCC usually generates the best code with -O3 for xxHash. + * + * However, when targeting AVX2, it is overzealous in its unrolling resulting + * in code roughly 3/4 the speed of Clang. + * + * There are other issues, such as GCC splitting _mm256_loadu_si256 into + * _mm_loadu_si128 + _mm256_inserti128_si256. This is an optimization which + * only applies to Sandy and Ivy Bridge... which don't even support AVX2. 
+ * + * That is why when compiling the AVX2 version, it is recommended to use either + * -O2 -mavx2 -march=haswell + * or + * -O2 -mavx2 -mno-avx256-split-unaligned-load + * for decent performance, or to use Clang instead. + * + * Fortunately, we can control the first one with a pragma that forces GCC into + * -O2, but the other one we can't control without "failed to inline always + * inline function due to target mismatch" warnings. + */ +#if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \ + && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \ + && defined(__OPTIMIZE__) && XXH_SIZE_OPT <= 0 /* respect -O0 and -Os */ +# pragma GCC push_options +# pragma GCC optimize("-O2") +#endif + +#if XXH_VECTOR == XXH_NEON + +/* + * UGLY HACK: While AArch64 GCC on Linux does not seem to care, on macOS, GCC -O3 + * optimizes out the entire hashLong loop because of the aliasing violation. + * + * However, GCC is also inefficient at load-store optimization with vld1q/vst1q, + * so the only option is to mark it as aliasing. + */ +typedef uint64x2_t xxh_aliasing_uint64x2_t XXH_ALIASING; + +/*! + * @internal + * @brief `vld1q_u64` but faster and alignment-safe. + * + * On AArch64, unaligned access is always safe, but on ARMv7-a, it is only + * *conditionally* safe (`vld1` has an alignment bit like `movdq[ua]` in x86). + * + * GCC for AArch64 sees `vld1q_u8` as an intrinsic instead of a load, so it + * prohibits load-store optimizations. Therefore, a direct dereference is used. + * + * Otherwise, `vld1q_u8` is used with `vreinterpretq_u8_u64` to do a safe + * unaligned load. + */ +#if defined(__aarch64__) && defined(__GNUC__) && !defined(__clang__) +XXH_FORCE_INLINE uint64x2_t XXH_vld1q_u64(void const* ptr) /* silence -Wcast-align */ +{ + return *(xxh_aliasing_uint64x2_t const *)ptr; +} +#else +XXH_FORCE_INLINE uint64x2_t XXH_vld1q_u64(void const* ptr) +{ + return vreinterpretq_u64_u8(vld1q_u8((uint8_t const*)ptr)); +} +#endif + +/*! 
+ * @internal + * @brief `vmlal_u32` on low and high halves of a vector. + * + * This is a workaround for AArch64 GCC < 11 which implemented arm_neon.h with + * inline assembly and were therefore incapable of merging the `vget_{low, high}_u32` + * with `vmlal_u32`. + */ +#if defined(__aarch64__) && defined(__GNUC__) && !defined(__clang__) && __GNUC__ < 11 +XXH_FORCE_INLINE uint64x2_t +XXH_vmlal_low_u32(uint64x2_t acc, uint32x4_t lhs, uint32x4_t rhs) +{ + /* Inline assembly is the only way */ + __asm__("umlal %0.2d, %1.2s, %2.2s" : "+w" (acc) : "w" (lhs), "w" (rhs)); + return acc; +} +XXH_FORCE_INLINE uint64x2_t +XXH_vmlal_high_u32(uint64x2_t acc, uint32x4_t lhs, uint32x4_t rhs) +{ + /* This intrinsic works as expected */ + return vmlal_high_u32(acc, lhs, rhs); +} +#else +/* Portable intrinsic versions */ +XXH_FORCE_INLINE uint64x2_t +XXH_vmlal_low_u32(uint64x2_t acc, uint32x4_t lhs, uint32x4_t rhs) +{ + return vmlal_u32(acc, vget_low_u32(lhs), vget_low_u32(rhs)); +} +/*! @copydoc XXH_vmlal_low_u32 + * Assume the compiler converts this to vmlal_high_u32 on aarch64 */ +XXH_FORCE_INLINE uint64x2_t +XXH_vmlal_high_u32(uint64x2_t acc, uint32x4_t lhs, uint32x4_t rhs) +{ + return vmlal_u32(acc, vget_high_u32(lhs), vget_high_u32(rhs)); +} +#endif + +/*! + * @ingroup tuning + * @brief Controls the NEON to scalar ratio for XXH3 + * + * This can be set to 2, 4, 6, or 8. + * + * ARM Cortex CPUs are _very_ sensitive to how their pipelines are used. + * + * For example, the Cortex-A73 can dispatch 3 micro-ops per cycle, but only 2 of those + * can be NEON. If you are only using NEON instructions, you are only using 2/3 of the CPU + * bandwidth. + * + * This is even more noticeable on the more advanced cores like the Cortex-A76 which + * can dispatch 8 micro-ops per cycle, but still only 2 NEON micro-ops at once. + * + * Therefore, to make the most out of the pipeline, it is beneficial to run 6 NEON lanes + * and 2 scalar lanes, which is chosen by default. 
+ * + * This does not apply to Apple processors or 32-bit processors, which run better with + * full NEON. These will default to 8. Additionally, size-optimized builds run 8 lanes. + * + * This change benefits CPUs with large micro-op buffers without negatively affecting + * most other CPUs: + * + * | Chipset | Dispatch type | NEON only | 6:2 hybrid | Diff. | + * |:----------------------|:--------------------|----------:|-----------:|------:| + * | Snapdragon 730 (A76) | 2 NEON/8 micro-ops | 8.8 GB/s | 10.1 GB/s | ~16% | + * | Snapdragon 835 (A73) | 2 NEON/3 micro-ops | 5.1 GB/s | 5.3 GB/s | ~5% | + * | Marvell PXA1928 (A53) | In-order dual-issue | 1.9 GB/s | 1.9 GB/s | 0% | + * | Apple M1 | 4 NEON/8 micro-ops | 37.3 GB/s | 36.1 GB/s | ~-3% | + * + * It also seems to fix some bad codegen on GCC, making it almost as fast as clang. + * + * When using WASM SIMD128, if this is 2 or 6, SIMDe will scalarize 2 of the lanes, meaning + * it effectively becomes a worse version of 4. + * + * @see XXH3_accumulate_512_neon() + */ +# ifndef XXH3_NEON_LANES +# if (defined(__aarch64__) || defined(__arm64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) \ + && !defined(__APPLE__) && XXH_SIZE_OPT <= 0 +# define XXH3_NEON_LANES 6 +# else +# define XXH3_NEON_LANES XXH_ACC_NB +# endif +# endif +#endif /* XXH_VECTOR == XXH_NEON */ + +/* + * VSX and Z Vector helpers. + * + * This is very messy, and any pull requests to clean this up are welcome. + * + * There are a lot of problems with supporting VSX and s390x, due to + * inconsistent intrinsics, spotty coverage, and multiple endiannesses. + */ +#if XXH_VECTOR == XXH_VSX +/* Annoyingly, these headers _may_ define three macros: `bool`, `vector`, + * and `pixel`. This is a problem for obvious reasons. + * + * These keywords are unnecessary; the spec literally says they are + * equivalent to `__bool`, `__vector`, and `__pixel` and may be undef'd + * after including the header. + * + * We use pragma push_macro/pop_macro to keep the namespace clean. 
*/ +# pragma push_macro("bool") +# pragma push_macro("vector") +# pragma push_macro("pixel") +/* silence potential macro redefined warnings */ +# undef bool +# undef vector +# undef pixel + +# if defined(__s390x__) +# include <s390intrin.h> /* z/Architecture vector intrinsics */ +# else +# include <altivec.h> /* AltiVec/VSX intrinsics (vec_perm, vec_revb, ...) */ +# endif + +/* Restore the original macro values, if applicable. */ +# pragma pop_macro("pixel") +# pragma pop_macro("vector") +# pragma pop_macro("bool") + +typedef __vector unsigned long long xxh_u64x2; +typedef __vector unsigned char xxh_u8x16; +typedef __vector unsigned xxh_u32x4; + +/* + * UGLY HACK: Similar to aarch64 macOS GCC, s390x GCC has the same aliasing issue. + */ +typedef xxh_u64x2 xxh_aliasing_u64x2 XXH_ALIASING; + +/* XXH_VSX_BE: 1 when vector data must be byte-swapped after loading (big-endian target or big-endian element order), else 0. */ +# ifndef XXH_VSX_BE +# if defined(__BIG_ENDIAN__) \ + || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) +# define XXH_VSX_BE 1 +# elif defined(__VEC_ELEMENT_REG_ORDER__) && __VEC_ELEMENT_REG_ORDER__ == __ORDER_BIG_ENDIAN__ +# warning "-maltivec=be is not recommended. Please use native endianness." +# define XXH_VSX_BE 1 +# else +# define XXH_VSX_BE 0 +# endif +# endif /* !defined(XXH_VSX_BE) */ + +# if XXH_VSX_BE +# if defined(__POWER9_VECTOR__) || (defined(__clang__) && defined(__s390x__)) +# define XXH_vec_revb vec_revb +# else +/*! + * A polyfill for POWER9's vec_revb(). + * + * Reverses the byte order inside each of the two 64-bit elements of @p val + * with a single vec_perm (indices 7..0 for the first element, 15..8 for the second). + */ +XXH_FORCE_INLINE xxh_u64x2 XXH_vec_revb(xxh_u64x2 val) +{ + xxh_u8x16 const vByteSwap = { 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00, + 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08 }; + return vec_perm(val, val, vByteSwap); +} +# endif +# endif /* XXH_VSX_BE */ + +/*! + * Performs an unaligned vector load and byte swaps it on big endian. 
+ */ +XXH_FORCE_INLINE xxh_u64x2 XXH_vec_loadu(const void *ptr) +{ + xxh_u64x2 ret; + XXH_memcpy(&ret, ptr, sizeof(xxh_u64x2)); /* byte-wise copy, so ptr needs no particular alignment */ +# if XXH_VSX_BE + ret = XXH_vec_revb(ret); /* swap the bytes of each 64-bit element */ +# endif + return ret; +} + +/* + * vec_mulo and vec_mule are very problematic intrinsics on PowerPC. + * + * These intrinsics weren't added until GCC 8, despite existing for a while, + * and they are endian dependent. Also, their meanings swap depending on version. + * */ +# if defined(__s390x__) + /* s390x is always big endian, no issue on this platform */ +# define XXH_vec_mulo vec_mulo +# define XXH_vec_mule vec_mule +# elif defined(__clang__) && XXH_HAS_BUILTIN(__builtin_altivec_vmuleuw) && !defined(__ibmxl__) +/* Clang has a better way to control this, we can just use the builtin which doesn't swap. */ + /* The IBM XL Compiler (which defines __clang__) only implements the vec_* operations */ +# define XXH_vec_mulo __builtin_altivec_vmulouw +# define XXH_vec_mule __builtin_altivec_vmuleuw +# else +/* gcc needs inline assembly */ +/* Adapted from https://github.com/google/highwayhash/blob/master/highwayhash/hh_vsx.h. 
*/ +XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mulo(xxh_u32x4 a, xxh_u32x4 b) +{ /* emits the vmulouw instruction directly rather than going through the vec_mulo intrinsic */ + xxh_u64x2 result; + __asm__("vmulouw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b)); + return result; +} +XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mule(xxh_u32x4 a, xxh_u32x4 b) +{ /* emits the vmuleuw instruction directly rather than going through the vec_mule intrinsic */ + xxh_u64x2 result; + __asm__("vmuleuw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b)); + return result; +} +# endif /* XXH_vec_mulo, XXH_vec_mule */ +#endif /* XXH_VECTOR == XXH_VSX */ + +/* ACCRND: one SVE accumulation step. It relies on `mask`, `xinput`, `xsecret` and `kSwap` being in scope at the expansion site. */ +#if XXH_VECTOR == XXH_SVE +#define ACCRND(acc, offset) \ +do { \ + svuint64_t input_vec = svld1_u64(mask, xinput + offset); \ + svuint64_t secret_vec = svld1_u64(mask, xsecret + offset); \ + svuint64_t mixed = sveor_u64_x(mask, secret_vec, input_vec); \ + svuint64_t swapped = svtbl_u64(input_vec, kSwap); \ + svuint64_t mixed_lo = svextw_u64_x(mask, mixed); \ + svuint64_t mixed_hi = svlsr_n_u64_x(mask, mixed, 32); \ + svuint64_t mul = svmad_u64_x(mask, mixed_lo, mixed_hi, swapped); \ + acc = svadd_u64_x(mask, acc, mul); \ +} while (0) +#endif /* XXH_VECTOR == XXH_SVE */ + +/* prefetch + * can be disabled by declaring the XXH_NO_PREFETCH build macro; + * every disabled variant still evaluates to (void)(ptr) so the argument counts as used */ +#if defined(XXH_NO_PREFETCH) +# define XXH_PREFETCH(ptr) (void)(ptr) /* disabled */ +#else +# if XXH_SIZE_OPT >= 1 +# define XXH_PREFETCH(ptr) (void)(ptr) +# elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86)) /* _mm_prefetch() not defined outside of x86/x64 */ +# include <mmintrin.h> /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */ +# define XXH_PREFETCH(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T0) +# elif defined(__GNUC__) && ( (__GNUC__ >= 4) || ( (__GNUC__ == 3) && (__GNUC_MINOR__ >= 1) ) ) +# define XXH_PREFETCH(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */) +# else +# define XXH_PREFETCH(ptr) (void)(ptr) /* disabled */ +# endif +#endif /* XXH_NO_PREFETCH */ + + +/* ========================================== + * XXH3 default settings + * ========================================== */ + +#define XXH_SECRET_DEFAULT_SIZE 192 /* minimum 
XXH3_SECRET_SIZE_MIN */ + +#if (XXH_SECRET_DEFAULT_SIZE < XXH3_SECRET_SIZE_MIN) +# error "default keyset is not large enough" +#endif + +/*! Pseudorandom secret taken directly from FARSH. */ +XXH_ALIGN(64) static const xxh_u8 XXH3_kSecret[XXH_SECRET_DEFAULT_SIZE] = { + 0xb8, 0xfe, 0x6c, 0x39, 0x23, 0xa4, 0x4b, 0xbe, 0x7c, 0x01, 0x81, 0x2c, 0xf7, 0x21, 0xad, 0x1c, + 0xde, 0xd4, 0x6d, 0xe9, 0x83, 0x90, 0x97, 0xdb, 0x72, 0x40, 0xa4, 0xa4, 0xb7, 0xb3, 0x67, 0x1f, + 0xcb, 0x79, 0xe6, 0x4e, 0xcc, 0xc0, 0xe5, 0x78, 0x82, 0x5a, 0xd0, 0x7d, 0xcc, 0xff, 0x72, 0x21, + 0xb8, 0x08, 0x46, 0x74, 0xf7, 0x43, 0x24, 0x8e, 0xe0, 0x35, 0x90, 0xe6, 0x81, 0x3a, 0x26, 0x4c, + 0x3c, 0x28, 0x52, 0xbb, 0x91, 0xc3, 0x00, 0xcb, 0x88, 0xd0, 0x65, 0x8b, 0x1b, 0x53, 0x2e, 0xa3, + 0x71, 0x64, 0x48, 0x97, 0xa2, 0x0d, 0xf9, 0x4e, 0x38, 0x19, 0xef, 0x46, 0xa9, 0xde, 0xac, 0xd8, + 0xa8, 0xfa, 0x76, 0x3f, 0xe3, 0x9c, 0x34, 0x3f, 0xf9, 0xdc, 0xbb, 0xc7, 0xc7, 0x0b, 0x4f, 0x1d, + 0x8a, 0x51, 0xe0, 0x4b, 0xcd, 0xb4, 0x59, 0x31, 0xc8, 0x9f, 0x7e, 0xc9, 0xd9, 0x78, 0x73, 0x64, + 0xea, 0xc5, 0xac, 0x83, 0x34, 0xd3, 0xeb, 0xc3, 0xc5, 0x81, 0xa0, 0xff, 0xfa, 0x13, 0x63, 0xeb, + 0x17, 0x0d, 0xdd, 0x51, 0xb7, 0xf0, 0xda, 0x49, 0xd3, 0x16, 0x55, 0x26, 0x29, 0xd4, 0x68, 0x9e, + 0x2b, 0x16, 0xbe, 0x58, 0x7d, 0x47, 0xa1, 0xfc, 0x8f, 0xf8, 0xb8, 0xd1, 0x7a, 0xd0, 0x31, 0xce, + 0x45, 0xcb, 0x3a, 0x8f, 0x95, 0x16, 0x04, 0x28, 0xaf, 0xd7, 0xfb, 0xca, 0xbb, 0x4b, 0x40, 0x7e, +}; + +static const xxh_u64 PRIME_MX1 = 0x165667919E3779F9ULL; /*!< 0b0001011001010110011001111001000110011110001101110111100111111001 */ +static const xxh_u64 PRIME_MX2 = 0x9FB21C651E98DF25ULL; /*!< 0b1001111110110010000111000110010100011110100110001101111100100101 */ + +#ifdef XXH_OLD_NAMES +# define kSecret XXH3_kSecret +#endif + +#ifdef XXH_DOXYGEN +/*! + * @brief Calculates a 32-bit to 64-bit long multiply. + * + * Implemented as a macro. 
+ * + * Wraps `__emulu` on MSVC x86 because it tends to call `__allmul` when it doesn't + * need to (but it shouldn't need to anyways, it is about 7 instructions to do + * a 64x64 multiply...). Since we know that this will _always_ emit `MULL`, we + * use that instead of the normal method. + * + * If you are compiling for platforms like Thumb-1 and don't have a better option, + * you may also want to write your own long multiply routine here. + * + * @param x, y Numbers to be multiplied + * @return 64-bit product of the low 32 bits of @p x and @p y. + */ +XXH_FORCE_INLINE xxh_u64 +XXH_mult32to64(xxh_u64 x, xxh_u64 y) +{ + return (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF); +} +#elif defined(_MSC_VER) && defined(_M_IX86) +# define XXH_mult32to64(x, y) __emulu((unsigned)(x), (unsigned)(y)) +#else +/* + * Downcast + upcast is usually better than masking on older compilers like + * GCC 4.2 (especially 32-bit ones), all without affecting newer compilers. + * + * The other method, (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF), will AND both operands + * and perform a full 64x64 multiply -- entirely redundant on 32-bit. + */ +# define XXH_mult32to64(x, y) ((xxh_u64)(xxh_u32)(x) * (xxh_u64)(xxh_u32)(y)) +#endif + +/*! + * @brief Calculates a 64->128-bit long multiply. + * + * Uses `__uint128_t` and `_umul128` if available, otherwise uses a scalar + * version. + * + * @param lhs , rhs The 64-bit integers to be multiplied + * @return The 128-bit result represented in an @ref XXH128_hash_t. + */ +static XXH128_hash_t +XXH_mult64to128(xxh_u64 lhs, xxh_u64 rhs) +{ + /* + * GCC/Clang __uint128_t method. + * + * On most 64-bit targets, GCC and Clang define a __uint128_t type. + * This is usually the best way as it usually uses a native long 64-bit + * multiply, such as MULQ on x86_64 or MUL + UMULH on aarch64. + * + * Usually. + * + * Despite being a 32-bit platform, Clang (and emscripten) define this type + * despite not having the arithmetic for it. 
This results in a laggy + * compiler builtin call which calculates a full 128-bit multiply. + * In that case it is best to use the portable one. + * https://github.com/Cyan4973/xxHash/issues/211#issuecomment-515575677 + */ +#if (defined(__GNUC__) || defined(__clang__)) && !defined(__wasm__) \ + && defined(__SIZEOF_INT128__) \ + || (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128) + + __uint128_t const product = (__uint128_t)lhs * (__uint128_t)rhs; + XXH128_hash_t r128; + r128.low64 = (xxh_u64)(product); + r128.high64 = (xxh_u64)(product >> 64); + return r128; + + /* + * MSVC for x64's _umul128 method. + * + * xxh_u64 _umul128(xxh_u64 Multiplier, xxh_u64 Multiplicand, xxh_u64 *HighProduct); + * + * This compiles to single operand MUL on x64. + */ +#elif (defined(_M_X64) || defined(_M_IA64)) && !defined(_M_ARM64EC) + +#ifndef _MSC_VER +# pragma intrinsic(_umul128) +#endif + xxh_u64 product_high; + xxh_u64 const product_low = _umul128(lhs, rhs, &product_high); + XXH128_hash_t r128; + r128.low64 = product_low; + r128.high64 = product_high; + return r128; + + /* + * MSVC for ARM64's __umulh method. + * + * This compiles to the same MUL + UMULH as GCC/Clang's __uint128_t method. + */ +#elif defined(_M_ARM64) || defined(_M_ARM64EC) + +#ifndef _MSC_VER +# pragma intrinsic(__umulh) +#endif + XXH128_hash_t r128; + r128.low64 = lhs * rhs; + r128.high64 = __umulh(lhs, rhs); + return r128; + +#else + /* + * Portable scalar method. Optimized for 32-bit and 64-bit ALUs. + * + * This is a fast and simple grade school multiply, which is shown below + * with base 10 arithmetic instead of base 0x100000000. 
+ * + * 9 3 // D2 lhs = 93 + * x 7 5 // D2 rhs = 75 + * ---------- + * 1 5 // D2 lo_lo = (93 % 10) * (75 % 10) = 15 + * 4 5 | // D2 hi_lo = (93 / 10) * (75 % 10) = 45 + * 2 1 | // D2 lo_hi = (93 % 10) * (75 / 10) = 21 + * + 6 3 | | // D2 hi_hi = (93 / 10) * (75 / 10) = 63 + * --------- + * 2 7 | // D2 cross = (15 / 10) + (45 % 10) + 21 = 27 + * + 6 7 | | // D2 upper = (27 / 10) + (45 / 10) + 63 = 67 + * --------- + * 6 9 7 5 // D4 res = (27 * 10) + (15 % 10) + (67 * 100) = 6975 + * + * The reasons for adding the products like this are: + * 1. It avoids manual carry tracking. Just like how + * (9 * 9) + 9 + 9 = 99, the same applies with this for UINT64_MAX. + * This avoids a lot of complexity. + * + * 2. It hints for, and on Clang, compiles to, the powerful UMAAL + * instruction available in ARM's Digital Signal Processing extension + * in 32-bit ARMv6 and later, which is shown below: + * + * void UMAAL(xxh_u32 *RdLo, xxh_u32 *RdHi, xxh_u32 Rn, xxh_u32 Rm) + * { + * xxh_u64 product = (xxh_u64)*RdLo * (xxh_u64)*RdHi + Rn + Rm; + * *RdLo = (xxh_u32)(product & 0xFFFFFFFF); + * *RdHi = (xxh_u32)(product >> 32); + * } + * + * This instruction was designed for efficient long multiplication, and + * allows this to be calculated in only 4 instructions at speeds + * comparable to some 64-bit ALUs. + * + * 3. It isn't terrible on other platforms. Usually this will be a couple + * of 32-bit ADD/ADCs. + */ + + /* First calculate all of the cross products. */ + xxh_u64 const lo_lo = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs & 0xFFFFFFFF); + xxh_u64 const hi_lo = XXH_mult32to64(lhs >> 32, rhs & 0xFFFFFFFF); + xxh_u64 const lo_hi = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs >> 32); + xxh_u64 const hi_hi = XXH_mult32to64(lhs >> 32, rhs >> 32); + + /* Now add the products together. These will never overflow. 
*/ + xxh_u64 const cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi; + xxh_u64 const upper = (hi_lo >> 32) + (cross >> 32) + hi_hi; + xxh_u64 const lower = (cross << 32) | (lo_lo & 0xFFFFFFFF); /* low half from lo_lo, high half from the low 32 bits of cross */ + + XXH128_hash_t r128; + r128.low64 = lower; + r128.high64 = upper; + return r128; +#endif +} + +/*! + * @brief Calculates a 64-bit to 128-bit multiply, then XOR folds it. + * + * The reason for the separate function is to prevent passing too many structs + * around by value. This will hopefully inline the multiply, but we don't force it. + * + * @param lhs , rhs The 64-bit integers to multiply + * @return The low 64 bits of the product XOR'd by the high 64 bits. + * @see XXH_mult64to128() + */ +static xxh_u64 +XXH3_mul128_fold64(xxh_u64 lhs, xxh_u64 rhs) +{ + XXH128_hash_t product = XXH_mult64to128(lhs, rhs); + return product.low64 ^ product.high64; +} + +/*! + * @brief XORs @p v64 with a copy of itself shifted right by @p shift bits. + * + * @p shift must be in [0, 63] (asserted). + * Seems to produce slightly better code on GCC for some reason. + */ +XXH_FORCE_INLINE XXH_CONSTF xxh_u64 XXH_xorshift64(xxh_u64 v64, int shift) +{ + XXH_ASSERT(0 <= shift && shift < 64); + return v64 ^ (v64 >> shift); +} + +/* + * This is a fast avalanche stage, + * suitable when input bits are already partially mixed. + * Sequence: xorshift by 37, multiply by PRIME_MX1, xorshift by 32. + */ +static XXH64_hash_t XXH3_avalanche(xxh_u64 h64) +{ + h64 = XXH_xorshift64(h64, 37); + h64 *= PRIME_MX1; + h64 = XXH_xorshift64(h64, 32); + return h64; +} + +/* + * This is a stronger avalanche, + * inspired by Pelle Evensen's rrmxmx + * preferable when input has not been previously mixed. + * `len` is added into the middle xor-shift step, so the length takes part in the mix. + */ +static XXH64_hash_t XXH3_rrmxmx(xxh_u64 h64, xxh_u64 len) +{ + /* rotate-rotate-xor, multiply, shift-xor (plus len), multiply, final xorshift */ + h64 ^= XXH_rotl64(h64, 49) ^ XXH_rotl64(h64, 24); + h64 *= PRIME_MX2; + h64 ^= (h64 >> 35) + len ; + h64 *= PRIME_MX2; + return XXH_xorshift64(h64, 28); +} + + +/* ========================================== + * Short keys + * ========================================== + * One of the shortcomings of XXH32 and XXH64 was that their performance was + * sub-optimal on short lengths. 
It used an iterative algorithm which strongly + * favored lengths that were a multiple of 4 or 8. + * + * Instead of iterating over individual inputs, we use a set of single shot + * functions which piece together a range of lengths and operate in constant time. + * + * Additionally, the number of multiplies has been significantly reduced. This + * reduces latency, especially when emulating 64-bit multiplies on 32-bit. + * + * Depending on the platform, this may or may not be faster than XXH32, but it + * is almost guaranteed to be faster than XXH64. + */ + +/* + * At very short lengths, there isn't enough input to fully hide secrets, or use + * the entire secret. + * + * There is also only a limited amount of mixing we can do before significantly + * impacting performance. + * + * Therefore, we use different sections of the secret and always mix two secret + * samples with an XOR. This should have no effect on performance on the + * seedless or withSeed variants because everything _should_ be constant folded + * by modern compilers. + * + * The XOR mixing hides individual parts of the secret and increases entropy. + * + * This adds an extra layer of strength for custom secrets. 
+ */ +XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t +XXH3_len_1to3_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(input != NULL); + XXH_ASSERT(1 <= len && len <= 3); + XXH_ASSERT(secret != NULL); + /* + * len = 1: combined = { input[0], 0x01, input[0], input[0] } + * len = 2: combined = { input[1], 0x02, input[0], input[1] } + * len = 3: combined = { input[2], 0x03, input[0], input[1] } + */ + { xxh_u8 const c1 = input[0]; + xxh_u8 const c2 = input[len >> 1]; + xxh_u8 const c3 = input[len - 1]; + xxh_u32 const combined = ((xxh_u32)c1 << 16) | ((xxh_u32)c2 << 24) + | ((xxh_u32)c3 << 0) | ((xxh_u32)len << 8); + xxh_u64 const bitflip = (XXH_readLE32(secret) ^ XXH_readLE32(secret+4)) + seed; + xxh_u64 const keyed = (xxh_u64)combined ^ bitflip; + return XXH64_avalanche(keyed); + } +} + +XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t +XXH3_len_4to8_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(input != NULL); + XXH_ASSERT(secret != NULL); + XXH_ASSERT(4 <= len && len <= 8); + seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32; + { xxh_u32 const input1 = XXH_readLE32(input); + xxh_u32 const input2 = XXH_readLE32(input + len - 4); + xxh_u64 const bitflip = (XXH_readLE64(secret+8) ^ XXH_readLE64(secret+16)) - seed; + xxh_u64 const input64 = input2 + (((xxh_u64)input1) << 32); + xxh_u64 const keyed = input64 ^ bitflip; + return XXH3_rrmxmx(keyed, len); + } +} + +XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t +XXH3_len_9to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(input != NULL); + XXH_ASSERT(secret != NULL); + XXH_ASSERT(9 <= len && len <= 16); + { xxh_u64 const bitflip1 = (XXH_readLE64(secret+24) ^ XXH_readLE64(secret+32)) + seed; + xxh_u64 const bitflip2 = (XXH_readLE64(secret+40) ^ XXH_readLE64(secret+48)) - seed; + xxh_u64 const input_lo = XXH_readLE64(input) ^ bitflip1; + xxh_u64 const input_hi = XXH_readLE64(input + len - 8) ^ bitflip2; + 
xxh_u64 const acc = len + + XXH_swap64(input_lo) + input_hi + + XXH3_mul128_fold64(input_lo, input_hi); + return XXH3_avalanche(acc); + } +} + +XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t +XXH3_len_0to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(len <= 16); + { if (XXH_likely(len > 8)) return XXH3_len_9to16_64b(input, len, secret, seed); + if (XXH_likely(len >= 4)) return XXH3_len_4to8_64b(input, len, secret, seed); + if (len) return XXH3_len_1to3_64b(input, len, secret, seed); + return XXH64_avalanche(seed ^ (XXH_readLE64(secret+56) ^ XXH_readLE64(secret+64))); + } +} + +/* + * DISCLAIMER: There are known *seed-dependent* multicollisions here due to + * multiplication by zero, affecting hashes of lengths 17 to 240. + * + * However, they are very unlikely. + * + * Keep this in mind when using the unseeded XXH3_64bits() variant: As with all + * unseeded non-cryptographic hashes, it does not attempt to defend itself + * against specially crafted inputs, only random inputs. + * + * Compared to classic UMAC where a 1 in 2^31 chance of 4 consecutive bytes + * cancelling out the secret is taken an arbitrary number of times (addressed + * in XXH3_accumulate_512), this collision is very unlikely with random inputs + * and/or proper seeding: + * + * This only has a 1 in 2^63 chance of 8 consecutive bytes cancelling out, in a + * function that is only called up to 16 times per hash with up to 240 bytes of + * input. + * + * This is not too bad for a non-cryptographic hash function, especially with + * only 64 bit outputs. + * + * The 128-bit variant (which trades some speed for strength) is NOT affected + * by this, although it is always a good idea to use a proper seed if you care + * about strength. 
+ */ +XXH_FORCE_INLINE xxh_u64 XXH3_mix16B(const xxh_u8* XXH_RESTRICT input, + const xxh_u8* XXH_RESTRICT secret, xxh_u64 seed64) +{ +#if defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \ + && defined(__i386__) && defined(__SSE2__) /* x86 + SSE2 */ \ + && !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable like XXH32 hack */ + /* + * UGLY HACK: + * GCC for x86 tends to autovectorize the 128-bit multiply, resulting in + * slower code. + * + * By forcing seed64 into a register, we disrupt the cost model and + * cause it to scalarize. See `XXH32_round()` + * + * FIXME: Clang's output is still _much_ faster -- On an AMD Ryzen 3600, + * XXH3_64bits @ len=240 runs at 4.6 GB/s with Clang 9, but 3.3 GB/s on + * GCC 9.2, despite both emitting scalar code. + * + * GCC generates much better scalar code than Clang for the rest of XXH3, + * which is why finding a more optimal codepath is an interest. + */ + XXH_COMPILER_GUARD(seed64); +#endif + { xxh_u64 const input_lo = XXH_readLE64(input); + xxh_u64 const input_hi = XXH_readLE64(input+8); + return XXH3_mul128_fold64( + input_lo ^ (XXH_readLE64(secret) + seed64), + input_hi ^ (XXH_readLE64(secret+8) - seed64) + ); + } +} + +/* For mid range keys, XXH3 uses a Mum-hash variant. */ +XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t +XXH3_len_17to128_64b(const xxh_u8* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize, + XXH64_hash_t seed) +{ + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize; + XXH_ASSERT(16 < len && len <= 128); + + { xxh_u64 acc = len * XXH_PRIME64_1; +#if XXH_SIZE_OPT >= 1 + /* Smaller and cleaner, but slightly slower. 
*/ + unsigned int i = (unsigned int)(len - 1) / 32; + do { + acc += XXH3_mix16B(input+16 * i, secret+32*i, seed); + acc += XXH3_mix16B(input+len-16*(i+1), secret+32*i+16, seed); + } while (i-- != 0); +#else + if (len > 32) { + if (len > 64) { + if (len > 96) { + acc += XXH3_mix16B(input+48, secret+96, seed); + acc += XXH3_mix16B(input+len-64, secret+112, seed); + } + acc += XXH3_mix16B(input+32, secret+64, seed); + acc += XXH3_mix16B(input+len-48, secret+80, seed); + } + acc += XXH3_mix16B(input+16, secret+32, seed); + acc += XXH3_mix16B(input+len-32, secret+48, seed); + } + acc += XXH3_mix16B(input+0, secret+0, seed); + acc += XXH3_mix16B(input+len-16, secret+16, seed); +#endif + return XXH3_avalanche(acc); + } +} + +/*! + * @brief Maximum size of "short" key in bytes. + */ +#define XXH3_MIDSIZE_MAX 240 + +XXH_NO_INLINE XXH_PUREF XXH64_hash_t +XXH3_len_129to240_64b(const xxh_u8* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize, + XXH64_hash_t seed) +{ + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize; + XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX); + + #define XXH3_MIDSIZE_STARTOFFSET 3 + #define XXH3_MIDSIZE_LASTOFFSET 17 + + { xxh_u64 acc = len * XXH_PRIME64_1; + xxh_u64 acc_end; + unsigned int const nbRounds = (unsigned int)len / 16; + unsigned int i; + XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX); + for (i=0; i<8; i++) { + acc += XXH3_mix16B(input+(16*i), secret+(16*i), seed); + } + /* last bytes */ + acc_end = XXH3_mix16B(input + len - 16, secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET, seed); + XXH_ASSERT(nbRounds >= 8); + acc = XXH3_avalanche(acc); +#if defined(__clang__) /* Clang */ \ + && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */ \ + && !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable */ + /* + * UGLY HACK: + * Clang for ARMv7-A tries to vectorize this loop, similar to GCC x86. + * In everywhere else, it uses scalar code. 
+ * + * For 64->128-bit multiplies, even if the NEON was 100% optimal, it + * would still be slower than UMAAL (see XXH_mult64to128). + * + * Unfortunately, Clang doesn't handle the long multiplies properly and + * converts them to the nonexistent "vmulq_u64" intrinsic, which is then + * scalarized into an ugly mess of VMOV.32 instructions. + * + * This mess is difficult to avoid without turning autovectorization + * off completely, but they are usually relatively minor and/or not + * worth it to fix. + * + * This loop is the easiest to fix, as unlike XXH32, this pragma + * _actually works_ because it is a loop vectorization instead of an + * SLP vectorization. + */ + #pragma clang loop vectorize(disable) +#endif + for (i=8 ; i < nbRounds; i++) { + /* + * Prevents clang from unrolling the acc loop and interleaving with this one. + */ + XXH_COMPILER_GUARD(acc); + acc_end += XXH3_mix16B(input+(16*i), secret+(16*(i-8)) + XXH3_MIDSIZE_STARTOFFSET, seed); + } + return XXH3_avalanche(acc + acc_end); + } +} + + +/* ======= Long Keys ======= */ + +#define XXH_STRIPE_LEN 64 +#define XXH_SECRET_CONSUME_RATE 8 /* nb of secret bytes consumed at each accumulation */ +#define XXH_ACC_NB (XXH_STRIPE_LEN / sizeof(xxh_u64)) + +#ifdef XXH_OLD_NAMES +# define STRIPE_LEN XXH_STRIPE_LEN +# define ACC_NB XXH_ACC_NB +#endif + +#ifndef XXH_PREFETCH_DIST +# ifdef __clang__ +# define XXH_PREFETCH_DIST 320 +# else +# if (XXH_VECTOR == XXH_AVX512) +# define XXH_PREFETCH_DIST 512 +# else +# define XXH_PREFETCH_DIST 384 +# endif +# endif /* __clang__ */ +#endif /* XXH_PREFETCH_DIST */ + +/* + * This macro generates an XXH3_accumulate() function. + * The single argument selects the name suffix; any target attribute is + * written in front of the macro invocation. + * + * The name of this symbol is XXH3_accumulate_<name>() and it calls + * XXH3_accumulate_512_<name>(). + * + * It may be useful to hand implement this function if the compiler fails to + * optimize the inline function. 
+ */ +#define XXH3_ACCUMULATE_TEMPLATE(name) \ +void \ +XXH3_accumulate_##name(xxh_u64* XXH_RESTRICT acc, \ + const xxh_u8* XXH_RESTRICT input, \ + const xxh_u8* XXH_RESTRICT secret, \ + size_t nbStripes) \ +{ \ + size_t n; \ + for (n = 0; n < nbStripes; n++ ) { \ + const xxh_u8* const in = input + n*XXH_STRIPE_LEN; \ + XXH_PREFETCH(in + XXH_PREFETCH_DIST); \ + XXH3_accumulate_512_##name( \ + acc, \ + in, \ + secret + n*XXH_SECRET_CONSUME_RATE); \ + } \ +} + + +XXH_FORCE_INLINE void XXH_writeLE64(void* dst, xxh_u64 v64) +{ + if (!XXH_CPU_LITTLE_ENDIAN) v64 = XXH_swap64(v64); + XXH_memcpy(dst, &v64, sizeof(v64)); +} + +/* Several intrinsic functions below are supposed to accept __int64 as argument, + * as documented in https://software.intel.com/sites/landingpage/IntrinsicsGuide/ . + * However, several environments do not define __int64 type, + * requiring a workaround. + */ +#if !defined (__VMS) \ + && (defined (__cplusplus) \ + || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) ) + typedef int64_t xxh_i64; +#else + /* the following type must have a width of 64-bit */ + typedef long long xxh_i64; +#endif + + +/* + * XXH3_accumulate_512 is the tightest loop for long inputs, and it is the most optimized. + * + * It is a hardened version of UMAC, based off of FARSH's implementation. + * + * This was chosen because it adapts quite well to 32-bit, 64-bit, and SIMD + * implementations, and it is ridiculously fast. + * + * We harden it by mixing the original input to the accumulators as well as the product. + * + * This means that in the (relatively likely) case of a multiply by zero, the + * original input is preserved. + * + * On 128-bit inputs, we swap 64-bit pairs when we add the input to improve + * cross-pollination, as otherwise the upper and lower halves would be + * essentially independent. + * + * This doesn't matter on 64-bit hashes since they all get merged together in + * the end, so we skip the extra step. 
+ * + * Both XXH3_64bits and XXH3_128bits use this subroutine. + */ + +#if (XXH_VECTOR == XXH_AVX512) \ + || (defined(XXH_DISPATCH_AVX512) && XXH_DISPATCH_AVX512 != 0) + +#ifndef XXH_TARGET_AVX512 +# define XXH_TARGET_AVX512 /* disable attribute target */ +#endif + +XXH_FORCE_INLINE XXH_TARGET_AVX512 void +XXH3_accumulate_512_avx512(void* XXH_RESTRICT acc, + const void* XXH_RESTRICT input, + const void* XXH_RESTRICT secret) +{ + __m512i* const xacc = (__m512i *) acc; + XXH_ASSERT((((size_t)acc) & 63) == 0); + XXH_STATIC_ASSERT(XXH_STRIPE_LEN == sizeof(__m512i)); + + { + /* data_vec = input[0]; */ + __m512i const data_vec = _mm512_loadu_si512 (input); + /* key_vec = secret[0]; */ + __m512i const key_vec = _mm512_loadu_si512 (secret); + /* data_key = data_vec ^ key_vec; */ + __m512i const data_key = _mm512_xor_si512 (data_vec, key_vec); + /* data_key_lo = data_key >> 32; */ + __m512i const data_key_lo = _mm512_srli_epi64 (data_key, 32); + /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */ + __m512i const product = _mm512_mul_epu32 (data_key, data_key_lo); + /* xacc[0] += swap(data_vec); */ + __m512i const data_swap = _mm512_shuffle_epi32(data_vec, (_MM_PERM_ENUM)_MM_SHUFFLE(1, 0, 3, 2)); + __m512i const sum = _mm512_add_epi64(*xacc, data_swap); + /* xacc[0] += product; */ + *xacc = _mm512_add_epi64(product, sum); + } +} +XXH_FORCE_INLINE XXH_TARGET_AVX512 XXH3_ACCUMULATE_TEMPLATE(avx512) + +/* + * XXH3_scrambleAcc: Scrambles the accumulators to improve mixing. + * + * Multiplication isn't perfect, as explained by Google in HighwayHash: + * + * // Multiplication mixes/scrambles bytes 0-7 of the 64-bit result to + * // varying degrees. In descending order of goodness, bytes + * // 3 4 2 5 1 6 0 7 have quality 228 224 164 160 100 96 36 32. + * // As expected, the upper and lower bytes are much worse. 
+ * + * Source: https://github.com/google/highwayhash/blob/0aaf66b/highwayhash/hh_avx2.h#L291 + * + * Since our algorithm uses a pseudorandom secret to add some variance into the + * mix, we don't need to (or want to) mix as often or as much as HighwayHash does. + * + * This isn't as tight as XXH3_accumulate, but still written in SIMD to avoid + * extraction. + * + * Both XXH3_64bits and XXH3_128bits use this subroutine. + */ + +XXH_FORCE_INLINE XXH_TARGET_AVX512 void +XXH3_scrambleAcc_avx512(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) +{ + XXH_ASSERT((((size_t)acc) & 63) == 0); + XXH_STATIC_ASSERT(XXH_STRIPE_LEN == sizeof(__m512i)); + { __m512i* const xacc = (__m512i*) acc; + const __m512i prime32 = _mm512_set1_epi32((int)XXH_PRIME32_1); + + /* xacc[0] ^= (xacc[0] >> 47) */ + __m512i const acc_vec = *xacc; + __m512i const shifted = _mm512_srli_epi64 (acc_vec, 47); + /* xacc[0] ^= secret; */ + __m512i const key_vec = _mm512_loadu_si512 (secret); + __m512i const data_key = _mm512_ternarylogic_epi32(key_vec, acc_vec, shifted, 0x96 /* key_vec ^ acc_vec ^ shifted */); + + /* xacc[0] *= XXH_PRIME32_1; */ + __m512i const data_key_hi = _mm512_srli_epi64 (data_key, 32); + __m512i const prod_lo = _mm512_mul_epu32 (data_key, prime32); + __m512i const prod_hi = _mm512_mul_epu32 (data_key_hi, prime32); + *xacc = _mm512_add_epi64(prod_lo, _mm512_slli_epi64(prod_hi, 32)); + } +} + +XXH_FORCE_INLINE XXH_TARGET_AVX512 void +XXH3_initCustomSecret_avx512(void* XXH_RESTRICT customSecret, xxh_u64 seed64) +{ + XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 63) == 0); + XXH_STATIC_ASSERT(XXH_SEC_ALIGN == 64); + XXH_ASSERT(((size_t)customSecret & 63) == 0); + (void)(&XXH_writeLE64); + { int const nbRounds = XXH_SECRET_DEFAULT_SIZE / sizeof(__m512i); + __m512i const seed_pos = _mm512_set1_epi64((xxh_i64)seed64); + __m512i const seed = _mm512_mask_sub_epi64(seed_pos, 0xAA, _mm512_set1_epi8(0), seed_pos); + + const __m512i* const src = (const __m512i*) ((const void*) 
XXH3_kSecret); + __m512i* const dest = ( __m512i*) customSecret; + int i; + XXH_ASSERT(((size_t)src & 63) == 0); /* control alignment */ + XXH_ASSERT(((size_t)dest & 63) == 0); + for (i=0; i < nbRounds; ++i) { + dest[i] = _mm512_add_epi64(_mm512_load_si512(src + i), seed); + } } +} + +#endif + +#if (XXH_VECTOR == XXH_AVX2) \ + || (defined(XXH_DISPATCH_AVX2) && XXH_DISPATCH_AVX2 != 0) + +#ifndef XXH_TARGET_AVX2 +# define XXH_TARGET_AVX2 /* disable attribute target */ +#endif + +XXH_FORCE_INLINE XXH_TARGET_AVX2 void +XXH3_accumulate_512_avx2( void* XXH_RESTRICT acc, + const void* XXH_RESTRICT input, + const void* XXH_RESTRICT secret) +{ + XXH_ASSERT((((size_t)acc) & 31) == 0); + { __m256i* const xacc = (__m256i *) acc; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */ + const __m256i* const xinput = (const __m256i *) input; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. 
*/ + const __m256i* const xsecret = (const __m256i *) secret; + + size_t i; + for (i=0; i < XXH_STRIPE_LEN/sizeof(__m256i); i++) { + /* data_vec = xinput[i]; */ + __m256i const data_vec = _mm256_loadu_si256 (xinput+i); + /* key_vec = xsecret[i]; */ + __m256i const key_vec = _mm256_loadu_si256 (xsecret+i); + /* data_key = data_vec ^ key_vec; */ + __m256i const data_key = _mm256_xor_si256 (data_vec, key_vec); + /* data_key_lo = data_key >> 32; */ + __m256i const data_key_lo = _mm256_srli_epi64 (data_key, 32); + /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */ + __m256i const product = _mm256_mul_epu32 (data_key, data_key_lo); + /* xacc[i] += swap(data_vec); */ + __m256i const data_swap = _mm256_shuffle_epi32(data_vec, _MM_SHUFFLE(1, 0, 3, 2)); + __m256i const sum = _mm256_add_epi64(xacc[i], data_swap); + /* xacc[i] += product; */ + xacc[i] = _mm256_add_epi64(product, sum); + } } +} +XXH_FORCE_INLINE XXH_TARGET_AVX2 XXH3_ACCUMULATE_TEMPLATE(avx2) + +XXH_FORCE_INLINE XXH_TARGET_AVX2 void +XXH3_scrambleAcc_avx2(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) +{ + XXH_ASSERT((((size_t)acc) & 31) == 0); + { __m256i* const xacc = (__m256i*) acc; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. 
*/ + const __m256i* const xsecret = (const __m256i *) secret; + const __m256i prime32 = _mm256_set1_epi32((int)XXH_PRIME32_1); + + size_t i; + for (i=0; i < XXH_STRIPE_LEN/sizeof(__m256i); i++) { + /* xacc[i] ^= (xacc[i] >> 47) */ + __m256i const acc_vec = xacc[i]; + __m256i const shifted = _mm256_srli_epi64 (acc_vec, 47); + __m256i const data_vec = _mm256_xor_si256 (acc_vec, shifted); + /* xacc[i] ^= xsecret[i]; */ + __m256i const key_vec = _mm256_loadu_si256 (xsecret+i); + __m256i const data_key = _mm256_xor_si256 (data_vec, key_vec); + + /* xacc[i] *= XXH_PRIME32_1; */ + __m256i const data_key_hi = _mm256_srli_epi64 (data_key, 32); + __m256i const prod_lo = _mm256_mul_epu32 (data_key, prime32); + __m256i const prod_hi = _mm256_mul_epu32 (data_key_hi, prime32); + xacc[i] = _mm256_add_epi64(prod_lo, _mm256_slli_epi64(prod_hi, 32)); + } + } +} + +XXH_FORCE_INLINE XXH_TARGET_AVX2 void XXH3_initCustomSecret_avx2(void* XXH_RESTRICT customSecret, xxh_u64 seed64) +{ + XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 31) == 0); + XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE / sizeof(__m256i)) == 6); + XXH_STATIC_ASSERT(XXH_SEC_ALIGN <= 64); + (void)(&XXH_writeLE64); + XXH_PREFETCH(customSecret); + { __m256i const seed = _mm256_set_epi64x((xxh_i64)(0U - seed64), (xxh_i64)seed64, (xxh_i64)(0U - seed64), (xxh_i64)seed64); + + const __m256i* const src = (const __m256i*) ((const void*) XXH3_kSecret); + __m256i* dest = ( __m256i*) customSecret; + +# if defined(__GNUC__) || defined(__clang__) + /* + * On GCC & Clang, marking 'dest' as modified will cause the compiler: + * - do not extract the secret from sse registers in the internal loop + * - use less common registers, and avoid pushing these reg into stack + */ + XXH_COMPILER_GUARD(dest); +# endif + XXH_ASSERT(((size_t)src & 31) == 0); /* control alignment */ + XXH_ASSERT(((size_t)dest & 31) == 0); + + /* GCC -O2 needs this loop unrolled manually */ + dest[0] = _mm256_add_epi64(_mm256_load_si256(src+0), seed); + dest[1] = 
_mm256_add_epi64(_mm256_load_si256(src+1), seed); + dest[2] = _mm256_add_epi64(_mm256_load_si256(src+2), seed); + dest[3] = _mm256_add_epi64(_mm256_load_si256(src+3), seed); + dest[4] = _mm256_add_epi64(_mm256_load_si256(src+4), seed); + dest[5] = _mm256_add_epi64(_mm256_load_si256(src+5), seed); + } +} + +#endif + +/* x86dispatch always generates SSE2 */ +#if (XXH_VECTOR == XXH_SSE2) || defined(XXH_X86DISPATCH) + +#ifndef XXH_TARGET_SSE2 +# define XXH_TARGET_SSE2 /* disable attribute target */ +#endif + +XXH_FORCE_INLINE XXH_TARGET_SSE2 void +XXH3_accumulate_512_sse2( void* XXH_RESTRICT acc, + const void* XXH_RESTRICT input, + const void* XXH_RESTRICT secret) +{ + /* SSE2 is just a half-scale version of the AVX2 version. */ + XXH_ASSERT((((size_t)acc) & 15) == 0); + { __m128i* const xacc = (__m128i *) acc; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */ + const __m128i* const xinput = (const __m128i *) input; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm_loadu_si128 requires a const __m128i * pointer for some reason. 
*/ + const __m128i* const xsecret = (const __m128i *) secret; + + size_t i; + for (i=0; i < XXH_STRIPE_LEN/sizeof(__m128i); i++) { + /* data_vec = xinput[i]; */ + __m128i const data_vec = _mm_loadu_si128 (xinput+i); + /* key_vec = xsecret[i]; */ + __m128i const key_vec = _mm_loadu_si128 (xsecret+i); + /* data_key = data_vec ^ key_vec; */ + __m128i const data_key = _mm_xor_si128 (data_vec, key_vec); + /* data_key_lo = data_key >> 32; */ + __m128i const data_key_lo = _mm_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); + /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */ + __m128i const product = _mm_mul_epu32 (data_key, data_key_lo); + /* xacc[i] += swap(data_vec); */ + __m128i const data_swap = _mm_shuffle_epi32(data_vec, _MM_SHUFFLE(1,0,3,2)); + __m128i const sum = _mm_add_epi64(xacc[i], data_swap); + /* xacc[i] += product; */ + xacc[i] = _mm_add_epi64(product, sum); + } } +} +XXH_FORCE_INLINE XXH_TARGET_SSE2 XXH3_ACCUMULATE_TEMPLATE(sse2) + +XXH_FORCE_INLINE XXH_TARGET_SSE2 void +XXH3_scrambleAcc_sse2(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) +{ + XXH_ASSERT((((size_t)acc) & 15) == 0); + { __m128i* const xacc = (__m128i*) acc; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm_loadu_si128 requires a const __m128i * pointer for some reason. 
*/ + const __m128i* const xsecret = (const __m128i *) secret; + const __m128i prime32 = _mm_set1_epi32((int)XXH_PRIME32_1); + + size_t i; + for (i=0; i < XXH_STRIPE_LEN/sizeof(__m128i); i++) { + /* xacc[i] ^= (xacc[i] >> 47) */ + __m128i const acc_vec = xacc[i]; + __m128i const shifted = _mm_srli_epi64 (acc_vec, 47); + __m128i const data_vec = _mm_xor_si128 (acc_vec, shifted); + /* xacc[i] ^= xsecret[i]; */ + __m128i const key_vec = _mm_loadu_si128 (xsecret+i); + __m128i const data_key = _mm_xor_si128 (data_vec, key_vec); + + /* xacc[i] *= XXH_PRIME32_1; */ + __m128i const data_key_hi = _mm_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); + __m128i const prod_lo = _mm_mul_epu32 (data_key, prime32); + __m128i const prod_hi = _mm_mul_epu32 (data_key_hi, prime32); + xacc[i] = _mm_add_epi64(prod_lo, _mm_slli_epi64(prod_hi, 32)); + } + } +} + +XXH_FORCE_INLINE XXH_TARGET_SSE2 void XXH3_initCustomSecret_sse2(void* XXH_RESTRICT customSecret, xxh_u64 seed64) +{ + XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0); + (void)(&XXH_writeLE64); + { int const nbRounds = XXH_SECRET_DEFAULT_SIZE / sizeof(__m128i); + +# if defined(_MSC_VER) && defined(_M_IX86) && _MSC_VER < 1900 + /* MSVC 32bit mode does not support _mm_set_epi64x before 2015 */ + XXH_ALIGN(16) const xxh_i64 seed64x2[2] = { (xxh_i64)seed64, (xxh_i64)(0U - seed64) }; + __m128i const seed = _mm_load_si128((__m128i const*)seed64x2); +# else + __m128i const seed = _mm_set_epi64x((xxh_i64)(0U - seed64), (xxh_i64)seed64); +# endif + int i; + + const void* const src16 = XXH3_kSecret; + __m128i* dst16 = (__m128i*) customSecret; +# if defined(__GNUC__) || defined(__clang__) + /* + * On GCC & Clang, marking 'dest' as modified will cause the compiler: + * - do not extract the secret from sse registers in the internal loop + * - use less common registers, and avoid pushing these reg into stack + */ + XXH_COMPILER_GUARD(dst16); +# endif + XXH_ASSERT(((size_t)src16 & 15) == 0); /* control alignment */ + 
XXH_ASSERT(((size_t)dst16 & 15) == 0); + + for (i=0; i < nbRounds; ++i) { + dst16[i] = _mm_add_epi64(_mm_load_si128((const __m128i *)src16+i), seed); + } } +} + +#endif + +#if (XXH_VECTOR == XXH_NEON) + +/* forward declarations for the scalar routines */ +XXH_FORCE_INLINE void +XXH3_scalarRound(void* XXH_RESTRICT acc, void const* XXH_RESTRICT input, + void const* XXH_RESTRICT secret, size_t lane); + +XXH_FORCE_INLINE void +XXH3_scalarScrambleRound(void* XXH_RESTRICT acc, + void const* XXH_RESTRICT secret, size_t lane); + +/*! + * @internal + * @brief The bulk processing loop for NEON and WASM SIMD128. + * + * The NEON code path is actually partially scalar when running on AArch64. This + * is to optimize the pipelining and can have up to 15% speedup depending on the + * CPU, and it also mitigates some GCC codegen issues. + * + * @see XXH3_NEON_LANES for configuring this and details about this optimization. + * + * NEON's 32-bit to 64-bit long multiply takes a half vector of 32-bit + * integers instead of the other platforms which mask full 64-bit vectors, + * so the setup is more complicated than just shifting right. + * + * Additionally, there is an optimization for 4 lanes at once noted below. + * + * Since, as stated, the most optimal amount of lanes for Cortexes is 6, + * there needs to be *three* versions of the accumulate operation used + * for the remaining 2 lanes. + * + * WASM's SIMD128 uses SIMDe's arm_neon.h polyfill because the intrinsics overlap + * nearly perfectly. 
+ */ + +XXH_FORCE_INLINE void +XXH3_accumulate_512_neon( void* XXH_RESTRICT acc, + const void* XXH_RESTRICT input, + const void* XXH_RESTRICT secret) +{ + XXH_ASSERT((((size_t)acc) & 15) == 0); + XXH_STATIC_ASSERT(XXH3_NEON_LANES > 0 && XXH3_NEON_LANES <= XXH_ACC_NB && XXH3_NEON_LANES % 2 == 0); + { /* GCC for darwin arm64 does not like aliasing here */ + xxh_aliasing_uint64x2_t* const xacc = (xxh_aliasing_uint64x2_t*) acc; + /* We don't use a uint32x4_t pointer because it causes bus errors on ARMv7. */ + uint8_t const* xinput = (const uint8_t *) input; + uint8_t const* xsecret = (const uint8_t *) secret; + + size_t i; +#ifdef __wasm_simd128__ + /* + * On WASM SIMD128, Clang emits direct address loads when XXH3_kSecret + * is constant propagated, which results in it converting it to this + * inside the loop: + * + * a = v128.load(XXH3_kSecret + 0 + $secret_offset, offset = 0) + * b = v128.load(XXH3_kSecret + 16 + $secret_offset, offset = 0) + * ... + * + * This requires a full 32-bit address immediate (and therefore a 6 byte + * instruction) as well as an add for each offset. + * + * Putting an asm guard prevents it from folding (at the cost of losing + * the alignment hint), and uses the free offset in `v128.load` instead + * of adding secret_offset each time which overall reduces code size by + * about a kilobyte and improves performance. + */ + XXH_COMPILER_GUARD(xsecret); +#endif + /* Scalar lanes use the normal scalarRound routine */ + for (i = XXH3_NEON_LANES; i < XXH_ACC_NB; i++) { + XXH3_scalarRound(acc, input, secret, i); + } + i = 0; + /* 4 NEON lanes at a time. 
*/ + for (; i+1 < XXH3_NEON_LANES / 2; i+=2) { + /* data_vec = xinput[i]; */ + uint64x2_t data_vec_1 = XXH_vld1q_u64(xinput + (i * 16)); + uint64x2_t data_vec_2 = XXH_vld1q_u64(xinput + ((i+1) * 16)); + /* key_vec = xsecret[i]; */ + uint64x2_t key_vec_1 = XXH_vld1q_u64(xsecret + (i * 16)); + uint64x2_t key_vec_2 = XXH_vld1q_u64(xsecret + ((i+1) * 16)); + /* data_swap = swap(data_vec) */ + uint64x2_t data_swap_1 = vextq_u64(data_vec_1, data_vec_1, 1); + uint64x2_t data_swap_2 = vextq_u64(data_vec_2, data_vec_2, 1); + /* data_key = data_vec ^ key_vec; */ + uint64x2_t data_key_1 = veorq_u64(data_vec_1, key_vec_1); + uint64x2_t data_key_2 = veorq_u64(data_vec_2, key_vec_2); + + /* + * If we reinterpret the 64x2 vectors as 32x4 vectors, we can use a + * de-interleave operation for 4 lanes in 1 step with `vuzpq_u32` to + * get one vector with the low 32 bits of each lane, and one vector + * with the high 32 bits of each lane. + * + * The intrinsic returns a double vector because the original ARMv7-a + * instruction modified both arguments in place. AArch64 and SIMD128 emit + * two instructions from this intrinsic. + * + * [ dk11L | dk11H | dk12L | dk12H ] -> [ dk11L | dk12L | dk21L | dk22L ] + * [ dk21L | dk21H | dk22L | dk22H ] -> [ dk11H | dk12H | dk21H | dk22H ] + */ + uint32x4x2_t unzipped = vuzpq_u32( + vreinterpretq_u32_u64(data_key_1), + vreinterpretq_u32_u64(data_key_2) + ); + /* data_key_lo = data_key & 0xFFFFFFFF */ + uint32x4_t data_key_lo = unzipped.val[0]; + /* data_key_hi = data_key >> 32 */ + uint32x4_t data_key_hi = unzipped.val[1]; + /* + * Then, we can split the vectors horizontally and multiply which, as for most + * widening intrinsics, have a variant that works on both high half vectors + * for free on AArch64. A similar instruction is available on SIMD128. 
+ * + * sum = data_swap + (u64x2) data_key_lo * (u64x2) data_key_hi + */ + uint64x2_t sum_1 = XXH_vmlal_low_u32(data_swap_1, data_key_lo, data_key_hi); + uint64x2_t sum_2 = XXH_vmlal_high_u32(data_swap_2, data_key_lo, data_key_hi); + /* + * Clang reorders + * a += b * c; // umlal swap.2d, dkl.2s, dkh.2s + * c += a; // add acc.2d, acc.2d, swap.2d + * to + * c += a; // add acc.2d, acc.2d, swap.2d + * c += b * c; // umlal acc.2d, dkl.2s, dkh.2s + * + * While it would make sense in theory since the addition is faster, + * for reasons likely related to umlal being limited to certain NEON + * pipelines, this is worse. A compiler guard fixes this. + */ + XXH_COMPILER_GUARD_CLANG_NEON(sum_1); + XXH_COMPILER_GUARD_CLANG_NEON(sum_2); + /* xacc[i] = acc_vec + sum; */ + xacc[i] = vaddq_u64(xacc[i], sum_1); + xacc[i+1] = vaddq_u64(xacc[i+1], sum_2); + } + /* Operate on the remaining NEON lanes 2 at a time. */ + for (; i < XXH3_NEON_LANES / 2; i++) { + /* data_vec = xinput[i]; */ + uint64x2_t data_vec = XXH_vld1q_u64(xinput + (i * 16)); + /* key_vec = xsecret[i]; */ + uint64x2_t key_vec = XXH_vld1q_u64(xsecret + (i * 16)); + /* data_swap = swap(data_vec) */ + uint64x2_t data_swap = vextq_u64(data_vec, data_vec, 1); + /* data_key = data_vec ^ key_vec; */ + uint64x2_t data_key = veorq_u64(data_vec, key_vec); + /* For two lanes, just use VMOVN and VSHRN. 
*/ + /* data_key_lo = data_key & 0xFFFFFFFF; */ + uint32x2_t data_key_lo = vmovn_u64(data_key); + /* data_key_hi = data_key >> 32; */ + uint32x2_t data_key_hi = vshrn_n_u64(data_key, 32); + /* sum = data_swap + (u64x2) data_key_lo * (u64x2) data_key_hi; */ + uint64x2_t sum = vmlal_u32(data_swap, data_key_lo, data_key_hi); + /* Same Clang workaround as before */ + XXH_COMPILER_GUARD_CLANG_NEON(sum); + /* xacc[i] = acc_vec + sum; */ + xacc[i] = vaddq_u64 (xacc[i], sum); + } + } +} +XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(neon) + +XXH_FORCE_INLINE void +XXH3_scrambleAcc_neon(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) +{ + XXH_ASSERT((((size_t)acc) & 15) == 0); + + { xxh_aliasing_uint64x2_t* xacc = (xxh_aliasing_uint64x2_t*) acc; + uint8_t const* xsecret = (uint8_t const*) secret; + + size_t i; + /* WASM uses operator overloads and doesn't need these. */ +#ifndef __wasm_simd128__ + /* { prime32_1, prime32_1 } */ + uint32x2_t const kPrimeLo = vdup_n_u32(XXH_PRIME32_1); + /* { 0, prime32_1, 0, prime32_1 } */ + uint32x4_t const kPrimeHi = vreinterpretq_u32_u64(vdupq_n_u64((xxh_u64)XXH_PRIME32_1 << 32)); +#endif + + /* AArch64 uses both scalar and neon at the same time */ + for (i = XXH3_NEON_LANES; i < XXH_ACC_NB; i++) { + XXH3_scalarScrambleRound(acc, secret, i); + } + for (i=0; i < XXH3_NEON_LANES / 2; i++) { + /* xacc[i] ^= (xacc[i] >> 47); */ + uint64x2_t acc_vec = xacc[i]; + uint64x2_t shifted = vshrq_n_u64(acc_vec, 47); + uint64x2_t data_vec = veorq_u64(acc_vec, shifted); + + /* xacc[i] ^= xsecret[i]; */ + uint64x2_t key_vec = XXH_vld1q_u64(xsecret + (i * 16)); + uint64x2_t data_key = veorq_u64(data_vec, key_vec); + /* xacc[i] *= XXH_PRIME32_1 */ +#ifdef __wasm_simd128__ + /* SIMD128 has multiply by u64x2, use it instead of expanding and scalarizing */ + xacc[i] = data_key * XXH_PRIME32_1; +#else + /* + * Expanded version with portable NEON intrinsics + * + * lo(x) * lo(y) + (hi(x) * lo(y) << 32) + * + * prod_hi = hi(data_key) * lo(prime) << 32 + * + 
* Since we only need 32 bits of this multiply a trick can be used, reinterpreting the vector + * as a uint32x4_t and multiplying by { 0, prime, 0, prime } to cancel out the unwanted bits + * and avoid the shift. + */ + uint32x4_t prod_hi = vmulq_u32 (vreinterpretq_u32_u64(data_key), kPrimeHi); + /* Extract low bits for vmlal_u32 */ + uint32x2_t data_key_lo = vmovn_u64(data_key); + /* xacc[i] = prod_hi + lo(data_key) * XXH_PRIME32_1; */ + xacc[i] = vmlal_u32(vreinterpretq_u64_u32(prod_hi), data_key_lo, kPrimeLo); +#endif + } + } +} +#endif + +#if (XXH_VECTOR == XXH_VSX) + +XXH_FORCE_INLINE void +XXH3_accumulate_512_vsx( void* XXH_RESTRICT acc, + const void* XXH_RESTRICT input, + const void* XXH_RESTRICT secret) +{ + /* presumed aligned */ + xxh_aliasing_u64x2* const xacc = (xxh_aliasing_u64x2*) acc; + xxh_u8 const* const xinput = (xxh_u8 const*) input; /* no alignment restriction */ + xxh_u8 const* const xsecret = (xxh_u8 const*) secret; /* no alignment restriction */ + xxh_u64x2 const v32 = { 32, 32 }; + size_t i; + for (i = 0; i < XXH_STRIPE_LEN / sizeof(xxh_u64x2); i++) { + /* data_vec = xinput[i]; */ + xxh_u64x2 const data_vec = XXH_vec_loadu(xinput + 16*i); + /* key_vec = xsecret[i]; */ + xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + 16*i); + xxh_u64x2 const data_key = data_vec ^ key_vec; + /* shuffled = (data_key << 32) | (data_key >> 32); */ + xxh_u32x4 const shuffled = (xxh_u32x4)vec_rl(data_key, v32); + /* product = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)shuffled & 0xFFFFFFFF); */ + xxh_u64x2 const product = XXH_vec_mulo((xxh_u32x4)data_key, shuffled); + /* acc_vec = xacc[i]; */ + xxh_u64x2 acc_vec = xacc[i]; + acc_vec += product; + + /* swap high and low halves */ +#ifdef __s390x__ + acc_vec += vec_permi(data_vec, data_vec, 2); +#else + acc_vec += vec_xxpermdi(data_vec, data_vec, 2); +#endif + xacc[i] = acc_vec; + } +} +XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(vsx) + +XXH_FORCE_INLINE void +XXH3_scrambleAcc_vsx(void* XXH_RESTRICT acc, const 
void* XXH_RESTRICT secret) +{ + XXH_ASSERT((((size_t)acc) & 15) == 0); + + { xxh_aliasing_u64x2* const xacc = (xxh_aliasing_u64x2*) acc; + const xxh_u8* const xsecret = (const xxh_u8*) secret; + /* constants */ + xxh_u64x2 const v32 = { 32, 32 }; + xxh_u64x2 const v47 = { 47, 47 }; + xxh_u32x4 const prime = { XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1 }; + size_t i; + for (i = 0; i < XXH_STRIPE_LEN / sizeof(xxh_u64x2); i++) { + /* xacc[i] ^= (xacc[i] >> 47); */ + xxh_u64x2 const acc_vec = xacc[i]; + xxh_u64x2 const data_vec = acc_vec ^ (acc_vec >> v47); + + /* xacc[i] ^= xsecret[i]; */ + xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + 16*i); + xxh_u64x2 const data_key = data_vec ^ key_vec; + + /* xacc[i] *= XXH_PRIME32_1 */ + /* prod_lo = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)prime & 0xFFFFFFFF); */ + xxh_u64x2 const prod_even = XXH_vec_mule((xxh_u32x4)data_key, prime); + /* prod_hi = ((xxh_u64x2)data_key >> 32) * ((xxh_u64x2)prime >> 32); */ + xxh_u64x2 const prod_odd = XXH_vec_mulo((xxh_u32x4)data_key, prime); + xacc[i] = prod_odd + (prod_even << v32); + } } +} + +#endif + +#if (XXH_VECTOR == XXH_SVE) + +XXH_FORCE_INLINE void +XXH3_accumulate_512_sve( void* XXH_RESTRICT acc, + const void* XXH_RESTRICT input, + const void* XXH_RESTRICT secret) +{ + uint64_t *xacc = (uint64_t *)acc; + const uint64_t *xinput = (const uint64_t *)(const void *)input; + const uint64_t *xsecret = (const uint64_t *)(const void *)secret; + svuint64_t kSwap = sveor_n_u64_z(svptrue_b64(), svindex_u64(0, 1), 1); + uint64_t element_count = svcntd(); + if (element_count >= 8) { + svbool_t mask = svptrue_pat_b64(SV_VL8); + svuint64_t vacc = svld1_u64(mask, xacc); + ACCRND(vacc, 0); + svst1_u64(mask, xacc, vacc); + } else if (element_count == 2) { /* sve128 */ + svbool_t mask = svptrue_pat_b64(SV_VL2); + svuint64_t acc0 = svld1_u64(mask, xacc + 0); + svuint64_t acc1 = svld1_u64(mask, xacc + 2); + svuint64_t acc2 = svld1_u64(mask, xacc + 4); + svuint64_t acc3 = 
svld1_u64(mask, xacc + 6); + ACCRND(acc0, 0); + ACCRND(acc1, 2); + ACCRND(acc2, 4); + ACCRND(acc3, 6); + svst1_u64(mask, xacc + 0, acc0); + svst1_u64(mask, xacc + 2, acc1); + svst1_u64(mask, xacc + 4, acc2); + svst1_u64(mask, xacc + 6, acc3); + } else { + svbool_t mask = svptrue_pat_b64(SV_VL4); + svuint64_t acc0 = svld1_u64(mask, xacc + 0); + svuint64_t acc1 = svld1_u64(mask, xacc + 4); + ACCRND(acc0, 0); + ACCRND(acc1, 4); + svst1_u64(mask, xacc + 0, acc0); + svst1_u64(mask, xacc + 4, acc1); + } +} + +XXH_FORCE_INLINE void +XXH3_accumulate_sve(xxh_u64* XXH_RESTRICT acc, + const xxh_u8* XXH_RESTRICT input, + const xxh_u8* XXH_RESTRICT secret, + size_t nbStripes) +{ + if (nbStripes != 0) { + uint64_t *xacc = (uint64_t *)acc; + const uint64_t *xinput = (const uint64_t *)(const void *)input; + const uint64_t *xsecret = (const uint64_t *)(const void *)secret; + svuint64_t kSwap = sveor_n_u64_z(svptrue_b64(), svindex_u64(0, 1), 1); + uint64_t element_count = svcntd(); + if (element_count >= 8) { + svbool_t mask = svptrue_pat_b64(SV_VL8); + svuint64_t vacc = svld1_u64(mask, xacc + 0); + do { + /* svprfd(svbool_t, void *, enum svfprop); */ + svprfd(mask, xinput + 128, SV_PLDL1STRM); + ACCRND(vacc, 0); + xinput += 8; + xsecret += 1; + nbStripes--; + } while (nbStripes != 0); + + svst1_u64(mask, xacc + 0, vacc); + } else if (element_count == 2) { /* sve128 */ + svbool_t mask = svptrue_pat_b64(SV_VL2); + svuint64_t acc0 = svld1_u64(mask, xacc + 0); + svuint64_t acc1 = svld1_u64(mask, xacc + 2); + svuint64_t acc2 = svld1_u64(mask, xacc + 4); + svuint64_t acc3 = svld1_u64(mask, xacc + 6); + do { + svprfd(mask, xinput + 128, SV_PLDL1STRM); + ACCRND(acc0, 0); + ACCRND(acc1, 2); + ACCRND(acc2, 4); + ACCRND(acc3, 6); + xinput += 8; + xsecret += 1; + nbStripes--; + } while (nbStripes != 0); + + svst1_u64(mask, xacc + 0, acc0); + svst1_u64(mask, xacc + 2, acc1); + svst1_u64(mask, xacc + 4, acc2); + svst1_u64(mask, xacc + 6, acc3); + } else { + svbool_t mask = 
svptrue_pat_b64(SV_VL4); + svuint64_t acc0 = svld1_u64(mask, xacc + 0); + svuint64_t acc1 = svld1_u64(mask, xacc + 4); + do { + svprfd(mask, xinput + 128, SV_PLDL1STRM); + ACCRND(acc0, 0); + ACCRND(acc1, 4); + xinput += 8; + xsecret += 1; + nbStripes--; + } while (nbStripes != 0); + + svst1_u64(mask, xacc + 0, acc0); + svst1_u64(mask, xacc + 4, acc1); + } + } +} + +#endif + +/* scalar variants - universal */ + +#if defined(__aarch64__) && (defined(__GNUC__) || defined(__clang__)) +/* + * In XXH3_scalarRound(), GCC and Clang have a similar codegen issue, where they + * emit an excess mask and a full 64-bit multiply-add (MADD X-form). + * + * While this might not seem like much, as AArch64 is a 64-bit architecture, only + * big Cortex designs have a full 64-bit multiplier. + * + * On the little cores, the smaller 32-bit multiplier is used, and full 64-bit + * multiplies expand to 2-3 multiplies in microcode. This has a major penalty + * of up to 4 latency cycles and 2 stall cycles in the multiply pipeline. + * + * Thankfully, AArch64 still provides the 32-bit long multiply-add (UMADDL) which does + * not have this penalty and does the mask automatically. + */ +XXH_FORCE_INLINE xxh_u64 +XXH_mult32to64_add64(xxh_u64 lhs, xxh_u64 rhs, xxh_u64 acc) +{ + xxh_u64 ret; + /* note: %x = 64-bit register, %w = 32-bit register */ + __asm__("umaddl %x0, %w1, %w2, %x3" : "=r" (ret) : "r" (lhs), "r" (rhs), "r" (acc)); + return ret; +} +#else +XXH_FORCE_INLINE xxh_u64 +XXH_mult32to64_add64(xxh_u64 lhs, xxh_u64 rhs, xxh_u64 acc) +{ + return XXH_mult32to64((xxh_u32)lhs, (xxh_u32)rhs) + acc; +} +#endif + +/*! + * @internal + * @brief Scalar round for @ref XXH3_accumulate_512_scalar(). + * + * This is extracted to its own function because the NEON path uses a combination + * of NEON and scalar. 
+ */ +XXH_FORCE_INLINE void +XXH3_scalarRound(void* XXH_RESTRICT acc, + void const* XXH_RESTRICT input, + void const* XXH_RESTRICT secret, + size_t lane) +{ + xxh_u64* xacc = (xxh_u64*) acc; + xxh_u8 const* xinput = (xxh_u8 const*) input; + xxh_u8 const* xsecret = (xxh_u8 const*) secret; + XXH_ASSERT(lane < XXH_ACC_NB); + XXH_ASSERT(((size_t)acc & (XXH_ACC_ALIGN-1)) == 0); + { + xxh_u64 const data_val = XXH_readLE64(xinput + lane * 8); + xxh_u64 const data_key = data_val ^ XXH_readLE64(xsecret + lane * 8); + xacc[lane ^ 1] += data_val; /* swap adjacent lanes */ + xacc[lane] = XXH_mult32to64_add64(data_key /* & 0xFFFFFFFF */, data_key >> 32, xacc[lane]); + } +} + +/*! + * @internal + * @brief Processes a 64 byte block of data using the scalar path. + */ +XXH_FORCE_INLINE void +XXH3_accumulate_512_scalar(void* XXH_RESTRICT acc, + const void* XXH_RESTRICT input, + const void* XXH_RESTRICT secret) +{ + size_t i; + /* ARM GCC refuses to unroll this loop, resulting in a 24% slowdown on ARMv6. */ +#if defined(__GNUC__) && !defined(__clang__) \ + && (defined(__arm__) || defined(__thumb2__)) \ + && defined(__ARM_FEATURE_UNALIGNED) /* no unaligned access just wastes bytes */ \ + && XXH_SIZE_OPT <= 0 +# pragma GCC unroll 8 +#endif + for (i=0; i < XXH_ACC_NB; i++) { + XXH3_scalarRound(acc, input, secret, i); + } +} +XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(scalar) + +/*! + * @internal + * @brief Scalar scramble step for @ref XXH3_scrambleAcc_scalar(). + * + * This is extracted to its own function because the NEON path uses a combination + * of NEON and scalar. 
+ */ +XXH_FORCE_INLINE void +XXH3_scalarScrambleRound(void* XXH_RESTRICT acc, + void const* XXH_RESTRICT secret, + size_t lane) +{ + xxh_u64* const xacc = (xxh_u64*) acc; /* presumed aligned */ + const xxh_u8* const xsecret = (const xxh_u8*) secret; /* no alignment restriction */ + XXH_ASSERT((((size_t)acc) & (XXH_ACC_ALIGN-1)) == 0); + XXH_ASSERT(lane < XXH_ACC_NB); + { + xxh_u64 const key64 = XXH_readLE64(xsecret + lane * 8); + xxh_u64 acc64 = xacc[lane]; + acc64 = XXH_xorshift64(acc64, 47); + acc64 ^= key64; + acc64 *= XXH_PRIME32_1; + xacc[lane] = acc64; + } +} + +/*! + * @internal + * @brief Scrambles the accumulators after a large chunk has been read + */ +XXH_FORCE_INLINE void +XXH3_scrambleAcc_scalar(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) +{ + size_t i; + for (i=0; i < XXH_ACC_NB; i++) { + XXH3_scalarScrambleRound(acc, secret, i); + } +} + +/*! + * @internal + * @brief Scalar derivation of a custom secret from XXH3_kSecret and a seed. + * + * Fills @p customSecret in 16-byte steps: the first 8 bytes of each step get the + * matching XXH3_kSecret word plus @p seed64, the next 8 bytes get the matching + * word minus @p seed64. + */ +XXH_FORCE_INLINE void +XXH3_initCustomSecret_scalar(void* XXH_RESTRICT customSecret, xxh_u64 seed64) +{ + /* + * We need a separate pointer for the hack below, + * which requires a non-const pointer. + * Any decent compiler will optimize this out otherwise. + */ + const xxh_u8* kSecretPtr = XXH3_kSecret; + XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0); + +#if defined(__GNUC__) && defined(__aarch64__) + /* + * UGLY HACK: + * GCC and Clang generate a bunch of MOV/MOVK pairs for aarch64, and they are + * placed sequentially, in order, at the top of the unrolled loop. + * + * While MOVK is great for generating constants (2 cycles for a 64-bit + * constant compared to 4 cycles for LDR), it fights for bandwidth with + * the arithmetic instructions. + * + * I L S + * MOVK + * MOVK + * MOVK + * MOVK + * ADD + * SUB STR + * STR + * By forcing loads from memory (as the asm line causes the compiler to assume + * that kSecretPtr has been changed), the pipelines are used more + * efficiently: + * I L S + * LDR + * ADD LDR + * SUB STR + * STR + * + * See XXH3_NEON_LANES for details on the pipeline. 
+ * + * XXH3_64bits_withSeed, len == 256, Snapdragon 835 + * without hack: 2654.4 MB/s + * with hack: 3202.9 MB/s + */ + XXH_COMPILER_GUARD(kSecretPtr); +#endif + { int const nbRounds = XXH_SECRET_DEFAULT_SIZE / 16; + int i; + for (i=0; i < nbRounds; i++) { + /* + * The asm hack causes the compiler to assume that kSecretPtr aliases with + * customSecret, and on aarch64, this prevented LDP from merging two + * loads together for free. Putting the loads together before the stores + * properly generates LDP. + */ + xxh_u64 lo = XXH_readLE64(kSecretPtr + 16*i) + seed64; + xxh_u64 hi = XXH_readLE64(kSecretPtr + 16*i + 8) - seed64; + XXH_writeLE64((xxh_u8*)customSecret + 16*i, lo); + XXH_writeLE64((xxh_u8*)customSecret + 16*i + 8, hi); + } } +} + + +typedef void (*XXH3_f_accumulate)(xxh_u64* XXH_RESTRICT, const xxh_u8* XXH_RESTRICT, const xxh_u8* XXH_RESTRICT, size_t); +typedef void (*XXH3_f_scrambleAcc)(void* XXH_RESTRICT, const void*); +typedef void (*XXH3_f_initCustomSecret)(void* XXH_RESTRICT, xxh_u64); + + +#if (XXH_VECTOR == XXH_AVX512) + +#define XXH3_accumulate_512 XXH3_accumulate_512_avx512 +#define XXH3_accumulate XXH3_accumulate_avx512 +#define XXH3_scrambleAcc XXH3_scrambleAcc_avx512 +#define XXH3_initCustomSecret XXH3_initCustomSecret_avx512 + +#elif (XXH_VECTOR == XXH_AVX2) + +#define XXH3_accumulate_512 XXH3_accumulate_512_avx2 +#define XXH3_accumulate XXH3_accumulate_avx2 +#define XXH3_scrambleAcc XXH3_scrambleAcc_avx2 +#define XXH3_initCustomSecret XXH3_initCustomSecret_avx2 + +#elif (XXH_VECTOR == XXH_SSE2) + +#define XXH3_accumulate_512 XXH3_accumulate_512_sse2 +#define XXH3_accumulate XXH3_accumulate_sse2 +#define XXH3_scrambleAcc XXH3_scrambleAcc_sse2 +#define XXH3_initCustomSecret XXH3_initCustomSecret_sse2 + +#elif (XXH_VECTOR == XXH_NEON) + +#define XXH3_accumulate_512 XXH3_accumulate_512_neon +#define XXH3_accumulate XXH3_accumulate_neon +#define XXH3_scrambleAcc XXH3_scrambleAcc_neon +#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar + 
+#elif (XXH_VECTOR == XXH_VSX) + +#define XXH3_accumulate_512 XXH3_accumulate_512_vsx +#define XXH3_accumulate XXH3_accumulate_vsx +#define XXH3_scrambleAcc XXH3_scrambleAcc_vsx +#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar + +#elif (XXH_VECTOR == XXH_SVE) +#define XXH3_accumulate_512 XXH3_accumulate_512_sve +#define XXH3_accumulate XXH3_accumulate_sve +#define XXH3_scrambleAcc XXH3_scrambleAcc_scalar +#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar + +#else /* scalar */ + +#define XXH3_accumulate_512 XXH3_accumulate_512_scalar +#define XXH3_accumulate XXH3_accumulate_scalar +#define XXH3_scrambleAcc XXH3_scrambleAcc_scalar +#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar + +#endif + +#if XXH_SIZE_OPT >= 1 /* don't do SIMD for initialization */ +# undef XXH3_initCustomSecret +# define XXH3_initCustomSecret XXH3_initCustomSecret_scalar +#endif + +XXH_FORCE_INLINE void +XXH3_hashLong_internal_loop(xxh_u64* XXH_RESTRICT acc, + const xxh_u8* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize, + XXH3_f_accumulate f_acc, + XXH3_f_scrambleAcc f_scramble) +{ + size_t const nbStripesPerBlock = (secretSize - XXH_STRIPE_LEN) / XXH_SECRET_CONSUME_RATE; + size_t const block_len = XXH_STRIPE_LEN * nbStripesPerBlock; + size_t const nb_blocks = (len - 1) / block_len; + + size_t n; + + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); + + for (n = 0; n < nb_blocks; n++) { + f_acc(acc, input + n*block_len, secret, nbStripesPerBlock); + f_scramble(acc, secret + secretSize - XXH_STRIPE_LEN); + } + + /* last partial block */ + XXH_ASSERT(len > XXH_STRIPE_LEN); + { size_t const nbStripes = ((len - 1) - (block_len * nb_blocks)) / XXH_STRIPE_LEN; + XXH_ASSERT(nbStripes <= (secretSize / XXH_SECRET_CONSUME_RATE)); + f_acc(acc, input + nb_blocks*block_len, secret, nbStripes); + + /* last stripe */ + { const xxh_u8* const p = input + len - XXH_STRIPE_LEN; +#define XXH_SECRET_LASTACC_START 7 /* not aligned on 8, last secret 
is different from acc & scrambler */ + XXH3_accumulate_512(acc, p, secret + secretSize - XXH_STRIPE_LEN - XXH_SECRET_LASTACC_START); + } } +} + +XXH_FORCE_INLINE xxh_u64 +XXH3_mix2Accs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret) +{ + return XXH3_mul128_fold64( + acc[0] ^ XXH_readLE64(secret), + acc[1] ^ XXH_readLE64(secret+8) ); +} + +static XXH64_hash_t +XXH3_mergeAccs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret, xxh_u64 start) +{ + xxh_u64 result64 = start; + size_t i = 0; + + for (i = 0; i < 4; i++) { + result64 += XXH3_mix2Accs(acc+2*i, secret + 16*i); +#if defined(__clang__) /* Clang */ \ + && (defined(__arm__) || defined(__thumb__)) /* ARMv7 */ \ + && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */ \ + && !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable */ + /* + * UGLY HACK: + * Prevent autovectorization on Clang ARMv7-a. Exact same problem as + * the one in XXH3_len_129to240_64b. Speeds up shorter keys > 240b. + * XXH3_64bits, len == 256, Snapdragon 835: + * without hack: 2063.7 MB/s + * with hack: 2560.7 MB/s + */ + XXH_COMPILER_GUARD(result64); +#endif + } + + return XXH3_avalanche(result64); +} + +#define XXH3_INIT_ACC { XXH_PRIME32_3, XXH_PRIME64_1, XXH_PRIME64_2, XXH_PRIME64_3, \ + XXH_PRIME64_4, XXH_PRIME32_2, XXH_PRIME64_5, XXH_PRIME32_1 } + +XXH_FORCE_INLINE XXH64_hash_t +XXH3_hashLong_64b_internal(const void* XXH_RESTRICT input, size_t len, + const void* XXH_RESTRICT secret, size_t secretSize, + XXH3_f_accumulate f_acc, + XXH3_f_scrambleAcc f_scramble) +{ + XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[XXH_ACC_NB] = XXH3_INIT_ACC; + + XXH3_hashLong_internal_loop(acc, (const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, f_acc, f_scramble); + + /* converge into final hash */ + XXH_STATIC_ASSERT(sizeof(acc) == 64); + /* do not align on 8, so that the secret is different from the accumulator */ +#define XXH_SECRET_MERGEACCS_START 11 + XXH_ASSERT(secretSize >= sizeof(acc) + 
XXH_SECRET_MERGEACCS_START); + return XXH3_mergeAccs(acc, (const xxh_u8*)secret + XXH_SECRET_MERGEACCS_START, (xxh_u64)len * XXH_PRIME64_1); +} + +/* + * It's important for performance to transmit secret's size (when it's static) + * so that the compiler can properly optimize the vectorized loop. + * This makes a big performance difference for "medium" keys (<1 KB) when using AVX instruction set. + * When the secret size is unknown, or on GCC 12 where the mix of NO_INLINE and FORCE_INLINE + * breaks -Og, this is XXH_NO_INLINE. + */ +XXH3_WITH_SECRET_INLINE XXH64_hash_t +XXH3_hashLong_64b_withSecret(const void* XXH_RESTRICT input, size_t len, + XXH64_hash_t seed64, const xxh_u8* XXH_RESTRICT secret, size_t secretLen) +{ + (void)seed64; + return XXH3_hashLong_64b_internal(input, len, secret, secretLen, XXH3_accumulate, XXH3_scrambleAcc); +} + +/* + * It's preferable for performance that XXH3_hashLong is not inlined, + * as it results in a smaller function for small data, easier to the instruction cache. + * Note that inside this no_inline function, we do inline the internal loop, + * and provide a statically defined secret size to allow optimization of vector loop. + */ +XXH_NO_INLINE XXH_PUREF XXH64_hash_t +XXH3_hashLong_64b_default(const void* XXH_RESTRICT input, size_t len, + XXH64_hash_t seed64, const xxh_u8* XXH_RESTRICT secret, size_t secretLen) +{ + (void)seed64; (void)secret; (void)secretLen; + return XXH3_hashLong_64b_internal(input, len, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_accumulate, XXH3_scrambleAcc); +} + +/* + * XXH3_hashLong_64b_withSeed(): + * Generate a custom key based on alteration of default XXH3_kSecret with the seed, + * and then use this key for long mode hashing. + * + * This operation is decently fast but nonetheless costs a little bit of time. + * Try to avoid it whenever possible (typically when seed==0). + * + * It's important for performance that XXH3_hashLong is not inlined. 
Not sure + * why (uop cache maybe?), but the difference is large and easily measurable. + */ +XXH_FORCE_INLINE XXH64_hash_t +XXH3_hashLong_64b_withSeed_internal(const void* input, size_t len, + XXH64_hash_t seed, + XXH3_f_accumulate f_acc, + XXH3_f_scrambleAcc f_scramble, + XXH3_f_initCustomSecret f_initSec) +{ +#if XXH_SIZE_OPT <= 0 + if (seed == 0) + return XXH3_hashLong_64b_internal(input, len, + XXH3_kSecret, sizeof(XXH3_kSecret), + f_acc, f_scramble); +#endif + { XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE]; + f_initSec(secret, seed); + return XXH3_hashLong_64b_internal(input, len, secret, sizeof(secret), + f_acc, f_scramble); + } +} + +/* + * It's important for performance that XXH3_hashLong is not inlined. + */ +XXH_NO_INLINE XXH64_hash_t +XXH3_hashLong_64b_withSeed(const void* XXH_RESTRICT input, size_t len, + XXH64_hash_t seed, const xxh_u8* XXH_RESTRICT secret, size_t secretLen) +{ + (void)secret; (void)secretLen; + return XXH3_hashLong_64b_withSeed_internal(input, len, seed, + XXH3_accumulate, XXH3_scrambleAcc, XXH3_initCustomSecret); +} + + +typedef XXH64_hash_t (*XXH3_hashLong64_f)(const void* XXH_RESTRICT, size_t, + XXH64_hash_t, const xxh_u8* XXH_RESTRICT, size_t); + +XXH_FORCE_INLINE XXH64_hash_t +XXH3_64bits_internal(const void* XXH_RESTRICT input, size_t len, + XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen, + XXH3_hashLong64_f f_hashLong) +{ + XXH_ASSERT(secretLen >= XXH3_SECRET_SIZE_MIN); + /* + * If an action is to be taken if `secretLen` condition is not respected, + * it should be done here. + * For now, it's a contract pre-condition. + * Adding a check and a branch here would cost performance at every hash. + * Also, note that function signature doesn't offer room to return an error. 
+ */ + if (len <= 16) + return XXH3_len_0to16_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, seed64); + if (len <= 128) + return XXH3_len_17to128_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64); + if (len <= XXH3_MIDSIZE_MAX) + return XXH3_len_129to240_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64); + return f_hashLong(input, len, seed64, (const xxh_u8*)secret, secretLen); +} + + +/* === Public entry point === */ + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(XXH_NOESCAPE const void* input, size_t length) +{ + return XXH3_64bits_internal(input, length, 0, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_64b_default); +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH64_hash_t +XXH3_64bits_withSecret(XXH_NOESCAPE const void* input, size_t length, XXH_NOESCAPE const void* secret, size_t secretSize) +{ + return XXH3_64bits_internal(input, length, 0, secret, secretSize, XXH3_hashLong_64b_withSecret); +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH64_hash_t +XXH3_64bits_withSeed(XXH_NOESCAPE const void* input, size_t length, XXH64_hash_t seed) +{ + return XXH3_64bits_internal(input, length, seed, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_64b_withSeed); +} + +/*! @ingroup XXH3_family */ +/* + * One-shot hash taking both a secret and a seed. + * Inputs of at most XXH3_MIDSIZE_MAX bytes are hashed with the default + * XXH3_kSecret and @p seed; the caller's @p secret is not read on that path. + * Longer inputs are hashed with the caller's @p secret. + */ +XXH_PUBLIC_API XXH64_hash_t +XXH3_64bits_withSecretandSeed(XXH_NOESCAPE const void* input, size_t length, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed) +{ + /* XXH3_64bits_internal() only invokes its f_hashLong callback for inputs longer than XXH3_MIDSIZE_MAX, so NULL is never called here. */ + if (length <= XXH3_MIDSIZE_MAX) + return XXH3_64bits_internal(input, length, seed, XXH3_kSecret, sizeof(XXH3_kSecret), NULL); + /* XXH3_hashLong_64b_withSecret() discards its seed argument, so only the secret drives the long path. */ + return XXH3_hashLong_64b_withSecret(input, length, seed, (const xxh_u8*)secret, secretSize); +} + + +/* === XXH3 streaming === */ +#ifndef XXH_NO_STREAM +/* + * Malloc's a pointer that is always aligned to align. + * + * This must be freed with `XXH_alignedFree()`. + * + * malloc typically guarantees 16 byte alignment on 64-bit systems and 8 byte + * alignment on 32-bit. 
This isn't enough for the 32 byte aligned loads in AVX2 + * or on 32-bit, the 16 byte aligned loads in SSE2 and NEON. + * + * This underalignment previously caused a rather obvious crash which went + * completely unnoticed due to XXH3_createState() not actually being tested. + * Credit to RedSpah for noticing this bug. + * + * The alignment is done manually: Functions like posix_memalign or _mm_malloc + * are avoided: To maintain portability, we would have to write a fallback + * like this anyways, and besides, testing for the existence of library + * functions without relying on external build tools is impossible. + * + * The method is simple: Overallocate, manually align, and store the offset + * to the original behind the returned pointer. + * + * Align must be a power of 2 and 8 <= align <= 128. + */ +static XXH_MALLOCF void* XXH_alignedMalloc(size_t s, size_t align) +{ + XXH_ASSERT(align <= 128 && align >= 8); /* range check */ + XXH_ASSERT((align & (align-1)) == 0); /* power of 2 */ + XXH_ASSERT(s != 0 && s < (s + align)); /* empty/overflow */ + { /* Overallocate to make room for manual realignment and an offset byte */ + xxh_u8* base = (xxh_u8*)XXH_malloc(s + align); + if (base != NULL) { + /* + * Get the offset needed to align this pointer. + * + * Even if the returned pointer is aligned, there will always be + * at least one byte to store the offset to the original pointer. + */ + size_t offset = align - ((size_t)base & (align - 1)); /* base % align */ + /* Add the offset for the now-aligned pointer */ + xxh_u8* ptr = base + offset; + + XXH_ASSERT((size_t)ptr % align == 0); + + /* Store the offset immediately before the returned pointer. */ + ptr[-1] = (xxh_u8)offset; + return ptr; + } + return NULL; + } +} +/* + * Frees an aligned pointer allocated by XXH_alignedMalloc(). Don't pass + * normal malloc'd pointers, XXH_alignedMalloc has a specific data layout. 
+ */ +static void XXH_alignedFree(void* p) +{ + if (p != NULL) { + xxh_u8* ptr = (xxh_u8*)p; + /* Get the offset byte that XXH_alignedMalloc() stored just before the returned pointer. */ + xxh_u8 offset = ptr[-1]; + /* Free the original malloc'd pointer */ + xxh_u8* base = ptr - offset; + XXH_free(base); + } +} +/*! @ingroup XXH3_family */ +/*! + * @brief Allocate an @ref XXH3_state_t. + * + * @return An allocated pointer of @ref XXH3_state_t on success. + * @return `NULL` on failure. + * + * @note Must be freed with XXH3_freeState(). + * + * @see @ref streaming_example "Streaming Example" + */ +XXH_PUBLIC_API XXH3_state_t* XXH3_createState(void) +{ + XXH3_state_t* const state = (XXH3_state_t*)XXH_alignedMalloc(sizeof(XXH3_state_t), 64); + if (state==NULL) return NULL; + XXH3_INITSTATE(state); + return state; +} + +/*! @ingroup XXH3_family */ +/*! + * @brief Frees an @ref XXH3_state_t. + * + * @param statePtr A pointer to an @ref XXH3_state_t allocated with @ref XXH3_createState(). + * + * @return @ref XXH_OK. + * + * @note Must be allocated with XXH3_createState(). + * @note Passing `NULL` is a no-op: XXH_alignedFree() ignores null pointers. + * + * @see @ref streaming_example "Streaming Example" + */ +XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr) +{ + XXH_alignedFree(statePtr); + return XXH_OK; +} + +/*! 
@ingroup XXH3_family */ +XXH_PUBLIC_API void +XXH3_copyState(XXH_NOESCAPE XXH3_state_t* dst_state, XXH_NOESCAPE const XXH3_state_t* src_state) +{ + XXH_memcpy(dst_state, src_state, sizeof(*dst_state)); +} + +static void +XXH3_reset_internal(XXH3_state_t* statePtr, + XXH64_hash_t seed, + const void* secret, size_t secretSize) +{ + size_t const initStart = offsetof(XXH3_state_t, bufferedSize); + size_t const initLength = offsetof(XXH3_state_t, nbStripesPerBlock) - initStart; + XXH_ASSERT(offsetof(XXH3_state_t, nbStripesPerBlock) > initStart); + XXH_ASSERT(statePtr != NULL); + /* set members from bufferedSize to nbStripesPerBlock (excluded) to 0 */ + memset((char*)statePtr + initStart, 0, initLength); + statePtr->acc[0] = XXH_PRIME32_3; + statePtr->acc[1] = XXH_PRIME64_1; + statePtr->acc[2] = XXH_PRIME64_2; + statePtr->acc[3] = XXH_PRIME64_3; + statePtr->acc[4] = XXH_PRIME64_4; + statePtr->acc[5] = XXH_PRIME32_2; + statePtr->acc[6] = XXH_PRIME64_5; + statePtr->acc[7] = XXH_PRIME32_1; + statePtr->seed = seed; + statePtr->useSeed = (seed != 0); + statePtr->extSecret = (const unsigned char*)secret; + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); + statePtr->secretLimit = secretSize - XXH_STRIPE_LEN; + statePtr->nbStripesPerBlock = statePtr->secretLimit / XXH_SECRET_CONSUME_RATE; +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH_errorcode +XXH3_64bits_reset(XXH_NOESCAPE XXH3_state_t* statePtr) +{ + if (statePtr == NULL) return XXH_ERROR; + XXH3_reset_internal(statePtr, 0, XXH3_kSecret, XXH_SECRET_DEFAULT_SIZE); + return XXH_OK; +} + +/*! + * @ingroup XXH3_family + * @brief Resets @p statePtr to begin a new hash keyed with a caller-provided secret. + * + * @param statePtr The state to reset. + * @param secret The secret to use. Only the pointer is stored in the state (it is + * not copied), so it must stay valid while the state is in use. + * @param secretSize Size of @p secret in bytes; must be at least XXH3_SECRET_SIZE_MIN. + * + * @return XXH_OK on success. + * @return XXH_ERROR if @p statePtr or @p secret is NULL, or if @p secretSize is too + * small. The state is left unmodified in that case. + */ +XXH_PUBLIC_API XXH_errorcode +XXH3_64bits_reset_withSecret(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize) +{ + /* + * Validate every argument before touching the state: XXH3_reset_internal() + * asserts secretSize >= XXH3_SECRET_SIZE_MIN and computes + * secretSize - XXH_STRIPE_LEN, which can wrap around for an undersized secret. + * Same check order as XXH3_64bits_reset_withSecretandSeed(). + */ + if (statePtr == NULL) return XXH_ERROR; + if (secret == NULL) return XXH_ERROR; + if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR; + XXH3_reset_internal(statePtr, 0, secret, secretSize); + return XXH_OK; +} + +/*! 
@ingroup XXH3_family */ +XXH_PUBLIC_API XXH_errorcode +XXH3_64bits_reset_withSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH64_hash_t seed) +{ + if (statePtr == NULL) return XXH_ERROR; + if (seed==0) return XXH3_64bits_reset(statePtr); + if ((seed != statePtr->seed) || (statePtr->extSecret != NULL)) + XXH3_initCustomSecret(statePtr->customSecret, seed); + XXH3_reset_internal(statePtr, seed, NULL, XXH_SECRET_DEFAULT_SIZE); + return XXH_OK; +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH_errorcode +XXH3_64bits_reset_withSecretandSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed64) +{ + if (statePtr == NULL) return XXH_ERROR; + if (secret == NULL) return XXH_ERROR; + if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR; + XXH3_reset_internal(statePtr, seed64, secret, secretSize); + statePtr->useSeed = 1; /* always, even if seed64==0 */ + return XXH_OK; +} + +/*! + * @internal + * @brief Processes a large input for XXH3_update() and XXH3_digest_long(). + * + * Unlike XXH3_hashLong_internal_loop(), this can process data that overlaps a block. 
+ * + * @param acc Pointer to the 8 accumulator lanes + * @param nbStripesSoFarPtr In/out pointer to the number of leftover stripes in the block* + * @param nbStripesPerBlock Number of stripes in a block + * @param input Input pointer + * @param nbStripes Number of stripes to process + * @param secret Secret pointer + * @param secretLimit Offset of the last block in @p secret + * @param f_acc Pointer to an XXH3_accumulate implementation + * @param f_scramble Pointer to an XXH3_scrambleAcc implementation + * @return Pointer past the end of @p input after processing + */ +XXH_FORCE_INLINE const xxh_u8 * +XXH3_consumeStripes(xxh_u64* XXH_RESTRICT acc, + size_t* XXH_RESTRICT nbStripesSoFarPtr, size_t nbStripesPerBlock, + const xxh_u8* XXH_RESTRICT input, size_t nbStripes, + const xxh_u8* XXH_RESTRICT secret, size_t secretLimit, + XXH3_f_accumulate f_acc, + XXH3_f_scrambleAcc f_scramble) +{ + const xxh_u8* initialSecret = secret + *nbStripesSoFarPtr * XXH_SECRET_CONSUME_RATE; + /* Process full blocks */ + if (nbStripes >= (nbStripesPerBlock - *nbStripesSoFarPtr)) { + /* Process the initial partial block... 
*/ + size_t nbStripesThisIter = nbStripesPerBlock - *nbStripesSoFarPtr; + + do { + /* Accumulate and scramble */ + f_acc(acc, input, initialSecret, nbStripesThisIter); + f_scramble(acc, secret + secretLimit); + input += nbStripesThisIter * XXH_STRIPE_LEN; + nbStripes -= nbStripesThisIter; + /* Then continue the loop with the full block size */ + nbStripesThisIter = nbStripesPerBlock; + initialSecret = secret; + } while (nbStripes >= nbStripesPerBlock); + *nbStripesSoFarPtr = 0; + } + /* Process a partial block */ + if (nbStripes > 0) { + f_acc(acc, input, initialSecret, nbStripes); + input += nbStripes * XXH_STRIPE_LEN; + *nbStripesSoFarPtr += nbStripes; + } + /* Return end pointer */ + return input; +} + +#ifndef XXH3_STREAM_USE_STACK +# if XXH_SIZE_OPT <= 0 && !defined(__clang__) /* clang doesn't need additional stack space */ +# define XXH3_STREAM_USE_STACK 1 +# endif +#endif +/* + * Both XXH3_64bits_update and XXH3_128bits_update use this routine. + */ +XXH_FORCE_INLINE XXH_errorcode +XXH3_update(XXH3_state_t* XXH_RESTRICT const state, + const xxh_u8* XXH_RESTRICT input, size_t len, + XXH3_f_accumulate f_acc, + XXH3_f_scrambleAcc f_scramble) +{ + if (input==NULL) { + XXH_ASSERT(len == 0); + return XXH_OK; + } + + XXH_ASSERT(state != NULL); + { const xxh_u8* const bEnd = input + len; + const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret; +#if defined(XXH3_STREAM_USE_STACK) && XXH3_STREAM_USE_STACK >= 1 + /* For some reason, gcc and MSVC seem to suffer greatly + * when operating accumulators directly into state. + * Operating into stack space seems to enable proper optimization. 
+ * clang, on the other hand, doesn't seem to need this trick */ + XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[8]; + XXH_memcpy(acc, state->acc, sizeof(acc)); +#else + xxh_u64* XXH_RESTRICT const acc = state->acc; +#endif + state->totalLen += len; + XXH_ASSERT(state->bufferedSize <= XXH3_INTERNALBUFFER_SIZE); + + /* small input : just fill in tmp buffer */ + if (len <= XXH3_INTERNALBUFFER_SIZE - state->bufferedSize) { + XXH_memcpy(state->buffer + state->bufferedSize, input, len); + state->bufferedSize += (XXH32_hash_t)len; + return XXH_OK; + } + + /* total input is now > XXH3_INTERNALBUFFER_SIZE */ + #define XXH3_INTERNALBUFFER_STRIPES (XXH3_INTERNALBUFFER_SIZE / XXH_STRIPE_LEN) + XXH_STATIC_ASSERT(XXH3_INTERNALBUFFER_SIZE % XXH_STRIPE_LEN == 0); /* clean multiple */ + + /* + * Internal buffer is partially filled (always, except at beginning) + * Complete it, then consume it. + */ + if (state->bufferedSize) { + size_t const loadSize = XXH3_INTERNALBUFFER_SIZE - state->bufferedSize; + XXH_memcpy(state->buffer + state->bufferedSize, input, loadSize); + input += loadSize; + XXH3_consumeStripes(acc, + &state->nbStripesSoFar, state->nbStripesPerBlock, + state->buffer, XXH3_INTERNALBUFFER_STRIPES, + secret, state->secretLimit, + f_acc, f_scramble); + state->bufferedSize = 0; + } + XXH_ASSERT(input < bEnd); + if (bEnd - input > XXH3_INTERNALBUFFER_SIZE) { + size_t nbStripes = (size_t)(bEnd - 1 - input) / XXH_STRIPE_LEN; + input = XXH3_consumeStripes(acc, + &state->nbStripesSoFar, state->nbStripesPerBlock, + input, nbStripes, + secret, state->secretLimit, + f_acc, f_scramble); + XXH_memcpy(state->buffer + sizeof(state->buffer) - XXH_STRIPE_LEN, input - XXH_STRIPE_LEN, XXH_STRIPE_LEN); + + } + /* Some remaining input (always) : buffer it */ + XXH_ASSERT(input < bEnd); + XXH_ASSERT(bEnd - input <= XXH3_INTERNALBUFFER_SIZE); + XXH_ASSERT(state->bufferedSize == 0); + XXH_memcpy(state->buffer, input, (size_t)(bEnd-input)); + state->bufferedSize = (XXH32_hash_t)(bEnd-input); +#if 
defined(XXH3_STREAM_USE_STACK) && XXH3_STREAM_USE_STACK >= 1 + /* save stack accumulators into state */ + XXH_memcpy(state->acc, acc, sizeof(acc)); +#endif + } + + return XXH_OK; +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH_errorcode +XXH3_64bits_update(XXH_NOESCAPE XXH3_state_t* state, XXH_NOESCAPE const void* input, size_t len) +{ + return XXH3_update(state, (const xxh_u8*)input, len, + XXH3_accumulate, XXH3_scrambleAcc); +} + + +XXH_FORCE_INLINE void +XXH3_digest_long (XXH64_hash_t* acc, + const XXH3_state_t* state, + const unsigned char* secret) +{ + xxh_u8 lastStripe[XXH_STRIPE_LEN]; + const xxh_u8* lastStripePtr; + + /* + * Digest on a local copy. This way, the state remains unaltered, and it can + * continue ingesting more input afterwards. + */ + XXH_memcpy(acc, state->acc, sizeof(state->acc)); + if (state->bufferedSize >= XXH_STRIPE_LEN) { + /* Consume remaining stripes then point to remaining data in buffer */ + size_t const nbStripes = (state->bufferedSize - 1) / XXH_STRIPE_LEN; + size_t nbStripesSoFar = state->nbStripesSoFar; + XXH3_consumeStripes(acc, + &nbStripesSoFar, state->nbStripesPerBlock, + state->buffer, nbStripes, + secret, state->secretLimit, + XXH3_accumulate, XXH3_scrambleAcc); + lastStripePtr = state->buffer + state->bufferedSize - XXH_STRIPE_LEN; + } else { /* bufferedSize < XXH_STRIPE_LEN */ + /* Copy to temp buffer */ + size_t const catchupSize = XXH_STRIPE_LEN - state->bufferedSize; + XXH_ASSERT(state->bufferedSize > 0); /* there is always some input buffered */ + XXH_memcpy(lastStripe, state->buffer + sizeof(state->buffer) - catchupSize, catchupSize); + XXH_memcpy(lastStripe + catchupSize, state->buffer, state->bufferedSize); + lastStripePtr = lastStripe; + } + /* Last stripe */ + XXH3_accumulate_512(acc, + lastStripePtr, + secret + state->secretLimit - XXH_SECRET_LASTACC_START); +} + +/*! 
@ingroup XXH3_family */ +XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest (XXH_NOESCAPE const XXH3_state_t* state) +{ + const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret; + if (state->totalLen > XXH3_MIDSIZE_MAX) { + XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[XXH_ACC_NB]; + XXH3_digest_long(acc, state, secret); + return XXH3_mergeAccs(acc, + secret + XXH_SECRET_MERGEACCS_START, + (xxh_u64)state->totalLen * XXH_PRIME64_1); + } + /* totalLen <= XXH3_MIDSIZE_MAX: digesting a short input */ + if (state->useSeed) + return XXH3_64bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed); + return XXH3_64bits_withSecret(state->buffer, (size_t)(state->totalLen), + secret, state->secretLimit + XXH_STRIPE_LEN); +} +#endif /* !XXH_NO_STREAM */ + + +/* ========================================== + * XXH3 128 bits (a.k.a XXH128) + * ========================================== + * XXH3's 128-bit variant has better mixing and strength than the 64-bit variant, + * even without counting the significantly larger output size. + * + * For example, extra steps are taken to avoid the seed-dependent collisions + * in 17-240 byte inputs (See XXH3_mix16B and XXH128_mix32B). + * + * This strength naturally comes at the cost of some speed, especially on short + * lengths. Note that longer hashes are about as fast as the 64-bit version + * due to it using only a slight modification of the 64-bit loop. + * + * XXH128 is also more oriented towards 64-bit machines. It is still extremely + * fast for a _128-bit_ hash on 32-bit (it usually clears XXH64). + */ + +XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t +XXH3_len_1to3_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + /* A doubled version of 1to3_64b with different constants. 
*/ + XXH_ASSERT(input != NULL); + XXH_ASSERT(1 <= len && len <= 3); + XXH_ASSERT(secret != NULL); + /* + * len = 1: combinedl = { input[0], 0x01, input[0], input[0] } + * len = 2: combinedl = { input[1], 0x02, input[0], input[1] } + * len = 3: combinedl = { input[2], 0x03, input[0], input[1] } + */ + { xxh_u8 const c1 = input[0]; + xxh_u8 const c2 = input[len >> 1]; + xxh_u8 const c3 = input[len - 1]; + xxh_u32 const combinedl = ((xxh_u32)c1 <<16) | ((xxh_u32)c2 << 24) + | ((xxh_u32)c3 << 0) | ((xxh_u32)len << 8); + xxh_u32 const combinedh = XXH_rotl32(XXH_swap32(combinedl), 13); + xxh_u64 const bitflipl = (XXH_readLE32(secret) ^ XXH_readLE32(secret+4)) + seed; + xxh_u64 const bitfliph = (XXH_readLE32(secret+8) ^ XXH_readLE32(secret+12)) - seed; + xxh_u64 const keyed_lo = (xxh_u64)combinedl ^ bitflipl; + xxh_u64 const keyed_hi = (xxh_u64)combinedh ^ bitfliph; + XXH128_hash_t h128; + h128.low64 = XXH64_avalanche(keyed_lo); + h128.high64 = XXH64_avalanche(keyed_hi); + return h128; + } +} + +XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t +XXH3_len_4to8_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(input != NULL); + XXH_ASSERT(secret != NULL); + XXH_ASSERT(4 <= len && len <= 8); + seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32; + { xxh_u32 const input_lo = XXH_readLE32(input); + xxh_u32 const input_hi = XXH_readLE32(input + len - 4); + xxh_u64 const input_64 = input_lo + ((xxh_u64)input_hi << 32); + xxh_u64 const bitflip = (XXH_readLE64(secret+16) ^ XXH_readLE64(secret+24)) + seed; + xxh_u64 const keyed = input_64 ^ bitflip; + + /* Shift len to the left to ensure it is even, this avoids even multiplies. 
*/ + XXH128_hash_t m128 = XXH_mult64to128(keyed, XXH_PRIME64_1 + (len << 2)); + + m128.high64 += (m128.low64 << 1); + m128.low64 ^= (m128.high64 >> 3); + + m128.low64 = XXH_xorshift64(m128.low64, 35); + m128.low64 *= PRIME_MX2; + m128.low64 = XXH_xorshift64(m128.low64, 28); + m128.high64 = XXH3_avalanche(m128.high64); + return m128; + } +} + +XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t +XXH3_len_9to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(input != NULL); + XXH_ASSERT(secret != NULL); + XXH_ASSERT(9 <= len && len <= 16); + { xxh_u64 const bitflipl = (XXH_readLE64(secret+32) ^ XXH_readLE64(secret+40)) - seed; + xxh_u64 const bitfliph = (XXH_readLE64(secret+48) ^ XXH_readLE64(secret+56)) + seed; + xxh_u64 const input_lo = XXH_readLE64(input); + xxh_u64 input_hi = XXH_readLE64(input + len - 8); + XXH128_hash_t m128 = XXH_mult64to128(input_lo ^ input_hi ^ bitflipl, XXH_PRIME64_1); + /* + * Put len in the middle of m128 to ensure that the length gets mixed to + * both the low and high bits in the 128x64 multiply below. + */ + m128.low64 += (xxh_u64)(len - 1) << 54; + input_hi ^= bitfliph; + /* + * Add the high 32 bits of input_hi to the high 32 bits of m128, then + * add the long product of the low 32 bits of input_hi and XXH_PRIME32_2 to + * the high 64 bits of m128. + * + * The best approach to this operation is different on 32-bit and 64-bit. + */ + if (sizeof(void *) < sizeof(xxh_u64)) { /* 32-bit */ + /* + * 32-bit optimized version, which is more readable. + * + * On 32-bit, it removes an ADC and delays a dependency between the two + * halves of m128.high64, but it generates an extra mask on 64-bit. + */ + m128.high64 += (input_hi & 0xFFFFFFFF00000000ULL) + XXH_mult32to64((xxh_u32)input_hi, XXH_PRIME32_2); + } else { + /* + * 64-bit optimized (albeit more confusing) version. 
+ * + * Uses some properties of addition and multiplication to remove the mask: + * + * Let: + * a = input_hi.lo = (input_hi & 0x00000000FFFFFFFF) + * b = input_hi.hi = (input_hi & 0xFFFFFFFF00000000) + * c = XXH_PRIME32_2 + * + * a + (b * c) + * Inverse Property: x + y - x == y + * a + (b * (1 + c - 1)) + * Distributive Property: x * (y + z) == (x * y) + (x * z) + * a + (b * 1) + (b * (c - 1)) + * Identity Property: x * 1 == x + * a + b + (b * (c - 1)) + * + * Substitute a, b, and c: + * input_hi.hi + input_hi.lo + ((xxh_u64)input_hi.lo * (XXH_PRIME32_2 - 1)) + * + * Since input_hi.hi + input_hi.lo == input_hi, we get this: + * input_hi + ((xxh_u64)input_hi.lo * (XXH_PRIME32_2 - 1)) + */ + m128.high64 += input_hi + XXH_mult32to64((xxh_u32)input_hi, XXH_PRIME32_2 - 1); + } + /* m128 ^= XXH_swap64(m128 >> 64); */ + m128.low64 ^= XXH_swap64(m128.high64); + + { /* 128x64 multiply: h128 = m128 * XXH_PRIME64_2; */ + XXH128_hash_t h128 = XXH_mult64to128(m128.low64, XXH_PRIME64_2); + h128.high64 += m128.high64 * XXH_PRIME64_2; + + h128.low64 = XXH3_avalanche(h128.low64); + h128.high64 = XXH3_avalanche(h128.high64); + return h128; + } } +} + +/* + * Assumption: `secret` size is >= XXH3_SECRET_SIZE_MIN + */ +XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t +XXH3_len_0to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(len <= 16); + { if (len > 8) return XXH3_len_9to16_128b(input, len, secret, seed); + if (len >= 4) return XXH3_len_4to8_128b(input, len, secret, seed); + if (len) return XXH3_len_1to3_128b(input, len, secret, seed); + { XXH128_hash_t h128; + xxh_u64 const bitflipl = XXH_readLE64(secret+64) ^ XXH_readLE64(secret+72); + xxh_u64 const bitfliph = XXH_readLE64(secret+80) ^ XXH_readLE64(secret+88); + h128.low64 = XXH64_avalanche(seed ^ bitflipl); + h128.high64 = XXH64_avalanche( seed ^ bitfliph); + return h128; + } } +} + +/* + * A bit slower than XXH3_mix16B, but handles multiply by zero better. 
+ */ +XXH_FORCE_INLINE XXH128_hash_t +XXH128_mix32B(XXH128_hash_t acc, const xxh_u8* input_1, const xxh_u8* input_2, + const xxh_u8* secret, XXH64_hash_t seed) +{ + acc.low64 += XXH3_mix16B (input_1, secret+0, seed); + acc.low64 ^= XXH_readLE64(input_2) + XXH_readLE64(input_2 + 8); + acc.high64 += XXH3_mix16B (input_2, secret+16, seed); + acc.high64 ^= XXH_readLE64(input_1) + XXH_readLE64(input_1 + 8); + return acc; +} + + +XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t +XXH3_len_17to128_128b(const xxh_u8* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize, + XXH64_hash_t seed) +{ + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize; + XXH_ASSERT(16 < len && len <= 128); + + { XXH128_hash_t acc; + acc.low64 = len * XXH_PRIME64_1; + acc.high64 = 0; + +#if XXH_SIZE_OPT >= 1 + { + /* Smaller, but slightly slower. */ + unsigned int i = (unsigned int)(len - 1) / 32; + do { + acc = XXH128_mix32B(acc, input+16*i, input+len-16*(i+1), secret+32*i, seed); + } while (i-- != 0); + } +#else + if (len > 32) { + if (len > 64) { + if (len > 96) { + acc = XXH128_mix32B(acc, input+48, input+len-64, secret+96, seed); + } + acc = XXH128_mix32B(acc, input+32, input+len-48, secret+64, seed); + } + acc = XXH128_mix32B(acc, input+16, input+len-32, secret+32, seed); + } + acc = XXH128_mix32B(acc, input, input+len-16, secret, seed); +#endif + { XXH128_hash_t h128; + h128.low64 = acc.low64 + acc.high64; + h128.high64 = (acc.low64 * XXH_PRIME64_1) + + (acc.high64 * XXH_PRIME64_4) + + ((len - seed) * XXH_PRIME64_2); + h128.low64 = XXH3_avalanche(h128.low64); + h128.high64 = (XXH64_hash_t)0 - XXH3_avalanche(h128.high64); + return h128; + } + } +} + +XXH_NO_INLINE XXH_PUREF XXH128_hash_t +XXH3_len_129to240_128b(const xxh_u8* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize, + XXH64_hash_t seed) +{ + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize; + XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX); 
+ + { XXH128_hash_t acc; + unsigned i; + acc.low64 = len * XXH_PRIME64_1; + acc.high64 = 0; + /* + * We set as `i` as offset + 32. We do this so that unchanged + * `len` can be used as upper bound. This reaches a sweet spot + * where both x86 and aarch64 get simple agen and good codegen + * for the loop. + */ + for (i = 32; i < 160; i += 32) { + acc = XXH128_mix32B(acc, + input + i - 32, + input + i - 16, + secret + i - 32, + seed); + } + acc.low64 = XXH3_avalanche(acc.low64); + acc.high64 = XXH3_avalanche(acc.high64); + /* + * NB: `i <= len` will duplicate the last 32-bytes if + * len % 32 was zero. This is an unfortunate necessity to keep + * the hash result stable. + */ + for (i=160; i <= len; i += 32) { + acc = XXH128_mix32B(acc, + input + i - 32, + input + i - 16, + secret + XXH3_MIDSIZE_STARTOFFSET + i - 160, + seed); + } + /* last bytes */ + acc = XXH128_mix32B(acc, + input + len - 16, + input + len - 32, + secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET - 16, + (XXH64_hash_t)0 - seed); + + { XXH128_hash_t h128; + h128.low64 = acc.low64 + acc.high64; + h128.high64 = (acc.low64 * XXH_PRIME64_1) + + (acc.high64 * XXH_PRIME64_4) + + ((len - seed) * XXH_PRIME64_2); + h128.low64 = XXH3_avalanche(h128.low64); + h128.high64 = (XXH64_hash_t)0 - XXH3_avalanche(h128.high64); + return h128; + } + } +} + +XXH_FORCE_INLINE XXH128_hash_t +XXH3_hashLong_128b_internal(const void* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize, + XXH3_f_accumulate f_acc, + XXH3_f_scrambleAcc f_scramble) +{ + XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[XXH_ACC_NB] = XXH3_INIT_ACC; + + XXH3_hashLong_internal_loop(acc, (const xxh_u8*)input, len, secret, secretSize, f_acc, f_scramble); + + /* converge into final hash */ + XXH_STATIC_ASSERT(sizeof(acc) == 64); + XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START); + { XXH128_hash_t h128; + h128.low64 = XXH3_mergeAccs(acc, + secret + XXH_SECRET_MERGEACCS_START, + (xxh_u64)len * 
XXH_PRIME64_1); + h128.high64 = XXH3_mergeAccs(acc, + secret + secretSize + - sizeof(acc) - XXH_SECRET_MERGEACCS_START, + ~((xxh_u64)len * XXH_PRIME64_2)); + return h128; + } +} + +/* + * It's important for performance that XXH3_hashLong() is not inlined. + */ +XXH_NO_INLINE XXH_PUREF XXH128_hash_t +XXH3_hashLong_128b_default(const void* XXH_RESTRICT input, size_t len, + XXH64_hash_t seed64, + const void* XXH_RESTRICT secret, size_t secretLen) +{ + (void)seed64; (void)secret; (void)secretLen; + return XXH3_hashLong_128b_internal(input, len, XXH3_kSecret, sizeof(XXH3_kSecret), + XXH3_accumulate, XXH3_scrambleAcc); +} + +/* + * It's important for performance to pass @p secretLen (when it's static) + * to the compiler, so that it can properly optimize the vectorized loop. + * + * When the secret size is unknown, or on GCC 12 where the mix of NO_INLINE and FORCE_INLINE + * breaks -Og, this is XXH_NO_INLINE. + */ +XXH3_WITH_SECRET_INLINE XXH128_hash_t +XXH3_hashLong_128b_withSecret(const void* XXH_RESTRICT input, size_t len, + XXH64_hash_t seed64, + const void* XXH_RESTRICT secret, size_t secretLen) +{ + (void)seed64; + return XXH3_hashLong_128b_internal(input, len, (const xxh_u8*)secret, secretLen, + XXH3_accumulate, XXH3_scrambleAcc); +} + +XXH_FORCE_INLINE XXH128_hash_t +XXH3_hashLong_128b_withSeed_internal(const void* XXH_RESTRICT input, size_t len, + XXH64_hash_t seed64, + XXH3_f_accumulate f_acc, + XXH3_f_scrambleAcc f_scramble, + XXH3_f_initCustomSecret f_initSec) +{ + if (seed64 == 0) + return XXH3_hashLong_128b_internal(input, len, + XXH3_kSecret, sizeof(XXH3_kSecret), + f_acc, f_scramble); + { XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE]; + f_initSec(secret, seed64); + return XXH3_hashLong_128b_internal(input, len, (const xxh_u8*)secret, sizeof(secret), + f_acc, f_scramble); + } +} + +/* + * It's important for performance that XXH3_hashLong is not inlined. 
+ */ +XXH_NO_INLINE XXH128_hash_t +XXH3_hashLong_128b_withSeed(const void* input, size_t len, + XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen) +{ + /* secret and secretLen are unused here: only input, len and seed64 are forwarded below */ + (void)secret; (void)secretLen; + return XXH3_hashLong_128b_withSeed_internal(input, len, seed64, + XXH3_accumulate, XXH3_scrambleAcc, XXH3_initCustomSecret); +} + +/* + * Pointer type for the long-input 128-bit hashers handed to XXH3_128bits_internal as f_hl128. + * XXH3_128bits_internal calls it as f_hl128(input, len, seed64, secret, secretLen). + */ +typedef XXH128_hash_t (*XXH3_hashLong128_f)(const void* XXH_RESTRICT, size_t, + XXH64_hash_t, const void* XXH_RESTRICT, size_t); + +/* + * Shared one-shot entry point: selects a hashing routine from the input length. + * len <= 16, len <= 128 and len <= XXH3_MIDSIZE_MAX each go to a dedicated short-input function; + * only longer inputs are passed to f_hl128. + */ +XXH_FORCE_INLINE XXH128_hash_t +XXH3_128bits_internal(const void* input, size_t len, + XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen, + XXH3_hashLong128_f f_hl128) +{ + XXH_ASSERT(secretLen >= XXH3_SECRET_SIZE_MIN); + /* + * If an action is to be taken if `secret` conditions are not respected, + * it should be done here. + * For now, it's a contract pre-condition. + * Adding a check and a branch here would cost performance at every hash. + */ + if (len <= 16) + return XXH3_len_0to16_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, seed64); + if (len <= 128) + return XXH3_len_17to128_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64); + if (len <= XXH3_MIDSIZE_MAX) + return XXH3_len_129to240_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64); + /* len > XXH3_MIDSIZE_MAX: the only path on which f_hl128 is dereferenced */ + return f_hl128(input, len, seed64, secret, secretLen); +} + + +/* === Public XXH128 API === */ + +/*! @ingroup XXH3_family + * 128-bit hash using seed 0 and the built-in secret XXH3_kSecret. + */ +XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(XXH_NOESCAPE const void* input, size_t len) +{ + return XXH3_128bits_internal(input, len, 0, + XXH3_kSecret, sizeof(XXH3_kSecret), + XXH3_hashLong_128b_default); +} + +/*! @ingroup XXH3_family + * 128-bit hash using seed 0 and the caller-provided secret of secretSize bytes. + * XXH3_128bits_internal asserts secretSize >= XXH3_SECRET_SIZE_MIN but does not check it at run time. + */ +XXH_PUBLIC_API XXH128_hash_t +XXH3_128bits_withSecret(XXH_NOESCAPE const void* input, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize) +{ + return XXH3_128bits_internal(input, len, 0, + (const xxh_u8*)secret, secretSize, + XXH3_hashLong_128b_withSecret); +} + +/*! 
@ingroup XXH3_family */ +XXH_PUBLIC_API XXH128_hash_t +XXH3_128bits_withSeed(XXH_NOESCAPE const void* input, size_t len, XXH64_hash_t seed) +{ + /* seeded variant: built-in secret XXH3_kSecret plus the caller's seed */ + return XXH3_128bits_internal(input, len, seed, + XXH3_kSecret, sizeof(XXH3_kSecret), + XXH3_hashLong_128b_withSeed); +} + +/*! @ingroup XXH3_family + * One-shot 128-bit hash taking both a secret and a seed. + * Inputs of at most XXH3_MIDSIZE_MAX bytes are hashed with the built-in secret and `seed`; + * longer inputs are hashed with the caller's `secret`. + */ +XXH_PUBLIC_API XXH128_hash_t +XXH3_128bits_withSecretandSeed(XXH_NOESCAPE const void* input, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed) +{ + /* + * Passing NULL as the long-input hasher is safe on this branch: + * XXH3_128bits_internal only calls it when len > XXH3_MIDSIZE_MAX. + */ + if (len <= XXH3_MIDSIZE_MAX) + return XXH3_128bits_internal(input, len, seed, XXH3_kSecret, sizeof(XXH3_kSecret), NULL); + /* long input: XXH3_hashLong_128b_withSecret receives seed but discards it ((void)seed64) */ + return XXH3_hashLong_128b_withSecret(input, len, seed, secret, secretSize); +} + +/*! @ingroup XXH3_family + * Forwards directly to XXH3_128bits_withSeed. + */ +XXH_PUBLIC_API XXH128_hash_t +XXH128(XXH_NOESCAPE const void* input, size_t len, XXH64_hash_t seed) +{ + return XXH3_128bits_withSeed(input, len, seed); +} + + +/* === XXH3 128-bit streaming === */ +#ifndef XXH_NO_STREAM +/* + * All initialization and update functions are identical to 64-bit streaming variant. + * The only difference is the finalization routine. + */ + +/*! @ingroup XXH3_family + * Forwards to XXH3_64bits_reset. + */ +XXH_PUBLIC_API XXH_errorcode +XXH3_128bits_reset(XXH_NOESCAPE XXH3_state_t* statePtr) +{ + return XXH3_64bits_reset(statePtr); +} + +/*! @ingroup XXH3_family + * Forwards to XXH3_64bits_reset_withSecret. + */ +XXH_PUBLIC_API XXH_errorcode +XXH3_128bits_reset_withSecret(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize) +{ + return XXH3_64bits_reset_withSecret(statePtr, secret, secretSize); +} + +/*! @ingroup XXH3_family + * Forwards to XXH3_64bits_reset_withSeed. + */ +XXH_PUBLIC_API XXH_errorcode +XXH3_128bits_reset_withSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH64_hash_t seed) +{ + return XXH3_64bits_reset_withSeed(statePtr, seed); +} + +/*! 
@ingroup XXH3_family */ +XXH_PUBLIC_API XXH_errorcode +XXH3_128bits_reset_withSecretandSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed) +{ + /* the streaming state is shared with the 64-bit variant, so reset simply forwards */ + return XXH3_64bits_reset_withSecretandSeed(statePtr, secret, secretSize, seed); +} + +/*! @ingroup XXH3_family + * Feeds `len` bytes of `input` into `state`; forwards to XXH3_64bits_update. + */ +XXH_PUBLIC_API XXH_errorcode +XXH3_128bits_update(XXH_NOESCAPE XXH3_state_t* state, XXH_NOESCAPE const void* input, size_t len) +{ + return XXH3_64bits_update(state, input, len); +} + +/*! @ingroup XXH3_family + * Returns the 128-bit hash of everything fed to `state` so far, without modifying `state`. + * Totals above XXH3_MIDSIZE_MAX are finalized from a local copy of the accumulators; + * shorter totals are re-hashed in one shot from state->buffer. + */ +XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (XXH_NOESCAPE const XXH3_state_t* state) +{ + /* an external secret, when set, takes precedence over the state's own customSecret */ + const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret; + if (state->totalLen > XXH3_MIDSIZE_MAX) { + XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[XXH_ACC_NB]; + XXH3_digest_long(acc, state, secret); + XXH_ASSERT(state->secretLimit + XXH_STRIPE_LEN >= sizeof(acc) + XXH_SECRET_MERGEACCS_START); + { XXH128_hash_t h128; + h128.low64 = XXH3_mergeAccs(acc, + secret + XXH_SECRET_MERGEACCS_START, + (xxh_u64)state->totalLen * XXH_PRIME64_1); + h128.high64 = XXH3_mergeAccs(acc, + secret + state->secretLimit + XXH_STRIPE_LEN + - sizeof(acc) - XXH_SECRET_MERGEACCS_START, + ~((xxh_u64)state->totalLen * XXH_PRIME64_2)); + return h128; + } + } + /* len <= XXH3_MIDSIZE_MAX : short code */ + /* + * Branch on useSeed, exactly as XXH3_64bits_digest does, instead of on seed != 0, + * so both digests agree on which states take the seeded path. + * NOTE(review): presumably useSeed can be set while seed == 0 after a secret+seed reset; + * confirm against XXH3_64bits_reset_withSecretandSeed. + */ + if (state->useSeed) + return XXH3_128bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed); + return XXH3_128bits_withSecret(state->buffer, (size_t)(state->totalLen), + secret, state->secretLimit + XXH_STRIPE_LEN); +} +#endif /* !XXH_NO_STREAM */ +/* 128-bit utility functions */ + +#include /* memcmp, memcpy */ + +/* return : 1 is equal, 0 if different */ +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2) +{ + /* note : XXH128_hash_t is compact, it has no padding byte */ + return !(memcmp(&h1, &h2, sizeof(h1))); +} + +/* This prototype is compatible with stdlib's qsort(). 
+ * @return : >0 if *h128_1 > *h128_2 + * <0 if *h128_1 < *h128_2 + * =0 if *h128_1 == *h128_2 */ +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API int XXH128_cmp(XXH_NOESCAPE const void* h128_1, XXH_NOESCAPE const void* h128_2) +{ + XXH128_hash_t const h1 = *(const XXH128_hash_t*)h128_1; + XXH128_hash_t const h2 = *(const XXH128_hash_t*)h128_2; + /* (a > b) - (b > a) yields -1, 0 or +1; high64 is compared first, low64 only breaks ties */ + int const hcmp = (h1.high64 > h2.high64) - (h2.high64 > h1.high64); + /* note : bets that, in most cases, hash values are different */ + if (hcmp) return hcmp; + return (h1.low64 > h2.low64) - (h2.low64 > h1.low64); +} + + +/*====== Canonical representation ======*/ +/*! @ingroup XXH3_family + * Writes `hash` into `dst` as high64 followed by low64. + * On little-endian CPUs each half is byte-swapped before being copied. + */ +XXH_PUBLIC_API void +XXH128_canonicalFromHash(XXH_NOESCAPE XXH128_canonical_t* dst, XXH128_hash_t hash) +{ + XXH_STATIC_ASSERT(sizeof(XXH128_canonical_t) == sizeof(XXH128_hash_t)); + if (XXH_CPU_LITTLE_ENDIAN) { + hash.high64 = XXH_swap64(hash.high64); + hash.low64 = XXH_swap64(hash.low64); + } + /* high64 occupies the first 8 bytes of dst, low64 the following 8 */ + XXH_memcpy(dst, &hash.high64, sizeof(hash.high64)); + XXH_memcpy((char*)dst + sizeof(hash.high64), &hash.low64, sizeof(hash.low64)); +} + +/*! @ingroup XXH3_family + * Rebuilds a hash from its canonical form: high64 is read big-endian from the start of `src`, + * low64 big-endian from src->digest + 8. + */ +XXH_PUBLIC_API XXH128_hash_t +XXH128_hashFromCanonical(XXH_NOESCAPE const XXH128_canonical_t* src) +{ + XXH128_hash_t h; + h.high64 = XXH_readBE64(src); + h.low64 = XXH_readBE64(src->digest + 8); + return h; +} + + + +/* ========================================== + * Secret generators + * ========================================== + */ +/* Smaller of x and y; the selected argument is evaluated twice, so avoid arguments with side effects. */ +#define XXH_MIN(x, y) (((x) > (y)) ? (y) : (x)) + +/* XORs h128 into the 16 bytes at dst: low64 into bytes 0-7, high64 into bytes 8-15, both as little-endian words. */ +XXH_FORCE_INLINE void XXH3_combine16(void* dst, XXH128_hash_t h128) +{ + XXH_writeLE64( dst, XXH_readLE64(dst) ^ h128.low64 ); + XXH_writeLE64( (char*)dst+8, XXH_readLE64((char*)dst+8) ^ h128.high64 ); +} + +/*! 
@ingroup XXH3_family */ +XXH_PUBLIC_API XXH_errorcode +XXH3_generateSecret(XXH_NOESCAPE void* secretBuffer, size_t secretSize, XXH_NOESCAPE const void* customSeed, size_t customSeedSize) +{ +#if (XXH_DEBUGLEVEL >= 1) + XXH_ASSERT(secretBuffer != NULL); + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); +#else + /* production mode, assert() are disabled */ + if (secretBuffer == NULL) return XXH_ERROR; + if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR; +#endif + + if (customSeedSize == 0) { + customSeed = XXH3_kSecret; + customSeedSize = XXH_SECRET_DEFAULT_SIZE; + } +#if (XXH_DEBUGLEVEL >= 1) + XXH_ASSERT(customSeed != NULL); +#else + if (customSeed == NULL) return XXH_ERROR; +#endif + + /* Fill secretBuffer with a copy of customSeed - repeat as needed */ + { size_t pos = 0; + while (pos < secretSize) { + size_t const toCopy = XXH_MIN((secretSize - pos), customSeedSize); + memcpy((char*)secretBuffer + pos, customSeed, toCopy); + pos += toCopy; + } } + + { size_t const nbSeg16 = secretSize / 16; + size_t n; + XXH128_canonical_t scrambler; + XXH128_canonicalFromHash(&scrambler, XXH128(customSeed, customSeedSize, 0)); + for (n=0; n +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +using namespace std::chrono_literals; +uint64_t bswap(uint64_t x) +{ + x = (x & 0x00000000FFFFFFFF) << 32 | (x & 0xFFFFFFFF00000000) >> 32; + x = (x & 0x0000FFFF0000FFFF) << 16 | (x & 0xFFFF0000FFFF0000) >> 16; + x = (x & 0x00FF00FF00FF00FF) << 8 | (x & 0xFF00FF00FF00FF00) >> 8; + return x; +} + +int64_t rand64() +{ + thread_local static std::mt19937 gen(rand()); + return uint64_t(gen()) << 32 | gen(); +} + +std::string add_comma(uint64_t s) +{ + if (s < 1000) + return std::to_string(s); + if (s < 1000000) + { + return std::to_string(s / 1000) + ',' + std::to_string((s % 1000) + 1000).substr(1); + } + if (s < 1000000000) + { + return std::to_string(s / 1000000) + ',' + + std::to_string(((s % 1000000) / 1000) + 
1000).substr(1) + "," + + std::to_string((s % 1000) + 1000).substr(1); + } + return std::to_string(s); +}; + +int main(int argc, char** argv) +{ + triedent::set_current_thread_name("main"); + TRIEDENT_WARN("Hello, Welcome to Triedent!"); + namespace po = boost::program_options; + uint32_t hot_page_c = 34; + uint32_t warm_page_c = 33; + uint32_t cool_page_c = 35; + uint32_t cold_page_c = 35; + uint64_t num_objects = 500 * 1000 * 1000; + std::string db_dir; + bool use_string = false; + uint64_t insert_count; + uint64_t status_count; + bool check_content = false; + uint32_t rounds = 10; + uint32_t count = 1000 * 1000 * 10; + uint32_t group = 16; + uint32_t sync_mode = 0; + bool cor = true; + bool run_compactor = true; + bool run_validate = false; + + uint32_t num_read_threads = 6; + po::options_description desc("Allowed options"); + auto opt = desc.add_options(); + opt("help,h", "print this message"); + opt("reset", "reset the database"); + opt("seq-write", "perform seq writes"); + opt("seq-read", "perform seq reads"); + opt("seq-update", "perform seq updates, assumes after preform seq writes"); + opt("rand-write", "perform random writes"); + opt("rand-write-read", "perform random writes while reading"); + opt("read-only", "just query existing db"); + opt("sparce", po::value(&use_string)->default_value(false), "use sparse string keys"); + opt("compact", po::value(&run_compactor)->default_value(true), + "enable/disable background compactor, will compact between rounds instead"); + opt("validate", po::value(&run_validate)->default_value(false), + "enable/disable state validation between rounds"); + opt("data-dir", po::value(&db_dir)->default_value("./big.dir"), + "the folder that contains the database"); + opt("read-threads,r", po::value(&num_read_threads)->default_value(6), + "number of read threads to launch"); + opt("sync-mode", po::value(&sync_mode)->default_value(sync_mode), + "0 = none, 1 = aysnc, 2 = blocking"); + opt("cache-on-read", 
po::value(&cor)->default_value(cor), + "copy read objects to cache, higher"); + opt("rounds", po::value(&rounds)->default_value(10), + "the number of times to run each segment"); + opt("count", po::value(&count)->default_value(count), + "the number of times to run each round"); + opt("group", po::value(&group)->default_value(group), + "the number of items in each logical transaction"); + opt("hot-size,H", po::value(&hot_page_c)->default_value(33), + "the power of 2 for the amount of RAM for the hot ring, RAM = 2^(hot_size) bytes"); + opt("warm-size,w", po::value(&warm_page_c)->default_value(33), + "the power of 2 for the amount of RAM for the warm ring, RAM = 2^(warm_size) bytes"); + opt("cool-size,c", po::value(&cool_page_c)->default_value(33), + "the power of 2 for the amount of RAM for the cool ring, RAM = 2^(cool_size) bytes"); + opt("cold-size,C", po::value(&cold_page_c)->default_value(33), + "the power of 2 for the amount of RAM for the cold ring, RAM = 2^(cold_size) bytes"); + opt("max-objects,O", po::value(&num_objects)->default_value(num_objects), + "the maximum number of unique objects in the database"); + opt("insert,i", po::value(&insert_count)->default_value(100000000ull), + "the number of random key/value pairs to insert"); + opt("stat,s", po::value(&status_count)->default_value(1000000ull), + "the number of how often to print stats"); + opt("check-content", po::bool_switch(&check_content), "check content against std::map (slow)"); + + po::variables_map vm; + po::store(po::parse_command_line(argc, argv, desc), vm); + po::notify(vm); + + if (vm.count("help")) + { + std::cerr << desc << "\n"; + return 1; + } + + if (vm.count("reset")) + { + std::cerr << "resetting database\n"; + std::filesystem::remove_all(db_dir); + triedent::database::create(db_dir, {}); + } + bool read_only = false; + if (vm.count("read-only")) + { + read_only = true; + } + + if (num_read_threads > 64) + { + std::cerr << "maximum number of read threads is 64\n"; + return 0; + } + + 
triedent::DB::Options options{.config = {.cache_on_read = cor, + .run_compact_thread = run_compactor, + .sync_mode = (triedent::sync_type)sync_mode + + }}; + + std::cout << "opening database '" << db_dir << "'\n"; + auto db = triedent::DB::open(options, db_dir); + auto& ws = db->writeSession(); + + // uint32_t count = 1000 * 1000 * 10; + int64_t key = 0; + + /* + for( int i = 0; i < 7; ++i ) { + key++; + auto wt = ws.startTransaction(); + auto old_size = wt->put(std::span((char*)&key, 8), std::span((char*)&key, 8)); + wt->commit(); + std::cout << "-------------\n"; + } + std::cout << "=================\n"; + + { + auto rs = db->createReadSession(); + auto rt = rs->startTransaction(); + + for( int i = 0; i < 2; ++i ) { + key++; + auto wt = ws.startTransaction(); + auto old_size = wt->put(std::span((char*)&key, 8), std::span((char*)&key, 8)); + wt->commit(); + std::cout << "-------------\n"; + } + std::cout << "read session going away\n"; + } + std::cout << "write session going away\n"; + + return 0; + */ + + if (vm.count("seq-write")) + { + std::cout << "Starting to insert " << rounds << " rounds of " << add_comma(count) + << " sequential key/values\n"; + for (uint32_t round = 0; round < rounds; ++round) + { + auto start = std::chrono::steady_clock::now(); + + for (uint32_t g = 0; g < (count / group); ++g) + { + auto wt = ws.startTransaction(); + + for (uint32_t i = 0; i < group; ++i) + { + ++key; + auto kv = bswap(key); + auto old_size = + wt->put(std::span((char*)&kv, 8), std::span((char*)&key, 8)); + if (old_size != -1) + { + std::cerr << "this should be a new value! 
: " << old_size << "\n"; + return 0; + } + } + wt->commit(); + } + + auto end = std::chrono::steady_clock::now(); + auto delta = end - start; + + std::cerr << std::setw(4) << round << std::setw(12) + << add_comma(int64_t( + count / + (std::chrono::duration(delta).count() / 1000))) + << " items/sec \n"; + } + } + if (vm.count("seq-read")) + { + std::cout << "Starting to get" << rounds << " rounds of " << add_comma(count) + << " sequential key/values\n"; + auto rs = db->createReadSession(); + auto rt = rs->startTransaction(); + key = 0; + std::vector result; + for (uint32_t round = 0; round < rounds; ++round) + { + auto start = std::chrono::steady_clock::now(); + + for (uint32_t i = 0; i < count; ++i) + { + ++key; + //auto kv = key;//bswap(key); + auto kv = bswap(key); + auto found = rt->get(std::span((char*)&kv, 8), &result); + if (8 != result.size()) // not found.ok) + { + std::cerr << "unable to find key: " << key << "\n"; + return 0; + } + else + { + if (key != *((int64_t*)(result.data()))) + { + // std::cerr << "value didn't match expected\n"; + // return 0; + } + } + result.resize(0); + } + + auto end = std::chrono::steady_clock::now(); + auto delta = end - start; + + std::cerr << std::setw(4) << round << std::setw(12) + << add_comma(int64_t( + count / + (std::chrono::duration(delta).count() / 1000))) + << " items/sec \n"; + } + } + if (vm.count("seq-update")) + { + std::cout << "Starting to update " << rounds << " rounds of " << add_comma(count) + << " sequential key/values\n"; + key = 0; + for (uint32_t round = 0; round < rounds; ++round) + { + auto start = std::chrono::steady_clock::now(); + auto wt = ws.startTransaction(); + + for (uint32_t i = 0; i < count; ++i) + { + // std::cerr<put(std::span((char*)&kv, 8), std::span((char*)&val, 8)); + + if (old_size != 8) + { + std::cerr << "unable to find old value! 
" << old_size << " " << key << "\n"; + return 0; + } + } + + wt->commit(); + auto end = std::chrono::steady_clock::now(); + auto delta = end - start; + + std::cerr << std::setw(4) << round << std::setw(12) + << add_comma(int64_t( + count / + (std::chrono::duration(delta).count() / 1000))) + << " items/sec \n"; + } + } + if (vm.count("rand-write")) + { + std::cout << "Starting to insert " << rounds << " rounds of " << add_comma(count) + << " random key/values\n"; + key = 0; + for (uint32_t round = 0; round < rounds; ++round) + { + auto start = std::chrono::steady_clock::now(); + for (uint32_t g = 0; g < (count / group); ++g) + { + auto wt = ws.startTransaction(); + + for (uint32_t i = 0; i < group; ++i) + { + key = rand64(); + int64_t val = 16 * g + i; + auto old_size = + wt->put(std::span((char*)&key, 8), std::span((char*)&val, 8)); + } + wt->commit(); + } + auto end = std::chrono::steady_clock::now(); + auto delta = end - start; + + std::cerr << std::setw(4) << round << std::setw(12) + << add_comma(int64_t( + count / + (std::chrono::duration(delta).count() / 1000))) + << " items/sec \n"; + if (not run_compactor) + { + ws.validate(); + while (db->compact()) + ; + ws.validate(); + } + else + { + ws.validate(); + } + } + using namespace std::chrono_literals; + /* + db->print(); + std::cerr<< "compact one\n"; + for( uint32_t i = 0; i < 30; ++i ) { + db->compact(); + } + db->print(); + std::cerr<< "\nsleeping for 3 seconds... 
so compact can work\n\n"; + std::this_thread::sleep_for(3000ms); + */ + } + if (0) + { + auto rs = db->createReadSession(); + auto rt = rs->startTransaction(); + std::cout << "Starting to find lower bound " << rounds << " rounds of " << add_comma(count) + << " random key/values\n"; + std::vector result_key; + std::vector result_val; + key = 0; + for (uint32_t round = 0; round < rounds; ++round) + { + auto start = std::chrono::steady_clock::now(); + + for (uint32_t i = 0; i < count; ++i) + { + key = rand64(); + int64_t val = i; + rt->get_greater_equal(std::span((const char*)&key, 8), &result_key, + &result_val); + } + + auto end = std::chrono::steady_clock::now(); + auto delta = end - start; + + std::cerr << std::setw(4) << round << std::setw(12) + << add_comma(int64_t( + count / + (std::chrono::duration(delta).count() / 1000))) + << " items/sec \n"; + } + + /* + std::cout << "Starting to find lower bound " << rounds << " rounds of " << add_comma(count) + << " random key/values in " << num_read_threads << " threads\n"; + + for (uint32_t round = 0; round < rounds; ++round) + { + std::vector> rthreads; + rthreads.reserve(num_read_threads); + + auto start = std::chrono::steady_clock::now(); + + for (uint32_t i = 0; i < num_read_threads; ++i) + { + auto read_loop = [&]() + { + auto rs = db->createReadSession(); + auto rt = rs->startTransaction(); + + std::vector result_key; + std::vector result_val; + key = 0; + + for (uint32_t i = 0; i < count; ++i) + { + key = rand64(); + int64_t val = i; + rt->get_greater_equal(std::span((const char*)&key, 8), &result_key, + &result_val); + } + }; + rthreads.emplace_back(new std::thread(read_loop)); + } + + for (auto& r : rthreads) + r->join(); + + auto end = std::chrono::steady_clock::now(); + auto delta = end - start; + std::cerr << std::setw(4) << round << std::setw(12) + << add_comma( + int64_t((num_read_threads * count) / + (std::chrono::duration(delta).count() / 1000))) + << " items/sec \n"; + } + */ + } + + if 
(vm.count("rand-write-read")) + { + auto rs = db->createReadSession(); + auto rt = rs->startTransaction(); + + std::cout << "Starting to find lower bound " << rounds << " rounds of " << add_comma(count) + << " random key/values in " << num_read_threads << " threads while writing\n"; + + uint64_t total_writes = 0; + for (uint32_t round = 0; round < rounds; ++round) + { + std::vector> rthreads; + rthreads.reserve(num_read_threads); + + auto start = std::chrono::steady_clock::now(); + std::atomic done = 0; + + for (uint32_t i = 0; i < num_read_threads; ++i) + { + auto read_loop = [&]() + { + triedent::set_current_thread_name("read"); + auto lrs = db->createReadSession(); + + std::vector result_key; + std::vector result_val; + uint64_t key = 0; + + for (uint32_t g = 0; g < group; ++g) + { + auto rt = lrs->startTransaction(); + for (uint32_t i = 0; i < count / group; ++i) + { + key = rand64(); + int64_t val = g * (count / group) + i; + rt->get_greater_equal(std::span((const char*)&key, 8), &result_key, + &result_val); + } + } + // std::this_thread::sleep_for(1000ms); + // rt.reset(); + + ++done; + }; + rthreads.emplace_back(new std::thread(read_loop)); + } + + int64_t writes = 0; + while (done.load() < num_read_threads) + { + if (not read_only) + { + auto wt = ws.startTransaction(); + for (uint32_t i = 0; i < group; ++i) + { + key = rand64(); + int64_t val = key; + auto old_size = + wt->put(std::span((char*)&key, 8), std::span((char*)&val, 8)); + + ++writes; + ++total_writes; + if (done.load(std::memory_order_relaxed) >= num_read_threads) + break; + } + wt->commit(); + } else { + std::this_thread::sleep_for(10ms); + } + } + + auto end = std::chrono::steady_clock::now(); + auto delta = end - start; + std::cerr << std::setw(4) << round << std::setw(12) + << add_comma(int64_t( + (num_read_threads * count) / + (std::chrono::duration(delta).count() / 1000))) + << " read items/sec "; + + for (auto& r : rthreads) + r->join(); + + std::cerr << std::setw(12) + << 
add_comma(int64_t( + (writes) / + (std::chrono::duration(delta).count() / 1000))) + << " write items/sec " + << " items in db: " << add_comma(total_writes) << " \n"; + + if (run_validate) + ws.validate(); + if (not run_compactor) + { + while (db->compact()) + { + } + } + } + } + + /* + auto read_loop = [&]( int c ){ + auto rs = db->createReadSession(); // thread-local access to read db + + while( not done.load( std::memory_order_acquire ) ) { + auto rt = rs.startTransaction(); // grabs a snapshot + + rt-> + } + }; + */ + + return 0; +} diff --git a/libraries/triedent/src/cache_allocator.cpp b/libraries/triedent/src/cache_allocator.cpp index 1c7348725..623217810 100644 --- a/libraries/triedent/src/cache_allocator.cpp +++ b/libraries/triedent/src/cache_allocator.cpp @@ -21,20 +21,13 @@ namespace triedent [this]() { thread_name("swap"); -#ifndef __APPLE__ - pthread_setname_np(pthread_self(), "swap"); -#else // if __APPLE__ - pthread_setname_np("swap"); -#endif + set_current_thread_name( "swap" ); swap_loop(); }); + _gc_thread = std::thread{[this] { -#ifndef __APPLE__ - pthread_setname_np(pthread_self(), "swap"); -#else // if __APPLE__ - pthread_setname_np("swap"); -#endif + set_current_thread_name( "gc" ); _gc.run(&_done); }}; } @@ -63,8 +56,8 @@ namespace triedent bool cache_allocator::swap(gc_session& session) { - constexpr uint64_t target = 1024 * 1024 * 40ull; - constexpr std::uint64_t min_target = 1024 * 1024 * 33ull; + constexpr uint64_t target = 1024 * 1024 * 256ull; + constexpr std::uint64_t min_target = 1024 * 1024 * 128ull; bool did_work = false; auto do_swap = [&](auto& from, auto& to) { @@ -82,6 +75,8 @@ namespace triedent // if (auto lock = _obj_ids.lock({.id = o->id}, loc)) { + // note swap will fail on lock contention, will not wait for + // free space at the next level down. 
void* p = to.try_allocate(sl, lock.get_id(), o->size, [&](void* ptr, object_location newloc) { diff --git a/libraries/triedent/src/database.cpp b/libraries/triedent/src/database.cpp index 8f081c49c..692f4ae2d 100644 --- a/libraries/triedent/src/database.cpp +++ b/libraries/triedent/src/database.cpp @@ -1,5 +1,6 @@ #include #include +#include namespace triedent { @@ -7,9 +8,7 @@ namespace triedent const config& cfg, access_mode mode, bool allow_gc) - : _ring{dir / "data", cfg, mode, allow_gc}, - _file{dir / "db", mode}, - _root_release_session{_ring} + : _sega{dir}, _file{dir / "db", mode}, _root_release_session{_sega}, _config(cfg) { if (_file.size() == 0) { @@ -27,6 +26,9 @@ namespace triedent throw std::runtime_error("Not a triedent file: " + (dir / "db").native()); if ((_dbm->flags & file_type_mask) != file_type_database_root) throw std::runtime_error("Not a triedent db file: " + (dir / "db").native()); + if( cfg.run_compact_thread ) + _sega.start_compact_thread(); + } database::database(const std::filesystem::path& dir, access_mode mode, bool allow_gc) @@ -34,7 +36,9 @@ namespace triedent { } - database::~database() {} + database::~database() { + + } void database::create(std::filesystem::path dir, config cfg) { @@ -44,11 +48,11 @@ namespace triedent std::filesystem::create_directories(dir / "data"); - (void)database{dir, cfg, access_mode::read_write}; + std::make_shared(dir, cfg, access_mode::read_write); } void database::print_stats(std::ostream& os, bool detail) { - _ring.print_stats(os, detail); + _sega.dump(); } } // namespace triedent diff --git a/libraries/triedent/src/gc_queue.cpp b/libraries/triedent/src/gc_queue.cpp index 34587cab5..41fad9ad8 100644 --- a/libraries/triedent/src/gc_queue.cpp +++ b/libraries/triedent/src/gc_queue.cpp @@ -143,6 +143,7 @@ namespace triedent // \post // for each index in [start, R): // either U happens before W or P happens before L + // \return the sequence of the session with the lowest sequence gc_queue::size_type 
gc_queue::start_wait(size_type start, size_type end) { std::size_t lowest_sequence = end; diff --git a/libraries/triedent/src/mapping.cpp b/libraries/triedent/src/mapping.cpp index bdab61888..295f4d496 100644 --- a/libraries/triedent/src/mapping.cpp +++ b/libraries/triedent/src/mapping.cpp @@ -9,6 +9,8 @@ #include #include +#include + namespace triedent { namespace @@ -79,12 +81,15 @@ namespace triedent { _data = addr; try_pin(&_pinned, addr, _size); + // std::cerr<<"madvise random " << int64_t(addr) <<" " << _size << " \n"; + // madvise(addr, _size, MADV_RANDOM ); } else { ::close(_fd); throw std::system_error{errno, std::generic_category()}; } + } } diff --git a/libraries/triedent/src/seg_allocator.cpp b/libraries/triedent/src/seg_allocator.cpp new file mode 100644 index 000000000..2e214ab81 --- /dev/null +++ b/libraries/triedent/src/seg_allocator.cpp @@ -0,0 +1,522 @@ +#include +#include + +namespace triedent +{ + seg_allocator::seg_allocator(std::filesystem::path dir) + : _id_alloc(dir / "ids"), + _block_alloc(dir / "segs", segment_size, max_segment_count), + _header_file(dir / "header", access_mode::read_write, true) + { + if (_header_file.size() == 0) + { + _header_file.resize(round_to_page(sizeof(mapped_memory::allocator_header))); + new (_header_file.data()) mapped_memory::allocator_header(); + } + _header = reinterpret_cast(_header_file.data()); + + for (auto& sptr : _session_ptrs) + sptr.store(-1ull); + _done.store(false); + } + + seg_allocator::~seg_allocator() + { + cses.reset(); + _done.store(true); + if (_compact_thread.joinable()) + _compact_thread.join(); + } + + void seg_allocator::start_compact_thread() + { + if (not _compact_thread.joinable()) + { + _compact_thread = std::thread( + [this]() + { + thread_name("compactor"); + set_current_thread_name("compactor"); + compact_loop(); + }); + } + } + + /** + * This must be called via a session because the session is responsible + * for documenting what regions could be read + * + * All objects are 
const because they cannot be modified after being + * written. + const object_header* seg_allocator::get_object(object_location loc) const + { + return nullptr; + } + const object_header* seg_allocator::get_object(object_id oid) const + { + return nullptr; + } + */ + + /** + * After all writes are complete, and there is not enough space + * to allocate the next object the alloc_ptr gets set to MAX and + * the page gets + */ + void seg_allocator::finalize_segment(segment_number) + { + /// add maxsegsize - (seg_end-alloc_ptr) to free space + /// set seg.alloc_ptr = max + /// set seg as read only + /// mark seg as random access if average object size is + /// less than 2x page size. + /// mark seg as seq access if average object size is greater than 1mb + /// else mark seg as normal access + } + + /** + * After all data has been removed from a segment + * - madvise free/don't need + * - add the segment number to the free segments at allocator_header::end_ptr + * - increment allocator_header::end_ptr + */ + void seg_allocator::release_segment(segment_number) {} + + void seg_allocator::compact_loop() + { + using namespace std::chrono_literals; + if (not cses) + cses.emplace(start_session()); + + while (not _done.load()) + { + if (not compact_next_segment()) + { + /* + std::cerr << "sleeping because most seg: " << most_empty_seg_num + << " empty: " << most_empty_seg_free << " " + << 100 * most_empty_seg_free / double(segment_size) << "\n"; + */ + using namespace std::chrono_literals; + std::this_thread::sleep_for(100ms); + } + + // find most empty segment + // move it to my own personal write session + // add it to the free segment queue + } + } + + bool seg_allocator::compact_next_segment() + { + if (not cses) + cses.emplace(start_session()); + + uint64_t most_empty_seg_num = -1ll; + uint64_t most_empty_seg_free = 0; + auto total_segs = _block_alloc.num_blocks(); + auto oldest = -1ul; + for (uint32_t s = 0; s < total_segs; ++s) + { + auto fso = 
_header->seg_meta[s].get_free_space_and_objs(); + if (fso.first > most_empty_seg_free) + if (fso.first > segment_size / 8) // most_empty_seg_free) + { + auto seg = get_segment(s); + // only consider segs that are not actively allocing + // or that haven't already been processed + if (seg->_alloc_pos.load(std::memory_order_relaxed) == uint32_t(-1)) + { + // if (seg->_age <= oldest) + { + most_empty_seg_num = s; + most_empty_seg_free = fso.first; + oldest = seg->_age; + } + } + } + } + + // segments must be at least 25% empty before compaction is considered + if (most_empty_seg_num == -1ull or most_empty_seg_free < segment_size / 16) + { + return false; + } + + compact_segment(*cses, most_empty_seg_num); + return true; + } + + void seg_allocator::compact_segment(session& ses, uint64_t seg_num) + { + auto state = ses.lock(); + auto s = get_segment(seg_num); + auto send = (object_header*)((char*)s + segment_size); + char* foc = (char*)s + sizeof(mapped_memory::segment_header); + object_header* foo = (object_header*)(foc); + + /* + std::cerr << "compacting segment: " << seg_num << " into " << ses._alloc_seg_num << " " + << "seg free: " << _header->seg_meta[seg_num].get_free_space_and_objs().first << " " + << "seg alloc_pos: " << s->_alloc_pos <<" "; + if( ses._alloc_seg_ptr ) { + std::cerr << "calloc: " << ses._alloc_seg_ptr->_alloc_pos <<" cfree: " << _header->seg_meta[ses._alloc_seg_num].get_free_space_and_objs().first <<"\n"; + } else std::cerr<<"\n"; + */ + + assert(s->_alloc_pos == segment_offset(-1)); + // std::cerr << "seg " << seg_num <<" alloc pos: " << s->_alloc_pos <<"\n"; + + auto seg_state = seg_num * segment_size; + auto seg_end = (seg_num + 1) * segment_size; + + auto start_seg_ptr = ses._alloc_seg_ptr; + auto start_seg_num = ses._alloc_seg_num; + + madvise(s, segment_size, MADV_SEQUENTIAL); + while (foo < send and foo->id) + { + // if the object has been deleted, skip it + if (foo->check == uint32_t(-1)) + { + foo = foo->next(); + continue; + } + + // 
skip anything that has been freed + // note the ref can go to 0 before foo->check is set to -1 + auto obj_ref = state.get({foo->id}); + if (obj_ref.ref_count() == 0) + { + foo = foo->next(); + continue; + } + + // skip anything that isn't pointing + // to foo, it may have been moved *or* + // it may have been freed and the id reallocated to + // another object. We cannot replace this with obj_ref.obj() == foo + // because obj_ref could be pointing to an ID in the free list + auto foo_idx = (char*)foo - (char*)s; + auto current_loc = obj_ref.location(); + if (current_loc._offset != seg_num * segment_size + foo_idx) + { + foo = foo->next(); + continue; + } + + // attempt to move the object requires a lock because the + // object could be modified in place while trying to move it. + { + // lock the ID to prevent anyone else from moving or modifying it while we copy + std::unique_lock ul(obj_ref.get_mutex()); + + // reload the atomic variable and check the invariant that it is + // still pointing at us after the lock. + obj_ref.refresh(); + + auto foo_idx = (char*)foo - (char*)s; + auto expect_loc = obj_ref.location()._offset; + if ((expect_loc & (segment_size - 1)) != foo_idx or obj_ref.ref_count() == 0) + { + foo = foo->next(); + continue; + } + + // the object hasn't moved nor has its ref count gone to zero so + // we commit to alloc memory and memcpy the data + auto obj_size = foo->object_size(); + auto [loc, ptr] = ses.alloc_data(obj_size, {foo->id}, foo->get_type()); + memcpy(ptr, foo, obj_size); + + // get an object_header* to the newly move object to run some checks + auto moved_foo = ((object_header*)ptr); + + // release() does not grab the lock, so while we were copying the + // object may have been released and foo->check set to -1 + if (moved_foo->check == uint32_t(-1)) + { + // since we alocated data, we need to indicate that it is not being + // used. 
TODO: investigating resetting the alloc_ptr by -foo->object_size() + _header->seg_meta[start_seg_num].free_object(foo->object_size()); + } + + // after moving the data, check to make sure that the checksum is still + // valid. This will difinitively prove that a clean copy was made. + else if (not moved_foo->validate_checksum()) + { + bool source_still_valid = foo->validate_checksum(); + // if it was invalid it means a modification in place was made without a lock + // it could also mean memory corruption in the application and this error + // should be raised to the user TODO: how to report errors from the + // background process + std::cerr << foo->id << ": mv checksum invalid: '" << moved_foo->check << "' src check: "<check <<" src valid:"<seg_meta[start_seg_num].free_object(foo->object_size()); + } + // try move compare and exchange + else if (not obj_ref.move({expect_loc}, loc)) + { + // if it failed because the object was released or moved by + // someone else, then note the free space and move on with life + _header->seg_meta[start_seg_num].free_object(foo->object_size()); + } + } // end lock scope + + // if ses.alloc_data() was forced to make space in a new segment + // then we need to sync() the old write segment before moving forward + if (not start_seg_ptr) + { + start_seg_ptr = ses._alloc_seg_ptr; + start_seg_num = ses._alloc_seg_num; + } + else if (start_seg_ptr != ses._alloc_seg_ptr) + { + // TODO: only sync from alloc pos at last sync + msync(start_seg_ptr, segment_size, MS_SYNC); + _header->seg_meta[start_seg_num]._last_sync_pos.store(segment_size, + std::memory_order_relaxed); + start_seg_ptr = ses._alloc_seg_ptr; + start_seg_num = ses._alloc_seg_num; + } + foo = foo->next(); + } + + // in order to maintain the invariant that the segment we just cleared + // can be reused, we must make sure that the data we moved out has persisted to + // disk. 
+ if (start_seg_ptr) + { + if (-1 == msync(start_seg_ptr, start_seg_ptr->_alloc_pos, MS_SYNC)) + { + std::cerr << "msync errorno: " << errno << "\n"; + } + _header->seg_meta[seg_num]._last_sync_pos.store(start_seg_ptr->_alloc_pos, + std::memory_order_relaxed); + } + + s->_num_objects = 0; + s->_alloc_pos = 0; + s->_age = -1; + // the segment we just cleared, so its free space and objects get reset to 0 + // and its last_sync pos gets put to the end because there is no need to sync it + // because its data has already been synced by the compactor + _header->seg_meta[seg_num].clear(); + + munlock(s, segment_size); + // it is unlikely to be accessed, and if it is don't pre-fetch + madvise(s, segment_size, MADV_RANDOM); + //madvise(s, segment_size, MADV_DONTNEED); + + // only one thread can move the end_ptr or this will break + // std::cerr<<"done freeing end_ptr: " << _header->end_ptr.load() <<" <== " << seg_num <<"\n"; + _header->free_seg_buffer[_header->end_ptr.load(std::memory_order_relaxed) & (max_session_count-1)] = seg_num; + _header->end_ptr.fetch_add(1, std::memory_order_release); + // + } + + /** + * The min read pointer, aka min(R*), must be A <= R* <= E. + * A, R, and E only ever increase + * The last value of this function is stored in _min_read_ptr + * + * So long as the last value is greater than A, A can advance without + * updating _min_read_ptr; however, if A >= _min_read_ptr then + * we want to check all active R* to find the min. If all sessions + * are idle, the the min becomes E. + */ + uint64_t seg_allocator::get_min_read_ptr() + { + auto ap = _header->alloc_ptr.load(std::memory_order_relaxed); + auto ep = _header->end_ptr.load(std::memory_order_acquire); + auto min = _min_read_ptr.load(std::memory_order_acquire); + + if (ap >= min) // then check to see if there is more + { + min = ep; + // find new last min + // TODO: only iterate over active sessions instead of all sessions + // this is so infrequent it probably doesn't matter. 
+ auto fs = ~_free_sessions.load(); + auto num_ses = std::popcount(fs); + for (uint32_t i = 0; fs and i < max_session_count; ++i) + { + if (fs & (1ull << i)) + { + if (auto p = _session_ptrs[i].load(std::memory_order_relaxed); p < min) + { + min = p; + } + + // we can't find anything lower than this + if (min == ap) + { + _min_read_ptr.store(min, std::memory_order_release); + return min; + } + } + } + } + if (min > ep) + min = ep; + _min_read_ptr.store(min, std::memory_order_release); + return min; + } + + /** + * reads allocator_header::reuse_ptr and if it is less than + * allocator_header::min_read_ptr then attempts increment the + * reuse pointer by exactly 1, if so then it uses the segment + * at _free_segments[reuse_ptr.old] + * + * If reuse_ptr == min_read_ptr then advance the alloc_ptr by + * segment_size to claim a new segment. + * + * + */ + std::pair seg_allocator::get_new_segment() + { + auto ap = _header->alloc_ptr.load(std::memory_order_relaxed); + auto min = get_min_read_ptr(); + + auto prepare_segment = [&](segment_number sn) + { + auto sp = _block_alloc.get(sn); + madvise(sp, segment_size, MADV_FREE); // zero's pages if they happen to be accessed + madvise(sp, segment_size, MADV_RANDOM); + + auto r = mlock(sp, segment_size); + + if (r) + std::cerr << "MLOCK: " << r << " " << EINVAL << " " << EAGAIN << "\n"; + + //memset(sp, 0, segment_size); // TODO: is this necessary? + + auto shp = new (sp) mapped_memory::segment_header(); + shp->_age = _header->next_alloc_age.fetch_add(1, std::memory_order_relaxed); + + return std::pair(sn, shp); + }; + // std::cout <<"get new seg ap: " << ap << " min: " << min <<" min-ap:" << min - ap << "\n"; + + while (min - ap >= 1) + { + if (_header->alloc_ptr.compare_exchange_weak(ap, ap + 1)) + { + auto free_seg = _header->free_seg_buffer[ap]; + _header->free_seg_buffer[ap] = segment_number(-1); + // std::cerr << "reusing segment..." 
<< free_seg <<"\n"; + return prepare_segment(free_seg); + } + } + return prepare_segment(_block_alloc.alloc()); + } + void seg_allocator::sync(sync_type st) + { + if (st == sync_type::none) + return; + + auto total_segs = _block_alloc.num_blocks(); + + for (uint32_t i = 0; i < total_segs; ++i) + { + auto seg = get_segment(i); + auto last_sync = _header->seg_meta[i]._last_sync_pos.load(std::memory_order_relaxed); + auto last_alloc = seg->_alloc_pos.load(std::memory_order_relaxed); + + if (last_alloc > segment_size) + last_alloc = segment_size; + + static const uint64_t page_size = getpagesize(); + static const uint64_t page_size_mask = ~(page_size - 1); + + auto sync_bytes = last_alloc - (last_sync & page_size_mask); + auto seg_sync_ptr = (((intptr_t)seg + last_sync) & page_size_mask); + + if (last_alloc > last_sync) + { + if (-1 == msync((char*)seg_sync_ptr, sync_bytes, msync_flag(st))) + { + std::cerr << "ps: " << getpagesize() << " len: " << sync_bytes << " rounded: \n"; + std::cerr << "msync errno: " << std::string(strerror(errno)) + << " seg_alloc::sync() seg: " << i << "\n"; + } + _header->seg_meta[i]._last_sync_pos.store(last_alloc); + } + } + } + + void seg_allocator::dump() + { + std::cerr << "\n--- segment allocator state ---\n"; + auto total_segs = _block_alloc.num_blocks(); + auto total_retained = 0; + uint64_t total_free_space = 0; + std::cerr << "total segments: " << total_segs << "\n"; + std::cerr << std::setw(6) << "#" + << " | "; + std::cerr << std::setw(8) << "freed %" + << " | "; + std::cerr << std::setw(12) << "freed bytes" + << " | "; + std::cerr << std::setw(12) << "freed obj" + << " | "; + std::cerr << std::setw(12) << "alloc pos" + << " | "; + std::cerr << std::setw(12) << "alloced obj" + << " | "; + std::cerr << std::setw(12) << "num obj" + << " | "; + std::cerr << std::setw(8) << "age" + << " \n"; + for (uint32_t i = 0; i < total_segs; ++i) + { + auto seg = get_segment(i); + auto space_objs = _header->seg_meta[i].get_free_space_and_objs(); 
+ + std::cerr << std::setw(6) << i << " | "; + std::cerr << std::setw(8) << int(100 * double(space_objs.first) / segment_size) << " | "; + total_free_space += space_objs.first; + std::cerr << std::setw(12) << space_objs.first << " | "; + std::cerr << std::setw(12) << space_objs.second << " | "; + std::cerr << std::setw(12) + << (seg->_alloc_pos == -1 ? "END" : std::to_string(seg->_alloc_pos)) << " | "; + std::cerr << std::setw(12) << seg->_num_objects << " | "; + total_retained += seg->_num_objects - space_objs.second; + std::cerr << std::setw(12) << seg->_num_objects - space_objs.second << " | "; + std::cerr << std::setw(8) << seg->_age << " \n"; + } + std::cerr << "total free: " << total_free_space / 1024 / 1024. << "Mb " + << (100 * total_free_space / double(total_segs * segment_size)) << "%\n"; + std::cerr << "total retained: " << total_retained << " objects\n"; + + std::cerr << "---- free segment Q ------\n"; + std::cerr << "[---A---R*---E------]\n"; + std::cerr << "A - alloc idx: " << _header->alloc_ptr.load() << "\n"; + for (uint32_t i = 0; i < max_session_count; ++i) + { + if (auto p = _session_ptrs[i].load(); p != -1ull) + std::cerr << "R" << i << ": " << p << "\n"; + } + + std::cerr << "E - end idx: " << _header->end_ptr.load() << "\n"; + + auto fs = ~_free_sessions.load(); + auto num_ses = std::popcount(fs); + std::cerr << "active sessions: " << num_ses << "\n"; + for (uint32_t i = 0; i < max_session_count; ++i) + { + if (fs & (1ull << i)) + { + if (auto p = _session_ptrs[i].load(); p == -1ull) + std::cerr << "R" << i << ": UNLOCKED \n"; + } + } + + std::cerr << "------- pending free segments -----------\n"; + for (auto x = _header->alloc_ptr.load(); x < _header->end_ptr.load(); ++x) + { + std::cerr << x << "] " << _header->free_seg_buffer[x & (max_segment_count - 1)] << "\n"; + } + std::cerr << "--------------------------\n"; + } +}; // namespace triedent diff --git a/libraries/triedent/test/CMakeLists.txt b/libraries/triedent/test/CMakeLists.txt index 
287207b1b..423ed44a8 100644 --- a/libraries/triedent/test/CMakeLists.txt +++ b/libraries/triedent/test/CMakeLists.txt @@ -17,3 +17,10 @@ add_executable(triedent-tests-bigdb big.cpp) target_link_libraries(triedent-tests-bigdb PUBLIC Boost::program_options triedent) target_include_directories(triedent-tests-bigdb PUBLIC ${Boost_INCLUDE_DIRS}) set_target_properties(triedent-tests-bigdb PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${ROOT_BINARY_DIR}) + +add_executable(dtester dtester.cpp) +target_link_libraries(dtester PUBLIC Boost::program_options triedent) +target_include_directories(dtester PUBLIC ${Boost_INCLUDE_DIRS}) +set_target_properties(dtester PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${ROOT_BINARY_DIR}) + + diff --git a/libraries/triedent/test/big.cpp b/libraries/triedent/test/big.cpp index 66a848858..0d4a7c66b 100644 --- a/libraries/triedent/test/big.cpp +++ b/libraries/triedent/test/big.cpp @@ -46,6 +46,7 @@ int main(int argc, char** argv) auto opt = desc.add_options(); opt("help,h", "print this message"); opt("reset", "reset the database"); + opt("read-only", "just query existing db"); opt("sparce", po::value(&use_string)->default_value(false), "use sparse string keys"); opt("data-dir", po::value(&db_dir)->default_value("./big.dir"), "the folder that contains the database"); @@ -86,6 +87,10 @@ int main(int argc, char** argv) .cool_bytes = 1ull << cool_page_c, .cold_bytes = 1ull << cold_page_c}); } + bool read_only = false; + if (vm.count("read-only")) { + read_only = true; + } if (num_read_threads > 64) { @@ -152,7 +157,7 @@ int main(int argc, char** argv) while (r.load(std::memory_order_relaxed) == v) { uint64_t h = (uint64_t(gen()) << 32) | gen(); - bool found = rs->get_less_than(rr, std::string_view((char*)&h, sizeof(h)), &found_key, &found_value, &result_roots); + bool found = rs->get_less_than(rr, std::string_view((char*)&h, sizeof(h)), &found_key, &found_value ); if (found) { ++total_lookups[c].total_lookups; } @@ -298,32 +303,36 @@ int main(int argc, char** argv) 
if (i < total) { - //base.emplace( std::make_pair(k,std::string((char*)&h, sizeof(h))) ); - if (use_string) - { - if (check_content) - comparison_map[str] = str; - int inserted; - inserted = s->upsert(root, str, str); - if (inserted >= 0) + if( read_only ) { + usleep( 2 ); + } else { + //base.emplace( std::make_pair(k,std::string((char*)&h, sizeof(h))) ); + if (use_string) { - // TRIEDENT_WARN("failed to insert: ", h); - break; + if (check_content) + comparison_map[str] = str; + int inserted; + inserted = s->upsert(root, str, str); + if (inserted >= 0) + { + // TRIEDENT_WARN("failed to insert: ", h); + break; + } + assert(inserted < 0); } - assert(inserted < 0); - } - else - { - if (check_content) - comparison_map[(std::string)hk] = (std::string)hk; - int inserted; - inserted = s->upsert(root, hk, hk); - if (inserted >= 0) + else { - // TRIEDENT_WARN("failed to insert: ", h); - break; + if (check_content) + comparison_map[(std::string)hk] = (std::string)hk; + int inserted; + inserted = s->upsert(root, hk, hk); + if (inserted >= 0) + { + // TRIEDENT_WARN("failed to insert: ", h); + break; + } + assert(inserted < 0); } - assert(inserted < 0); } } } diff --git a/libraries/triedent/test/dtester.cpp b/libraries/triedent/test/dtester.cpp new file mode 100644 index 000000000..5bf1673ea --- /dev/null +++ b/libraries/triedent/test/dtester.cpp @@ -0,0 +1,150 @@ +#include +#include +#include +#include +#include +#include +using namespace std::chrono_literals; + +using namespace triedent; + +int main(int argc, char** argv) +{ + try + { + std::vector result; + std::filesystem::remove_all("big.dir"); + std::filesystem::create_directories("big.dir"); + auto db = std::make_shared("big.dir", read_write); + auto ws = db->start_write_session(); + auto top = ws->get_top_root(); + auto r = ws->upsert(top, "key", "val" ); + std::cerr<< "old size: " << r <<"\n"; + auto r2 = ws->get(top, "key", &result ); + std::cerr<< "found: " << r2 <<" " << result.data() <<"\n"; + auto r3 = 
ws->upsert(top, "bottom", "dollar" ); + auto r4 = ws->get(top, "bottom", &result ); + std::cerr<< "found: " << r4 <<" " << result.data() <<"\n"; + return 0; + + + + std::filesystem::remove("data"); + std::filesystem::remove("ids"); + std::filesystem::remove("header"); + triedent::seg_allocator segs("."); + + std::cerr << "starting session\n"; + auto ss = segs.start_session(); + std::cerr << "locking data before accessing...\n"; + { + auto sl = ss.lock(); + std::cerr << "about to alloc\n"; + // pointers only valid while sl is held + auto oref = sl.alloc(20, triedent::node_type::inner); + std::cout << "oref.id: " << oref.id().id << "\n"; + std::cout << "oref.ref: " << oref.ref_count() << "\n"; + std::cout << "oref.type: " << (int)oref.type() << "\n"; + std::cout << "oref.obj->size: " << (int)oref.obj()->size << "\n"; + std::cout << "oref.obj->cap: " << (int)oref.obj()->data_capacity() << "\n"; + std::cout << "oref.obj->id: " << (int)oref.obj()->id << "\n"; + std::cout << "oref.loc->seg: " << (int)oref.loc().segment() << "\n"; + std::cout << "oref.loc->idx: " << (int)oref.loc().index() << "\n"; + auto oref2 = sl.alloc(25, triedent::node_type::inner); + std::cout << "oref2.id: " << oref2.id().id << "\n"; + std::cout << "oref2.ref: " << oref2.ref_count() << "\n"; + std::cout << "oref2.type: " << (int)oref2.type() << "\n"; + std::cout << "oref2.obj->size: " << (int)oref2.obj()->size << "\n"; + std::cout << "oref2.obj->cap: " << (int)oref2.obj()->data_capacity() << "\n"; + std::cout << "oref2.obj->id: " << (int)oref2.obj()->id << "\n"; + std::cout << "oref2.loc->seg: " << (int)oref2.loc().segment() << "\n"; + std::cout << "oref2.loc->idx: " << (int)oref2.loc().index() << "\n"; + + auto oref3 = sl.alloc(25, triedent::node_type::inner); + std::cout << "oref3.id: " << oref3.id().id << "\n"; + std::cout << "oref3.ref: " << oref3.ref_count() << "\n"; + std::cout << "oref3.type: " << (int)oref3.type() << "\n"; + std::cout << "oref3.obj->size: " << (int)oref3.obj()->size << 
"\n"; + std::cout << "oref3.obj->cap: " << (int)oref3.obj()->data_capacity() << "\n"; + std::cout << "oref3.obj->id: " << (int)oref3.obj()->id << "\n"; + std::cout << "oref3.loc->seg: " << (int)oref3.loc().segment() << "\n"; + std::cout << "oref3.loc->idx: " << (int)oref3.loc().index() << "\n"; + + std::vector> objs; + + for (uint32_t i = 0; i < 260; ++i) + { + auto oref3 = sl.alloc(1024 * 1024, triedent::node_type::inner); + objs.push_back(oref3); + } + + segs.dump(); + + std::cerr << "test release\n"; + oref3.release(); + + segs.dump(); + + std::cerr << "freeing half the objects"; + for (uint32_t i = 0; i < objs.size() / 2; ++i) + objs[i * 2].release(); + segs.dump(); + + std::cerr << "waiting on compact loop\n"; + // for (uint32_t i = 0; i < objs.size() / 2; ++i) +// + std::this_thread::sleep_for(1000ms); + + segs.dump(); + } + std::cerr<<"after lock release\n"; + segs.dump(); + + { + auto sl = ss.lock(); + std::cerr<<"after lock reopened\n"; + segs.dump(); + + std::vector> objs; + for (uint32_t i = 0; i < 260; ++i) + { + auto oref3 = sl.alloc(1024 * 1024, triedent::node_type::inner); + objs.push_back(oref3); + } + std::cerr<<"after a bunch of alloc \n"; + segs.dump(); + } + + + /* + triedent::id_allocator oa("test_file.dat"); + + auto ses = oa.start_session(); + + srand(time(nullptr)); + + for (uint32_t r = 0; r < 100; ++r) + { + auto start = std::chrono::steady_clock::now(); + uint64_t count = 1000 * 1000ull * 5; + + for (uint32_t i = 0; i < count; ++i) + { + ses.get_new_id(); + } + auto end = std::chrono::steady_clock::now(); + auto delta = end - start; + + std::cerr << std::setw(12) + << int64_t(count / + (std::chrono::duration(delta).count() / 1000)) + << " items/sec free: " << oa.get_free_count() <<" cap: " << oa.get_capacity() <<"\n"; + } + */ + } + catch (std::exception& e) + { + std::cerr << "exception: " << e.what() << "\n"; + } + return 0; +}