diff --git a/libraries/triedent/CMakeLists.txt b/libraries/triedent/CMakeLists.txt
index 69ea367d4..0ffef1bc9 100644
--- a/libraries/triedent/CMakeLists.txt
+++ b/libraries/triedent/CMakeLists.txt
@@ -5,12 +5,16 @@ find_package(Threads REQUIRED)
 add_library(triedent
     src/database.cpp
     src/mapping.cpp
-    src/gc_queue.cpp
-    src/ring_allocator.cpp
-    src/region_allocator.cpp
-    src/cache_allocator.cpp)
+    src/seg_allocator.cpp
+#    src/gc_queue.cpp
+#    src/ring_allocator.cpp
+#    src/region_allocator.cpp
+#    src/cache_allocator.cpp
+            )
 target_include_directories(triedent PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include ${Boost_INCLUDE_DIRS})
 target_link_libraries(triedent PUBLIC Threads::Threads)
+#target_compile_options(triedent PUBLIC -fsanitize=thread )
+#target_link_options(triedent PUBLIC -fsanitize=thread )
 
 if (CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(X86_64)|(amd64)|(AMD64)")
    if( NOT APPLE )
@@ -22,9 +26,6 @@ if (CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(X86_64)|(amd64)|(AMD64)")
    endif()
 endif()
 
-add_executable(mermaid src/mermaid.cpp)
-target_link_libraries(mermaid PUBLIC Boost::program_options triedent)
-target_include_directories(mermaid PUBLIC ${Boost_INCLUDE_DIRS} ${CMAKE_CURRENT_SOURCE_DIR}/include)
-set_target_properties(mermaid PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${ROOT_BINARY_DIR})
 
+add_subdirectory(programs)
 add_subdirectory(test)
diff --git a/libraries/triedent/include/triedent/block_allocator.hpp b/libraries/triedent/include/triedent/block_allocator.hpp
new file mode 100644
index 000000000..4434f6316
--- /dev/null
+++ b/libraries/triedent/include/triedent/block_allocator.hpp
@@ -0,0 +1,226 @@
+#pragma once
+#include <filesystem>
+#include <memory>
+#include <utility>
+
+#include <atomic>
+#include <cassert>
+#include <cerrno>
+#include <cstdint>
+#include <mutex>
+#include <stdexcept>
+#include <system_error>
+
+#include <fcntl.h>
+#include <sys/file.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include <iostream>
+#include <vector>
+
+#include <triedent/mapping.hpp>
+
+namespace triedent
+{
+
+   /**
+    * Presents a file as an array of fixed-size, memory-mapped blocks.
+    *
+    * Blocks already in the file are mapped by the constructor; alloc() grows
+    * the file by one block and maps it.  Pointers are kept in a table that is
+    * allocated once with max_blocks entries and never reallocated, so get()
+    * can read it without taking the resize mutex.
+    */
+   class block_allocator
+   {
+     public:
+      using id = uint32_t;
+
+      /**
+       * Opens (creating it when read_write is set) and locks the block file,
+       * then maps every block it already contains.
+       *
+       * @param file       path of the block file
+       * @param block_size size of one block in bytes; must be non-zero
+       * @param max_blocks capacity of the block pointer table
+       * @param read_write open O_RDWR|O_CREAT with an exclusive flock when true,
+       *                   O_RDONLY with a shared flock and read-only pages otherwise
+       *
+       * @throws std::invalid_argument if block_size is zero
+       * @throws std::runtime_error if the file cannot be opened, its size is not
+       *         a multiple of block_size, or it holds more than max_blocks blocks
+       * @throws std::system_error if flock, fstat or mmap fails
+       */
+      block_allocator(std::filesystem::path file,
+                      uint64_t              block_size,
+                      uint32_t              max_blocks,
+                      bool                  read_write = true)
+          : _filename(file),
+            _block_size(block_size),
+            _max_blocks(max_blocks),
+            _read_write(read_write),
+            _block_mapping(std::make_unique<char_ptr[]>(max_blocks))
+      {
+         if (block_size == 0)
+            throw std::invalid_argument("block size must be non-zero");
+
+         int flags = O_CLOEXEC;
+         int flock_operation;
+         if (read_write)
+         {
+            flags |= O_RDWR;
+            flags |= O_CREAT;
+            flock_operation = LOCK_EX;
+         }
+         else
+         {
+            flags |= O_RDONLY;
+            flock_operation = LOCK_SH;
+         }
+
+         _fd = ::open(file.native().c_str(), flags, 0644);
+         if (_fd == -1)
+         {
+            std::cerr << "opening " << file.native() << "\n";
+            throw std::runtime_error("unable to open block file");
+         }
+
+         if (::flock(_fd, flock_operation | LOCK_NB) != 0)
+            close_and_throw_errno();
+
+         struct stat statbuf[1];
+         if (::fstat(_fd, statbuf) != 0)
+            close_and_throw_errno();
+
+         _file_size = statbuf->st_size;
+         if (_file_size % block_size != 0)
+            close_and_throw("block file isn't a multiple of block size");
+         // the pointer table only has room for max_blocks entries
+         if (_file_size / block_size > _max_blocks)
+            close_and_throw("block file holds more blocks than max_blocks");
+
+         if (_file_size)
+         {
+            auto addr = ::mmap(nullptr, _file_size, map_prot(), MAP_SHARED, _fd, 0);
+            if (addr == MAP_FAILED)
+               close_and_throw_errno();
+
+            // one contiguous mapping, recorded as one pointer per block
+            char* data = static_cast<char*>(addr);
+            char* end  = data + _file_size;
+            for (; data != end; data += _block_size)
+               _block_mapping[_num_blocks.fetch_add(1, std::memory_order_relaxed)] = data;
+         }
+      }
+
+      block_allocator(const block_allocator&)            = delete;
+      block_allocator& operator=(const block_allocator&) = delete;
+
+      /// Unmaps every block and closes the file descriptor.
+      ~block_allocator()
+      {
+         if (_fd != -1)
+         {
+            const uint64_t nb = _num_blocks.load();
+            for (uint64_t i = 0; i < nb; ++i)
+               ::munmap(_block_mapping[i], _block_size);
+            ::close(_fd);
+         }
+      }
+
+      uint64_t block_size() const { return _block_size; }
+      uint64_t num_blocks() const { return _num_blocks.load(std::memory_order_relaxed); }
+
+      /**
+       * This method brute forces syncing all blocks which likely
+       * flushes more than needed.
+       */
+      void sync(sync_type st)
+      {
+         if (_fd != -1 and sync_type::none != st)
+         {
+            const uint64_t nb = num_blocks();
+            for (uint64_t i = 0; i < nb; ++i)
+               ::msync(_block_mapping[i], _block_size, msync_flag(st));
+         }
+      }
+
+      /// Returns the base pointer of mapped block i; i must be below num_blocks().
+      inline void* get(id i)
+      {
+         assert(i < _num_blocks.load(std::memory_order_relaxed));
+         // safe without the mutex: the table is sized for max_blocks up front
+         // and entries are never moved once written
+         return _block_mapping[i];
+      }
+
+      /**
+       * Extends the file by one block, maps it and returns its id.
+       *
+       * @throws std::runtime_error if max_blocks is reached or mmap fails
+       * @throws std::system_error if ftruncate fails
+       */
+      id alloc()
+      {
+         std::lock_guard l{_resize_mutex};
+
+         // check capacity first so a full table never grows the file or leaks a mapping
+         const uint64_t nb = _num_blocks.load(std::memory_order_relaxed);
+         if (nb == _max_blocks)
+            throw std::runtime_error("maximum block number reached");
+
+         const uint64_t new_size = _file_size + _block_size;
+         if (::ftruncate(_fd, new_size) < 0)
+            throw std::system_error(errno, std::generic_category());
+
+         auto addr = ::mmap(nullptr, _block_size, map_prot(), MAP_SHARED, _fd, _file_size);
+         if (addr == MAP_FAILED)
+         {
+            // roll the file back to its previous size before reporting failure
+            if (::ftruncate(_fd, _file_size) < 0)
+               throw std::system_error(errno, std::generic_category());
+            throw std::runtime_error("unable to mmap new block");
+         }
+
+         _block_mapping[nb] = static_cast<char*>(addr);
+         _file_size         = new_size;
+         // publish the new count only after the table entry has been written
+         return static_cast<id>(_num_blocks.fetch_add(1, std::memory_order_release));
+      }
+
+     private:
+      using char_ptr = char*;
+
+      /// Page protection matching the mode the file was opened with.
+      int map_prot() const { return _read_write ? (PROT_READ | PROT_WRITE) : PROT_READ; }
+
+      /// Closes the descriptor and throws the errno of the call that just failed.
+      [[noreturn]] void close_and_throw_errno()
+      {
+         const int err = errno;  // ::close may overwrite errno
+         ::close(_fd);
+         _fd = -1;
+         throw std::system_error{err, std::generic_category()};
+      }
+
+      /// Closes the descriptor and throws std::runtime_error(what).
+      [[noreturn]] void close_and_throw(const char* what)
+      {
+         ::close(_fd);
+         _fd = -1;
+         throw std::runtime_error(what);
+      }
+
+      std::filesystem::path       _filename;
+      uint64_t                    _block_size;
+      uint64_t                    _max_blocks;
+      uint64_t                    _file_size = 0;
+      int                         _fd        = -1;
+      bool                        _read_write;
+      std::atomic<uint64_t>       _num_blocks{0};
+      std::unique_ptr<char_ptr[]> _block_mapping;
+      mutable std::mutex          _resize_mutex;
+   };
+}  // namespace triedent
diff --git a/libraries/triedent/include/triedent/cache_allocator.hpp b/libraries/triedent/include/triedent/cache_allocator.hpp
deleted file mode 100644
index 5641a0dc3..000000000
--- a/libraries/triedent/include/triedent/cache_allocator.hpp
+++ /dev/null
@@ -1,190 +0,0 @@
-#pragma once
-
-#include <triedent/gc_queue.hpp>
-#include <triedent/object_db.hpp>
-#include <triedent/region_allocator.hpp>
-#include <triedent/ring_allocator.hpp>
-
-#include <array>
-#include <atomic>
-#include <filesystem>
-#include <span>
-#include <tuple>
-#include <utility>
-
-namespace triedent
-{
-   // Cache allocator manages all storage for the database.
-   //
-   // It maintains multiple buffers and moves accessed data to the hot
-   // buffer. Objects that are not accessed will be moved to successively
-   // lower buffers over time.
-   //
-   // Objects may be moved at any time. All data
-   // reads must be protected by a session lock which ensures that
-   // existing pointers remain valid.  All writes must be protected
-   // by a location_lock, which prevents the data from being moved.
-   class cache_allocator
-   {
-     public:
-      using id = object_id;
-
-      // cold_bytes can grow
-      // hot/warm/cool are fixed
-      // hot/warm/cool/cold MUST be more than twice the
-      // maximum allocation size.
-      struct config
-      {
-         uint64_t hot_bytes  = 1000 * 1000ull;
-         uint64_t warm_bytes = 1000 * 1000ull;
-         uint64_t cool_bytes = 1000 * 1000ull;
-         uint64_t cold_bytes = 1000 * 1000ull;
-      };
-
-      cache_allocator(const std::filesystem::path& path,
-                      const config&                cfg,
-                      access_mode                  mode,
-                      bool                         allow_gc = false);
-      ~cache_allocator();
-
-      auto start_session() { return gc_queue::session{_gc}; }
-
-      bool          bump_count(object_id id) { return _obj_ids.bump_count(id); }
-      location_lock lock(object_id id) { return _obj_ids.lock(id); }
-
-      // WARNING: alloc temporarily unlocks the session, which invalidates
-      // all existing pointers to allocated objects
-      //
-      // WARNING: alloc is blocking. It should not be called while
-      // holding any locks other than the session. It should also
-      // not be called by the swap thread.
-      std::pair<location_lock, void*> alloc(std::unique_lock<gc_queue::session>& session,
-                                            std::size_t                          num_bytes,
-                                            node_type                            type);
-
-      std::pair<void*, node_type> release(session_lock_ref<>, id i);
-
-      // The returned pointer will remain valid until the session lock is released
-      // get_cache is non-blocking.
-      template <bool CopyToHot>
-      std::tuple<void*, node_type, std::uint16_t> get_cache(session_lock_ref<> session, id i);
-
-      std::uint16_t ref(id i) { return _obj_ids.ref(i); }
-
-      static std::uint32_t object_size(void* ptr)
-      {
-         return reinterpret_cast<object_header*>(ptr)[-1].size;
-      }
-
-      bool is_slow() const { return !_obj_ids.pinned() || !hot().pinned() || !warm().pinned(); }
-
-      std::array<std::span<const char>, 5> span() const
-      {
-         return {_obj_ids.span(), hot().span(), warm().span(), cool().span(), cold().span()};
-      }
-
-      bool gc_retain(object_id i) { return _obj_ids.gc_retain(i); }
-      void gc_start() { _obj_ids.gc_start(); }
-      void gc_finish() { _obj_ids.gc_finish(); }
-
-      void validate(id i) { _obj_ids.validate(i); }
-
-      void print_stats(std::ostream& os, bool detail);
-
-     private:
-      bool  swap(gc_session&);
-      void* try_move_object(session_lock_ref<>   session,
-                            ring_allocator&      to,
-                            const location_lock& lock,
-                            void*                data,
-                            std::uint32_t        size);
-
-      void swap_loop();
-
-      ring_allocator&   hot() { return _levels[hot_cache]; }
-      ring_allocator&   warm() { return _levels[warm_cache]; }
-      ring_allocator&   cool() { return _levels[cool_cache]; }
-      region_allocator& cold() { return _cold; }
-
-      const ring_allocator&   hot() const { return _levels[hot_cache]; }
-      const ring_allocator&   warm() const { return _levels[warm_cache]; }
-      const ring_allocator&   cool() const { return _levels[cool_cache]; }
-      const region_allocator& cold() const { return _cold; }
-
-      object_header* get_object(object_location loc)
-      {
-         if (loc.cache == cold_cache)
-            return _cold.get_object(loc.offset);
-         return _levels[loc.cache].get_object(loc.offset);
-      }
-
-      gc_queue         _gc;
-      object_db        _obj_ids;
-      ring_allocator   _levels[3];
-      region_allocator _cold;
-
-      std::atomic<bool> _done{false};
-      std::thread       _swap_thread;
-      std::thread       _gc_thread;
-   };
-
-   inline std::pair<location_lock, void*> cache_allocator::alloc(  //
-       std::unique_lock<gc_queue::session>& session,
-       std::size_t                          num_bytes,
-       node_type                            type)
-   {
-      if (num_bytes > 0xffffff - 8) [[unlikely]]
-         throw std::runtime_error("obj too big");
-
-      object_id i = _obj_ids.alloc(session, type);
-      hot().allocate(session, i, num_bytes,
-                     [&](void*, object_location loc) { _obj_ids.init(i, loc); });
-
-      auto lock = _obj_ids.lock(i);
-      return {std::move(lock), get_object(_obj_ids.get(i))->data()};
-   }
-
-   inline std::pair<void*, node_type> cache_allocator::release(session_lock_ref<>, id i)
-   {
-      auto l = _obj_ids.release(i);
-      if (l.ref == 0 && l.cache == cold_cache)
-      {
-         cold().deallocate(l);
-      }
-      return {(l.ref > 0 ? nullptr : (char*)get_object(l)->data()), {l.type()}};
-   }
-
-   // The returned pointer will remain valid until the session lock is released
-   template <bool CopyToHot>
-   std::tuple<void*, node_type, uint16_t> cache_allocator::get_cache(session_lock_ref<> session,
-                                                                     id                 i)
-   {
-      auto loc = _obj_ids.get(i);
-      auto obj = get_object(loc);
-
-      if constexpr (CopyToHot)
-      {
-         if (loc.cache != hot_cache && obj->size <= 4096)
-         {
-            // MUST NOT wait for free memory while holding a location lock
-            if (auto copy =
-                    try_move_object(session, hot(), _obj_ids.lock(i), obj->data(), obj->size))
-            {
-               if constexpr (debug_cache)
-               {
-           //       std::osyncstream(std::cout)
-           //           << "copied to hot: " << loc.cache << ":" << loc.offset() << std::endl;
-               }
-               return {copy, {loc.type()}, static_cast<std::uint16_t>(loc.ref)};
-            }
-         }
-      }
-
-      if constexpr (debug_cache)
-      {
-       //  std::osyncstream(std::cout) << "read: " << loc.cache << ":" << loc.offset() << std::endl;
-      }
-      return {obj->data(), {loc.type()}, static_cast<std::uint16_t>(loc.ref)};
-   }
-
-}  // namespace triedent
diff --git a/libraries/triedent/include/triedent/database.hpp b/libraries/triedent/include/triedent/database.hpp
index 73ad2917b..5b8f88915 100644
--- a/libraries/triedent/include/triedent/database.hpp
+++ b/libraries/triedent/include/triedent/database.hpp
@@ -4,6 +4,7 @@
 #include <memory>
 #include <optional>
 #include <span>
+#include <triedent/key6.hpp>
 #include <triedent/node.hpp>
 
 namespace triedent
@@ -20,14 +21,9 @@ namespace triedent
    struct write_access;
    struct read_access;
 
-   template <typename T = node>
-   struct deref;
-
    template <typename T = node>
    struct mutable_deref;
 
-   inline key_type from_key6(const key_view sixb);
-
    // Write thread usage notes:
    // * To create a new tree, default-initialize a shared_ptr<root>
    // * To get the upper-most root, use write_session::get_top_root
@@ -133,17 +129,18 @@ namespace triedent
       using string_view = std::string_view;
       using id          = object_id;
 
+      auto lock() const { return _session.lock(); }
+
      protected:
-      using swap_guard = std::lock_guard<gc_session>;
-      explicit session_base(cache_allocator& a);
-      operator gc_session&() const { return _session; }
+      explicit session_base(seg_allocator& a);
+      operator seg_allocator::session&() const { return _session; }
 
      public:
       key_view to_key6(key_view v) const;
 
      private:
-      mutable gc_session _session;
-      mutable key_type   key_buf;
+      mutable seg_allocator::session _session;  // or read_lock...?
+      mutable key_type               key_buf;
    };
 
    /**
@@ -164,25 +161,51 @@ namespace triedent
       bool                             get(const std::shared_ptr<root>&        r,
                                            std::span<const char>               key,
                                            std::vector<char>*                  result_bytes,
-                                           std::vector<std::shared_ptr<root>>* result_roots) const;
+                                           std::vector<std::shared_ptr<root>>* result_roots = nullptr) const;
       std::optional<std::vector<char>> get(const std::shared_ptr<root>& r,
                                            std::span<const char>        key) const;
 
+      ///    Assume keys a-z
+      ///
+      ///    key = m
+      ///
+      ///    greater_equal is m, or if m isn't present then it is n
+      ///    less_than is l
+      ///    max is z
+      ///    next = m+1 or n, if keys are strings then next is 'ma'
+      /**
+       *  TODO: verify these docs
+       *  finds the first key greater than or equal to key, this can be used to find
+       *  the first element by using an empty key()
+       *
+       *  ie. lower_bound
+       */
       bool get_greater_equal(const std::shared_ptr<root>&        r,
                              std::span<const char>               key,
                              std::vector<char>*                  result_key,
-                             std::vector<char>*                  result_bytes,
-                             std::vector<std::shared_ptr<root>>* result_roots) const;
+                             std::vector<char>*                  result_bytes = nullptr,
+                             std::vector<std::shared_ptr<root>>* result_roots = nullptr) const;
+
+      /**
+       *  TODO: verify these docs
+       *  finds the largest key less than key
+       */
       bool get_less_than(const std::shared_ptr<root>&        r,
                          std::span<const char>               key,
                          std::vector<char>*                  result_key,
-                         std::vector<char>*                  result_bytes,
-                         std::vector<std::shared_ptr<root>>* result_roots) const;
+                         std::vector<char>*                  result_bytes = nullptr,
+                         std::vector<std::shared_ptr<root>>* result_roots = nullptr) const;
+      /**
+       *  TODO: verify these docs
+       *  
+       *  finds the largest key with the given prefix, this can be used to find
+       *  the last key by using an empty prefix.
+       */
       bool get_max(const std::shared_ptr<root>&        r,
                    std::span<const char>               prefix,
                    std::vector<char>*                  result_key,
-                   std::vector<char>*                  result_bytes,
-                   std::vector<std::shared_ptr<root>>* result_roots) const;
+                   std::vector<char>*                  result_bytes = nullptr,
+                   std::vector<std::shared_ptr<root>>* result_roots = nullptr) const;
 
       void print(const std::shared_ptr<root>& r);
       void validate(const std::shared_ptr<root>& r);
@@ -194,12 +217,12 @@ namespace triedent
       session(const session&) = delete;
 
       inline object_id   get_id(const std::shared_ptr<root>& r) const;
-      void               validate(session_lock_ref<> l, id);
+      void               validate(session_rlock& l, id);
       void               print(id n, string_view prefix = "", std::string k = "");
-      inline deref<node> get_by_id(session_lock_ref<> l, object_id i) const;
-      inline deref<node> get_by_id(session_lock_ref<> l, object_id i, bool& unique) const;
+      inline deref<node> get_by_id(session_rlock& l, object_id i) const;
+      inline deref<node> get_by_id(session_rlock& l, object_id i, bool& unique) const;
 
-      bool unguarded_get(session_lock_ref<>                            l,
+      bool unguarded_get(session_rlock&                                l,
                          const std::shared_ptr<triedent::root>&        ancestor,
                          object_id                                     root,
                          std::string_view                              key,
@@ -213,39 +236,41 @@ namespace triedent
                        std::vector<std::shared_ptr<root>>* result_roots) const;
 
       bool unguarded_get_greater_equal(
-          session_lock_ref<>                            l,
+          session_rlock&                                l,
           const std::shared_ptr<triedent::root>&        ancestor,
           object_id                                     root,
           std::string_view                              key,
-          std::vector<char>&                            result_key,
+          temp_key6&                                    result_key,
           std::vector<char>*                            result_bytes,
           std::vector<std::shared_ptr<triedent::root>>* result_roots) const;
 
       bool unguarded_get_less_than(
-          session_lock_ref<>                            l,
+          session_rlock&                                l,
           const std::shared_ptr<triedent::root>&        ancestor,
           object_id                                     root,
           std::optional<std::string_view>               key,
-          std::vector<char>&                            result_key,
+          temp_key6&                                    result_key,
           std::vector<char>*                            result_bytes,
           std::vector<std::shared_ptr<triedent::root>>* result_roots) const;
 
-      bool unguarded_get_max(session_lock_ref<>                            l,
+      bool unguarded_get_max(session_rlock&                                l,
                              const std::shared_ptr<triedent::root>&        ancestor,
                              object_id                                     root,
                              std::string_view                              prefix_min,
                              std::string_view                              prefix_max,
-                             std::vector<char>&                            result_key,
+                             temp_key6&                                    result_key,
                              std::vector<char>*                            result_bytes,
                              std::vector<std::shared_ptr<triedent::root>>* result_roots) const;
 
-      inline id   retain(std::unique_lock<gc_session>&, id);
-      inline void release(session_lock_ref<> l, id);
+      inline id   retain(session_rlock&, id);   // bump or copy
+      inline void release(session_rlock&, id);  // polymorphic release node
 
       friend class database;
       std::shared_ptr<database> _db;
 
-      cache_allocator& ring() const;
+      seg_allocator& sega() const;
+
+      void cache(auto& objref) const;
    };
    using read_session = session<read_access>;
 
@@ -255,7 +280,7 @@ namespace triedent
       write_session(std::shared_ptr<database> db) : read_session(db) {}
 
       std::shared_ptr<root> get_top_root();
-      void                  set_top_root(const std::shared_ptr<root>& r);
+      void                  set_top_root(const std::shared_ptr<root>& r, bool sync = false);
 
       int upsert(std::shared_ptr<root>& r, std::span<const char> key, std::span<const char> val);
 
@@ -283,85 +308,134 @@ namespace triedent
       void end_collect_garbage();
       ///@}
 
+      void validate()  // runs validate_node on the current top root
+      {
+         auto tr    = get_id(get_top_root());  // object id of the top root
+         auto state = session_base::lock();    // session lock handed to validate_node
+         validate_node(state, tr);
+      }
+
      private:
       inline bool get_unique(std::shared_ptr<root>& r);
-      inline void update_root(session_lock_ref<> l, std::shared_ptr<root>& r, object_id id);
-
-      void recursive_retain(session_lock_ref<> l, object_id id);
-
-      mutable_deref<value_node> make_value(std::unique_lock<gc_session>& session,
-                                           node_type                     type,
-                                           string_view                   k,
-                                           string_view                   v);
-      mutable_deref<value_node> clone_value(std::unique_lock<gc_session>& session,
-                                            object_id                     origin,
-                                            node_type                     type,
-                                            string_view                   key,
-                                            std::uint32_t                 key_offset,
-                                            string_view                   val);
-
-      mutable_deref<value_node>        clone_value(std::unique_lock<gc_session>& session,
-                                                   object_id                     origin,
-                                                   node_type                     type,
-                                                   const std::string&            key,
-                                                   string_view                   val);
-      inline mutable_deref<inner_node> make_inner(std::unique_lock<gc_session>& session,
-                                                  string_view                   pre,
-                                                  id                            val,
-                                                  uint64_t                      branches);
-      inline mutable_deref<inner_node> clone_inner(std::unique_lock<gc_session>& session,
-                                                   object_id                     id,
-                                                   const inner_node&             cpy,
-                                                   string_view                   pre,
-                                                   std::uint32_t                 offset,
-                                                   object_id                     val,
-                                                   uint64_t                      branches);
-      inline mutable_deref<inner_node> clone_inner(std::unique_lock<gc_session>& session,
-                                                   object_id                     id,
-                                                   const inner_node&             cpy,
-                                                   const std::string&            pre,
-                                                   object_id                     val,
-                                                   uint64_t                      branches);
+      inline void update_root(session_rlock& l, std::shared_ptr<root>& r, object_id id);
+
+      void recursive_retain(session_rlock& l, object_id id);
+
+      mutable_deref<value_node> make_value(session_rlock& state,
+                                           node_type      type,
+                                           string_view    k,
+                                           string_view    v);
+
+      inline object_id make_value_id(session_rlock& state,
+                                     node_type      type,
+                                     string_view    k,
+                                     string_view    v);
+
+      mutable_deref<value_node> clone_value(session_rlock& state,
+                                            object_id      origin,
+                                            node_type      type,
+                                            string_view    key,
+                                            std::uint32_t  key_offset,
+                                            string_view    val);
+
+      // like clone_value but doesn't construct a mutable_deref which does
+      // unnecessary locking
+      inline object_id clone_value_id(session_rlock& state,
+                                      object_id      origin,
+                                      node_type      type,
+                                      string_view    key,
+                                      std::uint32_t  key_offset,
+                                      string_view    val);
+
+      inline mutable_deref<value_node> clone_value(session_rlock&     state,
+                                                   object_id          origin,
+                                                   node_type          type,
+                                                   const std::string& key,
+                                                   string_view        val);
+
+      inline object_id clone_value_id(session_rlock&     state,
+                                      object_id          origin,
+                                      node_type          type,
+                                      const std::string& key,
+                                      string_view        val);
+
+      inline mutable_deref<inner_node> make_inner(session_rlock& state,
+                                                  string_view    pre,
+                                                  id             val,
+                                                  uint64_t       branches);
+
+      inline object_id make_inner_id(session_rlock& state,
+                                     string_view    pre,
+                                     id             val,
+                                     uint64_t       branches);
+
+      inline mutable_deref<inner_node> clone_inner(session_rlock&    state,
+                                                   object_id         id,
+                                                   const inner_node& cpy,
+                                                   string_view       pre,
+                                                   std::uint32_t     offset,
+                                                   object_id         val,
+                                                   uint64_t          branches);
+      inline mutable_deref<inner_node> clone_inner(session_rlock&     state,
+                                                   object_id          id,
+                                                   const inner_node&  cpy,
+                                                   const std::string& pre,
+                                                   object_id          val,
+                                                   uint64_t           branches);
+
+      inline object_id clone_inner_id(session_rlock&    state,
+                                      object_id         id,
+                                      const inner_node& cpy,
+                                      string_view       pre,
+                                      std::uint32_t     offset,
+                                      object_id         val,
+                                      uint64_t          branches);
+      inline object_id clone_inner_id(session_rlock&     state,
+                                      object_id          id,
+                                      const inner_node&  cpy,
+                                      const std::string& pre,
+                                      object_id          val,
+                                      uint64_t           branches);
 
       template <typename T>
       inline mutable_deref<T> lock(const deref<T>& obj);
 
-      inline id add_child(std::unique_lock<gc_session>& session,
-                          id                            root,
-                          bool                          unique,
-                          node_type                     type,
-                          string_view                   key,
-                          string_view                   val,
-                          int&                          old_size);
-      inline id remove_child(std::unique_lock<gc_session>& session,
-                             id                            root,
-                             bool                          unique,
-                             string_view                   key,
-                             int&                          removed_size);
-
-      inline void modify_value(session_lock_ref<>        l,
+      inline id add_child(session_rlock& state,
+                          id             root,
+                          bool           unique,
+                          node_type      type,
+                          string_view    key,
+                          string_view    val,
+                          int&           old_size);
+      inline id remove_child(session_rlock& state,
+                             id             root,
+                             bool           unique,
+                             string_view    key,
+                             int&           removed_size);
+
+      inline void modify_value(session_rlock&            state,
                                mutable_deref<value_node> mut,
                                string_view               val);
-      inline id   set_value(std::unique_lock<gc_session>& session,
-                            deref<node>                   n,
-                            bool                          unique,
-                            node_type                     type,
-                            string_view                   key,
-                            string_view                   val);
-      inline id   set_inner_value(std::unique_lock<gc_session>& session,
-                                  deref<inner_node>             n,
-                                  bool                          unique,
-                                  node_type                     type,
-                                  string_view                   val);
-      inline id   combine_value_nodes(std::unique_lock<gc_session>& session,
-                                      node_type                     t1,
-                                      string_view                   k1,
-                                      string_view                   v1,
-                                      object_id                     origin1,
-                                      node_type                     t2,
-                                      string_view                   k2,
-                                      string_view                   v2,
-                                      object_id                     origin2);
+      inline id   set_value(session_rlock& state,
+                            deref<node>    n,
+                            bool           unique,
+                            node_type      type,
+                            string_view    key,
+                            string_view    val);
+      inline id   set_inner_value(session_rlock&    state,
+                                  deref<inner_node> n,
+                                  bool              unique,
+                                  node_type         type,
+                                  string_view       val);
+      inline id   combine_value_nodes(session_rlock& state,
+                                      node_type      t1,
+                                      string_view    k1,
+                                      string_view    v1,
+                                      object_id      origin1,
+                                      node_type      t2,
+                                      string_view    k2,
+                                      string_view    v2,
+                                      object_id      origin2);
    };
 
    class database : public std::enable_shared_from_this<database>
@@ -374,7 +448,47 @@ namespace triedent
       friend root;
 
      public:
-      using config                     = cache_allocator::config;
+      struct config
+      {
+         /**
+          *  Read threads can move the accessed data into
+          *  a warm cache to improve cache locality and separate
+          *  infrequently used data from frequently used data.
+          *
+          *  If used with anything other than sync_type::none, this
+          *  will produce write amplification somewhat less than
+          *  the total data read because on sync() the moved cache
+          *  values must be flushed to disk.
+          */
+         bool cache_on_read = false;
+
+         /**
+          * By default triedent starts a background thread which
+          * will compact a segment once it is emptier than compact_empty_threshold_percent
+          */
+         bool run_compact_thread = true;
+
+         /**
+          * The max amount of a segment that is allowed to be empty
+          * before the compactor thread will move the remaining contents
+          * to a new segment. 
+          *
+          * Lower values save space, but produce more write amplification when
+          * using sync_type other than none.  Lower values improve cache
+          * locality and reduce page misses by keeping the data denser.
+          */
+         int compact_empty_threshold_percent = 20;
+
+         /**
+          * Triedent will discourage the OS from swapping out
+          * the most recently used segments by using mlock().
+          * You may want a higher compaction threshold if using mlock().
+          *
+          */
+         uint64_t max_pinnable_segments = 64;
+
+         sync_type sync_mode = sync_type::none;
+      };
       static constexpr auto read_write = access_mode::read_write;
       static constexpr auto read_only  = access_mode::read_only;
 
@@ -388,6 +502,9 @@ namespace triedent
       database(const std::filesystem::path& dir, access_mode mode, bool allow_gc = false);
       ~database();
 
+      void start_compact_thread() { _sega.start_compact_thread(); }
+      bool compact_next_segment() { return _sega.compact_next_segment(); }
+
       static void create(std::filesystem::path dir, config);
 
       std::shared_ptr<write_session> start_write_session();
@@ -395,11 +512,12 @@ namespace triedent
 
       void print_stats(std::ostream& os, bool detail = false);
 
-      bool is_slow() const { return _ring.is_slow(); }
-      auto span() const { return _ring.span(); }
+      // bool is_slow() const { return _ring.is_slow(); }
+      // auto span() const { return _ring.span(); }
+
 
      private:
-      inline void release(session_lock_ref<> l, id);
+      inline void release(session_rlock& l, id);
 
       struct database_memory
       {
@@ -414,7 +532,7 @@ namespace triedent
          std::atomic<uint64_t> top_root;
       };
 
-      cache_allocator  _ring;
+      seg_allocator    _sega;
       mapping          _file;
       database_memory* _dbm;
 
@@ -423,6 +541,7 @@ namespace triedent
 
       std::mutex   _root_release_session_mutex;
       session_base _root_release_session;
+      config       _config;
    };
 
    inline root::~root()
@@ -434,99 +553,22 @@ namespace triedent
       if (db && id && !ancestor)
       {
          std::lock_guard<std::mutex> lock(db->_root_release_session_mutex);
-         session_base::swap_guard    guard(db->_root_release_session);
-         db->release(guard, id);
+         auto                        state = db->_root_release_session.lock();
+         db->release(state, id);
       }
    }
 
-   template <typename T>
-   struct deref
-   {
-      using id = object_id;
-
-      deref(std::pair<id, value_node*> p, node_type t)
-          : _id(p.first), ptr((char*)p.second), _type(t)
-      {
-      }
-      deref(std::pair<id, inner_node*> p)
-          : _id(p.first), ptr((char*)p.second), _type(node_type::inner)
-      {
-      }
-      template <typename Other>
-      deref(deref<Other> p) : _id(p._id), ptr((char*)p.ptr), _type(p._type)
-      {
-      }
-      deref(id i, void* p, node_type t) : _id(i), ptr(p), _type(t) {}
-
-      explicit inline operator bool() const { return bool(_id); }
-      inline          operator id() const { return _id; }
-
-      auto         type() const { return _type; }
-      bool         is_leaf_node() const { return _type != node_type::inner; }
-      inline auto& as_value_node() const { return *reinterpret_cast<const value_node*>(ptr); }
-      inline auto& as_inner_node() const { return *reinterpret_cast<const inner_node*>(ptr); }
-
-      inline const T* operator->() const { return reinterpret_cast<const T*>(ptr); }
-      inline const T& operator*() const { return *reinterpret_cast<const T*>(ptr); }
-
-      int64_t as_id() const { return _id.id; }
-
-      // Allocation invalidates pointers. reload will make the deref object
-      // valid again after an allocation.
-      void reload(cache_allocator& a, session_lock_ref<> session)
-      {
-         auto [p, type, ref] = a.get_cache<false>(session, _id);
-         ptr                 = p;
-      }
-
-     protected:
-      template <typename Other>
-      friend class deref;
-
-      id        _id;
-      void*     ptr;
-      node_type _type;
-   };  // deref
-
-   template <typename T>
-   struct mutable_deref : deref<T>
-   {
-      mutable_deref() = default;
-      mutable_deref(std::pair<location_lock, value_node*> p, node_type type)
-          : deref<T>{{p.first.get_id(), p.second}, type}, lock{std::move(p.first)}
-      {
-      }
-      mutable_deref(std::pair<location_lock, inner_node*> p)
-          : deref<T>{{p.first.get_id(), p.second}}, lock{std::move(p.first)}
-      {
-      }
-      mutable_deref(location_lock lock, const deref<T>& src) : lock{std::move(lock)}, deref<T>{src}
-      {
-      }
-
-      inline auto& as_value_node() const { return *reinterpret_cast<value_node*>(this->ptr); }
-      inline auto& as_inner_node() const { return *reinterpret_cast<inner_node*>(this->ptr); }
-
-      inline T* operator->() const { return reinterpret_cast<T*>(this->ptr); }
-      inline T& operator*() const { return *reinterpret_cast<T*>(this->ptr); }
-
-      auto get_id() { return lock.get_id(); }
-
-     private:
-      location_lock lock;
-   };  // mutable_deref
-
-   inline session_base::session_base(cache_allocator& a) : _session(a.start_session()) {}
+   inline session_base::session_base(seg_allocator& a) : _session(a.start_session()) {}
 
    template <typename AccessMode>
-   inline cache_allocator& session<AccessMode>::ring() const
+   inline seg_allocator& session<AccessMode>::sega() const
    {
-      return _db->_ring;
+      return _db->_sega;
    }
 
    template <typename AccessMode>
    session<AccessMode>::session(std::shared_ptr<database> db)
-       : session_base{db->_ring}, _db(std::move(db))
+       : session_base{db->_sega}, _db(std::move(db))
    {
    }
    template <typename AccessMode>
@@ -545,18 +587,17 @@ namespace triedent
    }
 
    template <typename AccessMode>
-   inline deref<node> session<AccessMode>::get_by_id(session_lock_ref<> l, id i) const
+   inline deref<node> session<AccessMode>::get_by_id(session_rlock& state, id i) const
    {
-      auto [ptr, type, ref] = ring().template get_cache<true>(l, i);
-      return {i, ptr, type};
+      return deref<node>(state.get(i));  // TODO: cache
    }
 
    template <typename AccessMode>
-   inline deref<node> session<AccessMode>::get_by_id(session_lock_ref<> l, id i, bool& unique) const
+   inline deref<node> session<AccessMode>::get_by_id(session_rlock& state, id i, bool& unique) const
    {
-      auto [ptr, type, ref] = ring().template get_cache<true>(l, i);
-      unique &= ref == 1;
-      return {i, ptr, type};
+      auto ob = state.get(i);
+      unique &= (ob.ref_count() == 1);
+      return deref<node>(ob);
    }
 
    template <typename AccessMode>
@@ -569,29 +610,30 @@ namespace triedent
                       << std::endl;
       if (r.use_count() == 1 && r->db && !r->ancestor && r->id)
       {
-         auto id = r->id;
-         r->id   = {};
-         swap_guard g(*this);
-         release(g, id);
+         auto id    = r->id;
+         r->id      = {};
+         auto state = lock();
+         release(state, id);
       }
       r = {};
    }
 
    template <typename AccessMode>
-   inline void session<AccessMode>::release(session_lock_ref<> l, id obj)
+   inline void session<AccessMode>::release(session_rlock& state, id obj)
    {
-      _db->release(l, obj);
+      _db->release(state, obj);
    }
 
-   inline void database::release(session_lock_ref<> l, id obj)
+
+   inline void database::release(session_rlock& state, id obj)
    {
-      release_node(l, _ring, obj);
+      release_node(state, obj);
    }
 
    template <typename AccessMode>
-   inline database::id session<AccessMode>::retain(std::unique_lock<gc_session>& session, id obj)
+   inline database::id session<AccessMode>::retain(session_rlock& state, id obj)
    {
-      return bump_refcount_or_copy(ring(), session, obj);
+      return bump_refcount_or_copy(state, obj);
    }
 
    // This always returns a view into the first argument
@@ -617,12 +659,12 @@ namespace triedent
          return result;
       }
 
-      std::unique_lock<gc_session> l(*this);
-      id = retain(l, {id}).id;
+      auto state = session_base::lock();
+      id         = retain(state, {id}).id;
       return std::make_shared<root>(root{_db, nullptr, {id}});
    }
 
-   inline void write_session::set_top_root(const std::shared_ptr<root>& r)
+   inline void write_session::set_top_root(const std::shared_ptr<root>& r, bool sync)
    {
       std::lock_guard<std::mutex> lock(_db->_root_change_mutex);
       auto                        current = _db->_dbm->top_root.load();
@@ -634,12 +676,23 @@ namespace triedent
          return;
       }
 
-      std::unique_lock<gc_session> l(*this);
+      auto state = session_base::lock();
+      /*
+      if( id.id and not validate_node( state, id ) ) {
+         throw std::runtime_error( "invalid node!" );
+      }
+      */
+
       if constexpr (debug_roots)
          std::cout << id.id << ": set_top_root: old=" << current << std::endl;
-      id = retain(l, id);
+      id = retain(state, id);
       _db->_dbm->top_root.store(id.id);
-      release(l, {current});
+      if (_db->_config.sync_mode != sync_type::none)
+      {
+         _db->_sega.sync(_db->_config.sync_mode);  // data backing it is written here
+         _db->_file.sync(_db->_config.sync_mode);  // top root is written here
+      }
+      release(state, {current});
    }
 
    inline bool write_session::get_unique(std::shared_ptr<root>& r)
@@ -648,9 +701,7 @@ namespace triedent
       return r && r->db && !r->ancestor && r.use_count() == 1;
    }
 
-   inline void write_session::update_root(session_lock_ref<>     l,
-                                          std::shared_ptr<root>& r,
-                                          object_id              id)
+   inline void write_session::update_root(session_rlock& l, std::shared_ptr<root>& r, object_id id)
    {
       if (r && r->db && r->id == id)
       {
@@ -668,12 +719,15 @@ namespace triedent
          // bumped.
          if constexpr (debug_roots)
             std::cout << id.id << ": update_root replacing:" << r->id.id << std::endl;
+
          release(l, r->id);
+
          r->id = id;
       }
       else
       {
-         if constexpr (debug_roots) {
+         if constexpr (debug_roots)
+         {
             if (r == nullptr)
             {
                std::cout << id.id << ": update_root original was nullptr" << std::endl;
@@ -688,87 +742,138 @@ namespace triedent
       }
    }
 
-   inline mutable_deref<value_node> write_session::make_value(std::unique_lock<gc_session>& session,
-                                                              node_type                     type,
-                                                              string_view                   key,
-                                                              string_view                   val)
+   inline mutable_deref<value_node> write_session::make_value(session_rlock& state,
+                                                              node_type      type,
+                                                              string_view    key,
+                                                              string_view    val)
    {
-      return {value_node::make(ring(), session, key, val, type), type};
+      return {value_node::make(state, key, val, type)};
    }
 
-   inline mutable_deref<value_node> write_session::clone_value(
-       std::unique_lock<gc_session>& session,
-       object_id                     origin,
-       node_type                     type,
-       string_view                   key,
-       std::uint32_t                 key_offset,
-       string_view                   val)
+   inline object_id write_session::make_value_id(session_rlock& state,
+                                                 node_type      type,
+                                                 string_view    key,
+                                                 string_view    val)
    {
-      return {value_node::clone(ring(), session, origin, key, key_offset, val, type), type};
+      auto obr = value_node::make(state, key, val, type);
+      obr.obj()->update_checksum();
+      return obr.id();
    }
 
-   inline mutable_deref<value_node> write_session::clone_value(
-       std::unique_lock<gc_session>& session,
-       object_id                     origin,
-       node_type                     type,
-       const std::string&            key,
-       string_view                   val)
+   inline object_id write_session::clone_value_id(session_rlock& state,
+                                                  object_id      origin,
+                                                  node_type      type,
+                                                  string_view    key,
+                                                  std::uint32_t  key_offset,
+                                                  string_view    val)
    {
-      return {value_node::clone(ring(), session, origin, key, -1, val, type), type};
+      auto obr = value_node::clone(state, origin, key, key_offset, val, type);
+      obr.obj()->update_checksum();
+      return obr.id();
    }
 
-   inline mutable_deref<inner_node> write_session::make_inner(std::unique_lock<gc_session>& session,
-                                                              string_view                   pre,
-                                                              id                            val,
-                                                              uint64_t branches)
+   inline mutable_deref<value_node> write_session::clone_value(session_rlock&     state,
+                                                               object_id          origin,
+                                                               node_type          type,
+                                                               const std::string& key,
+                                                               string_view        val)
    {
-      return inner_node::make(ring(), session, pre, val, branches);
+      return {value_node::clone(state, origin, key, -1, val, type)};
+   }
+   inline object_id write_session::clone_value_id(session_rlock&     state,
+                                                  object_id          origin,
+                                                  node_type          type,
+                                                  const std::string& key,
+                                                  string_view        val)
+   {
+      auto obr = value_node::clone(state, origin, key, -1, val, type);
+      obr.obj()->update_checksum();
+      return obr.id();
    }
 
-   inline mutable_deref<inner_node> write_session::clone_inner(
-       std::unique_lock<gc_session>& session,
-       object_id                     id,
-       const inner_node&             cpy,
-       string_view                   pre,
-       std::uint32_t                 offset,
-       object_id                     val,
-       uint64_t                      branches)
+   inline mutable_deref<inner_node> write_session::make_inner(session_rlock& state,
+                                                              string_view    pre,
+                                                              id             val,
+                                                              uint64_t       branches)
    {
-      return inner_node::clone(ring(), session, id, &cpy, pre, offset, val, branches);
+      return inner_node::make(state, pre, val, branches);
+   }
+   inline object_id write_session::make_inner_id(session_rlock& state,
+                                                 string_view    pre,
+                                                 id             val,
+                                                 uint64_t       branches)
+   {
+      auto obr = inner_node::make(state, pre, val, branches);
+      obr.obj()->update_checksum();
+      return obr.id();
    }
 
-   inline mutable_deref<inner_node> write_session::clone_inner(
-       std::unique_lock<gc_session>& session,
-       object_id                     id,
-       const inner_node&             cpy,
-       const std::string&            pre,
-       object_id                     val,
-       uint64_t                      branches)
+   inline mutable_deref<inner_node> write_session::clone_inner(session_rlock&    state,
+                                                               object_id         id,
+                                                               const inner_node& cpy,
+                                                               string_view       pre,
+                                                               std::uint32_t     offset,
+                                                               object_id         val,
+                                                               uint64_t          branches)
    {
-      return inner_node::clone(ring(), session, id, &cpy, pre, -1, val, branches);
+      return inner_node::clone(state, id, &cpy, pre, offset, val, branches);
+   }
+
+   inline object_id write_session::clone_inner_id(session_rlock&    state,
+                                                  object_id         id,
+                                                  const inner_node& cpy,
+                                                  string_view       pre,
+                                                  std::uint32_t     offset,
+                                                  object_id         val,
+                                                  uint64_t          branches)
+   {
+      auto obr = inner_node::clone(state, id, &cpy, pre, offset, val, branches);
+      obr.obj()->update_checksum();
+      return obr.id();
+   }
+
+   inline mutable_deref<inner_node> write_session::clone_inner(session_rlock&     state,
+                                                               object_id          id,
+                                                               const inner_node&  cpy,
+                                                               const std::string& pre,
+                                                               object_id          val,
+                                                               uint64_t           branches)
+   {
+      return inner_node::clone(state, id, &cpy, pre, -1, val, branches);
+   }
+   inline object_id write_session::clone_inner_id(session_rlock&     state,
+                                                  object_id          id,
+                                                  const inner_node&  cpy,
+                                                  const std::string& pre,
+                                                  object_id          val,
+                                                  uint64_t           branches)
+   {
+      auto obr = inner_node::clone(state, id, &cpy, pre, -1, val, branches);
+      obr.obj()->update_checksum();
+      return obr.id();
    }
 
    template <typename T>
    inline mutable_deref<T> write_session::lock(const deref<T>& obj)
    {
-      return {ring().lock(obj), obj};
+      return {obj};
    }
 
    /**
     *  Given an existing value node and a new key/value to insert
     */
-   database::id write_session::combine_value_nodes(std::unique_lock<gc_session>& session,
-                                                   node_type                     t1,
-                                                   string_view                   k1,
-                                                   string_view                   v1,
-                                                   object_id                     origin1,
-                                                   node_type                     t2,
-                                                   string_view                   k2,
-                                                   string_view                   v2,
-                                                   object_id                     origin2)
+   database::id write_session::combine_value_nodes(session_rlock& state,
+                                                   node_type      t1,
+                                                   string_view    k1,
+                                                   string_view    v1,
+                                                   object_id      origin1,
+                                                   node_type      t2,
+                                                   string_view    k2,
+                                                   string_view    v2,
+                                                   object_id      origin2)
    {
       if (k1.size() > k2.size())
-         return combine_value_nodes(session, t2, k2, v2, origin2, t1, k1, v1, origin1);
+         return combine_value_nodes(state, t2, k2, v2, origin2, t1, k1, v1, origin1);
 
       //std::cerr << __func__ << ":" << __LINE__ << "\n";
       auto cpre = common_prefix(k1, k2);
@@ -800,33 +905,39 @@ namespace triedent
       if (cpre == k1)
       {
          auto [inner_id, branch_id] = build_children(
-             [&] { return clone_value(session, origin1, t1, k1, k1.size(), v1); },
-             [&] { return clone_value(session, origin2, t2, k2, cpre.size() + 1, v2); });
+             [&] { return clone_value_id(state, origin1, t1, k1, k1.size(), v1); },
+             [&] { return clone_value_id(state, origin2, t2, k2, cpre.size() + 1, v2); });
+
+         // this uses the non-locking deref because there is no alloc before return
+         auto in = inner_node::make(state, cpre, id(), 1ull << b2);
 
-         auto in = make_inner(session, cpre, id(), 1ull << b2);
          // Set value separately, because we don't want to increment its refcount
          in->set_value(inner_id);
          in->branch(b2) = branch_id;
 
-         return in;
+         in.obj()->update_checksum();
+
+         return in.id();
       }
       else
       {
          auto b1sfx        = k1.substr(cpre.size());
          auto b1           = b1sfx.front();
          auto [b1id, b2id] = build_children(
-             [&] { return clone_value(session, origin1, t1, k1, cpre.size() + 1, v1); },
-             [&] { return clone_value(session, origin2, t2, k2, cpre.size() + 1, v2); });
+             [&] { return clone_value_id(state, origin1, t1, k1, cpre.size() + 1, v1); },
+             [&] { return clone_value_id(state, origin2, t2, k2, cpre.size() + 1, v2); });
 
-         auto in        = make_inner(session, cpre, id(), inner_node::branches(b1, b2));
+         // this uses the non-locking deref because there is no alloc before return
+         auto in        = inner_node::make(state, cpre, id(), inner_node::branches(b1, b2));
          in->branch(b1) = b1id;
          in->branch(b2) = b2id;
 
-         return in;
+         in.obj()->update_checksum();
+         return in.id();
       }
    }
 
-   void write_session::modify_value(session_lock_ref<>        l,
+   void write_session::modify_value(session_rlock&            l,
                                     mutable_deref<value_node> mut,
                                     string_view               val)
    {
@@ -834,7 +945,7 @@ namespace triedent
       {
          if constexpr (debug_roots)
          {
-            std::cout << mut.get_id().id << ": modify_value; old:";
+            std::cout << mut.id().id << ": modify_value; old:";
             for (unsigned i = 0; i < mut->num_roots(); ++i)
                std::cout << " " << mut->roots()[i].id;
             std::cout << std::endl;
@@ -852,7 +963,7 @@ namespace triedent
 
          if constexpr (debug_roots)
          {
-            std::cout << mut.get_id().id << ": modify_value; new:";
+            std::cout << mut.id().id << ": modify_value; new:";
             for (unsigned i = 0; i < mut->num_roots(); ++i)
                std::cout << " " << mut->roots()[i].id;
             std::cout << std::endl;
@@ -862,63 +973,68 @@ namespace triedent
          memcpy(mut->data_ptr(), val.data(), val.size());
    }
 
-   database::id write_session::set_value(std::unique_lock<gc_session>& session,
-                                         deref<node>                   n,
-                                         bool                          unique,
-                                         node_type                     type,
-                                         string_view                   key,
-                                         string_view                   val)
+   database::id write_session::set_value(session_rlock& state,
+                                         deref<node>    n,
+                                         bool           unique,
+                                         node_type      type,
+                                         string_view    key,
+                                         string_view    val)
    {
       if (!n || !unique || type != n.type())
-         return make_value(session, type, key, val);
+         return make_value_id(state, type, key, val);
 
       assert(n.is_leaf_node());
 
       auto& vn = n.as_value_node();
       if (vn.data_size() == val.size())
       {
-         modify_value(session, lock(deref<value_node>(n)), val);
-         return n;
+         modify_value(state, deref<value_node>(n), val);
+         assert(n.obj()->validate_checksum());
+         return n.id();
       }
 
-      return make_value(session, type, key, val);
+      return make_value_id(state, type, key, val);
    }
 
-   database::id write_session::set_inner_value(std::unique_lock<gc_session>& session,
-                                               deref<inner_node>             n,
-                                               bool                          unique,
-                                               node_type                     type,
-                                               string_view                   val)
+   database::id write_session::set_inner_value(session_rlock&    state,
+                                               deref<inner_node> n,
+                                               bool              unique,
+                                               node_type         type,
+                                               string_view       val)
    {
       if (unique)
       {
          if (auto old_value = n->value())
          {
-            auto  v  = get_by_id(session, old_value);
+            auto  v  = state.get(old_value);  // TODO copy to cache?
             auto& vn = v.as_value_node();
-            if (v.type() == type && vn.data_size() == val.size() && ring().ref(old_value) == 1)
+            if (v.type() == type && vn.data_size() == val.size() && v.ref_count() == 1)
             {
-               modify_value(session, lock(deref<value_node>(v)), val);
-               return n;
+               modify_value(state, deref<value_node>(v), val);
+               assert(v.obj()->validate_checksum());
+               return n.id();
             }
             else
             {
-               ring().release(session, old_value);
+               v.release();
             }
          }
-         object_id val_id = make_value(session, type, string_view(), val);
-         n.reload(ring(), session);
-         auto locked = lock(n);
-         locked->set_value(val_id);
-         return n;
+         object_id val_id = make_value_id(state, type, string_view(), val);
+         // This lock is necessary because we allocate above and n was
+         // dereferenced before that allocation
+         lock(n)->set_value(val_id);
+         assert(n.obj()->validate_checksum());
+         return n.id();
       }
       else
       {
-         object_id new_val = make_value(session, type, string_view(), val);
-         n.reload(ring(), session);
-         auto result = clone_inner(session, n, *n, n->key(), 0, object_id{}, n->branches());
+         object_id new_val = make_value_id(state, type, string_view(), val);
+
+         auto result =
+             inner_node::clone(state, n.id(), &*n, n->key(), 0, object_id{}, n->branches());
          result->set_value(new_val);
-         return result;
+         result.obj()->update_checksum();
+         return result.id();
       }
    }
 
@@ -926,28 +1042,28 @@ namespace triedent
     *  Given an existing tree node (root) add a new key/value under it and return the id
     *  of the new node if a new node had to be allocated.
     */
-   inline database::id write_session::add_child(std::unique_lock<gc_session>& session,
-                                                id                            root,
-                                                bool                          unique,
-                                                node_type                     type,
-                                                string_view                   key,
-                                                string_view                   val,
-                                                int&                          old_size)
+   inline database::id write_session::add_child(session_rlock& state,
+                                                id             root,
+                                                bool           unique,
+                                                node_type      type,
+                                                string_view    key,
+                                                string_view    val,
+                                                int&           old_size)
    {
       if (not root)  // empty case
-         return make_value(session, type, key, val);
+         return make_value_id(state, type, key, val);
 
-      auto n = get_by_id(session, root, unique);
+      auto n = get_by_id(state, root, unique);
       if (n.is_leaf_node())  // current root is value
       {
          auto& vn = n.as_value_node();
          if (vn.key() != key)
-            return combine_value_nodes(session, n.type(), vn.key(), vn.data(), root, type, key, val,
+            return combine_value_nodes(state, n.type(), vn.key(), vn.data(), root, type, key, val,
                                        object_id{});
          else
          {
             old_size = vn.data_size();
-            return set_value(session, n, unique, type, key, val);
+            return set_value(state, n, unique, type, key, val);
          }
       }
 
@@ -957,8 +1073,8 @@ namespace triedent
       if (in_key == key)  // whose prefix is same as key, therefore set the value
       {
          if (in->value())
-            old_size = get_by_id(session, in->value()).as_value_node().data_size();
-         return set_inner_value(session, n, unique, type, val);
+            old_size = state.get(in->value()).as_value_node().data_size();
+         return set_inner_value(state, n, unique, type, val);
       }
 
       // key should be the first argument, because (unlike in_key)
@@ -972,29 +1088,32 @@ namespace triedent
          {
             object_id cur_b = in->has_branch(b) ? in->branch(b) : object_id{};
             auto      new_b =
-                add_child(session, cur_b, false, type, key.substr(cpre.size() + 1), val, old_size);
-            in.reload(ring(), session);
-            auto new_in = clone_inner(session, root, *in, in->key(), 0, in->value(),
-                                      in->branches() | 1ull << b);
+                add_child(state, cur_b, false, type, key.substr(cpre.size() + 1), val, old_size);
+
+            auto new_in = inner_node::clone(state, root, &*in, in->key(), 0, in->value(),
+                                            in->branches() | 1ull << b);
 
             if (new_b != cur_b)
             {
                new_in->branch(b) = new_b;
-               release(session, cur_b);
+               release(state, cur_b);
             }
 
-            return new_in;
+            new_in.obj()->update_checksum();
+            return new_in.id();
          }  // else modify in place
 
          auto cur_b = in->branch(b);
          auto new_b =
-             add_child(session, cur_b, unique, type, key.substr(cpre.size() + 1), val, old_size);
+             add_child(state, cur_b, unique, type, key.substr(cpre.size() + 1), val, old_size);
 
          if (new_b != cur_b)
          {
-            in.reload(ring(), session);
-            lock(in)->branch(b) = new_b;
-            release(session, cur_b);
+            {
+               auto li       = lock(in);
+               li->branch(b) = new_b;
+            }
+            release(state, cur_b);
          }
          return root;
       }
@@ -1004,15 +1123,17 @@ namespace triedent
          {
             auto b1 = in_key[cpre.size()];
             // MUST convert to id to release the location_lock
-            id b1val =
-                clone_inner(session, in, *in, in_key, cpre.size() + 1, in->value(), in->branches());
-            id b0val = make_value(session, type, string_view(), val);
+            id b1val = clone_inner_id(state, in.id(), *in, in_key, cpre.size() + 1, in->value(),
+                                      in->branches());
+            id b0val = make_value_id(state, type, string_view(), val);
 
-            auto nin = make_inner(session, cpre, object_id{}, inner_node::branches(b1));
+            auto nin = inner_node::make(state, cpre, object_id{}, inner_node::branches(b1));
             // Set separately because we don't need to inc ref
-            nin->set_value(b0val);
-            nin->branch(b1) = b1val;
-            return nin;
+            auto& ninr = *nin;
+            ninr.set_value(b0val);
+            ninr.branch(b1) = b1val;
+            nin.obj()->update_checksum();
+            return nin.id();
          }
          else  // there are two branches
          {
@@ -1021,17 +1142,19 @@ namespace triedent
             auto b1key = key.substr(cpre.size() + 1);
             // Handle sub first, because b2key is invalidated by allocation.
             // cpre and b1key are safe because they point into key, which is externally owned
-            id sub =
-                clone_inner(session, in, *in, in_key, cpre.size() + 1, in->value(), in->branches());
-            id   b1val = make_value(session, type, b1key, val);
-            auto nin   = make_inner(session, cpre, id(), inner_node::branches(b1, b2));
+            id sub   = clone_inner_id(state, in.id(), *in, in_key, cpre.size() + 1, in->value(),
+                                      in->branches());
+            id b1val = make_value_id(state, type, b1key, val);
+
+            auto nin = inner_node::make(state, cpre, id(), inner_node::branches(b1, b2));
 
             assert(not nin->branch(b1));
             nin->branch(b1) = b1val;
             assert(not nin->branch(b2));
             nin->branch(b2) = sub;
+            nin.obj()->update_checksum();
 
-            return nin;
+            return nin.id();
          }
       }
    }  // write_session::add_child
@@ -1040,14 +1163,15 @@ namespace triedent
                                     std::span<const char>  key,
                                     std::span<const char>  val)
    {
-      std::unique_lock<gc_session> l(*this);
+      auto state = session_base::lock();
 
       int  old_size = -1;
       auto new_root =
-          add_child(l, get_id(r), get_unique(r), node_type::bytes,
+          add_child(state, get_id(r), false & get_unique(r), node_type::bytes,
                     to_key6({key.data(), key.size()}), {val.data(), val.size()}, old_size);
       assert(new_root.id);
-      update_root(l, r, new_root);
+      assert(state.get(new_root).obj()->validate_checksum());
+      update_root(state, r, new_root);
       return old_size;
    }
 
@@ -1055,19 +1179,19 @@ namespace triedent
                                     std::span<const char>                  key,
                                     std::span<const std::shared_ptr<root>> roots)
    {
-      std::unique_lock<gc_session> l(*this);
+      auto state = session_base::lock();
 
       std::vector<object_id> ids;
       ids.reserve(roots.size());
       for (auto& r : roots)
-         ids.push_back(retain(l, get_id(r)));
+         ids.push_back(retain(state, get_id(r)));
 
       int  old_size = -1;
       auto new_root = add_child(
-          l, get_id(r), get_unique(r), node_type::roots, to_key6({key.data(), key.size()}),
+          state, get_id(r), get_unique(r), node_type::roots, to_key6({key.data(), key.size()}),
           {reinterpret_cast<const char*>(ids.data()), ids.size() * sizeof(object_id)}, old_size);
       assert(new_root.id);
-      update_root(l, r, new_root);
+      update_root(state, r, new_root);
       return old_size;
    }
 
@@ -1087,14 +1211,14 @@ namespace triedent
                                  std::vector<char>*                  result_bytes,
                                  std::vector<std::shared_ptr<root>>* result_roots) const
    {
-      swap_guard g(*this);
-      return unguarded_get(g, r, get_id(r), to_key6({key.data(), key.size()}), result_bytes,
+      auto state = session_base::lock();
+      return unguarded_get(state, r, get_id(r), to_key6({key.data(), key.size()}), result_bytes,
                            result_roots);
    }
 
    template <typename AccessMode>
    bool session<AccessMode>::unguarded_get(
-       session_lock_ref<>                            l,
+       session_rlock&                                l,
        const std::shared_ptr<triedent::root>&        ancestor,
        object_id                                     root,
        std::string_view                              key,
@@ -1183,32 +1307,32 @@ namespace triedent
        std::vector<char>*                  result_bytes,
        std::vector<std::shared_ptr<root>>* result_roots) const
    {
-      swap_guard        g(*this);
-      std::vector<char> result_key6;
-      if (!unguarded_get_greater_equal(g, r, get_id(r), to_key6({key.data(), key.size()}),
+      auto      state = session_base::lock();
+      temp_key6 result_key6;
+      if (!unguarded_get_greater_equal(state, r, get_id(r), to_key6({key.data(), key.size()}),
                                        result_key6, result_bytes, result_roots))
          return false;
       if (result_key)
       {
-         auto s = from_key6({result_key6.data(), result_key6.size()});
-         result_key->assign(s.begin(), s.end());
+         from_key6({result_key6.data(), result_key6.size()}, *result_key);
       }
       return true;
    }
 
    template <typename AccessMode>
    bool session<AccessMode>::unguarded_get_greater_equal(
-       session_lock_ref<>                            l,
+       session_rlock&                                state,
        const std::shared_ptr<triedent::root>&        ancestor,
        object_id                                     root,
        std::string_view                              key,
-       std::vector<char>&                            result_key,
+       temp_key6&                                    result_key,
        std::vector<char>*                            result_bytes,
        std::vector<std::shared_ptr<triedent::root>>* result_roots) const
    {
       if (!root)
          return false;
-      auto n = get_by_id(l, root);
+      auto n = state.get<node>(root);
+      cache(n);
       if (n.is_leaf_node())
       {
          auto& vn     = n.as_value_node();
@@ -1236,7 +1360,7 @@ namespace triedent
       }
       else if (in.value())
       {
-         auto  v  = get_by_id(l, in.value());
+         auto  v  = state.get(in.value());  //get_by_id(l, in.value());
          auto& vn = v.as_value_node();
          return fill_result(ancestor, vn, v.type(), result_bytes, result_roots);
       }
@@ -1249,8 +1373,8 @@ namespace triedent
             return false;
          auto rk = result_key.size();
          result_key.push_back(b);
-         if (unguarded_get_greater_equal(l, ancestor, in.branch(b), key, result_key, result_bytes,
-                                         result_roots))
+         if (unguarded_get_greater_equal(state, ancestor, in.branch(b), key, result_key,
+                                         result_bytes, result_roots))
             return true;
          result_key.resize(rk);
          b   = in.lower_bound(b + 1);
@@ -1265,11 +1389,13 @@ namespace triedent
                                            std::vector<char>*                  result_bytes,
                                            std::vector<std::shared_ptr<root>>* result_roots) const
    {
-      swap_guard        g(*this);
-      std::vector<char> result_key6;
-      if (!unguarded_get_less_than(g, r, get_id(r), to_key6({key.data(), key.size()}), result_key6,
-                                   result_bytes, result_roots))
-         return false;
+      temp_key6 result_key6;
+      {  // scope the lock as narrow as possible
+         auto state = session_base::lock();
+         if (!unguarded_get_less_than(state, r, get_id(r), to_key6({key.data(), key.size()}),
+                                      result_key6, result_bytes, result_roots))
+            return false;
+      }
       if (result_key)
       {
          auto s = from_key6({result_key6.data(), result_key6.size()});
@@ -1280,24 +1406,26 @@ namespace triedent
 
    template <typename AccessMode>
    bool session<AccessMode>::unguarded_get_less_than(
-       session_lock_ref<>                            l,
+       session_rlock&                                l,
        const std::shared_ptr<triedent::root>&        ancestor,
        object_id                                     root,
        std::optional<std::string_view>               key,
-       std::vector<char>&                            result_key,
+       temp_key6&                                    result_key,
        std::vector<char>*                            result_bytes,
        std::vector<std::shared_ptr<triedent::root>>* result_roots) const
    {
       if (!root)
          return false;
       auto n = get_by_id(l, root);
+      cache(n);
       if (n.is_leaf_node())
       {
          auto& vn     = n.as_value_node();
          auto  vn_key = vn.key();
          if (key && vn_key >= *key)
             return false;
-         result_key.insert(result_key.end(), vn_key.begin(), vn_key.end());
+         //result_key.insert(result_key.end(), vn_key.begin(), vn_key.end());
+         result_key.append(vn_key.begin(), vn_key.end());
          return fill_result(ancestor, vn, n.type(), result_bytes, result_roots);
       }
       auto&   in     = n.as_inner_node();
@@ -1316,7 +1444,8 @@ namespace triedent
          else
             key = std::nullopt;
       }
-      result_key.insert(result_key.end(), in_key.begin(), in_key.end());
+      //result_key.insert(result_key.end(), in_key.begin(), in_key.end());
+      result_key.append(in_key.begin(), in_key.end());
       auto b = in.reverse_lower_bound(last_b);
       if (b < last_b)
          key = std::nullopt;
@@ -1349,16 +1478,19 @@ namespace triedent
                                      std::vector<char>*                  result_bytes,
                                      std::vector<std::shared_ptr<root>>* result_roots) const
    {
-      swap_guard g(*this);
-      auto       prefix_min = to_key6({prefix.data(), prefix.size()});
-      auto       extra_bits = prefix_min.size() * 6 - prefix.size() * 8;
-      auto       prefix_max = (std::string)prefix_min;
+      auto prefix_min = to_key6({prefix.data(), prefix.size()});
+      auto extra_bits = prefix_min.size() * 6 - prefix.size() * 8;
+      auto prefix_max = (std::string)prefix_min;
       if (!prefix_max.empty())
          prefix_max.back() |= (1 << extra_bits) - 1;
-      std::vector<char> result_key6;
-      if (!unguarded_get_max(g, r, get_id(r), prefix_min, prefix_max, result_key6, result_bytes,
-                             result_roots))
-         return false;
+      temp_key6 result_key6;
+
+      {
+         auto state = session_base::lock();
+         if (!unguarded_get_max(state, r, get_id(r), prefix_min, prefix_max, result_key6,
+                                result_bytes, result_roots))
+            return false;
+      }
       if (result_key)
       {
          auto s = from_key6({result_key6.data(), result_key6.size()});
@@ -1369,12 +1501,12 @@ namespace triedent
 
    template <typename AccessMode>
    bool session<AccessMode>::unguarded_get_max(
-       session_lock_ref<>                            l,
+       session_rlock&                                l,
        const std::shared_ptr<triedent::root>&        ancestor,
        object_id                                     root,
        std::string_view                              prefix_min,
        std::string_view                              prefix_max,
-       std::vector<char>&                            result_key,
+       temp_key6&                                    result_key,
        std::vector<char>*                            result_bytes,
        std::vector<std::shared_ptr<triedent::root>>* result_roots) const
    {
@@ -1384,6 +1516,7 @@ namespace triedent
       while (true)
       {
          auto n = get_by_id(l, root);
+         cache(n);
          if (n.is_leaf_node())
          {
             auto& vn     = n.as_value_node();
@@ -1427,25 +1560,24 @@ namespace triedent
 
    inline int write_session::remove(std::shared_ptr<root>& r, std::span<const char> key)
    {
-      std::unique_lock<gc_session> l(*this);
-
       int  removed_size = -1;
-      auto new_root = remove_child(l, get_id(r), get_unique(r), to_key6({key.data(), key.size()}),
-                                   removed_size);
-      update_root(l, r, new_root);
+      auto state        = session_base::lock();  // lock object threaded through remove_child/update_root below
+      auto new_root     = remove_child(state, get_id(r), get_unique(r),
+                                       to_key6({key.data(), key.size()}), removed_size);  // removed_size stays -1 unless remove_child overwrites it
+      update_root(state, r, new_root);  // presumably publishes new_root into r -- confirm in update_root
       return removed_size;
    }
 
-   inline database::id write_session::remove_child(std::unique_lock<gc_session>& session,
-                                                   id                            root,
-                                                   bool                          unique,
-                                                   string_view                   key,
-                                                   int&                          removed_size)
+   inline database::id write_session::remove_child(session_rlock& state,
+                                                   id             root,
+                                                   bool           unique,
+                                                   string_view    key,
+                                                   int&           removed_size)
    {
       if (not root)
          return root;
 
-      auto n = get_by_id(session, root, unique);
+      auto n = get_by_id(state, root, unique);
       if (n.is_leaf_node())  // current root is value
       {
          auto& vn = n.as_value_node();
@@ -1468,12 +1600,12 @@ namespace triedent
          auto iv = in->value();
          if (not iv)
             return root;
-         removed_size = get_by_id(session, iv).as_value_node().data_size();
+         removed_size = get_by_id(state, iv).as_value_node().data_size();
 
          if (in->num_branches() == 1)
          {
             char        b  = std::countr_zero(in->branches());
-            auto        bn = get_by_id(session, *in->children());
+            auto        bn = get_by_id(state, *in->children());
             std::string new_key;
             new_key += in_key;
             new_key += b;
@@ -1483,26 +1615,29 @@ namespace triedent
                auto& vn = bn.as_value_node();
                new_key += vn.key();
                //           TRIEDENT_DEBUG( "clone value" );
-               return clone_value(session, bn, bn.type(), new_key, vn.data());
+               return clone_value_id(state, bn.id(), bn.type(), new_key, vn.data());
             }
             else
             {
                auto& bin = bn.as_inner_node();
                new_key += bin.key();
                //          TRIEDENT_DEBUG( "clone inner " );
-               return clone_inner(session, bn, bin, new_key, bin.value(), bin.branches());
+               return clone_inner_id(state, bn.id(), bin, new_key, bin.value(), bin.branches());
             }
          }
 
          if (unique)
          {
             auto prev = in->value();
-            lock(in)->set_value(id());
-            release(session, prev);
+            {
+               auto lin = lock(in);
+               lin->set_value(id());
+            }
+            release(state, prev);
             return root;
          }
          else
-            return clone_inner(session, in, *in, key, 0, id(), in->branches());
+            return clone_inner_id(state, in.id(), *in, key, 0, id(), in->branches());
       }
 
       auto cpre = common_prefix(in_key, key);
@@ -1515,25 +1650,26 @@ namespace triedent
 
       object_id cur_b = in->branch(b);
 
-      auto new_b =
-          remove_child(session, cur_b, unique, key.substr(in_key.size() + 1), removed_size);
+      auto new_b = remove_child(state, cur_b, unique, key.substr(in_key.size() + 1), removed_size);
       if (new_b != cur_b)
       {
-         in.reload(ring(), session);
          if (new_b and unique)
          {
-            lock(in)->branch(b) = new_b;
-            release(session, cur_b);
+            {
+               auto lin       = lock(in);
+               lin->branch(b) = new_b;
+            }
+            release(state, cur_b);
             return root;
          }
          if (new_b)  // update branch
          {
             auto new_root =
-                clone_inner(session, in, *in, in->key(), 0, in->value(), in->branches());
+                inner_node::clone(state, in.id(), &*in, in->key(), 0, in->value(), in->branches());
             auto& new_br = new_root->branch(b);
-            release(session, new_br);
+            release(state, new_br);
             new_br = new_b;
-            return new_root;
+            return new_root.id();
          }
          else  // remove branch
          {
@@ -1541,7 +1677,7 @@ namespace triedent
             if (std::popcount(new_branches) + bool(in->value()) > 1)
             {  // multiple branches remain, nothing to merge up, just realloc without branch
                //   TRIEDENT_WARN( "clone without branch" );
-               return clone_inner(session, in, *in, in->key(), 0, in->value(), new_branches);
+               return clone_inner_id(state, in.id(), *in, in->key(), 0, in->value(), new_branches);
             }
             if (not new_branches)
             {
@@ -1551,11 +1687,11 @@ namespace triedent
                // in this case, not branches means it must have a value
                assert(in->value() and "expected value because we removed a branch");
 
-               auto  cur_v = get_by_id(session, in->value());
+               auto  cur_v = state.get(in->value());  //get_by_id(state, in->value());
                auto& cv    = cur_v.as_value_node();
                // make a copy because key and data come from different objects, which clone doesn't handle.
                std::string new_key{in->key()};
-               return clone_value(session, cur_v, cur_v.type(), new_key, cv.data());
+               return clone_value_id(state, cur_v.id(), cur_v.type(), new_key, cv.data());
             }
             else
             {  // there must be only 1 branch left
@@ -1564,7 +1700,7 @@ namespace triedent
                auto  lb          = std::countr_zero(in->branches() ^ inner_node::branches(b));
                auto& last_branch = in->branch(lb);
                // the one branch is either a value or a inner node
-               auto cur_v = get_by_id(session, last_branch);
+               auto cur_v = get_by_id(state, last_branch);
                if (cur_v.is_leaf_node())
                {
                   auto&       cv = cur_v.as_value_node();
@@ -1572,7 +1708,7 @@ namespace triedent
                   new_key += in->key();
                   new_key += char(lb);
                   new_key += cv.key();
-                  return clone_value(session, cur_v, cur_v.type(), new_key, cv.data());
+                  return clone_value_id(state, cur_v.id(), cur_v.type(), new_key, cv.data());
                }
                else
                {
@@ -1581,7 +1717,7 @@ namespace triedent
                   new_key += in->key();
                   new_key += char(lb);
                   new_key += cv.key();
-                  return clone_inner(session, cur_v, cv, new_key, cv.value(), cv.branches());
+                  return clone_inner_id(state, cur_v.id(), cv, new_key, cv.value(), cv.branches());
                }
             }
          }
@@ -1598,8 +1734,8 @@ namespace triedent
    template <typename AccessMode>
    void session<AccessMode>::validate(const std::shared_ptr<root>& r)
    {
-      swap_guard l{*this};
-      validate(l, get_id(r));
+      auto state = session_base::lock();  // lock object handed to the recursive overload
+      validate(state, get_id(r));  // validate(session_rlock&, id) walks the tree starting at r's id
    }
 
    template <typename AccessMode>
@@ -1661,26 +1797,26 @@ namespace triedent
          std::lock_guard<std::mutex> lock(_db->_root_change_mutex);
          id = {_db->_dbm->top_root.load()};
       }
-      swap_guard l{*this};
-      recursive_retain(l, id);
+      auto state = session_base::lock();
+      recursive_retain(state, id);
    }
 
-   inline void write_session::recursive_retain(session_lock_ref<> l, id r)
+   inline void write_session::recursive_retain(session_rlock& state, id r)
    {
       if (not r)
          return;
 
-      if (!ring().gc_retain(r))
-         return;  // retaining this node indirectly retains all children
+      auto dr = state.get(r);
+      if (not dr.retain())
+         return;
 
-      auto dr = get_by_id(l, r);
       if (dr.type() == node_type::inner)
       {
          auto& in = dr.as_inner_node();
-         recursive_retain(l, in.value());
+         recursive_retain(state, in.value());
          for (auto child : std::span{in.children(), in.num_branches()})
          {
-            recursive_retain(l, child);
+            recursive_retain(state, child);
          }
       }
       else if (dr.type() == node_type::roots)
@@ -1688,18 +1824,20 @@ namespace triedent
          auto& rt = dr.as_value_node();
          for (auto child : std::span{rt.roots(), rt.num_roots()})
          {
-            recursive_retain(l, child);
+            recursive_retain(state, child);
          }
       }
    }
 
    inline void write_session::start_collect_garbage()
    {
-      ring().gc_start();
+      throw std::runtime_error("not impl yet");  // TODO: always throws in this revision; GC start is not implemented
+      // previous implementation: ring().gc_start();
    }
    inline void write_session::end_collect_garbage()
    {
-      ring().gc_finish();
+      throw std::runtime_error("not impl yet");  // TODO: always throws in this revision; GC finish is not implemented
+      // previous implementation: ring().gc_finish();
    }
 
    template <typename AccessMode>
@@ -1713,111 +1851,49 @@ namespace triedent
    }
 
    template <typename AccessMode>
-   void session<AccessMode>::validate(session_lock_ref<> l, id r)
+   void session<AccessMode>::validate(session_rlock& state, id r)  // recursively checks r, its inner value and its branch children
    {
       if (not r)
          return;
 
       auto validate_id = [&](auto i)
       {
-         ring().validate(r);
-         if (0 == ring().ref(r))
+         auto rv = state.validate(r);  // NOTE(review): the lambda parameter i is unused; r is checked via capture
+         if (0 == rv.ref_count())  // an object reached through the tree must still hold a reference
             throw std::runtime_error("found reference to object with 0 ref count: " +
                                      std::to_string(r.id));
       };
 
       validate_id(r);
 
-      auto dr = get_by_id(l, r);
+      auto dr = state.get(r);  // replaces get_by_id(state, r)
       if (not dr.is_leaf_node())
       {
          auto& in = dr.as_inner_node();
-         validate(l, in.value());
+         validate(state, in.value());  // returns immediately when the inner node has no value (null id)
 
          auto* c = in.children();
          auto* e = c + in.num_branches();
          while (c != e)
          {
-            validate(l, *c);
+            validate(state, *c);  // recurse into each branch child
             ++c;
          }
       }
    }
 
-   inline key_type from_key6(const key_view sixb)
+   inline key_view session_base::to_key6(key_view v) const  // member wrapper around triedent::to_key6 using this session's key_buf
    {
-      std::string out;
-      out.resize((sixb.size() * 6) / 8);
-
-      const uint8_t* pos6     = (uint8_t*)sixb.data();
-      const uint8_t* pos6_end = (uint8_t*)sixb.data() + sixb.size();
-      uint8_t*       pos8     = (uint8_t*)out.data();
-
-      while (pos6_end - pos6 >= 4)
-      {
-         pos8[0] = (pos6[0] << 2) | (pos6[1] >> 4);  // 6 + 2t
-         pos8[1] = (pos6[1] << 4) | (pos6[2] >> 2);  // 4b + 4t
-         pos8[2] = (pos6[2] << 6) | pos6[3];         // 2b + 6
-         pos6 += 4;
-         pos8 += 3;
-      }
-      switch (pos6_end - pos6)
-      {
-         case 3:
-            pos8[0] = (pos6[0] << 2) | (pos6[1] >> 4);  // 6 + 2t
-            pos8[1] = (pos6[1] << 4) | (pos6[2] >> 2);  // 4b + 4t
-            //    pos8[2] = (pos6[2] << 6);                   // 2b + 6-0
-            break;
-         case 2:
-            pos8[0] = (pos6[0] << 2) | (pos6[1] >> 4);  // 6 + 2t
-            //     pos8[1] = (pos6[1] << 4);                   // 4b + 4-0
-            break;
-         case 1:
-            pos8[0] = (pos6[0] << 2);  // 6 + 2-0
-            break;
-      }
-      return out;
+      return triedent::to_key6(key_buf, v);  // NOTE(review): the returned view presumably aliases key_buf, so a later to_key6 call would invalidate it -- confirm
    }
-   inline key_view to_key6(key_type& key_buf, key_view v)
-   {
-      uint32_t bits  = v.size() * 8;
-      uint32_t byte6 = (bits + 5) / 6;
-
-      key_buf.resize(byte6);
-
-      uint8_t*       pos6     = (uint8_t*)key_buf.data();
-      const uint8_t* pos8     = (uint8_t*)v.data();
-      const uint8_t* pos8_end = (uint8_t*)v.data() + v.size();
 
-      while (pos8_end - pos8 >= 3)
-      {
-         pos6[0] = pos8[0] >> 2;
-         pos6[1] = (pos8[0] & 0x3) << 4 | pos8[1] >> 4;
-         pos6[2] = (pos8[1] & 0xf) << 2 | (pos8[2] >> 6);
-         pos6[3] = pos8[2] & 0x3f;
-         pos8 += 3;
-         pos6 += 4;
-      }
-
-      switch (pos8_end - pos8)
+   template <typename AccessMode>
+   void session<AccessMode>::cache(auto& objref) const  // read-path hook invoked by the unguarded_get_* traversals on each visited node
+   {
+      if (_db->_config.cache_on_read)  // no-op unless the database config enables cache_on_read
       {
-         case 2:
-            pos6[0] = pos8[0] >> 2;
-            pos6[1] = (pos8[0] & 0x3) << 4 | pos8[1] >> 4;
-            pos6[2] = (pos8[1] & 0xf) << 2;
-            break;
-         case 1:
-            pos6[0] = pos8[0] >> 2;
-            pos6[1] = (pos8[0] & 0x3) << 4;
-            break;
-         default:
-            break;
+         objref.cache_object();  // delegates to the object reference; the caching policy lives in its cache_object()
       }
-      return {key_buf.data(), key_buf.size()};
-   }
-   inline key_view session_base::to_key6(key_view v) const
-   {
-      return triedent::to_key6(key_buf, v);
    }
 
 }  // namespace triedent
diff --git a/libraries/triedent/include/triedent/db.hpp b/libraries/triedent/include/triedent/db.hpp
new file mode 100644
index 000000000..fc85a312f
--- /dev/null
+++ b/libraries/triedent/include/triedent/db.hpp
@@ -0,0 +1,249 @@
+#pragma once
+#include <future>
+#include <list>
+#include <shared_mutex>
+#include <triedent/database.hpp>
+
+namespace triedent
+{
+
+   struct Status
+   {
+      bool ok = true;
+   };
+
+   /**
+    *  This is the high-level interface through which the
+    *  database should be accessed if you don't want to maintain
+    *  multiple persistent snapshots. This interface is designed to
+    *  operate with low-latency syncing between when a write transaction
+    *  commits and the first read sees the change.
+    */
+   class DB
+   {
+     public:
+      struct Options
+      {
+         bool             create_if_missing = false;
+         bool             error_if_exists   = false;
+         database::config config;
+      };
+      typedef std::shared_ptr<root> root_ptr;
+
+      /**
+          *  Thread-local read session, used to start read transactions which
+          *  all occur from the same state snapshot.
+          */
+      class ReadSession
+      {
+        public:
+         class Transaction
+         {
+           public:
+            /**
+                      *  Span is any type that has a data() and size() method.
+                      *      e.g. std::string, std::vector<char>, std::span<const char>
+                      */
+            template <typename Span>
+            Status get(const Span& key, std::vector<char>* value)
+            {
+               return Status{.ok = _rs._rs->get(_root, {key.data(), key.size()}, value, nullptr)};
+            }
+
+            template <typename Span>
+            bool get_greater_equal(const Span&        key,
+                                   std::vector<char>* result_key,
+                                   std::vector<char>* result_val = nullptr)
+            {
+               return _rs._rs->get_greater_equal(_root, {key.data(), key.size()}, result_key,
+                                                 result_val);
+            }
+
+            ~Transaction() {}
+
+           private:
+            friend class ReadSession;
+            Transaction(ReadSession& s) : _rs(s), _root(s._db.getRoot()) {}
+
+            ReadSession& _rs;
+            root_ptr     _root;
+         };  // Transaction
+
+         //auto startTransaction() { return std::make_shared<Transaction>(std::ref(*this)); }
+         auto startTransaction() { return std::shared_ptr<Transaction>(new Transaction(*this)); }
+
+         ReadSession(DB& d) : _db(d) { _rs = _db._db->start_read_session(); }
+
+        private:
+         friend class Transaction;
+
+         std::shared_ptr<read_session> _rs;
+         DB&                           _db;
+
+      };  // ReadSession
+
+      /**
+          * Only one write session can exist and it may only be called by a
+          * single thread. Writes are batched in WriteSession::Transactions and
+          * can be aborted before any reads see it.
+          */
+      class WriteSession
+      {
+        public:
+         class Transaction
+         {
+           public:
+            Status get(std::span<const char> key, std::vector<char>& value);
+            Status put(std::span<const char> key, std::span<const char> value);
+            Status remove(std::span<const char> key);
+
+            Status commit()
+            {
+               if (_root)
+               {
+                  //_ws._db._root = _root;
+                  _ws.setRoot(std::move(_root));
+                  return {};
+               }
+               return {.ok = false};
+            }
+
+            Status abort()
+            {
+               _root.reset();
+               return {};
+            }
+
+            ~Transaction() { commit(); }
+
+            // KeySpan and ValueSpan can be any type that has a .data() and .size() method
+            // @return the old size if a key was replaced, otherwise 0
+            template <typename KeySpan, typename ValueSpan>
+            int put(const KeySpan& key, const ValueSpan& value)
+            {
+               return _ws._ws->upsert(_root, {key.data(), key.size()},
+                                      {value.data(), value.size()});
+            }
+
+           private:
+            friend class WriteSession;
+            Transaction(WriteSession& s) : _ws(s), _root(s._db._root) {}
+
+            std::shared_ptr<root> _root;
+            WriteSession&         _ws;
+         };  // WriteSession::Transaction
+
+         auto startTransaction() { return new Transaction(*this); }
+
+         // Opens the underlying write session. DB::_root must not be written here:
+         // DB constructs its WriteSession member before its own _root member
+         // exists, so assigning it from this constructor is undefined behavior.
+         // The DB constructor body loads the top root once every member is alive.
+         WriteSession(DB& d) : _db(d) { _ws = _db._db->start_write_session(); }
+
+         void validate() { _ws->validate(); }
+
+        private:
+         friend class Transaction;
+         friend class DB;
+
+         void setRoot(std::shared_ptr<root> r)
+         {
+            _ws->set_top_root(r);
+            _db.setRoot(std::move(r));
+         }
+
+         DB&                            _db;
+         std::shared_ptr<write_session> _ws;
+      };  // WriteSession
+
+      static std::shared_ptr<DB> open(Options opt, std::filesystem::path dir)
+      {
+         return std::make_shared<DB>(std::make_shared<database>(dir.c_str(), opt.config, database::read_write));
+      }
+
+      DB(std::shared_ptr<database> d) : _db(std::move(d)), _ws(*this)
+      {
+         _root           = _ws._ws->get_top_root();
+         _release_thread = std::thread([this]() { release_loop(); });
+      }
+
+      auto          createReadSession() { return std::make_shared<ReadSession>(std::ref(*this)); }
+      WriteSession& writeSession() { return _ws; }
+
+      root_ptr getRoot() const
+      {
+         root_ptr tmp;
+         {
+            std::shared_lock m(_root_mutex);
+            tmp = _root;
+         }
+         return tmp;
+      }
+      ~DB()
+      {
+         _done = true;
+         _release_thread.join();
+         _db->print_stats(std::cout, true);
+      }
+
+      void print() { _db->print_stats(std::cout, true); }
+      bool compact() { return _db->compact_next_segment(); }
+
+     private:  // DB
+      void setRoot(root_ptr p)
+      {
+         // Lock order: _root_mutex first, then _release_mutex; both are held to the end.
+         std::unique_lock root_guard(_root_mutex);
+         // After the swap `p` owns the previous root. It is parked on the release
+         // queue so that the reference is not dropped by this call.
+         _root.swap(p);
+         std::unique_lock queue_guard(_release_mutex);
+         _release_queue.push_back(std::move(p));
+         // TODO: notify release thread
+      }
+
+      std::shared_ptr<database> _db;
+      WriteSession              _ws;
+
+      // Runs on _release_thread until _done is set: drops queued roots one at a
+      // time, outside _release_mutex, sleeping 30ms whenever the queue is empty.
+      void release_loop()
+      {
+         while (not _done)
+         {
+            bool rest = false;
+            {
+               root_ptr tmp;
+               {
+                  std::unique_lock l(_release_mutex);
+                  if (_release_queue.empty())
+                     rest = true;
+                  else
+                  {
+                     tmp = std::move(_release_queue.front());
+                     _release_queue.pop_front();
+                  }
+               }
+            }  // tmp (and the root it holds) is released here, after the unlock
+            if (rest)
+            {
+               // TODO: wait on a condition variable instead of polling
+               using namespace std::chrono_literals;
+               std::this_thread::sleep_for(30ms);
+            }
+         }
+         // clean up: _release_queue is guarded by _release_mutex, not _root_mutex
+         std::unique_lock l(_release_mutex);
+         _release_queue.clear();
+      }
+
+      std::atomic<bool>         _done{false};  // explicit: before C++20 a default-constructed atomic is uninitialized
+      std::thread               _release_thread;
+      std::list<root_ptr>       _release_queue;
+      mutable std::shared_mutex _root_mutex;
+      mutable std::shared_mutex _release_mutex;
+      root_ptr                  _root;
+   };
+
+}  // namespace triedent
diff --git a/libraries/triedent/include/triedent/debug.hpp b/libraries/triedent/include/triedent/debug.hpp
index f7cdcbe1d..8d975c834 100644
--- a/libraries/triedent/include/triedent/debug.hpp
+++ b/libraries/triedent/include/triedent/debug.hpp
@@ -4,11 +4,14 @@
 #include <iostream>
 #include <thread>
 //#include <syncstream>
+// #undef NDEBUG
+#include <cassert>
 
 namespace triedent
 {
    static constexpr bool debug_cache = false;
    static constexpr bool debug_gc    = false;
+   static constexpr bool debug_invariant = true;
 
    struct scope
    {
@@ -22,7 +25,7 @@ namespace triedent
       }
    };
 
-   inline const char* thread_name(const char* n = "default")
+   inline const char* thread_name(const char* n = nullptr)
    {
       static thread_local const char* thread_name = n;
       if (n)
@@ -42,6 +45,7 @@ namespace triedent
    }
 
    inline auto set_current_thread_name( const char* name ) { 
+      thread_name(name);
 #ifdef __APPLE__
       return pthread_setname_np(name); 
 #else
diff --git a/libraries/triedent/include/triedent/gc_queue.hpp b/libraries/triedent/include/triedent/gc_queue.hpp
index 0bcf26e9a..f57e499f0 100644
--- a/libraries/triedent/include/triedent/gc_queue.hpp
+++ b/libraries/triedent/include/triedent/gc_queue.hpp
@@ -103,14 +103,14 @@ namespace triedent
       // notify _queue_cond.
       static constexpr size_type wait_bit = ~(npos >> 1);
       friend class session;
-      std::mutex                         _session_mutex;
-      std::vector<session*>              _sessions;
-      std::mutex                         _queue_mutex;
-      std::condition_variable            _queue_cond;
-      std::atomic<size_type>             _end;
-      std::size_t                        _size;
-      std::vector<std::shared_ptr<void>> _queue;
-      bool                               _waiting;
+      alignas(64) std::mutex                    _session_mutex;
+      std::vector<session*>                     _sessions;
+      alignas(64) std::mutex                    _queue_mutex;
+      alignas(64) std::condition_variable       _queue_cond;
+      alignas(64) std::atomic<size_type>        _end;
+      std::size_t                               _size;
+      std::vector<std::shared_ptr<void>>        _queue;
+      bool                                      _waiting;
    };
 
    using gc_session = gc_queue::session;
diff --git a/libraries/triedent/include/triedent/id_allocator.hpp b/libraries/triedent/include/triedent/id_allocator.hpp
new file mode 100644
index 000000000..f18ce7d7b
--- /dev/null
+++ b/libraries/triedent/include/triedent/id_allocator.hpp
@@ -0,0 +1,236 @@
+#pragma once
+#include <triedent/block_allocator.hpp>
+#include <triedent/file_fwd.hpp>
+#include <triedent/mapping.hpp>
+#include <triedent/object_fwd.hpp>
+
+#include <mutex>
+
+namespace triedent
+{
+
+   inline constexpr uint64_t obj_val(node_type type, uint16_t ref)
+   {
+      object_info result{0};
+      // This is distinct from any valid offset
+      result._location = (1ull << object_info::location_rshift) - 1;
+      result._ref      = ref;
+      result._type     = static_cast<std::uint64_t>(type);
+      return result.to_int();
+   }
+   // Packs location = loc, reference count 0 and type node_type::undefined.
+   inline constexpr uint64_t free_val(uint64_t loc)
+   {
+      object_info packed{0};
+      packed._type     = static_cast<std::uint64_t>(node_type::undefined);
+      packed._ref      = 0;
+      packed._location = loc;
+      return packed.to_int();
+   }
+
+   /**
+    *  Allocates object ids across multiple threads with
+    *  minimal locking by simulating a hash table that grows
+    *  when collision rate gets too high. 
+    *
+    *  - alloc and free are thread safe and non-blocking except 
+    *  alloc will block if the load reaches 80% in order to grow
+    *  the backing file.
+    *
+    *  - free is constant time: two atomic operations with no memory ordering requirements
+    *  - alloc typically requires fetching 1 cache line and doing
+    *    less than 8 loads and 1 C&S and 1 fetch add, but occasionally (0.1-.01%)
+    *    may take 3 or 4 times as long.
+    *
+    *  There are no memory ordering requirements because the object's value
+    *  speaks for itself and is not guarding other memory.
+    */
+   class id_allocator
+   {
+     public:
+      static const uint32_t id_block_size = 1024 * 1024 * 128;
+      static_assert(id_block_size % 64 == 0, "should be divisible by cacheline");
+
+      inline static constexpr uint64_t extract_next_ptr(uint64_t x) { return (x >> 19); }
+      inline static constexpr uint64_t create_next_ptr(uint64_t x) { return (x << 19); }
+      /*
+      inline static constexpr uint64_t extract_next_ptr(uint64_t x)
+      {
+        // assert((x >> 15 & 3) == uint64_t(node_type::undefined));
+         return (x & object_info::location_mask) >> object_info::location_rshift;
+         //return (x >> object_info::location_rshift) & object_info::location_mask;
+      }
+      inline static constexpr uint64_t create_next_ptr(uint64_t x)
+      {
+         auto r = (x << object_info::location_rshift) | (uint64_t(node_type::undefined) << 15);
+         assert( extract_next_ptr(r) == x );
+         return r;
+      }
+      */
+
+      id_allocator(std::filesystem::path id_file)
+          : _data_dir(id_file),
+            _block_alloc(id_file, id_block_size, 8192 /*1TB*/),
+            _ids_header_file(id_file.native() + ".header", access_mode::read_write)
+      {
+         if (_ids_header_file.size() == 0)
+         {
+            _ids_header_file.resize(round_to_page(sizeof(ids_header)));
+            auto idh = new (_ids_header_file.data()) ids_header();
+            idh->_next_alloc.store(1);
+            idh->_end_id.store(0);
+            idh->_first_free.store(object_info(node_type::undefined, 0).to_int());
+         }
+         _idheader = reinterpret_cast<ids_header*>(_ids_header_file.data());
+      }
+
+      uint64_t get_capacity() const { return _idheader->_end_id.load(std::memory_order_relaxed); }
+
+      std::atomic<uint64_t>& get(object_id id)
+      {
+         auto abs_pos        = id.id * sizeof(uint64_t);
+         auto block_num      = abs_pos / id_block_size;
+         auto index_in_block = uint64_t(abs_pos) & uint64_t(id_block_size - 1);
+         auto ptr            = ((char*)_block_alloc.get(block_num)) + index_in_block;
+         return reinterpret_cast<std::atomic<uint64_t>&>(*ptr);
+      }
+
+      /**
+          * The value stored at the returned object_id is equal to
+          * alloc_session::default_id_value which indicates undefined type with
+          * a reference count of 1. If you store 0 at this location the allocator
+          * will think it is free and invariants about load capacity will be broken.
+          */
+      std::pair<std::atomic<uint64_t>&, object_id> get_new_id()
+      {
+         using new_id = std::pair<std::atomic<uint64_t>&, object_id>;
+
+         // Fallback when the free list is empty: take the next never-used id,
+         // growing the backing storage first if that id lies past its end.
+         auto brand_new = [&]()
+         {
+            object_id id{_idheader->_next_alloc.fetch_add(1, std::memory_order_relaxed)};
+            grow(id);  // ensure that there should be new id
+
+            auto& slot = get(id);
+            slot.store(obj_val(node_type::undefined, 1), std::memory_order_relaxed);
+            return new_id(slot, id);
+         };
+
+         // Popping from the free list is serialized by _alloc_mutex. free_id()
+         // pushes without taking it, so the head is still replaced with a
+         // compare-exchange that retries whenever a push got in between.
+         std::unique_lock<std::mutex> guard{_alloc_mutex};
+         auto&                        first_free = _idheader->_first_free;
+         uint64_t                     head = first_free.load(std::memory_order_acquire);
+         uint64_t                     reused = 0;
+         for (;;)
+         {
+            reused = extract_next_ptr(head);
+            if (reused == 0)
+            {
+               // nothing to recycle: release the mutex before the fallback path
+               guard.unlock();
+               return brand_new();
+            }
+            // the slot of a free id stores the value that becomes the new head
+            const uint64_t successor = get({reused}).load(std::memory_order_relaxed);
+            if (first_free.compare_exchange_strong(head, successor))
+               break;
+            // on failure `head` now holds the current list head; try again
+         }
+
+         auto& reused_slot = get({reused});
+         // store 1 = ref count 1 prevents object as being interpreted as unalloc
+         reused_slot.store(obj_val(node_type::undefined, 1), std::memory_order_relaxed);
+         return {reused_slot, {reused}};
+      }
+
+      void print_free_list()
+      {
+         uint64_t id = extract_next_ptr(_idheader->_first_free.load());
+         std::cerr << id;
+         while (id)
+         {
+            id = extract_next_ptr(get({id}));
+            std::cerr << ", " << id;
+         }
+         std::cerr << " END\n";
+      }
+
+      void free_id(object_id id)
+      {
+         auto& head_free_list = _idheader->_first_free;
+         auto& next_free      = get(id);
+         auto  new_head       = object_info(node_type::undefined, id.id).to_int();
+
+         uint64_t cur_head = _idheader->_first_free.load(std::memory_order_acquire);
+         assert(not(cur_head & object_info::ref_mask));
+         assert(not(next_free & object_info::ref_mask));
+         do
+         {
+            next_free.store(cur_head, std::memory_order_release);
+         } while (not head_free_list.compare_exchange_weak(cur_head, new_head, std::memory_order_release));
+         //print_free_list();
+      }
+
+      auto& get_mutex( object_id id ) {
+        return _locks[id.id&(8192-1)]; 
+      }
+
+     private:
+      friend class alloc_session;
+
+      void grow(object_id id)
+      {
+         // optimistic...
+         if ( id.id <
+             _idheader->_end_id.load(std::memory_order_relaxed))
+            return;
+
+         void* ptr;
+         {
+            std::lock_guard l{_grow_mutex};
+            if (id.id < _idheader->_end_id.load())
+               return;  // no need to grow, another thread grew first
+
+            //      std::cerr << "growing obj id db\n";
+            ptr = _block_alloc.get(_block_alloc.alloc());
+            _idheader->_end_id.store(_block_alloc.num_blocks() * _block_alloc.block_size() / 8, std::memory_order_release);
+         }  // don't hold lock while doing mlock
+
+         if (::mlock(ptr, id_block_size))
+         {
+            std::cerr << "WARNING: unable to mlock ID lookups\n";
+            ::madvise(ptr, id_block_size, MADV_RANDOM);
+         }
+      }
+
+      std::mutex            _alloc_mutex;
+      std::mutex            _grow_mutex;
+      std::filesystem::path _data_dir;
+      block_allocator       _block_alloc;
+
+      /**
+       * Mapped from disk to track meta data associated with the IDs
+       */
+      struct ids_header
+      {
+         uint64_t _magic      = 0;
+         uint64_t _block_size = id_block_size;
+
+         std::atomic<uint64_t> _next_alloc;  /// the next new ID to be allocated
+         std::atomic<uint64_t> _end_id;      /// the first ID beyond the end of file
+
+         // the lower 15 bits represent the alloc_session number of the last write
+         // the upper bits represent the index of the first free ID, the value
+         // stored at that index is the index of the next free ID or 0 if there
+         // are no unused ids available.
+         std::atomic<uint64_t> _first_free;  /// index of an ID that has the index of the next ID
+      };
+
+      ids_header* _idheader;
+      mapping     _ids_header_file;
+      std::mutex  _locks[8192];
+   };
+};  // namespace triedent
diff --git a/libraries/triedent/include/triedent/key6.hpp b/libraries/triedent/include/triedent/key6.hpp
new file mode 100644
index 000000000..39313aa22
--- /dev/null
+++ b/libraries/triedent/include/triedent/key6.hpp
@@ -0,0 +1,156 @@
+#pragma once
+#include <cassert>
+
+namespace triedent {
+
+   using key_view   = std::string_view;
+   using value_view = std::string_view;
+   using key_type   = std::string;
+   using value_type = key_type;
+
+   inline key_type from_key6(const key_view sixb);
+
+   template <typename KeyType>
+   inline void from_key6(const key_view sixb, KeyType& out);
+
+   // used to avoid malloc, because keys can be at most 256,
+   // this one change produced a 13% improvement with 12 threads
+   struct temp_key6
+   {
+      uint32_t _size = 0;  // number of bytes of _buffer currently in use
+      char     _buffer[128];
+
+      uint32_t    size() const { return _size; }
+      const char* begin() const { return _buffer; }
+      const char* end() const { return _buffer + _size; }
+      char*       begin() { return _buffer; }
+      char*       end() { return _buffer + _size; }
+      const char* data() const { return begin(); }
+      char*       data() { return begin(); }
+
+      // Appends [p, e); throws std::runtime_error if the result would not fit.
+      void append(const char* p, const char* e)
+      {
+         int s = e - p;
+         // reject reversed ranges as well as ranges that do not fit
+         if (s < 0 or _size + s > sizeof(_buffer))
+            throw std::runtime_error("key length overflow");
+         memcpy(end(), p, s);
+         _size += s;
+      }
+
+      // Appends one byte; throws std::runtime_error when the buffer is full.
+      void push_back(char c)
+      {
+         if (_size >= sizeof(_buffer))
+            throw std::runtime_error("key length overflow");
+         *end() = c;
+         ++_size;
+      }
+
+      // Sets the size without touching the bytes; throws std::runtime_error if
+      // s exceeds the capacity. A size equal to the capacity is valid, exactly
+      // as it is for append() and push_back().
+      void resize(uint32_t s)
+      {
+         if (s > sizeof(_buffer))
+            throw std::runtime_error("key length overflow");
+         _size = s;
+      }
+
+      // Copies [begin, end) to pos and grows the size by the copied length.
+      // Existing bytes are overwritten, not shifted. Bounds are only asserted.
+      void insert(char* pos, const char* begin, const char* end)
+      {
+         assert(pos >= _buffer and pos < _buffer + sizeof(_buffer));
+         assert(end >= begin and pos + (end - begin) <= _buffer + sizeof(_buffer));
+         memcpy(pos, begin, end - begin);
+         _size += end - begin;
+      }
+
+      temp_key6() : _size(0) {}
+
+     private:
+      temp_key6(const temp_key6&) = delete;  // should not be copied
+   };
+
+   inline key_type from_key6(const key_view sixb)
+   {
+      key_type tmp;
+      from_key6(sixb, tmp);
+      return tmp;
+   }
+
+   // Decodes 6-bit symbols (one per byte of sixb) into 8-bit bytes; out is resized
+   // to floor(6 * sixb.size() / 8) and bits that do not fill a byte are dropped.
+   template <typename KeyType>
+   inline void from_key6(const key_view sixb, KeyType& out)
+   {
+      out.resize((sixb.size() * 6) / 8);
+
+      const uint8_t* pos6     = (uint8_t*)sixb.data();
+      const uint8_t* pos6_end = (uint8_t*)sixb.data() + sixb.size();
+      uint8_t*       pos8     = (uint8_t*)out.data();
+
+      while (pos6_end - pos6 >= 4)
+      {
+         pos8[0] = (pos6[0] << 2) | (pos6[1] >> 4);  // 6 + 2t
+         pos8[1] = (pos6[1] << 4) | (pos6[2] >> 2);  // 4b + 4t
+         pos8[2] = (pos6[2] << 6) | pos6[3];         // 2b + 6
+         pos6 += 4;
+         pos8 += 3;
+      }
+      switch (pos6_end - pos6)
+      {
+         case 3:  // 18 bits left: two whole bytes
+            pos8[0] = (pos6[0] << 2) | (pos6[1] >> 4);  // 6 + 2t
+            pos8[1] = (pos6[1] << 4) | (pos6[2] >> 2);  // 4b + 4t
+            break;
+         case 2:  // 12 bits left: one whole byte
+            pos8[0] = (pos6[0] << 2) | (pos6[1] >> 4);  // 6 + 2t
+            break;
+         case 1:  // 6 bits left: no whole byte, and out has no room for one
+         default:
+            break;
+      }
+   }
+   // Re-encodes the bytes of v as 6-bit symbols stored one per byte in key_buf.
+   // A partial final symbol is padded with zero bits on the right. The returned
+   // view refers to key_buf's storage.
+   inline key_view to_key6(key_type& key_buf, key_view v)
+   {
+      const uint32_t total_bits = v.size() * 8;
+      key_buf.resize((total_bits + 5) / 6);  // round up to whole symbols
+
+      const uint8_t* src  = (uint8_t*)v.data();
+      uint8_t*       dst  = (uint8_t*)key_buf.data();
+      auto           left = v.size();  // input bytes not yet encoded
+
+      // whole groups: 3 input bytes become 4 symbols
+      for (; left >= 3; left -= 3, src += 3, dst += 4)
+      {
+         dst[0] = src[0] >> 2;
+         dst[1] = (src[0] & 0x3) << 4 | src[1] >> 4;
+         dst[2] = (src[1] & 0xf) << 2 | (src[2] >> 6);
+         dst[3] = src[2] & 0x3f;
+      }
+
+      // tail: 1 or 2 leftover bytes; nothing to do when the input length
+      // was a multiple of 3
+      if (left == 2)
+      {
+         dst[0] = src[0] >> 2;
+         dst[1] = (src[0] & 0x3) << 4 | src[1] >> 4;
+         dst[2] = (src[1] & 0xf) << 2;
+      }
+      else if (left == 1)
+      {
+         dst[0] = src[0] >> 2;
+         dst[1] = (src[0] & 0x3) << 4;
+      }
+
+      return {key_buf.data(), key_buf.size()};
+   }
+
+
+}
diff --git a/libraries/triedent/include/triedent/lehmer64.h b/libraries/triedent/include/triedent/lehmer64.h
new file mode 100644
index 000000000..401472082
--- /dev/null
+++ b/libraries/triedent/include/triedent/lehmer64.h
@@ -0,0 +1,88 @@
+#ifndef LEHMER64_H
+#define LEHMER64_H
+
+
+/**
+* D. H. Lehmer, Mathematical methods in large-scale computing units.
+* Proceedings of a Second Symposium on Large Scale Digital Calculating
+* Machinery;
+* Annals of the Computation Laboratory, Harvard Univ. 26 (1951), pp. 141-146.
+*
+* P L'Ecuyer,  Tables of linear congruential generators of different sizes and
+* good lattice structure. Mathematics of Computation of the American
+* Mathematical
+* Society 68.225 (1999): 249-260.
+*/
+struct lehmer64_rng
+{
+   // Seeds the 128-bit state from two splitmix64 outputs derived from seed.
+   lehmer64_rng(uint64_t seed)
+   {
+      _lehmer64_state =
+          (((__uint128_t)splitmix64_stateless(seed, 0)) << 64) + splitmix64_stateless(seed, 1);
+   }
+
+   // Returns the high 64 bits of the multiplied state, then rebuilds the state from them.
+   uint64_t next()
+   {
+      _lehmer64_state *= 0xda942042e4dd58b5ull;
+      const uint64_t r = static_cast<uint64_t>(_lehmer64_state >> 64);
+      _lehmer64_state =
+          (((__uint128_t)splitmix64_stateless(r, 0)) << 64) + splitmix64_stateless(r, 1);
+      return r;
+   }
+
+  private:
+   __uint128_t _lehmer64_state;
+
+   // state for splitmix64 (zero-initialized so it is never read while indeterminate)
+   uint64_t splitmix64_x = 0; /* The state can be seeded with any value. */
+
+   // call this one before calling splitmix64
+   inline void splitmix64_seed(uint64_t seed) { splitmix64_x = seed; }
+
+   // floor( ( (1+sqrt(5))/2 ) * 2**64 MOD 2**64)
+   static const uint64_t golden_gamma = 0x9E3779B97F4A7C15ull;
+
+   // returns random number, modifies seed[0]
+   // compared with D. Lemire against
+   // http://grepcode.com/file/repository.grepcode.com/java/root/jdk/openjdk/8-b132/java/util/SplittableRandom.java#SplittableRandom.0gamma
+   static inline uint64_t splitmix64_r(uint64_t* seed)
+   {
+      uint64_t z = (*seed += golden_gamma);
+      // David Stafford's Mix13 for MurmurHash3's 64-bit finalizer
+      z = (z ^ (z >> 30)) * 0xBF58476D1CE4E5B9ull;
+      z = (z ^ (z >> 27)) * 0x94D049BB133111EBull;
+      return z ^ (z >> 31);
+   }
+
+   // returns random number, modifies splitmix64_x
+   inline uint64_t splitmix64(void) { return splitmix64_r(&splitmix64_x); }
+
+   // returns the 32 least significant bits of a call to splitmix64
+   // this is a simple (inlined) function call followed by a cast
+   inline uint32_t splitmix64_cast32(void) { return (uint32_t)splitmix64(); }
+
+   // returns the value of splitmix64 "offset" steps from seed (uses no member state)
+   static inline uint64_t splitmix64_stateless(uint64_t seed, uint64_t offset)
+   {
+      seed += offset * golden_gamma;
+      return splitmix64_r(&seed);
+   }
+};
+
+/*
+static inline void lehmer64_seed(uint64_t seed)
+{
+   g_lehmer64_state =
+       (((__uint128_t)splitmix64_stateless(seed, 0)) << 64) + splitmix64_stateless(seed, 1);
+}
+
+static inline uint64_t lehmer64()
+{
+   g_lehmer64_state *= UINT64_C(0xda942042e4dd58b5);
+   return g_lehmer64_state >> 64;
+}
+*/
+
+#endif
diff --git a/libraries/triedent/include/triedent/location_lock.hpp b/libraries/triedent/include/triedent/location_lock.hpp
index b830985b0..d9403f6d0 100644
--- a/libraries/triedent/include/triedent/location_lock.hpp
+++ b/libraries/triedent/include/triedent/location_lock.hpp
@@ -107,7 +107,9 @@ namespace triedent
           (64 - sizeof(_waiting) - 2 * sizeof(_mutex)) / sizeof(object_id);
       object_id _locked_ids[max_locks];
    };
-   static_assert(sizeof(location_mutex) == 64);
+  // TODO: Why do we care about the size, moving the atomics to alignas(64) prevents
+  // false cacheline sharing... 
+  //static_assert(sizeof(location_mutex) == 64);
 
    class location_lock
    {
diff --git a/libraries/triedent/include/triedent/mapping.hpp b/libraries/triedent/include/triedent/mapping.hpp
index dd4a01484..36a416d66 100644
--- a/libraries/triedent/include/triedent/mapping.hpp
+++ b/libraries/triedent/include/triedent/mapping.hpp
@@ -4,6 +4,7 @@
 #include <filesystem>
 #include <memory>
 #include <utility>
+#include <sys/mman.h>
 
 namespace triedent
 {
@@ -14,6 +15,39 @@ namespace triedent
       read_write = 1
    };
 
+   /**
+    * For ACID **Durability** requirements this configures
+    * how aggressive triedent will be in flushing data to disk.
+    * 
+    * 0. none - msync() will not be called and data will be
+    *      lost if the computer crashes. So long as the OS
+    *      doesn't crash your data is safe even if your
+    *      program crashes.
+    * 1. async - msync(MS_ASYNC) will be used which will tell
+    *      the OS to write as soon as possible without blocking
+    *      the caller. This will write data frequently, and
+    *      churn the SSD more than none. 
+    * 2. sync - msync(MS_SYNC) will be used to block caller
+    *      when they update the top-root. In this mode the
+    *      database is "guaranteed" to be recoverable assuming
+    *      the OS didn't silently buffer and the disks didn't
+    *      silently buffer contrary to the implied behavior
+    *      of msync()
+    *
+    */
+   enum sync_type
+   {
+      none     = 0,  // on program close or as OS chooses
+      async    = 1,  // nonblocking, but write soon
+      sync     = 2   // block until changes are committed to disk
+   };
+   // none is implemented by specifying both MS_ASYNC and MS_SYNC, which will
+   // cause msync to fail if not checked.
+   inline int msync_flag(sync_type st ) {
+      static int flags[] = { MS_ASYNC|MS_SYNC, MS_ASYNC, MS_SYNC };
+      return flags[(int)st];
+   };
+
    // Thread safety:
    //
    // The file must not be resized by another process
@@ -48,6 +82,12 @@ namespace triedent
       std::size_t           size() const { return _size; }
       bool                  pinned() const { return _pinned; }
       access_mode           mode() const { return _mode; }
+      void                  sync( sync_type st = sync_type::sync) {
+         // none maps to MS_ASYNC|MS_SYNC (non-zero, rejected by msync), so test st itself
+         if( st == sync_type::none ) return;
+         if( msync( data(), size(), msync_flag(st) ) )
+            throw std::runtime_error( "mapping.hpp: msync returned -1" );
+      }
 
      private:
       std::atomic<void*> _data;
diff --git a/libraries/triedent/include/triedent/node.hpp b/libraries/triedent/include/triedent/node.hpp
index 01af6bf56..595072b76 100644
--- a/libraries/triedent/include/triedent/node.hpp
+++ b/libraries/triedent/include/triedent/node.hpp
@@ -1,6 +1,6 @@
 #pragma once
-#include <triedent/cache_allocator.hpp>
 #include <triedent/debug.hpp>
+#include <triedent/seg_allocator.hpp>
 
 #include <cstring>
 
@@ -13,17 +13,13 @@ namespace triedent
    using key_type   = std::string;
    using value_type = key_type;
 
-   object_id bump_refcount_or_copy(cache_allocator& ra,
-                                   std::unique_lock<gc_session>&,
-                                   object_id id);
+   using session_rlock = seg_allocator::session::read_lock;
+   template <typename T = char>
+   using object_ref = session_rlock::object_ref<T>;
 
-   class node
-   {
-     public:
-      inline uint8_t key_size() const { return (*reinterpret_cast<const uint8_t*>(this)); }
-   };
+   object_id bump_refcount_or_copy(session_rlock& state, object_id id);
 
-   class value_node : public node
+   class value_node 
    {
      public:
       inline uint32_t    key_size() const { return _key_size; }
@@ -51,61 +47,56 @@ namespace triedent
       inline value_view data() const { return value_view(data_ptr(), data_size()); }
       inline key_view   key() const { return key_view(key_ptr(), key_size()); }
 
-      inline static std::pair<location_lock, value_node*> make(
-          cache_allocator&              a,
-          std::unique_lock<gc_session>& session,
-          key_view                      key,
-          value_view                    val,
-          node_type                     type)
+      inline static object_ref<value_node> make(session_rlock& state,
+                                                key_view       key,
+                                                value_view     val,
+                                                node_type      type)
       {
          assert(val.size() < 0xffffff - key.size() - sizeof(value_node));
          uint32_t alloc_size = sizeof(value_node) + key.size() + val.size();
-         auto     r          = a.alloc(session, alloc_size, type);
+         auto     r          = state.alloc(alloc_size, type);
          if constexpr (debug_nodes)
-            std::cout << r.first.get_id().id << ": construct value_node: type=" << (int)type
-                      << std::endl;
-         return std::make_pair(std::move(r.first), new (r.second) value_node(a, key, val));
+            std::cout << r.id().id << ": construct value_node: type=" << (int)type
+                      << " ref = " << r.ref_count() << std::endl;
+         new (r.data()) value_node(key, val);
+         return r;
       }
 
       // If id is non-null, it must refer to a source object that is being copied
       // Otherwise, key and value must be pointers to external memory
-      inline static std::pair<location_lock, value_node*> clone(
-          cache_allocator&              a,
-          std::unique_lock<gc_session>& session,
-          object_id                     id,
-          key_view                      key,
-          std::uint32_t                 key_offset,
-          value_view                    val,
-          node_type                     type)
+      inline static object_ref<value_node> clone(session_rlock& state,
+                                                 object_id      id,
+                                                 key_view       key,
+                                                 std::uint32_t  key_offset,
+                                                 value_view     val,
+                                                 node_type      type)
       {
          if (id && type == node_type::roots)
          {
-            return clone_roots(a, session, id, key, key_offset, val, type);
+            return clone_roots(state, id, key, key_offset, val, type);
          }
          else
          {
-            return clone_bytes(a, session, id, key, key_offset, val, type);
+            return clone_bytes(state, id, key, key_offset, val, type);
          }
       }
 
-      inline static std::pair<location_lock, value_node*> clone_bytes(
-          cache_allocator&              a,
-          std::unique_lock<gc_session>& session,
-          object_id                     id,
-          key_view                      key,
-          std::uint32_t                 key_offset,
-          value_view                    val,
-          node_type                     type)
+      inline static object_ref<value_node> clone_bytes(session_rlock& state,
+                                                       object_id      id,
+                                                       key_view       key,
+                                                       std::uint32_t  key_offset,
+                                                       value_view     val,
+                                                       node_type      type)
       {
          if (key_offset != std::uint32_t(-1))
             key = key.substr(key_offset);
          assert(val.size() < 0xffffff - key.size() - sizeof(value_node));
          uint32_t alloc_size = sizeof(value_node) + key.size() + val.size();
          // alloc invalidates key and val
-         auto r = a.alloc(session, alloc_size, type);
+         auto r = state.alloc(alloc_size, type);
          if (id)
          {
-            auto ptr = get(a, session, id);
+            auto ptr = state.get(id).as<value_node>();
             if (key_offset != std::uint32_t(-1))
             {
                key = ptr->key().substr(key_offset);
@@ -113,18 +104,20 @@ namespace triedent
             val = ptr->data();
          }
          if constexpr (debug_nodes)
-            std::cout << r.first.get_id().id << ": construct value_node: type=" << (int)type
-                      << std::endl;
-         return std::make_pair(std::move(r.first), new (r.second) value_node(a, key, val));
+            std::cout << r.id().id << ": construct value_node: type=" << (int)type << std::endl;
+         new (r.data()) value_node(key, val);
+      //   r.obj()->update_checksum();
+         return r;
       }
-      inline static std::pair<location_lock, value_node*> clone_roots(
-          cache_allocator&              a,
-          std::unique_lock<gc_session>& session,
-          object_id                     id,
-          key_view                      key,
-          std::uint32_t                 key_offset,
-          value_view                    val,
-          node_type                     type)
+
+      // TODO: all clone functions should take object_ref instead of id to avoid
+      // having to look up the object twice!
+      inline static object_ref<value_node> clone_roots(session_rlock& state,
+                                                       object_id      id,
+                                                       key_view       key,
+                                                       std::uint32_t  key_offset,
+                                                       value_view     val,
+                                                       node_type      type)
       {
          const std::size_t value_size = val.size();
          if (key_offset != std::uint32_t(-1))
@@ -137,30 +130,33 @@ namespace triedent
          // copy_node or alloc invalidates key and val
          for (std::size_t i = 0; i < n; ++i)
          {
-            roots[i] = bump_refcount_or_copy(a, session, roots[i]);
+            roots[i] = bump_refcount_or_copy(state, roots[i]);
          }
-         auto r = a.alloc(session, alloc_size, type);
+         auto r = state.alloc(alloc_size, type);
          {
             if (key_offset != std::uint32_t(-1))
             {
-               auto ptr = get(a, session, id);
-               key      = ptr->key().substr(key_offset);
+               auto& in = state.get(id).as_value_node();
+               key      = in.key().substr(key_offset);
             }
             val = {reinterpret_cast<const char*>(&roots[0]), value_size};
          }
          if constexpr (debug_nodes)
-            std::cout << r.first.get_id().id << ": construct value_node: type=" << (int)type
-                      << std::endl;
-         return std::make_pair(std::move(r.first), new (r.second) value_node(a, key, val));
+            std::cout << r.id().id << ": construct value_node: type=" << (int)type << std::endl;
+         new (r.data()) value_node(key, val);  // construct in place, as make()/clone_bytes() do; otherwise key and val are computed but never written
+         return r;
       }
 
      private:
-      static value_node* get(cache_allocator& a, session_lock_ref<> session, object_id id)
+#if 0  // this shouldn't be needed any more
+      static value_node* get(session_rlock& state, object_id id)
       {
-         auto [ptr, type, ref] = a.get_cache<false>(session, id);
-         return reinterpret_cast<value_node*>(ptr);
+         //auto [ptr, type, ref] = a.get_cache<false>(session, id);
+         auto val = state.get(id, false /* NO COPY */);
+         return reinterpret_cast<value_node*>(val.obj());
       }
-      value_node(cache_allocator& ra, key_view key, value_view val)
+#endif
+      value_node(key_view key, value_view val)
       {
          _key_size = key.size();
          if (!key.empty())
@@ -173,7 +168,7 @@ namespace triedent
    };
    static_assert(sizeof(value_node) == 1, "unexpected padding");
 
-   class inner_node : public node
+   class inner_node 
    {
      public:
       inline object_id&       branch(uint8_t b);
@@ -196,25 +191,22 @@ namespace triedent
       inline int8_t  reverse_lower_bound(uint8_t b) const;
       inline uint8_t upper_bound(uint8_t b) const;
 
-      inline static std::pair<location_lock, inner_node*> clone(
-          cache_allocator&              a,
-          std::unique_lock<gc_session>& session,
-          object_id                     id,
-          const inner_node*             in,
-          key_view                      key,
-          std::uint32_t                 key_offset,
-          object_id                     value,
-          std::uint64_t                 branches);
-
-      inline static std::pair<location_lock, inner_node*> make(
-          cache_allocator&              a,
-          std::unique_lock<gc_session>& session,
-          key_view                      prefix,
-          object_id                     val,
-          uint64_t                      branches);
+      inline static object_ref<inner_node> clone(session_rlock&    state,
+                                                 object_id         id,
+                                                 const inner_node* in,
+                                                 key_view          key,
+                                                 std::uint32_t     key_offset,
+                                                 object_id         value,
+                                                 std::uint64_t     branches);
+
+      inline static object_ref<inner_node> make(session_rlock& state,
+                                                key_view       prefix,
+                                                object_id      val,
+                                                uint64_t       branches);
 
       inline bool has_branch(uint32_t b) const { return _present_bits & (1ull << b); }
 
+      inline uint8_t key_size() const { return _prefix_length; }
       inline key_view key() const { return key_view(key_ptr(), key_size()); }
 
       inline int32_t     branch_index(uint32_t branch) const;
@@ -227,11 +219,14 @@ namespace triedent
       }
 
      private:
-      static inner_node* get(cache_allocator& a, session_lock_ref<> session, object_id id)
+#if 0
+      static inner_node* get(session_rlock& state, object_id id)
       {
-         auto [ptr, type, ref] = a.get_cache<false>(session, id);
-         return reinterpret_cast<inner_node*>(ptr);
+         auto ptr = a.get(id, false);  // TODO: why not copy here?
+         return reinterpret_cast<inner_node*>(ptr.obj());
       }
+#endif
+
       inner_node(object_id  id,
                  key_view   prefix,
                  object_id  val,
@@ -242,20 +237,19 @@ namespace triedent
       uint8_t   _prefix_length = 0;  // mirrors value nodes to signal type and prefix length
       uint8_t   _reserved_a    = 0;  // future use
       uint8_t   _reserved_b    = 0;  // future use
+      uint8_t   _reserved_c    = 0;  // future use
       object_id _value;              // this is 5 bytes
       uint64_t  _present_bits = 0;   // keep this 8 byte aligned for popcount instructions
    } __attribute__((packed));
-   static_assert(sizeof(inner_node) == 3 + 5 + 8, "unexpected padding");
-
-   inline std::pair<location_lock, inner_node*> inner_node::clone(
-       cache_allocator&              a,
-       std::unique_lock<gc_session>& session,
-       object_id                     id,
-       const inner_node*             in,
-       key_view                      key,
-       std::uint32_t                 key_offset,
-       object_id                     value,
-       std::uint64_t                 branches)
+   static_assert(sizeof(inner_node) == 4 + sizeof(object_id)  + 8, "unexpected padding");
+
+   inline object_ref<inner_node> inner_node::clone(session_rlock&    state,
+                                                   object_id         id,
+                                                   const inner_node* in,
+                                                   key_view          key,
+                                                   std::uint32_t     key_offset,
+                                                   object_id         value,
+                                                   std::uint64_t     branches)
    {
       if (key_offset != std::uint32_t(-1))
          key = key.substr(key_offset);
@@ -264,7 +258,7 @@ namespace triedent
       object_id         children[n + 1];
       if (in->_present_bits == branches)
       {
-         std::memcpy(&children[0], in->children(), sizeof(children));
+         std::memcpy(&children[0], in->children(), n * sizeof(object_id));
       }
       else
       {
@@ -281,40 +275,42 @@ namespace triedent
          }
       }
       // invalidates in and prefix
-      value = bump_refcount_or_copy(a, session, value);
+      value = bump_refcount_or_copy(state, value);
       for (std::size_t i = 0; i < n; ++i)
       {
-         children[i] = bump_refcount_or_copy(a, session, children[i]);
+         children[i] = bump_refcount_or_copy(state, children[i]);
       }
-      auto p = a.alloc(session, alloc_size, node_type::inner);
+      auto p = state.alloc(alloc_size, node_type::inner);
       if (key_offset != std::uint32_t(-1))
       {
-         in  = get(a, session, id);
+         in = state.get(id).as<inner_node>();
+         // TODO: cache?
          key = in->key().substr(key_offset);
       }
 
-      auto newid = p.first.get_id();
+      auto newid = p.id();
       if constexpr (debug_nodes)
-         std::cout << newid.id << ": construct inner_node" << std::endl;
-      return std::make_pair(std::move(p.first),
-                            new (p.second) inner_node(newid, key, value, branches, children));
+         std::cout << newid.id << ": construct inner_node " << std::endl;
+
+      new (p.data()) inner_node(newid, key, value, branches, children);
+      return p;
    }
 
-   inline std::pair<location_lock, inner_node*> inner_node::make(
-       cache_allocator&              a,
-       std::unique_lock<gc_session>& session,
-       key_view                      prefix,
-       object_id                     val,
-       uint64_t                      branches)
+   inline object_ref<inner_node> inner_node::make(session_rlock& state,
+                                                  key_view       prefix,
+                                                  object_id      val,
+                                                  uint64_t       branches)
    {
       uint32_t alloc_size =
           sizeof(inner_node) + prefix.size() + std::popcount(branches) * sizeof(object_id);
-      auto p  = a.alloc(session, alloc_size, node_type::inner);
-      auto id = p.first.get_id();
+      auto p  = state.alloc(alloc_size, node_type::inner);
+      auto id = p.id();
       if constexpr (debug_nodes)
-         std::cout << id.id << ": construct inner_node" << std::endl;
-      return std::make_pair(std::move(p.first),
-                            new (p.second) inner_node(id, prefix, val, branches));
+         std::cout << p.id().id << ": construct inner_node val=" << val.id
+                   << " ref: " << p.ref_count() << std::endl;
+
+      new (p.data()) inner_node(id, prefix, val, branches);
+      return p;
    }
 
    inline inner_node::inner_node(object_id id, key_view prefix, object_id val, uint64_t branches)
@@ -395,17 +391,45 @@ namespace triedent
       return b >= 63 ? 64 : std::countr_zero(_present_bits & mask);
    }
 
-   inline void release_node(session_lock_ref<> l, cache_allocator& ra, object_id obj)
+   // makes sure all nodes are reachable with a ref-count of 1+
+   // TODO make sure all object hashes check out
+   inline bool validate_node(session_rlock& state, object_id obj, int depth = 0)
    {
-      if (!obj)
-         return;
-      auto [ptr, type] = ra.release(l, obj);
-      if (ptr && type == node_type::inner)
+      if (not obj.id)
+      {
+         return true;
+      }
+      auto oref = state.get(obj);  // don't try to cache, we are only validating
+                                   //
+      if (not oref.ref_count())
+      {
+         throw std::runtime_error("0 ref count!");
+      }
+
+      if( not oref.obj()->validate_checksum() ) {
+         std::cout << obj.id <<": validate checkusm failed "<<oref.obj()->check <<" != " << oref.obj()->calculate_checksum() <<"\n";
+         return false;
+      } 
+      auto  ctype = oref.type();
+      auto& in    = oref.as_inner_node();
+
+      bool error = false;
+      auto oj = oref.obj();
+      if (oj->get_type() != oref.type())
+      {
+         std::cerr << "obj: " << obj.id << " invariant violation id.type (" << (int)oref.type()
+                   << ") and obj->type (" << (int)oj->get_type() << ") are not equal!\n";
+         std::cerr << "             obj->size: " << oj->size <<"  id: " << oj->id<<" ";
+         std::cerr << " refc: " << oref.ref_count() <<" check: " << oj->check <<"\n";
+         error = true;
+      }
+
+      if (ctype == node_type::inner)
       {
-         auto& in = *reinterpret_cast<inner_node*>(ptr);
          if constexpr (debug_nodes)
-            std::cout << obj.id << ": destroying; release value " << in.value().id << std::endl;
-         release_node(l, ra, in.value());
+            std::cout << obj.id << ": validating; inner value " << in.value().id << std::endl;
+         if (not validate_node(state, in.value(), depth+1))
+            return false;
          auto nb  = in.num_branches();
          auto pos = in.children();
          auto end = pos + nb;
@@ -413,57 +437,109 @@ namespace triedent
          {
             assert(*pos);
             if constexpr (debug_nodes)
-               std::cout << obj.id << ": destroying; release child " << pos->id << std::endl;
-            release_node(l, ra, *pos);
+               std::cout << obj.id << ": validating; inner child child " << pos->id << std::endl;
+            if (not validate_node(state, *pos, depth+1))
+               return false;
             ++pos;
          }
       }
-      if (ptr && type == node_type::roots)
+      else if (ctype == node_type::roots)
       {
-         auto& vn    = *reinterpret_cast<value_node*>(ptr);
+         auto& vn    = reinterpret_cast<const value_node&>(in);
          auto  n     = vn.num_roots();
          auto  roots = vn.roots();
          while (n--)
          {
             if constexpr (debug_nodes)
                std::cout << obj.id << ": destroying; release root " << roots->id << std::endl;
-            release_node(l, ra, *roots++);
+            if (not validate_node(state, *roots++, depth+1))
+               return false;
+         }
+      }
+      else if (ctype == node_type::bytes)
+      {
+         auto& vn    = reinterpret_cast<const value_node&>(in);
+         if( error )
+         TRIEDENT_WARN( "value.key_size(): ", vn.key_size(), " data size: " , 
+                        vn.data_size(), " depth: ", depth );
+      }
+      else
+      {
+         throw std::runtime_error("validating unknown node type");
+         return false;
+      }
+      return true;
+   }
+   inline void release_node(session_rlock& state, object_id obj)
+   {
+      if (!obj)
+         return;
+      auto oref  = state.get(obj);  // don't try to cache, we are releasing!
+      auto ctype = oref.type();
+
+      //      std::cerr << "before release node: " << obj.id <<" type: " << (int)oref.type() <<" loc: " << oref.location()._offset <<" ref: " << oref.ref_count()<<"\n";
+
+      // save the pointer in advance, because if released oref will return null
+      // the in pointer is still valid for the duration of state
+      auto& in = oref.as_inner_node();
+      if (oref.release())
+      {
+         if (ctype == node_type::inner)
+         {
+            if constexpr (debug_nodes)
+               std::cout << obj.id << ": destroying; release value " << in.value().id << std::endl;
+            release_node(state, in.value());
+            auto nb  = in.num_branches();
+            auto pos = in.children();
+            auto end = pos + nb;
+            while (pos != end)
+            {
+               assert(*pos);
+               if constexpr (debug_nodes)
+                  std::cout << obj.id << ": destroying; release child " << pos->id << std::endl;
+               release_node(state, *pos);
+               ++pos;
+            }
+         }
+         else if (ctype == node_type::roots)
+         {
+            auto& vn    = reinterpret_cast<const value_node&>(in);  //oref.as_value_node();
+            auto  n     = vn.num_roots();
+            auto  roots = vn.roots();
+            while (n--)
+            {
+               if constexpr (debug_nodes)
+                  std::cout << obj.id << ": destroying; release root " << roots->id << std::endl;
+               release_node(state, *roots++);
+            }
          }
       }
    }
 
-   inline location_lock copy_node(cache_allocator&              ra,
-                                  std::unique_lock<gc_session>& session,
-                                  object_id                     id,
-                                  void*                         ptr,
-                                  node_type                     type)
+   inline object_ref<node> copy_node(session_rlock& state, object_ref<node> oref)
    {
-      if (type != node_type::inner)
+      if (oref.type() != node_type::inner)  // value or roots
       {
-         auto src          = reinterpret_cast<value_node*>(ptr);
-         auto [lock, dest] = value_node::clone(ra, session, id, src->key(), 0, src->data(), type);
-         return std::move(lock);
+         auto& src = oref.as_value_node();
+         return value_node::clone(state, oref.id(), src.key(), 0, src.data(), oref.type());
       }
       else
       {
-         auto src = reinterpret_cast<inner_node*>(ptr);
-         auto [lock, dest] =
-             inner_node::clone(ra, session, id, src, src->key(), 0, src->value(), src->branches());
-         return std::move(lock);
+         auto& src = oref.as_inner_node();
+         return inner_node::clone(state, oref.id(), &src, src.key(), 0, src.value(),
+                                  src.branches());
       }
    }
 
-   inline object_id bump_refcount_or_copy(cache_allocator&              ra,
-                                          std::unique_lock<gc_session>& session,
-                                          object_id                     id)
+   inline object_id bump_refcount_or_copy(session_rlock& state, object_id id)
    {
       if (!id)
          return id;
       if constexpr (debug_nodes)
          std::cout << id.id << ": bump_refcount_or_copy" << std::endl;
-      if (ra.bump_count(id))
-         return id;
-      auto [ptr, type, ref] = ra.get_cache<false>(session, id);
-      return copy_node(ra, session, id, ptr, type).get_id();
+      auto oref = state.get(id);  // TODO cache?
+      if (oref.retain())
+         return oref.id();
+      return copy_node(state, oref).id();
    }
 }  // namespace triedent
diff --git a/libraries/triedent/include/triedent/object_db.hpp b/libraries/triedent/include/triedent/object_db.hpp
index 76c33e92e..120bf2bc9 100644
--- a/libraries/triedent/include/triedent/object_db.hpp
+++ b/libraries/triedent/include/triedent/object_db.hpp
@@ -10,7 +10,6 @@
 #include <triedent/debug.hpp>
 #include <triedent/file_fwd.hpp>
 #include <triedent/gc_queue.hpp>
-#include <triedent/location_lock.hpp>
 #include <triedent/mapping.hpp>
 #include <triedent/object_fwd.hpp>
 
@@ -32,7 +31,6 @@ namespace triedent
       std::uint64_t          offset() const { return _offset * 8; }
       constexpr object_info& set_location(object_location loc)
       {
-         cache   = loc.cache;
          _offset = loc.offset / 8;
          return *this;
       }
@@ -40,21 +38,9 @@ namespace triedent
       {
          return ref | (_type << 15) | (cache << 17) | (_offset << 19);
       }
-      constexpr operator object_location() const { return {.offset = _offset * 8, .cache = cache}; }
+      constexpr operator object_location() const { return {.offset = _offset * 8}; }
    };
 
-   struct mutex_group
-   {
-      static constexpr std::size_t count = 64;
-      static constexpr std::size_t align = 64;
-      explicit mutex_group() : _items(new location_mutex[count]) {}
-      location_mutex& operator()(void* base, void* ptr) const
-      {
-         auto diff = reinterpret_cast<std::uintptr_t>(ptr) - reinterpret_cast<std::uintptr_t>(base);
-         return _items[(diff / align) % count];
-      }
-      std::unique_ptr<location_mutex[]> _items;
-   };
 
    /**
     * Assignes unique ids to objects, tracks their reference counts,
@@ -62,7 +48,7 @@ namespace triedent
     */
    class object_db
    {
-      friend location_lock;
+  //    friend location_lock;
 
      public:
       using object_id = triedent::object_id;
@@ -92,34 +78,8 @@ namespace triedent
          return true;
       }
 
-      // A thread which holds a location_lock may:
-      // * Move the object to another location
-      // * Modify the object if it's not already exposed to reader threads
-
-      // Only acquire the lock if id points to loc
-      location_lock lock(object_id id, object_location loc)
-      {
-         auto* h      = header();
-         auto& atomic = h->objects[id.id];
-         // If the object has already been moved, don't bother locking
-         if (object_info info{atomic.load()}; info.ref != 0 && info == loc)
-         {
-            location_lock l{_location_mutexes(h, &atomic), id};
-            if (object_info info{atomic.load()}; info.ref != 0 && info == loc)
-            {
-               return l;
-            }
-         }
-         return location_lock{};
-      }
-      location_lock lock(object_id id)
-      {
-         auto* h      = header();
-         auto& atomic = h->objects[id.id];
-         return location_lock{_location_mutexes(h, &atomic), id};
-      }
 
-      void move(const location_lock& lock, object_location loc)
+      void move( object_location loc)
       {
          auto& atomic = header()->objects[lock.get_id().id];
          auto  obj    = atomic.load();
@@ -130,6 +90,7 @@ namespace triedent
          debug(lock.get_id().id, "move");
       }
 
+      /*
       bool compare_and_move(const location_lock& lock,
                             object_location      expected,
                             object_location      loc)
@@ -149,6 +110,7 @@ namespace triedent
             }
          }
       }
+      */
 
       // The id must not be accessible to any thread
       // besides the creator.
@@ -161,7 +123,7 @@ namespace triedent
          atomic.store(info.to_int());
       }
 
-      object_id alloc(std::unique_lock<gc_session>&, node_type type);
+      object_id alloc(node_type type);
 
       object_info release(object_id id);
 
@@ -183,11 +145,14 @@ namespace triedent
       void gc_finish();
 
       bool                  pinned() const { return _region.pinned(); }
+
+      /*
       std::span<const char> span() const
       {
          std::lock_guard l{_region_mutex};
          return {reinterpret_cast<const char*>(_region.data()), _region.size()};
       }
+      */
 
      private:
       static constexpr uint64_t ref_count_mask = (1ull << 15) - 1;
@@ -198,8 +163,8 @@ namespace triedent
       // 19-63    offset         or next_ptr
 
       // clang-format off
-      static uint64_t    extract_next_ptr(uint64_t x)   { return x >> 15; }
-      static uint64_t    create_next_ptr(uint64_t x)    { return x << 15; }
+      static inline uint64_t    extract_next_ptr(uint64_t x)   { return x >> 15; }
+      static inline uint64_t    create_next_ptr(uint64_t x)    { return x << 15; }
       // clang-format on
 
       static uint64_t obj_val(node_type type, uint16_t ref)
@@ -228,7 +193,7 @@ namespace triedent
       gc_queue&          _gc;
       mapping            _region;
       mutable std::mutex _region_mutex;
-      mutex_group        _location_mutexes;
+      //mutex_group        _location_mutexes;
 
       object_db_header* header() { return reinterpret_cast<object_db_header*>(_region.data()); }
 
@@ -282,7 +247,7 @@ namespace triedent
                                   idfile.native());
    }
 
-   inline object_id object_db::alloc(std::unique_lock<gc_session>& session, node_type type)
+   inline object_id object_db::alloc(node_type type)
    {
       std::lock_guard l{_region_mutex};
       auto            _header = header();
diff --git a/libraries/triedent/include/triedent/object_fwd.hpp b/libraries/triedent/include/triedent/object_fwd.hpp
index aa62b5920..268a0c2f6 100644
--- a/libraries/triedent/include/triedent/object_fwd.hpp
+++ b/libraries/triedent/include/triedent/object_fwd.hpp
@@ -1,29 +1,183 @@
 #pragma once
+#include <triedent/debug.hpp>
 
 #include <cstdint>
+#include <cstring>
+
+#define XXH_INLINE_ALL
+#include <triedent/xxhash.h>
 
 namespace triedent
 {
+   using segment_offset = uint32_t;  /// offset pointer from base of segment
+   using segment_number = uint64_t;  /// segment_offset / segment_size
+
+   class node;
+   class value_node;
+   class inner_node;
+
+   // Size in bytes of the segments that object data is allocated in.
+   // Must be a power of 2.
+   // The smaller this value, the more overhead there is in
+   // searching for segments to manage and the free list;
+   // each thread will have a segment this size, so larger values
+   // may use more memory than necessary for idle threads.
+   // Max value: 4 GB due to type of segment_offset.
+   static const uint64_t segment_size = 1024 * 1024 * 128;  // 128 MB
+
+   /// object pointers can only address 48 bits
+   /// 128 TB limit on database size with 47 bits, this saves us
+   /// 8MB of memory relative to 48 bits in cases with less than 128 TB
+   static const uint64_t max_segment_count = (1ull << 47) / segment_size;
+
+   /**
+    *  An offset/8 from object_db_header::alloc_segments encoded
+    *  as 5 bytes. This allows addressing of 8TB worth of object IDs which
+    *  is way beyond what will fit in RAM of most computers, 32 bits would
+    *  have only supported 32GB of object IDs which clearly fits within the
+    *  RAM of many laptops.
+    */
    struct object_id
    {
-      std::uint64_t id : 40 = 0;  // obj id
-      explicit      operator bool() const { return id != 0; }
-      friend bool   operator==(object_id a, object_id b) = default;
+      uint64_t    id : 40 = 0;  // obj id
+      explicit    operator bool() const { return id != 0; }
+      friend bool operator==(object_id a, object_id b) = default;
    } __attribute__((packed)) __attribute__((aligned(1)));
    static_assert(sizeof(object_id) == 5, "unexpected padding");
    static_assert(alignof(object_id) == 1, "unexpected alignment");
 
    enum class node_type : std::uint8_t
    {
-      inner,
-      bytes,
-      roots,
+      inner     = 0,
+      bytes     = 1,
+      roots     = 2,
+      undefined = 3
    };
 
+   class object_info;
    struct object_location
    {
-      std::uint64_t offset : 48;
-      std::uint64_t cache : 2;
-      friend bool   operator==(const object_location&, const object_location&) = default;
+      uint32_t segment() const { return _offset / segment_size; }
+      uint32_t index() const { return _offset & (segment_size - 1); }
+
+      friend bool operator==(const object_location&, const object_location&) = default;
+
+      friend class object_info;
+      uint64_t _offset : 48;
+   };
+
+   /** future replacement for object info, designed to 
+    * get rid of the bit fields and unnecessary shifting/setting on construction
+    * so that this type can be used everywhere rather than manually twiddling bits
+    * all over the code that could get out of sync with the header
+    *
+   struct object_meta {
+      public:
+         explicit object_meta( uint64_t v = 0 ):_value(v){};
+         object_meta& set_location( uint64_t loc ) {
+            assert( not loc & 0x7 );
+            assert( (loc >> 3) == (loc / 8) );
+            loc << (location_rshift-3);
+            value = (value & ~location_mask) | loc;
+            return *this;
+         }
+         object_meta& set_type( node_type type ) {
+            value = (value & ~type_mask ) | (uint64_t(type) << type_lshift);
+         }
+         uint32_t  ref() { return _value & ref_mask; }
+         node_type type(){ return node_type( (_value & type_mask) >> type_lshift); }
+         uint64_t& data(){ return _value; }
+      private:
+         uint64_t _value;
+   };
+   */
+
+   class object_info
+   {
+     public:
+      static const uint64_t ref_mask        = 0x7fff;
+      static const uint64_t max_ref_count   = ref_mask - 64;  // allow some overflow bits for retain
+      static const uint64_t read_mask       = 3 << 17;
+      static const uint64_t type_mask       = 3 << 15;
+      static const uint64_t location_mask   = ~(type_mask | read_mask | ref_mask);
+      static const uint32_t location_lshift = 45;
+      static const uint32_t location_rshift = 64 - location_lshift;
+
+      explicit constexpr object_info(uint64_t x)
+          : _location(x >> location_rshift),
+            _read((x >> 17) & 3),
+            _type((x >> 15) & 3),
+            _ref(x & ref_mask)
+      {
+      }
+      object_info(node_type t, uint64_t loc = -1) : _type((int)t)
+      {
+         _ref      = 0;
+         _read     = 0;
+         _location = loc;
+      };
+
+      uint8_t   read() const { return _read; }
+      uint32_t  ref() const { return _ref; }
+      node_type type() const { return static_cast<node_type>(_type); }
+      auto      location() const { return object_location{_location * 8}; }
+
+      void set_type(node_type t) { _type = (int)t; }
+
+      // pre set location
+      constexpr object_info& set_location(const object_location& loc)
+      {
+         _location = loc._offset / 8;
+         return *this;
+      }
+
+      constexpr uint64_t to_int() const
+      {
+         return _ref | (_type << 15) | (_read << 17) | (_location << 19);
+      }
+      constexpr operator object_location() const
+      {
+         return object_location{._offset = _location * 8};
+      }
+
+      //private:
+      friend class object_location;
+      uint64_t _ref : 15;
+      uint64_t _type : 2;
+      uint64_t _read : 2;
+      uint64_t _location : 45;
    };
+   static_assert(sizeof(object_info) == sizeof(uint64_t), "unexpected padding");
+
+   struct object_header  // the object's data starts right after this header, see data()
+   {
+      uint32_t check = 0; // xxhash checksum of the header bytes after `check` plus the `size` data bytes
+      uint32_t type: 4;
+      uint32_t size: 28;  // bytes of data, not including header
+      // size might not be a multiple of 8, next object is at data() + (size+7)&-8
+      uint64_t unused: 24;  // no accessor in this struct reads or writes these bits
+      uint64_t id : 40;
+
+      node_type       get_type()const { return (node_type)type; }
+      void            set_type( node_type t ) { type = (uint8_t) t; }
+      void            set_id( object_id d )   { id = d.id; }
+      object_id       get_id()const { return {id}; }
+      inline uint64_t data_size() const { return size; }
+      inline uint32_t data_capacity() const { return (size + 7) & -8; }
+      inline char*    data() const { return (char*)(this + 1); }
+
+      uint32_t calculate_checksum() {
+        return XXH3_64bits( &check+1, size + sizeof(object_header) - sizeof(check) );
+      }
+      void update_checksum()   { check = calculate_checksum();         }
+      bool validate_checksum() { return check == calculate_checksum(); }
+
+
+      // returns the end of data_capacity() cast as another object_header
+      inline object_header* next() const { return (object_header*)(((char*)this) + object_size()); }
+
+      // capacity + sizeof(object_header)
+      inline uint32_t object_size() const { return data_capacity() + sizeof(object_header); }
+   }__attribute__((packed)) __attribute__((aligned(8)));
+
 }  // namespace triedent
diff --git a/libraries/triedent/include/triedent/region_allocator.hpp b/libraries/triedent/include/triedent/region_allocator.hpp
index f17cc4d26..f3ce153f8 100644
--- a/libraries/triedent/include/triedent/region_allocator.hpp
+++ b/libraries/triedent/include/triedent/region_allocator.hpp
@@ -28,6 +28,9 @@ namespace triedent
                        access_mode                  mode,
                        std::uint64_t                initial_size = 64 * 1024 * 1024);
       ~region_allocator();
+
+
+      // TODO: how is this a try?  This is a do or hang/die trying?
       void* try_allocate(std::unique_lock<gc_session>& session,
                          object_id                     id,
                          std::uint32_t                 size,
diff --git a/libraries/triedent/include/triedent/ring_allocator.hpp b/libraries/triedent/include/triedent/ring_allocator.hpp
index 619da8dc9..4e04c3acd 100644
--- a/libraries/triedent/include/triedent/ring_allocator.hpp
+++ b/libraries/triedent/include/triedent/ring_allocator.hpp
@@ -16,16 +16,6 @@
 
 namespace triedent
 {
-   struct object_header
-   {
-      // size might not be a multiple of 8, next object is at data() + (size+7)&-8
-      uint64_t size : 24;  // bytes of data, not including header
-      uint64_t id : 40;
-
-      inline uint64_t data_size() const { return size; }
-      inline uint32_t data_capacity() const { return (size + 7) & -8; }
-      inline void*    data() const { return (char*)(this + 1); }
-   };
 
    // ring_allocator allocates memory from a single circular buffer.
    // The buffer is divided into three regions
@@ -297,15 +287,12 @@ namespace triedent
    {
       uint64_t used_size = alloc_size(size);
 
-      std::unique_lock l{_free_mutex};
-      if (check_contiguous_free_space(used_size))
+      std::unique_lock l(_free_mutex,std::defer_lock);
+      if( l.try_lock() and check_contiguous_free_space(used_size) )
       {
          return allocate_impl(size, used_size, id, init);
       }
-      else
-      {
-         return nullptr;
-      }
+      return nullptr;
    }
 
    template <typename F>
diff --git a/libraries/triedent/include/triedent/seg_allocator.hpp b/libraries/triedent/include/triedent/seg_allocator.hpp
new file mode 100644
index 000000000..27bdcd0a5
--- /dev/null
+++ b/libraries/triedent/include/triedent/seg_allocator.hpp
@@ -0,0 +1,781 @@
+#pragma once
+#include <thread>
+#include <triedent/debug.hpp>
+#include <triedent/id_allocator.hpp>
+#include <triedent/mapping.hpp>
+
+/**
+ *  @file seg_allocator.hpp
+ *
+ *  Responsible for allocating large segments of memory (segment_size, 128MB), each
+ *  segment in turn stores objects pointed to from the id_allocator.
+ *
+ *  1. Each thread has its own session and allocates to its own
+ *     segment in an append-only manner.
+ *  2. Once an object has been written to a segment and its location
+ *     exposed to the id_allocator it is considered immutable by
+ *     the segment allocator; apps may still mutate it if they independently
+ *     verify that only one thread is reading it and they lock the id while
+ *     modifying it so that the gc thread doesn't try to compact it.
+ *          - this should be unlikely because all modify in place operations
+ *          occur with temporary, uncommitted data which will likely be in
+ *          the active allocation segment where it won't be moved anyway
+ *  3. Once a segment is full, it is marked as read-only to the seg_allocator until
+ *     its data is no longer referenced and the segment can be 
+ *     recycled. E.g. no new allocation will write over it.
+ *  4. When data is read, it is copied to the current alloc segment unless
+ *     another thread locked it first. Once copied the
+ *     item's location in the object db is updated and it is unlocked.
+ *
+ *     No threads need to wait for the copy because the data in the old location and new location
+ *     are identical and the reader already has a "lock" on the old location
+ *
+ *  5. A garbage-collector (GC) thread finds the most empty segment and moves
+ *     all of the objects that remain to its own segment, then makes the
+ *     segment available for reuse by other threads (once all threads have
+ *     released the implied write lock)
+ *  6. the Object ID allocation system can be made thread safe by giving
+ *     each "writing session" a "segment" of the object id space. Writers would
+ *     only have to synchronize id allocation requests when their segments are full.
+ *
+ *  Theory: 
+ *      a. data will be organized in the order it tends to be accessed in
+ *      b. infrequently accessed data will be grouped together by GC
+ *      c. the most-recent N segments can have their memory pinned
+ *      d. madvise can be efficiently used to mark alloc segments for
+ *           SEQ and/or pin them to memory. It can also mark segments as
+ *           RANDOM or UNNEEDED to improve OS cache management.
+ *
+ *   
+ */
+
+namespace triedent
+{
+   /// index into meta[free_segment_index]._free_segment_number
+   using free_segment_index = uint64_t;
+
+   // types that are memory mapped
+   namespace mapped_memory
+   {
+
+      // meta data about each segment,
+      // stored in an array in allocator_header indexed by segment number
+      struct segment_meta
+      {
+         // returns the free space in bytes, and number of objects freed
+         std::pair<uint32_t, uint32_t> get_free_space_and_objs() const
+         {
+            uint64_t v = _free_space_and_obj.load(std::memory_order_relaxed);
+            return std::make_pair(v >> 32, v & 0xffffffff);
+         }
+
+         // notes that an object of size was freed
+         void free_object(uint32_t size)
+         {
+            uint64_t so = size;
+            so <<= 32;
+            so += 1;
+            _free_space_and_obj.fetch_add(so, std::memory_order_relaxed);
+         }
+
+         // doesn't increment object count
+         void free(uint32_t size)
+         {
+            uint64_t so = size;
+            so <<= 32;
+            _free_space_and_obj.fetch_add(so, std::memory_order_relaxed);
+         }
+
+         void clear()
+         {
+            _free_space_and_obj.store(0, std::memory_order_relaxed);
+            _last_sync_pos.store(segment_size, std::memory_order_relaxed);
+         }
+
+         /// the total number of bytes freed by swap
+         /// or by being moved to other segments.
+         std::atomic<uint64_t> _free_space_and_obj;
+         std::atomic<uint64_t> _last_sync_pos;  // position of alloc pointer when last synced
+      };
+
+      /// should align on a page boundary
+      struct segment_header
+      {
+         // the next position to allocate data, only
+         // used by the thread that owns this segment and
+         // set to uint32_t max (uint32_t(-1)) when this segment is ready
+         // to be marked read only to the seg_allocator
+         std::atomic<uint32_t> _alloc_pos = 16;  // sizeof(segment_header)
+         uint32_t
+             _age;  // every time a segment is allocated it is assigned an age which aids in reconstruction
+         // used to calculate object density of segment header,
+         // to establish madvise
+         uint32_t _num_objects = 0;  // inc on alloc
+         uint32_t _checksum    = 0;  // TODO
+      };
+      static_assert(sizeof(segment_header) == 16);
+
+      struct allocator_header
+      {
+         // when no segments are available for reuse, advance by segment_size
+         alignas(64) std::atomic<segment_offset> alloc_ptr;    // A below
+         alignas(64) std::atomic<free_segment_index> end_ptr;  // E below
+
+         // set to 0 just before exit, set to 1 when opening database
+         std::atomic<bool>     clean_exit_flag;
+         std::atomic<uint32_t> next_alloc_age = 0;
+
+         // meta data associated with each segment, indexed by segment number
+         segment_meta seg_meta[max_segment_count];
+
+         // circular buffer described below, big enough to hold every
+         // potentially allocated segment which is subsequently freed.
+         //
+         // |-------A----R1--R2---E-------------| max_segment_count
+         //
+         // A = alloc_ptr where recycled segments are used
+         // R* = session_ptrs last known recycled segment by each session
+         // E = end_ptr where the next freed segment is posted to be recycled
+         // Initial condition A = R* = E = 0
+         // Invariant A <= R* <= E unless R* == -1
+         //
+         // If A == min(R*) then we must ask block_alloc to create a new segment
+         //
+         // A, R*, and E are 64 bit numbers that count to infinity, the
+         // index in the buffer is A % max_segment_count which should be
+         // a simple bitwise & operation if max_segment_count is a power of 2.
+         // The values in [A, E) point to recyclable segments assuming no R*
+         // is present. Values before A, or at E and after, point to no valid segments
+         segment_number free_seg_buffer[max_segment_count];
+      };
+
+      /// crash recovery:
+      /// 1. scan all segments to find those that were mid-allocation:
+      ///    if a lot of free space, then swap them and push to free seg buffer
+      /// 2. Update reference counts on all objects in database
+      /// 3. ? pray ?
+
+   }  // namespace mapped_memory
+
+   class seg_allocator
+   {
+     public:
+      // only 64 bits in bitfield used to allocate sessions
+      // only really require 1 per thread
+      static const uint32_t max_session_count = 64;
+
+      seg_allocator(std::filesystem::path dir);
+      ~seg_allocator();
+
+      void dump();
+      void sync(sync_type st = sync_type::sync);
+      void start_compact_thread();
+      bool compact_next_segment();
+
+      class session
+      {
+        public:
+         /**
+          * Ensures the read-lock is released so segments can be recycled
+          * and ensures that all data access flows through a read_lock.
+          *
+          * note: no mutexes are involved with this lock
+          */
+         class read_lock
+         {
+           public:
+            template <typename T = char>
+            class object_ref
+            {
+              public:
+               template <typename Other>
+               friend class object_ref;
+
+               template <typename Other>
+               object_ref(object_ref<Other> p)
+                   : _rlock(p._rlock), _id(p._id), _atom_loc(p._atom_loc), _cached(p._cached)
+               //    _ptr(p._ptr)
+               {
+                  //      assert(_ptr == nullptr or (_ptr and (_ptr->id == _id.id)));
+               }
+
+               object_id       id() const { return _id; }
+               uint32_t        ref_count() const { return _cached.ref(); }
+               node_type       type() const { return _cached.type(); }
+               auto            read() const { return _cached.read(); }
+               object_location location() const { return _cached.location(); }
+
+               // return false if ref count overflow
+               bool retain();
+               // return true if object is deleted
+               bool                 release();
+               const object_header* obj() const;  // TODO: rename header()
+               object_header*       obj();        // TODO: rename header()
+
+               char* data()
+               {
+                  assert(obj());
+                  return obj()->data();
+               }
+
+               template <typename Type>
+               Type* as()
+               {
+                  return reinterpret_cast<Type*>(obj()->data());
+               };
+               template <typename Type>
+               const Type* as() const
+               {
+                  return reinterpret_cast<const Type*>(obj()->data());
+               };
+
+               explicit inline operator bool() const { return bool(id()); }
+               bool            is_leaf_node() const { return type() != node_type::inner; }
+               inline auto& as_value_node() const { return *this->template as<const value_node>(); }
+               inline auto& as_inner_node() const { return *this->template as<const inner_node>(); }
+
+               inline const T* operator->() const { return this->template as<const T>(); }
+               inline T*       operator->() { return this->template as<T>(); }
+               inline const T& operator*() const { return *(this->template as<const T>()); }
+               inline T& operator*() { return *(this->template as<T>()); }
+
+               int64_t as_id() const { return _id.id; }
+
+               auto loc() const { return _cached.location(); }
+
+               auto& get_mutex() const { return _rlock._session._sega._id_alloc.get_mutex(_id); }
+
+               // return false if object is released while attempting to move
+               bool move(object_location expect, object_location loc);
+
+               bool cache_object();
+
+               void refresh() { _cached = object_info(_atom_loc.load(std::memory_order_acquire)); }
+
+              protected:
+               friend class seg_allocator;
+               friend class seg_allocator::session;
+
+               object_ref(seg_allocator::session::read_lock& rlock,
+                          object_id                          id,
+                          std::atomic<uint64_t>&             atom_loc)
+                   : _rlock(rlock),
+                     _atom_loc(atom_loc),
+                     _cached(atom_loc.load(std::memory_order_acquire)),
+                     _id(id)
+               {
+                  //    assert(_ptr == nullptr or (_ptr and (_ptr->id == _id.id)));
+               }
+
+               seg_allocator::session::read_lock& _rlock;
+               std::atomic<uint64_t>&             _atom_loc;
+               object_info                        _cached;  // cached read of atomic _atom_loc
+               object_id                          _id;
+            };
+
+            object_ref<char> alloc(uint32_t size, node_type type);
+
+            template <typename T = char>
+            object_ref<T> get(object_id id)
+            {
+               return object_ref<T>(*this, id, _session._sega._id_alloc.get(id));
+            }
+
+            object_ref<char> get(object_header*);
+
+            // checks known invariants:
+            //   id < max_id of id_allocator
+            //   id points to obj that points back to it
+            //   ref_count > 0
+            //   node_type is known and defined
+            //   ptr is in a valid range
+            //   others?
+            object_ref<char> validate(object_id id) const
+            {
+               throw std::runtime_error("read_lock::validate not impl");
+            }
+
+            ~read_lock() { _session.release_read_lock(); }
+
+           private:
+            friend class session;
+            template <typename T>
+            friend class object_ref;
+
+            object_header* get_object_pointer(object_location);
+
+            read_lock(session& s) : _session(s) { _session.retain_read_lock(); }
+            session& _session;
+         };
+
+         // before any objects can be read, the session must note the
+         // current state of the free segment queue so that no segments that
+         // could be read while the return value of this method is in scope can
+         // be reused.
+         read_lock lock() { return read_lock(*this); }
+
+         ~session()
+         {
+            if (_session_num == -1)
+               return;
+            if (_alloc_seg_ptr)  // not moved
+            {
+               if (segment_size - _alloc_seg_ptr->_alloc_pos >= sizeof(object_header))
+               {
+                  memset(((char*)_alloc_seg_ptr) + _alloc_seg_ptr->_alloc_pos, 0,
+                         sizeof(object_header));  // mark last object
+               }
+               _sega._header->seg_meta[_alloc_seg_num].free(segment_size -
+                                                            _alloc_seg_ptr->_alloc_pos);
+               _alloc_seg_ptr->_alloc_pos = uint32_t(-1);
+               _alloc_seg_num             = -1ull;
+            }
+            _sega.release_session_num(_session_num);
+         }
+
+         session(session&& mv)
+             : _sega(mv._sega),
+               _session_num(mv._session_num),
+               _alloc_seg_num(mv._alloc_seg_num),
+               _alloc_seg_ptr(mv._alloc_seg_ptr)
+         {
+            mv._session_num = -1;
+         }
+
+        private:
+         friend class lock;
+         friend class seg_allocator;
+
+         // copy E to R*
+         void retain_read_lock()
+         {
+            auto pt = _sega._session_ptrs[_session_num].load(std::memory_order_acquire);
+            if (pt == -1ull)
+               _sega._session_ptrs[_session_num].store(
+                   _sega._header->end_ptr.load(std::memory_order_acquire),
+                   std::memory_order_relaxed);
+            else  // TODO: this may be ok, but if so then
+               throw std::runtime_error("attempt to double-lock");
+         }
+
+         // R* goes to infinity and beyond
+         void release_read_lock()
+         {
+            assert(_sega._session_ptrs[_session_num] != -1ull);
+            _sega._session_ptrs[_session_num] = -1ull;
+         }
+
+         session(seg_allocator& a, uint32_t ses_num)
+             : _session_num(ses_num), _alloc_seg_num(-1ull), _alloc_seg_ptr(nullptr), _sega(a)
+         {
+         }
+
+         session()               = delete;
+         session(const session&) = delete;
+
+         /**
+          *  Reserves `size` bytes (object_header included, rounded up to a multiple of 8) at the
+          *  tail of this session's segment, fetching a new segment first when there is none or
+          *  it is full. Fills in the header's size/id/type and returns {location, pointer}.
+          */
+         std::pair<object_location, char*> alloc_data(uint32_t size, object_id id, node_type t)
+         {
+            assert(size < segment_size - 16);
+            // A - if no segment get a new segment
+            if (not _alloc_seg_ptr or
+                _alloc_seg_ptr->_alloc_pos.load(std::memory_order_relaxed) > segment_size)
+            {
+               auto [num, ptr] = _sega.get_new_segment();
+               _alloc_seg_num  = num;
+               _alloc_seg_ptr  = ptr;
+               _sega._header->seg_meta[_alloc_seg_num]._last_sync_pos.store(
+                   0, std::memory_order_relaxed);
+            }
+
+            auto* sh           = _alloc_seg_ptr;
+            auto  rounded_size = (size + 7) & -8;
+
+            auto cur_apos   = sh->_alloc_pos.load(std::memory_order_relaxed);
+            auto spec_pos   = uint64_t(cur_apos) + rounded_size;
+            auto free_space = segment_size - cur_apos;
+
+            // B - not enough space: zero a terminating header if it fits, count the tail as
+            if (spec_pos > (segment_size - sizeof(object_header)))  // free, retire segment, go to A
+            {
+               if (free_space >= sizeof(object_header))
+               {
+                  assert(cur_apos + sizeof(uint64_t) <= segment_size);
+                  memset(((char*)sh) + cur_apos, 0, sizeof(object_header));
+               }
+               _sega._header->seg_meta[_alloc_seg_num].free(segment_size - sh->_alloc_pos);
+               sh->_alloc_pos.store(uint32_t(-1), std::memory_order_release);
+               _alloc_seg_ptr = nullptr;
+               _alloc_seg_num = -1ull;
+
+               return alloc_data(size, id, t);  // recurse
+            }
+
+            auto obj   = ((char*)sh) + sh->_alloc_pos.load(std::memory_order_relaxed);
+            auto head  = (object_header*)obj;
+            head->size = size - sizeof(object_header);  // `size` passed in includes the header
+            head->id   = id.id;
+            head->set_type(t);
+
+            auto new_alloc_pos =  // NOTE(review): never read below; only the fetch_add matters
+                rounded_size + sh->_alloc_pos.fetch_add(rounded_size, std::memory_order_relaxed);
+            sh->_num_objects++;
+
+            auto loc = _alloc_seg_num * segment_size + cur_apos;
+
+            return {object_location{loc}, obj};
+         }
+
+         uint32_t _session_num;  // index into _sega's active sessions list
+
+         segment_number                 _alloc_seg_num = -1ull;
+         mapped_memory::segment_header* _alloc_seg_ptr = nullptr;
+
+         seg_allocator& _sega;
+      };
+
+      session start_session() { return session(*this, alloc_session_num()); }
+
+     private:
+      friend class session;
+      std::optional<session> cses;
+
+      mapped_memory::segment_header* get_segment(segment_number seg)
+      {
+         return static_cast<mapped_memory::segment_header*>(_block_alloc.get(seg));
+      }
+
+      // Claims the lowest free session slot and returns its index (bit position).
+      // _free_sessions has one bit per slot, 1 == free; a slot is claimed by clearing
+      // its bit via compare-exchange. Throws std::runtime_error when no bit is set.
+      uint32_t alloc_session_num()
+      {
+         auto fs_bits = _free_sessions.load(std::memory_order_relaxed);
+         while (true)
+         {
+            if (fs_bits == 0)
+            {
+               throw std::runtime_error("max of 64 sessions can be in use");
+            }
+            const auto fs = std::countr_zero(fs_bits);
+            // Build the mask in 64 bits: the former int `1 << fs` wiped bits 31..63 at
+            // fs == 31 and is undefined for fs >= 32, so slots 32..63 were unusable.
+            const auto new_fs_bits = fs_bits & ~(uint64_t(1) << fs);
+
+            // on failure compare_exchange_weak reloads fs_bits, so just retry
+            if (_free_sessions.compare_exchange_weak(fs_bits, new_fs_bits))
+            {
+               return fs;
+            }
+         }
+      }
+      void release_session_num(uint32_t sn) { _free_sessions.fetch_or(uint64_t(1) << sn); }
+
+      std::pair<segment_number, mapped_memory::segment_header*> get_new_segment();
+
+      void compact_loop();
+      void compact_segment(session& ses, uint64_t seg_num);
+
+      /**
+       * This must be called via a session because the session is responsible
+       * for documenting what regions could be read
+       *
+       * All objects are const because they cannot be modified after being
+       * written.
+       */
+      const object_header* get_object(object_location loc) const;
+      const object_header* get_object(object_id oid) const;
+
+      /**
+       *  After all writes are complete, and there is not enough space
+       *  to allocate the next object the alloc_ptr gets set to MAX and
+       *  the page gets 
+       */
+      void finalize_segment(segment_number);
+
+      /**
+       *  After all data has been removed from a segment
+       * - madvise free/don't need 
+       * - add the segment number to the free segments at allocator_header::end_ptr
+       * - increment allocator_header::end_ptr
+       */
+      void release_segment(segment_number);
+
+      /**
+       * finds the most empty segment that is at least 25% empty
+       * - marks it for sequential access
+       * - scans it for remaining objects, moving them to a new region
+       * - releases segment
+       * - marks it as unneeded 
+       *
+       * and moves its contents to
+       * a new segment owned by the gc thread th
+       */
+      std::thread _compact_thread;
+
+      // maps ids to locations
+      id_allocator _id_alloc;
+
+      // allocates new segments
+      block_allocator _block_alloc;
+
+      /**
+       *  This is the highest the alloc_ptr is allowed to
+       *  advance and equal to min value of thread_ptrs.
+       *
+       *  Do not read directly, read via get_min_read_ptr()
+       */
+      std::atomic<uint64_t> _min_read_ptr = -1ull;  // min(R*)
+      uint64_t              get_min_read_ptr();
+
+      /**
+      * At the start of each access to the DB, 
+      * a read thread must copy the end_ptr and store
+      * it in this array indexed by the thread number. When
+      * the thread is done accessing the data it will reset
+      * the pointer to uint64 max (-1ull).  Each read pos is an index
+      * into allocator_header::free_seg_buffer
+      *
+      * TODO: perhaps these need to be on their own cache line
+      * since different threads are writing to them, if so then
+      * we can store other session-local data on that cache line
+      * for free.
+      */
+      std::atomic<uint64_t> _session_ptrs[64];  // R* above
+
+      // to allocate a new session in thread-safe way you
+      // load, find first non-zero bit, and attempt to set it via C&S,
+      // the index of the bit is the session id.
+      // Reverse the process to free a session
+      std::atomic<uint64_t> _free_sessions = -1ull;
+
+      std::atomic<bool> _done;
+
+      mapping                          _header_file;
+      mapped_memory::allocator_header* _header;
+   };
+
+   // Current address of the referenced object, or nullptr when the loaded info word
+   // carries a zero reference count.
+   template <typename T>
+   inline object_header* seg_allocator::session::read_lock::object_ref<T>::obj()
+   {
+      const auto info_bits = _atom_loc.load(std::memory_order_acquire);
+      if (not(info_bits & object_info::ref_mask))
+         return nullptr;
+
+      // the word keeps (byte offset / 8) in the bits above location_rshift
+      const object_location where{._offset = 8 * (info_bits >> object_info::location_rshift)};
+      return _rlock.get_object_pointer(where);
+   }
+
+   // Read-only counterpart of obj(): nullptr for a zero reference count, otherwise
+   // the object's address derived from the location bits of the info word.
+   template <typename T>
+   inline const object_header* seg_allocator::session::read_lock::object_ref<T>::obj() const
+   {
+      const auto info_bits  = _atom_loc.load(std::memory_order_acquire);
+      const bool referenced = (info_bits & object_info::ref_mask) != 0;
+      if (not referenced)
+         return nullptr;
+
+      const object_location where{._offset = 8 * (info_bits >> object_info::location_rshift)};
+      return _rlock.get_object_pointer(where);
+   }
+
+   template <typename T>
+   using deref = seg_allocator::session::read_lock::object_ref<T>;
+
+   /**
+    * A deref<T> that keeps src.get_mutex() locked (via std::unique_lock) for its
+    * whole lifetime and hands out non-const access to the object.  The checksum
+    * is refreshed in the destructor, while the mutex is still held.
+    * cache_object() try-locks get_mutex() before relocating an object.
+    */
+   template <typename T>
+   struct mutable_deref : public deref<T>
+   {
+      // Blocks until the mutex returned by src.get_mutex() has been acquired.
+      mutable_deref(const deref<T>& src) : deref<T>(src), lock(src.get_mutex()) {}
+
+      // Convenience wrappers around as<value_node>() / as<inner_node>().
+      inline auto& as_value_node() const { return *this->template as<value_node>(); }
+      inline auto& as_inner_node() const { return *this->template as<inner_node>(); }
+
+      // const is cast away on purpose: this wrapper exists to grant write access.
+      inline T* operator->() const { return const_cast<T*>(this->template as<T>()); }
+      inline T& operator*() const { return const_cast<T&>(*this->template as<T>()); }
+
+      ~mutable_deref()
+      {
+         // obj() yields nullptr once the ref count is zero; never dereference that
+         if (auto* header = this->obj())
+            header->update_checksum();
+      }
+
+     private:
+      std::unique_lock<std::mutex> lock;
+   };  // mutable_deref
+
+
+   /**
+    * Atomically repoints this object at loc using a CAS loop on the info word.
+    * @param expect_loc - the location the caller thinks the object is currently at
+    * @param loc        - the new location, installed only if expect_loc still holds
+    * @return true if the swap was made; false if the location differed or ref() == 0
+    */
+   template <typename T>
+   bool seg_allocator::session::read_lock::object_ref<T>::move(object_location expect_loc, 
+                                                               object_location loc)
+   {
+      uint64_t expected  = _atom_loc.load(std::memory_order_acquire);
+      do
+      {
+         object_info ex(expected);
+         if( ex.location() != expect_loc or ex.ref() == 0 )
+            return false;
+         _cached = ex.set_location(loc);
+      } while (not _atom_loc.compare_exchange_weak(expected, _cached.to_int(),
+                                                   std::memory_order_release));
+      return true;
+   }
+
+   // Adds one reference; on saturation the bump is undone and false is returned.
+   template <typename T>
+   bool seg_allocator::session::read_lock::object_ref<T>::retain()
+   {
+      const auto refs = _atom_loc.fetch_add(1, std::memory_order_relaxed) & object_info::ref_mask;
+      const bool saturated = refs >= object_info::max_ref_count;
+      if (saturated) [[unlikely]]
+         _atom_loc.fetch_sub(1, std::memory_order_relaxed);  // undo our increment
+      else
+         assert(refs);  // retaining an unreferenced object is a caller bug
+      return not saturated;
+   }
+
+   // Drops one reference; returns false while others remain.  On the last one it marks the
+   // object header invalid, frees the id, reports the bytes to seg_meta and returns true.
+   template <typename T>
+   bool seg_allocator::session::read_lock::object_ref<T>::release()
+   {
+      assert(ref_count() != 0);
+      assert(type() != node_type::undefined);
+      // NOTE(review): relaxed decrement - confirm the teardown below needs no acquire/release
+      auto prior = _atom_loc.fetch_sub(1, std::memory_order_relaxed);
+      if ((prior & object_info::ref_mask) > 1)
+         return false;
+
+      _cached  = object_info(prior - 1);
+      auto loc = _cached.location();
+      auto seg = loc.segment();
+
+      auto obj_ptr =
+          (object_header*)((char*)_rlock._session._sega._block_alloc.get(seg) + loc.index());
+      obj_ptr->set_type( node_type::undefined );
+
+      // signal to compactor that this data is no longer valid before
+      // we allow the ID to be reused.
+      // by touching this we are forcing pages to be written that were previously constant,
+      // but with recent changes to move() this check is almost redundant
+       obj_ptr->check = -1; //TODO: does this prevent false invalid checksum in validate
+
+      // This ID can be reused almost immediately after calling this method
+      // which means this objref object is worthless to the caller
+      _rlock._session._sega._id_alloc.free_id(_id);
+      // NOTE(review): cache_object() passes object_size() to free_object() - confirm which is right
+      _rlock._session._sega._header->seg_meta[seg].free_object(obj_ptr->data_capacity());
+      return true;
+   }
+
+   template <typename T>
+   using object_ref = seg_allocator::session::read_lock::object_ref<T>;
+   // Reserves a new id and size + sizeof(object_header) bytes in this session, then
+   // initialises the id's info word: ref count 1, the node type, and the location.
+   inline object_ref<char> seg_allocator::session::read_lock::alloc(uint32_t size, node_type type)
+   {
+      assert(type != node_type::undefined);
+      auto [atom, id] = _session._sega._id_alloc.get_new_id();
+      auto [loc, ptr] = _session.alloc_data(size + sizeof(object_header), id, type);
+      // TODO: the type shift (15) is still hard-coded and could break if object_info changes
+      const auto where = (loc._offset / 8) << object_info::location_rshift;  // as decoded by obj()
+      atom.store(1 | (uint64_t(type) << 15) | where, std::memory_order_relaxed);
+      assert(object_ref(*this, id, atom).type() != node_type::undefined);
+      return object_ref(*this, id, atom);
+   }
+
+   /*
+   inline object_ref<char> seg_allocator::session::read_lock::get(object_header* oh)
+   {
+      object_id oid(oh->id);
+      return object_ref(*this, oid, _session._sega._id_alloc.get(oid));
+   }
+   */
+
+   // Turns loc into a raw pointer: base of segment loc.segment() plus loc.index() bytes.
+   inline object_header* seg_allocator::session::read_lock::get_object_pointer(object_location loc)
+   {
+      auto segment = (mapped_memory::segment_header*)_session._sega._block_alloc.get(loc.segment());
+      // _alloc_pos == 0: a swapped object on a segment that hasn't started new allocs.
+      // _alloc_pos > loc.index(): the object lies behind the alloc pointer and has not been
+      // overwritten, so it should be safe to read.  The location itself must come from an
+      // acquire load of the object_info, and writers must publish object_info updates with
+      // release, otherwise the object's bytes may not be visible to the reader yet.
+      assert(segment->_alloc_pos == 0 or segment->_alloc_pos > loc.index());
+      // reuse the base fetched above instead of asking the block allocator a second time
+      return (object_header*)((char*)segment + loc.index());
+   }
+
+   /**
+    *  Copies the object into this session's allocation segment when it currently
+    *  sits in a segment that is not being allocated from (_alloc_pos == uint32_t(-1)).
+    *
+    *  - never waits for the object's mutex: if try_lock fails some other thread is
+    *    working on the object and nothing is done
+    *  - the location observed before the copy is the one handed to move(), so the
+    *    swap only succeeds if the object is still where the bytes were read from
+    *
+    *  @return true if the object was moved
+    */
+   template <typename T>
+   bool seg_allocator::session::read_lock::object_ref<T>::cache_object()
+   {
+      std::unique_lock ul(get_mutex(), std::try_to_lock);
+      if (not ul.owns_lock())
+         return false;
+
+      const auto src_loc = location();  // snapshot: source of the copy AND move()'s expectation
+      const auto cur_loc = src_loc._offset;
+
+      assert(ref_count());
+      assert(cur_loc);
+      assert(cur_loc & (segment_size - 1));
+
+      auto           cur_seg     = cur_loc / segment_size;
+      auto           cur_seg_ptr = _rlock._session._sega.get_segment(cur_seg);
+      object_header* cur_obj_ptr =
+          (object_header*)(((char*)cur_seg_ptr) + (cur_loc & (segment_size - 1)));
+
+      assert(0 != cur_seg_ptr->_alloc_pos);  // this would be on a freed segment
+
+      // any other value means the segment is still being allocated into; the object
+      // is probably already hot because a writer, reader, or compactor has just
+      // recently copied it, so re-allocating it would be wasted work
+      if (cur_seg_ptr->_alloc_pos.load(std::memory_order_relaxed) != uint32_t(-1))
+         return false;
+
+      auto obj_size   = cur_obj_ptr->object_size();
+      auto [loc, ptr] = _rlock._session.alloc_data(obj_size, _id, cur_obj_ptr->get_type());
+      memcpy(ptr, cur_obj_ptr, obj_size);
+      if (not move(src_loc, loc))
+         return false;  // NOTE(review): the unused copy at loc is never reported as free - confirm
+
+      // note that this item has been freed from the segment so the space
+      // can be recovered by the compactor
+      _rlock._session._sega._header->seg_meta[cur_seg].free_object(obj_size);
+      return true;
+   }
+
+}  // namespace triedent
diff --git a/libraries/triedent/include/triedent/xxhash.h b/libraries/triedent/include/triedent/xxhash.h
new file mode 100644
index 000000000..d11f0f633
--- /dev/null
+++ b/libraries/triedent/include/triedent/xxhash.h
@@ -0,0 +1,7048 @@
+/*
+ * xxHash - Extremely Fast Hash algorithm
+ * Header File
+ * Copyright (C) 2012-2023 Yann Collet
+ *
+ * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php)
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ *    * Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *    * Redistributions in binary form must reproduce the above
+ *      copyright notice, this list of conditions and the following disclaimer
+ *      in the documentation and/or other materials provided with the
+ *      distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * You can contact the author at:
+ *   - xxHash homepage: https://www.xxhash.com
+ *   - xxHash source repository: https://github.com/Cyan4973/xxHash
+ */
+
+/*!
+ * @mainpage xxHash
+ *
+ * xxHash is an extremely fast non-cryptographic hash algorithm, working at RAM speed
+ * limits.
+ *
+ * It is proposed in four flavors, in three families:
+ * 1. @ref XXH32_family
+ *   - Classic 32-bit hash function. Simple, compact, and runs on almost all
+ *     32-bit and 64-bit systems.
+ * 2. @ref XXH64_family
+ *   - Classic 64-bit adaptation of XXH32. Just as simple, and runs well on most
+ *     64-bit systems (but _not_ 32-bit systems).
+ * 3. @ref XXH3_family
+ *   - Modern 64-bit and 128-bit hash function family which features improved
+ *     strength and performance across the board, especially on smaller data.
+ *     It benefits greatly from SIMD and 64-bit without requiring it.
+ *
+ * Benchmarks
+ * ---
+ * The reference system uses an Intel i7-9700K CPU, and runs Ubuntu x64 20.04.
+ * The open source benchmark program is compiled with clang v10.0 using -O3 flag.
+ *
+ * | Hash Name            | ISA ext | Width | Large Data Speed | Small Data Velocity |
+ * | -------------------- | ------- | ----: | ---------------: | ------------------: |
+ * | XXH3_64bits()        | @b AVX2 |    64 |        59.4 GB/s |               133.1 |
+ * | MeowHash             | AES-NI  |   128 |        58.2 GB/s |                52.5 |
+ * | XXH3_128bits()       | @b AVX2 |   128 |        57.9 GB/s |               118.1 |
+ * | CLHash               | PCLMUL  |    64 |        37.1 GB/s |                58.1 |
+ * | XXH3_64bits()        | @b SSE2 |    64 |        31.5 GB/s |               133.1 |
+ * | XXH3_128bits()       | @b SSE2 |   128 |        29.6 GB/s |               118.1 |
+ * | RAM sequential read  |         |   N/A |        28.0 GB/s |                 N/A |
+ * | ahash                | AES-NI  |    64 |        22.5 GB/s |               107.2 |
+ * | City64               |         |    64 |        22.0 GB/s |                76.6 |
+ * | T1ha2                |         |    64 |        22.0 GB/s |                99.0 |
+ * | City128              |         |   128 |        21.7 GB/s |                57.7 |
+ * | FarmHash             | AES-NI  |    64 |        21.3 GB/s |                71.9 |
+ * | XXH64()              |         |    64 |        19.4 GB/s |                71.0 |
+ * | SpookyHash           |         |    64 |        19.3 GB/s |                53.2 |
+ * | Mum                  |         |    64 |        18.0 GB/s |                67.0 |
+ * | CRC32C               | SSE4.2  |    32 |        13.0 GB/s |                57.9 |
+ * | XXH32()              |         |    32 |         9.7 GB/s |                71.9 |
+ * | City32               |         |    32 |         9.1 GB/s |                66.0 |
+ * | Blake3*              | @b AVX2 |   256 |         4.4 GB/s |                 8.1 |
+ * | Murmur3              |         |    32 |         3.9 GB/s |                56.1 |
+ * | SipHash*             |         |    64 |         3.0 GB/s |                43.2 |
+ * | Blake3*              | @b SSE2 |   256 |         2.4 GB/s |                 8.1 |
+ * | HighwayHash          |         |    64 |         1.4 GB/s |                 6.0 |
+ * | FNV64                |         |    64 |         1.2 GB/s |                62.7 |
+ * | Blake2*              |         |   256 |         1.1 GB/s |                 5.1 |
+ * | SHA1*                |         |   160 |         0.8 GB/s |                 5.6 |
+ * | MD5*                 |         |   128 |         0.6 GB/s |                 7.8 |
+ * @note
+ *   - Hashes which require a specific ISA extension are noted. SSE2 is also noted,
+ *     even though it is mandatory on x64.
+ *   - Hashes with an asterisk are cryptographic. Note that MD5 is non-cryptographic
+ *     by modern standards.
+ *   - Small data velocity is a rough average of algorithm's efficiency for small
+ *     data. For more accurate information, see the wiki.
+ *   - More benchmarks and strength tests are found on the wiki:
+ *         https://github.com/Cyan4973/xxHash/wiki
+ *
+ * Usage
+ * ------
+ * All xxHash variants use a similar API. Changing the algorithm is a trivial
+ * substitution.
+ *
+ * @pre
+ *    For functions which take an input and length parameter, the following
+ *    requirements are assumed:
+ *    - The range from [`input`, `input + length`) is valid, readable memory.
+ *      - The only exception is if the `length` is `0`, `input` may be `NULL`.
+ *    - For C++, the objects must have the *TriviallyCopyable* property, as the
+ *      functions access bytes directly as if it was an array of `unsigned char`.
+ *
+ * @anchor single_shot_example
+ * **Single Shot**
+ *
+ * These functions are stateless functions which hash a contiguous block of memory,
+ * immediately returning the result. They are the easiest and usually the fastest
+ * option.
+ *
+ * XXH32(), XXH64(), XXH3_64bits(), XXH3_128bits()
+ *
+ * @code{.c}
+ *   #include <string.h>
+ *   #include "xxhash.h"
+ *
+ *   // Example for a function which hashes a null terminated string with XXH32().
+ *   XXH32_hash_t hash_string(const char* string, XXH32_hash_t seed)
+ *   {
+ *       // NULL pointers are only valid if the length is zero
+ *       size_t length = (string == NULL) ? 0 : strlen(string);
+ *       return XXH32(string, length, seed);
+ *   }
+ * @endcode
+ *
+ *
+ * @anchor streaming_example
+ * **Streaming**
+ *
+ * These groups of functions allow incremental hashing of unknown size, even
+ * more than what would fit in a size_t.
+ *
+ * XXH32_reset(), XXH64_reset(), XXH3_64bits_reset(), XXH3_128bits_reset()
+ *
+ * @code{.c}
+ *   #include <stdio.h>
+ *   #include <assert.h>
+ *   #include "xxhash.h"
+ *   // Example for a function which hashes a FILE incrementally with XXH3_64bits().
+ *   XXH64_hash_t hashFile(FILE* f)
+ *   {
+ *       // Allocate a state struct. Do not just use malloc() or new.
+ *       XXH3_state_t* state = XXH3_createState();
+ *       assert(state != NULL && "Out of memory!");
+ *       // Reset the state to start a new hashing session.
+ *       XXH3_64bits_reset(state);
+ *       char buffer[4096];
+ *       size_t count;
+ *       // Read the file in chunks
+ *       while ((count = fread(buffer, 1, sizeof(buffer), f)) != 0) {
+ *           // Run update() as many times as necessary to process the data
+ *           XXH3_64bits_update(state, buffer, count);
+ *       }
+ *       // Retrieve the finalized hash. This will not change the state.
+ *       XXH64_hash_t result = XXH3_64bits_digest(state);
+ *       // Free the state. Do not use free().
+ *       XXH3_freeState(state);
+ *       return result;
+ *   }
+ * @endcode
+ *
+ * Streaming functions generate the xxHash value from an incremental input.
+ * This method is slower than single-call functions, due to state management.
+ * For small inputs, prefer `XXH32()` and `XXH64()`, which are better optimized.
+ *
+ * An XXH state must first be allocated using `XXH*_createState()`.
+ *
+ * Start a new hash by initializing the state with a seed using `XXH*_reset()`.
+ *
+ * Then, feed the hash state by calling `XXH*_update()` as many times as necessary.
+ *
+ * The function returns an error code, with 0 meaning OK, and any other value
+ * meaning there is an error.
+ *
+ * Finally, a hash value can be produced anytime, by using `XXH*_digest()`.
+ * This function returns the nn-bits hash as an int or long long.
+ *
+ * It's still possible to continue inserting input into the hash state after a
+ * digest, and generate new hash values later on by invoking `XXH*_digest()`.
+ *
+ * When done, release the state using `XXH*_freeState()`.
+ *
+ *
+ * @anchor canonical_representation_example
+ * **Canonical Representation**
+ *
+ * The default return values from XXH functions are unsigned 32, 64 and 128 bit
+ * integers.
+ * This the simplest and fastest format for further post-processing.
+ *
+ * However, this leaves open the question of what is the order on the byte level,
+ * since little and big endian conventions will store the same number differently.
+ *
+ * The canonical representation settles this issue by mandating big-endian
+ * convention, the same convention as human-readable numbers (large digits first).
+ *
+ * When writing hash values to storage, sending them over a network, or printing
+ * them, it's highly recommended to use the canonical representation to ensure
+ * portability across a wider range of systems, present and future.
+ *
+ * The following functions allow transformation of hash values to and from
+ * canonical format.
+ *
+ * XXH32_canonicalFromHash(), XXH32_hashFromCanonical(),
+ * XXH64_canonicalFromHash(), XXH64_hashFromCanonical(),
+ * XXH128_canonicalFromHash(), XXH128_hashFromCanonical(),
+ *
+ * @code{.c}
+ *   #include <stdio.h>
+ *   #include "xxhash.h"
+ *
+ *   // Example for a function which prints XXH32_hash_t in human readable format
+ *   void printXxh32(XXH32_hash_t hash)
+ *   {
+ *       XXH32_canonical_t cano;
+ *       XXH32_canonicalFromHash(&cano, hash);
+ *       size_t i;
+ *       for(i = 0; i < sizeof(cano.digest); ++i) {
+ *           printf("%02x", cano.digest[i]);
+ *       }
+ *       printf("\n");
+ *   }
+ *
+ *   // Example for a function which converts XXH32_canonical_t to XXH32_hash_t
+ *   XXH32_hash_t convertCanonicalToXxh32(XXH32_canonical_t cano)
+ *   {
+ *       XXH32_hash_t hash = XXH32_hashFromCanonical(&cano);
+ *       return hash;
+ *   }
+ * @endcode
+ *
+ *
+ * @file xxhash.h
+ * xxHash prototypes and implementation
+ */
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/* ****************************
+ *  INLINE mode
+ ******************************/
+/*!
+ * @defgroup public Public API
+ * Contains details on the public xxHash functions.
+ * @{
+ */
+#ifdef XXH_DOXYGEN
+/*!
+ * @brief Gives access to internal state declaration, required for static allocation.
+ *
+ * Incompatible with dynamic linking, due to risks of ABI changes.
+ *
+ * Usage:
+ * @code{.c}
+ *     #define XXH_STATIC_LINKING_ONLY
+ *     #include "xxhash.h"
+ * @endcode
+ */
+#  define XXH_STATIC_LINKING_ONLY
+/* Do not undef XXH_STATIC_LINKING_ONLY for Doxygen */
+
+/*!
+ * @brief Gives access to internal definitions.
+ *
+ * Usage:
+ * @code{.c}
+ *     #define XXH_STATIC_LINKING_ONLY
+ *     #define XXH_IMPLEMENTATION
+ *     #include "xxhash.h"
+ * @endcode
+ */
+#  define XXH_IMPLEMENTATION
+/* Do not undef XXH_IMPLEMENTATION for Doxygen */
+
+/*!
+ * @brief Exposes the implementation and marks all functions as `inline`.
+ *
+ * Use these build macros to inline xxhash into the target unit.
+ * Inlining improves performance on small inputs, especially when the length is
+ * expressed as a compile-time constant:
+ *
+ *  https://fastcompression.blogspot.com/2018/03/xxhash-for-small-keys-impressive-power.html
+ *
+ * It also keeps xxHash symbols private to the unit, so they are not exported.
+ *
+ * Usage:
+ * @code{.c}
+ *     #define XXH_INLINE_ALL
+ *     #include "xxhash.h"
+ * @endcode
+ * Do not compile and link xxhash.o as a separate object, as it is not useful.
+ */
+#  define XXH_INLINE_ALL
+#  undef XXH_INLINE_ALL
+/*!
+ * @brief Exposes the implementation without marking functions as inline.
+ */
+#  define XXH_PRIVATE_API
+#  undef XXH_PRIVATE_API
+/*!
+ * @brief Emulate a namespace by transparently prefixing all symbols.
+ *
+ * If you want to include _and expose_ xxHash functions from within your own
+ * library, but also want to avoid symbol collisions with other libraries which
+ * may also include xxHash, you can use @ref XXH_NAMESPACE to automatically prefix
+ * any public symbol from xxhash library with the value of @ref XXH_NAMESPACE
+ * (therefore, avoid empty or numeric values).
+ *
+ * Note that no change is required within the calling program as long as it
+ * includes `xxhash.h`: Regular symbol names will be automatically translated
+ * by this header.
+ */
+#  define XXH_NAMESPACE /* YOUR NAME HERE */
+#  undef XXH_NAMESPACE
+#endif
+
+#if (defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)) \
+    && !defined(XXH_INLINE_ALL_31684351384)
+   /* this section should be traversed only once */
+#  define XXH_INLINE_ALL_31684351384
+   /* give access to the advanced API, required to compile implementations */
+#  undef XXH_STATIC_LINKING_ONLY   /* avoid macro redef */
+#  define XXH_STATIC_LINKING_ONLY
+   /* make all functions private */
+#  undef XXH_PUBLIC_API
+#  if defined(__GNUC__)
+#    define XXH_PUBLIC_API static __inline __attribute__((unused))
+#  elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
+#    define XXH_PUBLIC_API static inline
+#  elif defined(_MSC_VER)
+#    define XXH_PUBLIC_API static __inline
+#  else
+     /* note: this version may generate warnings for unused static functions */
+#    define XXH_PUBLIC_API static
+#  endif
+
+   /*
+    * This part deals with the special case where a unit wants to inline xxHash,
+    * but "xxhash.h" has previously been included without XXH_INLINE_ALL,
+    * such as part of some previously included *.h header file.
+    * Without further action, the new include would just be ignored,
+    * and functions would effectively _not_ be inlined (silent failure).
+    * The following macros solve this situation by prefixing all inlined names,
+    * avoiding naming collision with previous inclusions.
+    */
+   /* Before that, we unconditionally #undef all symbols,
+    * in case they were already defined with XXH_NAMESPACE.
+    * They will then be redefined for XXH_INLINE_ALL
+    */
+#  undef XXH_versionNumber
+    /* XXH32 */
+#  undef XXH32
+#  undef XXH32_createState
+#  undef XXH32_freeState
+#  undef XXH32_reset
+#  undef XXH32_update
+#  undef XXH32_digest
+#  undef XXH32_copyState
+#  undef XXH32_canonicalFromHash
+#  undef XXH32_hashFromCanonical
+    /* XXH64 */
+#  undef XXH64
+#  undef XXH64_createState
+#  undef XXH64_freeState
+#  undef XXH64_reset
+#  undef XXH64_update
+#  undef XXH64_digest
+#  undef XXH64_copyState
+#  undef XXH64_canonicalFromHash
+#  undef XXH64_hashFromCanonical
+    /* XXH3_64bits */
+#  undef XXH3_64bits
+#  undef XXH3_64bits_withSecret
+#  undef XXH3_64bits_withSeed
+#  undef XXH3_64bits_withSecretandSeed
+#  undef XXH3_createState
+#  undef XXH3_freeState
+#  undef XXH3_copyState
+#  undef XXH3_64bits_reset
+#  undef XXH3_64bits_reset_withSeed
+#  undef XXH3_64bits_reset_withSecret
+#  undef XXH3_64bits_update
+#  undef XXH3_64bits_digest
+#  undef XXH3_generateSecret
+    /* XXH3_128bits */
+#  undef XXH128
+#  undef XXH3_128bits
+#  undef XXH3_128bits_withSeed
+#  undef XXH3_128bits_withSecret
+#  undef XXH3_128bits_reset
+#  undef XXH3_128bits_reset_withSeed
+#  undef XXH3_128bits_reset_withSecret
+#  undef XXH3_128bits_reset_withSecretandSeed
+#  undef XXH3_128bits_update
+#  undef XXH3_128bits_digest
+#  undef XXH128_isEqual
+#  undef XXH128_cmp
+#  undef XXH128_canonicalFromHash
+#  undef XXH128_hashFromCanonical
+    /* Finally, free the namespace itself */
+#  undef XXH_NAMESPACE
+
+    /* employ the namespace for XXH_INLINE_ALL */
+#  define XXH_NAMESPACE XXH_INLINE_
+   /*
+    * Some identifiers (enums, type names) are not symbols,
+    * but they must nonetheless be renamed to avoid redeclaration.
+    * Alternative solution: do not redeclare them.
+    * However, this requires some #ifdefs, and has a more dispersed impact.
+    * Meanwhile, renaming can be achieved in a single place.
+    */
+#  define XXH_IPREF(Id)   XXH_NAMESPACE ## Id
+#  define XXH_OK XXH_IPREF(XXH_OK)
+#  define XXH_ERROR XXH_IPREF(XXH_ERROR)
+#  define XXH_errorcode XXH_IPREF(XXH_errorcode)
+#  define XXH32_canonical_t  XXH_IPREF(XXH32_canonical_t)
+#  define XXH64_canonical_t  XXH_IPREF(XXH64_canonical_t)
+#  define XXH128_canonical_t XXH_IPREF(XXH128_canonical_t)
+#  define XXH32_state_s XXH_IPREF(XXH32_state_s)
+#  define XXH32_state_t XXH_IPREF(XXH32_state_t)
+#  define XXH64_state_s XXH_IPREF(XXH64_state_s)
+#  define XXH64_state_t XXH_IPREF(XXH64_state_t)
+#  define XXH3_state_s  XXH_IPREF(XXH3_state_s)
+#  define XXH3_state_t  XXH_IPREF(XXH3_state_t)
+#  define XXH128_hash_t XXH_IPREF(XXH128_hash_t)
+   /* Ensure the header is parsed again, even if it was previously included */
+#  undef XXHASH_H_5627135585666179
+#  undef XXHASH_H_STATIC_13879238742
+#endif /* XXH_INLINE_ALL || XXH_PRIVATE_API */
+
+/* ****************************************************************
+ *  Stable API
+ *****************************************************************/
+#ifndef XXHASH_H_5627135585666179
+#define XXHASH_H_5627135585666179 1
+
+/*! @brief Marks a global symbol. */
+#if !defined(XXH_INLINE_ALL) && !defined(XXH_PRIVATE_API)
+#  if defined(WIN32) && defined(_MSC_VER) && (defined(XXH_IMPORT) || defined(XXH_EXPORT))
+#    ifdef XXH_EXPORT
+#      define XXH_PUBLIC_API __declspec(dllexport)
+#    elif XXH_IMPORT
+#      define XXH_PUBLIC_API __declspec(dllimport)
+#    endif
+#  else
+#    define XXH_PUBLIC_API   /* do nothing */
+#  endif
+#endif
+
+#ifdef XXH_NAMESPACE
+#  define XXH_CAT(A,B) A##B
+#  define XXH_NAME2(A,B) XXH_CAT(A,B)
+#  define XXH_versionNumber XXH_NAME2(XXH_NAMESPACE, XXH_versionNumber)
+/* XXH32 */
+#  define XXH32 XXH_NAME2(XXH_NAMESPACE, XXH32)
+#  define XXH32_createState XXH_NAME2(XXH_NAMESPACE, XXH32_createState)
+#  define XXH32_freeState XXH_NAME2(XXH_NAMESPACE, XXH32_freeState)
+#  define XXH32_reset XXH_NAME2(XXH_NAMESPACE, XXH32_reset)
+#  define XXH32_update XXH_NAME2(XXH_NAMESPACE, XXH32_update)
+#  define XXH32_digest XXH_NAME2(XXH_NAMESPACE, XXH32_digest)
+#  define XXH32_copyState XXH_NAME2(XXH_NAMESPACE, XXH32_copyState)
+#  define XXH32_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH32_canonicalFromHash)
+#  define XXH32_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH32_hashFromCanonical)
+/* XXH64 */
+#  define XXH64 XXH_NAME2(XXH_NAMESPACE, XXH64)
+#  define XXH64_createState XXH_NAME2(XXH_NAMESPACE, XXH64_createState)
+#  define XXH64_freeState XXH_NAME2(XXH_NAMESPACE, XXH64_freeState)
+#  define XXH64_reset XXH_NAME2(XXH_NAMESPACE, XXH64_reset)
+#  define XXH64_update XXH_NAME2(XXH_NAMESPACE, XXH64_update)
+#  define XXH64_digest XXH_NAME2(XXH_NAMESPACE, XXH64_digest)
+#  define XXH64_copyState XXH_NAME2(XXH_NAMESPACE, XXH64_copyState)
+#  define XXH64_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH64_canonicalFromHash)
+#  define XXH64_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH64_hashFromCanonical)
+/* XXH3_64bits */
+#  define XXH3_64bits XXH_NAME2(XXH_NAMESPACE, XXH3_64bits)
+#  define XXH3_64bits_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSecret)
+#  define XXH3_64bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSeed)
+#  define XXH3_64bits_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSecretandSeed)
+#  define XXH3_createState XXH_NAME2(XXH_NAMESPACE, XXH3_createState)
+#  define XXH3_freeState XXH_NAME2(XXH_NAMESPACE, XXH3_freeState)
+#  define XXH3_copyState XXH_NAME2(XXH_NAMESPACE, XXH3_copyState)
+#  define XXH3_64bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset)
+#  define XXH3_64bits_reset_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSeed)
+#  define XXH3_64bits_reset_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSecret)
+#  define XXH3_64bits_reset_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSecretandSeed)
+#  define XXH3_64bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_update)
+#  define XXH3_64bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_digest)
+#  define XXH3_generateSecret XXH_NAME2(XXH_NAMESPACE, XXH3_generateSecret)
+#  define XXH3_generateSecret_fromSeed XXH_NAME2(XXH_NAMESPACE, XXH3_generateSecret_fromSeed)
+/* XXH3_128bits */
+#  define XXH128 XXH_NAME2(XXH_NAMESPACE, XXH128)
+#  define XXH3_128bits XXH_NAME2(XXH_NAMESPACE, XXH3_128bits)
+#  define XXH3_128bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSeed)
+#  define XXH3_128bits_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSecret)
+#  define XXH3_128bits_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSecretandSeed)
+#  define XXH3_128bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset)
+#  define XXH3_128bits_reset_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSeed)
+#  define XXH3_128bits_reset_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSecret)
+#  define XXH3_128bits_reset_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSecretandSeed)
+#  define XXH3_128bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_update)
+#  define XXH3_128bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_digest)
+#  define XXH128_isEqual XXH_NAME2(XXH_NAMESPACE, XXH128_isEqual)
+#  define XXH128_cmp     XXH_NAME2(XXH_NAMESPACE, XXH128_cmp)
+#  define XXH128_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH128_canonicalFromHash)
+#  define XXH128_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH128_hashFromCanonical)
+#endif
+
+
+/* *************************************
+*  Compiler specifics
+***************************************/
+
+/* specific declaration modes for Windows */
+#if !defined(XXH_INLINE_ALL) && !defined(XXH_PRIVATE_API)
+#  if defined(WIN32) && defined(_MSC_VER) && (defined(XXH_IMPORT) || defined(XXH_EXPORT))
+#    ifdef XXH_EXPORT
+#      define XXH_PUBLIC_API __declspec(dllexport)
+#    elif XXH_IMPORT
+#      define XXH_PUBLIC_API __declspec(dllimport)
+#    endif
+#  else
+#    define XXH_PUBLIC_API   /* do nothing */
+#  endif
+#endif
+
+#if defined (__GNUC__)
+# define XXH_CONSTF  __attribute__((const))
+# define XXH_PUREF   __attribute__((pure))
+# define XXH_MALLOCF __attribute__((malloc))
+#else
+# define XXH_CONSTF  /* disable */
+# define XXH_PUREF
+# define XXH_MALLOCF
+#endif
+
+/* *************************************
+*  Version
+***************************************/
+#define XXH_VERSION_MAJOR    0
+#define XXH_VERSION_MINOR    8
+#define XXH_VERSION_RELEASE  2
+/*! @brief Version number, encoded as two digits each */
+#define XXH_VERSION_NUMBER  (XXH_VERSION_MAJOR *100*100 + XXH_VERSION_MINOR *100 + XXH_VERSION_RELEASE)
+
+/*!
+ * @brief Obtains the xxHash version.
+ *
+ * This is mostly useful when xxHash is compiled as a shared library,
+ * since the returned value comes from the library, as opposed to the header file.
+ *
+ * @return @ref XXH_VERSION_NUMBER of the invoked library.
+ */
+XXH_PUBLIC_API XXH_CONSTF unsigned XXH_versionNumber (void);
+
+
+/* ****************************
+*  Common basic types
+******************************/
+#include <stddef.h>   /* size_t */
+/*!
+ * @brief Exit code for the streaming API.
+ */
+typedef enum {
+    XXH_OK = 0, /*!< OK */
+    XXH_ERROR   /*!< Error */
+} XXH_errorcode;
+
+
+/*-**********************************************************************
+*  32-bit hash
+************************************************************************/
+#if defined(XXH_DOXYGEN) /* Don't show <stdint.h> include */
+/*!
+ * @brief An unsigned 32-bit integer.
+ *
+ * Not necessarily defined to `uint32_t` but functionally equivalent.
+ */
+typedef uint32_t XXH32_hash_t;
+
+#elif !defined (__VMS) \
+  && (defined (__cplusplus) \
+  || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
+#   include <stdint.h>
+    typedef uint32_t XXH32_hash_t;
+
+#else
+#   include <limits.h>
+#   if UINT_MAX == 0xFFFFFFFFUL
+      typedef unsigned int XXH32_hash_t;
+#   elif ULONG_MAX == 0xFFFFFFFFUL
+      typedef unsigned long XXH32_hash_t;
+#   else
+#     error "unsupported platform: need a 32-bit type"
+#   endif
+#endif
+
+/*!
+ * @}
+ *
+ * @defgroup XXH32_family XXH32 family
+ * @ingroup public
+ * Contains functions used in the classic 32-bit xxHash algorithm.
+ *
+ * @note
+ *   XXH32 is useful for older platforms, with no or poor 64-bit performance.
+ *   Note that the @ref XXH3_family provides competitive speed for both 32-bit
+ *   and 64-bit systems, and offers true 64/128 bit hash results.
+ *
+ * @see @ref XXH64_family, @ref XXH3_family : Other xxHash families
+ * @see @ref XXH32_impl for implementation details
+ * @{
+ */
+
+/*!
+ * @brief Calculates the 32-bit hash of @p input using xxHash32.
+ *
+ * @param input The block of data to be hashed, at least @p length bytes in size.
+ * @param length The length of @p input, in bytes.
+ * @param seed The 32-bit seed to alter the hash's output predictably.
+ *
+ * @pre
+ *   The memory between @p input and @p input + @p length must be valid,
+ *   readable, contiguous memory. However, if @p length is `0`, @p input may be
+ *   `NULL`. In C++, this also must be *TriviallyCopyable*.
+ *
+ * @return The calculated 32-bit xxHash32 value.
+ *
+ * @see @ref single_shot_example "Single Shot Example" for an example.
+ */
+XXH_PUBLIC_API XXH_PUREF XXH32_hash_t XXH32 (const void* input, size_t length, XXH32_hash_t seed);
+
+#ifndef XXH_NO_STREAM
+/*!
+ * @typedef struct XXH32_state_s XXH32_state_t
+ * @brief The opaque state struct for the XXH32 streaming API.
+ *
+ * @see XXH32_state_s for details.
+ * @see @ref streaming_example "Streaming Example"
+ */
+typedef struct XXH32_state_s XXH32_state_t;
+
+/*!
+ * @brief Allocates an @ref XXH32_state_t.
+ *
+ * @return An allocated pointer of @ref XXH32_state_t on success.
+ * @return `NULL` on failure.
+ *
+ * @note Must be freed with XXH32_freeState().
+ *
+ * @see @ref streaming_example "Streaming Example"
+ */
+XXH_PUBLIC_API XXH_MALLOCF XXH32_state_t* XXH32_createState(void);
+/*!
+ * @brief Frees an @ref XXH32_state_t.
+ *
+ * @param statePtr A pointer to an @ref XXH32_state_t allocated with @ref XXH32_createState().
+ *
+ * @return @ref XXH_OK.
+ *
+ * @note @p statePtr must be allocated with XXH32_createState().
+ *
+ * @see @ref streaming_example "Streaming Example"
+ *
+ */
+XXH_PUBLIC_API XXH_errorcode  XXH32_freeState(XXH32_state_t* statePtr);
+/*!
+ * @brief Copies one @ref XXH32_state_t to another.
+ *
+ * @param dst_state The state to copy to.
+ * @param src_state The state to copy from.
+ * @pre
+ *   @p dst_state and @p src_state must not be `NULL` and must not overlap.
+ */
+XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dst_state, const XXH32_state_t* src_state);
+
+/*!
+ * @brief Resets an @ref XXH32_state_t to begin a new hash.
+ *
+ * @param statePtr The state struct to reset.
+ * @param seed The 32-bit seed to alter the hash result predictably.
+ *
+ * @pre
+ *   @p statePtr must not be `NULL`.
+ *
+ * @return @ref XXH_OK on success.
+ * @return @ref XXH_ERROR on failure.
+ *
+ * @note This function resets and seeds a state. Call it before @ref XXH32_update().
+ *
+ * @see @ref streaming_example "Streaming Example"
+ */
+XXH_PUBLIC_API XXH_errorcode XXH32_reset  (XXH32_state_t* statePtr, XXH32_hash_t seed);
+
+/*!
+ * @brief Consumes a block of @p input to an @ref XXH32_state_t.
+ *
+ * @param statePtr The state struct to update.
+ * @param input The block of data to be hashed, at least @p length bytes in size.
+ * @param length The length of @p input, in bytes.
+ *
+ * @pre
+ *   @p statePtr must not be `NULL`.
+ * @pre
+ *   The memory between @p input and @p input + @p length must be valid,
+ *   readable, contiguous memory. However, if @p length is `0`, @p input may be
+ *   `NULL`. In C++, this also must be *TriviallyCopyable*.
+ *
+ * @return @ref XXH_OK on success.
+ * @return @ref XXH_ERROR on failure.
+ *
+ * @note Call this to incrementally consume blocks of data.
+ *
+ * @see @ref streaming_example "Streaming Example"
+ */
+XXH_PUBLIC_API XXH_errorcode XXH32_update (XXH32_state_t* statePtr, const void* input, size_t length);
+
+/*!
+ * @brief Returns the calculated hash value from an @ref XXH32_state_t.
+ *
+ * @param statePtr The state struct to calculate the hash from.
+ *
+ * @pre
+ *  @p statePtr must not be `NULL`.
+ *
+ * @return The calculated 32-bit xxHash32 value from that state.
+ *
+ * @note
+ *   Calling XXH32_digest() will not affect @p statePtr, so you can update,
+ *   digest, and update again.
+ *
+ * @see @ref streaming_example "Streaming Example"
+ */
+XXH_PUBLIC_API XXH_PUREF XXH32_hash_t XXH32_digest (const XXH32_state_t* statePtr);
+#endif /* !XXH_NO_STREAM */
+
+/*******   Canonical representation   *******/
+
+/*!
+ * @brief Canonical (big endian) representation of @ref XXH32_hash_t.
+ */
+typedef struct {
+    unsigned char digest[4]; /*!< Hash bytes, big endian */
+} XXH32_canonical_t;
+
+/*!
+ * @brief Converts an @ref XXH32_hash_t to a big endian @ref XXH32_canonical_t.
+ *
+ * @param dst  The @ref XXH32_canonical_t pointer to be stored to.
+ * @param hash The @ref XXH32_hash_t to be converted.
+ *
+ * @pre
+ *   @p dst must not be `NULL`.
+ *
+ * @see @ref canonical_representation_example "Canonical Representation Example"
+ */
+XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash);
+
+/*!
+ * @brief Converts an @ref XXH32_canonical_t to a native @ref XXH32_hash_t.
+ *
+ * @param src The @ref XXH32_canonical_t to convert.
+ *
+ * @pre
+ *   @p src must not be `NULL`.
+ *
+ * @return The converted hash.
+ *
+ * @see @ref canonical_representation_example "Canonical Representation Example"
+ */
+XXH_PUBLIC_API XXH_PUREF XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src);
+
+
+/*! @cond Doxygen ignores this part */
+#ifdef __has_attribute
+# define XXH_HAS_ATTRIBUTE(x) __has_attribute(x)
+#else
+# define XXH_HAS_ATTRIBUTE(x) 0
+#endif
+/*! @endcond */
+
+/*! @cond Doxygen ignores this part */
+/*
+ * C23 __STDC_VERSION__ number hasn't been specified yet. For now
+ * leave as `201711L` (C17 + 1).
+ * TODO: Update to correct value when it's been specified.
+ */
+#define XXH_C23_VN 201711L
+/*! @endcond */
+
+/*! @cond Doxygen ignores this part */
+/* C-language Attributes are added in C23. */
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= XXH_C23_VN) && defined(__has_c_attribute)
+# define XXH_HAS_C_ATTRIBUTE(x) __has_c_attribute(x)
+#else
+# define XXH_HAS_C_ATTRIBUTE(x) 0
+#endif
+/*! @endcond */
+
+/*! @cond Doxygen ignores this part */
+#if defined(__cplusplus) && defined(__has_cpp_attribute)
+# define XXH_HAS_CPP_ATTRIBUTE(x) __has_cpp_attribute(x)
+#else
+# define XXH_HAS_CPP_ATTRIBUTE(x) 0
+#endif
+/*! @endcond */
+
+/*! @cond Doxygen ignores this part */
+/*
+ * Define XXH_FALLTHROUGH macro for annotating switch case with the 'fallthrough' attribute
+ * introduced in CPP17 and C23.
+ * CPP17 : https://en.cppreference.com/w/cpp/language/attributes/fallthrough
+ * C23   : https://en.cppreference.com/w/c/language/attributes/fallthrough
+ */
+#if XXH_HAS_C_ATTRIBUTE(fallthrough) || XXH_HAS_CPP_ATTRIBUTE(fallthrough)
+# define XXH_FALLTHROUGH [[fallthrough]]
+#elif XXH_HAS_ATTRIBUTE(__fallthrough__)
+# define XXH_FALLTHROUGH __attribute__ ((__fallthrough__))
+#else
+# define XXH_FALLTHROUGH /* fallthrough */
+#endif
+/*! @endcond */
+
+/*! @cond Doxygen ignores this part */
+/*
+ * Define XXH_NOESCAPE for annotated pointers in public API.
+ * https://clang.llvm.org/docs/AttributeReference.html#noescape
+ * As of writing this, only supported by clang.
+ */
+#if XXH_HAS_ATTRIBUTE(noescape)
+# define XXH_NOESCAPE __attribute__((noescape))
+#else
+# define XXH_NOESCAPE
+#endif
+/*! @endcond */
+
+
+/*!
+ * @}
+ * @ingroup public
+ * @{
+ */
+
+#ifndef XXH_NO_LONG_LONG
+/*-**********************************************************************
+*  64-bit hash
+************************************************************************/
+#if defined(XXH_DOXYGEN) /* don't include <stdint.h> */
+/*!
+ * @brief An unsigned 64-bit integer.
+ *
+ * Not necessarily defined to `uint64_t` but functionally equivalent.
+ */
+typedef uint64_t XXH64_hash_t;
+#elif !defined (__VMS) \
+  && (defined (__cplusplus) \
+  || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
+#  include <stdint.h>
+   typedef uint64_t XXH64_hash_t;
+#else
+#  include <limits.h>
+#  if defined(__LP64__) && ULONG_MAX == 0xFFFFFFFFFFFFFFFFULL
+     /* LP64 ABI says uint64_t is unsigned long */
+     typedef unsigned long XXH64_hash_t;
+#  else
+     /* the following type must have a width of 64-bit */
+     typedef unsigned long long XXH64_hash_t;
+#  endif
+#endif
+
+/*!
+ * @}
+ *
+ * @defgroup XXH64_family XXH64 family
+ * @ingroup public
+ * @{
+ * Contains functions used in the classic 64-bit xxHash algorithm.
+ *
+ * @note
+ *   XXH3 provides competitive speed for both 32-bit and 64-bit systems,
+ *   and offers true 64/128 bit hash results.
+ *   It provides better speed for systems with vector processing capabilities.
+ */
+
+/*!
+ * @brief Calculates the 64-bit hash of @p input using xxHash64.
+ *
+ * @param input The block of data to be hashed, at least @p length bytes in size.
+ * @param length The length of @p input, in bytes.
+ * @param seed The 64-bit seed to alter the hash's output predictably.
+ *
+ * @pre
+ *   The memory between @p input and @p input + @p length must be valid,
+ *   readable, contiguous memory. However, if @p length is `0`, @p input may be
+ *   `NULL`. In C++, this also must be *TriviallyCopyable*.
+ *
+ * @return The calculated 64-bit xxHash64 value.
+ *
+ * @see @ref single_shot_example "Single Shot Example" for an example.
+ */
+XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64(XXH_NOESCAPE const void* input, size_t length, XXH64_hash_t seed);
+
+/*******   Streaming   *******/
+#ifndef XXH_NO_STREAM
+/*!
+ * @brief The opaque state struct for the XXH64 streaming API.
+ *
+ * @see XXH64_state_s for details.
+ * @see @ref streaming_example "Streaming Example"
+ */
+typedef struct XXH64_state_s XXH64_state_t;   /* incomplete type */
+
+/*!
+ * @brief Allocates an @ref XXH64_state_t.
+ *
+ * @return An allocated pointer of @ref XXH64_state_t on success.
+ * @return `NULL` on failure.
+ *
+ * @note Must be freed with XXH64_freeState().
+ *
+ * @see @ref streaming_example "Streaming Example"
+ */
+XXH_PUBLIC_API XXH_MALLOCF XXH64_state_t* XXH64_createState(void);
+
+/*!
+ * @brief Frees an @ref XXH64_state_t.
+ *
+ * @param statePtr A pointer to an @ref XXH64_state_t allocated with @ref XXH64_createState().
+ *
+ * @return @ref XXH_OK.
+ *
+ * @note @p statePtr must be allocated with XXH64_createState().
+ *
+ * @see @ref streaming_example "Streaming Example"
+ */
+XXH_PUBLIC_API XXH_errorcode  XXH64_freeState(XXH64_state_t* statePtr);
+
+/*!
+ * @brief Copies one @ref XXH64_state_t to another.
+ *
+ * @param dst_state The state to copy to.
+ * @param src_state The state to copy from.
+ * @pre
+ *   @p dst_state and @p src_state must not be `NULL` and must not overlap.
+ */
+XXH_PUBLIC_API void XXH64_copyState(XXH_NOESCAPE XXH64_state_t* dst_state, const XXH64_state_t* src_state);
+
+/*!
+ * @brief Resets an @ref XXH64_state_t to begin a new hash.
+ *
+ * @param statePtr The state struct to reset.
+ * @param seed The 64-bit seed to alter the hash result predictably.
+ *
+ * @pre
+ *   @p statePtr must not be `NULL`.
+ *
+ * @return @ref XXH_OK on success.
+ * @return @ref XXH_ERROR on failure.
+ *
+ * @note This function resets and seeds a state. Call it before @ref XXH64_update().
+ *
+ * @see @ref streaming_example "Streaming Example"
+ */
+XXH_PUBLIC_API XXH_errorcode XXH64_reset  (XXH_NOESCAPE XXH64_state_t* statePtr, XXH64_hash_t seed);
+
+/*!
+ * @brief Consumes a block of @p input to an @ref XXH64_state_t.
+ *
+ * @param statePtr The state struct to update.
+ * @param input The block of data to be hashed, at least @p length bytes in size.
+ * @param length The length of @p input, in bytes.
+ *
+ * @pre
+ *   @p statePtr must not be `NULL`.
+ * @pre
+ *   The memory between @p input and @p input + @p length must be valid,
+ *   readable, contiguous memory. However, if @p length is `0`, @p input may be
+ *   `NULL`. In C++, this also must be *TriviallyCopyable*.
+ *
+ * @return @ref XXH_OK on success.
+ * @return @ref XXH_ERROR on failure.
+ *
+ * @note Call this to incrementally consume blocks of data.
+ *
+ * @see @ref streaming_example "Streaming Example"
+ */
+XXH_PUBLIC_API XXH_errorcode XXH64_update (XXH_NOESCAPE XXH64_state_t* statePtr, XXH_NOESCAPE const void* input, size_t length);
+
+/*!
+ * @brief Returns the calculated hash value from an @ref XXH64_state_t.
+ *
+ * @param statePtr The state struct to calculate the hash from.
+ *
+ * @pre
+ *  @p statePtr must not be `NULL`.
+ *
+ * @return The calculated 64-bit xxHash64 value from that state.
+ *
+ * @note
+ *   Calling XXH64_digest() will not affect @p statePtr, so you can update,
+ *   digest, and update again.
+ *
+ * @see @ref streaming_example "Streaming Example"
+ */
+XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64_digest (XXH_NOESCAPE const XXH64_state_t* statePtr);
+#endif /* !XXH_NO_STREAM */
+/*******   Canonical representation   *******/
+
+/*!
+ * @brief Canonical (big endian) representation of @ref XXH64_hash_t.
+ */
+typedef struct { unsigned char digest[sizeof(XXH64_hash_t)]; } XXH64_canonical_t;
+
+/*!
+ * @brief Converts an @ref XXH64_hash_t to a big endian @ref XXH64_canonical_t.
+ *
+ * @param dst The @ref XXH64_canonical_t pointer to be stored to.
+ * @param hash The @ref XXH64_hash_t to be converted.
+ *
+ * @pre
+ *   @p dst must not be `NULL`.
+ *
+ * @see @ref canonical_representation_example "Canonical Representation Example"
+ */
+XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH_NOESCAPE XXH64_canonical_t* dst, XXH64_hash_t hash);
+
+/*!
+ * @brief Converts an @ref XXH64_canonical_t to a native @ref XXH64_hash_t.
+ *
+ * @param src The @ref XXH64_canonical_t to convert.
+ *
+ * @pre
+ *   @p src must not be `NULL`.
+ *
+ * @return The converted hash.
+ *
+ * @see @ref canonical_representation_example "Canonical Representation Example"
+ */
+XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64_hashFromCanonical(XXH_NOESCAPE const XXH64_canonical_t* src);
+
+#ifndef XXH_NO_XXH3
+
+/*!
+ * @}
+ * ************************************************************************
+ * @defgroup XXH3_family XXH3 family
+ * @ingroup public
+ * @{
+ *
+ * XXH3 is a more recent hash algorithm featuring:
+ *  - Improved speed for both small and large inputs
+ *  - True 64-bit and 128-bit outputs
+ *  - SIMD acceleration
+ *  - Improved 32-bit viability
+ *
+ * Speed analysis methodology is explained here:
+ *
+ *    https://fastcompression.blogspot.com/2019/03/presenting-xxh3.html
+ *
+ * Compared to XXH64, expect XXH3 to run approximately
+ * ~2x faster on large inputs and >3x faster on small ones,
+ * exact differences vary depending on platform.
+ *
+ * XXH3's speed benefits greatly from SIMD and 64-bit arithmetic,
+ * but does not require it.
+ * Most 32-bit and 64-bit targets that can run XXH32 smoothly can run XXH3
+ * at competitive speeds, even without vector support. Further details are
+ * explained in the implementation.
+ *
+ * XXH3 has a fast scalar implementation, but it also includes accelerated SIMD
+ * implementations for many common platforms:
+ *   - AVX512
+ *   - AVX2
+ *   - SSE2
+ *   - ARM NEON
+ *   - WebAssembly SIMD128
+ *   - POWER8 VSX
+ *   - s390x ZVector
+ * This can be controlled via the @ref XXH_VECTOR macro, but it automatically
+ * selects the best version according to predefined macros. For the x86 family, an
+ * automatic runtime dispatcher is included separately in @ref xxh_x86dispatch.c.
+ *
+ * XXH3 implementation is portable:
+ * it has a generic C90 formulation that can be compiled on any platform,
+ * and all implementations generate exactly the same hash value on all platforms.
+ * Starting from v0.8.0, it's also labelled "stable", meaning that
+ * any future version will also generate the same hash value.
+ *
+ * XXH3 offers 2 variants, _64bits and _128bits.
+ *
+ * When only 64 bits are needed, prefer invoking the _64bits variant, as it
+ * reduces the amount of mixing, resulting in faster speed on small inputs.
+ * It's also generally simpler to manipulate a scalar return type than a struct.
+ *
+ * The API supports one-shot hashing, streaming mode, and custom secrets.
+ */
+/*-**********************************************************************
+*  XXH3 64-bit variant
+************************************************************************/
+
+/*!
+ * @brief Calculates 64-bit unseeded variant of XXH3 hash of @p input.
+ *
+ * @param input  The block of data to be hashed, at least @p length bytes in size.
+ * @param length The length of @p input, in bytes.
+ *
+ * @pre
+ *   The memory between @p input and @p input + @p length must be valid,
+ *   readable, contiguous memory. However, if @p length is `0`, @p input may be
+ *   `NULL`. In C++, this also must be *TriviallyCopyable*.
+ *
+ * @return The calculated 64-bit XXH3 hash value.
+ *
+ * @note
+ *   This is equivalent to @ref XXH3_64bits_withSeed() with a seed of `0`, however
+ *   it may have slightly better performance due to constant propagation of the
+ *   defaults.
+ *
+ * @see
+ *    XXH3_64bits_withSeed(), XXH3_64bits_withSecret(): other seeding variants
+ * @see @ref single_shot_example "Single Shot Example" for an example.
+ */
+XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits(XXH_NOESCAPE const void* input, size_t length);
+
+/*!
+ * @brief Calculates 64-bit seeded variant of XXH3 hash of @p input.
+ *
+ * @param input  The block of data to be hashed, at least @p length bytes in size.
+ * @param length The length of @p input, in bytes.
+ * @param seed   The 64-bit seed to alter the hash result predictably.
+ *
+ * @pre
+ *   The memory between @p input and @p input + @p length must be valid,
+ *   readable, contiguous memory. However, if @p length is `0`, @p input may be
+ *   `NULL`. In C++, this also must be *TriviallyCopyable*.
+ *
+ * @return The calculated 64-bit XXH3 hash value.
+ *
+ * @note
+ *    seed == 0 produces the same results as @ref XXH3_64bits().
+ *
+ * This variant generates a custom secret on the fly based on default secret
+ * altered using the @p seed value.
+ *
+ * While this operation is decently fast, note that it's not completely free.
+ *
+ * @see @ref single_shot_example "Single Shot Example" for an example.
+ */
+XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits_withSeed(XXH_NOESCAPE const void* input, size_t length, XXH64_hash_t seed);
+
+/*!
+ * The bare minimum size for a custom secret.
+ *
+ * @see
+ *  XXH3_64bits_withSecret(), XXH3_64bits_reset_withSecret(),
+ *  XXH3_128bits_withSecret(), XXH3_128bits_reset_withSecret().
+ */
+#define XXH3_SECRET_SIZE_MIN 136
+
+/*!
+ * @brief Calculates 64-bit variant of XXH3 with a custom "secret".
+ *
+ * @param data       The block of data to be hashed, at least @p len bytes in size.
+ * @param len        The length of @p data, in bytes.
+ * @param secret     The secret data.
+ * @param secretSize The length of @p secret, in bytes.
+ *
+ * @return The calculated 64-bit XXH3 hash value.
+ *
+ * @pre
+ *   The memory between @p data and @p data + @p len must be valid,
+ *   readable, contiguous memory. However, if @p len is `0`, @p data may be
+ *   `NULL`. In C++, this also must be *TriviallyCopyable*.
+ *
+ * It's possible to provide any blob of bytes as a "secret" to generate the hash.
+ * This makes it more difficult for an external actor to prepare an intentional collision.
+ * The main condition is that @p secretSize *must* be large enough (>= @ref XXH3_SECRET_SIZE_MIN).
+ * However, the quality of the secret impacts the dispersion of the hash algorithm.
+ * Therefore, the secret _must_ look like a bunch of random bytes.
+ * Avoid "trivial" or structured data such as repeated sequences or a text document.
+ * Whenever in doubt about the "randomness" of the blob of bytes,
+ * consider employing @ref XXH3_generateSecret() instead (see below).
+ * It will generate a proper high entropy secret derived from the blob of bytes.
+ * Another advantage of using XXH3_generateSecret() is that
+ * it guarantees that all bits within the initial blob of bytes
+ * will impact every bit of the output.
+ * This is not necessarily the case when using the blob of bytes directly
+ * because, when hashing _small_ inputs, only a portion of the secret is employed.
+ *
+ * @see @ref single_shot_example "Single Shot Example" for an example.
+ */
+XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits_withSecret(XXH_NOESCAPE const void* data, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize);
+
+
+/*******   Streaming   *******/
+#ifndef XXH_NO_STREAM
+/*
+ * Streaming requires state maintenance.
+ * This operation costs memory and CPU.
+ * As a consequence, streaming is slower than one-shot hashing.
+ * For better performance, prefer one-shot functions whenever applicable.
+ */
+
+/*!
+ * @brief The opaque state struct for the XXH3 streaming API.
+ *
+ * @see XXH3_state_s for details.
+ * @see @ref streaming_example "Streaming Example"
+ */
+typedef struct XXH3_state_s XXH3_state_t;
+XXH_PUBLIC_API XXH_MALLOCF XXH3_state_t* XXH3_createState(void);
+XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr);
+
+/*!
+ * @brief Copies one @ref XXH3_state_t to another.
+ *
+ * @param dst_state The state to copy to.
+ * @param src_state The state to copy from.
+ * @pre
+ *   @p dst_state and @p src_state must not be `NULL` and must not overlap.
+ */
+XXH_PUBLIC_API void XXH3_copyState(XXH_NOESCAPE XXH3_state_t* dst_state, XXH_NOESCAPE const XXH3_state_t* src_state);
+
+/*!
+ * @brief Resets an @ref XXH3_state_t to begin a new hash.
+ *
+ * @param statePtr The state struct to reset.
+ *
+ * @pre
+ *   @p statePtr must not be `NULL`.
+ *
+ * @return @ref XXH_OK on success.
+ * @return @ref XXH_ERROR on failure.
+ *
+ * @note
+ *   - This function resets `statePtr` and generates a secret with default parameters.
+ *   - Call this function before @ref XXH3_64bits_update().
+ *   - Digest will be equivalent to `XXH3_64bits()`.
+ *
+ * @see @ref streaming_example "Streaming Example"
+ *
+ */
+XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset(XXH_NOESCAPE XXH3_state_t* statePtr);
+
+/*!
+ * @brief Resets an @ref XXH3_state_t with 64-bit seed to begin a new hash.
+ *
+ * @param statePtr The state struct to reset.
+ * @param seed     The 64-bit seed to alter the hash result predictably.
+ *
+ * @pre
+ *   @p statePtr must not be `NULL`.
+ *
+ * @return @ref XXH_OK on success.
+ * @return @ref XXH_ERROR on failure.
+ *
+ * @note
+ *   - This function resets `statePtr` and generates a secret from `seed`.
+ *   - Call this function before @ref XXH3_64bits_update().
+ *   - Digest will be equivalent to `XXH3_64bits_withSeed()`.
+ *
+ * @see @ref streaming_example "Streaming Example"
+ *
+ */
+XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH64_hash_t seed);
+
+/*!
+ * @brief Resets an @ref XXH3_state_t with secret data to begin a new hash.
+ *
+ * @param statePtr The state struct to reset.
+ * @param secret     The secret data.
+ * @param secretSize The length of @p secret, in bytes.
+ *
+ * @pre
+ *   @p statePtr must not be `NULL`.
+ *
+ * @return @ref XXH_OK on success.
+ * @return @ref XXH_ERROR on failure.
+ *
+ * @note
+ *   `secret` is referenced, it _must outlive_ the hash streaming session.
+ *
+ * Similar to one-shot API, `secretSize` must be >= @ref XXH3_SECRET_SIZE_MIN,
+ * and the quality of produced hash values depends on secret's entropy
+ * (secret's content should look like a bunch of random bytes).
+ * When in doubt about the randomness of a candidate `secret`,
+ * consider employing `XXH3_generateSecret()` instead (see below).
+ *
+ * @see @ref streaming_example "Streaming Example"
+ */
+XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSecret(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize);
+
+/*!
+ * @brief Consumes a block of @p input to an @ref XXH3_state_t.
+ *
+ * @param statePtr The state struct to update.
+ * @param input The block of data to be hashed, at least @p length bytes in size.
+ * @param length The length of @p input, in bytes.
+ *
+ * @pre
+ *   @p statePtr must not be `NULL`.
+ * @pre
+ *   The memory between @p input and @p input + @p length must be valid,
+ *   readable, contiguous memory. However, if @p length is `0`, @p input may be
+ *   `NULL`. In C++, this also must be *TriviallyCopyable*.
+ *
+ * @return @ref XXH_OK on success.
+ * @return @ref XXH_ERROR on failure.
+ *
+ * @note Call this to incrementally consume blocks of data.
+ *
+ * @see @ref streaming_example "Streaming Example"
+ */
+XXH_PUBLIC_API XXH_errorcode XXH3_64bits_update (XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* input, size_t length);
+
+/*!
+ * @brief Returns the calculated XXH3 64-bit hash value from an @ref XXH3_state_t.
+ *
+ * @param statePtr The state struct to calculate the hash from.
+ *
+ * @pre
+ *  @p statePtr must not be `NULL`.
+ *
+ * @return The calculated XXH3 64-bit hash value from that state.
+ *
+ * @note
+ *   Calling XXH3_64bits_digest() will not affect @p statePtr, so you can update,
+ *   digest, and update again.
+ *
+ * @see @ref streaming_example "Streaming Example"
+ */
+XXH_PUBLIC_API XXH_PUREF XXH64_hash_t  XXH3_64bits_digest (XXH_NOESCAPE const XXH3_state_t* statePtr);
+#endif /* !XXH_NO_STREAM */
+
+/* note : canonical representation of XXH3 is the same as XXH64
+ * since they both produce XXH64_hash_t values */
+
+
+/*-**********************************************************************
+*  XXH3 128-bit variant
+************************************************************************/
+
+/*!
+ * @brief The return value from 128-bit hashes.
+ *
+ * Stored in little endian order, although the fields themselves are in native
+ * endianness.
+ */
+typedef struct {
+    XXH64_hash_t low64;   /*!< `value & 0xFFFFFFFFFFFFFFFF` */
+    XXH64_hash_t high64;  /*!< `value >> 64` */
+} XXH128_hash_t;
+
+/*!
+ * @brief Calculates 128-bit unseeded variant of XXH3 of @p data.
+ *
+ * @param data The block of data to be hashed, at least @p len bytes in size.
+ * @param len  The length of @p data, in bytes.
+ *
+ * @return The calculated 128-bit variant of XXH3 value.
+ *
+ * The 128-bit variant of XXH3 has more strength, but it has a bit of overhead
+ * for shorter inputs.
+ *
+ * This is equivalent to @ref XXH3_128bits_withSeed() with a seed of `0`, however
+ * it may have slightly better performance due to constant propagation of the
+ * defaults.
+ *
+ * @see XXH3_128bits_withSeed(), XXH3_128bits_withSecret(): other seeding variants
+ * @see @ref single_shot_example "Single Shot Example" for an example.
+ */
+XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits(XXH_NOESCAPE const void* data, size_t len);
+/*! @brief Calculates 128-bit seeded variant of XXH3 hash of @p data.
+ *
+ * @param data The block of data to be hashed, at least @p len bytes in size.
+ * @param len  The length of @p data, in bytes.
+ * @param seed The 64-bit seed to alter the hash result predictably.
+ *
+ * @return The calculated 128-bit variant of XXH3 value.
+ *
+ * @note
+ *    seed == 0 produces the same results as @ref XXH3_128bits().
+ *
+ * This variant generates a custom secret on the fly based on default secret
+ * altered using the @p seed value.
+ *
+ * While this operation is decently fast, note that it's not completely free.
+ *
+ * @see XXH3_128bits(), XXH3_128bits_withSecret(): other seeding variants
+ * @see @ref single_shot_example "Single Shot Example" for an example.
+ */
+XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_withSeed(XXH_NOESCAPE const void* data, size_t len, XXH64_hash_t seed);
+/*!
+ * @brief Calculates 128-bit variant of XXH3 with a custom "secret".
+ *
+ * @param data       The block of data to be hashed, at least @p len bytes in size.
+ * @param len        The length of @p data, in bytes.
+ * @param secret     The secret data.
+ * @param secretSize The length of @p secret, in bytes.
+ *
+ * @return The calculated 128-bit variant of XXH3 value.
+ *
+ * It's possible to provide any blob of bytes as a "secret" to generate the hash.
+ * This makes it more difficult for an external actor to prepare an intentional collision.
+ * The main condition is that @p secretSize *must* be large enough (>= @ref XXH3_SECRET_SIZE_MIN).
+ * However, the quality of the secret impacts the dispersion of the hash algorithm.
+ * Therefore, the secret _must_ look like a bunch of random bytes.
+ * Avoid "trivial" or structured data such as repeated sequences or a text document.
+ * Whenever in doubt about the "randomness" of the blob of bytes,
+ * consider employing @ref XXH3_generateSecret() instead (see below).
+ * It will generate a proper high entropy secret derived from the blob of bytes.
+ * Another advantage of using XXH3_generateSecret() is that
+ * it guarantees that all bits within the initial blob of bytes
+ * will impact every bit of the output.
+ * This is not necessarily the case when using the blob of bytes directly
+ * because, when hashing _small_ inputs, only a portion of the secret is employed.
+ *
+ * @see @ref single_shot_example "Single Shot Example" for an example.
+ */
+XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_withSecret(XXH_NOESCAPE const void* data, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize);
+
+/*******   Streaming   *******/
+#ifndef XXH_NO_STREAM
+/*
+ * Streaming requires state maintenance.
+ * This operation costs memory and CPU.
+ * As a consequence, streaming is slower than one-shot hashing.
+ * For better performance, prefer one-shot functions whenever applicable.
+ *
+ * XXH3_128bits uses the same XXH3_state_t as XXH3_64bits().
+ * Use already declared XXH3_createState() and XXH3_freeState().
+ *
+ * All reset and streaming functions have same meaning as their 64-bit counterpart.
+ */
+
+/*!
+ * @brief Resets an @ref XXH3_state_t to begin a new hash.
+ *
+ * @param statePtr The state struct to reset.
+ *
+ * @pre
+ *   @p statePtr must not be `NULL`.
+ *
+ * @return @ref XXH_OK on success.
+ * @return @ref XXH_ERROR on failure.
+ *
+ * @note
+ *   - This function resets `statePtr` and generates a secret with default parameters.
+ *   - Call it before @ref XXH3_128bits_update().
+ *   - Digest will be equivalent to `XXH3_128bits()`.
+ *
+ * @see @ref streaming_example "Streaming Example"
+ */
+XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset(XXH_NOESCAPE XXH3_state_t* statePtr);
+
+/*!
+ * @brief Resets an @ref XXH3_state_t with 64-bit seed to begin a new hash.
+ *
+ * @param statePtr The state struct to reset.
+ * @param seed     The 64-bit seed to alter the hash result predictably.
+ *
+ * @pre
+ *   @p statePtr must not be `NULL`.
+ *
+ * @return @ref XXH_OK on success.
+ * @return @ref XXH_ERROR on failure.
+ *
+ * @note
+ *   - This function resets `statePtr` and generates a secret from `seed`.
+ *   - Call it before @ref XXH3_128bits_update().
+ *   - Digest will be equivalent to `XXH3_128bits_withSeed()`.
+ *
+ * @see @ref streaming_example "Streaming Example"
+ */
+XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH64_hash_t seed);
+/*!
+ * @brief Resets an @ref XXH3_state_t with secret data to begin a new hash.
+ *
+ * @param statePtr   The state struct to reset.
+ * @param secret     The secret data.
+ * @param secretSize The length of @p secret, in bytes.
+ *
+ * @pre
+ *   @p statePtr must not be `NULL`.
+ *
+ * @return @ref XXH_OK on success.
+ * @return @ref XXH_ERROR on failure.
+ *
+ * `secret` is referenced, it _must outlive_ the hash streaming session.
+ * Similar to one-shot API, `secretSize` must be >= @ref XXH3_SECRET_SIZE_MIN,
+ * and the quality of produced hash values depends on secret's entropy
+ * (secret's content should look like a bunch of random bytes).
+ * When in doubt about the randomness of a candidate `secret`,
+ * consider employing `XXH3_generateSecret()` instead (see below).
+ *
+ * @see @ref streaming_example "Streaming Example"
+ */
+XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSecret(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize);
+
+/*!
+ * @brief Consumes a block of @p input to an @ref XXH3_state_t.
+ *
+ * Call this to incrementally consume blocks of data.
+ *
+ * @param statePtr The state struct to update.
+ * @param input The block of data to be hashed, at least @p length bytes in size.
+ * @param length The length of @p input, in bytes.
+ *
+ * @pre
+ *   @p statePtr must not be `NULL`.
+ *
+ * @return @ref XXH_OK on success.
+ * @return @ref XXH_ERROR on failure.
+ *
+ * @note
+ *   The memory between @p input and @p input + @p length must be valid,
+ *   readable, contiguous memory. However, if @p length is `0`, @p input may be
+ *   `NULL`. In C++, this also must be *TriviallyCopyable*.
+ *
+ */
+XXH_PUBLIC_API XXH_errorcode XXH3_128bits_update (XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* input, size_t length);
+
+/*!
+ * @brief Returns the calculated XXH3 128-bit hash value from an @ref XXH3_state_t.
+ *
+ * @param statePtr The state struct to calculate the hash from.
+ *
+ * @pre
+ *  @p statePtr must not be `NULL`.
+ *
+ * @return The calculated XXH3 128-bit hash value from that state.
+ *
+ * @note
+ *   Calling XXH3_128bits_digest() will not affect @p statePtr, so you can update,
+ *   digest, and update again.
+ *
+ */
+XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_digest (XXH_NOESCAPE const XXH3_state_t* statePtr);
+#endif /* !XXH_NO_STREAM */
+
+/* The following helper functions make it possible to compare XXH128_hash_t values.
+ * Since XXH128_hash_t is a structure, this capability is not offered by the language.
+ * Note: For better performance, these functions can be inlined using XXH_INLINE_ALL */
+
+/*!
+ * @brief Check equality of two XXH128_hash_t values
+ *
+ * @param h1 The 128-bit hash value.
+ * @param h2 Another 128-bit hash value.
+ *
+ * @return `1` if `h1` and `h2` are equal.
+ * @return `0` if they are not.
+ */
+XXH_PUBLIC_API XXH_PUREF int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2);
+
+/*!
+ * @brief Compares two @ref XXH128_hash_t
+ *
+ * This comparator is compatible with stdlib's `qsort()`/`bsearch()`.
+ *
+ * @param h128_1 Left-hand side value
+ * @param h128_2 Right-hand side value
+ *
+ * @return >0 if @p h128_1  > @p h128_2
+ * @return =0 if @p h128_1 == @p h128_2
+ * @return <0 if @p h128_1  < @p h128_2
+ */
+XXH_PUBLIC_API XXH_PUREF int XXH128_cmp(XXH_NOESCAPE const void* h128_1, XXH_NOESCAPE const void* h128_2);
+
+
+/*******   Canonical representation   *******/
+typedef struct { unsigned char digest[sizeof(XXH128_hash_t)]; } XXH128_canonical_t; /* byte image of an XXH128_hash_t; XXH128_canonicalFromHash() stores it big endian */
+
+
+/*!
+ * @brief Converts an @ref XXH128_hash_t to a big endian @ref XXH128_canonical_t.
+ *
+ * @param dst  The @ref XXH128_canonical_t pointer to be stored to.
+ * @param hash The @ref XXH128_hash_t to be converted.
+ *
+ * @pre
+ *   @p dst must not be `NULL`.
+ * @see @ref canonical_representation_example "Canonical Representation Example"
+ */
+XXH_PUBLIC_API void XXH128_canonicalFromHash(XXH_NOESCAPE XXH128_canonical_t* dst, XXH128_hash_t hash);
+
+/*!
+ * @brief Converts an @ref XXH128_canonical_t to a native @ref XXH128_hash_t.
+ *
+ * @param src The @ref XXH128_canonical_t to convert.
+ *
+ * @pre
+ *   @p src must not be `NULL`.
+ *
+ * @return The converted hash.
+ * @see @ref canonical_representation_example "Canonical Representation Example"
+ */
+XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH128_hashFromCanonical(XXH_NOESCAPE const XXH128_canonical_t* src);
+
+
+#endif  /* !XXH_NO_XXH3 */
+#endif  /* XXH_NO_LONG_LONG */
+
+/*!
+ * @}
+ */
+#endif /* XXHASH_H_5627135585666179 */
+
+
+
+#if defined(XXH_STATIC_LINKING_ONLY) && !defined(XXHASH_H_STATIC_13879238742)
+#define XXHASH_H_STATIC_13879238742
+/* ****************************************************************************
+ * This section contains declarations which are not guaranteed to remain stable.
+ * They may change in future versions, becoming incompatible with a different
+ * version of the library.
+ * These declarations should only be used with static linking.
+ * Never use them in association with dynamic linking!
+ ***************************************************************************** */
+
+/*
+ * These definitions are only present to allow static allocation
+ * of XXH states, on stack or in a struct, for example.
+ * Never **ever** access their members directly.
+ */
+
+/*!
+ * @internal
+ * @brief Structure for XXH32 streaming API.
+ *
+ * @note This is only defined when @ref XXH_STATIC_LINKING_ONLY,
+ * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined. Otherwise it is
+ * an opaque type. This allows fields to safely be changed.
+ *
+ * Typedef'd to @ref XXH32_state_t.
+ * Do not access the members of this struct directly.
+ * @see XXH64_state_s, XXH3_state_s
+ */
+struct XXH32_state_s {
+   XXH32_hash_t total_len_32; /*!< Total length hashed, modulo 2^32 */
+   XXH32_hash_t large_len;    /*!< Whether the input length is >= 16 (handles @ref total_len_32 overflow) */
+   XXH32_hash_t v[4];         /*!< Accumulator lanes */
+   XXH32_hash_t mem32[4];     /*!< Internal buffer for partial reads. Treated as unsigned char[16]. */
+   XXH32_hash_t memsize;      /*!< Amount of data in @ref mem32 */
+   XXH32_hash_t reserved;     /*!< Reserved field. Do not read or write to it. */
+};   /* typedef'd to XXH32_state_t */
+
+
+#ifndef XXH_NO_LONG_LONG  /* defined when there is no 64-bit support */
+
+/*!
+ * @internal
+ * @brief Structure for XXH64 streaming API.
+ *
+ * @note This is only defined when @ref XXH_STATIC_LINKING_ONLY,
+ * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined. Otherwise it is
+ * an opaque type. This allows fields to safely be changed.
+ *
+ * Typedef'd to @ref XXH64_state_t.
+ * Do not access the members of this struct directly.
+ * @see XXH32_state_s, XXH3_state_s
+ */
+struct XXH64_state_s {
+   XXH64_hash_t total_len;    /*!< Total length hashed. This is always 64-bit. */
+   XXH64_hash_t v[4];         /*!< Accumulator lanes */
+   XXH64_hash_t mem64[4];     /*!< Internal buffer for partial reads. Treated as unsigned char[32]. */
+   XXH32_hash_t memsize;      /*!< Amount of data in @ref mem64 */
+   XXH32_hash_t reserved32;   /*!< Reserved field, needed for padding anyway. */
+   XXH64_hash_t reserved64;   /*!< Reserved field. Do not read or write to it. */
+};   /* typedef'd to XXH64_state_t */
+
+#ifndef XXH_NO_XXH3
+
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) /* >= C11: alignas is provided by <stdalign.h> */
+#  include <stdalign.h>
+#  define XXH_ALIGN(n)      alignas(n)
+#elif defined(__cplusplus) && (__cplusplus >= 201103L) /* >= C++11 */
+/* In C++ alignas() is a keyword, so no header is included */
+#  define XXH_ALIGN(n)      alignas(n)
+#elif defined(__GNUC__) /* older GNU-compatible compilers: attribute syntax */
+#  define XXH_ALIGN(n)      __attribute__ ((aligned(n)))
+#elif defined(_MSC_VER) /* MSVC: declspec syntax */
+#  define XXH_ALIGN(n)      __declspec(align(n))
+#else
+#  define XXH_ALIGN(n)   /* disabled: expands to nothing, no alignment is requested */
+#endif
+
+/* Without C11/C++11 alignas, GCC only accepts the attribute after the member declaration in structures, so the order flips. */
+#if !(defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L))   /* C11+ */ \
+    && ! (defined(__cplusplus) && (__cplusplus >= 201103L)) /* >= C++11 */ \
+    && defined(__GNUC__)
+#   define XXH_ALIGN_MEMBER(align, type) type XXH_ALIGN(align)
+#else
+#   define XXH_ALIGN_MEMBER(align, type) XXH_ALIGN(align) type
+#endif
+
+/*!
+ * @brief The size of the internal XXH3 buffer, in bytes.
+ *
+ * This is the optimal update size for incremental hashing.
+ *
+ * @see XXH3_64bits_update(), XXH3_128bits_update().
+ */
+#define XXH3_INTERNALBUFFER_SIZE 256
+
+/*!
+ * @internal
+ * @brief Default size of the secret buffer (and @ref XXH3_kSecret).
+ *
+ * This is the size used in @ref XXH3_kSecret and the seeded functions.
+ *
+ * Not to be confused with @ref XXH3_SECRET_SIZE_MIN.
+ */
+#define XXH3_SECRET_DEFAULT_SIZE 192
+
+/*!
+ * @internal
+ * @brief Structure for XXH3 streaming API.
+ *
+ * @note This is only defined when @ref XXH_STATIC_LINKING_ONLY,
+ * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined.
+ * Otherwise it is an opaque type.
+ * Never use this definition in combination with dynamic library.
+ * This allows fields to safely be changed in the future.
+ *
+ * @note ** This structure has a strict alignment requirement of 64 bytes!! **
+ * Do not allocate this with `malloc()` or `new`,
+ * it will not be sufficiently aligned.
+ * Use @ref XXH3_createState() and @ref XXH3_freeState(), or stack allocation.
+ *
+ * Typedef'd to @ref XXH3_state_t.
+ * Never access the members of this struct directly.
+ *
+ * @see XXH3_INITSTATE() for stack initialization.
+ * @see XXH3_createState(), XXH3_freeState().
+ * @see XXH32_state_s, XXH64_state_s
+ */
+struct XXH3_state_s {
+   XXH_ALIGN_MEMBER(64, XXH64_hash_t acc[8]);
+       /*!< The 8 accumulators. See @ref XXH32_state_s::v and @ref XXH64_state_s::v */
+   XXH_ALIGN_MEMBER(64, unsigned char customSecret[XXH3_SECRET_DEFAULT_SIZE]);
+       /*!< Used to store a custom secret generated from a seed. */
+   XXH_ALIGN_MEMBER(64, unsigned char buffer[XXH3_INTERNALBUFFER_SIZE]);
+       /*!< The internal buffer. @see XXH32_state_s::mem32 */
+   XXH32_hash_t bufferedSize;
+       /*!< The amount of data in @ref buffer, @see XXH32_state_s::memsize */
+   XXH32_hash_t useSeed;
+       /*!< Presumably a flag recording whether @ref seed is in use (TODO confirm). Also pads the layout on 64-bit. */
+   size_t nbStripesSoFar;
+       /*!< Number of stripes processed. */
+   XXH64_hash_t totalLen;
+       /*!< Total length hashed. 64-bit even on 32-bit targets. */
+   size_t nbStripesPerBlock;
+       /*!< Number of stripes per block. */
+   size_t secretLimit;
+       /*!< Size of @ref customSecret or @ref extSecret */
+   XXH64_hash_t seed;
+       /*!< Seed for _withSeed variants. Must be zero otherwise, @see XXH3_INITSTATE() */
+   XXH64_hash_t reserved64;
+       /*!< Reserved field. */
+   const unsigned char* extSecret;
+       /*!< Reference to an external secret for the _withSecret variants, NULL
+        *   for other variants. */
+   /* note: there may be some padding at the end due to alignment on 64 bytes */
+}; /* typedef'd to XXH3_state_t */
+
+#undef XXH_ALIGN_MEMBER
+
+/*!
+ * @brief Initializes a stack-allocated `XXH3_state_s`.
+ *
+ * When an @ref XXH3_state_t is merely placed on the stack, initialize it with
+ * XXH3_INITSTATE() or a memset() if its first reset uses
+ * XXH3_NNbits_reset_withSeed(). The macro only sets `seed` to 0 and `extSecret` to NULL.
+ * This init can be omitted if the first reset uses default or _withSecret mode.
+ * This operation isn't necessary when the state is created with XXH3_createState().
+ * Note that this doesn't prepare the state for a streaming operation:
+ * it's still necessary to use XXH3_NNbits_reset*() afterwards.
+ */
+#define XXH3_INITSTATE(XXH3_state_ptr)                       \
+    do {                                                     \
+        XXH3_state_t* tmp_xxh3_state_ptr = (XXH3_state_ptr); \
+        tmp_xxh3_state_ptr->seed = 0;                        \
+        tmp_xxh3_state_ptr->extSecret = NULL;                \
+    } while(0)
+
+
+/*!
+ * @brief Calculates the 128-bit hash of @p data using XXH3.
+ *
+ * @param data The block of data to be hashed, at least @p len bytes in size.
+ * @param len  The length of @p data, in bytes.
+ * @param seed The 64-bit seed to alter the hash's output predictably.
+ *
+ * @pre
+ *   The memory between @p data and @p data + @p len must be valid,
+ *   readable, contiguous memory. However, if @p len is `0`, @p data may be
+ *   `NULL`. In C++, this also must be *TriviallyCopyable*.
+ *
+ * @return The calculated 128-bit XXH3 value.
+ *
+ * @see @ref single_shot_example "Single Shot Example" for an example.
+ */
+XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH128(XXH_NOESCAPE const void* data, size_t len, XXH64_hash_t seed);
+
+
+/* ===   Experimental API   === */
+/* Symbols defined below must be considered tied to a specific library version. */
+
+/*!
+ * @brief Derive a high-entropy secret from any user-defined content, named customSeed.
+ *
+ * @param secretBuffer    A writable buffer for derived high-entropy secret data.
+ * @param secretSize      Size of secretBuffer, in bytes.  Must be >= @ref XXH3_SECRET_SIZE_MIN.
+ * @param customSeed      A user-defined content.
+ * @param customSeedSize  Size of customSeed, in bytes.
+ *
+ * @return @ref XXH_OK on success.
+ * @return @ref XXH_ERROR on failure.
+ *
+ * The generated secret can be used in combination with `*_withSecret()` functions.
+ * The `_withSecret()` variants are useful to provide a higher level of protection
+ * than 64-bit seed, as it becomes much more difficult for an external actor to
+ * guess how to impact the calculation logic.
+ *
+ * The function accepts as input a custom seed of any length and any content,
+ * and derives from it a high-entropy secret of length @p secretSize into an
+ * already allocated buffer @p secretBuffer.
+ *
+ * The generated secret can then be used with any `*_withSecret()` variant.
+ * The functions @ref XXH3_128bits_withSecret(), @ref XXH3_64bits_withSecret(),
+ * @ref XXH3_128bits_reset_withSecret() and @ref XXH3_64bits_reset_withSecret()
+ * are part of this list. They all accept a `secret` parameter
+ * which must be large enough for implementation reasons (>= @ref XXH3_SECRET_SIZE_MIN)
+ * _and_ feature very high entropy (consist of random-looking bytes).
+ * These conditions can be a high bar to meet, so @ref XXH3_generateSecret() can
+ * be employed to ensure proper quality.
+ *
+ * @p customSeed can be anything. It can have any size, even small ones,
+ * and its content can be anything, even "poor entropy" sources such as a bunch
+ * of zeroes. The resulting `secret` will nonetheless provide all required qualities.
+ *
+ * @pre
+ *   - @p secretSize must be >= @ref XXH3_SECRET_SIZE_MIN
+ *   - When @p customSeedSize > 0, supplying NULL as customSeed is undefined behavior.
+ *
+ * Example code:
+ * @code{.c}
+ *    #include <stdio.h>
+ *    #include <stdlib.h>
+ *    #include <string.h>
+ *    #define XXH_STATIC_LINKING_ONLY // expose unstable API
+ *    #include "xxhash.h"
+ *    // Hashes argv[2] using the entropy from argv[1].
+ *    int main(int argc, char* argv[])
+ *    {
+ *        char secret[XXH3_SECRET_SIZE_MIN];
+ *        if (argc != 3) { return 1; }
+ *        XXH3_generateSecret(secret, sizeof(secret), argv[1], strlen(argv[1]));
+ *        XXH64_hash_t h = XXH3_64bits_withSecret(
+ *             argv[2], strlen(argv[2]),
+ *             secret, sizeof(secret)
+ *        );
+ *        printf("%016llx\n", (unsigned long long) h);
+ *    }
+ * @endcode
+ */
+XXH_PUBLIC_API XXH_errorcode XXH3_generateSecret(XXH_NOESCAPE void* secretBuffer, size_t secretSize, XXH_NOESCAPE const void* customSeed, size_t customSeedSize);
+
+/*!
+ * @brief Generate the same secret as the _withSeed() variants.
+ *
+ * @param secretBuffer A writable buffer of @ref XXH3_SECRET_DEFAULT_SIZE bytes
+ * @param seed         The 64-bit seed to alter the hash result predictably.
+ *
+ * The generated secret can be used in combination with
+ *`*_withSecret()` and `_withSecretandSeed()` variants.
+ *
+ * Example C++ `std::string` hash class:
+ * @code{.cpp}
+ *    #include <string>
+ *    #define XXH_STATIC_LINKING_ONLY // expose unstable API
+ *    #include "xxhash.h"
+ *    // Slow, seeds each time
+ *    class HashSlow {
+ *        XXH64_hash_t seed;
+ *    public:
+ *        HashSlow(XXH64_hash_t s) : seed{s} {}
+ *        size_t operator()(const std::string& x) const {
+ *            return size_t{XXH3_64bits_withSeed(x.c_str(), x.length(), seed)};
+ *        }
+ *    };
+ *    // Fast, caches the seeded secret for future uses.
+ *    class HashFast {
+ *        unsigned char secret[XXH3_SECRET_DEFAULT_SIZE];
+ *    public:
+ *        HashFast(XXH64_hash_t s) {
+ *            XXH3_generateSecret_fromSeed(secret, s);
+ *        }
+ *        size_t operator()(const std::string& x) const {
+ *            return size_t{
+ *                XXH3_64bits_withSecret(x.c_str(), x.length(), secret, sizeof(secret))
+ *            };
+ *        }
+ *    };
+ * @endcode
+ */
+XXH_PUBLIC_API void XXH3_generateSecret_fromSeed(XXH_NOESCAPE void* secretBuffer, XXH64_hash_t seed);
+
+/*!
+ * @brief Calculates 64/128-bit seeded variant of XXH3 hash of @p data.
+ *
+ * @param data       The block of data to be hashed, at least @p len bytes in size.
+ * @param len        The length of @p data, in bytes.
+ * @param secret     The secret data.
+ * @param secretSize The length of @p secret, in bytes.
+ * @param seed       The 64-bit seed to alter the hash result predictably.
+ *
+ * These variants generate hash values using either
+ * @p seed for "short" keys (< @ref XXH3_MIDSIZE_MAX = 240 bytes)
+ * or @p secret for "large" keys (>= @ref XXH3_MIDSIZE_MAX).
+ *
+ * This generally benefits speed, compared to `_withSeed()` or `_withSecret()`.
+ * `_withSeed()` has to generate the secret on the fly for "large" keys.
+ * It's fast, but can be perceptible for "not so large" keys (< 1 KB).
+ * `_withSecret()` has to generate the masks on the fly for "small" keys,
+ * which requires more instructions than _withSeed() variants.
+ * Therefore, _withSecretandSeed variant combines the best of both worlds.
+ *
+ * When @p secret has been generated by XXH3_generateSecret_fromSeed(),
+ * this variant produces *exactly* the same results as `_withSeed()` variant,
+ * hence offering only a pure speed benefit on "large" input,
+ * by skipping the need to regenerate the secret for every large input.
+ *
+ * Another usage scenario is to hash the secret to a 64-bit hash value,
+ * for example with XXH3_64bits(), which then becomes the seed,
+ * and then employ both the seed and the secret in _withSecretandSeed().
+ * On top of speed, an added benefit is that each bit in the secret
+ * has a 50% chance to swap each bit in the output, via its impact to the seed.
+ *
+ * This is not guaranteed when using the secret directly in "small data" scenarios,
+ * because only portions of the secret are employed for small data.
+ */
+XXH_PUBLIC_API XXH_PUREF XXH64_hash_t
+XXH3_64bits_withSecretandSeed(XXH_NOESCAPE const void* data, size_t len,
+                              XXH_NOESCAPE const void* secret, size_t secretSize,
+                              XXH64_hash_t seed);
+/*!
+ * @brief Calculates 128-bit seeded variant of XXH3 hash of @p input.
+ *
+ * @param input      The block of data to be hashed, at least @p length bytes in size.
+ * @param length     The length of @p input, in bytes.
+ * @param secret     The secret data.
+ * @param secretSize The length of @p secret, in bytes.
+ * @param seed64     The 64-bit seed to alter the hash result predictably.
+ *
+ * @return The calculated 128-bit variant of XXH3 value
+ *         (an @ref XXH128_hash_t, not an @ref XXH_errorcode).
+ *
+ * @see XXH3_64bits_withSecretandSeed()
+ */
+XXH_PUBLIC_API XXH_PUREF XXH128_hash_t
+XXH3_128bits_withSecretandSeed(XXH_NOESCAPE const void* input, size_t length,
+                               XXH_NOESCAPE const void* secret, size_t secretSize,
+                               XXH64_hash_t seed64);
+#ifndef XXH_NO_STREAM
+/*!
+ * @brief Resets an @ref XXH3_state_t with secret data to begin a new hash.
+ *
+ * @param statePtr   A pointer to an @ref XXH3_state_t allocated with @ref XXH3_createState().
+ * @param secret     The secret data.
+ * @param secretSize The length of @p secret, in bytes.
+ * @param seed64     The 64-bit seed to alter the hash result predictably.
+ *
+ * @return @ref XXH_OK on success.
+ * @return @ref XXH_ERROR on failure.
+ *
+ * @see XXH3_64bits_withSecretandSeed()
+ */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_64bits_reset_withSecretandSeed(XXH_NOESCAPE XXH3_state_t* statePtr,
+                                    XXH_NOESCAPE const void* secret, size_t secretSize,
+                                    XXH64_hash_t seed64);
+/*!
+ * @brief Resets an @ref XXH3_state_t with secret data to begin a new hash.
+ *
+ * @param statePtr   A pointer to an @ref XXH3_state_t allocated with @ref XXH3_createState().
+ * @param secret     The secret data.
+ * @param secretSize The length of @p secret, in bytes.
+ * @param seed64     The 64-bit seed to alter the hash result predictably.
+ *
+ * @return @ref XXH_OK on success.
+ * @return @ref XXH_ERROR on failure.
+ *
+ * @see XXH3_64bits_withSecretandSeed()
+ */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_128bits_reset_withSecretandSeed(XXH_NOESCAPE XXH3_state_t* statePtr,
+                                     XXH_NOESCAPE const void* secret, size_t secretSize,
+                                     XXH64_hash_t seed64);
+#endif /* !XXH_NO_STREAM */
+
+#endif  /* !XXH_NO_XXH3 */
+#endif  /* XXH_NO_LONG_LONG */
+#if defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API) /* inlining needs the implementation visible alongside the header */
+#  define XXH_IMPLEMENTATION
+#endif
+
+#endif  /* defined(XXH_STATIC_LINKING_ONLY) && !defined(XXHASH_H_STATIC_13879238742) */
+
+
+/* ======================================================================== */
+/* ======================================================================== */
+/* ======================================================================== */
+
+
+/*-**********************************************************************
+ * xxHash implementation
+ *-**********************************************************************
+ * xxHash's implementation used to be hosted inside xxhash.c.
+ *
+ * However, inlining requires implementation to be visible to the compiler,
+ * hence be included alongside the header.
+ * Previously, implementation was hosted inside xxhash.c,
+ * which was then #included when inlining was activated.
+ * This construction created issues with a few build and install systems,
+ * as it required xxhash.c to be stored in /include directory.
+ *
+ * xxHash implementation is now directly integrated within xxhash.h.
+ * As a consequence, xxhash.c is no longer needed in /include.
+ *
+ * xxhash.c is still available and is still useful.
+ * In a "normal" setup, when xxhash is not inlined,
+ * xxhash.h only exposes the prototypes and public symbols,
+ * while xxhash.c can be built into an object file xxhash.o
+ * which can then be linked into the final binary.
+ ************************************************************************/
+
+#if ( defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API) \
+   || defined(XXH_IMPLEMENTATION) ) && !defined(XXH_IMPLEM_13a8737387)
+#  define XXH_IMPLEM_13a8737387
+
+/* *************************************
+*  Tuning parameters
+***************************************/
+
+/*!
+ * @defgroup tuning Tuning parameters
+ * @{
+ *
+ * Various macros to control xxHash's behavior.
+ */
+#ifdef XXH_DOXYGEN
+/*!
+ * @brief Define this to disable 64-bit code.
+ *
+ * Useful if only using the @ref XXH32_family and you have a strict C90 compiler.
+ */
+#  define XXH_NO_LONG_LONG
+#  undef XXH_NO_LONG_LONG /* don't actually */
+/*!
+ * @brief Controls how unaligned memory is accessed.
+ *
+ * By default, access to unaligned memory is controlled by `memcpy()`, which is
+ * safe and portable.
+ *
+ * Unfortunately, on some target/compiler combinations, the generated assembly
+ * is sub-optimal.
+ *
+ * The switch below allows selection of a different access method
+ * in the search for improved performance.
+ *
+ * @par Possible options:
+ *
+ *  - `XXH_FORCE_MEMORY_ACCESS=0` (default): `memcpy`
+ *   @par
+ *     Use `memcpy()`. Safe and portable. Note that most modern compilers will
+ *     eliminate the function call and treat it as an unaligned access.
+ *
+ *  - `XXH_FORCE_MEMORY_ACCESS=1`: `__attribute__((aligned(1)))`
+ *   @par
+ *     Depends on compiler extensions and is therefore not portable.
+ *     This method is safe _if_ your compiler supports it,
+ *     and *generally* as fast or faster than `memcpy`.
+ *
+ *  - `XXH_FORCE_MEMORY_ACCESS=2`: Direct cast
+ *  @par
+ *     Casts directly and dereferences. This method doesn't depend on the
+ *     compiler, but it violates the C standard as it directly dereferences an
+ *     unaligned pointer. It can generate buggy code on targets which do not
+ *     support unaligned memory accesses, but in some circumstances, it's the
+ *     only known way to get the most performance.
+ *
+ *  - `XXH_FORCE_MEMORY_ACCESS=3`: Byteshift
+ *  @par
+ *     Also portable. This can generate the best code on old compilers which don't
+ *     inline small `memcpy()` calls, and it might also be faster on big-endian
+ *     systems which lack a native byteswap instruction. However, some compilers
+ *     will emit literal byteshifts even if the target supports unaligned access.
+ *
+ *
+ * @warning
+ *   Methods 1 and 2 rely on implementation-defined behavior. Use these with
+ *   care, as what works on one compiler/platform/optimization level may cause
+ *   another to read garbage data or even crash.
+ *
+ * See https://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html for details.
+ *
+ * Prefer these methods in priority order (0 > 3 > 1 > 2)
+ */
+#  define XXH_FORCE_MEMORY_ACCESS 0
+
+/*!
+ * @def XXH_SIZE_OPT
+ * @brief Controls how much xxHash optimizes for size.
+ *
+ * xxHash, when compiled, tends to result in a rather large binary size. This
+ * is mostly due to heavy usage of forced inlining and constant folding of the
+ * @ref XXH3_family to increase performance.
+ *
+ * However, some developers prefer size over speed. This option can
+ * significantly reduce the size of the generated code. When using the `-Os`
+ * or `-Oz` options on GCC or Clang, this is defined to 1 by default,
+ * otherwise it is defined to 0.
+ *
+ * Most of these size optimizations can be controlled manually.
+ *
+ * This is a number from 0-2.
+ *  - `XXH_SIZE_OPT` == 0: Default. xxHash makes no size optimizations. Speed
+ *    comes first.
+ *  - `XXH_SIZE_OPT` == 1: Default for `-Os` and `-Oz`. xxHash is more
+ *    conservative and disables hacks that increase code size. It implies the
+ *    options @ref XXH_NO_INLINE_HINTS == 1, @ref XXH_FORCE_ALIGN_CHECK == 0,
+ *    and @ref XXH3_NEON_LANES == 8 if they are not already defined.
+ *  - `XXH_SIZE_OPT` == 2: xxHash tries to make itself as small as possible.
+ *    Performance may cry. For example, the single shot functions just use the
+ *    streaming API.
+ */
+#  define XXH_SIZE_OPT 0
+
+/*!
+ * @def XXH_FORCE_ALIGN_CHECK
+ * @brief If defined to non-zero, adds a special path for aligned inputs (XXH32()
+ * and XXH64() only).
+ *
+ * This is an important performance trick for architectures without decent
+ * unaligned memory access performance.
+ *
+ * It checks for input alignment, and when conditions are met, uses a "fast
+ * path" employing direct 32-bit/64-bit reads, resulting in _dramatically
+ * faster_ read speed.
+ *
+ * The check costs one initial branch per hash, which is generally negligible,
+ * but not zero.
+ *
+ * Moreover, it's not useful to generate an additional code path if memory
+ * access uses the same instruction for both aligned and unaligned
+ * addresses (e.g. x86 and aarch64).
+ *
+ * In these cases, the alignment check can be removed by setting this macro to 0.
+ * Then the code will always use unaligned memory access.
+ * Align check is automatically disabled on x86, x64, ARM64, and some ARM chips
+ * which are platforms known to offer good unaligned memory accesses performance.
+ *
+ * It is also disabled by default when @ref XXH_SIZE_OPT >= 1.
+ *
+ * This option does not affect XXH3 (only XXH32 and XXH64).
+ */
+#  define XXH_FORCE_ALIGN_CHECK 0
+
+/*!
+ * @def XXH_NO_INLINE_HINTS
+ * @brief When non-zero, sets all functions to `static`.
+ *
+ * By default, xxHash tries to force the compiler to inline almost all internal
+ * functions.
+ *
+ * This can usually improve performance due to reduced jumping and improved
+ * constant folding, but significantly increases the size of the binary which
+ * might not be favorable.
+ *
+ * Additionally, sometimes the forced inlining can be detrimental to performance,
+ * depending on the architecture.
+ *
+ * XXH_NO_INLINE_HINTS marks all internal functions as static, giving the
+ * compiler full control on whether to inline or not.
+ *
+ * When not optimizing (-O0), using `-fno-inline` with GCC or Clang, or if
+ * @ref XXH_SIZE_OPT >= 1, this will automatically be defined.
+ */
+#  define XXH_NO_INLINE_HINTS 0
+
+/*!
+ * @def XXH3_INLINE_SECRET
+ * @brief Determines whether to inline the XXH3 withSecret code.
+ *
+ * When the secret size is known, the compiler can improve the performance
+ * of XXH3_64bits_withSecret() and XXH3_128bits_withSecret().
+ *
+ * However, if the secret size is not known, it doesn't have any benefit. This
+ * happens when xxHash is compiled into a global symbol. Therefore, if
+ * @ref XXH_INLINE_ALL is *not* defined, this will be defined to 0.
+ *
+ * Additionally, this defaults to 0 on GCC 12+, which has an issue with function pointers
+ * that are *sometimes* force inline on -Og, and it is impossible to automatically
+ * detect this optimization level.
+ */
+#  define XXH3_INLINE_SECRET 0
+
+/*!
+ * @def XXH32_ENDJMP
+ * @brief Whether to use a jump for `XXH32_finalize`.
+ *
+ * For performance, `XXH32_finalize` uses multiple branches in the finalizer.
+ * This is generally preferable for performance,
+ * but depending on exact architecture, a jmp may be preferable.
+ *
+ * This setting is only possibly making a difference for very small inputs.
+ */
+#  define XXH32_ENDJMP 0
+
+/*!
+ * @internal
+ * @brief Redefines old internal names.
+ *
+ * For compatibility with code that uses xxHash's internals before the names
+ * were changed to improve namespacing. There is no other reason to use this.
+ */
+#  define XXH_OLD_NAMES
+#  undef XXH_OLD_NAMES /* don't actually use, it is ugly. */
+
+/*!
+ * @def XXH_NO_STREAM
+ * @brief Disables the streaming API.
+ *
+ * When xxHash is not inlined and the streaming functions are not used, disabling
+ * the streaming functions can improve code size significantly, especially with
+ * the @ref XXH3_family which tends to make constant folded copies of itself.
+ */
+#  define XXH_NO_STREAM
+#  undef XXH_NO_STREAM /* don't actually */
+#endif /* XXH_DOXYGEN */
+/*!
+ * @}
+ */
+
+#ifndef XXH_FORCE_MEMORY_ACCESS   /* can be defined externally, on command line for example */
+   /* Default to method 1 (an __attribute__((aligned(1))) typedef read) on GNU-compatible compilers.
+    * Exception: on pre-ARMv7 targets with unaligned access (e.g. Raspbian armhf) method 1 still
+    * uses byte shifting, so the macro is left undefined there and the memcpy() read is used. */
+#  if defined(__GNUC__) && !(defined(__ARM_ARCH) && __ARM_ARCH < 7 && defined(__ARM_FEATURE_UNALIGNED))
+#    define XXH_FORCE_MEMORY_ACCESS 1
+#  endif
+#endif
+
+#ifndef XXH_SIZE_OPT
+   /* unless set externally: 1 when the compiler defines __OPTIMIZE_SIZE__ (-Os or -Oz), else 0 */
+#  if (defined(__GNUC__) || defined(__clang__)) && defined(__OPTIMIZE_SIZE__)
+#    define XXH_SIZE_OPT 1
+#  else
+#    define XXH_SIZE_OPT 0
+#  endif
+#endif
+
+#ifndef XXH_FORCE_ALIGN_CHECK  /* can be defined externally */
+   /* default 0 (no alignment test) on sizeopt, x86, aarch64, or arm when unaligned access is available; 1 otherwise */
+#  if XXH_SIZE_OPT >= 1 || \
+      defined(__i386)  || defined(__x86_64__) || defined(__aarch64__) || defined(__ARM_FEATURE_UNALIGNED) \
+   || defined(_M_IX86) || defined(_M_X64)     || defined(_M_ARM64)    || defined(_M_ARM) /* MSVC target macros */
+#    define XXH_FORCE_ALIGN_CHECK 0
+#  else
+#    define XXH_FORCE_ALIGN_CHECK 1
+#  endif
+#endif
+
+#ifndef XXH_NO_INLINE_HINTS  /* can be defined externally */
+#  if XXH_SIZE_OPT >= 1 || defined(__NO_INLINE__)  /* size-optimized build, or -O0 / -fno-inline */
+#    define XXH_NO_INLINE_HINTS 1
+#  else
+#    define XXH_NO_INLINE_HINTS 0
+#  endif
+#endif
+
+#ifndef XXH3_INLINE_SECRET  /* can be defined externally */
+#  if (defined(__GNUC__) && !defined(__clang__) && __GNUC__ >= 12) \
+     || !defined(XXH_INLINE_ALL)  /* GCC 12+ (not clang), or xxHash not fully inlined */
+#    define XXH3_INLINE_SECRET 0
+#  else
+#    define XXH3_INLINE_SECRET 1
+#  endif
+#endif
+
+#ifndef XXH32_ENDJMP
+/* 0 selects the looped tail in XXH32_finalize() instead of the switch; generally preferable for performance */
+#  define XXH32_ENDJMP 0
+#endif
+
+/*!
+ * @defgroup impl Implementation
+ * @{
+ */
+
+
+/* *************************************
+*  Includes & Memory related functions
+***************************************/
+#if defined(XXH_NO_STREAM)
+/* streaming API disabled: no allocation wrappers are defined */
+#elif defined(XXH_NO_STDLIB)
+
+/* When requesting to disable any mention of stdlib,
+ * the library loses the ability to invoke malloc / free.
+ * In practice, it means that functions like `XXH*_createState()`
+ * will always fail, and return NULL.
+ * This flag is useful in situations where
+ * xxhash.h is integrated into some kernel, embedded or limited environment
+ * without access to dynamic allocation.
+ */
+
+static XXH_CONSTF void* XXH_malloc(size_t s) { (void)s; return NULL; }  /* always reports failure */
+static void XXH_free(void* p) { (void)p; }                              /* nothing to release */
+
+#else
+
+/*
+ * These two wrappers are the only place the library calls malloc() and free();
+ * edit their bodies to plug in different memory routines.
+ */
+#include <stdlib.h>
+
+/*!
+ * @internal
+ * @brief Allocates @p s bytes by forwarding to malloc(); modify to use a different routine.
+ */
+static XXH_MALLOCF void* XXH_malloc(size_t s) { return malloc(s); }
+
+/*!
+ * @internal
+ * @brief Releases @p p by forwarding to free(); modify to use a different routine.
+ */
+static void XXH_free(void* p) { free(p); }
+
+#endif  /* XXH_NO_STREAM / XXH_NO_STDLIB */
+
+#include <string.h>
+
+/*!
+ * @internal
+ * @brief Copies @p size bytes from @p src to @p dest via memcpy() and returns its result; modify to use a different routine.
+ */
+static void* XXH_memcpy(void* dest, const void* src, size_t size)
+{
+    return memcpy(dest,src,size);
+}
+
+#include <limits.h>   /* ULLONG_MAX */
+
+
+/* *************************************
+*  Compiler Specific Options
+***************************************/
+#ifdef _MSC_VER /* Visual Studio warning fix */
+#  pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */
+#endif
+
+#if XXH_NO_INLINE_HINTS  /* hints disabled: both macros reduce to plain static */
+#  if defined(__GNUC__) || defined(__clang__)
+#    define XXH_FORCE_INLINE static __attribute__((unused))
+#  else
+#    define XXH_FORCE_INLINE static
+#  endif
+#  define XXH_NO_INLINE static
+/* hints enabled: pick the compiler-specific force-inline / no-inline spelling */
+#elif defined(__GNUC__) || defined(__clang__)
+#  define XXH_FORCE_INLINE static __inline__ __attribute__((always_inline, unused))
+#  define XXH_NO_INLINE static __attribute__((noinline))
+#elif defined(_MSC_VER)  /* Visual Studio */
+#  define XXH_FORCE_INLINE static __forceinline
+#  define XXH_NO_INLINE static __declspec(noinline)
+#elif defined (__cplusplus) \
+  || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L))   /* C++ or C99: plain inline only */
+#  define XXH_FORCE_INLINE static inline
+#  define XXH_NO_INLINE static
+#else  /* no inline keyword assumed: static only */
+#  define XXH_FORCE_INLINE static
+#  define XXH_NO_INLINE static
+#endif
+
+#if XXH3_INLINE_SECRET  /* non-zero: the marked functions are force-inlined */
+#  define XXH3_WITH_SECRET_INLINE XXH_FORCE_INLINE
+#else  /* zero: the marked functions are kept out of line */
+#  define XXH3_WITH_SECRET_INLINE XXH_NO_INLINE
+#endif
+
+
+/* *************************************
+*  Debug
+***************************************/
+/*!
+ * @ingroup tuning
+ * @def XXH_DEBUGLEVEL
+ * @brief Sets the debugging level.
+ *
+ * XXH_DEBUGLEVEL is expected to be defined externally, typically via the
+ * compiler's command line options. The value must be a number; it defaults to 0.
+ */
+#ifndef XXH_DEBUGLEVEL
+#  ifdef DEBUGLEVEL /* backwards compat: honor the older, unprefixed macro name */
+#    define XXH_DEBUGLEVEL DEBUGLEVEL
+#  else
+#    define XXH_DEBUGLEVEL 0
+#  endif
+#endif
+
+#if (XXH_DEBUGLEVEL>=1)  /* debug build: real run-time assertions */
+#  include <assert.h>   /* note: can still be disabled with NDEBUG */
+#  define XXH_ASSERT(c)   assert(c)
+#else  /* level 0: the condition is not checked, it is passed to XXH_ASSUME() as an optimizer hint */
+#  if defined(__INTEL_COMPILER)
+#    define XXH_ASSERT(c)   XXH_ASSUME((unsigned char) (c))
+#  else
+#    define XXH_ASSERT(c)   XXH_ASSUME(c)
+#  endif
+#endif
+
+/* Compile-time assertion. It expands to a do { } while(0) statement, so use it after variable declarations. */
+#ifndef XXH_STATIC_ASSERT
+#  if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)    /* C11 */
+#    define XXH_STATIC_ASSERT_WITH_MESSAGE(c,m) do { _Static_assert((c),m); } while(0)
+#  elif defined(__cplusplus) && (__cplusplus >= 201103L)            /* C++11 */
+#    define XXH_STATIC_ASSERT_WITH_MESSAGE(c,m) do { static_assert((c),m); } while(0)
+#  else  /* older dialects: a false condition declares a negative-size array */
+#    define XXH_STATIC_ASSERT_WITH_MESSAGE(c,m) do { struct xxh_sa { char x[(c) ? 1 : -1]; }; } while(0)
+#  endif
+#  define XXH_STATIC_ASSERT(c) XXH_STATIC_ASSERT_WITH_MESSAGE((c),#c)
+#endif
+
+/*!
+ * @internal
+ * @def XXH_COMPILER_GUARD(var)
+ * @brief Used to prevent unwanted optimizations for @p var.
+ *
+ * It uses an empty GCC inline assembly statement with a register constraint
+ * which forces @p var into a general purpose register (e.g. eax, ebx, ecx
+ * on x86) and marks it as modified.
+ *
+ * This is used in a few places to avoid unwanted autovectorization (e.g.
+ * XXH32_round()). All vectorization we want is explicit via intrinsics,
+ * and _usually_ isn't wanted elsewhere.
+ *
+ * We also use it to prevent unwanted constant folding for AArch64 in
+ * XXH3_initCustomSecret_scalar().
+ */
+#if defined(__GNUC__) || defined(__clang__)
+#  define XXH_COMPILER_GUARD(var) __asm__("" : "+r" (var))
+#else  /* other compilers: expands to a no-op expression */
+#  define XXH_COMPILER_GUARD(var) ((void)0)
+#endif
+
+/* Variant of XXH_COMPILER_GUARD for NEON vector variables, which need the "w"
+ * register constraint. Only active on Clang for ARM targets (not wasm). */
+#if defined(__clang__) && defined(__ARM_ARCH) && !defined(__wasm__)
+#  define XXH_COMPILER_GUARD_CLANG_NEON(var) __asm__("" : "+w" (var))
+#else  /* everywhere else: expands to a no-op expression */
+#  define XXH_COMPILER_GUARD_CLANG_NEON(var) ((void)0)
+#endif
+
+/* *************************************
+*  Basic Types
+***************************************/
+#if !defined (__VMS) \
+ && (defined (__cplusplus) \
+ || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
+# include <stdint.h>
+  typedef uint8_t xxh_u8;        /* byte type taken from <stdint.h> */
+#else
+  typedef unsigned char xxh_u8;  /* fallback for VMS and pre-C99 C, where <stdint.h> is not used */
+#endif
+typedef XXH32_hash_t xxh_u32;    /* internal 32-bit word: alias of the public 32-bit hash type */
+
+#ifdef XXH_OLD_NAMES  /* opt-in aliases for the pre-namespacing type names; emits a warning */
+#  warning "XXH_OLD_NAMES is planned to be removed starting v0.9. If the program depends on it, consider moving away from it by employing newer type names directly"
+#  define BYTE xxh_u8
+#  define U8   xxh_u8
+#  define U32  xxh_u32
+#endif
+
+/* ***   Memory access   *** */
+
+/*!
+ * @internal
+ * @fn xxh_u32 XXH_read32(const void* ptr)
+ * @brief Reads an unaligned 32-bit integer from @p ptr in native endianness.
+ *
+ * Affected by @ref XXH_FORCE_MEMORY_ACCESS.
+ *
+ * @param ptr The pointer to read from.
+ * @return The 32-bit native endian integer from the bytes at @p ptr.
+ */
+
+/*!
+ * @internal
+ * @fn xxh_u32 XXH_readLE32(const void* ptr)
+ * @brief Reads an unaligned 32-bit little endian integer from @p ptr.
+ *
+ * Affected by @ref XXH_FORCE_MEMORY_ACCESS.
+ *
+ * @param ptr The pointer to read from.
+ * @return The 32-bit little endian integer from the bytes at @p ptr.
+ */
+
+/*!
+ * @internal
+ * @fn xxh_u32 XXH_readBE32(const void* ptr)
+ * @brief Reads an unaligned 32-bit big endian integer from @p ptr.
+ *
+ * Affected by @ref XXH_FORCE_MEMORY_ACCESS.
+ *
+ * @param ptr The pointer to read from.
+ * @return The 32-bit big endian integer from the bytes at @p ptr.
+ */
+
+/*!
+ * @internal
+ * @fn xxh_u32 XXH_readLE32_align(const void* ptr, XXH_alignment align)
+ * @brief Like @ref XXH_readLE32(), but has an option for aligned reads.
+ *
+ * Affected by @ref XXH_FORCE_MEMORY_ACCESS.
+ * Note that when @ref XXH_FORCE_ALIGN_CHECK == 0, the @p align parameter is
+ * always @ref XXH_alignment::XXH_unaligned.
+ *
+ * @param ptr The pointer to read from.
+ * @param align Whether @p ptr is aligned.
+ * @pre
+ *   If @p align == @ref XXH_alignment::XXH_aligned, @p ptr must be 4 byte
+ *   aligned.
+ * @return The 32-bit little endian integer from the bytes at @p ptr.
+ */
+
+#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3))
+/*
+ * Manual byteshift. Best for old compilers which don't inline memcpy.
+ * No XXH_read32() is defined here: XXH_readLE32 and XXH_readBE32 load the bytes directly.
+ */
+#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2))
+
+/*
+ * Force direct memory access: a plain dereference of the pointer as xxh_u32.
+ * Only works on CPUs which support unaligned memory access in hardware.
+ */
+static xxh_u32 XXH_read32(const void* memPtr) { return *(const xxh_u32*) memPtr; }
+
+#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1))
+
+/*
+ * __attribute__((aligned(1))) is supported by gcc and clang. Originally the
+ * documentation claimed that it only increased the alignment, but actually it
+ * can decrease it on gcc, clang, and icc:
+ * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=69502,
+ * https://gcc.godbolt.org/z/xYez1j67Y.
+ */
+#ifdef XXH_OLD_NAMES
+typedef union { xxh_u32 u32; } __attribute__((packed)) unalign;
+#endif
+static xxh_u32 XXH_read32(const void* ptr)
+{
+    typedef __attribute__((aligned(1))) xxh_u32 xxh_unalign32;  /* xxh_u32 with alignment lowered to 1 */
+    return *((const xxh_unalign32*)ptr);
+}
+
+#else
+
+/*
+ * Portable and safe solution: copy the bytes into a local with XXH_memcpy(). Generally efficient.
+ * see: https://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html
+ */
+static xxh_u32 XXH_read32(const void* memPtr)
+{
+    xxh_u32 val;
+    XXH_memcpy(&val, memPtr, sizeof(val));
+    return val;
+}
+
+#endif   /* XXH_FORCE_MEMORY_ACCESS */
+
+
+/* ***   Endianness   *** */
+
+/*!
+ * @ingroup tuning
+ * @def XXH_CPU_LITTLE_ENDIAN
+ * @brief Whether the target is little endian.
+ *
+ * Defined to 1 if the target is little endian, or 0 if it is big endian.
+ * It can be defined externally, for example on the compiler command line.
+ *
+ * If it is not defined and the compile-time detection below finds nothing,
+ * a runtime check (which is usually constant folded) is used instead.
+ *
+ * @note
+ *   This is not necessarily defined to an integer constant.
+ *
+ * @see XXH_isLittleEndian() for the runtime check.
+ */
+#ifndef XXH_CPU_LITTLE_ENDIAN
+/*
+ * Try to detect endianness automatically from predefined macros, to avoid
+ * the union-based runtime probe in `XXH_isLittleEndian()`
+ */
+#  if defined(_WIN32) /* Windows is always little endian */ \
+     || defined(__LITTLE_ENDIAN__) \
+     || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
+#    define XXH_CPU_LITTLE_ENDIAN 1
+#  elif defined(__BIG_ENDIAN__) \
+     || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+#    define XXH_CPU_LITTLE_ENDIAN 0
+#  else
+/*!
+ * @internal
+ * @brief Runtime check for @ref XXH_CPU_LITTLE_ENDIAN.
+ *
+ * Most compilers will constant fold this.
+ */
+static int XXH_isLittleEndian(void)
+{
+    /*
+     * Store 1 in the 32-bit member and read back the first byte: non-zero means little endian.
+     * Don't use static: it is detrimental to performance.
+     */
+    const union { xxh_u32 u; xxh_u8 c[4]; } one = { 1 };
+    return one.c[0];
+}
+#   define XXH_CPU_LITTLE_ENDIAN   XXH_isLittleEndian()
+#  endif
+#endif
+
+
+
+
+/* ****************************************
+*  Compiler-specific Functions and Macros
+******************************************/
+#define XXH_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)  /* e.g. 403 for GCC 4.3 */
+
+#ifdef __has_builtin
+#  define XXH_HAS_BUILTIN(x) __has_builtin(x)
+#else  /* compiler cannot be queried: report every builtin as unavailable */
+#  define XXH_HAS_BUILTIN(x) 0
+#endif
+
+
+
+/*
+ * C23 and future versions have standard "unreachable()".
+ * Once it has been implemented reliably we can add it as an
+ * additional case:
+ *
+ * ```
+ * #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= XXH_C23_VN)
+ * #  include <stddef.h>
+ * #  ifdef unreachable
+ * #    define XXH_UNREACHABLE() unreachable()
+ * #  endif
+ * #endif
+ * ```
+ *
+ * Note C++23 also has std::unreachable() which can be detected
+ * as follows:
+ * ```
+ * #if defined(__cpp_lib_unreachable) && (__cpp_lib_unreachable >= 202202L)
+ * #  include <utility>
+ * #  define XXH_UNREACHABLE() std::unreachable()
+ * #endif
+ * ```
+ * NB: `__cpp_lib_unreachable` is defined in the `<version>` header.
+ * We don't use that as including `<utility>` in `extern "C"` blocks
+ * doesn't work on GCC12
+ */
+
+#if XXH_HAS_BUILTIN(__builtin_unreachable)
+#  define XXH_UNREACHABLE() __builtin_unreachable()
+
+#elif defined(_MSC_VER)
+#  define XXH_UNREACHABLE() __assume(0)
+
+#else  /* no known intrinsic: expands to nothing */
+#  define XXH_UNREACHABLE()
+#endif
+
+#if XXH_HAS_BUILTIN(__builtin_assume)
+#  define XXH_ASSUME(c) __builtin_assume(c)
+#else  /* NOTE: this form expands to a bare `if` statement, not an expression */
+#  define XXH_ASSUME(c) if (!(c)) { XXH_UNREACHABLE(); }
+#endif
+
+/*!
+ * @internal
+ * @def XXH_rotl32(x,r)
+ * @brief 32-bit rotate left (XXH_rotl64 is the 64-bit counterpart).
+ *
+ * @param x The 32-bit integer to be rotated.
+ * @param r The number of bits to rotate.
+ * @pre
+ *   @p r > 0 && @p r < 32
+ * @note
+ *   @p x and @p r may be evaluated multiple times.
+ * @return The rotated result.
+ */
+#if !defined(NO_CLANG_BUILTIN) && XXH_HAS_BUILTIN(__builtin_rotateleft32) \
+                               && XXH_HAS_BUILTIN(__builtin_rotateleft64)
+#  define XXH_rotl32 __builtin_rotateleft32
+#  define XXH_rotl64 __builtin_rotateleft64
+/* Note: although _rotl exists for minGW (GCC under windows), performance seems poor */
+#elif defined(_MSC_VER)
+#  define XXH_rotl32(x,r) _rotl(x,r)
+#  define XXH_rotl64(x,r) _rotl64(x,r)
+#else  /* portable shift-and-or form; r == 0 would shift by the full width, hence the @pre */
+#  define XXH_rotl32(x,r) (((x) << (r)) | ((x) >> (32 - (r))))
+#  define XXH_rotl64(x,r) (((x) << (r)) | ((x) >> (64 - (r))))
+#endif
+
+/*!
+ * @internal
+ * @fn xxh_u32 XXH_swap32(xxh_u32 x)
+ * @brief A 32-bit byteswap (compiler intrinsic where available, shift-and-mask otherwise).
+ *
+ * @param x The 32-bit integer to byteswap.
+ * @return @p x, byteswapped.
+ */
+#if defined(_MSC_VER)     /* Visual Studio */
+#  define XXH_swap32 _byteswap_ulong
+#elif XXH_GCC_VERSION >= 403
+#  define XXH_swap32 __builtin_bswap32
+#else
+static xxh_u32 XXH_swap32 (xxh_u32 x)
+{
+    return  ((x << 24) & 0xff000000 ) |   /* byte 0 -> byte 3 */
+            ((x <<  8) & 0x00ff0000 ) |   /* byte 1 -> byte 2 */
+            ((x >>  8) & 0x0000ff00 ) |   /* byte 2 -> byte 1 */
+            ((x >> 24) & 0x000000ff );    /* byte 3 -> byte 0 */
+}
+#endif
+
+
+/* ***************************
+*  Memory reads
+*****************************/
+
+/*!
+ * @internal
+ * @brief Tells the read helpers whether a pointer may be dereferenced directly as a 32-bit word.
+ */
+typedef enum {
+    XXH_aligned,  /*!< Aligned */
+    XXH_unaligned /*!< Possibly unaligned */
+} XXH_alignment;
+
+/*
+ * XXH_FORCE_MEMORY_ACCESS==3 is an endian-independent byteshift load.
+ *
+ * This is ideal for older compilers which don't inline memcpy.
+ */
+#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3))
+
+XXH_FORCE_INLINE xxh_u32 XXH_readLE32(const void* memPtr)
+{
+    const xxh_u8* bytePtr = (const xxh_u8 *)memPtr;
+    return bytePtr[0]                       /* first byte is the least significant */
+         | ((xxh_u32)bytePtr[1] << 8)
+         | ((xxh_u32)bytePtr[2] << 16)
+         | ((xxh_u32)bytePtr[3] << 24);
+}
+
+XXH_FORCE_INLINE xxh_u32 XXH_readBE32(const void* memPtr)
+{
+    const xxh_u8* bytePtr = (const xxh_u8 *)memPtr;
+    return bytePtr[3]                       /* last byte is the least significant */
+         | ((xxh_u32)bytePtr[2] << 8)
+         | ((xxh_u32)bytePtr[1] << 16)
+         | ((xxh_u32)bytePtr[0] << 24);
+}
+
+#else  /* native-order XXH_read32(), byteswapped when the CPU order differs from the requested one */
+XXH_FORCE_INLINE xxh_u32 XXH_readLE32(const void* ptr)
+{
+    return XXH_CPU_LITTLE_ENDIAN ? XXH_read32(ptr) : XXH_swap32(XXH_read32(ptr));
+}
+
+static xxh_u32 XXH_readBE32(const void* ptr)
+{
+    return XXH_CPU_LITTLE_ENDIAN ? XXH_swap32(XXH_read32(ptr)) : XXH_read32(ptr);
+}
+#endif
+
+XXH_FORCE_INLINE xxh_u32
+XXH_readLE32_align(const void* ptr, XXH_alignment align)
+{
+    if (align==XXH_unaligned) {
+        return XXH_readLE32(ptr);   /* alignment unknown: go through the unaligned-safe reader */
+    } else {   /* caller vouches for alignment: dereference directly, swapping on big-endian CPUs */
+        return XXH_CPU_LITTLE_ENDIAN ? *(const xxh_u32*)ptr : XXH_swap32(*(const xxh_u32*)ptr);
+    }
+}
+
+
+/* *************************************
+*  Misc
+***************************************/
+/*! @ingroup public */
+XXH_PUBLIC_API unsigned XXH_versionNumber (void) { return XXH_VERSION_NUMBER; }  /* returns the XXH_VERSION_NUMBER macro value */
+
+
+/* *******************************************************************
+*  32-bit hash functions
+*********************************************************************/
+/*!
+ * @}
+ * @defgroup XXH32_impl XXH32 implementation
+ * @ingroup impl
+ *
+ * Details on the XXH32 implementation.
+ * @{
+ */
+ /* The five XXH32 multiplier constants: #define instead of static const, to be used as initializers */
+#define XXH_PRIME32_1  0x9E3779B1U  /*!< 0b10011110001101110111100110110001 */
+#define XXH_PRIME32_2  0x85EBCA77U  /*!< 0b10000101111010111100101001110111 */
+#define XXH_PRIME32_3  0xC2B2AE3DU  /*!< 0b11000010101100101010111000111101 */
+#define XXH_PRIME32_4  0x27D4EB2FU  /*!< 0b00100111110101001110101100101111 */
+#define XXH_PRIME32_5  0x165667B1U  /*!< 0b00010110010101100110011110110001 */
+
+#ifdef XXH_OLD_NAMES  /* unprefixed aliases kept for code written against the old internal names */
+#  define PRIME32_1 XXH_PRIME32_1
+#  define PRIME32_2 XXH_PRIME32_2
+#  define PRIME32_3 XXH_PRIME32_3
+#  define PRIME32_4 XXH_PRIME32_4
+#  define PRIME32_5 XXH_PRIME32_5
+#endif
+
+/*!
+ * @internal
+ * @brief Normal stripe processing routine.
+ *
+ * Computes rotl32(acc + input * PRIME32_2, 13) * PRIME32_1, shuffling the bits
+ * so that any bit from @p input impacts several bits in @p acc.
+ *
+ * @param acc The accumulator lane.
+ * @param input The stripe of input to mix.
+ * @return The mixed accumulator lane.
+ */
+static xxh_u32 XXH32_round(xxh_u32 acc, xxh_u32 input)
+{
+    acc += input * XXH_PRIME32_2;
+    acc  = XXH_rotl32(acc, 13);
+    acc *= XXH_PRIME32_1;
+#if (defined(__SSE4_1__) || defined(__aarch64__) || defined(__wasm_simd128__)) && !defined(XXH_ENABLE_AUTOVECTORIZE)
+    /*
+     * UGLY HACK:
+     * A compiler fence is the only thing that prevents GCC and Clang from
+     * autovectorizing the XXH32 loop (pragmas and attributes don't work for some
+     * reason) without globally disabling SSE4.1.
+     *
+     * The reason we want to avoid vectorization is because despite working on
+     * 4 integers at a time, there are multiple factors slowing XXH32 down on
+     * SSE4:
+     * - There's a ridiculous amount of lag from pmulld (10 cycles of latency on
+     *   newer chips!) making it slightly slower to multiply four integers at
+     *   once compared to four integers independently. Even when pmulld was
+     *   fastest, Sandy/Ivy Bridge, it is still not worth it to go into SSE
+     *   just to multiply unless doing a long operation.
+     *
+     * - Four instructions are required to rotate,
+     *      movdqa tmp,  v // not required with VEX encoding
+     *      pslld  tmp, 13 // tmp <<= 13
+     *      psrld  v,   19 // v >>= 19
+     *      por    v,  tmp // v |= tmp
+     *   compared to one for scalar:
+     *      roll   v, 13    // reliably fast across the board
+     *      shldl  v, v, 13 // Sandy Bridge and later prefer this for some reason
+     *
+     * - Instruction level parallelism is actually more beneficial here because
+     *   the SIMD actually serializes this operation: While v1 is rotating, v2
+     *   can load data, while v3 can multiply. SSE forces them to operate
+     *   together.
+     *
+     * This is also enabled on AArch64, as Clang is *very aggressive* in vectorizing
+     * the loop. NEON is only faster on the A53, and with the newer cores, it is less
+     * than half the speed.
+     *
+     * Additionally, this is used on WASM SIMD128 because it JITs to the same
+     * SIMD instructions and has the same issue.
+     */
+    XXH_COMPILER_GUARD(acc);
+#endif
+    return acc;
+}
+
+/*!
+ * @internal
+ * @brief Mixes all bits to finalize the hash.
+ *
+ * Alternates xor-shifts (15, 13, 16) with multiplications by PRIME32_2 and PRIME32_3
+ * so that all input bits have a chance to impact any bit in the output digest.
+ *
+ * @param hash The hash to avalanche.
+ * @return The avalanched hash.
+ */
+static xxh_u32 XXH32_avalanche(xxh_u32 hash)
+{
+    hash ^= hash >> 15;
+    hash *= XXH_PRIME32_2;
+    hash ^= hash >> 13;
+    hash *= XXH_PRIME32_3;
+    hash ^= hash >> 16;
+    return hash;
+}
+
+#define XXH_get32bits(p) XXH_readLE32_align(p, align)  /* relies on a variable named `align` at the point of use */
+
+/*!
+ * @internal
+ * @brief Processes the last 0-15 bytes of @p ptr.
+ *
+ * There may be up to 15 bytes remaining to consume from the input.
+ * Whole 4-byte words are mixed first, then single bytes, then the result is
+ * avalanched. Both code paths below consume the bytes in the same order.
+ *
+ * @param hash The hash to finalize.
+ * @param ptr The pointer to the remaining input; may be NULL only when @p len is 0.
+ * @param len The remaining length; only its low 4 bits (len & 15) are used.
+ * @param align Whether @p ptr is aligned.
+ * @return The finalized hash.
+ * @see XXH64_finalize().
+ */
+static XXH_PUREF xxh_u32
+XXH32_finalize(xxh_u32 hash, const xxh_u8* ptr, size_t len, XXH_alignment align)
+{
+#define XXH_PROCESS1 do {                             \
+    hash += (*ptr++) * XXH_PRIME32_5;                 \
+    hash = XXH_rotl32(hash, 11) * XXH_PRIME32_1;      \
+} while (0)
+
+#define XXH_PROCESS4 do {                             \
+    hash += XXH_get32bits(ptr) * XXH_PRIME32_3;       \
+    ptr += 4;                                         \
+    hash  = XXH_rotl32(hash, 17) * XXH_PRIME32_4;     \
+} while (0)
+
+    if (ptr==NULL) XXH_ASSERT(len == 0);
+
+    /* Compact rerolled version; generally faster */
+    if (!XXH32_ENDJMP) {
+        len &= 15;
+        while (len >= 4) {   /* whole 32-bit words first */
+            XXH_PROCESS4;
+            len -= 4;
+        }
+        while (len > 0) {    /* then the 0-3 leftover bytes */
+            XXH_PROCESS1;
+            --len;
+        }
+        return XXH32_avalanche(hash);
+    } else {   /* unrolled version: cases are grouped by (len & 3), the number of trailing single bytes */
+         switch(len&15) /* or switch(bEnd - p) */ {
+           case 12:      XXH_PROCESS4;
+                         XXH_FALLTHROUGH;  /* fallthrough */
+           case 8:       XXH_PROCESS4;
+                         XXH_FALLTHROUGH;  /* fallthrough */
+           case 4:       XXH_PROCESS4;
+                         return XXH32_avalanche(hash);
+
+           case 13:      XXH_PROCESS4;
+                         XXH_FALLTHROUGH;  /* fallthrough */
+           case 9:       XXH_PROCESS4;
+                         XXH_FALLTHROUGH;  /* fallthrough */
+           case 5:       XXH_PROCESS4;
+                         XXH_PROCESS1;
+                         return XXH32_avalanche(hash);
+
+           case 14:      XXH_PROCESS4;
+                         XXH_FALLTHROUGH;  /* fallthrough */
+           case 10:      XXH_PROCESS4;
+                         XXH_FALLTHROUGH;  /* fallthrough */
+           case 6:       XXH_PROCESS4;
+                         XXH_PROCESS1;
+                         XXH_PROCESS1;
+                         return XXH32_avalanche(hash);
+
+           case 15:      XXH_PROCESS4;
+                         XXH_FALLTHROUGH;  /* fallthrough */
+           case 11:      XXH_PROCESS4;
+                         XXH_FALLTHROUGH;  /* fallthrough */
+           case 7:       XXH_PROCESS4;
+                         XXH_FALLTHROUGH;  /* fallthrough */
+           case 3:       XXH_PROCESS1;
+                         XXH_FALLTHROUGH;  /* fallthrough */
+           case 2:       XXH_PROCESS1;
+                         XXH_FALLTHROUGH;  /* fallthrough */
+           case 1:       XXH_PROCESS1;
+                         XXH_FALLTHROUGH;  /* fallthrough */
+           case 0:       return XXH32_avalanche(hash);
+        }
+        XXH_ASSERT(0);
+        return hash;   /* reaching this point is deemed impossible */
+    }
+}
+
+#ifdef XXH_OLD_NAMES  /* keep the helper macros alive under their old unprefixed names */
+#  define PROCESS1 XXH_PROCESS1
+#  define PROCESS4 XXH_PROCESS4
+#else  /* otherwise the helpers are private to XXH32_finalize(): drop them */
+#  undef XXH_PROCESS1
+#  undef XXH_PROCESS4
+#endif
+
+/*!
+ * @internal
+ * @brief The implementation for @ref XXH32(): 16-byte rounds on four lanes, then XXH32_finalize() on the tail.
+ *
+ * @param input , len , seed Directly passed from @ref XXH32().
+ * @param align Whether @p input is aligned.
+ * @return The calculated hash.
+ */
+XXH_FORCE_INLINE XXH_PUREF xxh_u32
+XXH32_endian_align(const xxh_u8* input, size_t len, xxh_u32 seed, XXH_alignment align)
+{
+    xxh_u32 h32;
+
+    if (input==NULL) XXH_ASSERT(len == 0);
+
+    if (len>=16) {
+        const xxh_u8* const bEnd = input + len;
+        const xxh_u8* const limit = bEnd - 15;   /* loop while at least 16 bytes remain */
+        xxh_u32 v1 = seed + XXH_PRIME32_1 + XXH_PRIME32_2;
+        xxh_u32 v2 = seed + XXH_PRIME32_2;
+        xxh_u32 v3 = seed + 0;
+        xxh_u32 v4 = seed - XXH_PRIME32_1;
+
+        do {   /* each pass consumes 16 bytes, one 32-bit word per lane */
+            v1 = XXH32_round(v1, XXH_get32bits(input)); input += 4;
+            v2 = XXH32_round(v2, XXH_get32bits(input)); input += 4;
+            v3 = XXH32_round(v3, XXH_get32bits(input)); input += 4;
+            v4 = XXH32_round(v4, XXH_get32bits(input)); input += 4;
+        } while (input < limit);
+
+        h32 = XXH_rotl32(v1, 1)  + XXH_rotl32(v2, 7)
+            + XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18);
+    } else {   /* short input: no lanes, start from the seed alone */
+        h32  = seed + XXH_PRIME32_5;
+    }
+
+    h32 += (xxh_u32)len;   /* fold in the total length (truncated to 32 bits) */
+
+    return XXH32_finalize(h32, input, len&15, align);
+}
+
+/*! @ingroup XXH32_family */
+XXH_PUBLIC_API XXH32_hash_t XXH32 (const void* input, size_t len, XXH32_hash_t seed)
+{
+#if !defined(XXH_NO_STREAM) && XXH_SIZE_OPT >= 2
+    /* Smallest-code version: reuse the streaming API. Simple to maintain, but unfortunately slow for small inputs */
+    XXH32_state_t state;
+    XXH32_reset(&state, seed);
+    XXH32_update(&state, (const xxh_u8*)input, len);
+    return XXH32_digest(&state);
+#else
+    if (XXH_FORCE_ALIGN_CHECK) {   /* only test the address when the build asks for it */
+        if ((((size_t)input) & 3) == 0) {   /* Input is 4-bytes aligned, leverage the speed benefit */
+            return XXH32_endian_align((const xxh_u8*)input, len, seed, XXH_aligned);
+    }   }
+
+    return XXH32_endian_align((const xxh_u8*)input, len, seed, XXH_unaligned);
+#endif
+}
+
+
+
+/*******   Hash streaming   *******/
+#ifndef XXH_NO_STREAM
+/*! @ingroup XXH32_family */
+XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void)
+{
+    return (XXH32_state_t*)XXH_malloc(sizeof(XXH32_state_t));  /* not initialized here; NULL if XXH_malloc() fails */
+}
+/*! @ingroup XXH32_family */
+XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr)
+{
+    XXH_free(statePtr);   /* hand the state back to the allocator wrapper */
+    return XXH_OK;        /* always reports success */
+}
+
+/*! @ingroup XXH32_family */
+XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dstState, const XXH32_state_t* srcState)
+{
+    XXH_memcpy(dstState, srcState, sizeof(*dstState));   /* byte-for-byte copy of the whole state */
+}
+
+/*! @ingroup XXH32_family */
+XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t* statePtr, XXH32_hash_t seed)
+{
+    XXH_ASSERT(statePtr != NULL);
+    memset(statePtr, 0, sizeof(*statePtr));   /* zero every field before seeding the lanes */
+    statePtr->v[0] = seed + XXH_PRIME32_1 + XXH_PRIME32_2;   /* same lane seeds as XXH32_endian_align() */
+    statePtr->v[1] = seed + XXH_PRIME32_2;
+    statePtr->v[2] = seed + 0;   /* XXH32_digest() recovers the seed from this lane for short inputs */
+    statePtr->v[3] = seed - XXH_PRIME32_1;
+    return XXH_OK;
+}
+
+
+/*! @ingroup XXH32_family */
+XXH_PUBLIC_API XXH_errorcode
+XXH32_update(XXH32_state_t* state, const void* input, size_t len)
+{
+    if (input==NULL) {   /* a NULL input is accepted as "nothing to add" */
+        XXH_ASSERT(len == 0);
+        return XXH_OK;
+    }
+
+    {   const xxh_u8* p = (const xxh_u8*)input;
+        const xxh_u8* const bEnd = p + len;
+
+        state->total_len_32 += (XXH32_hash_t)len;
+        state->large_len |= (XXH32_hash_t)((len>=16) | (state->total_len_32>=16));
+
+        if (state->memsize + len < 16)  {   /* not enough for a full 16-byte stripe: just buffer it */
+            XXH_memcpy((xxh_u8*)(state->mem32) + state->memsize, input, len);
+            state->memsize += (XXH32_hash_t)len;
+            return XXH_OK;
+        }
+
+        if (state->memsize) {   /* some data left from previous update: top it up to 16 bytes and consume it */
+            XXH_memcpy((xxh_u8*)(state->mem32) + state->memsize, input, 16-state->memsize);
+            {   const xxh_u32* p32 = state->mem32;
+                state->v[0] = XXH32_round(state->v[0], XXH_readLE32(p32)); p32++;
+                state->v[1] = XXH32_round(state->v[1], XXH_readLE32(p32)); p32++;
+                state->v[2] = XXH32_round(state->v[2], XXH_readLE32(p32)); p32++;
+                state->v[3] = XXH32_round(state->v[3], XXH_readLE32(p32));
+            }
+            p += 16-state->memsize;
+            state->memsize = 0;
+        }
+
+        if ((size_t)(bEnd - p) >= 16) {   /* compare sizes: bEnd-16 could point before the input buffer */
+            const xxh_u8* const limit = bEnd - 16;
+
+            do {   /* consume whole 16-byte stripes straight from the input */
+                state->v[0] = XXH32_round(state->v[0], XXH_readLE32(p)); p+=4;
+                state->v[1] = XXH32_round(state->v[1], XXH_readLE32(p)); p+=4;
+                state->v[2] = XXH32_round(state->v[2], XXH_readLE32(p)); p+=4;
+                state->v[3] = XXH32_round(state->v[3], XXH_readLE32(p)); p+=4;
+            } while (p<=limit);
+
+        }
+
+        if (p < bEnd) {   /* keep the trailing 1-15 bytes for the next update or the digest */
+            XXH_memcpy(state->mem32, p, (size_t)(bEnd-p));
+            state->memsize = (unsigned)(bEnd-p);
+        }
+    }
+
+    return XXH_OK;
+}
+
+
+/*! @ingroup XXH32_family */
+XXH_PUBLIC_API XXH32_hash_t XXH32_digest(const XXH32_state_t* state)
+{
+    xxh_u32 h32;
+
+    if (state->large_len) {   /* at least 16 bytes were seen: merge the four lanes */
+        h32 = XXH_rotl32(state->v[0], 1)
+            + XXH_rotl32(state->v[1], 7)
+            + XXH_rotl32(state->v[2], 12)
+            + XXH_rotl32(state->v[3], 18);
+    } else {   /* short input: the lanes were never mixed, so v[2] still holds the seed */
+        h32 = state->v[2] /* == seed */ + XXH_PRIME32_5;
+    }
+
+    h32 += state->total_len_32;
+
+    return XXH32_finalize(h32, (const xxh_u8*)state->mem32, state->memsize, XXH_aligned);  /* state is const: digest does not modify it */
+}
+#endif /* !XXH_NO_STREAM */
+
+/*******   Canonical representation   *******/
+
+/*! @ingroup XXH32_family */
+XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash)
+{
+    XXH_STATIC_ASSERT(sizeof(XXH32_canonical_t) == sizeof(XXH32_hash_t));
+    if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap32(hash);   /* canonical form is big endian */
+    XXH_memcpy(dst, &hash, sizeof(*dst));
+}
+/*! @ingroup XXH32_family */
+XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src)
+{
+    return XXH_readBE32(src);   /* inverse of XXH32_canonicalFromHash(): decode the big-endian bytes */
+}
+
+
+#ifndef XXH_NO_LONG_LONG
+
+/* *******************************************************************
+*  64-bit hash functions
+*********************************************************************/
+/*!
+ * @}
+ * @ingroup impl
+ * @{
+ */
+/*******   Memory access   *******/
+
+typedef XXH64_hash_t xxh_u64;   /* internal 64-bit word: alias of the public 64-bit hash type */
+
+#ifdef XXH_OLD_NAMES  /* unprefixed alias for code written against the old internal names */
+#  define U64 xxh_u64
+#endif
+
+#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3))
+/*
+ * Manual byteshift. Best for old compilers which don't inline memcpy.
+ * We actually directly use XXH_readLE64 and XXH_readBE64.
+ */
+#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2))
+
+/* Force direct memory access. Only works on CPU which support unaligned memory access in hardware */
+static xxh_u64 XXH_read64(const void* memPtr)
+{
+    return *(const xxh_u64*) memPtr;
+}
+
+#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1))
+
+/*
+ * __attribute__((aligned(1))) is supported by gcc and clang. Originally the
+ * documentation claimed that it only increased the alignment, but actually it
+ * can decrease it on gcc, clang, and icc:
+ * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=69502,
+ * https://gcc.godbolt.org/z/xYez1j67Y.
+ */
+#ifdef XXH_OLD_NAMES
+typedef union { xxh_u32 u32; xxh_u64 u64; } __attribute__((packed)) unalign64;
+#endif
+static xxh_u64 XXH_read64(const void* ptr)
+{   /* Reads through a typedef declared with alignment 1 instead of a plain xxh_u64 pointer. */
+    typedef __attribute__((aligned(1))) xxh_u64 xxh_unalign64;
+    return *((const xxh_unalign64*)ptr);
+}
+
+#else
+
+/*
+ * Portable and safe solution. Generally efficient.
+ * see: https://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html
+ */
+static xxh_u64 XXH_read64(const void* memPtr)
+{   /* Copy sizeof(xxh_u64) bytes out through XXH_memcpy; memPtr is never dereferenced. */
+    xxh_u64 word;
+    XXH_memcpy(&word, memPtr, sizeof(xxh_u64));
+    return word;
+}
+
+#endif   /* XXH_FORCE_MEMORY_ACCESS */
+
+#if defined(_MSC_VER)     /* Visual Studio */
+#  define XXH_swap64 _byteswap_uint64
+#elif XXH_GCC_VERSION >= 403
+#  define XXH_swap64 __builtin_bswap64
+#else
+static xxh_u64 XXH_swap64(xxh_u64 x)
+{   /* Portable byte reversal: isolate each byte, then move it to the mirrored position. */
+    return  ((x & 0x00000000000000ffULL) << 56) |
+            ((x & 0x000000000000ff00ULL) << 40) |
+            ((x & 0x0000000000ff0000ULL) << 24) |
+            ((x & 0x00000000ff000000ULL) << 8)  |
+            ((x & 0x000000ff00000000ULL) >> 8)  |
+            ((x & 0x0000ff0000000000ULL) >> 24) |
+            ((x & 0x00ff000000000000ULL) >> 40) |
+            ((x & 0xff00000000000000ULL) >> 56);
+}
+#endif
+
+
+/* XXH_FORCE_MEMORY_ACCESS==3 is an endian-independent byteshift load. */
+#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3))
+
+XXH_FORCE_INLINE xxh_u64 XXH_readLE64(const void* memPtr)
+{   const xxh_u8* const src = (const xxh_u8 *)memPtr;  /* byte-wise load: any alignment */
+    xxh_u64 value = src[7];   /* byte 7 is the most significant; shift the rest in below */
+    value = (value << 8) | src[6];
+    value = (value << 8) | src[5];
+    value = (value << 8) | src[4];
+    value = (value << 8) | src[3];
+    value = (value << 8) | src[2];
+    value = (value << 8) | src[1];
+    value = (value << 8) | src[0];
+    return value;
+}
+
+XXH_FORCE_INLINE xxh_u64 XXH_readBE64(const void* memPtr)
+{   const xxh_u8* const src = (const xxh_u8 *)memPtr;  /* byte-wise load: any alignment */
+    xxh_u64 value = src[0];   /* byte 0 is the most significant; shift the rest in below */
+    value = (value << 8) | src[1];
+    value = (value << 8) | src[2];
+    value = (value << 8) | src[3];
+    value = (value << 8) | src[4];
+    value = (value << 8) | src[5];
+    value = (value << 8) | src[6];
+    value = (value << 8) | src[7];
+    return value;
+}
+
+#else
+XXH_FORCE_INLINE xxh_u64 XXH_readLE64(const void* ptr)
+{   xxh_u64 const raw = XXH_read64(ptr);   /* load in host byte order */
+    return XXH_CPU_LITTLE_ENDIAN ? raw : XXH_swap64(raw);   /* swap only on big-endian hosts */
+}
+
+static xxh_u64 XXH_readBE64(const void* ptr)
+{   xxh_u64 const raw = XXH_read64(ptr);   /* load in host byte order */
+    return XXH_CPU_LITTLE_ENDIAN ? XXH_swap64(raw) : raw;   /* swap only on little-endian hosts */
+}
+#endif
+
+XXH_FORCE_INLINE xxh_u64
+XXH_readLE64_align(const void* ptr, XXH_alignment align)
+{   /* Unaligned callers take the safe reader; otherwise the pointer is dereferenced directly. */
+    if (align == XXH_unaligned) return XXH_readLE64(ptr);
+    {   xxh_u64 const raw = *(const xxh_u64*)ptr;
+        return XXH_CPU_LITTLE_ENDIAN ? raw : XXH_swap64(raw);
+    }
+}
+
+
+/*******   xxh64   *******/
+/*!
+ * @}
+ * @defgroup XXH64_impl XXH64 implementation
+ * @ingroup impl
+ *
+ * Details on the XXH64 implementation.
+ * @{
+ */
+/* #define rather than static const, to be used as initializers */
+#define XXH_PRIME64_1  0x9E3779B185EBCA87ULL  /*!< 0b1001111000110111011110011011000110000101111010111100101010000111 */
+#define XXH_PRIME64_2  0xC2B2AE3D27D4EB4FULL  /*!< 0b1100001010110010101011100011110100100111110101001110101101001111 */
+#define XXH_PRIME64_3  0x165667B19E3779F9ULL  /*!< 0b0001011001010110011001111011000110011110001101110111100111111001 */
+#define XXH_PRIME64_4  0x85EBCA77C2B2AE63ULL  /*!< 0b1000010111101011110010100111011111000010101100101010111001100011 */
+#define XXH_PRIME64_5  0x27D4EB2F165667C5ULL  /*!< 0b0010011111010100111010110010111100010110010101100110011111000101 */
+
+#ifdef XXH_OLD_NAMES
+#  define PRIME64_1 XXH_PRIME64_1
+#  define PRIME64_2 XXH_PRIME64_2
+#  define PRIME64_3 XXH_PRIME64_3
+#  define PRIME64_4 XXH_PRIME64_4
+#  define PRIME64_5 XXH_PRIME64_5
+#endif
+
+/*! @copydoc XXH32_round */
+static xxh_u64 XXH64_round(xxh_u64 acc, xxh_u64 input)
+{
+    /* One accumulator step: fold in the scaled input, rotate left by 31, then scale again. */
+    xxh_u64 const mixed   = acc + input * XXH_PRIME64_2;
+    xxh_u64 const rotated = XXH_rotl64(mixed, 31);
+    return rotated * XXH_PRIME64_1;
+}
+
+static xxh_u64 XXH64_mergeRound(xxh_u64 acc, xxh_u64 val)
+{
+    /* Folds the accumulator value @p val into the combined hash @p acc. */
+    xxh_u64 const lane   = XXH64_round(0, val);
+    xxh_u64 const merged = acc ^ lane;
+    return merged * XXH_PRIME64_1 + XXH_PRIME64_4;
+}
+
+/*! @copydoc XXH32_avalanche */
+static xxh_u64 XXH64_avalanche(xxh_u64 hash)
+{
+    /* Final bit mixing: alternating xor-shift and multiply steps. */
+    xxh_u64 h = hash;
+    h = (h ^ (h >> 33)) * XXH_PRIME64_2;
+    h = (h ^ (h >> 29)) * XXH_PRIME64_3;
+    h ^= h >> 32;
+    return h;
+}
+
+
+#define XXH_get64bits(p) XXH_readLE64_align(p, align)
+
+/*!
+ * @internal
+ * @brief Processes the last 0-31 bytes of @p ptr.
+ *
+ * There may be up to 31 bytes remaining to consume from the input.
+ * This final stage will digest them to ensure that all input bytes are present
+ * in the final mix.
+ *
+ * @param hash The hash to finalize.
+ * @param ptr The pointer to the remaining input.
+ * @param len The remaining length, modulo 32.
+ * @param align Whether @p ptr is aligned.
+ * @return The finalized hash
+ * @see XXH32_finalize().
+ */
+static XXH_PUREF xxh_u64
+XXH64_finalize(xxh_u64 hash, const xxh_u8* ptr, size_t len, XXH_alignment align)
+{
+    size_t remaining = len & 31;   /* only the tail (len modulo 32) is left to consume */
+    if (ptr==NULL) XXH_ASSERT(len == 0);
+    /* Whole 8-byte words: each one goes through a zero-seeded round and is xored in. */
+    for ( ; remaining >= 8; remaining -= 8) {
+        xxh_u64 const lane = XXH64_round(0, XXH_get64bits(ptr));
+        ptr  += 8;
+        hash  = XXH_rotl64(hash ^ lane, 27) * XXH_PRIME64_1 + XXH_PRIME64_4;
+    }
+    /* At most one 4-byte word can follow. */
+    if (remaining >= 4) {
+        xxh_u64 const word = (xxh_u64)(XXH_get32bits(ptr)) * XXH_PRIME64_1;
+        ptr  += 4;
+        hash  = XXH_rotl64(hash ^ word, 23) * XXH_PRIME64_2 + XXH_PRIME64_3;
+        remaining -= 4;
+    }
+    /* Then up to three single bytes. */
+    for ( ; remaining > 0; --remaining) {
+        hash ^= (*ptr++) * XXH_PRIME64_5;
+        hash  = XXH_rotl64(hash, 11) * XXH_PRIME64_1;
+    }
+    return  XXH64_avalanche(hash);
+}
+
+#ifdef XXH_OLD_NAMES
+#  define PROCESS1_64 XXH_PROCESS1_64
+#  define PROCESS4_64 XXH_PROCESS4_64
+#  define PROCESS8_64 XXH_PROCESS8_64
+#else
+#  undef XXH_PROCESS1_64
+#  undef XXH_PROCESS4_64
+#  undef XXH_PROCESS8_64
+#endif
+
+/*!
+ * @internal
+ * @brief The implementation for @ref XXH64().
+ *
+ * @param input , len , seed Directly passed from @ref XXH64().
+ * @param align Whether @p input is aligned.
+ * @return The calculated hash.
+ */
+XXH_FORCE_INLINE XXH_PUREF xxh_u64
+XXH64_endian_align(const xxh_u8* input, size_t len, xxh_u64 seed, XXH_alignment align)
+{
+    xxh_u64 h64;
+    if (input==NULL) XXH_ASSERT(len == 0);
+    /* Long inputs: consume 32-byte stripes with four independent accumulators. */
+    if (len>=32) {
+        const xxh_u8* const bEnd = input + len;
+        const xxh_u8* const limit = bEnd - 31;   /* input < limit <=> at least 32 bytes remain */
+        xxh_u64 v1 = seed + XXH_PRIME64_1 + XXH_PRIME64_2;
+        xxh_u64 v2 = seed + XXH_PRIME64_2;
+        xxh_u64 v3 = seed + 0;
+        xxh_u64 v4 = seed - XXH_PRIME64_1;
+        /* Each iteration advances input by 32 bytes: one 8-byte word per accumulator. */
+        do {
+            v1 = XXH64_round(v1, XXH_get64bits(input)); input+=8;
+            v2 = XXH64_round(v2, XXH_get64bits(input)); input+=8;
+            v3 = XXH64_round(v3, XXH_get64bits(input)); input+=8;
+            v4 = XXH64_round(v4, XXH_get64bits(input)); input+=8;
+        } while (input<limit);
+        /* Combine the four accumulators into a single 64-bit value. */
+        h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18);
+        h64 = XXH64_mergeRound(h64, v1);
+        h64 = XXH64_mergeRound(h64, v2);
+        h64 = XXH64_mergeRound(h64, v3);
+        h64 = XXH64_mergeRound(h64, v4);
+
+    } else {
+        h64  = seed + XXH_PRIME64_5;   /* short input: no stripes, start from the seed */
+    }
+    /* Mix in the total length, then digest whatever tail is left at input. */
+    h64 += (xxh_u64) len;
+
+    return XXH64_finalize(h64, input, len, align);
+}
+
+
+/*! @ingroup XXH64_family */
+XXH_PUBLIC_API XXH64_hash_t XXH64 (XXH_NOESCAPE const void* input, size_t len, XXH64_hash_t seed)
+{
+#if !defined(XXH_NO_STREAM) && XXH_SIZE_OPT >= 2
+    /* Size-optimized build: reuse the streaming API (simple, but slow for small inputs). */
+    XXH64_state_t st;
+    XXH64_reset(&st, seed);
+    XXH64_update(&st, (const xxh_u8*)input, len);
+    return XXH64_digest(&st);
+#else
+    const xxh_u8* const bytes = (const xxh_u8*)input;
+
+    /* An 8-byte aligned input can use the faster aligned reads, if the check is enabled. */
+    if (XXH_FORCE_ALIGN_CHECK && (((size_t)input) & 7) == 0) {
+        return XXH64_endian_align(bytes, len, seed, XXH_aligned);
+    }
+    return XXH64_endian_align(bytes, len, seed, XXH_unaligned);
+#endif
+}
+
+/*******   Hash Streaming   *******/
+#ifndef XXH_NO_STREAM
+/*! @ingroup XXH64_family*/
+XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void)
+{   void* const storage = XXH_malloc(sizeof(XXH64_state_t));   /* no reset is done here */
+    return (XXH64_state_t*)storage;
+}
+/*! @ingroup XXH64_family */
+XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr)
+{   /* Hands the state back to XXH_free; the return value is unconditionally XXH_OK. */
+    XXH_free(statePtr);
+    return XXH_OK;
+}
+
+/*! @ingroup XXH64_family */
+XXH_PUBLIC_API void XXH64_copyState(XXH_NOESCAPE XXH64_state_t* dstState, const XXH64_state_t* srcState)
+{   /* Byte-for-byte duplicate of the whole state structure. */
+    XXH_memcpy(dstState, srcState, sizeof(XXH64_state_t));
+}
+
+/*! @ingroup XXH64_family */
+XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH_NOESCAPE XXH64_state_t* statePtr, XXH64_hash_t seed)
+{   /* Zero the whole state, then seed the four accumulators. */
+    XXH_ASSERT(statePtr != NULL);
+    memset(statePtr, 0, sizeof(XXH64_state_t));
+    statePtr->v[3] = seed - XXH_PRIME64_1;
+    statePtr->v[2] = seed;
+    statePtr->v[1] = seed + XXH_PRIME64_2;
+    statePtr->v[0] = seed + XXH_PRIME64_1 + XXH_PRIME64_2;
+    return XXH_OK;
+}
+
+/*! @ingroup XXH64_family */
+XXH_PUBLIC_API XXH_errorcode
+XXH64_update (XXH_NOESCAPE XXH64_state_t* state, XXH_NOESCAPE const void* input, size_t len)
+{
+    if (input==NULL) {
+        XXH_ASSERT(len == 0);
+        return XXH_OK;
+    }
+    /* Feed len bytes into the state; bytes that do not complete a 32-byte stripe are kept in mem64. */
+    {   const xxh_u8* p = (const xxh_u8*)input;
+        const xxh_u8* const bEnd = p + len;
+        /* total_len counts every byte passed in, including the ones that only get buffered. */
+        state->total_len += len;
+        /* Not enough to complete a stripe: append to the buffer and stop. */
+        if (state->memsize + len < 32) {  /* fill in tmp buffer */
+            XXH_memcpy(((xxh_u8*)state->mem64) + state->memsize, input, len);
+            state->memsize += (xxh_u32)len;
+            return XXH_OK;
+        }
+        /* Top up a partially filled buffer to 32 bytes and consume it as one stripe. */
+        if (state->memsize) {   /* tmp buffer holds bytes left over from an earlier call */
+            XXH_memcpy(((xxh_u8*)state->mem64) + state->memsize, input, 32-state->memsize);
+            state->v[0] = XXH64_round(state->v[0], XXH_readLE64(state->mem64+0));
+            state->v[1] = XXH64_round(state->v[1], XXH_readLE64(state->mem64+1));
+            state->v[2] = XXH64_round(state->v[2], XXH_readLE64(state->mem64+2));
+            state->v[3] = XXH64_round(state->v[3], XXH_readLE64(state->mem64+3));
+            p += 32 - state->memsize;
+            state->memsize = 0;
+        }
+        /* Consume whole stripes straight from the caller's memory. */
+        if (p+32 <= bEnd) {
+            const xxh_u8* const limit = bEnd - 32;
+
+            do {
+                state->v[0] = XXH64_round(state->v[0], XXH_readLE64(p)); p+=8;
+                state->v[1] = XXH64_round(state->v[1], XXH_readLE64(p)); p+=8;
+                state->v[2] = XXH64_round(state->v[2], XXH_readLE64(p)); p+=8;
+                state->v[3] = XXH64_round(state->v[3], XXH_readLE64(p)); p+=8;
+            } while (p<=limit);
+
+        }
+        /* Keep the leftover (fewer than 32 bytes) for the next call or for the digest. */
+        if (p < bEnd) {
+            XXH_memcpy(state->mem64, p, (size_t)(bEnd-p));
+            state->memsize = (unsigned)(bEnd-p);
+        }
+    }
+
+    return XXH_OK;
+}
+
+
+/*! @ingroup XXH64_family */
+XXH_PUBLIC_API XXH64_hash_t XXH64_digest(XXH_NOESCAPE const XXH64_state_t* state)
+{
+    const xxh_u64 consumed = (xxh_u64) state->total_len;
+    xxh_u64 acc;
+
+    if (consumed < 32) {
+        /* No stripe was ever processed, so v[2] still holds the plain seed. */
+        acc = state->v[2] + XXH_PRIME64_5;
+    } else {
+        int lane;
+        acc = XXH_rotl64(state->v[0], 1) + XXH_rotl64(state->v[1], 7)
+            + XXH_rotl64(state->v[2], 12) + XXH_rotl64(state->v[3], 18);
+        for (lane = 0; lane < 4; lane++) acc = XXH64_mergeRound(acc, state->v[lane]);
+    }
+    acc += consumed;
+    /* Only the buffered tail still needs mixing; XXH64_finalize() reduces the length itself. */
+    return XXH64_finalize(acc, (const xxh_u8*)state->mem64, (size_t)state->total_len, XXH_aligned);
+}
+#endif /* !XXH_NO_STREAM */
+
+/******* Canonical representation   *******/
+
+/*! @ingroup XXH64_family */
+XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH_NOESCAPE XXH64_canonical_t* dst, XXH64_hash_t hash)
+{   /* Emit the hash in big-endian byte order, whatever the host endianness. */
+    XXH64_hash_t const ordered = XXH_CPU_LITTLE_ENDIAN ? XXH_swap64(hash) : hash;
+    XXH_STATIC_ASSERT(sizeof(XXH64_canonical_t) == sizeof(XXH64_hash_t));
+    XXH_memcpy(dst, &ordered, sizeof(*dst));
+}
+
+/*! @ingroup XXH64_family */
+XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(XXH_NOESCAPE const XXH64_canonical_t* src)
+{   /* Inverse of XXH64_canonicalFromHash(): the stored bytes are read as big-endian. */
+    return XXH_readBE64(src);
+}
+
+#ifndef XXH_NO_XXH3
+
+/* *********************************************************************
+*  XXH3
+*  New generation hash designed for speed on small keys and vectorization
+************************************************************************ */
+/*!
+ * @}
+ * @defgroup XXH3_impl XXH3 implementation
+ * @ingroup impl
+ * @{
+ */
+
+/* ===   Compiler specifics   === */
+
+#if ((defined(sun) || defined(__sun)) && __cplusplus) /* Solaris includes __STDC_VERSION__ with C++. Tested with GCC 5.5 */
+#  define XXH_RESTRICT   /* disable */
+#elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L   /* >= C99 */
+#  define XXH_RESTRICT   restrict
+#elif (defined (__GNUC__) && ((__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))) \
+   || (defined (__clang__)) \
+   || (defined (_MSC_VER) && (_MSC_VER >= 1400)) \
+   || (defined (__INTEL_COMPILER) && (__INTEL_COMPILER >= 1300))
+/*
+ * There are a LOT more compilers that recognize __restrict but this
+ * covers the major ones.
+ */
+#  define XXH_RESTRICT   __restrict
+#else
+#  define XXH_RESTRICT   /* disable */
+#endif
+
+#if (defined(__GNUC__) && (__GNUC__ >= 3))  \
+  || (defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 800)) \
+  || defined(__clang__)
+#    define XXH_likely(x) __builtin_expect(x, 1)
+#    define XXH_unlikely(x) __builtin_expect(x, 0)
+#else
+#    define XXH_likely(x) (x)
+#    define XXH_unlikely(x) (x)
+#endif
+
+#ifndef XXH_HAS_INCLUDE
+#  ifdef __has_include
+/*
+ * Not defined as XXH_HAS_INCLUDE(x) (function-like) because
+ * this causes segfaults in Apple Clang 4.2 (on Mac OS X 10.7 Lion)
+ */
+#    define XXH_HAS_INCLUDE __has_include
+#  else
+#    define XXH_HAS_INCLUDE(x) 0
+#  endif
+#endif
+
+#if defined(__GNUC__) || defined(__clang__)
+#  if defined(__ARM_FEATURE_SVE)
+#    include <arm_sve.h>
+#  endif
+#  if defined(__ARM_NEON__) || defined(__ARM_NEON) \
+   || (defined(_M_ARM) && _M_ARM >= 7) \
+   || defined(_M_ARM64) || defined(_M_ARM64EC) \
+   || (defined(__wasm_simd128__) && XXH_HAS_INCLUDE(<arm_neon.h>)) /* WASM SIMD128 via SIMDe */
+#    define inline __inline__  /* circumvent a clang bug */
+#    include <arm_neon.h>
+#    undef inline
+#  elif defined(__AVX2__)
+#    include <immintrin.h>
+#  elif defined(__SSE2__)
+#    include <emmintrin.h>
+#  endif
+#endif
+
+#if defined(_MSC_VER)
+#  include <intrin.h>
+#endif
+
+/*
+ * One goal of XXH3 is to make it fast on both 32-bit and 64-bit, while
+ * remaining a true 64-bit/128-bit hash function.
+ *
+ * This is done by prioritizing a subset of 64-bit operations that can be
+ * emulated without too many steps on the average 32-bit machine.
+ *
+ * For example, these two lines seem similar, and run equally fast on 64-bit:
+ *
+ *   xxh_u64 x;
+ *   x ^= (x >> 47); // good
+ *   x ^= (x >> 13); // bad
+ *
+ * However, to a 32-bit machine, there is a major difference.
+ *
+ * x ^= (x >> 47) looks like this:
+ *
+ *   x.lo ^= (x.hi >> (47 - 32));
+ *
+ * while x ^= (x >> 13) looks like this:
+ *
+ *   // note: funnel shifts are not usually cheap.
+ *   x.lo ^= (x.lo >> 13) | (x.hi << (32 - 13));
+ *   x.hi ^= (x.hi >> 13);
+ *
+ * The first one is significantly faster than the second, simply because the
+ * shift is larger than 32. This means:
+ *  - All the bits we need are in the upper 32 bits, so we can ignore the lower
+ *    32 bits in the shift.
+ *  - The shift result will always fit in the lower 32 bits, and therefore,
+ *    we can ignore the upper 32 bits in the xor.
+ *
+ * Thanks to this optimization, XXH3 only requires these features to be efficient:
+ *
+ *  - Usable unaligned access
+ *  - A 32-bit or 64-bit ALU
+ *      - If 32-bit, a decent ADC instruction
+ *  - A 32 or 64-bit multiply with a 64-bit result
+ *  - For the 128-bit variant, a decent byteswap helps short inputs.
+ *
+ * The first two are already required by XXH32, and almost all 32-bit and 64-bit
+ * platforms which can run XXH32 can run XXH3 efficiently.
+ *
+ * Thumb-1, the classic 16-bit only subset of ARM's instruction set, is one
+ * notable exception.
+ *
+ * First of all, Thumb-1 lacks support for the UMULL instruction which
+ * performs the important long multiply. This means numerous __aeabi_lmul
+ * calls.
+ *
+ * Second of all, the 8 functional registers are just not enough.
+ * Setup for __aeabi_lmul, byteshift loads, pointers, and all arithmetic need
+ * Lo registers, and this shuffling results in thousands more MOVs than A32.
+ *
+ * A32 and T32 don't have this limitation. They can access all 14 registers,
+ * do a 32->64 multiply with UMULL, and the flexible operand allowing free
+ * shifts is helpful, too.
+ *
+ * Therefore, we do a quick sanity check.
+ *
+ * If compiling Thumb-1 for a target which supports ARM instructions, we will
+ * emit a warning, as it is not a "sane" platform to compile for.
+ *
+ * Usually, if this happens, it is because of an accident and you probably need
+ * to specify -march, as you likely meant to compile for a newer architecture.
+ *
+ * Credit: large sections of the vectorial and asm source code paths
+ *         have been contributed by @easyaspi314
+ */
+#if defined(__thumb__) && !defined(__thumb2__) && defined(__ARM_ARCH_ISA_ARM)
+#   warning "XXH3 is highly inefficient without ARM or Thumb-2."
+#endif
+
+/* ==========================================
+ * Vectorization detection
+ * ========================================== */
+
+#ifdef XXH_DOXYGEN
+/*!
+ * @ingroup tuning
+ * @brief Overrides the vectorization implementation chosen for XXH3.
+ *
+ * Can be defined to 0 to disable SIMD or any of the values mentioned in
+ * @ref XXH_VECTOR_TYPE.
+ *
+ * If this is not defined, it uses predefined macros to determine the best
+ * implementation.
+ */
+#  define XXH_VECTOR XXH_SCALAR
+/*!
+ * @ingroup tuning
+ * @brief Possible values for @ref XXH_VECTOR.
+ *
+ * Note that these are actually implemented as macros.
+ *
+ * If this is not defined, it is detected automatically.
+ * internal macro XXH_X86DISPATCH overrides this.
+ */
+enum XXH_VECTOR_TYPE /* fake enum */ {
+    XXH_SCALAR = 0,  /*!< Portable scalar version */
+    XXH_SSE2   = 1,  /*!<
+                      * SSE2 for Pentium 4, Opteron, all x86_64.
+                      *
+                      * @note SSE2 is also guaranteed on Windows 10, macOS, and
+                      * Android x86.
+                      */
+    XXH_AVX2   = 2,  /*!< AVX2 for Haswell and Bulldozer */
+    XXH_AVX512 = 3,  /*!< AVX512 for Skylake and Icelake */
+    XXH_NEON   = 4,  /*!<
+                       * NEON for most ARMv7-A, all AArch64, and WASM SIMD128
+                       * via the SIMDeverywhere polyfill provided with the
+                       * Emscripten SDK.
+                       */
+    XXH_VSX    = 5,  /*!< VSX and ZVector for POWER8/z13 (64-bit) */
+    XXH_SVE    = 6,  /*!< SVE for some ARMv8-A and ARMv9-A */
+};
+/*!
+ * @ingroup tuning
+ * @brief Selects the minimum alignment for XXH3's accumulators.
+ *
+ * When using SIMD, this should match the alignment required for said vector
+ * type, so, for example, 32 for AVX2.
+ *
+ * Default: Auto detected.
+ */
+#  define XXH_ACC_ALIGN 8
+#endif
+
+/* Actual definition */
+#ifndef XXH_DOXYGEN
+#  define XXH_SCALAR 0
+#  define XXH_SSE2   1
+#  define XXH_AVX2   2
+#  define XXH_AVX512 3
+#  define XXH_NEON   4
+#  define XXH_VSX    5
+#  define XXH_SVE    6
+#endif
+
+#ifndef XXH_VECTOR    /* can be defined on command line */
+#  if defined(__ARM_FEATURE_SVE)
+#    define XXH_VECTOR XXH_SVE
+#  elif ( \
+        defined(__ARM_NEON__) || defined(__ARM_NEON) /* gcc */ \
+     || defined(_M_ARM) || defined(_M_ARM64) || defined(_M_ARM64EC) /* msvc */ \
+     || (defined(__wasm_simd128__) && XXH_HAS_INCLUDE(<arm_neon.h>)) /* wasm simd128 via SIMDe */ \
+   ) && ( \
+        defined(_WIN32) || defined(__LITTLE_ENDIAN__) /* little endian only */ \
+    || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) \
+   )
+#    define XXH_VECTOR XXH_NEON
+#  elif defined(__AVX512F__)
+#    define XXH_VECTOR XXH_AVX512
+#  elif defined(__AVX2__)
+#    define XXH_VECTOR XXH_AVX2
+#  elif defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP == 2))
+#    define XXH_VECTOR XXH_SSE2
+#  elif (defined(__PPC64__) && defined(__POWER8_VECTOR__)) \
+     || (defined(__s390x__) && defined(__VEC__)) \
+     && defined(__GNUC__) /* TODO: IBM XL */
+#    define XXH_VECTOR XXH_VSX
+#  else
+#    define XXH_VECTOR XXH_SCALAR
+#  endif
+#endif
+
+/* __ARM_FEATURE_SVE is only supported by GCC & Clang. */
+#if (XXH_VECTOR == XXH_SVE) && !defined(__ARM_FEATURE_SVE)
+#  ifdef _MSC_VER
+#    pragma warning(once : 4606)
+#  else
+#    warning "__ARM_FEATURE_SVE isn't supported. Use SCALAR instead."
+#  endif
+#  undef XXH_VECTOR
+#  define XXH_VECTOR XXH_SCALAR
+#endif
+
+/*
+ * Controls the alignment of the accumulator,
+ * for compatibility with aligned vector loads, which are usually faster.
+ */
+#ifndef XXH_ACC_ALIGN
+#  if defined(XXH_X86DISPATCH)
+#     define XXH_ACC_ALIGN 64  /* for compatibility with avx512 */
+#  elif XXH_VECTOR == XXH_SCALAR  /* scalar */
+#     define XXH_ACC_ALIGN 8
+#  elif XXH_VECTOR == XXH_SSE2  /* sse2 */
+#     define XXH_ACC_ALIGN 16
+#  elif XXH_VECTOR == XXH_AVX2  /* avx2 */
+#     define XXH_ACC_ALIGN 32
+#  elif XXH_VECTOR == XXH_NEON  /* neon */
+#     define XXH_ACC_ALIGN 16
+#  elif XXH_VECTOR == XXH_VSX   /* vsx */
+#     define XXH_ACC_ALIGN 16
+#  elif XXH_VECTOR == XXH_AVX512  /* avx512 */
+#     define XXH_ACC_ALIGN 64
+#  elif XXH_VECTOR == XXH_SVE   /* sve */
+#     define XXH_ACC_ALIGN 64
+#  endif
+#endif
+
+#if defined(XXH_X86DISPATCH) || XXH_VECTOR == XXH_SSE2 \
+    || XXH_VECTOR == XXH_AVX2 || XXH_VECTOR == XXH_AVX512
+#  define XXH_SEC_ALIGN XXH_ACC_ALIGN
+#elif XXH_VECTOR == XXH_SVE
+#  define XXH_SEC_ALIGN XXH_ACC_ALIGN
+#else
+#  define XXH_SEC_ALIGN 8
+#endif
+
+#if defined(__GNUC__) || defined(__clang__)
+#  define XXH_ALIASING __attribute__((may_alias))
+#else
+#  define XXH_ALIASING /* nothing */
+#endif
+
+/*
+ * UGLY HACK:
+ * GCC usually generates the best code with -O3 for xxHash.
+ *
+ * However, when targeting AVX2, it is overzealous in its unrolling resulting
+ * in code roughly 3/4 the speed of Clang.
+ *
+ * There are other issues, such as GCC splitting _mm256_loadu_si256 into
+ * _mm_loadu_si128 + _mm256_inserti128_si256. This is an optimization which
+ * only applies to Sandy and Ivy Bridge... which don't even support AVX2.
+ *
+ * That is why when compiling the AVX2 version, it is recommended to use either
+ *   -O2 -mavx2 -march=haswell
+ * or
+ *   -O2 -mavx2 -mno-avx256-split-unaligned-load
+ * for decent performance, or to use Clang instead.
+ *
+ * Fortunately, we can control the first one with a pragma that forces GCC into
+ * -O2, but the other one we can't control without "failed to inline always
+ * inline function due to target mismatch" warnings.
+ */
+#if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \
+  && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \
+  && defined(__OPTIMIZE__) && XXH_SIZE_OPT <= 0 /* respect -O0 and -Os */
+#  pragma GCC push_options
+#  pragma GCC optimize("-O2")
+#endif
+
+#if XXH_VECTOR == XXH_NEON
+
+/*
+ * UGLY HACK: While AArch64 GCC on Linux does not seem to care, on macOS, GCC -O3
+ * optimizes out the entire hashLong loop because of the aliasing violation.
+ *
+ * However, GCC is also inefficient at load-store optimization with vld1q/vst1q,
+ * so the only option is to mark it as aliasing.
+ */
+typedef uint64x2_t xxh_aliasing_uint64x2_t XXH_ALIASING;
+
+/*!
+ * @internal
+ * @brief `vld1q_u64` but faster and alignment-safe.
+ *
+ * On AArch64, unaligned access is always safe, but on ARMv7-a, it is only
+ * *conditionally* safe (`vld1` has an alignment bit like `movdq[ua]` in x86).
+ *
+ * GCC for AArch64 sees `vld1q_u8` as an intrinsic instead of a load, so it
+ * prohibits load-store optimizations. Therefore, a direct dereference is used.
+ *
+ * Otherwise, `vld1q_u8` is used with `vreinterpretq_u8_u64` to do a safe
+ * unaligned load.
+ */
+#if defined(__aarch64__) && defined(__GNUC__) && !defined(__clang__)
+XXH_FORCE_INLINE uint64x2_t XXH_vld1q_u64(void const* ptr) /* silence -Wcast-align */
+{   /* Plain dereference through the may_alias vector typedef rather than a load intrinsic. */
+    return *(xxh_aliasing_uint64x2_t const *)ptr;
+}
+#else
+XXH_FORCE_INLINE uint64x2_t XXH_vld1q_u64(void const* ptr)
+{   uint8x16_t const bytes = vld1q_u8((uint8_t const*)ptr);   /* byte-wise vector load */
+    return vreinterpretq_u64_u8(bytes);
+}
+#endif
+
+/*!
+ * @internal
+ * @brief `vmlal_u32` on low and high halves of a vector.
+ *
+ * This is a workaround for AArch64 GCC < 11 which implemented arm_neon.h with
+ * inline assembly and were therefore incapable of merging the `vget_{low, high}_u32`
+ * with `vmlal_u32`.
+ */
+#if defined(__aarch64__) && defined(__GNUC__) && !defined(__clang__) && __GNUC__ < 11
+XXH_FORCE_INLINE uint64x2_t
+XXH_vmlal_low_u32(uint64x2_t acc, uint32x4_t lhs, uint32x4_t rhs)
+{
+    /* Inline assembly is the only way here; acc is both read and written ("+w"). */
+    __asm__("umlal   %0.2d, %1.2s, %2.2s" : "+w" (acc) : "w" (lhs), "w" (rhs));
+    return acc;
+}
+XXH_FORCE_INLINE uint64x2_t
+XXH_vmlal_high_u32(uint64x2_t acc, uint32x4_t lhs, uint32x4_t rhs)
+{
+    /* This intrinsic works as expected, so the high half needs no assembly. */
+    return vmlal_high_u32(acc, lhs, rhs);
+}
+#else
+/* Portable intrinsic versions */
+XXH_FORCE_INLINE uint64x2_t
+XXH_vmlal_low_u32(uint64x2_t acc, uint32x4_t lhs, uint32x4_t rhs)
+{   uint32x2_t const lo_l = vget_low_u32(lhs), lo_r = vget_low_u32(rhs);   /* low halves */
+    return vmlal_u32(acc, lo_l, lo_r);
+}
+/*! @copydoc XXH_vmlal_low_u32
+ * Assume the compiler converts this to vmlal_high_u32 on aarch64 */
+XXH_FORCE_INLINE uint64x2_t
+XXH_vmlal_high_u32(uint64x2_t acc, uint32x4_t lhs, uint32x4_t rhs)
+{   uint32x2_t const hi_l = vget_high_u32(lhs), hi_r = vget_high_u32(rhs);   /* high halves */
+    return vmlal_u32(acc, hi_l, hi_r);
+}
+#endif
+
+/*!
+ * @ingroup tuning
+ * @brief Controls the NEON to scalar ratio for XXH3
+ *
+ * This can be set to 2, 4, 6, or 8.
+ *
+ * ARM Cortex CPUs are _very_ sensitive to how their pipelines are used.
+ *
+ * For example, the Cortex-A73 can dispatch 3 micro-ops per cycle, but only 2 of those
+ * can be NEON. If you are only using NEON instructions, you are only using 2/3 of the CPU
+ * bandwidth.
+ *
+ * This is even more noticeable on the more advanced cores like the Cortex-A76 which
+ * can dispatch 8 micro-ops per cycle, but still only 2 NEON micro-ops at once.
+ *
+ * Therefore, to make the most out of the pipeline, it is beneficial to run 6 NEON lanes
+ * and 2 scalar lanes, which is chosen by default.
+ *
+ * This does not apply to Apple processors or 32-bit processors, which run better with
+ * full NEON. These will default to 8. Additionally, size-optimized builds run 8 lanes.
+ *
+ * This change benefits CPUs with large micro-op buffers without negatively affecting
+ * most other CPUs:
+ *
+ *  | Chipset               | Dispatch type       | NEON only | 6:2 hybrid | Diff. |
+ *  |:----------------------|:--------------------|----------:|-----------:|------:|
+ *  | Snapdragon 730 (A76)  | 2 NEON/8 micro-ops  |  8.8 GB/s |  10.1 GB/s |  ~16% |
+ *  | Snapdragon 835 (A73)  | 2 NEON/3 micro-ops  |  5.1 GB/s |   5.3 GB/s |   ~5% |
+ *  | Marvell PXA1928 (A53) | In-order dual-issue |  1.9 GB/s |   1.9 GB/s |    0% |
+ *  | Apple M1              | 4 NEON/8 micro-ops  | 37.3 GB/s |  36.1 GB/s |  ~-3% |
+ *
+ * It also seems to fix some bad codegen on GCC, making it almost as fast as clang.
+ *
+ * When using WASM SIMD128, if this is 2 or 6, SIMDe will scalarize 2 of the lanes, meaning
+ * it effectively becomes a worse version of 4.
+ *
+ * @see XXH3_accumulate_512_neon()
+ */
+# ifndef XXH3_NEON_LANES
+#  if (defined(__aarch64__) || defined(__arm64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) \
+   && !defined(__APPLE__) && XXH_SIZE_OPT <= 0
+#   define XXH3_NEON_LANES 6
+#  else
+#   define XXH3_NEON_LANES XXH_ACC_NB
+#  endif
+# endif
+#endif  /* XXH_VECTOR == XXH_NEON */
+
+/*
+ * VSX and Z Vector helpers.
+ *
+ * This is very messy, and any pull requests to clean this up are welcome.
+ *
+ * There are a lot of problems with supporting VSX and s390x, due to
+ * inconsistent intrinsics, spotty coverage, and multiple endiannesses.
+ */
+#if XXH_VECTOR == XXH_VSX
+/* Annoyingly, these headers _may_ define three macros: `bool`, `vector`,
+ * and `pixel`. This is a problem for obvious reasons.
+ *
+ * These keywords are unnecessary; the spec literally says they are
+ * equivalent to `__bool`, `__vector`, and `__pixel` and may be undef'd
+ * after including the header.
+ *
+ * We use pragma push_macro/pop_macro to keep the namespace clean. */
+#  pragma push_macro("bool")
+#  pragma push_macro("vector")
+#  pragma push_macro("pixel")
+/* silence potential macro redefined warnings */
+#  undef bool
+#  undef vector
+#  undef pixel
+
+#  if defined(__s390x__)
+#    include <s390intrin.h>
+#  else
+#    include <altivec.h>
+#  endif
+
+/* Restore the original macro values, if applicable. */
+#  pragma pop_macro("pixel")
+#  pragma pop_macro("vector")
+#  pragma pop_macro("bool")
+
+typedef __vector unsigned long long xxh_u64x2;
+typedef __vector unsigned char xxh_u8x16;
+typedef __vector unsigned xxh_u32x4;
+
+/*
+ * UGLY HACK: Similar to aarch64 macOS GCC, s390x GCC has the same aliasing issue.
+ */
+typedef xxh_u64x2 xxh_aliasing_u64x2 XXH_ALIASING;
+
+# ifndef XXH_VSX_BE
+#  if defined(__BIG_ENDIAN__) \
+  || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+#    define XXH_VSX_BE 1
+#  elif defined(__VEC_ELEMENT_REG_ORDER__) && __VEC_ELEMENT_REG_ORDER__ == __ORDER_BIG_ENDIAN__
+#    warning "-maltivec=be is not recommended. Please use native endianness."
+#    define XXH_VSX_BE 1
+#  else
+#    define XXH_VSX_BE 0
+#  endif
+# endif /* !defined(XXH_VSX_BE) */
+
+# if XXH_VSX_BE
+#  if defined(__POWER9_VECTOR__) || (defined(__clang__) && defined(__s390x__))
+#    define XXH_vec_revb vec_revb
+#  else
+/*!
+ * A polyfill for POWER9's vec_revb().
+ */
+XXH_FORCE_INLINE xxh_u64x2 XXH_vec_revb(xxh_u64x2 val)
+{   /* Permute indices 7..0 and 15..8: the bytes of each 8-byte half are reversed. */
+    xxh_u8x16 const vByteSwap = { 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00,
+                                  0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08 };
+    return vec_perm(val, val, vByteSwap);
+}
+#  endif
+# endif /* XXH_VSX_BE */
+
+/*!
+ * Performs an unaligned vector load and byte swaps it on big endian.
+ */
+XXH_FORCE_INLINE xxh_u64x2 XXH_vec_loadu(const void *ptr)
+{   xxh_u64x2 vec;   /* filled through XXH_memcpy, so ptr is never dereferenced as a vector */
+    XXH_memcpy(&vec, ptr, sizeof(vec));
+# if XXH_VSX_BE
+    return XXH_vec_revb(vec);
+# else
+    return vec;
+# endif
+}
+
+/*
+ * vec_mulo and vec_mule are very problematic intrinsics on PowerPC
+ *
+ * These intrinsics weren't added until GCC 8, despite existing for a while,
+ * and they are endian dependent. Also, their meanings swap depending on the version.
+ * */
+# if defined(__s390x__)
+ /* s390x is always big endian, no issue on this platform */
+#  define XXH_vec_mulo vec_mulo
+#  define XXH_vec_mule vec_mule
+# elif defined(__clang__) && XXH_HAS_BUILTIN(__builtin_altivec_vmuleuw) && !defined(__ibmxl__)
+/* Clang has a better way to control this, we can just use the builtin which doesn't swap. */
+ /* The IBM XL Compiler (which defines __clang__) only implements the vec_* operations */
+#  define XXH_vec_mulo __builtin_altivec_vmulouw
+#  define XXH_vec_mule __builtin_altivec_vmuleuw
+# else
+/* gcc needs inline assembly */
+/* Adapted from https://github.com/google/highwayhash/blob/master/highwayhash/hh_vsx.h. */
+XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mulo(xxh_u32x4 a, xxh_u32x4 b)
+{   /* Emits the vmulouw instruction directly instead of going through vec_mulo. */
+    xxh_u64x2 result;
+    __asm__("vmulouw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b));
+    return result;
+}
+XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mule(xxh_u32x4 a, xxh_u32x4 b)
+{   /* Emits the vmuleuw instruction directly instead of going through vec_mule. */
+    xxh_u64x2 result;
+    __asm__("vmuleuw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b));
+    return result;
+}
+# endif /* XXH_vec_mulo, XXH_vec_mule */
+#endif /* XXH_VECTOR == XXH_VSX */
+
+#if XXH_VECTOR == XXH_SVE
+#define ACCRND(acc, offset) /* one accumulate step; uses mask, xinput, xsecret and kSwap from the expansion site */ \
+do { \
+    svuint64_t input_vec = svld1_u64(mask, xinput + offset);         \
+    svuint64_t secret_vec = svld1_u64(mask, xsecret + offset);       \
+    svuint64_t mixed = sveor_u64_x(mask, secret_vec, input_vec);     /* input ^ secret */ \
+    svuint64_t swapped = svtbl_u64(input_vec, kSwap);                /* input lanes picked through kSwap */ \
+    svuint64_t mixed_lo = svextw_u64_x(mask, mixed);                 /* low 32 bits of each lane */ \
+    svuint64_t mixed_hi = svlsr_n_u64_x(mask, mixed, 32);            /* high 32 bits of each lane */ \
+    svuint64_t mul = svmad_u64_x(mask, mixed_lo, mixed_hi, swapped); /* mixed_lo * mixed_hi + swapped */ \
+    acc = svadd_u64_x(mask, acc, mul);                               \
+} while (0)
+#endif /* XXH_VECTOR == XXH_SVE */
+
+/* XXH_PREFETCH(ptr): read-prefetch hint for ptr. Becomes a no-op when the XXH_NO_PREFETCH
+ * build macro is declared, when XXH_SIZE_OPT >= 1, or when no known intrinsic is available. */
+#if defined(XXH_NO_PREFETCH)
+#  define XXH_PREFETCH(ptr)  (void)(ptr)  /* disabled */
+#else
+#  if XXH_SIZE_OPT >= 1
+#    define XXH_PREFETCH(ptr) (void)(ptr)  /* disabled when optimizing for size */
+#  elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))  /* _mm_prefetch() not defined outside of x86/x64 */
+#    include <mmintrin.h>   /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */
+#    define XXH_PREFETCH(ptr)  _mm_prefetch((const char*)(ptr), _MM_HINT_T0)
+#  elif defined(__GNUC__) && ( (__GNUC__ >= 4) || ( (__GNUC__ == 3) && (__GNUC_MINOR__ >= 1) ) )
+#    define XXH_PREFETCH(ptr)  __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */)
+#  else
+#    define XXH_PREFETCH(ptr) (void)(ptr)  /* disabled */
+#  endif
+#endif  /* XXH_NO_PREFETCH */
+
+
+/* ==========================================
+ * XXH3 default settings
+ * ========================================== */
+
+#define XXH_SECRET_DEFAULT_SIZE 192   /* size in bytes of XXH3_kSecret; must be >= XXH3_SECRET_SIZE_MIN */
+
+#if (XXH_SECRET_DEFAULT_SIZE < XXH3_SECRET_SIZE_MIN)  /* compile-time check of the requirement above */
+#  error "default keyset is not large enough"
+#endif
+
+/*! Default secret: XXH_SECRET_DEFAULT_SIZE pseudorandom bytes taken directly from FARSH, declared with XXH_ALIGN(64). */
+XXH_ALIGN(64) static const xxh_u8 XXH3_kSecret[XXH_SECRET_DEFAULT_SIZE] = {
+    0xb8, 0xfe, 0x6c, 0x39, 0x23, 0xa4, 0x4b, 0xbe, 0x7c, 0x01, 0x81, 0x2c, 0xf7, 0x21, 0xad, 0x1c,
+    0xde, 0xd4, 0x6d, 0xe9, 0x83, 0x90, 0x97, 0xdb, 0x72, 0x40, 0xa4, 0xa4, 0xb7, 0xb3, 0x67, 0x1f,
+    0xcb, 0x79, 0xe6, 0x4e, 0xcc, 0xc0, 0xe5, 0x78, 0x82, 0x5a, 0xd0, 0x7d, 0xcc, 0xff, 0x72, 0x21,
+    0xb8, 0x08, 0x46, 0x74, 0xf7, 0x43, 0x24, 0x8e, 0xe0, 0x35, 0x90, 0xe6, 0x81, 0x3a, 0x26, 0x4c,
+    0x3c, 0x28, 0x52, 0xbb, 0x91, 0xc3, 0x00, 0xcb, 0x88, 0xd0, 0x65, 0x8b, 0x1b, 0x53, 0x2e, 0xa3,
+    0x71, 0x64, 0x48, 0x97, 0xa2, 0x0d, 0xf9, 0x4e, 0x38, 0x19, 0xef, 0x46, 0xa9, 0xde, 0xac, 0xd8,
+    0xa8, 0xfa, 0x76, 0x3f, 0xe3, 0x9c, 0x34, 0x3f, 0xf9, 0xdc, 0xbb, 0xc7, 0xc7, 0x0b, 0x4f, 0x1d,
+    0x8a, 0x51, 0xe0, 0x4b, 0xcd, 0xb4, 0x59, 0x31, 0xc8, 0x9f, 0x7e, 0xc9, 0xd9, 0x78, 0x73, 0x64,
+    0xea, 0xc5, 0xac, 0x83, 0x34, 0xd3, 0xeb, 0xc3, 0xc5, 0x81, 0xa0, 0xff, 0xfa, 0x13, 0x63, 0xeb,
+    0x17, 0x0d, 0xdd, 0x51, 0xb7, 0xf0, 0xda, 0x49, 0xd3, 0x16, 0x55, 0x26, 0x29, 0xd4, 0x68, 0x9e,
+    0x2b, 0x16, 0xbe, 0x58, 0x7d, 0x47, 0xa1, 0xfc, 0x8f, 0xf8, 0xb8, 0xd1, 0x7a, 0xd0, 0x31, 0xce,
+    0x45, 0xcb, 0x3a, 0x8f, 0x95, 0x16, 0x04, 0x28, 0xaf, 0xd7, 0xfb, 0xca, 0xbb, 0x4b, 0x40, 0x7e,
+};
+
+static const xxh_u64 PRIME_MX1 = 0x165667919E3779F9ULL;  /*!< multiplier of XXH3_avalanche(); 0b0001011001010110011001111001000110011110001101110111100111111001 */
+static const xxh_u64 PRIME_MX2 = 0x9FB21C651E98DF25ULL;  /*!< multiplier of XXH3_rrmxmx(); 0b1001111110110010000111000110010100011110100110001101111100100101 */
+
+#ifdef XXH_OLD_NAMES
+#  define kSecret XXH3_kSecret  /* alias only provided when XXH_OLD_NAMES is defined */
+#endif
+
+#ifdef XXH_DOXYGEN
+/*!
+ * @brief Calculates a 32-bit to 64-bit long multiply.
+ *
+ * Implemented as a macro; this function body is only compiled for Doxygen.
+ *
+ * Wraps `__emulu` on MSVC x86 because it tends to call `__allmul` when it doesn't
+ * need to (but it shouldn't need to anyways, it is about 7 instructions to do
+ * a 64x64 multiply...). Since we know that this will _always_ emit `MULL`, we
+ * use that instead of the normal method.
+ *
+ * If you are compiling for platforms like Thumb-1 and don't have a better option,
+ * you may also want to write your own long multiply routine here.
+ *
+ * @param x, y Numbers to be multiplied (only their low 32 bits are used)
+ * @return 64-bit product of the low 32 bits of @p x and @p y.
+ */
+XXH_FORCE_INLINE xxh_u64
+XXH_mult32to64(xxh_u64 x, xxh_u64 y)
+{
+   return (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF);
+}
+#elif defined(_MSC_VER) && defined(_M_IX86)
+#    define XXH_mult32to64(x, y) __emulu((unsigned)(x), (unsigned)(y))
+#else
+/*
+ * Downcast + upcast is usually better than masking on older compilers like
+ * GCC 4.2 (especially 32-bit ones), all without affecting newer compilers.
+ *
+ * The other method, (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF), will AND both operands
+ * and perform a full 64x64 multiply -- entirely redundant on 32-bit.
+ */
+#    define XXH_mult32to64(x, y) ((xxh_u64)(xxh_u32)(x) * (xxh_u64)(xxh_u32)(y))
+#endif
+
+/*!
+ * @brief Calculates a 64->128-bit long multiply.
+ *
+ * Uses `__uint128_t`, `_umul128` or `__umulh` if available, otherwise uses a
+ * portable scalar version.
+ *
+ * @param lhs , rhs The 64-bit integers to be multiplied
+ * @return The 128-bit result represented in an @ref XXH128_hash_t.
+ */
+static XXH128_hash_t
+XXH_mult64to128(xxh_u64 lhs, xxh_u64 rhs)
+{
+    /*
+     * GCC/Clang __uint128_t method.
+     *
+     * On most 64-bit targets, GCC and Clang define a __uint128_t type.
+     * This is usually the best way as it usually uses a native long 64-bit
+     * multiply, such as MULQ on x86_64 or MUL + UMULH on aarch64.
+     *
+     * Usually.
+     *
+     * On wasm, despite it being a 32-bit platform, Clang (and emscripten) define
+     * this type without having the arithmetic for it. This results in a laggy
+     * compiler builtin call which calculates a full 128-bit multiply.
+     * In that case it is best to use the portable one (hence the __wasm__ check).
+     * https://github.com/Cyan4973/xxHash/issues/211#issuecomment-515575677
+     */
+#if (defined(__GNUC__) || defined(__clang__)) && !defined(__wasm__) \
+    && defined(__SIZEOF_INT128__) \
+    || (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128)
+
+    __uint128_t const product = (__uint128_t)lhs * (__uint128_t)rhs;
+    XXH128_hash_t r128;
+    r128.low64  = (xxh_u64)(product);
+    r128.high64 = (xxh_u64)(product >> 64);
+    return r128;
+
+    /*
+     * MSVC for x64's _umul128 method.
+     *
+     * xxh_u64 _umul128(xxh_u64 Multiplier, xxh_u64 Multiplicand, xxh_u64 *HighProduct);
+     *
+     * This compiles to single operand MUL on x64.
+     */
+#elif (defined(_M_X64) || defined(_M_IA64)) && !defined(_M_ARM64EC)
+
+#ifndef _MSC_VER  /* NOTE(review): pragma is only emitted when _MSC_VER is NOT defined -- confirm intent */
+#   pragma intrinsic(_umul128)
+#endif
+    xxh_u64 product_high;
+    xxh_u64 const product_low = _umul128(lhs, rhs, &product_high);
+    XXH128_hash_t r128;
+    r128.low64  = product_low;
+    r128.high64 = product_high;
+    return r128;
+
+    /*
+     * MSVC for ARM64's __umulh method.
+     *
+     * This compiles to the same MUL + UMULH as GCC/Clang's __uint128_t method.
+     */
+#elif defined(_M_ARM64) || defined(_M_ARM64EC)
+
+#ifndef _MSC_VER  /* NOTE(review): same inverted-looking guard as the _umul128 branch -- confirm intent */
+#   pragma intrinsic(__umulh)
+#endif
+    XXH128_hash_t r128;
+    r128.low64  = lhs * rhs;
+    r128.high64 = __umulh(lhs, rhs);
+    return r128;
+
+#else
+    /*
+     * Portable scalar method. Optimized for 32-bit and 64-bit ALUs.
+     *
+     * This is a fast and simple grade school multiply, which is shown below
+     * with base 10 arithmetic instead of base 0x100000000.
+     *
+     *           9 3 // D2 lhs = 93
+     *         x 7 5 // D2 rhs = 75
+     *     ----------
+     *           1 5 // D2 lo_lo = (93 % 10) * (75 % 10) = 15
+     *         4 5 | // D2 hi_lo = (93 / 10) * (75 % 10) = 45
+     *         2 1 | // D2 lo_hi = (93 % 10) * (75 / 10) = 21
+     *     + 6 3 | | // D2 hi_hi = (93 / 10) * (75 / 10) = 63
+     *     ---------
+     *         2 7 | // D2 cross = (15 / 10) + (45 % 10) + 21 = 27
+     *     + 6 9 | | // D2 upper = (27 / 10) + (45 / 10) + 63 = 69
+     *     ---------
+     *       6 9 7 5 // D4 res = ((27 % 10) * 10) + (15 % 10) + (69 * 100) = 6975
+     *
+     * The reasons for adding the products like this are:
+     *  1. It avoids manual carry tracking. Just like how
+     *     (9 * 9) + 9 + 9 = 99, the same applies with this for UINT64_MAX.
+     *     This avoids a lot of complexity.
+     *
+     *  2. It hints for, and on Clang, compiles to, the powerful UMAAL
+     *     instruction available in ARM's Digital Signal Processing extension
+     *     in 32-bit ARMv6 and later, which is shown below:
+     *
+     *         void UMAAL(xxh_u32 *RdLo, xxh_u32 *RdHi, xxh_u32 Rn, xxh_u32 Rm)
+     *         {
+     *             xxh_u64 product = (xxh_u64)Rn * (xxh_u64)Rm + *RdLo + *RdHi;
+     *             *RdLo = (xxh_u32)(product & 0xFFFFFFFF);
+     *             *RdHi = (xxh_u32)(product >> 32);
+     *         }
+     *
+     *     This instruction was designed for efficient long multiplication, and
+     *     allows this to be calculated in only 4 instructions at speeds
+     *     comparable to some 64-bit ALUs.
+     *
+     *  3. It isn't terrible on other platforms. Usually this will be a couple
+     *     of 32-bit ADD/ADCs.
+     */
+
+    /* First calculate all of the cross products. */
+    xxh_u64 const lo_lo = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs & 0xFFFFFFFF);
+    xxh_u64 const hi_lo = XXH_mult32to64(lhs >> 32,        rhs & 0xFFFFFFFF);
+    xxh_u64 const lo_hi = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs >> 32);
+    xxh_u64 const hi_hi = XXH_mult32to64(lhs >> 32,        rhs >> 32);
+
+    /* Now add the products together. These will never overflow. */
+    xxh_u64 const cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi;
+    xxh_u64 const upper = (hi_lo >> 32) + (cross >> 32)        + hi_hi;
+    xxh_u64 const lower = (cross << 32) | (lo_lo & 0xFFFFFFFF);  /* the shift keeps only the low half of cross */
+
+    XXH128_hash_t r128;
+    r128.low64  = lower;
+    r128.high64 = upper;
+    return r128;
+#endif
+}
+
+/*!
+ * @brief Multiplies two 64-bit values to 128 bits, then XOR-folds the result to 64 bits.
+ *
+ * Kept as its own function so that fewer structs get passed around by value.
+ * The multiply will hopefully be inlined, but that is not forced.
+ *
+ * @param lhs , rhs The 64-bit integers to multiply
+ * @return The high 64 bits of the product XOR'd with the low 64 bits.
+ * @see XXH_mult64to128()
+ */
+static xxh_u64
+XXH3_mul128_fold64(xxh_u64 lhs, xxh_u64 rhs)
+{
+    XXH128_hash_t const wide = XXH_mult64to128(lhs, rhs);
+    return wide.high64 ^ wide.low64;
+}
+
+/*! Returns v64 ^ (v64 >> shift); as a helper it seems to yield slightly better code on GCC. */
+XXH_FORCE_INLINE XXH_CONSTF xxh_u64 XXH_xorshift64(xxh_u64 v64, int shift)
+{
+    XXH_ASSERT(0 <= shift && shift < 64);
+    return (v64 >> shift) ^ v64;
+}
+
+/*
+ * Fast avalanche stage: xorshift by 37, multiply by PRIME_MX1, xorshift by 32.
+ * Suitable when the input bits are already partially mixed.
+ * Takes the value to finalize in h64 and returns the avalanched 64-bit hash.
+ */
+static XXH64_hash_t XXH3_avalanche(xxh_u64 h64)
+{
+    xxh_u64 const scrambled = XXH_xorshift64(h64, 37) * PRIME_MX1;
+
+    return XXH_xorshift64(scrambled, 32);
+}
+
+/*
+ * Stronger avalanche, inspired by Pelle Evensen's rrmxmx.
+ * Preferable when the input has not been mixed beforehand.
+ * Steps: xor with two rotations (49 and 24), multiply by PRIME_MX2,
+ * xor with ((value >> 35) + len), multiply again, final xorshift by 28.
+ * h64 is the value to finalize; len is folded into the middle step.
+ */
+static XXH64_hash_t XXH3_rrmxmx(xxh_u64 h64, xxh_u64 len)
+{
+    xxh_u64 const rotated = h64 ^ XXH_rotl64(h64, 49) ^ XXH_rotl64(h64, 24);
+    xxh_u64 const stage1  = rotated * PRIME_MX2;
+    xxh_u64 const stage2  = (stage1 ^ ((stage1 >> 35) + len)) * PRIME_MX2;
+    return XXH_xorshift64(stage2, 28);
+}
+
+
+/* ==========================================
+ * Short keys
+ * ==========================================
+ * One of the shortcomings of XXH32 and XXH64 was that their performance was
+ * sub-optimal on short lengths. It used an iterative algorithm which strongly
+ * favored lengths that were a multiple of 4 or 8.
+ *
+ * Instead of iterating over individual inputs, we use a set of single shot
+ * functions which piece together a range of lengths and operate in constant time.
+ *
+ * Additionally, the number of multiplies has been significantly reduced. This
+ * reduces latency, especially when emulating 64-bit multiplies on 32-bit.
+ *
+ * Depending on the platform, this may or may not be faster than XXH32, but it
+ * is almost guaranteed to be faster than XXH64.
+ */
+
+/*
+ * At very short lengths, there isn't enough input to fully hide secrets, or use
+ * the entire secret.
+ *
+ * There is also only a limited amount of mixing we can do before significantly
+ * impacting performance.
+ *
+ * Therefore, we use different sections of the secret and always mix two secret
+ * samples with an XOR. This should have no effect on performance on the
+ * seedless or withSeed variants because everything _should_ be constant folded
+ * by modern compilers.
+ *
+ * The XOR mixing hides individual parts of the secret and increases entropy.
+ *
+ * This adds an extra layer of strength for custom secrets.
+ */
+/* Hashes an input of 1 to 3 bytes; reads the first 8 bytes of `secret`. */
+XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t
+XXH3_len_1to3_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
+{
+    XXH_ASSERT(input != NULL);
+    XXH_ASSERT(1 <= len && len <= 3);
+    XXH_ASSERT(secret != NULL);
+    /*
+     * Bytes of `packed`, listed from least to most significant:
+     * len = 1: { input[0], 0x01, input[0], input[0] }
+     * len = 2: { input[1], 0x02, input[0], input[1] }
+     * len = 3: { input[2], 0x03, input[0], input[1] }
+     */
+    {   xxh_u32 const first  = input[0];
+        xxh_u32 const middle = input[len >> 1];
+        xxh_u32 const last   = input[len - 1];
+        xxh_u32 const packed = last | ((xxh_u32)len << 8) | (first << 16) | (middle << 24);
+        xxh_u64 const bitflip = (XXH_readLE32(secret) ^ XXH_readLE32(secret+4)) + seed;
+        return XXH64_avalanche((xxh_u64)packed ^ bitflip);
+    }
+}
+
+/* Hashes an input of 4 to 8 bytes from its (possibly overlapping) first and last 4 bytes. */
+XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t
+XXH3_len_4to8_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
+{
+    XXH_ASSERT(input != NULL);
+    XXH_ASSERT(secret != NULL);
+    XXH_ASSERT(4 <= len && len <= 8);
+    seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32;   /* byte-swapped low half is XORed into the high half */
+    {   xxh_u32 const head = XXH_readLE32(input);
+        xxh_u32 const tail = XXH_readLE32(input + len - 4);
+        xxh_u64 const bitflip = (XXH_readLE64(secret+8) ^ XXH_readLE64(secret+16)) - seed;
+        xxh_u64 const packed = (((xxh_u64)head) << 32) + tail;
+        return XXH3_rrmxmx(packed ^ bitflip, len);
+    }
+}
+
+/* Hashes an input of 9 to 16 bytes from its (possibly overlapping) first and last 8 bytes. */
+XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t
+XXH3_len_9to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
+{
+    XXH_ASSERT(input != NULL);
+    XXH_ASSERT(secret != NULL);
+    XXH_ASSERT(9 <= len && len <= 16);
+    {   xxh_u64 const flip_lo = (XXH_readLE64(secret+24) ^ XXH_readLE64(secret+32)) + seed;
+        xxh_u64 const flip_hi = (XXH_readLE64(secret+40) ^ XXH_readLE64(secret+48)) - seed;
+        xxh_u64 const keyed_lo = XXH_readLE64(input)           ^ flip_lo;
+        xxh_u64 const keyed_hi = XXH_readLE64(input + len - 8) ^ flip_hi;
+        xxh_u64 const folded = XXH3_mul128_fold64(keyed_lo, keyed_hi);
+        xxh_u64 const acc = len + XXH_swap64(keyed_lo) + keyed_hi + folded;
+        return XXH3_avalanche(acc);
+    }
+}
+
+/* Dispatches inputs of at most 16 bytes to the helper that matches their length. */
+XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t
+XXH3_len_0to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
+{
+    XXH_ASSERT(len <= 16);
+    if (XXH_likely(len >  8)) return XXH3_len_9to16_64b(input, len, secret, seed);
+    if (XXH_likely(len >= 4)) return XXH3_len_4to8_64b(input, len, secret, seed);
+    if (len != 0) return XXH3_len_1to3_64b(input, len, secret, seed);
+    return XXH64_avalanche(seed ^ (XXH_readLE64(secret+56) ^ XXH_readLE64(secret+64)));  /* len == 0 */
+}
+
+/*
+ * DISCLAIMER: There are known *seed-dependent* multicollisions here due to
+ * multiplication by zero, affecting hashes of lengths 17 to 240.
+ *
+ * However, they are very unlikely.
+ *
+ * Keep this in mind when using the unseeded XXH3_64bits() variant: As with all
+ * unseeded non-cryptographic hashes, it does not attempt to defend itself
+ * against specially crafted inputs, only random inputs.
+ *
+ * Compared to classic UMAC where a 1 in 2^31 chance of 4 consecutive bytes
+ * cancelling out the secret is taken an arbitrary number of times (addressed
+ * in XXH3_accumulate_512), this collision is very unlikely with random inputs
+ * and/or proper seeding:
+ *
+ * This only has a 1 in 2^63 chance of 8 consecutive bytes cancelling out, in a
+ * function that is only called up to 16 times per hash with up to 240 bytes of
+ * input.
+ *
+ * This is not too bad for a non-cryptographic hash function, especially with
+ * only 64 bit outputs.
+ *
+ * The 128-bit variant (which trades some speed for strength) is NOT affected
+ * by this, although it is always a good idea to use a proper seed if you care
+ * about strength.
+ */
+XXH_FORCE_INLINE xxh_u64 XXH3_mix16B(const xxh_u8* XXH_RESTRICT input,
+                                     const xxh_u8* XXH_RESTRICT secret, xxh_u64 seed64)
+{
+#if defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \
+  && defined(__i386__) && defined(__SSE2__)  /* x86 + SSE2 */ \
+  && !defined(XXH_ENABLE_AUTOVECTORIZE)      /* Define to disable like XXH32 hack */
+    /*
+     * UGLY HACK:
+     * On x86, GCC likes to autovectorize the 128-bit multiply, which produces
+     * slower code.
+     *
+     * Pinning seed64 in a register upsets the cost model enough that GCC
+     * scalarizes instead. See `XXH32_round()`
+     *
+     * FIXME: Clang's output is still _much_ faster -- On an AMD Ryzen 3600,
+     * XXH3_64bits @ len=240 runs at 4.6 GB/s with Clang 9, but 3.3 GB/s on
+     * GCC 9.2, despite both emitting scalar code.
+     *
+     * GCC generates much better scalar code than Clang for the rest of XXH3,
+     * which is why finding a more optimal codepath is an interest.
+     */
+    XXH_COMPILER_GUARD(seed64);
+#endif
+    /* Each 8-byte half of the 16 input bytes is XORed with 8 secret bytes,
+     * the secret being offset by +seed64 (low half) and -seed64 (high half). */
+    {   xxh_u64 const keyed_lo = XXH_readLE64(input)   ^ (XXH_readLE64(secret)   + seed64);
+        xxh_u64 const keyed_hi = XXH_readLE64(input+8) ^ (XXH_readLE64(secret+8) - seed64);
+        /* 64x64 -> 128-bit multiply of the keyed halves, XOR-folded back to 64 bits. */
+        return XXH3_mul128_fold64(keyed_lo, keyed_hi);
+    }
+}
+
+/* Mid range keys (17 to 128 bytes) are hashed with a Mum-hash variant. */
+XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t
+XXH3_len_17to128_64b(const xxh_u8* XXH_RESTRICT input, size_t len,
+                     const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
+                     XXH64_hash_t seed)
+{
+    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
+    XXH_ASSERT(16 < len && len <= 128);
+
+    {   xxh_u64 sum = len * XXH_PRIME64_1;
+#if XXH_SIZE_OPT >= 1
+        /* Compact loop form: less code, marginally slower. */
+        unsigned int pair = (unsigned int)(len - 1) / 32;
+        do {
+            sum += XXH3_mix16B(input+16 * pair, secret+32*pair, seed);
+            sum += XXH3_mix16B(input+len-16*(pair+1), secret+32*pair+16, seed);
+        } while (pair-- != 0);
+#else
+        if (len > 96) {   /* fourth pair of 16-byte blocks, counted from both ends */
+            sum += XXH3_mix16B(input+48, secret+96, seed);
+            sum += XXH3_mix16B(input+len-64, secret+112, seed);
+        }
+        if (len > 64) {   /* third pair */
+            sum += XXH3_mix16B(input+32, secret+64, seed);
+            sum += XXH3_mix16B(input+len-48, secret+80, seed);
+        }
+        if (len > 32) {   /* second pair */
+            sum += XXH3_mix16B(input+16, secret+32, seed);
+            sum += XXH3_mix16B(input+len-32, secret+48, seed);
+        }
+        sum += XXH3_mix16B(input+0, secret+0, seed);   /* first and last 16 bytes, always mixed */
+        sum += XXH3_mix16B(input+len-16, secret+16, seed);
+#endif
+        return XXH3_avalanche(sum);
+    }
+}
+
+/*!
+ * @brief Maximum size of "short" key in bytes (upper bound accepted by XXH3_len_129to240_64b()).
+ */
+#define XXH3_MIDSIZE_MAX 240
+
+XXH_NO_INLINE XXH_PUREF XXH64_hash_t   /* hashes inputs of 129 to XXH3_MIDSIZE_MAX bytes */
+XXH3_len_129to240_64b(const xxh_u8* XXH_RESTRICT input, size_t len,
+                      const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
+                      XXH64_hash_t seed)
+{
+    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
+    XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX);
+
+    #define XXH3_MIDSIZE_STARTOFFSET 3   /* extra secret offset used by rounds 8 and up */
+    #define XXH3_MIDSIZE_LASTOFFSET  17  /* places the secret used for the final 16 input bytes */
+
+    {   xxh_u64 acc = len * XXH_PRIME64_1;
+        xxh_u64 acc_end;
+        unsigned int const nbRounds = (unsigned int)len / 16;   /* number of whole 16-byte blocks */
+        unsigned int i;
+        XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX);
+        for (i=0; i<8; i++) {   /* first 128 bytes: eight rounds accumulated into acc */
+            acc += XXH3_mix16B(input+(16*i), secret+(16*i), seed);
+        }
+        /* last bytes: the final 16 bytes of input start the second accumulator */
+        acc_end = XXH3_mix16B(input + len - 16, secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET, seed);
+        XXH_ASSERT(nbRounds >= 8);
+        acc = XXH3_avalanche(acc);
+#if defined(__clang__)                                /* Clang */ \
+    && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */ \
+    && !defined(XXH_ENABLE_AUTOVECTORIZE)             /* Define to disable */
+        /*
+         * UGLY HACK:
+         * Clang for ARMv7-A tries to vectorize this loop, similar to GCC x86.
+         * In everywhere else, it uses scalar code.
+         *
+         * For 64->128-bit multiplies, even if the NEON was 100% optimal, it
+         * would still be slower than UMAAL (see XXH_mult64to128).
+         *
+         * Unfortunately, Clang doesn't handle the long multiplies properly and
+         * converts them to the nonexistent "vmulq_u64" intrinsic, which is then
+         * scalarized into an ugly mess of VMOV.32 instructions.
+         *
+         * This mess is difficult to avoid without turning autovectorization
+         * off completely, but they are usually relatively minor and/or not
+         * worth it to fix.
+         *
+         * This loop is the easiest to fix, as unlike XXH32, this pragma
+         * _actually works_ because it is a loop vectorization instead of an
+         * SLP vectorization.
+         */
+        #pragma clang loop vectorize(disable)
+#endif
+        for (i=8 ; i < nbRounds; i++) {   /* remaining whole blocks go into acc_end */
+            /*
+             * Prevents clang from unrolling the acc loop and interleaving with this one.
+             */
+            XXH_COMPILER_GUARD(acc);
+            acc_end += XXH3_mix16B(input+(16*i), secret+(16*(i-8)) + XXH3_MIDSIZE_STARTOFFSET, seed);
+        }
+        return XXH3_avalanche(acc + acc_end);
+    }
+}
+
+
+/* =======     Long Keys     ======= */
+
+#define XXH_STRIPE_LEN 64   /* nb of input bytes consumed by one accumulation step */
+#define XXH_SECRET_CONSUME_RATE 8   /* nb of secret bytes consumed at each accumulation */
+#define XXH_ACC_NB (XXH_STRIPE_LEN / sizeof(xxh_u64))   /* nb of xxh_u64 values that fit in one stripe */
+
+#ifdef XXH_OLD_NAMES   /* unprefixed aliases, only provided on request */
+#  define STRIPE_LEN XXH_STRIPE_LEN
+#  define ACC_NB XXH_ACC_NB
+#endif
+
+#ifndef XXH_PREFETCH_DIST   /* byte offset added to the stripe pointer given to XXH_PREFETCH(); may be predefined */
+#  ifdef __clang__
+#    define XXH_PREFETCH_DIST 320
+#  else
+#    if (XXH_VECTOR == XXH_AVX512)
+#      define XXH_PREFETCH_DIST 512
+#    else
+#      define XXH_PREFETCH_DIST 384
+#    endif
+#  endif  /* __clang__ */
+#endif  /* XXH_PREFETCH_DIST */
+
+/*
+ * This macro generates an XXH3_accumulate() function.
+ * Its single argument selects the name suffix; any inline or target attribute
+ * is written in front of the macro invocation.
+ *
+ * The name of this symbol is XXH3_accumulate_<name>() and it calls
+ * XXH3_accumulate_512_<name>() once per stripe.
+ * It may be useful to hand implement this function if the compiler fails to
+ * optimize the inline function.
+ */
+#define XXH3_ACCUMULATE_TEMPLATE(name)                      \
+void                                                        \
+XXH3_accumulate_##name(xxh_u64* XXH_RESTRICT acc,           \
+                       const xxh_u8* XXH_RESTRICT input,    \
+                       const xxh_u8* XXH_RESTRICT secret,   \
+                       size_t nbStripes)                    \
+{                                                           \
+    size_t n;                                               \
+    for (n = 0; n < nbStripes; n++ ) {                      \
+        const xxh_u8* const in = input + n*XXH_STRIPE_LEN;  \
+        XXH_PREFETCH(in + XXH_PREFETCH_DIST);               \
+        XXH3_accumulate_512_##name(                         \
+                 acc,                                       \
+                 in,                                        \
+                 secret + n*XXH_SECRET_CONSUME_RATE);       \
+    }                                                       \
+}
+
+
+XXH_FORCE_INLINE void XXH_writeLE64(void* dst, xxh_u64 v64)
+{
+    xxh_u64 const le = XXH_CPU_LITTLE_ENDIAN ? v64 : XXH_swap64(v64);  /* value in little-endian byte order */
+    XXH_memcpy(dst, &le, sizeof(le));
+}
+
+/* Several intrinsic functions below are supposed to accept __int64 as argument,
+ * as documented in https://software.intel.com/sites/landingpage/IntrinsicsGuide/ .
+ * However, several environments do not define the __int64 type,
+ * so xxh_i64 is defined here as a signed 64-bit stand-in.
+ */
+#if !defined (__VMS) \
+  && (defined (__cplusplus) \
+  || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
+    typedef int64_t xxh_i64;
+#else
+    /* the following type must have a width of 64-bit */
+    typedef long long xxh_i64;
+#endif
+
+
+/*
+ * XXH3_accumulate_512 is the tightest loop for long inputs, and it is the most optimized.
+ *
+ * It is a hardened version of UMAC, based off of FARSH's implementation.
+ *
+ * This was chosen because it adapts quite well to 32-bit, 64-bit, and SIMD
+ * implementations, and it is ridiculously fast.
+ *
+ * We harden it by mixing the original input to the accumulators as well as the product.
+ *
+ * This means that in the (relatively likely) case of a multiply by zero, the
+ * original input is preserved.
+ *
+ * On 128-bit inputs, we swap 64-bit pairs when we add the input to improve
+ * cross-pollination, as otherwise the upper and lower halves would be
+ * essentially independent.
+ *
+ * This doesn't matter on 64-bit hashes since they all get merged together in
+ * the end, so we skip the extra step.
+ *
+ * Both XXH3_64bits and XXH3_128bits use this subroutine.
+ */
+
+#if (XXH_VECTOR == XXH_AVX512) \
+     || (defined(XXH_DISPATCH_AVX512) && XXH_DISPATCH_AVX512 != 0)
+
+#ifndef XXH_TARGET_AVX512
+# define XXH_TARGET_AVX512  /* defaults to empty: disable attribute target on the AVX512 functions */
+#endif
+
+XXH_FORCE_INLINE XXH_TARGET_AVX512 void   /* AVX512 step: folds one XXH_STRIPE_LEN-byte stripe into acc */
+XXH3_accumulate_512_avx512(void* XXH_RESTRICT acc,
+                     const void* XXH_RESTRICT input,
+                     const void* XXH_RESTRICT secret)
+{
+    __m512i* const xacc = (__m512i *) acc;
+    XXH_ASSERT((((size_t)acc) & 63) == 0);   /* acc must be 64-byte aligned */
+    XXH_STATIC_ASSERT(XXH_STRIPE_LEN == sizeof(__m512i));   /* a single vector covers the whole stripe */
+
+    {
+        /* data_vec    = input[0]; (unaligned load) */
+        __m512i const data_vec    = _mm512_loadu_si512   (input);
+        /* key_vec     = secret[0]; (unaligned load) */
+        __m512i const key_vec     = _mm512_loadu_si512   (secret);
+        /* data_key    = data_vec ^ key_vec; */
+        __m512i const data_key    = _mm512_xor_si512     (data_vec, key_vec);
+        /* data_key_lo = data_key >> 32; (upper 32 bits of each 64-bit lane, moved down) */
+        __m512i const data_key_lo = _mm512_srli_epi64 (data_key, 32);
+        /* product     = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */
+        __m512i const product     = _mm512_mul_epu32     (data_key, data_key_lo);
+        /* xacc[0] += swap(data_vec); (neighbouring 64-bit lanes exchanged) */
+        __m512i const data_swap = _mm512_shuffle_epi32(data_vec, (_MM_PERM_ENUM)_MM_SHUFFLE(1, 0, 3, 2));
+        __m512i const sum       = _mm512_add_epi64(*xacc, data_swap);
+        /* xacc[0] += product; */
+        *xacc = _mm512_add_epi64(product, sum);
+    }
+}
+XXH_FORCE_INLINE XXH_TARGET_AVX512 XXH3_ACCUMULATE_TEMPLATE(avx512)   /* defines XXH3_accumulate_avx512() */
+
+/*
+ * XXH3_scrambleAcc: Scrambles the accumulators to improve mixing.
+ *
+ * Multiplication isn't perfect, as explained by Google in HighwayHash:
+ *
+ *  // Multiplication mixes/scrambles bytes 0-7 of the 64-bit result to
+ *  // varying degrees. In descending order of goodness, bytes
+ *  // 3 4 2 5 1 6 0 7 have quality 228 224 164 160 100 96 36 32.
+ *  // As expected, the upper and lower bytes are much worse.
+ *
+ * Source: https://github.com/google/highwayhash/blob/0aaf66b/highwayhash/hh_avx2.h#L291
+ *
+ * Since our algorithm uses a pseudorandom secret to add some variance into the
+ * mix, we don't need to (or want to) mix as often or as much as HighwayHash does.
+ *
+ * This isn't as tight as XXH3_accumulate, but still written in SIMD to avoid
+ * extraction.
+ *
+ * Both XXH3_64bits and XXH3_128bits use this subroutine.
+ */
+
+XXH_FORCE_INLINE XXH_TARGET_AVX512 void   /* AVX512 scramble of the whole accumulator, in place */
+XXH3_scrambleAcc_avx512(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
+{
+    XXH_ASSERT((((size_t)acc) & 63) == 0);   /* acc must be 64-byte aligned */
+    XXH_STATIC_ASSERT(XXH_STRIPE_LEN == sizeof(__m512i));
+    {   __m512i* const xacc = (__m512i*) acc;
+        const __m512i prime32 = _mm512_set1_epi32((int)XXH_PRIME32_1);
+
+        /* xacc[0] ^= (xacc[0] >> 47) */
+        __m512i const acc_vec     = *xacc;
+        __m512i const shifted     = _mm512_srli_epi64    (acc_vec, 47);
+        /* xacc[0] ^= secret; (both XORs are done by the single ternary-logic op below) */
+        __m512i const key_vec     = _mm512_loadu_si512   (secret);
+        __m512i const data_key    = _mm512_ternarylogic_epi32(key_vec, acc_vec, shifted, 0x96 /* key_vec ^ acc_vec ^ shifted */);
+
+        /* xacc[0] *= XXH_PRIME32_1; (64-bit product assembled from two 32x32 multiplies) */
+        __m512i const data_key_hi = _mm512_srli_epi64 (data_key, 32);
+        __m512i const prod_lo     = _mm512_mul_epu32     (data_key, prime32);
+        __m512i const prod_hi     = _mm512_mul_epu32     (data_key_hi, prime32);
+        *xacc = _mm512_add_epi64(prod_lo, _mm512_slli_epi64(prod_hi, 32));
+    }
+}
+
+XXH_FORCE_INLINE XXH_TARGET_AVX512 void   /* fills customSecret with XXH3_kSecret offset by +/- seed64 */
+XXH3_initCustomSecret_avx512(void* XXH_RESTRICT customSecret, xxh_u64 seed64)
+{
+    XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 63) == 0);
+    XXH_STATIC_ASSERT(XXH_SEC_ALIGN == 64);
+    XXH_ASSERT(((size_t)customSecret & 63) == 0);   /* destination must be 64-byte aligned */
+    (void)(&XXH_writeLE64);   /* references XXH_writeLE64 without calling it -- presumably silences an unused-function warning */
+    {   int const nbRounds = XXH_SECRET_DEFAULT_SIZE / sizeof(__m512i);
+        __m512i const seed_pos = _mm512_set1_epi64((xxh_i64)seed64);
+        __m512i const seed     = _mm512_mask_sub_epi64(seed_pos, 0xAA, _mm512_set1_epi8(0), seed_pos);   /* lanes selected by mask 0xAA hold 0 - seed64, the others keep +seed64 */
+
+        const __m512i* const src  = (const __m512i*) ((const void*) XXH3_kSecret);
+              __m512i* const dest = (      __m512i*) customSecret;
+        int i;
+        XXH_ASSERT(((size_t)src & 63) == 0); /* control alignment */
+        XXH_ASSERT(((size_t)dest & 63) == 0);
+        for (i=0; i < nbRounds; ++i) {   /* dest[i] = default secret + seed vector, 64 bytes per round */
+            dest[i] = _mm512_add_epi64(_mm512_load_si512(src + i), seed);
+    }   }
+}
+
+#endif
+
+#if (XXH_VECTOR == XXH_AVX2) \
+    || (defined(XXH_DISPATCH_AVX2) && XXH_DISPATCH_AVX2 != 0)
+
+#ifndef XXH_TARGET_AVX2
+# define XXH_TARGET_AVX2  /* defaults to empty: disable attribute target on the AVX2 functions */
+#endif
+/* AVX2 kernel: mixes one XXH_STRIPE_LEN-byte stripe of `input` with `secret` into the accumulators at `acc`. */
+XXH_FORCE_INLINE XXH_TARGET_AVX2 void
+XXH3_accumulate_512_avx2( void* XXH_RESTRICT acc,
+                    const void* XXH_RESTRICT input,
+                    const void* XXH_RESTRICT secret)
+{
+    XXH_ASSERT((((size_t)acc) & 31) == 0);
+    {   __m256i* const xacc    =       (__m256i *) acc;
+        /* input may be unaligned (it is read with loadu). The cast is only for pointer
+         * arithmetic and because _mm256_loadu_si256 takes a const __m256i * argument. */
+        const         __m256i* const xinput  = (const __m256i *) input;
+        /* secret may be unaligned as well; the cast is there for the same reason:
+         * _mm256_loadu_si256 takes a const __m256i * argument. */
+        const         __m256i* const xsecret = (const __m256i *) secret;
+
+        size_t i;
+        for (i=0; i < XXH_STRIPE_LEN/sizeof(__m256i); i++) {
+            /* data_vec    = xinput[i]; */
+            __m256i const data_vec    = _mm256_loadu_si256    (xinput+i);
+            /* key_vec     = xsecret[i]; */
+            __m256i const key_vec     = _mm256_loadu_si256   (xsecret+i);
+            /* data_key    = data_vec ^ key_vec; */
+            __m256i const data_key    = _mm256_xor_si256     (data_vec, key_vec);
+            /* data_key_lo = data_key >> 32;  (upper 32 bits of each 64-bit lane, moved down) */
+            __m256i const data_key_lo = _mm256_srli_epi64 (data_key, 32);
+            /* product     = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff);  32x32->64 per lane */
+            __m256i const product     = _mm256_mul_epu32     (data_key, data_key_lo);
+            /* xacc[i] += swap(data_vec);  swap exchanges the two 64-bit words of each 128-bit half */
+            __m256i const data_swap = _mm256_shuffle_epi32(data_vec, _MM_SHUFFLE(1, 0, 3, 2));
+            __m256i const sum       = _mm256_add_epi64(xacc[i], data_swap);
+            /* xacc[i] += product; */
+            xacc[i] = _mm256_add_epi64(product, sum);
+    }   }
+}
+XXH_FORCE_INLINE XXH_TARGET_AVX2 XXH3_ACCUMULATE_TEMPLATE(avx2) /* expected to define XXH3_accumulate_avx2(); the macro is defined outside this view */
+/* AVX2 scramble: each 64-bit lane x of acc becomes ((x ^ (x >> 47)) ^ secret) * XXH_PRIME32_1, truncated to 64 bits. */
+XXH_FORCE_INLINE XXH_TARGET_AVX2 void
+XXH3_scrambleAcc_avx2(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
+{
+    XXH_ASSERT((((size_t)acc) & 31) == 0);
+    {   __m256i* const xacc = (__m256i*) acc;
+        /* secret may be unaligned (it is read with loadu). The cast is only for pointer
+         * arithmetic and because _mm256_loadu_si256 takes a const __m256i * argument. */
+        const         __m256i* const xsecret = (const __m256i *) secret;
+        const __m256i prime32 = _mm256_set1_epi32((int)XXH_PRIME32_1);
+
+        size_t i;
+        for (i=0; i < XXH_STRIPE_LEN/sizeof(__m256i); i++) {
+            /* xacc[i] ^= (xacc[i] >> 47) */
+            __m256i const acc_vec     = xacc[i];
+            __m256i const shifted     = _mm256_srli_epi64    (acc_vec, 47);
+            __m256i const data_vec    = _mm256_xor_si256     (acc_vec, shifted);
+            /* xacc[i] ^= xsecret[i]; */
+            __m256i const key_vec     = _mm256_loadu_si256   (xsecret+i);
+            __m256i const data_key    = _mm256_xor_si256     (data_vec, key_vec);
+
+            /* xacc[i] *= XXH_PRIME32_1;  built from two 32x32->64 products: lo*p + ((hi*p) << 32) */
+            __m256i const data_key_hi = _mm256_srli_epi64 (data_key, 32);
+            __m256i const prod_lo     = _mm256_mul_epu32     (data_key, prime32);
+            __m256i const prod_hi     = _mm256_mul_epu32     (data_key_hi, prime32);
+            xacc[i] = _mm256_add_epi64(prod_lo, _mm256_slli_epi64(prod_hi, 32));
+        }
+    }
+}
+/* AVX2: writes the seeded secret, customSecret = XXH3_kSecret with +seed64 / -seed64 added to alternating 64-bit words. */
+XXH_FORCE_INLINE XXH_TARGET_AVX2 void XXH3_initCustomSecret_avx2(void* XXH_RESTRICT customSecret, xxh_u64 seed64)
+{
+    XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 31) == 0);
+    XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE / sizeof(__m256i)) == 6);
+    XXH_STATIC_ASSERT(XXH_SEC_ALIGN <= 64);
+    (void)(&XXH_writeLE64);
+    XXH_PREFETCH(customSecret);
+    {   __m256i const seed = _mm256_set_epi64x((xxh_i64)(0U - seed64), (xxh_i64)seed64, (xxh_i64)(0U - seed64), (xxh_i64)seed64);
+        /* seed above, lowest 64-bit word first: { seed64, -seed64, seed64, -seed64 } */
+        const __m256i* const src  = (const __m256i*) ((const void*) XXH3_kSecret);
+              __m256i*       dest = (      __m256i*) customSecret;
+
+#       if defined(__GNUC__) || defined(__clang__)
+        /*
+         * On GCC & Clang, marking 'dest' as modified makes the compiler:
+         *   - not extract the secret from sse registers in the internal loop
+         *   - use less common registers, and avoid pushing these registers to the stack
+         */
+        XXH_COMPILER_GUARD(dest);
+#       endif
+        XXH_ASSERT(((size_t)src & 31) == 0); /* both pointers must be 32-byte aligned for the aligned load/store below */
+        XXH_ASSERT(((size_t)dest & 31) == 0);
+
+        /* Unrolled by hand (6 vectors, see the static assert above): GCC -O2 does not unroll this itself */
+        dest[0] = _mm256_add_epi64(_mm256_load_si256(src+0), seed);
+        dest[1] = _mm256_add_epi64(_mm256_load_si256(src+1), seed);
+        dest[2] = _mm256_add_epi64(_mm256_load_si256(src+2), seed);
+        dest[3] = _mm256_add_epi64(_mm256_load_si256(src+3), seed);
+        dest[4] = _mm256_add_epi64(_mm256_load_si256(src+4), seed);
+        dest[5] = _mm256_add_epi64(_mm256_load_si256(src+5), seed);
+    }
+}
+
+#endif
+
+/* x86dispatch always generates SSE2 */
+#if (XXH_VECTOR == XXH_SSE2) || defined(XXH_X86DISPATCH)
+
+#ifndef XXH_TARGET_SSE2
+# define XXH_TARGET_SSE2  /* no target attribute configured: expands to nothing */
+#endif
+/* SSE2 kernel: mixes one XXH_STRIPE_LEN-byte stripe of `input` with `secret` into the accumulators at `acc`. */
+XXH_FORCE_INLINE XXH_TARGET_SSE2 void
+XXH3_accumulate_512_sse2( void* XXH_RESTRICT acc,
+                    const void* XXH_RESTRICT input,
+                    const void* XXH_RESTRICT secret)
+{
+    /* Same per-lane arithmetic as the AVX2 version, on 128-bit vectors (so twice as many iterations). */
+    XXH_ASSERT((((size_t)acc) & 15) == 0);
+    {   __m128i* const xacc    =       (__m128i *) acc;
+        /* input may be unaligned (it is read with loadu). The cast is only for pointer
+         * arithmetic and because _mm_loadu_si128 takes a const __m128i * argument. */
+        const         __m128i* const xinput  = (const __m128i *) input;
+        /* secret may be unaligned as well; the cast is there for the same reason:
+         * _mm_loadu_si128 takes a const __m128i * argument. */
+        const         __m128i* const xsecret = (const __m128i *) secret;
+
+        size_t i;
+        for (i=0; i < XXH_STRIPE_LEN/sizeof(__m128i); i++) {
+            /* data_vec    = xinput[i]; */
+            __m128i const data_vec    = _mm_loadu_si128   (xinput+i);
+            /* key_vec     = xsecret[i]; */
+            __m128i const key_vec     = _mm_loadu_si128   (xsecret+i);
+            /* data_key    = data_vec ^ key_vec; */
+            __m128i const data_key    = _mm_xor_si128     (data_vec, key_vec);
+            /* data_key_lo = data_key >> 32;  done as a 32-bit shuffle: only the low dword of each lane feeds the multiply */
+            __m128i const data_key_lo = _mm_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1));
+            /* product     = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff);  32x32->64 per lane */
+            __m128i const product     = _mm_mul_epu32     (data_key, data_key_lo);
+            /* xacc[i] += swap(data_vec);  swap exchanges the two 64-bit words of the vector */
+            __m128i const data_swap = _mm_shuffle_epi32(data_vec, _MM_SHUFFLE(1,0,3,2));
+            __m128i const sum       = _mm_add_epi64(xacc[i], data_swap);
+            /* xacc[i] += product; */
+            xacc[i] = _mm_add_epi64(product, sum);
+    }   }
+}
+XXH_FORCE_INLINE XXH_TARGET_SSE2 XXH3_ACCUMULATE_TEMPLATE(sse2) /* expected to define XXH3_accumulate_sse2(); the macro is defined outside this view */
+/* SSE2 scramble: each 64-bit lane x of acc becomes ((x ^ (x >> 47)) ^ secret) * XXH_PRIME32_1, truncated to 64 bits. */
+XXH_FORCE_INLINE XXH_TARGET_SSE2 void
+XXH3_scrambleAcc_sse2(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
+{
+    XXH_ASSERT((((size_t)acc) & 15) == 0);
+    {   __m128i* const xacc = (__m128i*) acc;
+        /* secret may be unaligned (it is read with loadu). The cast is only for pointer
+         * arithmetic and because _mm_loadu_si128 takes a const __m128i * argument. */
+        const         __m128i* const xsecret = (const __m128i *) secret;
+        const __m128i prime32 = _mm_set1_epi32((int)XXH_PRIME32_1);
+
+        size_t i;
+        for (i=0; i < XXH_STRIPE_LEN/sizeof(__m128i); i++) {
+            /* xacc[i] ^= (xacc[i] >> 47) */
+            __m128i const acc_vec     = xacc[i];
+            __m128i const shifted     = _mm_srli_epi64    (acc_vec, 47);
+            __m128i const data_vec    = _mm_xor_si128     (acc_vec, shifted);
+            /* xacc[i] ^= xsecret[i]; */
+            __m128i const key_vec     = _mm_loadu_si128   (xsecret+i);
+            __m128i const data_key    = _mm_xor_si128     (data_vec, key_vec);
+
+            /* xacc[i] *= XXH_PRIME32_1;  built from two 32x32->64 products: lo*p + ((hi*p) << 32) */
+            __m128i const data_key_hi = _mm_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1));
+            __m128i const prod_lo     = _mm_mul_epu32     (data_key, prime32);
+            __m128i const prod_hi     = _mm_mul_epu32     (data_key_hi, prime32);
+            xacc[i] = _mm_add_epi64(prod_lo, _mm_slli_epi64(prod_hi, 32));
+        }
+    }
+}
+/* SSE2: writes the seeded secret, customSecret = XXH3_kSecret with +seed64 / -seed64 added to alternating 64-bit words. */
+XXH_FORCE_INLINE XXH_TARGET_SSE2 void XXH3_initCustomSecret_sse2(void* XXH_RESTRICT customSecret, xxh_u64 seed64)
+{
+    XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0);
+    (void)(&XXH_writeLE64);
+    {   int const nbRounds = XXH_SECRET_DEFAULT_SIZE / sizeof(__m128i);
+
+#       if defined(_MSC_VER) && defined(_M_IX86) && _MSC_VER < 1900
+        /* MSVC 32bit mode does not support _mm_set_epi64x before 2015: build the same vector through memory */
+        XXH_ALIGN(16) const xxh_i64 seed64x2[2] = { (xxh_i64)seed64, (xxh_i64)(0U - seed64) };
+        __m128i const seed = _mm_load_si128((__m128i const*)seed64x2);
+#       else
+        __m128i const seed = _mm_set_epi64x((xxh_i64)(0U - seed64), (xxh_i64)seed64);
+#       endif
+        int i;
+
+        const void* const src16 = XXH3_kSecret;
+        __m128i* dst16 = (__m128i*) customSecret;
+#       if defined(__GNUC__) || defined(__clang__)
+        /*
+         * On GCC & Clang, marking 'dst16' as modified makes the compiler:
+         *   - not extract the secret from sse registers in the internal loop
+         *   - use less common registers, and avoid pushing these registers to the stack
+         */
+        XXH_COMPILER_GUARD(dst16);
+#       endif
+        XXH_ASSERT(((size_t)src16 & 15) == 0); /* both pointers must be 16-byte aligned for the aligned load/store below */
+        XXH_ASSERT(((size_t)dst16 & 15) == 0);
+
+        for (i=0; i < nbRounds; ++i) {
+            dst16[i] = _mm_add_epi64(_mm_load_si128((const __m128i *)src16+i), seed);
+    }   }
+}
+
+#endif
+
+#if (XXH_VECTOR == XXH_NEON)
+
+/* Forward declarations: the NEON routines below call these scalar helpers, which are defined later in the file. */
+XXH_FORCE_INLINE void
+XXH3_scalarRound(void* XXH_RESTRICT acc, void const* XXH_RESTRICT input,
+                 void const* XXH_RESTRICT secret, size_t lane);
+
+XXH_FORCE_INLINE void
+XXH3_scalarScrambleRound(void* XXH_RESTRICT acc,
+                         void const* XXH_RESTRICT secret, size_t lane);
+
+/*!
+ * @internal
+ * @brief Per-stripe accumulate step of the bulk loop for NEON and WASM SIMD128.
+ *
+ * The NEON code path is actually partially scalar when running on AArch64. This
+ * is to optimize the pipelining and can have up to 15% speedup depending on the
+ * CPU, and it also mitigates some GCC codegen issues.
+ *
+ * @see XXH3_NEON_LANES for configuring this and details about this optimization.
+ *
+ * NEON's 32-bit to 64-bit long multiply takes a half vector of 32-bit
+ * integers instead of the other platforms which mask full 64-bit vectors,
+ * so the setup is more complicated than just shifting right.
+ *
+ * Additionally, there is an optimization for 4 lanes at once noted below.
+ *
+ * Since, as stated, the most optimal amount of lanes for Cortexes is 6,
+ * there needs to be *three* versions of the accumulate operation: 4 NEON lanes
+ * at once, 2 NEON lanes at once, and scalar for the remaining 2 lanes.
+ *
+ * WASM's SIMD128 uses SIMDe's arm_neon.h polyfill because the intrinsics overlap
+ * nearly perfectly.
+ * @param acc accumulators, 16-byte aligned; @param input stripe data; @param secret key bytes.
+ */
+XXH_FORCE_INLINE void
+XXH3_accumulate_512_neon( void* XXH_RESTRICT acc,
+                    const void* XXH_RESTRICT input,
+                    const void* XXH_RESTRICT secret)
+{
+    XXH_ASSERT((((size_t)acc) & 15) == 0);
+    XXH_STATIC_ASSERT(XXH3_NEON_LANES > 0 && XXH3_NEON_LANES <= XXH_ACC_NB && XXH3_NEON_LANES % 2 == 0);
+    {   /* GCC for darwin arm64 does not like aliasing here */
+        xxh_aliasing_uint64x2_t* const xacc = (xxh_aliasing_uint64x2_t*) acc;
+        /* We don't use a uint32x4_t pointer because it causes bus errors on ARMv7. */
+        uint8_t const* xinput = (const uint8_t *) input;
+        uint8_t const* xsecret  = (const uint8_t *) secret;
+
+        size_t i;
+#ifdef __wasm_simd128__
+        /*
+         * On WASM SIMD128, Clang emits direct address loads when XXH3_kSecret
+         * is constant propagated, which results in it converting it to this
+         * inside the loop:
+         *
+         *    a = v128.load(XXH3_kSecret +  0 + $secret_offset, offset = 0)
+         *    b = v128.load(XXH3_kSecret + 16 + $secret_offset, offset = 0)
+         *    ...
+         *
+         * This requires a full 32-bit address immediate (and therefore a 6 byte
+         * instruction) as well as an add for each offset.
+         *
+         * Putting an asm guard prevents it from folding (at the cost of losing
+         * the alignment hint), and uses the free offset in `v128.load` instead
+         * of adding secret_offset each time which overall reduces code size by
+         * about a kilobyte and improves performance.
+         */
+        XXH_COMPILER_GUARD(xsecret);
+#endif
+        /* Lanes XXH3_NEON_LANES .. XXH_ACC_NB-1 go through the scalar round. */
+        for (i = XXH3_NEON_LANES; i < XXH_ACC_NB; i++) {
+            XXH3_scalarRound(acc, input, secret, i);
+        }
+        i = 0;
+        /* 4 NEON lanes (two 128-bit vectors) at a time; from here on i counts vectors, not lanes. */
+        for (; i+1 < XXH3_NEON_LANES / 2; i+=2) {
+            /* data_vec = xinput[i]; */
+            uint64x2_t data_vec_1 = XXH_vld1q_u64(xinput  + (i * 16));
+            uint64x2_t data_vec_2 = XXH_vld1q_u64(xinput  + ((i+1) * 16));
+            /* key_vec  = xsecret[i];  */
+            uint64x2_t key_vec_1  = XXH_vld1q_u64(xsecret + (i * 16));
+            uint64x2_t key_vec_2  = XXH_vld1q_u64(xsecret + ((i+1) * 16));
+            /* data_swap = swap(data_vec) */
+            uint64x2_t data_swap_1 = vextq_u64(data_vec_1, data_vec_1, 1);
+            uint64x2_t data_swap_2 = vextq_u64(data_vec_2, data_vec_2, 1);
+            /* data_key = data_vec ^ key_vec; */
+            uint64x2_t data_key_1 = veorq_u64(data_vec_1, key_vec_1);
+            uint64x2_t data_key_2 = veorq_u64(data_vec_2, key_vec_2);
+
+            /*
+             * If we reinterpret the 64x2 vectors as 32x4 vectors, we can use a
+             * de-interleave operation for 4 lanes in 1 step with `vuzpq_u32` to
+             * get one vector with the low 32 bits of each lane, and one vector
+             * with the high 32 bits of each lane.
+             *
+             * The intrinsic returns a double vector because the original ARMv7-a
+             * instruction modified both arguments in place. AArch64 and SIMD128 emit
+             * two instructions from this intrinsic.
+             *
+             *  [ dk11L | dk11H | dk12L | dk12H ] -> [ dk11L | dk12L | dk21L | dk22L ]
+             *  [ dk21L | dk21H | dk22L | dk22H ] -> [ dk11H | dk12H | dk21H | dk22H ]
+             */
+            uint32x4x2_t unzipped = vuzpq_u32(
+                vreinterpretq_u32_u64(data_key_1),
+                vreinterpretq_u32_u64(data_key_2)
+            );
+            /* data_key_lo = data_key & 0xFFFFFFFF */
+            uint32x4_t data_key_lo = unzipped.val[0];
+            /* data_key_hi = data_key >> 32 */
+            uint32x4_t data_key_hi = unzipped.val[1];
+            /*
+             * Then, we can split the vectors horizontally and multiply which, as for most
+             * widening intrinsics, have a variant that works on both high half vectors
+             * for free on AArch64. A similar instruction is available on SIMD128.
+             *
+             * sum = data_swap + (u64x2) data_key_lo * (u64x2) data_key_hi
+             */
+            uint64x2_t sum_1 = XXH_vmlal_low_u32(data_swap_1, data_key_lo, data_key_hi);
+            uint64x2_t sum_2 = XXH_vmlal_high_u32(data_swap_2, data_key_lo, data_key_hi);
+            /*
+             * Clang reorders
+             *    swap += dkl * dkh;  // umlal   swap.2d, dkl.2s, dkh.2s
+             *    acc  += swap;       // add     acc.2d, acc.2d, swap.2d
+             * to
+             *    acc  += swap;       // add     acc.2d, acc.2d, swap.2d
+             *    acc  += dkl * dkh;  // umlal   acc.2d, dkl.2s, dkh.2s
+             *
+             * While it would make sense in theory since the addition is faster,
+             * for reasons likely related to umlal being limited to certain NEON
+             * pipelines, this is worse. A compiler guard fixes this.
+             */
+            XXH_COMPILER_GUARD_CLANG_NEON(sum_1);
+            XXH_COMPILER_GUARD_CLANG_NEON(sum_2);
+            /* xacc[i] = acc_vec + sum; */
+            xacc[i]   = vaddq_u64(xacc[i], sum_1);
+            xacc[i+1] = vaddq_u64(xacc[i+1], sum_2);
+        }
+        /* Operate on the remaining NEON lanes 2 at a time. */
+        for (; i < XXH3_NEON_LANES / 2; i++) {
+            /* data_vec = xinput[i]; */
+            uint64x2_t data_vec = XXH_vld1q_u64(xinput  + (i * 16));
+            /* key_vec  = xsecret[i];  */
+            uint64x2_t key_vec  = XXH_vld1q_u64(xsecret + (i * 16));
+            /* data_swap = swap(data_vec) */
+            uint64x2_t data_swap = vextq_u64(data_vec, data_vec, 1);
+            /* data_key = data_vec ^ key_vec; */
+            uint64x2_t data_key = veorq_u64(data_vec, key_vec);
+            /* For two lanes, just use VMOVN and VSHRN. */
+            /* data_key_lo = data_key & 0xFFFFFFFF; */
+            uint32x2_t data_key_lo = vmovn_u64(data_key);
+            /* data_key_hi = data_key >> 32; */
+            uint32x2_t data_key_hi = vshrn_n_u64(data_key, 32);
+            /* sum = data_swap + (u64x2) data_key_lo * (u64x2) data_key_hi; */
+            uint64x2_t sum = vmlal_u32(data_swap, data_key_lo, data_key_hi);
+            /* Same Clang workaround as before */
+            XXH_COMPILER_GUARD_CLANG_NEON(sum);
+            /* xacc[i] = acc_vec + sum; */
+            xacc[i] = vaddq_u64 (xacc[i], sum);
+        }
+    }
+}
+XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(neon) /* expected to define XXH3_accumulate_neon(); the macro is defined outside this view */
+/* NEON / WASM SIMD128 scramble: each 64-bit lane x of acc becomes ((x ^ (x >> 47)) ^ secret) * XXH_PRIME32_1. */
+XXH_FORCE_INLINE void
+XXH3_scrambleAcc_neon(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
+{
+    XXH_ASSERT((((size_t)acc) & 15) == 0);
+
+    {   xxh_aliasing_uint64x2_t* xacc       = (xxh_aliasing_uint64x2_t*) acc;
+        uint8_t const* xsecret = (uint8_t const*) secret;
+
+        size_t i;
+        /* WASM uses operator overloads and doesn't need these constants. */
+#ifndef __wasm_simd128__
+        /* { prime32_1, prime32_1 } */
+        uint32x2_t const kPrimeLo = vdup_n_u32(XXH_PRIME32_1);
+        /* { 0, prime32_1, 0, prime32_1 } */
+        uint32x4_t const kPrimeHi = vreinterpretq_u32_u64(vdupq_n_u64((xxh_u64)XXH_PRIME32_1 << 32));
+#endif
+
+        /* Lanes XXH3_NEON_LANES .. XXH_ACC_NB-1 are scrambled by the scalar routine. */
+        for (i = XXH3_NEON_LANES; i < XXH_ACC_NB; i++) {
+            XXH3_scalarScrambleRound(acc, secret, i);
+        }
+        for (i=0; i < XXH3_NEON_LANES / 2; i++) {
+            /* xacc[i] ^= (xacc[i] >> 47); */
+            uint64x2_t acc_vec  = xacc[i];
+            uint64x2_t shifted  = vshrq_n_u64(acc_vec, 47);
+            uint64x2_t data_vec = veorq_u64(acc_vec, shifted);
+
+            /* xacc[i] ^= xsecret[i]; */
+            uint64x2_t key_vec  = XXH_vld1q_u64(xsecret + (i * 16));
+            uint64x2_t data_key = veorq_u64(data_vec, key_vec);
+            /* xacc[i] *= XXH_PRIME32_1 */
+#ifdef __wasm_simd128__
+            /* SIMD128 has multiply by u64x2, use it instead of expanding and scalarizing */
+            xacc[i] = data_key * XXH_PRIME32_1;
+#else
+            /*
+             * Expanded version with portable NEON intrinsics
+             *
+             *    lo(x) * lo(y) + (hi(x) * lo(y) << 32)
+             *
+             * prod_hi = hi(data_key) * lo(prime) << 32
+             *
+             * Since we only need 32 bits of this multiply a trick can be used, reinterpreting the vector
+             * as a uint32x4_t and multiplying by { 0, prime, 0, prime } to cancel out the unwanted bits
+             * and avoid the shift.
+             */
+            uint32x4_t prod_hi = vmulq_u32 (vreinterpretq_u32_u64(data_key), kPrimeHi);
+            /* Extract low bits for vmlal_u32  */
+            uint32x2_t data_key_lo = vmovn_u64(data_key);
+            /* xacc[i] = prod_hi + lo(data_key) * XXH_PRIME32_1; */
+            xacc[i] = vmlal_u32(vreinterpretq_u64_u32(prod_hi), data_key_lo, kPrimeLo);
+#endif
+        }
+    }
+}
+#endif
+
+#if (XXH_VECTOR == XXH_VSX)
+/* VSX kernel: mixes one XXH_STRIPE_LEN-byte stripe of `input` with `secret` into the accumulators at `acc`. */
+XXH_FORCE_INLINE void
+XXH3_accumulate_512_vsx(  void* XXH_RESTRICT acc,
+                    const void* XXH_RESTRICT input,
+                    const void* XXH_RESTRICT secret)
+{
+    /* acc is presumed aligned (no assert here, unlike the other kernels) */
+    xxh_aliasing_u64x2* const xacc = (xxh_aliasing_u64x2*) acc;
+    xxh_u8 const* const xinput   = (xxh_u8 const*) input;   /* no alignment restriction */
+    xxh_u8 const* const xsecret  = (xxh_u8 const*) secret;    /* no alignment restriction */
+    xxh_u64x2 const v32 = { 32, 32 };
+    size_t i;
+    for (i = 0; i < XXH_STRIPE_LEN / sizeof(xxh_u64x2); i++) {
+        /* data_vec = xinput[i]; */
+        xxh_u64x2 const data_vec = XXH_vec_loadu(xinput + 16*i);
+        /* key_vec = xsecret[i]; */
+        xxh_u64x2 const key_vec  = XXH_vec_loadu(xsecret + 16*i);
+        xxh_u64x2 const data_key = data_vec ^ key_vec;
+        /* shuffled = (data_key << 32) | (data_key >> 32);  i.e. each 64-bit lane rotated by 32 */
+        xxh_u32x4 const shuffled = (xxh_u32x4)vec_rl(data_key, v32);
+        /* product = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)shuffled & 0xFFFFFFFF); */
+        xxh_u64x2 const product  = XXH_vec_mulo((xxh_u32x4)data_key, shuffled);
+        /* acc_vec = xacc[i]; */
+        xxh_u64x2 acc_vec        = xacc[i];
+        acc_vec += product;
+
+        /* swap high and low halves of data_vec and add them in; s390x spells the permute differently */
+#ifdef __s390x__
+        acc_vec += vec_permi(data_vec, data_vec, 2);
+#else
+        acc_vec += vec_xxpermdi(data_vec, data_vec, 2);
+#endif
+        xacc[i] = acc_vec;
+    }
+}
+XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(vsx) /* expected to define XXH3_accumulate_vsx(); the macro is defined outside this view */
+/* VSX scramble: each 64-bit lane x of acc becomes ((x ^ (x >> 47)) ^ secret) * XXH_PRIME32_1. */
+XXH_FORCE_INLINE void
+XXH3_scrambleAcc_vsx(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
+{
+    XXH_ASSERT((((size_t)acc) & 15) == 0);
+
+    {   xxh_aliasing_u64x2* const xacc = (xxh_aliasing_u64x2*) acc;
+        const xxh_u8* const xsecret = (const xxh_u8*) secret;
+        /* constants: per-lane shift counts and the prime replicated into all four 32-bit slots */
+        xxh_u64x2 const v32  = { 32, 32 };
+        xxh_u64x2 const v47 = { 47, 47 };
+        xxh_u32x4 const prime = { XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1 };
+        size_t i;
+        for (i = 0; i < XXH_STRIPE_LEN / sizeof(xxh_u64x2); i++) {
+            /* xacc[i] ^= (xacc[i] >> 47); */
+            xxh_u64x2 const acc_vec  = xacc[i];
+            xxh_u64x2 const data_vec = acc_vec ^ (acc_vec >> v47);
+
+            /* xacc[i] ^= xsecret[i]; */
+            xxh_u64x2 const key_vec  = XXH_vec_loadu(xsecret + 16*i);
+            xxh_u64x2 const data_key = data_vec ^ key_vec;
+
+            /* xacc[i] *= XXH_PRIME32_1 */
+            /* prod_even is shifted up by 32 below, so it is presumably (data_key >> 32) * prime - confirm against XXH_vec_mule */
+            xxh_u64x2 const prod_even  = XXH_vec_mule((xxh_u32x4)data_key, prime);
+            /* prod_odd is used unshifted below, so it is presumably (data_key & 0xFFFFFFFF) * prime - confirm against XXH_vec_mulo */
+            xxh_u64x2 const prod_odd  = XXH_vec_mulo((xxh_u32x4)data_key, prime);
+            xacc[i] = prod_odd + (prod_even << v32);
+    }   }
+}
+
+#endif
+
+#if (XXH_VECTOR == XXH_SVE)
+/* SVE kernel for one stripe; the register layout is chosen from the runtime vector length. ACCRND is defined outside this view. */
+XXH_FORCE_INLINE void
+XXH3_accumulate_512_sve( void* XXH_RESTRICT acc,
+                   const void* XXH_RESTRICT input,
+                   const void* XXH_RESTRICT secret)
+{
+    uint64_t *xacc = (uint64_t *)acc;
+    const uint64_t *xinput = (const uint64_t *)(const void *)input;    /* not used directly below; presumably read by ACCRND */
+    const uint64_t *xsecret = (const uint64_t *)(const void *)secret;  /* not used directly below; presumably read by ACCRND */
+    svuint64_t kSwap = sveor_n_u64_z(svptrue_b64(), svindex_u64(0, 1), 1);  /* index vector {1,0,3,2,...}; presumably ACCRND's lane-swap permutation */
+    uint64_t element_count = svcntd();  /* number of 64-bit elements in one SVE vector */
+    if (element_count >= 8) {  /* one vector (first 8 elements active) holds all 8 accumulators */
+        svbool_t mask = svptrue_pat_b64(SV_VL8);
+        svuint64_t vacc = svld1_u64(mask, xacc);
+        ACCRND(vacc, 0);
+        svst1_u64(mask, xacc, vacc);
+    } else if (element_count == 2) {   /* sve128: four vectors of 2 accumulators each */
+        svbool_t mask = svptrue_pat_b64(SV_VL2);
+        svuint64_t acc0 = svld1_u64(mask, xacc + 0);
+        svuint64_t acc1 = svld1_u64(mask, xacc + 2);
+        svuint64_t acc2 = svld1_u64(mask, xacc + 4);
+        svuint64_t acc3 = svld1_u64(mask, xacc + 6);
+        ACCRND(acc0, 0);
+        ACCRND(acc1, 2);
+        ACCRND(acc2, 4);
+        ACCRND(acc3, 6);
+        svst1_u64(mask, xacc + 0, acc0);
+        svst1_u64(mask, xacc + 2, acc1);
+        svst1_u64(mask, xacc + 4, acc2);
+        svst1_u64(mask, xacc + 6, acc3);
+    } else {  /* remaining vector lengths: two vectors with the first 4 elements active */
+        svbool_t mask = svptrue_pat_b64(SV_VL4);
+        svuint64_t acc0 = svld1_u64(mask, xacc + 0);
+        svuint64_t acc1 = svld1_u64(mask, xacc + 4);
+        ACCRND(acc0, 0);
+        ACCRND(acc1, 4);
+        svst1_u64(mask, xacc + 0, acc0);
+        svst1_u64(mask, xacc + 4, acc1);
+    }
+}
+/* SVE multi-stripe loop: accumulators stay in registers across all nbStripes stripes and are stored back once at the end. */
+XXH_FORCE_INLINE void
+XXH3_accumulate_sve(xxh_u64* XXH_RESTRICT acc,
+               const xxh_u8* XXH_RESTRICT input,
+               const xxh_u8* XXH_RESTRICT secret,
+               size_t nbStripes)
+{
+    if (nbStripes != 0) {  /* the do/while loops below always run at least once */
+        uint64_t *xacc = (uint64_t *)acc;
+        const uint64_t *xinput = (const uint64_t *)(const void *)input;
+        const uint64_t *xsecret = (const uint64_t *)(const void *)secret;
+        svuint64_t kSwap = sveor_n_u64_z(svptrue_b64(), svindex_u64(0, 1), 1);  /* index vector {1,0,3,2,...}; presumably used by ACCRND */
+        uint64_t element_count = svcntd();  /* number of 64-bit elements in one SVE vector */
+        if (element_count >= 8) {
+            svbool_t mask = svptrue_pat_b64(SV_VL8);
+            svuint64_t vacc = svld1_u64(mask, xacc + 0);
+            do {
+                /* svprfd(svbool_t, void *, enum svfprop): prefetch 128 u64 (1024 bytes) ahead of xinput */
+                svprfd(mask, xinput + 128, SV_PLDL1STRM);
+                ACCRND(vacc, 0);
+                xinput += 8;   /* next stripe: 8 x u64 = 64 bytes */
+                xsecret += 1;  /* the secret window advances 8 bytes per stripe */
+                nbStripes--;
+           } while (nbStripes != 0);
+
+           svst1_u64(mask, xacc + 0, vacc);
+        } else if (element_count == 2) { /* sve128: four vectors of 2 accumulators each */
+            svbool_t mask = svptrue_pat_b64(SV_VL2);
+            svuint64_t acc0 = svld1_u64(mask, xacc + 0);
+            svuint64_t acc1 = svld1_u64(mask, xacc + 2);
+            svuint64_t acc2 = svld1_u64(mask, xacc + 4);
+            svuint64_t acc3 = svld1_u64(mask, xacc + 6);
+            do {
+                svprfd(mask, xinput + 128, SV_PLDL1STRM);
+                ACCRND(acc0, 0);
+                ACCRND(acc1, 2);
+                ACCRND(acc2, 4);
+                ACCRND(acc3, 6);
+                xinput += 8;   /* next stripe: 8 x u64 = 64 bytes */
+                xsecret += 1;  /* the secret window advances 8 bytes per stripe */
+                nbStripes--;
+           } while (nbStripes != 0);
+
+           svst1_u64(mask, xacc + 0, acc0);
+           svst1_u64(mask, xacc + 2, acc1);
+           svst1_u64(mask, xacc + 4, acc2);
+           svst1_u64(mask, xacc + 6, acc3);
+        } else {  /* remaining vector lengths: two vectors with the first 4 elements active */
+            svbool_t mask = svptrue_pat_b64(SV_VL4);
+            svuint64_t acc0 = svld1_u64(mask, xacc + 0);
+            svuint64_t acc1 = svld1_u64(mask, xacc + 4);
+            do {
+                svprfd(mask, xinput + 128, SV_PLDL1STRM);
+                ACCRND(acc0, 0);
+                ACCRND(acc1, 4);
+                xinput += 8;   /* next stripe: 8 x u64 = 64 bytes */
+                xsecret += 1;  /* the secret window advances 8 bytes per stripe */
+                nbStripes--;
+           } while (nbStripes != 0);
+
+           svst1_u64(mask, xacc + 0, acc0);
+           svst1_u64(mask, xacc + 4, acc1);
+       }
+    }
+}
+
+#endif
+
+/* scalar variants - universal */
+
+#if defined(__aarch64__) && (defined(__GNUC__) || defined(__clang__))
+/*
+ * AArch64 + GCC/Clang variant of the 32x32->64 multiply-accumulate.
+ *
+ * Left to themselves, both compilers build XXH3_scalarRound() with an excess
+ * mask plus a full 64-bit multiply-add (MADD X-form). Only big Cortex designs
+ * have a full 64-bit multiplier; on the little cores that multiply expands to
+ * 2-3 multiplies in microcode, costing up to 4 latency cycles and 2 stall cycles
+ * in the multiply pipeline. UMADDL, the 32-bit long multiply-add, has no such
+ * penalty and does the mask automatically, so it is requested explicitly here.
+ */
+XXH_FORCE_INLINE xxh_u64
+XXH_mult32to64_add64(xxh_u64 lhs, xxh_u64 rhs, xxh_u64 acc)
+{
+    xxh_u64 sum;
+    /* operand modifiers: %x = 64-bit register, %w = 32-bit register */
+    __asm__("umaddl %x0, %w1, %w2, %x3"
+            : "=r" (sum)
+            : "r" (lhs), "r" (rhs), "r" (acc));
+    return sum;
+}
+#else
+/* Portable variant: XXH_mult32to64() on the low 32 bits of lhs and rhs, then add acc. */
+XXH_FORCE_INLINE xxh_u64
+XXH_mult32to64_add64(xxh_u64 lhs, xxh_u64 rhs, xxh_u64 acc)
+{
+    xxh_u64 const product = XXH_mult32to64((xxh_u32)lhs, (xxh_u32)rhs);
+    return product + acc;
+}
+#endif
+
+/*!
+ * @internal
+ * @brief One lane of the scalar accumulate, see @ref XXH3_accumulate_512_scalar().
+ *
+ * Kept as a standalone function because the NEON path combines NEON lanes with
+ * scalar lanes and calls this directly.
+ */
+XXH_FORCE_INLINE void
+XXH3_scalarRound(void* XXH_RESTRICT acc,
+                 void const* XXH_RESTRICT input,
+                 void const* XXH_RESTRICT secret,
+                 size_t lane)
+{
+    xxh_u64* const lanes          = (xxh_u64*) acc;
+    xxh_u8 const* const dataBytes = (xxh_u8 const*) input;
+    xxh_u8 const* const keyBytes  = (xxh_u8 const*) secret;
+    XXH_ASSERT(lane < XXH_ACC_NB);
+    XXH_ASSERT(((size_t)acc & (XXH_ACC_ALIGN-1)) == 0);
+    {   size_t const offset = lane * 8;   /* every lane covers 8 bytes of input and secret */
+        xxh_u64 const word  = XXH_readLE64(dataBytes + offset);
+        xxh_u64 const keyed = word ^ XXH_readLE64(keyBytes + offset);
+        lanes[lane ^ 1] += word;          /* raw input goes to the other lane of the pair */
+        lanes[lane] = XXH_mult32to64_add64(keyed, keyed >> 32, lanes[lane]);  /* += lo32(keyed) * hi32(keyed) */
+    }
+}
+
+/*!
+ * @internal
+ * @brief Scalar path for one 64 byte block: runs XXH3_scalarRound() on every lane.
+ */
+XXH_FORCE_INLINE void
+XXH3_accumulate_512_scalar(void* XXH_RESTRICT acc,
+                     const void* XXH_RESTRICT input,
+                     const void* XXH_RESTRICT secret)
+{
+    size_t lane;
+    /* Left alone, ARM GCC keeps this loop rolled, which costs 24% on ARMv6. */
+#if defined(__GNUC__) && !defined(__clang__) \
+  && (defined(__arm__) || defined(__thumb2__)) \
+  && defined(__ARM_FEATURE_UNALIGNED) /* no unaligned access just wastes bytes */ \
+  && XXH_SIZE_OPT <= 0
+#  pragma GCC unroll 8
+#endif
+    for (lane = 0; lane < XXH_ACC_NB; ++lane) {
+        XXH3_scalarRound(acc, input, secret, lane);
+    }
+}
+XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(scalar) /* expected to define XXH3_accumulate_scalar(); the macro is defined outside this view */
+
+/*!
+ * @internal
+ * @brief One lane of the scalar scramble, see @ref XXH3_scrambleAcc_scalar().
+ *
+ * Kept as a standalone function because the NEON path combines NEON lanes with
+ * scalar lanes and calls this directly.
+ */
+XXH_FORCE_INLINE void
+XXH3_scalarScrambleRound(void* XXH_RESTRICT acc,
+                         void const* XXH_RESTRICT secret,
+                         size_t lane)
+{
+    xxh_u64* const lanes = (xxh_u64*) acc;                   /* presumed aligned */
+    const xxh_u8* const keyBytes = (const xxh_u8*) secret;   /* no alignment restriction */
+    XXH_ASSERT((((size_t)acc) & (XXH_ACC_ALIGN-1)) == 0);
+    XXH_ASSERT(lane < XXH_ACC_NB);
+    {   xxh_u64 const key    = XXH_readLE64(keyBytes + lane * 8);
+        /* xorshift the accumulator by 47 */
+        xxh_u64 const folded = XXH_xorshift64(lanes[lane], 47);
+        /* mix in this lane's 8 secret bytes */
+        xxh_u64 const keyed  = folded ^ key;
+        /* multiply by the prime; the product wraps to 64 bits */
+        lanes[lane] = keyed * XXH_PRIME32_1;
+    }
+}
+
+/*!
+ * @internal
+ * @brief Scalar scramble: applies XXH3_scalarScrambleRound() to every accumulator lane.
+ */
+XXH_FORCE_INLINE void
+XXH3_scrambleAcc_scalar(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
+{
+    size_t lane;
+    for (lane = 0; lane != XXH_ACC_NB; ++lane) {
+        XXH3_scalarScrambleRound(acc, secret, lane);
+    }
+}
+/* Scalar: writes the seeded secret; per 16-byte round, the low word is kSecret + seed64 and the high word is kSecret - seed64. */
+XXH_FORCE_INLINE void
+XXH3_initCustomSecret_scalar(void* XXH_RESTRICT customSecret, xxh_u64 seed64)
+{
+    /*
+     * We need a separate pointer for the hack below,
+     * which requires a non-const pointer.
+     * Any decent compiler will optimize this out otherwise.
+     */
+    const xxh_u8* kSecretPtr = XXH3_kSecret;
+    XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0);
+
+#if defined(__GNUC__) && defined(__aarch64__)
+    /*
+     * UGLY HACK:
+     * GCC and Clang generate a bunch of MOV/MOVK pairs for aarch64, and they are
+     * placed sequentially, in order, at the top of the unrolled loop.
+     *
+     * While MOVK is great for generating constants (2 cycles for a 64-bit
+     * constant compared to 4 cycles for LDR), it fights for bandwidth with
+     * the arithmetic instructions.
+     *
+     *   I   L   S
+     * MOVK
+     * MOVK
+     * MOVK
+     * MOVK
+     * ADD
+     * SUB      STR
+     *          STR
+     * By forcing loads from memory (as the asm line causes the compiler to assume
+     * that kSecretPtr has been changed), the pipelines are used more
+     * efficiently:
+     *   I   L   S
+     *      LDR
+     *  ADD LDR
+     *  SUB     STR
+     *          STR
+     *
+     * See XXH3_NEON_LANES for details on the pipeline.
+     *
+     * XXH3_64bits_withSeed, len == 256, Snapdragon 835
+     *   without hack: 2654.4 MB/s
+     *   with hack:    3202.9 MB/s
+     */
+    XXH_COMPILER_GUARD(kSecretPtr);
+#endif
+    {   int const nbRounds = XXH_SECRET_DEFAULT_SIZE / 16;
+        int i;
+        for (i=0; i < nbRounds; i++) {
+            /*
+             * The asm hack causes the compiler to assume that kSecretPtr aliases with
+             * customSecret, and on aarch64, this prevented LDP from merging two
+             * loads together for free. Putting the loads together before the stores
+             * properly generates LDP.
+             */
+            xxh_u64 lo = XXH_readLE64(kSecretPtr + 16*i)     + seed64;
+            xxh_u64 hi = XXH_readLE64(kSecretPtr + 16*i + 8) - seed64;
+            XXH_writeLE64((xxh_u8*)customSecret + 16*i,     lo);
+            XXH_writeLE64((xxh_u8*)customSecret + 16*i + 8, hi);
+    }   }
+}
+
+
+/*
+ * Function-pointer types used to pass a selected implementation into the
+ * generic long-input routines.
+ *   XXH3_f_accumulate       : called as (acc, input, secret, nbStripes)
+ *   XXH3_f_scrambleAcc      : called as (acc, secret)
+ *   XXH3_f_initCustomSecret : called as (customSecret, seed)
+ */
+typedef void (*XXH3_f_accumulate)(xxh_u64* XXH_RESTRICT, const xxh_u8* XXH_RESTRICT, const xxh_u8* XXH_RESTRICT, size_t);
+typedef void (*XXH3_f_scrambleAcc)(void* XXH_RESTRICT, const void*);
+typedef void (*XXH3_f_initCustomSecret)(void* XXH_RESTRICT, xxh_u64);
+
+
+/*
+ * Compile-time dispatch: bind the generic names XXH3_accumulate_512,
+ * XXH3_accumulate, XXH3_scrambleAcc and XXH3_initCustomSecret to the
+ * implementation matching XXH_VECTOR.
+ *
+ * Note that not every backend provides all four routines:
+ *  - NEON and VSX reuse the scalar XXH3_initCustomSecret.
+ *  - SVE reuses the scalar XXH3_scrambleAcc and XXH3_initCustomSecret.
+ */
+#if (XXH_VECTOR == XXH_AVX512)
+
+#define XXH3_accumulate_512 XXH3_accumulate_512_avx512
+#define XXH3_accumulate     XXH3_accumulate_avx512
+#define XXH3_scrambleAcc    XXH3_scrambleAcc_avx512
+#define XXH3_initCustomSecret XXH3_initCustomSecret_avx512
+
+#elif (XXH_VECTOR == XXH_AVX2)
+
+#define XXH3_accumulate_512 XXH3_accumulate_512_avx2
+#define XXH3_accumulate     XXH3_accumulate_avx2
+#define XXH3_scrambleAcc    XXH3_scrambleAcc_avx2
+#define XXH3_initCustomSecret XXH3_initCustomSecret_avx2
+
+#elif (XXH_VECTOR == XXH_SSE2)
+
+#define XXH3_accumulate_512 XXH3_accumulate_512_sse2
+#define XXH3_accumulate     XXH3_accumulate_sse2
+#define XXH3_scrambleAcc    XXH3_scrambleAcc_sse2
+#define XXH3_initCustomSecret XXH3_initCustomSecret_sse2
+
+#elif (XXH_VECTOR == XXH_NEON)
+
+#define XXH3_accumulate_512 XXH3_accumulate_512_neon
+#define XXH3_accumulate     XXH3_accumulate_neon
+#define XXH3_scrambleAcc    XXH3_scrambleAcc_neon
+#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
+
+#elif (XXH_VECTOR == XXH_VSX)
+
+#define XXH3_accumulate_512 XXH3_accumulate_512_vsx
+#define XXH3_accumulate     XXH3_accumulate_vsx
+#define XXH3_scrambleAcc    XXH3_scrambleAcc_vsx
+#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
+
+#elif (XXH_VECTOR == XXH_SVE)
+#define XXH3_accumulate_512 XXH3_accumulate_512_sve
+#define XXH3_accumulate     XXH3_accumulate_sve
+#define XXH3_scrambleAcc    XXH3_scrambleAcc_scalar
+#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
+
+#else /* scalar */
+
+#define XXH3_accumulate_512 XXH3_accumulate_512_scalar
+#define XXH3_accumulate     XXH3_accumulate_scalar
+#define XXH3_scrambleAcc    XXH3_scrambleAcc_scalar
+#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
+
+#endif
+
+/* When optimizing for size, always fall back to the scalar secret generator, whatever was selected above. */
+#if XXH_SIZE_OPT >= 1 /* don't do SIMD for initialization */
+#  undef XXH3_initCustomSecret
+#  define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
+#endif
+
+/*
+ * One-shot accumulation loop for long inputs.
+ *
+ * The input is consumed in blocks of nbStripesPerBlock stripes of
+ * XXH_STRIPE_LEN bytes. Every full block is accumulated with f_acc and then
+ * scrambled with f_scramble, using the last XXH_STRIPE_LEN bytes of the secret
+ * as scramble key. The trailing partial block is accumulated without a
+ * scramble, and finally the last XXH_STRIPE_LEN bytes of the input are
+ * accumulated once more with a secret offset of XXH_SECRET_LASTACC_START
+ * bytes before the end-of-secret stripe.
+ *
+ * acc        : accumulator lanes, updated in place
+ * input, len : data to hash; len must be > XXH_STRIPE_LEN (asserted)
+ * secret     : secret of secretSize bytes, secretSize >= XXH3_SECRET_SIZE_MIN (asserted)
+ * f_acc      : XXH3_accumulate implementation to use
+ * f_scramble : XXH3_scrambleAcc implementation to use
+ */
+XXH_FORCE_INLINE void
+XXH3_hashLong_internal_loop(xxh_u64* XXH_RESTRICT acc,
+                      const xxh_u8* XXH_RESTRICT input, size_t len,
+                      const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
+                            XXH3_f_accumulate f_acc,
+                            XXH3_f_scrambleAcc f_scramble)
+{
+    size_t const nbStripesPerBlock = (secretSize - XXH_STRIPE_LEN) / XXH_SECRET_CONSUME_RATE;
+    size_t const block_len = XXH_STRIPE_LEN * nbStripesPerBlock;
+    /* (len - 1): an input that is an exact multiple of block_len keeps its final block for the partial-block path below */
+    size_t const nb_blocks = (len - 1) / block_len;
+
+    size_t n;
+
+    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
+
+    /* full blocks: accumulate every stripe, then scramble */
+    for (n = 0; n < nb_blocks; n++) {
+        f_acc(acc, input + n*block_len, secret, nbStripesPerBlock);
+        f_scramble(acc, secret + secretSize - XXH_STRIPE_LEN);
+    }
+
+    /* last partial block */
+    XXH_ASSERT(len > XXH_STRIPE_LEN);
+    {   size_t const nbStripes = ((len - 1) - (block_len * nb_blocks)) / XXH_STRIPE_LEN;
+        XXH_ASSERT(nbStripes <= (secretSize / XXH_SECRET_CONSUME_RATE));
+        f_acc(acc, input + nb_blocks*block_len, secret, nbStripes);
+
+        /* last stripe */
+        {   const xxh_u8* const p = input + len - XXH_STRIPE_LEN;
+#define XXH_SECRET_LASTACC_START 7  /* not aligned on 8, last secret is different from acc & scrambler */
+            XXH3_accumulate_512(acc, p, secret + secretSize - XXH_STRIPE_LEN - XXH_SECRET_LASTACC_START);
+    }   }
+}
+
+/*
+ * Folds two adjacent accumulator lanes into a single 64-bit value:
+ * each lane is XORed with 8 bytes of secret (read via XXH_readLE64) and the
+ * two keyed values are combined by XXH3_mul128_fold64().
+ */
+XXH_FORCE_INLINE xxh_u64
+XXH3_mix2Accs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret)
+{
+    xxh_u64 const keyedLane0 = acc[0] ^ XXH_readLE64(secret);
+    xxh_u64 const keyedLane1 = acc[1] ^ XXH_readLE64(secret+8);
+    return XXH3_mul128_fold64(keyedLane0, keyedLane1);
+}
+
+/*
+ * Converges the accumulator lanes into the final 64-bit hash.
+ *
+ * Starting from `start`, the four lane pairs (acc[0..1], acc[2..3], ...) are
+ * each mixed with 16 consecutive bytes of `secret` through XXH3_mix2Accs()
+ * and summed; the sum is passed through XXH3_avalanche().
+ * Reads acc[0..7] and secret[0..63].
+ */
+static XXH64_hash_t
+XXH3_mergeAccs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret, xxh_u64 start)
+{
+    xxh_u64 result64 = start;
+    size_t i = 0;
+
+    for (i = 0; i < 4; i++) {
+        result64 += XXH3_mix2Accs(acc+2*i, secret + 16*i);
+#if defined(__clang__)                                /* Clang */ \
+    && (defined(__arm__) || defined(__thumb__))       /* ARMv7 */ \
+    && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */  \
+    && !defined(XXH_ENABLE_AUTOVECTORIZE)             /* Define to disable */
+        /*
+         * UGLY HACK:
+         * Prevent autovectorization on Clang ARMv7-a. Exact same problem as
+         * the one in XXH3_len_129to240_64b. Speeds up shorter keys > 240b.
+         * XXH3_64bits, len == 256, Snapdragon 835:
+         *   without hack: 2063.7 MB/s
+         *   with hack:    2560.7 MB/s
+         */
+        XXH_COMPILER_GUARD(result64);
+#endif
+    }
+
+    return XXH3_avalanche(result64);
+}
+
+/*
+ * Brace initializer holding the starting values of the 8 accumulator lanes.
+ * XXH3_reset_internal() assigns the same constants, in the same order, to
+ * the streaming state's acc[] array.
+ */
+#define XXH3_INIT_ACC { XXH_PRIME32_3, XXH_PRIME64_1, XXH_PRIME64_2, XXH_PRIME64_3, \
+                        XXH_PRIME64_4, XXH_PRIME32_2, XXH_PRIME64_5, XXH_PRIME32_1 }
+
+/*
+ * Core of the one-shot 64-bit long-input hash.
+ *
+ * Initializes a local accumulator array from XXH3_INIT_ACC, runs
+ * XXH3_hashLong_internal_loop() over the input with the given secret and
+ * f_acc / f_scramble implementations, then merges the lanes with
+ * XXH3_mergeAccs(), starting from len * XXH_PRIME64_1 and reading the secret
+ * from offset XXH_SECRET_MERGEACCS_START.
+ */
+XXH_FORCE_INLINE XXH64_hash_t
+XXH3_hashLong_64b_internal(const void* XXH_RESTRICT input, size_t len,
+                           const void* XXH_RESTRICT secret, size_t secretSize,
+                           XXH3_f_accumulate f_acc,
+                           XXH3_f_scrambleAcc f_scramble)
+{
+    XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[XXH_ACC_NB] = XXH3_INIT_ACC;
+
+    XXH3_hashLong_internal_loop(acc, (const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, f_acc, f_scramble);
+
+    /* converge into final hash */
+    XXH_STATIC_ASSERT(sizeof(acc) == 64);
+    /* do not align on 8, so that the secret is different from the accumulator */
+#define XXH_SECRET_MERGEACCS_START 11
+    /* XXH3_mergeAccs() reads sizeof(acc) bytes of secret starting at that offset */
+    XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
+    return XXH3_mergeAccs(acc, (const xxh_u8*)secret + XXH_SECRET_MERGEACCS_START, (xxh_u64)len * XXH_PRIME64_1);
+}
+
+/*
+ * It's important for performance to transmit secret's size (when it's static)
+ * so that the compiler can properly optimize the vectorized loop.
+ * This makes a big performance difference for "medium" keys (<1 KB) when using AVX instruction set.
+ * When the secret size is unknown, or on GCC 12 where the mix of NO_INLINE and FORCE_INLINE
+ * breaks -Og, this is XXH_NO_INLINE.
+ */
+XXH3_WITH_SECRET_INLINE XXH64_hash_t
+XXH3_hashLong_64b_withSecret(const void* XXH_RESTRICT input, size_t len,
+                             XXH64_hash_t seed64, const xxh_u8* XXH_RESTRICT secret, size_t secretLen)
+{
+    XXH64_hash_t h64;
+    /* seed64 only exists to match the XXH3_hashLong64_f signature; this variant is keyed by the secret alone */
+    (void)seed64;
+    h64 = XXH3_hashLong_64b_internal(input, len, secret, secretLen, XXH3_accumulate, XXH3_scrambleAcc);
+    return h64;
+}
+
+/*
+ * It's preferable for performance that XXH3_hashLong is not inlined,
+ * as it results in a smaller function for small data, easier to the instruction cache.
+ * Note that inside this no_inline function, we do inline the internal loop,
+ * and provide a statically defined secret size to allow optimization of vector loop.
+ */
+XXH_NO_INLINE XXH_PUREF XXH64_hash_t
+XXH3_hashLong_64b_default(const void* XXH_RESTRICT input, size_t len,
+                          XXH64_hash_t seed64, const xxh_u8* XXH_RESTRICT secret, size_t secretLen)
+{
+    XXH64_hash_t h64;
+    /* The default variant always hashes with the built-in secret; the keying arguments are ignored. */
+    (void)seed64;
+    (void)secret;
+    (void)secretLen;
+    h64 = XXH3_hashLong_64b_internal(input, len, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_accumulate, XXH3_scrambleAcc);
+    return h64;
+}
+
+/*
+ * XXH3_hashLong_64b_withSeed():
+ * Generate a custom key based on alteration of default XXH3_kSecret with the seed,
+ * and then use this key for long mode hashing.
+ *
+ * This operation is decently fast but nonetheless costs a little bit of time.
+ * Try to avoid it whenever possible (typically when seed==0).
+ *
+ * It's important for performance that XXH3_hashLong is not inlined. Not sure
+ * why (uop cache maybe?), but the difference is large and easily measurable.
+ */
+XXH_FORCE_INLINE XXH64_hash_t
+XXH3_hashLong_64b_withSeed_internal(const void* input, size_t len,
+                                    XXH64_hash_t seed,
+                                    XXH3_f_accumulate f_acc,
+                                    XXH3_f_scrambleAcc f_scramble,
+                                    XXH3_f_initCustomSecret f_initSec)
+{
+    /* Scratch space for the seed-derived secret; only filled in when it is actually needed. */
+    XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 derivedSecret[XXH_SECRET_DEFAULT_SIZE];
+#if XXH_SIZE_OPT <= 0
+    /* Shortcut: seed 0 hashes directly with the built-in secret, skipping the derivation. */
+    if (seed == 0)
+        return XXH3_hashLong_64b_internal(input, len,
+                                          XXH3_kSecret, sizeof(XXH3_kSecret),
+                                          f_acc, f_scramble);
+#endif
+    f_initSec(derivedSecret, seed);
+    return XXH3_hashLong_64b_internal(input, len, derivedSecret, sizeof(derivedSecret),
+                                      f_acc, f_scramble);
+}
+
+/*
+ * It's important for performance that XXH3_hashLong is not inlined.
+ */
+XXH_NO_INLINE XXH64_hash_t
+XXH3_hashLong_64b_withSeed(const void* XXH_RESTRICT input, size_t len,
+                           XXH64_hash_t seed, const xxh_u8* XXH_RESTRICT secret, size_t secretLen)
+{
+    XXH64_hash_t h64;
+    /* The secret is derived from the seed; the caller-provided one is not used. */
+    (void)secret;
+    (void)secretLen;
+    h64 = XXH3_hashLong_64b_withSeed_internal(input, len, seed,
+                XXH3_accumulate, XXH3_scrambleAcc, XXH3_initCustomSecret);
+    return h64;
+}
+
+
+/*
+ * Signature shared by the long-input 64-bit hash variants so that
+ * XXH3_64bits_internal() can receive any of them.
+ * Called as (input, len, seed64, secret, secretLen).
+ */
+typedef XXH64_hash_t (*XXH3_hashLong64_f)(const void* XXH_RESTRICT, size_t,
+                                          XXH64_hash_t, const xxh_u8* XXH_RESTRICT, size_t);
+
+/*
+ * Length-based dispatcher for the one-shot 64-bit hash:
+ *   len <= 16               -> XXH3_len_0to16_64b
+ *   len <= 128              -> XXH3_len_17to128_64b
+ *   len <= XXH3_MIDSIZE_MAX -> XXH3_len_129to240_64b
+ *   otherwise               -> f_hashLong
+ */
+XXH_FORCE_INLINE XXH64_hash_t
+XXH3_64bits_internal(const void* XXH_RESTRICT input, size_t len,
+                     XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen,
+                     XXH3_hashLong64_f f_hashLong)
+{
+    XXH_ASSERT(secretLen >= XXH3_SECRET_SIZE_MIN);
+    /*
+     * The minimum secret length is a contract pre-condition only.
+     * If a violation ever had to be handled, this is where to do it, but a
+     * runtime check would add a branch to every hash, and the signature has
+     * no way to report an error anyway.
+     */
+    {   const xxh_u8* const data8 = (const xxh_u8*)input;
+        const xxh_u8* const key8  = (const xxh_u8*)secret;
+
+        if (len <= 16)
+            return XXH3_len_0to16_64b(data8, len, key8, seed64);
+        if (len <= 128)
+            return XXH3_len_17to128_64b(data8, len, key8, secretLen, seed64);
+        if (len <= XXH3_MIDSIZE_MAX)
+            return XXH3_len_129to240_64b(data8, len, key8, secretLen, seed64);
+        return f_hashLong(input, len, seed64, key8, secretLen);
+    }
+}
+
+
+/* ===   Public entry point   === */
+
+/*!
+ * @ingroup XXH3_family
+ * @brief 64-bit XXH3 hash of @p input using seed 0 and the built-in XXH3_kSecret.
+ */
+XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(XXH_NOESCAPE const void* input, size_t length)
+{
+    XXH64_hash_t const noSeed = 0;
+    return XXH3_64bits_internal(input, length, noSeed, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_64b_default);
+}
+
+/*!
+ * @ingroup XXH3_family
+ * @brief 64-bit XXH3 hash of @p input keyed by a caller-supplied secret, with seed 0.
+ */
+XXH_PUBLIC_API XXH64_hash_t
+XXH3_64bits_withSecret(XXH_NOESCAPE const void* input, size_t length, XXH_NOESCAPE const void* secret, size_t secretSize)
+{
+    XXH64_hash_t const noSeed = 0;
+    return XXH3_64bits_internal(input, length, noSeed, secret, secretSize, XXH3_hashLong_64b_withSecret);
+}
+
+/*!
+ * @ingroup XXH3_family
+ * @brief 64-bit XXH3 hash of @p input using @p seed together with the built-in XXH3_kSecret.
+ */
+XXH_PUBLIC_API XXH64_hash_t
+XXH3_64bits_withSeed(XXH_NOESCAPE const void* input, size_t length, XXH64_hash_t seed)
+{
+    size_t const defaultSecretLen = sizeof(XXH3_kSecret);
+    return XXH3_64bits_internal(input, length, seed, XXH3_kSecret, defaultSecretLen, XXH3_hashLong_64b_withSeed);
+}
+
+/*!
+ * @ingroup XXH3_family
+ * @brief 64-bit XXH3 hash taking both a secret and a seed.
+ *
+ * Inputs longer than XXH3_MIDSIZE_MAX are hashed with @p secret through
+ * XXH3_hashLong_64b_withSecret(), which does not use the seed. Shorter inputs
+ * are hashed with @p seed and the built-in XXH3_kSecret; @p secret is not read.
+ */
+XXH_PUBLIC_API XXH64_hash_t
+XXH3_64bits_withSecretandSeed(XXH_NOESCAPE const void* input, size_t length, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed)
+{
+    if (length > XXH3_MIDSIZE_MAX)
+        return XXH3_hashLong_64b_withSecret(input, length, seed, (const xxh_u8*)secret, secretSize);
+    /* XXH3_64bits_internal() only calls its f_hashLong argument for lengths above XXH3_MIDSIZE_MAX, so NULL is never dereferenced here */
+    return XXH3_64bits_internal(input, length, seed, XXH3_kSecret, sizeof(XXH3_kSecret), NULL);
+}
+
+
+/* ===   XXH3 streaming   === */
+#ifndef XXH_NO_STREAM
+/*
+ * Malloc's a pointer that is always aligned to align.
+ *
+ * This must be freed with `XXH_alignedFree()`.
+ *
+ * malloc typically guarantees 16 byte alignment on 64-bit systems and 8 byte
+ * alignment on 32-bit. This isn't enough for the 32 byte aligned loads in AVX2
+ * or on 32-bit, the 16 byte aligned loads in SSE2 and NEON.
+ *
+ * This underalignment previously caused a rather obvious crash which went
+ * completely unnoticed due to XXH3_createState() not actually being tested.
+ * Credit to RedSpah for noticing this bug.
+ *
+ * The alignment is done manually: Functions like posix_memalign or _mm_malloc
+ * are avoided: To maintain portability, we would have to write a fallback
+ * like this anyways, and besides, testing for the existence of library
+ * functions without relying on external build tools is impossible.
+ *
+ * The method is simple: Overallocate, manually align, and store the offset
+ * to the original behind the returned pointer.
+ *
+ * Align must be a power of 2 and 8 <= align <= 128.
+ */
+static XXH_MALLOCF void* XXH_alignedMalloc(size_t s, size_t align)
+{
+    XXH_ASSERT(align <= 128 && align >= 8); /* range check */
+    XXH_ASSERT((align & (align-1)) == 0);   /* power of 2 */
+    XXH_ASSERT(s != 0 && s < (s + align));  /* empty/overflow */
+    {   /* Ask for `align` extra bytes: slack for realignment plus the offset byte */
+        xxh_u8* const raw = (xxh_u8*)XXH_malloc(s + align);
+        if (raw == NULL) return NULL;
+
+        {   /*
+             * Distance to the next aligned address. It lies in [1, align]:
+             * an already-aligned block is still shifted by a full `align`,
+             * which guarantees one spare byte in front of the result.
+             */
+            size_t const misalignment = (size_t)raw & (align - 1); /* raw % align */
+            size_t const shift = align - misalignment;
+            xxh_u8* const aligned = raw + shift;
+
+            XXH_ASSERT((size_t)aligned % align == 0);
+
+            /* Remember the shift in the byte just below the returned pointer. */
+            aligned[-1] = (xxh_u8)shift;
+            return aligned;
+        }
+    }
+}
+/*
+ * Frees an aligned pointer allocated by XXH_alignedMalloc(). Don't pass
+ * normal malloc'd pointers, XXH_alignedMalloc has a specific data layout.
+ */
+static void XXH_alignedFree(void* p)
+{
+    xxh_u8* const aligned = (xxh_u8*)p;
+    if (aligned == NULL) return;
+    {   /* XXH_alignedMalloc() stored the shift it applied in the byte below the pointer */
+        xxh_u8 const shift = aligned[-1];
+        /* Step back to the address originally returned by XXH_malloc() and release it */
+        XXH_free(aligned - shift);
+    }
+}
+/*! @ingroup XXH3_family */
+/*!
+ * @brief Allocate an @ref XXH3_state_t.
+ *
+ * @return An allocated pointer of @ref XXH3_state_t on success.
+ * @return `NULL` on failure.
+ *
+ * @note Must be freed with XXH3_freeState().
+ *
+ * @see @ref streaming_example "Streaming Example"
+ */
+XXH_PUBLIC_API XXH3_state_t* XXH3_createState(void)
+{
+    /* 64-byte alignment is requested for the state */
+    XXH3_state_t* const state = (XXH3_state_t*)XXH_alignedMalloc(sizeof(XXH3_state_t), 64);
+    if (state != NULL) {
+        XXH3_INITSTATE(state);
+    }
+    return state;
+}
+
+/*! @ingroup XXH3_family */
+/*!
+ * @brief Frees an @ref XXH3_state_t.
+ *
+ * @param statePtr A pointer to an @ref XXH3_state_t allocated with @ref XXH3_createState().
+ *
+ * @return @ref XXH_OK.
+ *
+ * @note Must be allocated with XXH3_createState().
+ *
+ * @see @ref streaming_example "Streaming Example"
+ */
+XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr)
+{
+    /* XXH_alignedFree() ignores NULL, so a NULL statePtr is accepted */
+    XXH_alignedFree(statePtr);
+    return XXH_OK;
+}
+
+/*!
+ * @ingroup XXH3_family
+ * @brief Copies the whole content of @p src_state into @p dst_state.
+ *
+ * The copy is a plain byte copy of the structure, so an external secret
+ * pointer held by the source is shared, not duplicated.
+ * Neither pointer is checked for NULL.
+ */
+XXH_PUBLIC_API void
+XXH3_copyState(XXH_NOESCAPE XXH3_state_t* dst_state, XXH_NOESCAPE const XXH3_state_t* src_state)
+{
+    XXH_memcpy(dst_state, src_state, sizeof(*dst_state));
+}
+
+/*
+ * Common reset routine behind all XXH3 *_reset* entry points.
+ *
+ * - zeroes the state members located from `bufferedSize` up to (but not
+ *   including) `nbStripesPerBlock`;
+ * - reloads the accumulators with the same constants as XXH3_INIT_ACC;
+ * - records the seed, and sets useSeed when the seed is non-zero;
+ * - stores `secret` by pointer in extSecret (no copy is made; NULL is allowed
+ *   and makes the update/digest code use the state's customSecret instead);
+ * - derives secretLimit and nbStripesPerBlock from secretSize.
+ *
+ * statePtr must be non-NULL and secretSize >= XXH3_SECRET_SIZE_MIN (asserted).
+ */
+static void
+XXH3_reset_internal(XXH3_state_t* statePtr,
+                    XXH64_hash_t seed,
+                    const void* secret, size_t secretSize)
+{
+    size_t const initStart = offsetof(XXH3_state_t, bufferedSize);
+    size_t const initLength = offsetof(XXH3_state_t, nbStripesPerBlock) - initStart;
+    XXH_ASSERT(offsetof(XXH3_state_t, nbStripesPerBlock) > initStart);
+    XXH_ASSERT(statePtr != NULL);
+    /* set members from bufferedSize to nbStripesPerBlock (excluded) to 0 */
+    memset((char*)statePtr + initStart, 0, initLength);
+    statePtr->acc[0] = XXH_PRIME32_3;
+    statePtr->acc[1] = XXH_PRIME64_1;
+    statePtr->acc[2] = XXH_PRIME64_2;
+    statePtr->acc[3] = XXH_PRIME64_3;
+    statePtr->acc[4] = XXH_PRIME64_4;
+    statePtr->acc[5] = XXH_PRIME32_2;
+    statePtr->acc[6] = XXH_PRIME64_5;
+    statePtr->acc[7] = XXH_PRIME32_1;
+    statePtr->seed = seed;
+    statePtr->useSeed = (seed != 0);
+    statePtr->extSecret = (const unsigned char*)secret;
+    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
+    /* offset of the last XXH_STRIPE_LEN bytes of the secret */
+    statePtr->secretLimit = secretSize - XXH_STRIPE_LEN;
+    statePtr->nbStripesPerBlock = statePtr->secretLimit / XXH_SECRET_CONSUME_RATE;
+}
+
+/*!
+ * @ingroup XXH3_family
+ * @brief Resets @p statePtr for a new hash with seed 0 and the built-in XXH3_kSecret.
+ *
+ * @return XXH_OK on success, XXH_ERROR if @p statePtr is NULL.
+ */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_64bits_reset(XXH_NOESCAPE XXH3_state_t* statePtr)
+{
+    if (statePtr != NULL) {
+        XXH3_reset_internal(statePtr, 0, XXH3_kSecret, XXH_SECRET_DEFAULT_SIZE);
+        return XXH_OK;
+    }
+    return XXH_ERROR;
+}
+
+/*!
+ * @ingroup XXH3_family
+ * @brief Resets @p statePtr for a new hash keyed by a caller-supplied secret, with seed 0.
+ *
+ * All arguments are validated before the state is touched, in the same way as
+ * XXH3_64bits_reset_withSecretandSeed(): a rejected call leaves @p statePtr
+ * unmodified instead of loading it with an unusable secret pointer or size.
+ *
+ * @param statePtr   State to reset.
+ * @param secret     Secret to hash with. It is stored by pointer, not copied.
+ * @param secretSize Size of @p secret in bytes; must be at least XXH3_SECRET_SIZE_MIN.
+ *
+ * @return XXH_OK on success.
+ * @return XXH_ERROR if @p statePtr or @p secret is NULL, or if @p secretSize
+ *         is smaller than XXH3_SECRET_SIZE_MIN.
+ */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_64bits_reset_withSecret(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize)
+{
+    if (statePtr == NULL) return XXH_ERROR;
+    if (secret == NULL) return XXH_ERROR;
+    /* Checked first: XXH3_reset_internal() computes secretSize - XXH_STRIPE_LEN, which must not wrap around */
+    if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR;
+    XXH3_reset_internal(statePtr, 0, secret, secretSize);
+    return XXH_OK;
+}
+
+/*!
+ * @ingroup XXH3_family
+ * @brief Resets @p statePtr for a new hash using @p seed.
+ *
+ * A seed of 0 is delegated to XXH3_64bits_reset(). Otherwise the state's
+ * customSecret is regenerated from the seed unless the state already carries
+ * the same seed and no external secret, and the state is reset with no
+ * external secret.
+ *
+ * @return XXH_OK on success, XXH_ERROR if @p statePtr is NULL.
+ */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_64bits_reset_withSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH64_hash_t seed)
+{
+    if (statePtr == NULL) return XXH_ERROR;
+    if (seed==0) return XXH3_64bits_reset(statePtr);
+    {   /* must be evaluated before the reset below overwrites seed and extSecret */
+        int const mustRegenerate = (seed != statePtr->seed) || (statePtr->extSecret != NULL);
+        if (mustRegenerate) {
+            XXH3_initCustomSecret(statePtr->customSecret, seed);
+        }
+    }
+    XXH3_reset_internal(statePtr, seed, NULL, XXH_SECRET_DEFAULT_SIZE);
+    return XXH_OK;
+}
+
+/*!
+ * @ingroup XXH3_family
+ * @brief Resets @p statePtr for a new hash carrying both a secret and a seed.
+ *
+ * @return XXH_OK on success.
+ * @return XXH_ERROR if @p statePtr or @p secret is NULL, or if @p secretSize
+ *         is smaller than XXH3_SECRET_SIZE_MIN; the state is then untouched.
+ */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_64bits_reset_withSecretandSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed64)
+{
+    if ( (statePtr == NULL)
+      || (secret == NULL)
+      || (secretSize < XXH3_SECRET_SIZE_MIN) ) {
+        return XXH_ERROR;
+    }
+    XXH3_reset_internal(statePtr, seed64, secret, secretSize);
+    /* XXH3_reset_internal() derives useSeed from (seed != 0); force it on, even if seed64==0 */
+    statePtr->useSeed = 1;
+    return XXH_OK;
+}
+
+/*!
+ * @internal
+ * @brief Processes a large input for XXH3_update() and XXH3_digest_long().
+ *
+ * Unlike XXH3_hashLong_internal_loop(), this can process data that overlaps a block.
+ *
+ * If the requested stripes reach the end of the current block, the rest of
+ * that block is accumulated and scrambled, followed by as many full blocks as
+ * fit; the stripe counter is then reset to 0. Any stripes left after that are
+ * accumulated without a scramble and added to the counter.
+ *
+ * @param acc                Pointer to the 8 accumulator lanes
+ * @param nbStripesSoFarPtr  In/out pointer to the number of stripes already consumed in the current block
+ * @param nbStripesPerBlock  Number of stripes in a block
+ * @param input              Input pointer
+ * @param nbStripes          Number of stripes to process
+ * @param secret             Secret pointer
+ * @param secretLimit        Offset in @p secret of the data handed to @p f_scramble
+ * @param f_acc              Pointer to an XXH3_accumulate implementation
+ * @param f_scramble         Pointer to an XXH3_scrambleAcc implementation
+ * @return                   Pointer past the end of @p input after processing
+ */
+XXH_FORCE_INLINE const xxh_u8 *
+XXH3_consumeStripes(xxh_u64* XXH_RESTRICT acc,
+                    size_t* XXH_RESTRICT nbStripesSoFarPtr, size_t nbStripesPerBlock,
+                    const xxh_u8* XXH_RESTRICT input, size_t nbStripes,
+                    const xxh_u8* XXH_RESTRICT secret, size_t secretLimit,
+                    XXH3_f_accumulate f_acc,
+                    XXH3_f_scrambleAcc f_scramble)
+{
+    /* resume in the secret where the current block left off */
+    const xxh_u8* initialSecret = secret + *nbStripesSoFarPtr * XXH_SECRET_CONSUME_RATE;
+    /* Process full blocks */
+    if (nbStripes >= (nbStripesPerBlock - *nbStripesSoFarPtr)) {
+        /* Process the initial partial block... */
+        size_t nbStripesThisIter = nbStripesPerBlock - *nbStripesSoFarPtr;
+
+        do {
+            /* Accumulate and scramble */
+            f_acc(acc, input, initialSecret, nbStripesThisIter);
+            f_scramble(acc, secret + secretLimit);
+            input += nbStripesThisIter * XXH_STRIPE_LEN;
+            nbStripes -= nbStripesThisIter;
+            /* Then continue the loop with the full block size */
+            nbStripesThisIter = nbStripesPerBlock;
+            initialSecret = secret;
+        } while (nbStripes >= nbStripesPerBlock);
+        /* a fresh block starts here */
+        *nbStripesSoFarPtr = 0;
+    }
+    /* Process a partial block */
+    if (nbStripes > 0) {
+        f_acc(acc, input, initialSecret, nbStripes);
+        input += nbStripes * XXH_STRIPE_LEN;
+        *nbStripesSoFarPtr += nbStripes;
+    }
+    /* Return end pointer */
+    return input;
+}
+
+/*
+ * XXH3_STREAM_USE_STACK: when >= 1, XXH3_update() works on a stack copy of the
+ * accumulators and writes them back to the state at the end.
+ * Unless already defined by the user, it is enabled for non-clang compilers
+ * when XXH_SIZE_OPT <= 0.
+ */
+#ifndef XXH3_STREAM_USE_STACK
+# if XXH_SIZE_OPT <= 0 && !defined(__clang__) /* clang doesn't need additional stack space */
+#   define XXH3_STREAM_USE_STACK 1
+# endif
+#endif
+/*
+ * Both XXH3_64bits_update and XXH3_128bits_update use this routine.
+ *
+ * Feeds `len` bytes of `input` into the streaming state:
+ *  - input that still fits in the internal buffer is only copied there;
+ *  - otherwise a partially filled buffer is completed and consumed, the bulk
+ *    of the remaining input is consumed directly from the caller's memory,
+ *    and the tail (never empty) is stored in the internal buffer.
+ * The last XXH_STRIPE_LEN bytes consumed directly from `input` are also
+ * saved at the end of the internal buffer; XXH3_digest_long() reads them back
+ * when fewer than XXH_STRIPE_LEN bytes are buffered.
+ *
+ * A NULL `input` is accepted (len is then expected to be 0) and is a no-op.
+ * Always returns XXH_OK.
+ */
+XXH_FORCE_INLINE XXH_errorcode
+XXH3_update(XXH3_state_t* XXH_RESTRICT const state,
+            const xxh_u8* XXH_RESTRICT input, size_t len,
+            XXH3_f_accumulate f_acc,
+            XXH3_f_scrambleAcc f_scramble)
+{
+    if (input==NULL) {
+        XXH_ASSERT(len == 0);
+        return XXH_OK;
+    }
+
+    XXH_ASSERT(state != NULL);
+    {   const xxh_u8* const bEnd = input + len;
+        /* an external secret, when set, takes precedence over the state's own customSecret */
+        const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret;
+#if defined(XXH3_STREAM_USE_STACK) && XXH3_STREAM_USE_STACK >= 1
+        /* For some reason, gcc and MSVC seem to suffer greatly
+         * when operating accumulators directly into state.
+         * Operating into stack space seems to enable proper optimization.
+         * clang, on the other hand, doesn't seem to need this trick */
+        XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[8];
+        XXH_memcpy(acc, state->acc, sizeof(acc));
+#else
+        xxh_u64* XXH_RESTRICT const acc = state->acc;
+#endif
+        state->totalLen += len;
+        XXH_ASSERT(state->bufferedSize <= XXH3_INTERNALBUFFER_SIZE);
+
+        /* small input : just fill in tmp buffer */
+        if (len <= XXH3_INTERNALBUFFER_SIZE - state->bufferedSize) {
+            XXH_memcpy(state->buffer + state->bufferedSize, input, len);
+            state->bufferedSize += (XXH32_hash_t)len;
+            /* the accumulators were not modified, so there is nothing to write back */
+            return XXH_OK;
+        }
+
+        /* total input is now > XXH3_INTERNALBUFFER_SIZE */
+        #define XXH3_INTERNALBUFFER_STRIPES (XXH3_INTERNALBUFFER_SIZE / XXH_STRIPE_LEN)
+        XXH_STATIC_ASSERT(XXH3_INTERNALBUFFER_SIZE % XXH_STRIPE_LEN == 0);   /* clean multiple */
+
+        /*
+         * Internal buffer is partially filled (always, except at beginning)
+         * Complete it, then consume it.
+         */
+        if (state->bufferedSize) {
+            size_t const loadSize = XXH3_INTERNALBUFFER_SIZE - state->bufferedSize;
+            XXH_memcpy(state->buffer + state->bufferedSize, input, loadSize);
+            input += loadSize;
+            XXH3_consumeStripes(acc,
+                               &state->nbStripesSoFar, state->nbStripesPerBlock,
+                                state->buffer, XXH3_INTERNALBUFFER_STRIPES,
+                                secret, state->secretLimit,
+                                f_acc, f_scramble);
+            state->bufferedSize = 0;
+        }
+        XXH_ASSERT(input < bEnd);
+        /* large remainder: consume whole stripes straight from the caller's memory */
+        if (bEnd - input > XXH3_INTERNALBUFFER_SIZE) {
+            /* (bEnd - 1): at least one byte is always left over for the buffer */
+            size_t nbStripes = (size_t)(bEnd - 1 - input) / XXH_STRIPE_LEN;
+            input = XXH3_consumeStripes(acc,
+                                       &state->nbStripesSoFar, state->nbStripesPerBlock,
+                                       input, nbStripes,
+                                       secret, state->secretLimit,
+                                       f_acc, f_scramble);
+            /* keep the last consumed stripe at the end of the buffer, for XXH3_digest_long() */
+            XXH_memcpy(state->buffer + sizeof(state->buffer) - XXH_STRIPE_LEN, input - XXH_STRIPE_LEN, XXH_STRIPE_LEN);
+
+        }
+        /* Some remaining input (always) : buffer it */
+        XXH_ASSERT(input < bEnd);
+        XXH_ASSERT(bEnd - input <= XXH3_INTERNALBUFFER_SIZE);
+        XXH_ASSERT(state->bufferedSize == 0);
+        XXH_memcpy(state->buffer, input, (size_t)(bEnd-input));
+        state->bufferedSize = (XXH32_hash_t)(bEnd-input);
+#if defined(XXH3_STREAM_USE_STACK) && XXH3_STREAM_USE_STACK >= 1
+        /* save stack accumulators into state */
+        XXH_memcpy(state->acc, acc, sizeof(acc));
+#endif
+    }
+
+    return XXH_OK;
+}
+
+/*!
+ * @ingroup XXH3_family
+ * @brief Feeds @p len bytes of @p input into @p state.
+ *
+ * Thin wrapper around XXH3_update() using the XXH3_accumulate and
+ * XXH3_scrambleAcc implementations selected at compile time.
+ */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_64bits_update(XXH_NOESCAPE XXH3_state_t* state, XXH_NOESCAPE const void* input, size_t len)
+{
+    const xxh_u8* const bytes = (const xxh_u8*)input;
+    return XXH3_update(state, bytes, len,
+                       XXH3_accumulate, XXH3_scrambleAcc);
+}
+
+
+/*
+ * Finishes the accumulation of a long streamed input without modifying the
+ * state.
+ *
+ * acc    : output; receives a copy of state->acc, updated with the data still
+ *          held in state->buffer
+ * state  : streaming state, read-only
+ * secret : secret in use for this state
+ *
+ * The buffered bytes are consumed as whole stripes, then the final
+ * XXH_STRIPE_LEN bytes of the stream are accumulated with the dedicated
+ * "last stripe" secret offset. When fewer than XXH_STRIPE_LEN bytes are
+ * buffered, the final stripe is rebuilt from the end of state->buffer
+ * followed by the buffered bytes.
+ */
+XXH_FORCE_INLINE void
+XXH3_digest_long (XXH64_hash_t* acc,
+                  const XXH3_state_t* state,
+                  const unsigned char* secret)
+{
+    xxh_u8 lastStripe[XXH_STRIPE_LEN];
+    const xxh_u8* lastStripePtr;
+
+    /*
+     * Digest on a local copy. This way, the state remains unaltered, and it can
+     * continue ingesting more input afterwards.
+     */
+    XXH_memcpy(acc, state->acc, sizeof(state->acc));
+    if (state->bufferedSize >= XXH_STRIPE_LEN) {
+        /* Consume remaining stripes then point to remaining data in buffer */
+        size_t const nbStripes = (state->bufferedSize - 1) / XXH_STRIPE_LEN;
+        /* local copy of the counter: the state's own value must not change */
+        size_t nbStripesSoFar = state->nbStripesSoFar;
+        XXH3_consumeStripes(acc,
+                           &nbStripesSoFar, state->nbStripesPerBlock,
+                            state->buffer, nbStripes,
+                            secret, state->secretLimit,
+                            XXH3_accumulate, XXH3_scrambleAcc);
+        lastStripePtr = state->buffer + state->bufferedSize - XXH_STRIPE_LEN;
+    } else {  /* bufferedSize < XXH_STRIPE_LEN */
+        /* Copy to temp buffer */
+        size_t const catchupSize = XXH_STRIPE_LEN - state->bufferedSize;
+        XXH_ASSERT(state->bufferedSize > 0);  /* there is always some input buffered */
+        /* first part: the missing bytes, taken from the end of the internal buffer */
+        XXH_memcpy(lastStripe, state->buffer + sizeof(state->buffer) - catchupSize, catchupSize);
+        /* second part: the bytes currently buffered */
+        XXH_memcpy(lastStripe + catchupSize, state->buffer, state->bufferedSize);
+        lastStripePtr = lastStripe;
+    }
+    /* Last stripe */
+    XXH3_accumulate_512(acc,
+                        lastStripePtr,
+                        secret + state->secretLimit - XXH_SECRET_LASTACC_START);
+}
+
+/*!
+ * @ingroup XXH3_family
+ * @brief Returns the 64-bit hash of everything fed into @p state so far.
+ *
+ * The state is only read, so more input can be added afterwards.
+ */
+XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest (XXH_NOESCAPE const XXH3_state_t* state)
+{
+    const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret;
+
+    if (state->totalLen <= XXH3_MIDSIZE_MAX) {
+        /* short input: hash the buffered bytes with the one-shot functions */
+        size_t const shortLen = (size_t)state->totalLen;
+        if (state->useSeed)
+            return XXH3_64bits_withSeed(state->buffer, shortLen, state->seed);
+        return XXH3_64bits_withSecret(state->buffer, shortLen,
+                                      secret, state->secretLimit + XXH_STRIPE_LEN);
+    }
+
+    /* long input: finish the accumulation on a local copy, then merge the lanes */
+    {   XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[XXH_ACC_NB];
+        XXH3_digest_long(acc, state, secret);
+        return XXH3_mergeAccs(acc,
+                              secret + XXH_SECRET_MERGEACCS_START,
+                              (xxh_u64)state->totalLen * XXH_PRIME64_1);
+    }
+}
+#endif /* !XXH_NO_STREAM */
+
+
+/* ==========================================
+ * XXH3 128 bits (a.k.a XXH128)
+ * ==========================================
+ * XXH3's 128-bit variant has better mixing and strength than the 64-bit variant,
+ * even without counting the significantly larger output size.
+ *
+ * For example, extra steps are taken to avoid the seed-dependent collisions
+ * in 17-240 byte inputs (See XXH3_mix16B and XXH128_mix32B).
+ *
+ * This strength naturally comes at the cost of some speed, especially on short
+ * lengths. Note that longer hashes are about as fast as the 64-bit version
+ * due to it using only a slight modification of the 64-bit loop.
+ *
+ * XXH128 is also more oriented towards 64-bit machines. It is still extremely
+ * fast for a _128-bit_ hash on 32-bit (it usually clears XXH64).
+ */
+
+/*
+ * 128-bit hash of a 1 to 3 byte input.
+ *
+ * The first, middle and last input bytes plus the length are packed into a
+ * 32-bit word; a byte-swapped, rotated copy of that word feeds the high half.
+ * Each word is XORed with a 64-bit value built from 8 bytes of secret and the
+ * seed (added for the low half, subtracted for the high half), and each
+ * result goes through XXH64_avalanche(). Reads secret[0..15].
+ */
+XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t
+XXH3_len_1to3_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
+{
+    /* A doubled version of 1to3_64b with different constants. */
+    XXH_ASSERT(input != NULL);
+    XXH_ASSERT(1 <= len && len <= 3);
+    XXH_ASSERT(secret != NULL);
+    /*
+     * len = 1: combinedl = { input[0], 0x01, input[0], input[0] }
+     * len = 2: combinedl = { input[1], 0x02, input[0], input[1] }
+     * len = 3: combinedl = { input[2], 0x03, input[0], input[1] }
+     */
+    {   xxh_u8 const c1 = input[0];
+        xxh_u8 const c2 = input[len >> 1];
+        xxh_u8 const c3 = input[len - 1];
+        xxh_u32 const combinedl = ((xxh_u32)c1 <<16) | ((xxh_u32)c2 << 24)
+                                | ((xxh_u32)c3 << 0) | ((xxh_u32)len << 8);
+        xxh_u32 const combinedh = XXH_rotl32(XXH_swap32(combinedl), 13);
+        xxh_u64 const bitflipl = (XXH_readLE32(secret) ^ XXH_readLE32(secret+4)) + seed;
+        xxh_u64 const bitfliph = (XXH_readLE32(secret+8) ^ XXH_readLE32(secret+12)) - seed;
+        xxh_u64 const keyed_lo = (xxh_u64)combinedl ^ bitflipl;
+        xxh_u64 const keyed_hi = (xxh_u64)combinedh ^ bitfliph;
+        XXH128_hash_t h128;
+        h128.low64  = XXH64_avalanche(keyed_lo);
+        h128.high64 = XXH64_avalanche(keyed_hi);
+        return h128;
+    }
+}
+
+/*
+ * 128-bit hash of a 4 to 8 byte input.
+ *
+ * The first 4 and the last 4 input bytes (which overlap when len < 8) form a
+ * 64-bit word. That word is XORed with a value derived from secret[16..31]
+ * and the modified seed, multiplied into 128 bits by a length-dependent
+ * constant, and the two halves are then mixed and finalized separately.
+ */
+XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t
+XXH3_len_4to8_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
+{
+    XXH_ASSERT(input != NULL);
+    XXH_ASSERT(secret != NULL);
+    XXH_ASSERT(4 <= len && len <= 8);
+    /* fold a byte-swapped copy of the seed's low 32 bits into its high 32 bits */
+    seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32;
+    {   xxh_u32 const input_lo = XXH_readLE32(input);
+        xxh_u32 const input_hi = XXH_readLE32(input + len - 4);
+        xxh_u64 const input_64 = input_lo + ((xxh_u64)input_hi << 32);
+        xxh_u64 const bitflip = (XXH_readLE64(secret+16) ^ XXH_readLE64(secret+24)) + seed;
+        xxh_u64 const keyed = input_64 ^ bitflip;
+
+        /* Shift len to the left to ensure it is even, this avoids even multiplies. */
+        XXH128_hash_t m128 = XXH_mult64to128(keyed, XXH_PRIME64_1 + (len << 2));
+
+        /* cross-mix the two halves of the product */
+        m128.high64 += (m128.low64 << 1);
+        m128.low64  ^= (m128.high64 >> 3);
+
+        /* finalize the low half inline, the high half through XXH3_avalanche() */
+        m128.low64   = XXH_xorshift64(m128.low64, 35);
+        m128.low64  *= PRIME_MX2;
+        m128.low64   = XXH_xorshift64(m128.low64, 28);
+        m128.high64  = XXH3_avalanche(m128.high64);
+        return m128;
+    }
+}
+
+XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t
+XXH3_len_9to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
+{   /* 128-bit hash of a 9..16 byte input, keyed by secret bytes 32..63 and `seed`. */
+    XXH_ASSERT(input != NULL);
+    XXH_ASSERT(secret != NULL);
+    XXH_ASSERT(9 <= len && len <= 16);
+    {   xxh_u64 const bitflipl = (XXH_readLE64(secret+32) ^ XXH_readLE64(secret+40)) - seed;
+        xxh_u64 const bitfliph = (XXH_readLE64(secret+48) ^ XXH_readLE64(secret+56)) + seed;
+        xxh_u64 const input_lo = XXH_readLE64(input);
+        xxh_u64       input_hi = XXH_readLE64(input + len - 8);  /* last 8 bytes; overlaps input_lo when len < 16 */
+        XXH128_hash_t m128 = XXH_mult64to128(input_lo ^ input_hi ^ bitflipl, XXH_PRIME64_1);
+        /*
+         * Put len in the middle of m128 to ensure that the length gets mixed to
+         * both the low and high bits in the 128x64 multiply below.
+         */
+        m128.low64 += (xxh_u64)(len - 1) << 54;
+        input_hi   ^= bitfliph;
+        /*
+         * Add the high 32 bits of input_hi to the high 32 bits of m128, then
+         * add the long product of the low 32 bits of input_hi and XXH_PRIME32_2 to
+         * the high 64 bits of m128.
+         *
+         * The best approach to this operation is different on 32-bit and 64-bit.
+         */
+        if (sizeof(void *) < sizeof(xxh_u64)) { /* 32-bit */
+            /*
+             * 32-bit optimized version, which is more readable.
+             *
+             * On 32-bit, it removes an ADC and delays a dependency between the two
+             * halves of m128.high64, but it generates an extra mask on 64-bit.
+             */
+            m128.high64 += (input_hi & 0xFFFFFFFF00000000ULL) + XXH_mult32to64((xxh_u32)input_hi, XXH_PRIME32_2);
+        } else {
+            /*
+             * 64-bit optimized (albeit more confusing) version.
+             *
+             * Uses some properties of addition and multiplication to remove the mask:
+             *
+             * Let:
+             *    a = input_hi.lo = (input_hi & 0x00000000FFFFFFFF)
+             *    b = input_hi.hi = (input_hi & 0xFFFFFFFF00000000)
+             *    c = XXH_PRIME32_2
+             *
+             *    b + (a * c)          <- the value added in the 32-bit branch above
+             * Inverse Property: x + y - x == y
+             *    b + (a * (1 + c - 1))
+             * Distributive Property: x * (y + z) == (x * y) + (x * z)
+             *    b + (a * 1) + (a * (c - 1))
+             * Identity Property: x * 1 == x
+             *    b + a + (a * (c - 1))
+             *
+             * Substitute a, b, and c:
+             *    input_hi.hi + input_hi.lo + ((xxh_u64)input_hi.lo * (XXH_PRIME32_2 - 1))
+             *
+             * Since input_hi.hi + input_hi.lo == input_hi, we get this:
+             *    input_hi + ((xxh_u64)input_hi.lo * (XXH_PRIME32_2 - 1))
+             */
+            m128.high64 += input_hi + XXH_mult32to64((xxh_u32)input_hi, XXH_PRIME32_2 - 1);
+        }
+        /* m128 ^= XXH_swap64(m128 >> 64); */
+        m128.low64  ^= XXH_swap64(m128.high64);
+
+        {   /* 128x64 multiply: h128 = m128 * XXH_PRIME64_2; */
+            XXH128_hash_t h128 = XXH_mult64to128(m128.low64, XXH_PRIME64_2);
+            h128.high64 += m128.high64 * XXH_PRIME64_2;
+
+            h128.low64   = XXH3_avalanche(h128.low64);
+            h128.high64  = XXH3_avalanche(h128.high64);
+            return h128;
+    }   }
+}
+
+/* Short-input dispatcher: routes len 9..16, 4..8 and 1..3 to their dedicated
+ * helpers and hashes the empty input inline.
+ * Assumption: `secret` size is >= XXH3_SECRET_SIZE_MIN */
+XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t
+XXH3_len_0to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
+{
+    XXH_ASSERT(len <= 16);
+    {   if (len > 8) return XXH3_len_9to16_128b(input, len, secret, seed);
+        if (len >= 4) return XXH3_len_4to8_128b(input, len, secret, seed);
+        if (len) return XXH3_len_1to3_128b(input, len, secret, seed);
+        {   XXH128_hash_t h128;  /* len == 0: the result depends only on seed and secret bytes 64..95 */
+            xxh_u64 const bitflipl = XXH_readLE64(secret+64) ^ XXH_readLE64(secret+72);
+            xxh_u64 const bitfliph = XXH_readLE64(secret+80) ^ XXH_readLE64(secret+88);
+            h128.low64 = XXH64_avalanche(seed ^ bitflipl);
+            h128.high64 = XXH64_avalanche( seed ^ bitfliph);
+            return h128;
+    }   }
+}
+
+/* Folds two 16-byte inputs into `acc`: each half adds XXH3_mix16B() of one input
+ * (keyed at secret+0 and secret+16 respectively), then XORs in the sum of the other
+ * input's two 64-bit words. A bit slower than XXH3_mix16B, but handles multiply by zero better. */
+XXH_FORCE_INLINE XXH128_hash_t
+XXH128_mix32B(XXH128_hash_t acc, const xxh_u8* input_1, const xxh_u8* input_2,
+              const xxh_u8* secret, XXH64_hash_t seed)
+{
+    acc.low64  += XXH3_mix16B (input_1, secret+0, seed);
+    acc.low64  ^= XXH_readLE64(input_2) + XXH_readLE64(input_2 + 8);  /* cross-feed: low half also sees input_2 */
+    acc.high64 += XXH3_mix16B (input_2, secret+16, seed);
+    acc.high64 ^= XXH_readLE64(input_1) + XXH_readLE64(input_1 + 8);  /* cross-feed: high half also sees input_1 */
+    return acc;
+}
+
+
+XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t
+XXH3_len_17to128_128b(const xxh_u8* XXH_RESTRICT input, size_t len,
+                      const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
+                      XXH64_hash_t seed)
+{   /* 128-bit hash of a 17..128 byte input; consumes up to 128 bytes of `secret`. */
+    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
+    XXH_ASSERT(16 < len && len <= 128);
+
+    {   XXH128_hash_t acc;
+        acc.low64 = len * XXH_PRIME64_1;
+        acc.high64 = 0;
+
+#if XXH_SIZE_OPT >= 1
+        {
+            /* Smaller, but slightly slower. Same pairs as the unrolled branch below, walked from i = (len-1)/32 down to 0. */
+            unsigned int i = (unsigned int)(len - 1) / 32;
+            do {
+                acc = XXH128_mix32B(acc, input+16*i, input+len-16*(i+1), secret+32*i, seed);
+            } while (i-- != 0);
+        }
+#else
+        if (len > 32) {  /* each step pairs 16 bytes counted from the front with 16 bytes counted from the back */
+            if (len > 64) {
+                if (len > 96) {
+                    acc = XXH128_mix32B(acc, input+48, input+len-64, secret+96, seed);
+                }
+                acc = XXH128_mix32B(acc, input+32, input+len-48, secret+64, seed);
+            }
+            acc = XXH128_mix32B(acc, input+16, input+len-32, secret+32, seed);
+        }
+        acc = XXH128_mix32B(acc, input, input+len-16, secret, seed);  /* always executed: first and last 16 bytes */
+#endif
+        {   XXH128_hash_t h128;
+            h128.low64  = acc.low64 + acc.high64;
+            h128.high64 = (acc.low64    * XXH_PRIME64_1)
+                        + (acc.high64   * XXH_PRIME64_4)
+                        + ((len - seed) * XXH_PRIME64_2);
+            h128.low64  = XXH3_avalanche(h128.low64);
+            h128.high64 = (XXH64_hash_t)0 - XXH3_avalanche(h128.high64);  /* high half is the negated avalanche */
+            return h128;
+        }
+    }
+}
+
+XXH_NO_INLINE XXH_PUREF XXH128_hash_t
+XXH3_len_129to240_128b(const xxh_u8* XXH_RESTRICT input, size_t len,
+                       const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
+                       XXH64_hash_t seed)
+{   /* 128-bit hash of an input of 129..XXH3_MIDSIZE_MAX bytes, processed in 32-byte steps. */
+    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
+    XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX);
+
+    {   XXH128_hash_t acc;
+        unsigned i;
+        acc.low64 = len * XXH_PRIME64_1;
+        acc.high64 = 0;
+        /*
+         *  We set `i` as offset + 32. We do this so that unchanged
+         * `len` can be used as upper bound. This reaches a sweet spot
+         * where both x86 and aarch64 get simple agen and good codegen
+         * for the loop.
+         */
+        for (i = 32; i < 160; i += 32) {  /* first 128 bytes, four 32-byte blocks, secret offsets 0..127 */
+            acc = XXH128_mix32B(acc,
+                                input  + i - 32,
+                                input  + i - 16,
+                                secret + i - 32,
+                                seed);
+        }
+        acc.low64 = XXH3_avalanche(acc.low64);
+        acc.high64 = XXH3_avalanche(acc.high64);
+        /*
+         * NB: `i <= len` will duplicate the last 32-bytes if
+         * len % 32 was zero. This is an unfortunate necessity to keep
+         * the hash result stable.
+         */
+        for (i=160; i <= len; i += 32) {  /* remaining whole 32-byte blocks; secret restarts at XXH3_MIDSIZE_STARTOFFSET */
+            acc = XXH128_mix32B(acc,
+                                input + i - 32,
+                                input + i - 16,
+                                secret + XXH3_MIDSIZE_STARTOFFSET + i - 160,
+                                seed);
+        }
+        /* last bytes: final 32 bytes of input, with the two 16-byte halves passed in swapped order and the seed negated */
+        acc = XXH128_mix32B(acc,
+                            input + len - 16,
+                            input + len - 32,
+                            secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET - 16,
+                            (XXH64_hash_t)0 - seed);
+
+        {   XXH128_hash_t h128;
+            h128.low64  = acc.low64 + acc.high64;
+            h128.high64 = (acc.low64    * XXH_PRIME64_1)
+                        + (acc.high64   * XXH_PRIME64_4)
+                        + ((len - seed) * XXH_PRIME64_2);
+            h128.low64  = XXH3_avalanche(h128.low64);
+            h128.high64 = (XXH64_hash_t)0 - XXH3_avalanche(h128.high64);  /* high half is the negated avalanche */
+            return h128;
+        }
+    }
+}
+
+XXH_FORCE_INLINE XXH128_hash_t
+XXH3_hashLong_128b_internal(const void* XXH_RESTRICT input, size_t len,
+                            const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
+                            XXH3_f_accumulate f_acc,
+                            XXH3_f_scrambleAcc f_scramble)
+{   /* Long-input core: run the accumulator loop over `input`, then merge the accumulators twice to form the two 64-bit halves. */
+    XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[XXH_ACC_NB] = XXH3_INIT_ACC;
+
+    XXH3_hashLong_internal_loop(acc, (const xxh_u8*)input, len, secret, secretSize, f_acc, f_scramble);
+
+    /* converge into final hash */
+    XXH_STATIC_ASSERT(sizeof(acc) == 64);
+    XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
+    {   XXH128_hash_t h128;
+        h128.low64  = XXH3_mergeAccs(acc,  /* low half: secret window starting XXH_SECRET_MERGEACCS_START bytes from the front */
+                                     secret + XXH_SECRET_MERGEACCS_START,
+                                     (xxh_u64)len * XXH_PRIME64_1);
+        h128.high64 = XXH3_mergeAccs(acc,  /* high half: sizeof(acc)-byte window ending XXH_SECRET_MERGEACCS_START bytes before the end */
+                                     secret + secretSize
+                                            - sizeof(acc) - XXH_SECRET_MERGEACCS_START,
+                                     ~((xxh_u64)len * XXH_PRIME64_2));
+        return h128;
+    }
+}
+
+/*
+ * It's important for performance that XXH3_hashLong() is not inlined.
+ */
+XXH_NO_INLINE XXH_PUREF XXH128_hash_t
+XXH3_hashLong_128b_default(const void* XXH_RESTRICT input, size_t len,
+                           XXH64_hash_t seed64,
+                           const void* XXH_RESTRICT secret, size_t secretLen)
+{   /* Long-input hash with the built-in secret; signature matches XXH3_hashLong128_f. */
+    (void)seed64; (void)secret; (void)secretLen;  /* all three are ignored: XXH3_kSecret is always used */
+    return XXH3_hashLong_128b_internal(input, len, XXH3_kSecret, sizeof(XXH3_kSecret),
+                                       XXH3_accumulate, XXH3_scrambleAcc);
+}
+
+/*
+ * It's important for performance to pass @p secretLen (when it's static)
+ * to the compiler, so that it can properly optimize the vectorized loop.
+ *
+ * When the secret size is unknown, or on GCC 12 where the mix of NO_INLINE and FORCE_INLINE
+ * breaks -Og, this is XXH_NO_INLINE.
+ */
+XXH3_WITH_SECRET_INLINE XXH128_hash_t
+XXH3_hashLong_128b_withSecret(const void* XXH_RESTRICT input, size_t len,
+                              XXH64_hash_t seed64,
+                              const void* XXH_RESTRICT secret, size_t secretLen)
+{   /* Long-input hash keyed only by the caller-provided secret. */
+    (void)seed64;  /* the seed is ignored on this path */
+    return XXH3_hashLong_128b_internal(input, len, (const xxh_u8*)secret, secretLen,
+                                       XXH3_accumulate, XXH3_scrambleAcc);
+}
+
+XXH_FORCE_INLINE XXH128_hash_t
+XXH3_hashLong_128b_withSeed_internal(const void* XXH_RESTRICT input, size_t len,
+                                XXH64_hash_t seed64,
+                                XXH3_f_accumulate f_acc,
+                                XXH3_f_scrambleAcc f_scramble,
+                                XXH3_f_initCustomSecret f_initSec)
+{   /* Long-input hash for a seed: derives a temporary secret from seed64 unless the seed is 0. */
+    if (seed64 == 0)  /* seed 0: use XXH3_kSecret directly, no secret derivation needed */
+        return XXH3_hashLong_128b_internal(input, len,
+                                           XXH3_kSecret, sizeof(XXH3_kSecret),
+                                           f_acc, f_scramble);
+    {   XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];  /* stack buffer filled by f_initSec(secret, seed64) */
+        f_initSec(secret, seed64);
+        return XXH3_hashLong_128b_internal(input, len, (const xxh_u8*)secret, sizeof(secret),
+                                           f_acc, f_scramble);
+    }
+}
+
+/*
+ * It's important for performance that XXH3_hashLong is not inlined.
+ */
+XXH_NO_INLINE XXH128_hash_t
+XXH3_hashLong_128b_withSeed(const void* input, size_t len,
+                            XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen)
+{   /* Seeded long-input hash; binds the default accumulate/scramble/initCustomSecret routines. */
+    (void)secret; (void)secretLen;  /* unused here; the parameters mirror the XXH3_hashLong128_f signature */
+    return XXH3_hashLong_128b_withSeed_internal(input, len, seed64,
+                XXH3_accumulate, XXH3_scrambleAcc, XXH3_initCustomSecret);
+}
+
+typedef XXH128_hash_t (*XXH3_hashLong128_f)(const void* XXH_RESTRICT, size_t,
+                                            XXH64_hash_t, const void* XXH_RESTRICT, size_t);  /* called as (input, len, seed64, secret, secretLen) by XXH3_128bits_internal */
+
+XXH_FORCE_INLINE XXH128_hash_t
+XXH3_128bits_internal(const void* input, size_t len,
+                      XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen,
+                      XXH3_hashLong128_f f_hl128)
+{   /* Length-class dispatcher shared by the one-shot 128-bit entry points. */
+    XXH_ASSERT(secretLen >= XXH3_SECRET_SIZE_MIN);
+    /*
+     * If an action is to be taken if `secret` conditions are not respected,
+     * it should be done here.
+     * For now, it's a contract pre-condition.
+     * Adding a check and a branch here would cost performance at every hash.
+     */
+    if (len <= 16)
+        return XXH3_len_0to16_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, seed64);
+    if (len <= 128)
+        return XXH3_len_17to128_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64);
+    if (len <= XXH3_MIDSIZE_MAX)
+        return XXH3_len_129to240_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64);
+    return f_hl128(input, len, seed64, secret, secretLen);  /* len > XXH3_MIDSIZE_MAX: the only path that calls f_hl128 */
+}
+
+
+/* ===   Public XXH128 API   === */
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(XXH_NOESCAPE const void* input, size_t len)
+{   /* One-shot 128-bit hash with seed 0 and the built-in XXH3_kSecret. */
+    return XXH3_128bits_internal(input, len, 0,
+                                 XXH3_kSecret, sizeof(XXH3_kSecret),
+                                 XXH3_hashLong_128b_default);
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH128_hash_t
+XXH3_128bits_withSecret(XXH_NOESCAPE const void* input, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize)
+{   /* One-shot 128-bit hash keyed by the caller's secret for every length class; the seed is fixed at 0. */
+    return XXH3_128bits_internal(input, len, 0,
+                                 (const xxh_u8*)secret, secretSize,
+                                 XXH3_hashLong_128b_withSecret);
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH128_hash_t
+XXH3_128bits_withSeed(XXH_NOESCAPE const void* input, size_t len, XXH64_hash_t seed)
+{   /* One-shot seeded 128-bit hash: built-in secret for short/mid inputs, XXH3_hashLong_128b_withSeed for long ones. */
+    return XXH3_128bits_internal(input, len, seed,
+                                 XXH3_kSecret, sizeof(XXH3_kSecret),
+                                 XXH3_hashLong_128b_withSeed);
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH128_hash_t
+XXH3_128bits_withSecretandSeed(XXH_NOESCAPE const void* input, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed)
+{   /* Inputs up to XXH3_MIDSIZE_MAX use `seed` with the built-in secret; longer inputs use the caller's secret. */
+    if (len <= XXH3_MIDSIZE_MAX)  /* f_hl128 may be NULL: XXH3_128bits_internal only calls it for len > XXH3_MIDSIZE_MAX */
+        return XXH3_128bits_internal(input, len, seed, XXH3_kSecret, sizeof(XXH3_kSecret), NULL);
+    return XXH3_hashLong_128b_withSecret(input, len, seed, secret, secretSize);  /* this callee ignores `seed` */
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH128_hash_t
+XXH128(XXH_NOESCAPE const void* input, size_t len, XXH64_hash_t seed)
+{   /* Plain forwarder to XXH3_128bits_withSeed(). */
+    return XXH3_128bits_withSeed(input, len, seed);
+}
+
+
+/* ===   XXH3 128-bit streaming   === */
+#ifndef XXH_NO_STREAM
+/*
+ * All initialization and update functions are identical to 64-bit streaming variant.
+ * The only difference is the finalization routine.
+ */
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_128bits_reset(XXH_NOESCAPE XXH3_state_t* statePtr)
+{   /* Forwards to the 64-bit reset: both variants use the same XXH3_state_t. */
+    return XXH3_64bits_reset(statePtr);
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_128bits_reset_withSecret(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize)
+{   /* Forwards to the 64-bit reset_withSecret with the same arguments. */
+    return XXH3_64bits_reset_withSecret(statePtr, secret, secretSize);
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_128bits_reset_withSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH64_hash_t seed)
+{   /* Forwards to the 64-bit reset_withSeed with the same arguments. */
+    return XXH3_64bits_reset_withSeed(statePtr, seed);
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_128bits_reset_withSecretandSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed)
+{   /* Forwards to the 64-bit reset_withSecretandSeed with the same arguments. */
+    return XXH3_64bits_reset_withSecretandSeed(statePtr, secret, secretSize, seed);
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_128bits_update(XXH_NOESCAPE XXH3_state_t* state, XXH_NOESCAPE const void* input, size_t len)
+{   /* Input consumption is identical to the 64-bit variant; only the digest step differs. */
+    return XXH3_64bits_update(state, input, len);
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (XXH_NOESCAPE const XXH3_state_t* state)
+{   /* Produces the 128-bit hash of everything fed to `state` so far; `state` is const and is not modified. */
+    const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret;  /* external secret wins when set */
+    if (state->totalLen > XXH3_MIDSIZE_MAX) {
+        XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[XXH_ACC_NB];
+        XXH3_digest_long(acc, state, secret);
+        XXH_ASSERT(state->secretLimit + XXH_STRIPE_LEN >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
+        {   XXH128_hash_t h128;  /* same two merges as XXH3_hashLong_128b_internal, with secretLimit + XXH_STRIPE_LEN in place of secretSize */
+            h128.low64  = XXH3_mergeAccs(acc,
+                                         secret + XXH_SECRET_MERGEACCS_START,
+                                         (xxh_u64)state->totalLen * XXH_PRIME64_1);
+            h128.high64 = XXH3_mergeAccs(acc,
+                                         secret + state->secretLimit + XXH_STRIPE_LEN
+                                                - sizeof(acc) - XXH_SECRET_MERGEACCS_START,
+                                         ~((xxh_u64)state->totalLen * XXH_PRIME64_2));
+            return h128;
+        }
+    }
+    /* len <= XXH3_MIDSIZE_MAX : short code, re-hash state->buffer with the one-shot API */
+    if (state->seed)  /* NOTE(review): a zero seed always takes the secret-based path below, while one-shot XXH3_128bits_withSecretandSeed() uses seed + XXH3_kSecret for these lengths even when seed == 0; confirm both agree after reset_withSecretandSeed(..., 0) */
+        return XXH3_128bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed);
+    return XXH3_128bits_withSecret(state->buffer, (size_t)(state->totalLen),
+                                   secret, state->secretLimit + XXH_STRIPE_LEN);
+}
+#endif /* !XXH_NO_STREAM */
+/* 128-bit utility functions */
+
+#include <string.h>   /* memcmp, memcpy */
+
+/* return : 1 if h1 and h2 are equal, 0 if different */
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2)
+{
+    /* note : XXH128_hash_t is compact, it has no padding byte, so a raw byte comparison is sufficient */
+    return !(memcmp(&h1, &h2, sizeof(h1)));
+}
+
+/* This prototype is compatible with stdlib's qsort(). Both pointers must reference XXH128_hash_t values.
+ * @return : >0 if *h128_1  > *h128_2
+ *           <0 if *h128_1  < *h128_2
+ *           =0 if *h128_1 == *h128_2  */
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API int XXH128_cmp(XXH_NOESCAPE const void* h128_1, XXH_NOESCAPE const void* h128_2)
+{
+    XXH128_hash_t const h1 = *(const XXH128_hash_t*)h128_1;
+    XXH128_hash_t const h2 = *(const XXH128_hash_t*)h128_2;
+    int const hcmp = (h1.high64 > h2.high64) - (h2.high64 > h1.high64);  /* -1, 0 or +1; high64 is the most significant key */
+    /* note : bets that, in most cases, hash values are different */
+    if (hcmp) return hcmp;
+    return (h1.low64 > h2.low64) - (h2.low64 > h1.low64);  /* tie on high64: order by low64 */
+}
+
+
+/*======   Canonical representation   ======*/
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API void
+XXH128_canonicalFromHash(XXH_NOESCAPE XXH128_canonical_t* dst, XXH128_hash_t hash)
+{   /* Writes `hash` to `dst` as 16 bytes: high64 first, then low64. */
+    XXH_STATIC_ASSERT(sizeof(XXH128_canonical_t) == sizeof(XXH128_hash_t));
+    if (XXH_CPU_LITTLE_ENDIAN) {  /* byte-swap both halves so the stored bytes do not depend on host endianness */
+        hash.high64 = XXH_swap64(hash.high64);
+        hash.low64  = XXH_swap64(hash.low64);
+    }
+    XXH_memcpy(dst, &hash.high64, sizeof(hash.high64));
+    XXH_memcpy((char*)dst + sizeof(hash.high64), &hash.low64, sizeof(hash.low64));
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH128_hash_t
+XXH128_hashFromCanonical(XXH_NOESCAPE const XXH128_canonical_t* src)
+{   /* Inverse of XXH128_canonicalFromHash(): rebuilds the hash from its 16-byte canonical form. */
+    XXH128_hash_t h;
+    h.high64 = XXH_readBE64(src);              /* canonical bytes 0..7 */
+    h.low64  = XXH_readBE64(src->digest + 8);  /* canonical bytes 8..15 */
+    return h;
+}
+
+
+
+/* ==========================================
+ * Secret generators
+ * ==========================================
+ */
+#define XXH_MIN(x, y) (((x) > (y)) ? (y) : (x))  /* smaller of x and y; one of the arguments is evaluated twice */
+
+XXH_FORCE_INLINE void XXH3_combine16(void* dst, XXH128_hash_t h128)
+{   /* XORs h128 into the 16 bytes at dst, using little-endian 64-bit reads and writes. */
+    XXH_writeLE64( dst, XXH_readLE64(dst) ^ h128.low64 );                     /* bytes 0..7  ^= low64  */
+    XXH_writeLE64( (char*)dst+8, XXH_readLE64((char*)dst+8) ^ h128.high64 );  /* bytes 8..15 ^= high64 */
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_generateSecret(XXH_NOESCAPE void* secretBuffer, size_t secretSize, XXH_NOESCAPE const void* customSeed, size_t customSeedSize)
+{   /* Fills secretBuffer[0..secretSize) from customSeed; returns XXH_OK, or XXH_ERROR on invalid arguments when asserts are disabled. */
+#if (XXH_DEBUGLEVEL >= 1)
+    XXH_ASSERT(secretBuffer != NULL);
+    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
+#else
+    /* production mode, assert() are disabled */
+    if (secretBuffer == NULL) return XXH_ERROR;
+    if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR;
+#endif
+
+    if (customSeedSize == 0) {  /* no seed material supplied: fall back to the built-in secret */
+        customSeed = XXH3_kSecret;
+        customSeedSize = XXH_SECRET_DEFAULT_SIZE;
+    }
+#if (XXH_DEBUGLEVEL >= 1)
+    XXH_ASSERT(customSeed != NULL);
+#else
+    if (customSeed == NULL) return XXH_ERROR;  /* non-zero customSeedSize with a NULL pointer */
+#endif
+
+    /* Fill secretBuffer with a copy of customSeed - repeat as needed */
+    {   size_t pos = 0;
+        while (pos < secretSize) {
+            size_t const toCopy = XXH_MIN((secretSize - pos), customSeedSize);  /* final repetition may be truncated */
+            memcpy((char*)secretBuffer + pos, customSeed, toCopy);
+            pos += toCopy;
+    }   }
+
+    {   size_t const nbSeg16 = secretSize / 16;  /* whole 16-byte segments; a trailing partial segment is covered by the "last segment" step */
+        size_t n;
+        XXH128_canonical_t scrambler;
+        XXH128_canonicalFromHash(&scrambler, XXH128(customSeed, customSeedSize, 0));
+        for (n=0; n<nbSeg16; n++) {  /* XOR segment n with XXH128(scrambler, seed = n) */
+            XXH128_hash_t const h128 = XXH128(&scrambler, sizeof(scrambler), n);
+            XXH3_combine16((char*)secretBuffer + n*16, h128);
+        }
+        /* last segment: the final 16 bytes are additionally XORed with the scrambler hash itself */
+        XXH3_combine16((char*)secretBuffer + secretSize - 16, XXH128_hashFromCanonical(&scrambler));
+    }
+    return XXH_OK;
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API void
+XXH3_generateSecret_fromSeed(XXH_NOESCAPE void* secretBuffer, XXH64_hash_t seed)
+{   /* Writes the XXH_SECRET_DEFAULT_SIZE-byte secret produced by XXH3_initCustomSecret(seed) into secretBuffer. */
+    XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];  /* aligned scratch copy; secretBuffer has no alignment requirement here */
+    XXH3_initCustomSecret(secret, seed);
+    XXH_ASSERT(secretBuffer != NULL);
+    memcpy(secretBuffer, secret, XXH_SECRET_DEFAULT_SIZE);  /* secretBuffer must hold at least XXH_SECRET_DEFAULT_SIZE bytes */
+}
+
+
+
+/* Pop our optimization override from above */
+#if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \
+  && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \
+  && defined(__OPTIMIZE__) && XXH_SIZE_OPT <= 0 /* respect -O0 and -Os */
+#  pragma GCC pop_options
+#endif
+
+#endif  /* XXH_NO_LONG_LONG */
+
+#endif  /* XXH_NO_XXH3 */
+
+/*!
+ * @}
+ */
+#endif  /* XXH_IMPLEMENTATION */
+
+
+#if defined (__cplusplus)
+} /* extern "C" */
+#endif
diff --git a/libraries/triedent/programs/CMakeLists.txt b/libraries/triedent/programs/CMakeLists.txt
new file mode 100644
index 000000000..457ca8977
--- /dev/null
+++ b/libraries/triedent/programs/CMakeLists.txt
@@ -0,0 +1,10 @@
+# triedentdb: command-line workload driver (sequential/random writes and reads) built from tdb.cpp.
+add_executable(triedentdb tdb.cpp)
+target_link_libraries(triedentdb PUBLIC Boost::program_options triedent)
+target_include_directories(triedentdb PUBLIC ${Boost_INCLUDE_DIRS})
+set_target_properties(triedentdb PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${ROOT_BINARY_DIR})
+# mermaid: tool that helps maintain, migrate, and repair triedent databases (see its --about text).
+add_executable(mermaid mermaid.cpp)
+target_link_libraries(mermaid PUBLIC Boost::program_options triedent)
+target_include_directories(mermaid PUBLIC ${Boost_INCLUDE_DIRS} )
+set_target_properties(mermaid PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${ROOT_BINARY_DIR})
diff --git a/libraries/triedent/src/mermaid.cpp b/libraries/triedent/programs/mermaid.cpp
similarity index 95%
rename from libraries/triedent/src/mermaid.cpp
rename to libraries/triedent/programs/mermaid.cpp
index 139f1cb58..f498e2908 100644
--- a/libraries/triedent/src/mermaid.cpp
+++ b/libraries/triedent/programs/mermaid.cpp
@@ -20,6 +20,7 @@ int main(int argc, char** argv)
 
       po::options_description desc("Allowed options");
       auto                    opt = desc.add_options();
+      opt("about", "about mermaid");
       opt("help,h", "print this message");
       opt("reset", "reset the database");
       opt("status", "print status of the database");
@@ -41,6 +42,10 @@ int main(int argc, char** argv)
       po::store(po::parse_command_line(argc, argv, desc), vm);
       po::notify(vm);
 
+      if (vm.count("about" )) {
+         std::cout<<"Mermaid helps maintain, migrate, and repair triedent databases\n";
+         return 0;
+      }
       if (vm.count("help"))
       {
          std::cout << desc << "\n";
diff --git a/libraries/triedent/programs/tdb.cpp b/libraries/triedent/programs/tdb.cpp
new file mode 100644
index 000000000..8db773164
--- /dev/null
+++ b/libraries/triedent/programs/tdb.cpp
@@ -0,0 +1,548 @@
+#include <stdlib.h>
+#include <unistd.h>
+#include <boost/format.hpp>
+#include <fstream>
+#include <iostream>
+#include <map>
+#include <random>
+
+#include <boost/program_options/cmdline.hpp>
+#include <boost/program_options/options_description.hpp>
+#include <boost/program_options/parsers.hpp>
+#include <boost/program_options/variables_map.hpp>
+
+#include <triedent/db.hpp>
+
+using namespace std::chrono_literals;
+uint64_t bswap(uint64_t x)
+{  // Reverses the byte order of x (64-bit endianness swap).
+   uint64_t reversed = 0;
+   for (int remaining = 8; remaining > 0; --remaining, x >>= 8)
+      reversed = (reversed << 8) | (x & 0xFF);  // peel the lowest byte of x, push it onto the result
+   return reversed;
+}
+
+int64_t rand64()
+{  // Returns 64 pseudo-random bits built from two mt19937 draws (one shifted into the high half, one in the low half).
+   thread_local static std::mt19937 gen(rand());  // one engine per thread, seeded from rand() on first use
+   return uint64_t(gen()) << 32 | gen();  // NOTE(review): the language does not fix which gen() call runs first
+}
+
+// Formats `s` as a decimal string with ',' separating each group of three digits,
+// e.g. 999 -> "999", 1234 -> "1,234", 1234567890 -> "1,234,567,890".
+// Works for the full uint64_t range (there is no magnitude at which grouping stops).
+std::string add_comma(uint64_t s)
+{
+   std::string digits = std::to_string(s);
+   // Insert separators right-to-left so that an insertion never shifts the
+   // positions still to be visited; `pos` always indexes into the untouched
+   // leading digits.
+   for (size_t pos = digits.size(); pos > 3;)
+   {
+      pos -= 3;
+      digits.insert(pos, 1, ',');
+   }
+   return digits;
+}
+
+int main(int argc, char** argv)
+{
+   triedent::set_current_thread_name("main");
+   TRIEDENT_WARN("Hello, Welcome to Triedent!");
+   namespace po            = boost::program_options;
+   uint32_t    hot_page_c  = 34;
+   uint32_t    warm_page_c = 33;
+   uint32_t    cool_page_c = 35;
+   uint32_t    cold_page_c = 35;
+   uint64_t    num_objects = 500 * 1000 * 1000;
+   std::string db_dir;
+   bool        use_string = false;
+   uint64_t    insert_count;
+   uint64_t    status_count;
+   bool        check_content = false;
+   uint32_t    rounds        = 10;
+   uint32_t    count         = 1000 * 1000 * 10;
+   uint32_t    group         = 16;
+   uint32_t    sync_mode     = 0;
+   bool        cor           = true;
+   bool        run_compactor = true;
+   bool        run_validate  = false;
+
+   uint32_t                num_read_threads = 6;
+   po::options_description desc("Allowed options");
+   auto                    opt = desc.add_options();
+   opt("help,h", "print this message");
+   opt("reset", "reset the database");
+   opt("seq-write", "perform seq writes");
+   opt("seq-read", "perform seq reads");
+   opt("seq-update", "perform seq updates, assumes after preform seq writes");
+   opt("rand-write", "perform random writes");
+   opt("rand-write-read", "perform random writes while reading");
+   opt("read-only", "just query existing db");
+   opt("sparce", po::value<bool>(&use_string)->default_value(false), "use sparse string keys");
+   opt("compact", po::value<bool>(&run_compactor)->default_value(true),
+       "enable/disable background compactor, will compact between rounds instead");
+   opt("validate", po::value<bool>(&run_validate)->default_value(false),
+       "enable/disable state validation between rounds");
+   opt("data-dir", po::value<std::string>(&db_dir)->default_value("./big.dir"),
+       "the folder that contains the database");
+   opt("read-threads,r", po::value<uint32_t>(&num_read_threads)->default_value(6),
+       "number of read threads to launch");
+   opt("sync-mode", po::value<uint32_t>(&sync_mode)->default_value(sync_mode),
+       "0 = none, 1 = aysnc, 2 = blocking");
+   opt("cache-on-read", po::value<bool>(&cor)->default_value(cor),
+       "copy read objects to cache, higher");
+   opt("rounds", po::value<uint32_t>(&rounds)->default_value(10),
+       "the number of times to run each segment");
+   opt("count", po::value<uint32_t>(&count)->default_value(count),
+       "the number of times to run each round");
+   opt("group", po::value<uint32_t>(&group)->default_value(group),
+       "the number of items in each logical transaction");
+   opt("hot-size,H", po::value<uint32_t>(&hot_page_c)->default_value(33),
+       "the power of 2 for the amount of RAM for the hot ring, RAM = 2^(hot_size) bytes");
+   opt("warm-size,w", po::value<uint32_t>(&warm_page_c)->default_value(33),
+       "the power of 2 for the amount of RAM for the warm ring, RAM = 2^(warm_size) bytes");
+   opt("cool-size,c", po::value<uint32_t>(&cool_page_c)->default_value(33),
+       "the power of 2 for the amount of RAM for the cool ring, RAM = 2^(cool_size) bytes");
+   opt("cold-size,C", po::value<uint32_t>(&cold_page_c)->default_value(33),
+       "the power of 2 for the amount of RAM for the cold ring, RAM = 2^(cold_size) bytes");
+   opt("max-objects,O", po::value<uint64_t>(&num_objects)->default_value(num_objects),
+       "the maximum number of unique objects in the database");
+   opt("insert,i", po::value<uint64_t>(&insert_count)->default_value(100000000ull),
+       "the number of random key/value pairs to insert");
+   opt("stat,s", po::value<uint64_t>(&status_count)->default_value(1000000ull),
+       "the number of how often to print stats");
+   opt("check-content", po::bool_switch(&check_content), "check content against std::map (slow)");
+
+   po::variables_map vm;
+   po::store(po::parse_command_line(argc, argv, desc), vm);
+   po::notify(vm);
+
+   if (vm.count("help"))
+   {
+      std::cerr << desc << "\n";
+      return 1;
+   }
+
+   if (vm.count("reset"))
+   {
+      std::cerr << "resetting database\n";
+      std::filesystem::remove_all(db_dir);
+      triedent::database::create(db_dir, {});
+   }
+   bool read_only = false;
+   if (vm.count("read-only"))
+   {
+      read_only = true;
+   }
+
+   if (num_read_threads > 64)
+   {
+      std::cerr << "maximum number of read threads is 64\n";
+      return 0;
+   }
+
+   triedent::DB::Options options{.config = {.cache_on_read      = cor,
+                                            .run_compact_thread = run_compactor,
+                                            .sync_mode          = (triedent::sync_type)sync_mode
+
+                                 }};
+
+   std::cout << "opening database '" << db_dir << "'\n";
+   auto  db = triedent::DB::open(options, db_dir);
+   auto& ws = db->writeSession();
+
+   //   uint32_t count = 1000 * 1000 * 10;
+   int64_t key = 0;
+
+   /*
+   for( int i = 0; i  < 7; ++i ) {
+      key++;
+      auto wt    = ws.startTransaction();
+      auto old_size = wt->put(std::span<char>((char*)&key, 8), std::span<char>((char*)&key, 8));
+      wt->commit();
+      std::cout << "-------------\n";
+   }
+   std::cout << "=================\n";
+
+   {
+      auto rs = db->createReadSession();
+      auto rt = rs->startTransaction();
+
+      for( int i = 0; i  < 2; ++i ) {
+         key++;
+         auto wt    = ws.startTransaction();
+         auto old_size = wt->put(std::span<char>((char*)&key, 8), std::span<char>((char*)&key, 8));
+         wt->commit();
+         std::cout << "-------------\n";
+      }
+      std::cout << "read session going away\n";
+   }
+   std::cout << "write session going away\n";
+
+      return 0;
+      */
+
+   if (vm.count("seq-write"))
+   {
+      std::cout << "Starting to insert " << rounds << " rounds of " << add_comma(count)
+                << " sequential key/values\n";
+      for (uint32_t round = 0; round < rounds; ++round)
+      {
+         auto start = std::chrono::steady_clock::now();
+
+         for (uint32_t g = 0; g < (count / group); ++g)
+         {
+            auto wt = ws.startTransaction();
+
+            for (uint32_t i = 0; i < group; ++i)
+            {
+               ++key;
+               auto kv = bswap(key);
+               auto old_size =
+                   wt->put(std::span<char>((char*)&kv, 8), std::span<char>((char*)&key, 8));
+               if (old_size != -1)
+               {
+                  std::cerr << "this should be a new value! : " << old_size << "\n";
+                  return 0;
+               }
+            }
+            wt->commit();
+         }
+
+         auto end   = std::chrono::steady_clock::now();
+         auto delta = end - start;
+
+         std::cerr << std::setw(4) << round << std::setw(12)
+                   << add_comma(int64_t(
+                          count /
+                          (std::chrono::duration<double, std::milli>(delta).count() / 1000)))
+                   << " items/sec   \n";
+      }
+   }
+   if (vm.count("seq-read"))
+   {
+      std::cout << "Starting to get" << rounds << " rounds of " << add_comma(count)
+                << " sequential key/values\n";
+      auto rs = db->createReadSession();
+      auto rt = rs->startTransaction();
+      key     = 0;
+      std::vector<char> result;
+      for (uint32_t round = 0; round < rounds; ++round)
+      {
+         auto start = std::chrono::steady_clock::now();
+
+         for (uint32_t i = 0; i < count; ++i)
+         {
+            ++key;
+            //auto kv = key;//bswap(key);
+            auto kv    = bswap(key);
+            auto found = rt->get(std::span<char>((char*)&kv, 8), &result);
+            if (8 != result.size())  // not found.ok)
+            {
+               std::cerr << "unable to find key: " << key << "\n";
+               return 0;
+            }
+            else
+            {
+               if (key != *((int64_t*)(result.data())))
+               {
+                  // std::cerr << "value didn't match expected\n";
+                  // return 0;
+               }
+            }
+            result.resize(0);
+         }
+
+         auto end   = std::chrono::steady_clock::now();
+         auto delta = end - start;
+
+         std::cerr << std::setw(4) << round << std::setw(12)
+                   << add_comma(int64_t(
+                          count /
+                          (std::chrono::duration<double, std::milli>(delta).count() / 1000)))
+                   << " items/sec   \n";
+      }
+   }
+   if (vm.count("seq-update"))
+   {
+      std::cout << "Starting to update " << rounds << " rounds of " << add_comma(count)
+                << " sequential key/values\n";
+      key = 0;
+      for (uint32_t round = 0; round < rounds; ++round)
+      {
+         auto start = std::chrono::steady_clock::now();
+         auto wt    = ws.startTransaction();
+
+         for (uint32_t i = 0; i < count; ++i)
+         {
+            //   std::cerr<<i <<"           \r";
+            ++key;
+            auto    kv  = bswap(key);
+            int64_t val = -key;
+            auto    old_size =
+                wt->put(std::span<char>((char*)&kv, 8), std::span<char>((char*)&val, 8));
+
+            if (old_size != 8)
+            {
+               std::cerr << "unable to find old value! " << old_size << "  " << key << "\n";
+               return 0;
+            }
+         }
+
+         wt->commit();
+         auto end   = std::chrono::steady_clock::now();
+         auto delta = end - start;
+
+         std::cerr << std::setw(4) << round << std::setw(12)
+                   << add_comma(int64_t(
+                          count /
+                          (std::chrono::duration<double, std::milli>(delta).count() / 1000)))
+                   << " items/sec   \n";
+      }
+   }
+   if (vm.count("rand-write"))
+   {
+      std::cout << "Starting to insert " << rounds << " rounds of " << add_comma(count)
+                << " random key/values\n";
+      key = 0;
+      for (uint32_t round = 0; round < rounds; ++round)
+      {
+         auto start = std::chrono::steady_clock::now();
+         for (uint32_t g = 0; g < (count / group); ++g)
+         {
+            auto wt = ws.startTransaction();
+
+            for (uint32_t i = 0; i < group; ++i)
+            {
+               key         = rand64();
+               int64_t val = 16 * g + i;
+               auto    old_size =
+                   wt->put(std::span<char>((char*)&key, 8), std::span<char>((char*)&val, 8));
+            }
+            wt->commit();
+         }
+         auto end   = std::chrono::steady_clock::now();
+         auto delta = end - start;
+
+         std::cerr << std::setw(4) << round << std::setw(12)
+                   << add_comma(int64_t(
+                          count /
+                          (std::chrono::duration<double, std::milli>(delta).count() / 1000)))
+                   << " items/sec   \n";
+         if (not run_compactor)
+         {
+            ws.validate();
+            while (db->compact())
+               ;
+            ws.validate();
+         }
+         else
+         {
+            ws.validate();
+         }
+      }
+      using namespace std::chrono_literals;
+      /*
+      db->print();
+      std::cerr<< "compact one\n";
+      for( uint32_t i = 0; i < 30; ++i ) {
+      db->compact();
+      }
+      db->print();
+      std::cerr<< "\nsleeping for 3 seconds... so compact can work\n\n";
+            std::this_thread::sleep_for(3000ms);
+      */
+   }
+   if (0)
+   {
+      auto rs = db->createReadSession();
+      auto rt = rs->startTransaction();
+      std::cout << "Starting to find lower bound " << rounds << " rounds of " << add_comma(count)
+                << " random key/values\n";
+      std::vector<char> result_key;
+      std::vector<char> result_val;
+      key = 0;
+      for (uint32_t round = 0; round < rounds; ++round)
+      {
+         auto start = std::chrono::steady_clock::now();
+
+         for (uint32_t i = 0; i < count; ++i)
+         {
+            key         = rand64();
+            int64_t val = i;
+            rt->get_greater_equal(std::span<const char>((const char*)&key, 8), &result_key,
+                                  &result_val);
+         }
+
+         auto end   = std::chrono::steady_clock::now();
+         auto delta = end - start;
+
+         std::cerr << std::setw(4) << round << std::setw(12)
+                   << add_comma(int64_t(
+                          count /
+                          (std::chrono::duration<double, std::milli>(delta).count() / 1000)))
+                   << " items/sec   \n";
+      }
+
+      /*
+   std::cout << "Starting to find lower bound " << rounds << " rounds of " << add_comma(count)
+             << " random key/values in " << num_read_threads << " threads\n";
+
+   for (uint32_t round = 0; round < rounds; ++round)
+   {
+      std::vector<std::unique_ptr<std::thread>> rthreads;
+      rthreads.reserve(num_read_threads);
+
+      auto start = std::chrono::steady_clock::now();
+
+      for (uint32_t i = 0; i < num_read_threads; ++i)
+      {
+         auto read_loop = [&]()
+         {
+            auto rs = db->createReadSession();
+            auto rt = rs->startTransaction();
+
+            std::vector<char> result_key;
+            std::vector<char> result_val;
+            key = 0;
+
+            for (uint32_t i = 0; i < count; ++i)
+            {
+               key         = rand64();
+               int64_t val = i;
+               rt->get_greater_equal(std::span<const char>((const char*)&key, 8), &result_key,
+                                     &result_val);
+            }
+         };
+         rthreads.emplace_back(new std::thread(read_loop));
+      }
+
+      for (auto& r : rthreads)
+         r->join();
+
+      auto end   = std::chrono::steady_clock::now();
+      auto delta = end - start;
+      std::cerr << std::setw(4) << round << std::setw(12)
+                << add_comma(
+                       int64_t((num_read_threads * count) /
+                               (std::chrono::duration<double, std::milli>(delta).count() / 1000)))
+                << " items/sec   \n";
+   }
+   */
+   }
+
+   if (vm.count("rand-write-read"))
+   {
+      auto rs = db->createReadSession();
+      auto rt = rs->startTransaction();
+
+      std::cout << "Starting to find lower bound " << rounds << " rounds of " << add_comma(count)
+                << " random key/values in " << num_read_threads << " threads while writing\n";
+
+      uint64_t total_writes = 0;
+      for (uint32_t round = 0; round < rounds; ++round)
+      {
+         std::vector<std::unique_ptr<std::thread>> rthreads;
+         rthreads.reserve(num_read_threads);
+
+         auto             start = std::chrono::steady_clock::now();
+         std::atomic<int> done  = 0;
+
+         for (uint32_t i = 0; i < num_read_threads; ++i)
+         {
+            auto read_loop = [&]()
+            {
+               triedent::set_current_thread_name("read");
+               auto lrs = db->createReadSession();
+
+               std::vector<char> result_key;
+               std::vector<char> result_val;
+               uint64_t          key = 0;
+
+               for (uint32_t g = 0; g < group; ++g)
+               {
+                  auto rt = lrs->startTransaction();
+                  for (uint32_t i = 0; i < count / group; ++i)
+                  {
+                     key         = rand64();
+                     int64_t val = g * (count / group) + i;
+                     rt->get_greater_equal(std::span<const char>((const char*)&key, 8), &result_key,
+                                           &result_val);
+                  }
+               }
+               //   std::this_thread::sleep_for(1000ms);
+               //     rt.reset();
+
+               ++done;
+            };
+            rthreads.emplace_back(new std::thread(read_loop));
+         }
+
+         int64_t writes = 0;
+         while (done.load() < num_read_threads)
+         {
+            if (not read_only)
+            {
+               auto wt = ws.startTransaction();
+               for (uint32_t i = 0; i < group; ++i)
+               {
+                  key         = rand64();
+                  int64_t val = key;
+                  auto    old_size =
+                      wt->put(std::span<char>((char*)&key, 8), std::span<char>((char*)&val, 8));
+
+                  ++writes;
+                  ++total_writes;
+                  if (done.load(std::memory_order_relaxed) >= num_read_threads)
+                     break;
+               }
+               wt->commit();
+            } else {
+               std::this_thread::sleep_for(10ms);
+            }
+         }
+
+         auto end   = std::chrono::steady_clock::now();
+         auto delta = end - start;
+         std::cerr << std::setw(4) << round << std::setw(12)
+                   << add_comma(int64_t(
+                          (num_read_threads * count) /
+                          (std::chrono::duration<double, std::milli>(delta).count() / 1000)))
+                   << " read items/sec  ";
+
+         for (auto& r : rthreads)
+            r->join();
+
+         std::cerr << std::setw(12)
+                   << add_comma(int64_t(
+                          (writes) /
+                          (std::chrono::duration<double, std::milli>(delta).count() / 1000)))
+                   << " write items/sec   "
+                   << " items in db: " << add_comma(total_writes) << " \n";
+
+         if (run_validate)
+            ws.validate();
+         if (not run_compactor)
+         {
+            while (db->compact())
+            {
+            }
+         }
+      }
+   }
+
+   /*
+   auto read_loop = [&]( int c ){
+      auto rs = db->createReadSession(); // thread-local access to read db
+
+      while( not done.load( std::memory_order_acquire ) ) {
+           auto rt = rs.startTransaction(); // grabs a snapshot
+
+           rt->
+      }
+   };
+   */
+
+   return 0;
+}
diff --git a/libraries/triedent/src/cache_allocator.cpp b/libraries/triedent/src/cache_allocator.cpp
index 1c7348725..623217810 100644
--- a/libraries/triedent/src/cache_allocator.cpp
+++ b/libraries/triedent/src/cache_allocator.cpp
@@ -21,20 +21,13 @@ namespace triedent
              [this]()
              {
                 thread_name("swap");
-#ifndef __APPLE__
-                pthread_setname_np(pthread_self(), "swap");
-#else // if __APPLE__
-                pthread_setname_np("swap");
-#endif
+                set_current_thread_name( "swap" );
                 swap_loop();
              });
+
          _gc_thread = std::thread{[this]
                                   {
-#ifndef __APPLE__
-                                    pthread_setname_np(pthread_self(), "swap");
-#else // if __APPLE__
-                                    pthread_setname_np("swap");
-#endif
+                                     set_current_thread_name( "gc" );
                                      _gc.run(&_done);
                                   }};
       }
@@ -63,8 +56,8 @@ namespace triedent
 
    bool cache_allocator::swap(gc_session& session)
    {
-      constexpr uint64_t      target     = 1024 * 1024 * 40ull;
-      constexpr std::uint64_t min_target = 1024 * 1024 * 33ull;
+      constexpr uint64_t      target     = 1024 * 1024 * 256ull;
+      constexpr std::uint64_t min_target = 1024 * 1024 * 128ull;
       bool                    did_work   = false;
       auto                    do_swap    = [&](auto& from, auto& to)
       {
@@ -82,6 +75,8 @@ namespace triedent
             //
             if (auto lock = _obj_ids.lock({.id = o->id}, loc))
             {
+               // note swap will fail on lock contention, will not wait for
+               // free space at the next level down.
                void* p = to.try_allocate(sl, lock.get_id(), o->size,
                                          [&](void* ptr, object_location newloc)
                                          {
diff --git a/libraries/triedent/src/database.cpp b/libraries/triedent/src/database.cpp
index 8f081c49c..692f4ae2d 100644
--- a/libraries/triedent/src/database.cpp
+++ b/libraries/triedent/src/database.cpp
@@ -1,5 +1,6 @@
 #include <triedent/database.hpp>
 #include <triedent/debug.hpp>
+#include <triedent/file_fwd.hpp>
 
 namespace triedent
 {
@@ -7,9 +8,7 @@ namespace triedent
                       const config&                cfg,
                       access_mode                  mode,
                       bool                         allow_gc)
-       : _ring{dir / "data", cfg, mode, allow_gc},
-         _file{dir / "db", mode},
-         _root_release_session{_ring}
+       : _sega{dir}, _file{dir / "db", mode}, _root_release_session{_sega}, _config(cfg)
    {
       if (_file.size() == 0)
       {
@@ -27,6 +26,9 @@ namespace triedent
          throw std::runtime_error("Not a triedent file: " + (dir / "db").native());
       if ((_dbm->flags & file_type_mask) != file_type_database_root)
          throw std::runtime_error("Not a triedent db file: " + (dir / "db").native());
+      if( cfg.run_compact_thread )
+         _sega.start_compact_thread();
+
    }
 
    database::database(const std::filesystem::path& dir, access_mode mode, bool allow_gc)
@@ -34,7 +36,9 @@ namespace triedent
    {
    }
 
-   database::~database() {}
+   database::~database() {
+      
+   }
 
    void database::create(std::filesystem::path dir, config cfg)
    {
@@ -44,11 +48,11 @@ namespace triedent
 
       std::filesystem::create_directories(dir / "data");
 
-      (void)database{dir, cfg, access_mode::read_write};
+      std::make_shared<database>(dir, cfg, access_mode::read_write);
    }
 
    void database::print_stats(std::ostream& os, bool detail)
    {
-      _ring.print_stats(os, detail);
+      _sega.dump();
    }
 }  // namespace triedent
diff --git a/libraries/triedent/src/gc_queue.cpp b/libraries/triedent/src/gc_queue.cpp
index 34587cab5..41fad9ad8 100644
--- a/libraries/triedent/src/gc_queue.cpp
+++ b/libraries/triedent/src/gc_queue.cpp
@@ -143,6 +143,7 @@ namespace triedent
    // \post
    // for each index in [start, R):
    //   either U happens before W or P happens before L
+   // \return the sequence of the session with the lowest sequence
    gc_queue::size_type gc_queue::start_wait(size_type start, size_type end)
    {
       std::size_t     lowest_sequence = end;
diff --git a/libraries/triedent/src/mapping.cpp b/libraries/triedent/src/mapping.cpp
index bdab61888..295f4d496 100644
--- a/libraries/triedent/src/mapping.cpp
+++ b/libraries/triedent/src/mapping.cpp
@@ -9,6 +9,8 @@
 #include <sys/stat.h>
 #include <unistd.h>
 
+#include <iostream>
+
 namespace triedent
 {
    namespace
@@ -79,12 +81,15 @@ namespace triedent
          {
             _data = addr;
             try_pin(&_pinned, addr, _size);
+      //      std::cerr<<"madvise random  " << int64_t(addr) <<"   " << _size << " \n";
+      //      madvise(addr, _size, MADV_RANDOM );
          }
          else
          {
             ::close(_fd);
             throw std::system_error{errno, std::generic_category()};
          }
+
       }
    }
 
diff --git a/libraries/triedent/src/seg_allocator.cpp b/libraries/triedent/src/seg_allocator.cpp
new file mode 100644
index 000000000..2e214ab81
--- /dev/null
+++ b/libraries/triedent/src/seg_allocator.cpp
@@ -0,0 +1,522 @@
+#include <triedent/file_fwd.hpp>
+#include <triedent/seg_allocator.hpp>
+
+namespace triedent
+{
+   seg_allocator::seg_allocator(std::filesystem::path dir)
+       : _id_alloc(dir / "ids"),
+         _block_alloc(dir / "segs", segment_size, max_segment_count),
+         _header_file(dir / "header", access_mode::read_write, true)
+   {
+      if (_header_file.size() == 0)  // empty header file: no allocator_header has been written yet
+      {
+         _header_file.resize(round_to_page(sizeof(mapped_memory::allocator_header)));
+         new (_header_file.data()) mapped_memory::allocator_header();  // constructed in place
+      }
+      _header = reinterpret_cast<mapped_memory::allocator_header*>(_header_file.data());
+      // -1ull is the largest value, so get_min_read_ptr() never picks such a slot as the minimum
+      for (auto& sptr : _session_ptrs)
+         sptr.store(-1ull);
+      _done.store(false);  // compact_loop() keeps running while this stays false
+   }
+
+   seg_allocator::~seg_allocator()
+   {
+      _done.store(true);  // ask compact_loop() to exit; it polls _done once per pass
+      if (_compact_thread.joinable())
+         _compact_thread.join();
+      cses.reset();  // only after the join: the compactor thread uses (and re-creates) cses
+   }
+
+   void seg_allocator::start_compact_thread()
+   {
+      // nothing to do when the compactor thread has already been started
+      if (_compact_thread.joinable())
+         return;
+      auto run_compactor = [this]()
+      {
+         thread_name("compactor");
+         set_current_thread_name("compactor");
+         compact_loop();
+      };
+      _compact_thread = std::thread(run_compactor);
+   }
+
+   /**
+    * This must be called via a session because the session is responsible
+    * for documenting what regions could be read
+    *
+    * All objects are const because they cannot be modified after being
+    * written.
+   const object_header* seg_allocator::get_object(object_location loc) const
+   {
+      return nullptr;
+   }
+   const object_header* seg_allocator::get_object(object_id oid) const
+   {
+      return nullptr;
+   }
+    */
+
+   /**
+    *  Meant to run once a segment cannot fit the next object: alloc_ptr is
+    *  set to MAX and the segment becomes read only (steps listed in the body).
+    *  Not implemented yet: the body is empty and the argument is ignored.
+    */
+   void seg_allocator::finalize_segment(segment_number)
+   {
+      /// add maxsegsize - (seg_end-alloc_ptr) to free space
+      /// set seg.alloc_ptr = max
+      /// set seg as read only
+      /// mark seg as random access if average object size is
+      /// less than 2x page size.
+      /// mark seg as seq access if average object size is greater than 1mb
+      /// else mark seg as normal access
+   }
+
+   /**
+    *  Not implemented yet (empty body). Once all data has been removed from a segment:
+    * - madvise free/don't need
+    * - add the segment number to the free segments at allocator_header::end_ptr
+    * - increment allocator_header::end_ptr
+    */
+   void seg_allocator::release_segment(segment_number) {}
+
+   // Runs until _done is set: each pass compacts at most one segment via
+   // compact_next_segment() and backs off for 100ms when nothing qualified.
+   void seg_allocator::compact_loop()
+   {
+      using namespace std::chrono_literals;
+
+      // the compactor works through its own session, created on first use
+      if (not cses)
+         cses.emplace(start_session());
+
+      for (;;)
+      {
+         if (_done.load())
+            break;
+
+         const bool compacted = compact_next_segment();
+         if (compacted)
+            continue;
+
+         // no segment was worth compacting on this pass; wait before
+         // scanning again instead of spinning
+         std::this_thread::sleep_for(100ms);
+      }
+   }
+
+   /**
+    * Compacts the segment with the most free space, provided one qualifies: more
+    * than segment_size / 8 of it must be free and its _alloc_pos must be
+    * uint32_t(-1), which excludes segments still being allocated from and segments
+    * already compacted (compact_segment() leaves _alloc_pos at 0).
+    * @return true when a segment was compacted, false when none qualified
+    */
+   bool seg_allocator::compact_next_segment()
+   {
+      if (not cses)
+         cses.emplace(start_session());
+
+      constexpr uint64_t no_candidate  = -1ull;
+      uint64_t           best_seg_num  = no_candidate;
+      uint64_t           best_seg_free = 0;
+
+      const auto total_segs = _block_alloc.num_blocks();
+      for (uint32_t s = 0; s < total_segs; ++s)
+      {
+         const auto fso = _header->seg_meta[s].get_free_space_and_objs();
+
+         // must beat the best candidate so far and the 1/8-of-a-segment minimum
+         if (fso.first <= best_seg_free or fso.first <= segment_size / 8)
+            continue;
+
+         // skip segments that are still open for allocation or already compacted
+         if (get_segment(s)->_alloc_pos.load(std::memory_order_relaxed) != uint32_t(-1))
+            continue;
+
+         best_seg_num  = s;
+         best_seg_free = fso.first;
+      }
+
+      if (best_seg_num == no_candidate)
+         return false;
+
+      compact_segment(*cses, best_seg_num);
+      return true;
+   }
+
+   void seg_allocator::compact_segment(session& ses, uint64_t seg_num)
+   {
+      auto           state = ses.lock();
+      auto           s     = get_segment(seg_num);
+      auto           send  = (object_header*)((char*)s + segment_size);
+      char*          foc   = (char*)s + sizeof(mapped_memory::segment_header);
+      object_header* foo   = (object_header*)(foc);
+
+      // Compacts segment seg_num. Objects are walked from the start of the
+      // segment via next(); each one that is still referenced and still located
+      // here is copied into the segment ses allocates from and its id is
+      // repointed at the copy. The copies are msync'd, the source segment's header
+      // and metadata are reset and seg_num is posted to free_seg_buffer for reuse.
+      //
+      // Expects (asserted) s->_alloc_pos == segment_offset(-1), the value
+      // compact_next_segment() requires of a segment before selecting it.
+
+      assert(s->_alloc_pos == segment_offset(-1));
+
+      // start_seg_ptr / start_seg_num track the destination segment that still
+      // has to be msync'd. start_seg_ptr may be null on entry (checked below) and
+      // both are refreshed once alloc_data() has moved the session to another
+      // segment.
+      auto start_seg_ptr = ses._alloc_seg_ptr;
+      auto start_seg_num = ses._alloc_seg_num;
+
+      madvise(s, segment_size, MADV_SEQUENTIAL);
+      while (foo < send and foo->id)
+      {
+         // if the object has been deleted, skip it
+         if (foo->check == uint32_t(-1))
+         {
+            foo = foo->next();
+            continue;
+         }
+
+         // skip anything that has been freed
+         // note the ref can go to 0 before foo->check is set to -1
+         auto obj_ref = state.get({foo->id});
+         if (obj_ref.ref_count() == 0)
+         {
+            foo = foo->next();
+            continue;
+         }
+
+         // skip anything that isn't pointing
+         // to foo, it may have been moved *or*
+         // it may have been freed and the id reallocated to
+         // another object. We cannot replace this with obj_ref.obj() == foo
+         // because obj_ref could be pointing to an ID in the free list
+         auto foo_idx     = (char*)foo - (char*)s;
+         auto current_loc = obj_ref.location();
+         if (current_loc._offset != seg_num * segment_size + foo_idx)
+         {
+            foo = foo->next();
+            continue;
+         }
+
+         // attempt to move the object requires a lock because the
+         // object could be modified in place while trying to move it.
+         {
+            // lock the ID to prevent anyone else from moving or modifying it while we copy
+            std::unique_lock ul(obj_ref.get_mutex());
+
+            // reload the atomic variable and check the invariant that it is
+            // still pointing at us after the lock.
+            obj_ref.refresh();
+
+            // foo and s are unchanged, so foo_idx computed above is still the offset to compare
+            auto expect_loc = obj_ref.location()._offset;
+            if ((expect_loc & (segment_size - 1)) != foo_idx or obj_ref.ref_count() == 0)
+            {
+               foo = foo->next();
+               continue;
+            }
+
+            // the object hasn't moved nor has its ref count gone to zero so
+            // we commit to alloc memory and memcpy the data
+            auto obj_size   = foo->object_size();
+            auto [loc, ptr] = ses.alloc_data(obj_size, {foo->id}, foo->get_type());
+            memcpy(ptr, foo, obj_size);
+            // header of the copy. The copy lives in the session's current segment
+            // (ses._alloc_seg_num), not necessarily start_seg_num: alloc_data() may have switched
+            auto moved_foo = ((object_header*)ptr);
+
+            // release() does not grab the lock, so while we were copying the
+            // object may have been released and foo->check set to -1
+            if (moved_foo->check == uint32_t(-1))
+            {
+               // the copy will never be used: report its bytes as free space in the segment
+               // it was allocated in. TODO: investigate resetting the alloc_ptr by -foo->object_size()
+               _header->seg_meta[ses._alloc_seg_num].free_object(foo->object_size());
+            }
+
+            // after moving the data, check to make sure that the checksum is still
+            // valid. This will definitively prove that a clean copy was made.
+            else if (not moved_foo->validate_checksum())
+            {
+               bool source_still_valid = foo->validate_checksum();
+               // if it was invalid it means a modification in place was made without a lock
+               // it could also mean memory corruption in the application and this error
+               // should be raised to the user TODO: how to report errors from the
+               // background process
+               std::cerr << foo->id << ": mv checksum invalid: '" << moved_foo->check << "' src check: "<<foo->check <<" src valid:"<<source_still_valid <<"\n";
+               _header->seg_meta[ses._alloc_seg_num].free_object(foo->object_size());
+            }
+            // try move compare and exchange
+            else if (not obj_ref.move({expect_loc}, loc))
+            {
+               // if it failed because the object was released or moved by
+               // someone else, then note the free space and move on with life
+               _header->seg_meta[ses._alloc_seg_num].free_object(foo->object_size());
+            }
+         }  // end lock scope
+
+         // if ses.alloc_data() was forced to make space in a new segment
+         // then we need to sync() the old write segment before moving forward
+         if (not start_seg_ptr)
+         {
+            start_seg_ptr = ses._alloc_seg_ptr;
+            start_seg_num = ses._alloc_seg_num;
+         }
+         else if (start_seg_ptr != ses._alloc_seg_ptr)
+         {
+            // TODO: only sync from alloc pos at last sync
+            msync(start_seg_ptr, segment_size, MS_SYNC);
+            _header->seg_meta[start_seg_num]._last_sync_pos.store(segment_size,
+                                                                  std::memory_order_relaxed);
+            start_seg_ptr = ses._alloc_seg_ptr;
+            start_seg_num = ses._alloc_seg_num;
+         }
+         foo = foo->next();
+      }
+
+      // in order to maintain the invariant that the segment we just cleared
+      // can be reused, we must make sure that the data we moved out has persisted to
+      // disk.
+      if (start_seg_ptr)
+      {
+         if (-1 == msync(start_seg_ptr, start_seg_ptr->_alloc_pos, MS_SYNC))
+            std::cerr << "msync errorno: " << errno << "\n";
+         // record the sync against the segment that was synced (the destination);
+         // seg_num's own metadata is reset by clear() below
+         _header->seg_meta[start_seg_num]._last_sync_pos.store(start_seg_ptr->_alloc_pos,
+                                                               std::memory_order_relaxed);
+      }
+
+      s->_num_objects = 0;
+      s->_alloc_pos   = 0;
+      s->_age         = -1;
+      // the segment we just cleared, so its free space and objects get reset to 0
+      // and its last_sync pos gets put to the end because there is no need to sync it
+      // because its data has already been synced by the compactor
+      _header->seg_meta[seg_num].clear();
+
+      munlock(s, segment_size);
+      // it is unlikely to be accessed, and if it is don't pre-fetch
+      madvise(s, segment_size, MADV_RANDOM);
+
+      // only one thread can move the end_ptr or this will break
+      // NOTE(review): the slot index is masked with max_session_count here, while get_new_segment()
+      // reads free_seg_buffer[ap] unmasked - confirm both against the buffer's declared size
+      _header->free_seg_buffer[_header->end_ptr.load(std::memory_order_relaxed) & (max_session_count-1)] = seg_num;
+      _header->end_ptr.fetch_add(1, std::memory_order_release);
+      // release order: the slot write above is published before the new end_ptr value
+   }
+
+   /**
+    * The min read pointer, aka min(R*), must be A <= R* <= E.
+    * A, R, and E only ever increase
+    * The last value of this function is stored in _min_read_ptr
+    * (A is alloc_ptr, E is end_ptr and R* are the _session_ptrs entries.)
+    * So long as the last value is greater than A, A can advance without
+    * updating _min_read_ptr; however, if A >= _min_read_ptr then
+    * we want to check all active R* to find the min. If all sessions
+    * are idle, then the min becomes E.
+    */
+   uint64_t seg_allocator::get_min_read_ptr()
+   {
+      auto ap  = _header->alloc_ptr.load(std::memory_order_relaxed);  // A
+      auto ep  = _header->end_ptr.load(std::memory_order_acquire);    // E
+      auto min = _min_read_ptr.load(std::memory_order_acquire);       // result of the previous call
+
+      if (ap >= min)  // then check to see if there is more
+      {
+         min = ep;  // upper bound; stays E when no scanned session holds a lower pointer
+         // find new last min
+         // TODO: only iterate over active sessions instead of all sessions
+         // this is so infrequent it probably doesn't matter.
+         auto fs      = ~_free_sessions.load();  // inverted mask: presumably bit i set = session i in use
+         auto num_ses = std::popcount(fs);       // NOTE(review): num_ses is never read
+         for (uint32_t i = 0; fs and i < max_session_count; ++i)
+         {
+            if (fs & (1ull << i))
+            {
+               if (auto p = _session_ptrs[i].load(std::memory_order_relaxed); p < min)
+               {
+                  min = p;
+               }
+
+               // we can't find anything lower than this
+               if (min == ap)
+               {
+                  _min_read_ptr.store(min, std::memory_order_release);
+                  return min;
+               }
+            }
+         }
+      }
+      if (min > ep)  // never report more than E
+         min = ep;
+      _min_read_ptr.store(min, std::memory_order_release);  // cache for the next call
+      return min;
+   }
+
+   /**
+    *  Hands out a segment for a session to allocate from.
+    *
+    *  While alloc_ptr is below the min read pointer a previously freed
+    *  segment is recycled: alloc_ptr is advanced by exactly one with a
+    *  compare-exchange and the segment number stored at
+    *  free_seg_buffer[old alloc_ptr] is reused. Otherwise a brand new
+    *  segment is claimed from the block allocator.
+    *
+    *  @return the segment number and its freshly constructed segment_header
+    */
+   std::pair<segment_number, mapped_memory::segment_header*> seg_allocator::get_new_segment()
+   {
+      auto ap  = _header->alloc_ptr.load(std::memory_order_relaxed);
+      auto min = get_min_read_ptr();
+
+      auto prepare_segment = [&](segment_number sn)
+      {
+         auto sp = _block_alloc.get(sn);
+         // original intent: pages read back as zero if they happen to be accessed. NOTE(review):
+         // Linux documents MADV_FREE for private anonymous pages only - confirm for this mapping
+         madvise(sp, segment_size, MADV_FREE);
+         madvise(sp, segment_size, MADV_RANDOM);
+         auto      r           = mlock(sp, segment_size);
+         const int mlock_errno = errno;  // captured before the stream calls can overwrite it
+         if (r)
+            std::cerr << "MLOCK: " << r << "  errno: " << mlock_errno << "\n";
+
+         auto shp  = new (sp) mapped_memory::segment_header();
+         shp->_age = _header->next_alloc_age.fetch_add(1, std::memory_order_relaxed);
+         return std::pair<segment_number, mapped_memory::segment_header*>(sn, shp);
+      };
+
+      // recycle a freed segment while one is available below the min read pointer.
+      // `ap < min` rather than `min - ap >= 1`: a failed compare-exchange reloads ap,
+      // which may then exceed the earlier min and make the subtraction wrap around
+      while (ap < min)
+      {
+         if (_header->alloc_ptr.compare_exchange_weak(ap, ap + 1))
+         {
+            // NOTE(review): unmasked index; compact_segment() masks the slot it writes - confirm
+            auto free_seg                = _header->free_seg_buffer[ap];
+            _header->free_seg_buffer[ap] = segment_number(-1);
+            return prepare_segment(free_seg);
+         }
+      }
+      return prepare_segment(_block_alloc.alloc());
+   }
+   /**
+    *  Flushes the not-yet-synced part of every segment with msync().
+    *
+    *  For each segment the range between its recorded _last_sync_pos and its
+    *  current _alloc_pos (clamped to segment_size) is synced, with the start
+    *  rounded down to a page boundary. _last_sync_pos is only advanced when
+    *  msync() succeeds, so a failed range is attempted again on the next call.
+    *
+    *  @param st  how to sync; sync_type::none makes this a no-op, any other
+    *             value is translated to msync flags by msync_flag().
+    */
+   void seg_allocator::sync(sync_type st)
+   {
+      if (st == sync_type::none)
+         return;
+
+      static const uint64_t page_size      = getpagesize();
+      static const uint64_t page_size_mask = ~(page_size - 1);
+
+      auto total_segs = _block_alloc.num_blocks();
+
+      for (uint32_t i = 0; i < total_segs; ++i)
+      {
+         auto seg        = get_segment(i);
+         auto last_sync  = _header->seg_meta[i]._last_sync_pos.load(std::memory_order_relaxed);
+         auto last_alloc = seg->_alloc_pos.load(std::memory_order_relaxed);
+
+         // _alloc_pos may hold a value past the end of the segment
+         // (dump() prints -1 as "END"); never sync beyond the segment.
+         if (last_alloc > segment_size)
+            last_alloc = segment_size;
+
+         // nothing new was written to this segment since the last sync
+         if (last_alloc <= last_sync)
+            continue;
+
+         // msync() needs a page aligned address, so start at the page that
+         // contains last_sync and extend the length accordingly.
+         auto sync_bytes   = last_alloc - (last_sync & page_size_mask);
+         auto seg_sync_ptr = (((intptr_t)seg + last_sync) & page_size_mask);
+
+         if (-1 == msync((char*)seg_sync_ptr, sync_bytes, msync_flag(st)))
+         {
+            const int err = errno;  // capture before the stream writes below can change it
+            std::cerr << "ps: " << getpagesize() << " len: " << sync_bytes << " rounded:  \n";
+            std::cerr << "msync errno: " << strerror(err)
+                      << " seg_alloc::sync() seg: " << i << "\n";
+            // Leave _last_sync_pos untouched: these bytes were NOT flushed and
+            // must not be recorded as synced.
+            continue;
+         }
+         _header->seg_meta[i]._last_sync_pos.store(last_alloc);
+      }
+   }
+
+   /**
+    *  Writes a human readable snapshot of the allocator to std::cerr:
+    *  a per-segment table (freed space/objects, alloc position, object
+    *  counts, age), the totals, the alloc/end indices of the free segment
+    *  queue, the per-session pointers and the segments still pending reuse.
+    */
+   void seg_allocator::dump()
+   {
+      auto& out = std::cerr;
+
+      out << "\n--- segment allocator state ---\n";
+      auto     total_segs       = _block_alloc.num_blocks();
+      int      total_retained   = 0;
+      uint64_t total_free_space = 0;
+      out << "total segments: " << total_segs << "\n";
+
+      // table header; every column is right aligned to the same width as its values
+      auto column = [&](int width, const char* title)
+      { out << std::setw(width) << title << " | "; };
+      column(6, "#");
+      column(8, "freed %");
+      column(12, "freed bytes");
+      column(12, "freed obj");
+      column(12, "alloc pos");
+      column(12, "alloced obj");
+      column(12, "num obj");
+      out << std::setw(8) << "age"
+          << " \n";
+
+      for (uint32_t seg_num = 0; seg_num < total_segs; ++seg_num)
+      {
+         auto seg   = get_segment(seg_num);
+         auto freed = _header->seg_meta[seg_num].get_free_space_and_objs();
+
+         // an alloc position of -1 is shown as END
+         std::string alloc_pos = "END";
+         if (!(seg->_alloc_pos == -1))
+            alloc_pos = std::to_string(seg->_alloc_pos);
+
+         const auto retained = seg->_num_objects - freed.second;
+
+         total_free_space += freed.first;
+         total_retained += retained;
+
+         out << std::setw(6) << seg_num << " | "
+             << std::setw(8) << int(100 * double(freed.first) / segment_size) << " | "
+             << std::setw(12) << freed.first << " | "
+             << std::setw(12) << freed.second << " | "
+             << std::setw(12) << alloc_pos << " | "
+             << std::setw(12) << seg->_num_objects << " | "
+             << std::setw(12) << retained << " | "
+             << std::setw(8) << seg->_age << " \n";
+      }
+
+      out << "total free: " << total_free_space / 1024 / 1024. << "Mb  "
+          << (100 * total_free_space / double(total_segs * segment_size)) << "%\n";
+      out << "total retained: " << total_retained << " objects\n";
+
+      out << "---- free segment Q ------\n";
+      out << "[---A---R*---E------]\n";
+      out << "A - alloc idx: " << _header->alloc_ptr.load() << "\n";
+
+      // session pointers that currently hold a value (-1 means none)
+      for (uint32_t ses = 0; ses < max_session_count; ++ses)
+      {
+         const auto pos = _session_ptrs[ses].load();
+         if (pos == -1ull)
+            continue;
+         out << "R" << ses << ": " << pos << "\n";
+      }
+
+      out << "E - end idx: " << _header->end_ptr.load() << "\n";
+
+      // a cleared bit in _free_sessions marks a session that is in use
+      const auto in_use = ~_free_sessions.load();
+      out << "active sessions: " << std::popcount(in_use) << "\n";
+      for (uint32_t ses = 0; ses < max_session_count; ++ses)
+      {
+         const bool active = (in_use & (1ull << ses)) != 0;
+         if (active && _session_ptrs[ses].load() == -1ull)
+            out << "R" << ses << ": UNLOCKED \n";
+      }
+
+      out << "------- pending free segments -----------\n";
+      for (auto pos = _header->alloc_ptr.load(); pos < _header->end_ptr.load(); ++pos)
+      {
+         const auto slot = pos & (max_segment_count - 1);
+         out << pos << "] " << _header->free_seg_buffer[slot] << "\n";
+      }
+      out << "--------------------------\n";
+   }
+};  // namespace triedent
diff --git a/libraries/triedent/test/CMakeLists.txt b/libraries/triedent/test/CMakeLists.txt
index 287207b1b..423ed44a8 100644
--- a/libraries/triedent/test/CMakeLists.txt
+++ b/libraries/triedent/test/CMakeLists.txt
@@ -17,3 +17,10 @@ add_executable(triedent-tests-bigdb big.cpp)
 target_link_libraries(triedent-tests-bigdb PUBLIC Boost::program_options triedent)
 target_include_directories(triedent-tests-bigdb PUBLIC ${Boost_INCLUDE_DIRS})
 set_target_properties(triedent-tests-bigdb PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${ROOT_BINARY_DIR})
+
+add_executable(dtester dtester.cpp)  # ad-hoc driver for the triedent database / segment allocator (see dtester.cpp)
+target_link_libraries(dtester PUBLIC Boost::program_options triedent)
+target_include_directories(dtester PUBLIC ${Boost_INCLUDE_DIRS})
+set_target_properties(dtester PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${ROOT_BINARY_DIR})
+
+
diff --git a/libraries/triedent/test/big.cpp b/libraries/triedent/test/big.cpp
index 66a848858..0d4a7c66b 100644
--- a/libraries/triedent/test/big.cpp
+++ b/libraries/triedent/test/big.cpp
@@ -46,6 +46,7 @@ int main(int argc, char** argv)
    auto                    opt = desc.add_options();
    opt("help,h", "print this message");
    opt("reset", "reset the database");
+   opt("read-only", "just query existing db");
    opt("sparce", po::value<bool>(&use_string)->default_value(false), "use sparse string keys");
    opt("data-dir", po::value<std::string>(&db_dir)->default_value("./big.dir"),
        "the folder that contains the database");
@@ -86,6 +87,10 @@ int main(int argc, char** argv)
                                                             .cool_bytes = 1ull << cool_page_c,
                                                             .cold_bytes = 1ull << cold_page_c});
    }
+   bool read_only = false;
+   if (vm.count("read-only")) {
+      read_only = true;
+   }
 
    if (num_read_threads > 64)
    {
@@ -152,7 +157,7 @@ int main(int argc, char** argv)
             while (r.load(std::memory_order_relaxed) == v)
             {
                uint64_t h = (uint64_t(gen()) << 32) | gen();
-               bool found = rs->get_less_than(rr, std::string_view((char*)&h, sizeof(h)), &found_key, &found_value, &result_roots);
+               bool found = rs->get_less_than(rr, std::string_view((char*)&h, sizeof(h)), &found_key, &found_value );
                if (found) {
                   ++total_lookups[c].total_lookups;
                }
@@ -298,32 +303,36 @@ int main(int argc, char** argv)
 
          if (i < total)
          {
-            //base.emplace( std::make_pair(k,std::string((char*)&h, sizeof(h))) );
-            if (use_string)
-            {
-               if (check_content)
-                  comparison_map[str] = str;
-               int inserted;
-               inserted = s->upsert(root, str, str);
-               if (inserted >= 0)
+            if( read_only ) {
+               usleep( 2 );
+            } else {
+               //base.emplace( std::make_pair(k,std::string((char*)&h, sizeof(h))) );
+               if (use_string)
                {
-                  // TRIEDENT_WARN("failed to insert: ", h);
-                  break;
+                  if (check_content)
+                     comparison_map[str] = str;
+                  int inserted;
+                  inserted = s->upsert(root, str, str);
+                  if (inserted >= 0)
+                  {
+                     // TRIEDENT_WARN("failed to insert: ", h);
+                     break;
+                  }
+                  assert(inserted < 0);
                }
-               assert(inserted < 0);
-            }
-            else
-            {
-               if (check_content)
-                  comparison_map[(std::string)hk] = (std::string)hk;
-               int inserted;
-               inserted = s->upsert(root, hk, hk);
-               if (inserted >= 0)
+               else
                {
-                  // TRIEDENT_WARN("failed to insert: ", h);
-                  break;
+                  if (check_content)
+                     comparison_map[(std::string)hk] = (std::string)hk;
+                  int inserted;
+                  inserted = s->upsert(root, hk, hk);
+                  if (inserted >= 0)
+                  {
+                     // TRIEDENT_WARN("failed to insert: ", h);
+                     break;
+                  }
+                  assert(inserted < 0);
                }
-               assert(inserted < 0);
             }
          }
       }
diff --git a/libraries/triedent/test/dtester.cpp b/libraries/triedent/test/dtester.cpp
new file mode 100644
index 000000000..5bf1673ea
--- /dev/null
+++ b/libraries/triedent/test/dtester.cpp
@@ -0,0 +1,150 @@
+#include <stdlib.h>
+#include <triedent/block_allocator.hpp>
+#include <triedent/id_allocator.hpp>
+#include <triedent/seg_allocator.hpp>
+#include <triedent/node.hpp>
+#include <triedent/database.hpp>
+using namespace std::chrono_literals;
+
+using namespace triedent;
+
+// Scratch driver for the triedent database and segment allocator.
+//
+// Recreates the "big.dir" database, upserts two keys through a write session,
+// reads them back and prints what was found. The function returns right after
+// that; the segment allocator exercise that follows is currently unreachable
+// and is kept only for manual experiments.
+//
+// Returns 0 on success and 1 if an exception escaped the test body.
+int main(int argc, char** argv)
+{
+   try
+   {
+      // `result` holds raw bytes and is not guaranteed to be NUL terminated (it may
+      // even be empty), so it is written with an explicit length instead of being
+      // streamed as a C string.
+      auto print_found = [](auto found, const std::vector<char>& value)
+      {
+         std::cerr << "found: " << found << " ";
+         std::cerr.write(value.data(), value.size());
+         std::cerr << "\n";
+      };
+
+      std::vector<char> result;
+      std::filesystem::remove_all("big.dir");
+      std::filesystem::create_directories("big.dir");
+      auto db = std::make_shared<database>("big.dir", read_write);
+      auto ws = db->start_write_session();
+      auto top = ws->get_top_root();
+      auto r = ws->upsert(top, "key", "val" );
+      std::cerr<< "old size: " << r <<"\n";
+      auto r2 = ws->get(top, "key", &result );
+      print_found(r2, result);
+      auto r3 = ws->upsert(top, "bottom", "dollar" );
+      auto r4 = ws->get(top, "bottom", &result );
+      print_found(r4, result);
+      return 0;
+
+      // NOTE: everything below this point is unreachable because of the return above.
+
+
+      std::filesystem::remove("data");
+      std::filesystem::remove("ids");
+      std::filesystem::remove("header");
+      triedent::seg_allocator segs(".");
+
+      std::cerr << "starting session\n";
+      auto ss = segs.start_session();
+      std::cerr << "locking data before accessing...\n";
+      {
+      auto sl = ss.lock();
+      std::cerr << "about to alloc\n";
+      // pointers only valid while sl is held
+      auto oref = sl.alloc(20, triedent::node_type::inner);
+      std::cout << "oref.id: " << oref.id().id << "\n";
+      std::cout << "oref.ref: " << oref.ref_count() << "\n";
+      std::cout << "oref.type: " << (int)oref.type() << "\n";
+      std::cout << "oref.obj->size: " << (int)oref.obj()->size << "\n";
+      std::cout << "oref.obj->cap: " << (int)oref.obj()->data_capacity() << "\n";
+      std::cout << "oref.obj->id: " << (int)oref.obj()->id << "\n";
+      std::cout << "oref.loc->seg: " << (int)oref.loc().segment() << "\n";
+      std::cout << "oref.loc->idx: " << (int)oref.loc().index() << "\n";
+      auto oref2 = sl.alloc(25, triedent::node_type::inner);
+      std::cout << "oref2.id: " << oref2.id().id << "\n";
+      std::cout << "oref2.ref: " << oref2.ref_count() << "\n";
+      std::cout << "oref2.type: " << (int)oref2.type() << "\n";
+      std::cout << "oref2.obj->size: " << (int)oref2.obj()->size << "\n";
+      std::cout << "oref2.obj->cap: " << (int)oref2.obj()->data_capacity() << "\n";
+      std::cout << "oref2.obj->id: " << (int)oref2.obj()->id << "\n";
+      std::cout << "oref2.loc->seg: " << (int)oref2.loc().segment() << "\n";
+      std::cout << "oref2.loc->idx: " << (int)oref2.loc().index() << "\n";
+
+      auto oref3 = sl.alloc(25, triedent::node_type::inner);
+      std::cout << "oref3.id: " << oref3.id().id << "\n";
+      std::cout << "oref3.ref: " << oref3.ref_count() << "\n";
+      std::cout << "oref3.type: " << (int)oref3.type() << "\n";
+      std::cout << "oref3.obj->size: " << (int)oref3.obj()->size << "\n";
+      std::cout << "oref3.obj->cap: " << (int)oref3.obj()->data_capacity() << "\n";
+      std::cout << "oref3.obj->id: " << (int)oref3.obj()->id << "\n";
+      std::cout << "oref3.loc->seg: " << (int)oref3.loc().segment() << "\n";
+      std::cout << "oref3.loc->idx: " << (int)oref3.loc().index() << "\n";
+
+      std::vector<triedent::seg_allocator::session::read_lock::object_ref<char>> objs;
+
+      for (uint32_t i = 0; i < 260; ++i)
+      {
+         auto oref3 = sl.alloc(1024 * 1024, triedent::node_type::inner);
+         objs.push_back(oref3);
+      }
+
+      segs.dump();
+
+      std::cerr << "test release\n";
+      oref3.release();
+
+      segs.dump();
+
+      std::cerr << "freeing half the objects";
+      for (uint32_t i = 0; i < objs.size() / 2; ++i)
+         objs[i * 2].release();
+      segs.dump();
+
+      std::cerr << "waiting on compact loop\n";
+   //   for (uint32_t i = 0; i < objs.size() / 2; ++i)
+//
+      std::this_thread::sleep_for(1000ms);
+
+      segs.dump();
+      }
+      std::cerr<<"after lock release\n";
+      segs.dump();
+
+      {
+      auto sl = ss.lock();
+      std::cerr<<"after lock reopened\n";
+      segs.dump();
+
+      std::vector<triedent::seg_allocator::session::read_lock::object_ref<char>> objs;
+      for (uint32_t i = 0; i < 260; ++i)
+      {
+         auto oref3 = sl.alloc(1024 * 1024, triedent::node_type::inner);
+         objs.push_back(oref3);
+      }
+      std::cerr<<"after a bunch of alloc \n";
+      segs.dump();
+      }
+
+
+      /*
+      triedent::id_allocator oa("test_file.dat");
+
+      auto ses = oa.start_session();
+
+      srand(time(nullptr));
+
+      for (uint32_t r = 0; r < 100; ++r)
+      {
+         auto     start = std::chrono::steady_clock::now();
+         uint64_t count = 1000 * 1000ull * 5;
+
+         for (uint32_t i = 0; i < count; ++i)
+         {
+            ses.get_new_id();
+         }
+         auto end = std::chrono::steady_clock::now();
+         auto delta = end - start;
+
+         std::cerr << std::setw(12)
+                   << int64_t(count /
+                              (std::chrono::duration<double, std::milli>(delta).count() / 1000))
+                   << " items/sec  free: " << oa.get_free_count() <<"  cap: " << oa.get_capacity() <<"\n";
+      }
+      */
+   }
+   catch (const std::exception& e)
+   {
+      // report the failure through the exit code as well as on stderr
+      std::cerr << "exception: " << e.what() << "\n";
+      return 1;
+   }
+   return 0;
+}