Skip to content

Commit

Permalink
[FEATURE] Unitialised resize for deserialisation
Browse files Browse the repository at this point in the history
  • Loading branch information
eseiler committed Nov 6, 2023
1 parent b6d288e commit 10ad88d
Show file tree
Hide file tree
Showing 7 changed files with 188 additions and 11 deletions.
35 changes: 35 additions & 0 deletions cmake/configuration.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ message (STATUS "Finding HIBF (${HIBF_VERSION}) and checking requirements")

include (CheckIncludeFileCXX)
include (CheckCXXSourceCompiles)
include (CheckCXXSourceRuns)
include (CheckCXXCompilerFlag)

# ----------------------------------------------------------------------------
Expand Down Expand Up @@ -161,6 +162,40 @@ if (HIBF_SUPPRESS_GCC4_ABI)
hibf_config_print ("Suppressing GCC 4 warnings: via -Wno-psabi")
endif ()

# ----------------------------------------------------------------------------
# Optional: seqan::hibf::bit_vector::resize_for_overwrite
# ----------------------------------------------------------------------------

# Probe program: derives from std::vector and resizes it through the libstdc++-internal members
# `_M_create_storage` and `_M_impl` without initialising the elements.
# The program exits with 0 only if the vector reports the requested size afterwards.
# The probe fails (feature disabled) if it does not compile, e.g., because these internals are unavailable.
set (HIBF_UNINITIALISED_RESIZE_TEST_SOURCE
     "#include <cstddef>
#include <vector>
struct my_vector : public std::vector<int>
{
    void resize_for_overwrite(size_t const size)
    {
        this->_M_create_storage(size);
        this->_M_impl._M_finish = this->_M_impl._M_end_of_storage;
    }
};
int main()
{
    my_vector vec{};
    vec.resize_for_overwrite(10u);
    return vec.size() != 10u;
}
")

# The program must compile, link, AND run successfully for the result variable to be true.
check_cxx_source_runs ("${HIBF_UNINITIALISED_RESIZE_TEST_SOURCE}" HIBF_UNINITIALISED_RESIZE_SUPPORT)

if (HIBF_UNINITIALISED_RESIZE_SUPPORT)
    hibf_config_print ("Uninitialised resize support: enabled")
    # Enables the `#ifdef HIBF_UNINITIALISED_RESIZE` code path.
    set (HIBF_DEFINITIONS ${HIBF_DEFINITIONS} "-DHIBF_UNINITIALISED_RESIZE")
else ()
    hibf_config_print ("Uninitialised resize support: disabled")
endif ()

# ----------------------------------------------------------------------------
# Optimizations
# ----------------------------------------------------------------------------
Expand Down
4 changes: 4 additions & 0 deletions include/hibf/cereal/concepts.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
#include <hibf/platform.hpp>

#include <cereal/details/helpers.hpp> // for InputArchiveBase, OutputArchiveBase
#include <cereal/details/traits.hpp> // for TextArchive

namespace seqan::hibf
{
Expand All @@ -20,4 +21,7 @@ concept cereal_input_archive = std::is_base_of_v<cereal::detail::InputArchiveBas
template <typename t>
concept cereal_archive = cereal_output_archive<t> || cereal_input_archive<t>;

/*!\brief Checks whether `t` is a cereal text archive.
 * \details Satisfied if `cereal::traits::TextArchive` is a base class of `t`.
 */
template <typename t>
concept cereal_text_archive = std::is_base_of<cereal::traits::TextArchive, t>::value;

} // namespace seqan::hibf
1 change: 1 addition & 0 deletions include/hibf/hierarchical_interleaved_bloom_filter.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#include <hibf/platform.hpp> // for HIBF_CONSTEXPR_VECTOR

#include <cereal/macros.hpp> // for CEREAL_SERIALIZE_FUNCTION_NAME
#include <cereal/types/vector.hpp>

namespace seqan::hibf
{
Expand Down
66 changes: 55 additions & 11 deletions include/hibf/misc/bit_vector.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,10 +44,9 @@
#include <hibf/contrib/aligned_allocator.hpp> // for aligned_allocator
#include <hibf/platform.hpp> // for HIBF_CONSTEXPR_VECTOR

#include <cereal/macros.hpp> // for CEREAL_SERIALIZE_FUNCTION_NAME
#include <cereal/specialize.hpp> // for specialization, specialize
#include <cereal/types/base_class.hpp> // for base_class
#include <cereal/types/vector.hpp> // IWYU pragma: keep
#include <cereal/cereal.hpp> // for binary_data
#include <cereal/macros.hpp> // for CEREAL_LOAD_FUNCTION_NAME, CEREAL_SAVE_FUNCTION_NAME
#include <cereal/specialize.hpp> // for specialization, specialize

namespace seqan::hibf
{
Expand Down Expand Up @@ -876,13 +875,59 @@ class bit_vector :
* \attention These functions are never called directly.
*/
template <cereal_archive archive_t>
void CEREAL_SERIALIZE_FUNCTION_NAME(archive_t & archive)
void CEREAL_LOAD_FUNCTION_NAME(archive_t & archive)
{
archive(cereal::base_class<base_t>(this), _size);
// Not using `cereal::make_size_tag(_size)`, because the size tag is inferred for text (XML/JSON) archives.
// For text archives, `cereal::make_size_tag(_size)` would be the number of elements serialised in the for-loop.
// E.g., `_size == 100` would store `2` (`== host_size_impl(_size)`).
archive(_size);
// Number of elements of the underlying vector that are needed to hold `_size` bits.
size_t const vector_size = host_size_impl(_size);

// The elements do not need to be initialised here: each of them is overwritten by the archive reads below.
resize_for_overwrite(vector_size);

if constexpr (cereal_text_archive<archive_t>)
{
// Text archives: read the elements one by one.
for (auto && v : *as_base())
archive(v);
}
else
{
// All other archives: read all elements at once as a block of `vector_size * sizeof(chunk_type)` raw bytes.
archive(cereal::binary_data(data(), vector_size * sizeof(chunk_type)));
}
}

//!\copydoc load
template <cereal_archive archive_t>
void CEREAL_SAVE_FUNCTION_NAME(archive_t & archive) const
{
    // `_size` (the number of bits) is written as a plain value instead of via `cereal::make_size_tag(_size)`:
    // Text (XML/JSON) archives infer the size tag, i.e., it would be the number of elements written by the
    // for-loop below. E.g., `_size == 100` would store `2` (`== host_size_impl(_size)`).
    archive(_size);

    if constexpr (!cereal_text_archive<archive_t>)
    {
        // Non-text archives: write all elements of the underlying vector at once as raw bytes.
        size_t const size_in_bytes = base_t::size() * sizeof(chunk_type);
        archive(cereal::binary_data(data(), size_in_bytes));
    }
    else
    {
        // Text archives: write the elements of the underlying vector one by one.
        for (auto && chunk : *as_base())
            archive(chunk);
    }
}
//!\endcond

private:
/*!\brief Resizes the underlying vector to `size` elements; the caller is expected to overwrite all of them.
 * \param[in] size The new number of elements of the underlying vector.
 * \details
 * If `HIBF_UNINITIALISED_RESIZE` is defined, new storage is obtained via libstdc++-internal members and the
 * elements are left uninitialised. Any previously held storage is released first.
 * Otherwise, this is equivalent to `base_t::resize(size)`.
 */
HIBF_CONSTEXPR_VECTOR inline void resize_for_overwrite(size_t const size)
{
#ifdef HIBF_UNINITIALISED_RESIZE
    // `_M_create_storage` overwrites the storage pointers without freeing them. Release the current storage
    // first; otherwise, it would be leaked whenever this bit_vector is not empty.
    // No element destructors are run — assumes `chunk_type` is trivially destructible (TODO confirm).
    this->_M_deallocate(this->_M_impl._M_start, base_t::capacity());
    // Reset the pointers such that the vector is in a valid (empty) state if the allocation below throws.
    this->_M_impl._M_start = nullptr;
    this->_M_impl._M_finish = nullptr;
    this->_M_impl._M_end_of_storage = nullptr;

    this->_M_create_storage(size);
    // Mark the complete storage as used, i.e., `base_t::size() == size`, without initialising the elements.
    this->_M_impl._M_finish = this->_M_impl._M_end_of_storage;
#else
    base_t::resize(size);
#endif
}

//!\brief Performs the binary bitwise-operation on the underlying chunks.
template <typename binary_operator_t>
constexpr bit_vector & binary_transform_impl(bit_vector const & rhs, binary_operator_t && op) noexcept
Expand Down Expand Up @@ -952,16 +997,15 @@ class bit_vector :
//!\cond
// See https://uscilab.github.io/cereal/serialization_functions.html#inheritance
// seqan::hibf::bit_vector's base class is std::vector
// We include <cereal/types/vector.hpp> for std::vector serialisation
// cereal provides these as separate load/save functions
// bit_vector inherits those and also provides a serialise function
// Since both load/save member functions (from std::vector) and a serialise function (bit_vector) are available,
// If we include <cereal/types/vector.hpp> for std::vector serialisation (e.g., HIBF),
// cereal provides these as non-member load/save functions.
// Since both load/save non-member functions (std::vector) and load/save member functions (bit_vector) are available,
// cereal needs to be told which one to use.
namespace cereal
{

// Disambiguation for cereal: selects, for every archive type, which kind of serialisation function
// is used for seqan::hibf::bit_vector.
template <typename archive_t>
struct specialize<archive_t, seqan::hibf::bit_vector, cereal::specialization::member_serialize>
struct specialize<archive_t, seqan::hibf::bit_vector, cereal::specialization::member_load_save>
{};

} // namespace cereal
Expand Down
3 changes: 3 additions & 0 deletions test/documentation/hibf_doxygen_cfg.in
Original file line number Diff line number Diff line change
Expand Up @@ -340,6 +340,9 @@ SEARCH_INCLUDES = YES
INCLUDE_PATH = ${HIBF_DOXYGEN_SOURCE_DIR}/include
INCLUDE_FILE_PATTERNS =
PREDEFINED = "HIBF_DOXYGEN_ONLY(x)= x" \
CEREAL_SERIALIZE_FUNCTION_NAME=serialize \
CEREAL_LOAD_FUNCTION_NAME=load \
CEREAL_SAVE_FUNCTION_NAME=save \
${HIBF_DOXYGEN_PREDEFINED_NDEBUG} \
HIBF_CONSTEXPR_VECTOR=constexpr
EXPAND_AS_DEFINED =
Expand Down
1 change: 1 addition & 0 deletions test/performance/ibf/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,5 @@
# SPDX-License-Identifier: BSD-3-Clause

hibf_benchmark (bit_vector_benchmark.cpp)
hibf_benchmark (bit_vector_serialisation_benchmark.cpp)
hibf_benchmark (interleaved_bloom_filter_benchmark.cpp)
89 changes: 89 additions & 0 deletions test/performance/ibf/bit_vector_serialisation_benchmark.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
// SPDX-FileCopyrightText: 2006-2023, Knut Reinert & Freie Universität Berlin
// SPDX-FileCopyrightText: 2016-2023, Knut Reinert & MPI für molekulare Genetik
// SPDX-License-Identifier: BSD-3-Clause

#include <benchmark/benchmark.h> // for Benchmark, State, BENCHMARK_CAPTURE, DoNotOptimize, BENCHMARK_MAIN

#include <algorithm> // for __generate_fn, generate
#include <cinttypes> // for int32_t, uint8_t
#include <cstddef> // for size_t
#include <filesystem>
#include <fstream>
#include <memory> // for allocator
#include <random> // for uniform_int_distribution, mt19937_64
#include <type_traits> // for invoke_result_t
#include <utility> // for move, pair

#include <hibf/misc/bit_vector.hpp> // for bit_vector
#include <hibf/test/sandboxed_path.hpp> // for sandboxed_path, operator/
#include <hibf/test/tmp_directory.hpp> // for tmp_directory

#include <cereal/archives/binary.hpp> // for BinaryInputArchive, BinaryOutputArchive

/*!\brief Creates a bit_vector of `size_in_bits` bits and fills it with pseudo-random values.
 * \param[in] size_in_bits The number of bits of the returned bit_vector.
 * \param[in] seed The seed for the `std::mt19937_64` engine.
 * \returns The randomly filled bit_vector.
 */
seqan::hibf::bit_vector generate_bit_vector(size_t const size_in_bits, size_t const seed = 0u)
{
    std::mt19937_64 engine{seed};
    // `uint8_t` is not a permitted `IntType` for `std::uniform_int_distribution`; using it is undefined
    // behaviour. `uint16_t` is the smallest permitted unsigned type.
    std::uniform_int_distribution<uint16_t> dist{0u, 1u};

    // Each call yields one random bit.
    auto gen = [&dist, &engine]() -> bool
    {
        return dist(engine) != 0u;
    };
    seqan::hibf::bit_vector vec(size_in_bits);
    std::ranges::generate(vec, gen);

    return vec;
}

static seqan::hibf::test::tmp_directory tmp{};

/*!\brief Benchmarks loading a seqan::hibf::bit_vector from a binary cereal archive stored in a file.
 * \param[in,out] state The benchmark state; `state.range(0)` is the exponent `x` of the size (`2^x` bits).
 * \throws std::logic_error if the size of the file on disk does not match the expected size.
 */
void load_bit_vector(benchmark::State & state)
{
    size_t const size_in_bits = 1ULL << state.range(0);
    auto const filename = tmp.path() / std::to_string(state.range(0));

    // The input file is only written if it does not exist yet.
    if (!std::filesystem::exists(filename))
    {
        seqan::hibf::bit_vector const random_bits = generate_bit_vector(size_in_bits);

        std::ofstream output_stream{filename, std::ios::binary};
        cereal::BinaryOutputArchive oarchive{output_stream};
        oarchive(random_bits);
    }

    // Subtract 8 bytes for the serialised `_size` member.
    size_t const filesize_in_bytes = std::filesystem::file_size(filename) - 8u;
    size_t const expected_size_in_bytes = size_in_bits / 8u;

    if (filesize_in_bytes != expected_size_in_bytes)
        throw std::logic_error{"Actual and expected file size differ."};

    for (auto _ : state)
    {
        seqan::hibf::bit_vector loaded{};
        {
            // Opening the file and constructing the archive are part of the measurement.
            std::ifstream input_stream{filename, std::ios::binary};
            cereal::BinaryInputArchive iarchive{input_stream};
            iarchive(loaded);
        }
        benchmark::DoNotOptimize(loaded);
    }

    auto const bytes_processed = static_cast<int64_t>(state.iterations()) * filesize_in_bytes;
    state.SetBytesProcessed(bytes_processed);

    // Use `filesize_in_bytes` (without the `_size` member) to make benchmark show `1Mi` instead of `1.00001Mi`.
    using counter_t = benchmark::Counter;
    state.counters["filesize"] = counter_t(filesize_in_bytes, counter_t::kDefaults, counter_t::OneK::kIs1024);
}

// Benchmark ranges are `int32_t`, i.e., too small to represent 4 GiB or more.
// Hence, we use `2^x`: the range value `x` is the exponent of the size in bits.
// 13 -> 1KiB
// 23 -> 1MiB
// 33 -> 1GiB
// We use a small number here for the unit tests.
// Sizes of 10s of GiB would be more interesting for actual benchmarking.
static constexpr int32_t min_range = 13;
static constexpr int32_t max_range = 13;

BENCHMARK(load_bit_vector)->Range(min_range, max_range);

BENCHMARK_MAIN();

0 comments on commit 10ad88d

Please sign in to comment.