From 9f2436fb2488d1c1ef28f60ceaeec2c4f4543883 Mon Sep 17 00:00:00 2001 From: canepat <16927169+canepat@users.noreply.github.com> Date: Tue, 23 Jan 2024 13:30:13 +0100 Subject: [PATCH] capi: block execution commits after state batch has been flushed (#1621) * api: use bytes instead of gas as batch size unit node: fix batch size computation in db buffer node: avoid caching read values in db buffer * api: restore usage of gas as batch size unit * api: improve logging in block execution * make fmt * make format uniform in logs * api: commit after any state batch has been flushed node: change RWTxnUnmanaged to allow commit and abort * fix comment * api: flush state history after each block execution api: change batch flush policy from gas to size node: fix empty element in call traces * node: log MDBX commit latency * node: update state batch size for initial=current=empty account node: fix state batch size computation for account code and storage node: sort storage locations to insert ordered data into db node: fix unmanaged r/w txn destruction add unit tests * add unit tests for batch size * delete check already moved within db::Buffer::update_account * add check also to InMemoryState::update_account * fix some warnings and naming --------- Co-authored-by: GitHub --- silkworm/capi/silkworm.cpp | 74 ++-- silkworm/core/common/util.cpp | 6 +- silkworm/core/common/util.hpp | 2 +- silkworm/core/state/in_memory_state.cpp | 4 + silkworm/core/state/intra_block_state.cpp | 4 - silkworm/node/db/buffer.cpp | 85 +++-- silkworm/node/db/buffer.hpp | 8 +- silkworm/node/db/buffer_test.cpp | 40 +- silkworm/node/db/mdbx.cpp | 82 ++++- silkworm/node/db/mdbx.hpp | 19 +- silkworm/node/db/mdbx_test.cpp | 428 +++++++++++++++++++++- 11 files changed, 657 insertions(+), 95 deletions(-) diff --git a/silkworm/capi/silkworm.cpp b/silkworm/capi/silkworm.cpp index ab70219059..f4f492da6d 100644 --- a/silkworm/capi/silkworm.cpp +++ b/silkworm/capi/silkworm.cpp @@ -28,7 +28,9 @@ #include #include #include +#include #include +#include #include #include #include @@ -49,6 +51,7 @@ static MemoryMappedRegion make_region(const SilkwormMemoryMappedFile& mmf) { static log::Settings kLogSettingsLikeErigon{ .log_utc = false, // display local time .log_timezone = false, // no timezone ID + .log_nocolor = true, // do not use colors .log_trim = true, // compact rendering (i.e. no whitespaces) }; @@ -62,7 +65,6 @@ struct ExecutionProgress { size_t processed_transactions{0}; size_t processed_gas{0}; float gas_state_perc{0.0}; - float gas_history_perc{0.0}; }; //! Generate log arguments for Silkworm library version @@ -74,8 +76,27 @@ static log::Args log_args_for_version() { "git_tag", std::string(build_info->project_version), "git_commit", - std::string(build_info->git_commit_hash), - }; + std::string(build_info->git_commit_hash)}; +} + +//! Generate log arguments for execution flush at specified block +static log::Args log_args_for_exec_flush(const db::Buffer& state_buffer, uint64_t max_batch_size, uint64_t current_block) { + return { + "batch", + std::to_string(state_buffer.current_batch_state_size()), + "max_batch", + std::to_string(max_batch_size), + "block", + std::to_string(current_block)}; +} + +//! Generate log arguments for execution commit at specified block +static log::Args log_args_for_exec_commit(StopWatch::Duration elapsed, const std::filesystem::path& db_path) { + return { + "in", + StopWatch::format(elapsed), + "chaindata", + std::to_string(Directory{db_path}.size())}; } static std::filesystem::path make_path(const char data_dir_path[SILKWORM_PATH_SIZE]) { @@ -117,10 +138,7 @@ static log::Args log_args_for_exec_progress(ExecutionProgress& progress, uint64_ "Mgas/s", float_to_string(speed_mgas), "gasState", - float_to_string(progress.gas_state_perc), - "gasHistory", - float_to_string(progress.gas_history_perc), - }; + float_to_string(progress.gas_state_perc)}; } //! A signal handler guard using RAII pattern to acquire/release signal handling @@ -373,6 +391,7 @@ int silkworm_execute_blocks(SilkwormHandle handle, MDBX_txn* mdbx_txn, uint64_t try { // Wrap MDBX txn into an internal *unmanaged* txn, i.e. MDBX txn is only used but neither aborted nor committed db::RWTxnUnmanaged txn{mdbx_txn}; + const auto db_path{txn.db().get_path()}; db::Buffer state_buffer{txn, /*prune_history_threshold=*/0}; db::DataModel access_layer{txn}; @@ -381,9 +400,10 @@ int silkworm_execute_blocks(SilkwormHandle handle, MDBX_txn* mdbx_txn, uint64_t AnalysisCache analysis_cache{kCacheSize}; ObjectPool state_pool; - // Transform batch size limit into gas units (Ggas = Giga gas, Tgas = Tera gas) - const size_t gas_max_history_size{batch_size * 1_Kibi / 2}; // 512MB -> 256Ggas roughly - const size_t gas_max_batch_size{gas_max_history_size * 20}; // 256Ggas -> 5Tgas roughly + const size_t max_batch_size{batch_size}; + + // Transform batch size limit into gas units (Ggas = Giga gas) + const size_t gas_max_batch_size{batch_size * 2_Kibi}; // 256MB -> 512Ggas roughly // Preload requested blocks in batches from storage, i.e. from MDBX database or snapshots static constexpr size_t kMaxPrefetchedBlocks{10240}; @@ -393,7 +413,7 @@ int silkworm_execute_blocks(SilkwormHandle handle, MDBX_txn* mdbx_txn, uint64_t auto signal_check_time{progress.start_time}; auto log_time{progress.start_time}; - size_t gas_batch_size{0}, gas_history_size{0}; + size_t gas_batch_size{0}; for (BlockNum block_number{start_block}; block_number <= max_block; ++block_number) { if (prefetched_blocks.empty()) { const auto num_blocks{std::min(size_t(max_block - block_number + 1), kMaxPrefetchedBlocks)}; @@ -443,19 +463,23 @@ int silkworm_execute_blocks(SilkwormHandle handle, MDBX_txn* mdbx_txn, uint64_t progress.processed_transactions += block.transactions.size(); progress.processed_gas += block.header.gas_used; gas_batch_size += block.header.gas_used; - gas_history_size += block.header.gas_used; prefetched_blocks.pop_front(); - // Flush whole state buffer or just history if we've reached the target batch sizes in gas units - if (gas_batch_size >= gas_max_batch_size) { - SILK_TRACE << log::Args{"buffer", "state", "size", human_size(state_buffer.current_batch_state_size())}; - state_buffer.write_to_db(write_change_sets); + // Always flush history for single processed block (no batching) + state_buffer.write_history_to_db(write_change_sets); + + // Flush state buffer if we've reached the target batch size + if (state_buffer.current_batch_state_size() >= max_batch_size) { + log::Info{"[4/12 Execution] Flushing state", // NOLINT(*-unused-raii) + log_args_for_exec_flush(state_buffer, max_batch_size, block.header.number)}; + state_buffer.write_state_to_db(); gas_batch_size = 0; - } else if (gas_history_size >= gas_max_history_size) { - SILK_TRACE << log::Args{"buffer", "history", "size", human_size(state_buffer.current_batch_history_size())}; - state_buffer.write_history_to_db(write_change_sets); - gas_history_size = 0; + StopWatch sw{/*auto_start=*/true}; + txn.commit_and_renew(); + const auto [elapsed, _]{sw.stop()}; + log::Info("[4/12 Execution] Commit state+history", // NOLINT(*-unused-raii) + log_args_for_exec_commit(sw.since_start(elapsed), db_path)); } const auto now{std::chrono::steady_clock::now()}; @@ -467,7 +491,6 @@ int silkworm_execute_blocks(SilkwormHandle handle, MDBX_txn* mdbx_txn, uint64_t } if (log_time <= now) { progress.gas_state_perc = float(gas_batch_size) / float(gas_max_batch_size); - progress.gas_history_perc = float(gas_history_size) / float(gas_max_history_size); progress.end_time = now; log::Info{"[4/12 Execution] Executed blocks", // NOLINT(*-unused-raii) log_args_for_exec_progress(progress, block.header.number)}; @@ -475,7 +498,14 @@ int silkworm_execute_blocks(SilkwormHandle handle, MDBX_txn* mdbx_txn, uint64_t } } - state_buffer.write_to_db(write_change_sets); + log::Info{"[4/12 Execution] Flushing state", // NOLINT(*-unused-raii) + log_args_for_exec_flush(state_buffer, max_batch_size, max_block)}; + state_buffer.write_state_to_db(); + StopWatch sw{/*auto_start=*/true}; + txn.commit_and_renew(); + const auto [elapsed, _]{sw.stop()}; + log::Info("[4/12 Execution] Commit state+history", // NOLINT(*-unused-raii) + log_args_for_exec_commit(sw.since_start(elapsed), db_path)); return SILKWORM_OK; } catch (const mdbx::exception& e) { if (mdbx_error_code) { diff --git a/silkworm/core/common/util.cpp b/silkworm/core/common/util.cpp index 6641a3bed1..9d7bafae66 100644 --- a/silkworm/core/common/util.cpp +++ b/silkworm/core/common/util.cpp @@ -200,8 +200,8 @@ std::optional parse_size(const std::string& sizestr) { return number; } -std::string human_size(uint64_t bytes) { - static const char* suffix[]{"B", "KB", "MB", "GB", "TB"}; +std::string human_size(uint64_t bytes, const char* unit) { + static const char* suffix[]{"", "K", "M", "G", "T"}; static const uint32_t items{sizeof(suffix) / sizeof(suffix[0])}; uint32_t index{0}; double value{static_cast(bytes)}; @@ -213,7 +213,7 @@ std::string human_size(uint64_t bytes) { } static constexpr size_t kBufferSize{64}; SILKWORM_THREAD_LOCAL char output[kBufferSize]; - SILKWORM_ASSERT(std::snprintf(output, kBufferSize, "%.02lf %s", value, suffix[index]) > 0); + SILKWORM_ASSERT(std::snprintf(output, kBufferSize, "%.02lf %s%s", value, suffix[index], unit) > 0); return output; } diff --git a/silkworm/core/common/util.hpp b/silkworm/core/common/util.hpp index 4c8ae2f6f3..2bbfc9f5bf 100644 --- a/silkworm/core/common/util.hpp +++ b/silkworm/core/common/util.hpp @@ -86,7 +86,7 @@ std::optional from_hex(std::string_view hex) noexcept; std::optional parse_size(const std::string& sizestr); // Converts a number of bytes in a human-readable format -std::string human_size(uint64_t bytes); +std::string human_size(uint64_t bytes, const char* unit = "B"); // Compares two strings for equality with case insensitivity bool iequals(std::string_view a, std::string_view b); diff --git a/silkworm/core/state/in_memory_state.cpp b/silkworm/core/state/in_memory_state.cpp index ffa6c628fb..dd291b1124 100644 --- a/silkworm/core/state/in_memory_state.cpp +++ b/silkworm/core/state/in_memory_state.cpp @@ -150,6 +150,10 @@ void InMemoryState::begin_block(BlockNum block_number) { void InMemoryState::update_account(const evmc::address& address, std::optional initial, std::optional current) { + // Skip update if both initial and final state are non-existent (i.e. contract creation+destruction within the same block) + if (!initial && !current) { + return; + } account_changes_[block_number_][address] = initial; if (current.has_value()) { diff --git a/silkworm/core/state/intra_block_state.cpp b/silkworm/core/state/intra_block_state.cpp index f9336bac74..4f2aa1e2f9 100644 --- a/silkworm/core/state/intra_block_state.cpp +++ b/silkworm/core/state/intra_block_state.cpp @@ -341,10 +341,6 @@ void IntraBlockState::write_to_db(uint64_t block_number) { } for (const auto& [address, obj] : objects_) { - // Skip update if both initial and final state are empty (i.e. contract creation+destruction within the same block) - if (!obj.initial && !obj.current) { - continue; - } db_.update_account(address, obj.initial, obj.current); if (!obj.current) { continue; diff --git a/silkworm/node/db/buffer.cpp b/silkworm/node/db/buffer.cpp index 810fcefb93..ae5eab6c02 100644 --- a/silkworm/node/db/buffer.cpp +++ b/silkworm/node/db/buffer.cpp @@ -38,6 +38,13 @@ void Buffer::begin_block(uint64_t block_number) { void Buffer::update_account(const evmc::address& address, std::optional initial, std::optional current) { + // Skip update if both initial and final state are non-existent (i.e. contract creation+destruction within the same block) + if (!initial && !current) { + // Only to perfectly match Erigon state batch size (Erigon does count any account w/ old=new=empty value). + batch_state_size_ += kAddressLength; + return; + } + const bool equal{current == initial}; const bool account_deleted{!current.has_value()}; @@ -66,11 +73,11 @@ void Buffer::update_account(const evmc::address& address, std::optional } auto it{accounts_.find(address)}; if (it != accounts_.end()) { - batch_state_size_ -= it->second.has_value() ? sizeof(Account) : 0; - batch_state_size_ += (current ? sizeof(Account) : 0); + batch_state_size_ -= it->second.has_value() ? it->second->encoding_length_for_storage() : 0; + batch_state_size_ += (current ? current->encoding_length_for_storage() : 0); it->second = current; } else { - batch_state_size_ += kAddressLength + (current ? sizeof(Account) : 0); + batch_state_size_ += kAddressLength + (current ? current->encoding_length_for_storage() : 0); accounts_[address] = current; } @@ -85,10 +92,12 @@ void Buffer::update_account(const evmc::address& address, std::optional void Buffer::update_account_code(const evmc::address& address, uint64_t incarnation, const evmc::bytes32& code_hash, ByteView code) { - // Don't overwrite already existing code so that views of it - // that were previously returned by read_code() are still valid. - if (hash_to_code_.try_emplace(code_hash, code).second) { + // Don't overwrite existing code so that views of it that were previously returned by read_code are still valid + const auto [inserted_or_existing_it, inserted] = hash_to_code_.try_emplace(code_hash, code); + if (inserted) { batch_state_size_ += kHashLength + code.length(); + } else { + batch_state_size_ += code.length() - inserted_or_existing_it->second.length(); } if (storage_prefix_to_code_hash_.insert_or_assign(storage_prefix(address, incarnation), code_hash).second) { @@ -111,8 +120,14 @@ void Buffer::update_storage(const evmc::address& address, uint64_t incarnation, } } - if (storage_[address][incarnation].insert_or_assign(location, current).second) { - batch_state_size_ += kPlainStoragePrefixLength + kHashLength + kHashLength; + // Iterator in insert_or_assign return value "is pointing at the element that was inserted or updated" + // so we cannot use it to determine the old value size: we need to use initial instead + const auto [_, inserted] = storage_[address][incarnation].insert_or_assign(location, current); + ByteView current_val{zeroless_view(current.bytes)}; + if (inserted) { + batch_state_size_ += kPlainStoragePrefixLength + kHashLength + current_val.length(); + } else { + batch_state_size_ += current_val.length() - zeroless_view(initial.bytes).length(); } } @@ -242,8 +257,8 @@ void Buffer::write_history_to_db(bool write_change_sets) { batch_history_size_ = 0; auto [finish_time, _]{sw.stop()}; - log::Info("Flushed history", - {"size", human_size(total_written_size), "in", StopWatch::format(sw.since_start(finish_time))}); + log::Trace("Flushed history", + {"size", human_size(total_written_size), "in", StopWatch::format(sw.since_start(finish_time))}); } void Buffer::write_state_to_db() { @@ -337,9 +352,17 @@ void Buffer::write_state_to_db() { if (auto it{storage_.find(address)}; it != storage_.end()) { for (const auto& [incarnation, contract_storage] : it->second) { Bytes prefix{storage_prefix(address, incarnation)}; - for (const auto& [location, value] : contract_storage) { - upsert_storage_value(*state_table, prefix, location.bytes, value.bytes); - written_size += prefix.length() + kLocationLength + kHashLength; + // Extract sorted set of storage locations to insert ordered data into the DB + absl::btree_set storage_locations; + for (auto& storage_entry : contract_storage) { + storage_locations.insert(storage_entry.first); + } + for (const auto& location : storage_locations) { + if (auto storage_it{contract_storage.find(location)}; storage_it != contract_storage.end()) { + const auto& value{storage_it->second}; + upsert_storage_value(*state_table, prefix, location.bytes, value.bytes); + written_size += prefix.length() + kLocationLength + zeroless_view(value.bytes).size(); + } } } storage_.erase(it); @@ -397,23 +420,24 @@ void Buffer::insert_call_traces(BlockNum block_number, const CallTraces& traces) touched_accounts.insert(recipient); } - if (not touched_accounts.empty()) { + if (!touched_accounts.empty()) { batch_history_size_ += sizeof(BlockNum); - } - absl::btree_set values; - for (const auto& account : touched_accounts) { - Bytes value(kAddressLength + 1, '\0'); - std::memcpy(value.data(), account.bytes, kAddressLength); - if (traces.senders.contains(account)) { - value[kAddressLength] |= 1; - } - if (traces.recipients.contains(account)) { - value[kAddressLength] |= 2; + + absl::btree_set values; + for (const auto& account : touched_accounts) { + Bytes value(kAddressLength + 1, '\0'); + std::memcpy(value.data(), account.bytes, kAddressLength); + if (traces.senders.contains(account)) { + value[kAddressLength] |= 1; + } + if (traces.recipients.contains(account)) { + value[kAddressLength] |= 2; + } + batch_history_size_ += value.size(); + values.insert(std::move(value)); } - batch_history_size_ += value.size(); - values.insert(std::move(value)); + call_traces_.emplace(block_number, values); } - call_traces_.emplace(block_number, values); } evmc::bytes32 Buffer::state_root_hash() const { @@ -482,8 +506,6 @@ std::optional Buffer::read_account(const evmc::address& address) const return it->second; } auto db_account{db::read_account(txn_, address, historical_block_)}; - accounts_[address] = db_account; - batch_state_size_ += kAddressLength + db_account.value_or(Account()).encoding_length_for_storage(); return db_account; } @@ -501,19 +523,14 @@ ByteView Buffer::read_code(const evmc::bytes32& code_hash) const noexcept { evmc::bytes32 Buffer::read_storage(const evmc::address& address, uint64_t incarnation, const evmc::bytes32& location) const noexcept { - size_t payload_length{kAddressLength + kIncarnationLength + kLocationLength + kHashLength}; if (auto it1{storage_.find(address)}; it1 != storage_.end()) { - payload_length -= kAddressLength; if (auto it2{it1->second.find(incarnation)}; it2 != it1->second.end()) { - payload_length -= kIncarnationLength; if (auto it3{it2->second.find(location)}; it3 != it2->second.end()) { return it3->second; } } } auto db_storage{db::read_storage(txn_, address, incarnation, location, historical_block_)}; - storage_[address][incarnation][location] = db_storage; - batch_state_size_ += payload_length; return db_storage; } diff --git a/silkworm/node/db/buffer.hpp b/silkworm/node/db/buffer.hpp index c8ac3586cc..e51d914f81 100644 --- a/silkworm/node/db/buffer.hpp +++ b/silkworm/node/db/buffer.hpp @@ -130,10 +130,10 @@ class Buffer : public State { //! @param write_change_sets flag indicating if state changes should be written or not (default: true) void write_history_to_db(bool write_change_sets = true); - private: //! \brief Persists *state* accrued contents into db void write_state_to_db(); + private: RWTxn& txn_; db::DataModel access_layer_; uint64_t prune_history_threshold_; @@ -148,9 +148,9 @@ class Buffer : public State { mutable absl::flat_hash_map> accounts_; // address -> incarnation -> location -> value - mutable absl::flat_hash_map>> - storage_; + using Storage = absl::flat_hash_map; + using StorageByIncarnation = absl::btree_map; + mutable absl::flat_hash_map storage_; absl::btree_map incarnations_; absl::btree_map hash_to_code_; diff --git a/silkworm/node/db/buffer_test.cpp b/silkworm/node/db/buffer_test.cpp index ddd3961c70..8fb90b83eb 100644 --- a/silkworm/node/db/buffer_test.cpp +++ b/silkworm/node/db/buffer_test.cpp @@ -109,7 +109,11 @@ TEST_CASE("Account update") { Buffer buffer{txn, 0}; buffer.begin_block(1); buffer.update_account(address, /*initial=*/std::nullopt, current_account); - REQUIRE(buffer.account_changes().empty() == false); + REQUIRE(!buffer.account_changes().empty()); + // Current state batch: current account address + current account encoding + CHECK(buffer.current_batch_state_size() == kAddressLength + current_account.encoding_length_for_storage()); + // State history batch: current block + initial account address + initial account encoding (empty) + CHECK(buffer.current_batch_history_size() == sizeof(uint64_t) + kAddressLength); REQUIRE_NOTHROW(buffer.write_to_db()); auto account_changeset{db::open_cursor(txn, table::kAccountChangeSet)}; @@ -140,7 +144,11 @@ TEST_CASE("Account update") { Buffer buffer{txn, 0}; buffer.begin_block(1); buffer.update_account(address, /*initial=*/initial_account, current_account); - REQUIRE(buffer.account_changes().empty() == false); + REQUIRE(!buffer.account_changes().empty()); + // Current state batch: current account address + current account encoding + CHECK(buffer.current_batch_state_size() == kAddressLength + current_account.encoding_length_for_storage()); + // State history batch: current block + initial account address + initial account encoding + CHECK(buffer.current_batch_history_size() == sizeof(uint64_t) + kAddressLength + initial_account.encoding_length_for_storage()); REQUIRE_NOTHROW(buffer.write_to_db()); auto account_changeset{db::open_cursor(txn, table::kAccountChangeSet)}; @@ -170,7 +178,11 @@ TEST_CASE("Account update") { Buffer buffer{txn, 0}; buffer.begin_block(1); buffer.update_account(address, /*initial=*/account, /*current=*/std::nullopt); - REQUIRE(buffer.account_changes().empty() == false); + REQUIRE(!buffer.account_changes().empty()); + // Current state batch: initial account for delete + (initial account + incarnation) for incarnation + CHECK(buffer.current_batch_state_size() == kAddressLength + (kAddressLength + kIncarnationLength)); + // State history batch: current block + initial account address + initial account encoding + CHECK(buffer.current_batch_history_size() == sizeof(uint64_t) + kAddressLength + account.encoding_length_for_storage()); REQUIRE_NOTHROW(buffer.write_to_db()); auto incarnations{db::open_cursor(txn, table::kIncarnationMap)}; @@ -207,6 +219,28 @@ TEST_CASE("Account update") { CHECK(std::memcmp(data.key.data(), address.bytes, kAddressLength) == 0); CHECK(endian::load_big_u64(db::from_slice(data.value).data()) == account.incarnation); } + + SECTION("Change EOA account w/ new value equal to old one") { + const auto address{0xbe00000000000000000000000000000000000000_address}; + Account initial_account; + initial_account.nonce = 2; + initial_account.balance = kEther; + + Account current_account; + current_account.nonce = 2; + current_account.balance = kEther; + + Buffer buffer{txn, 0}; + buffer.begin_block(1); + buffer.update_account(address, /*initial=*/initial_account, current_account); + REQUIRE(buffer.account_changes().empty()); + CHECK(buffer.current_batch_state_size() == 0); // No change in current state batch + CHECK(buffer.current_batch_history_size() == 0); // No change in state history batch + REQUIRE_NOTHROW(buffer.write_to_db()); + + auto account_changeset{db::open_cursor(txn, table::kAccountChangeSet)}; + REQUIRE(txn->get_map_stat(account_changeset.map()).ms_entries == 0); + } } } // namespace silkworm::db diff --git a/silkworm/node/db/mdbx.cpp b/silkworm/node/db/mdbx.cpp index 512fd35386..fd7471e28e 100644 --- a/silkworm/node/db/mdbx.cpp +++ b/silkworm/node/db/mdbx.cpp @@ -18,6 +18,7 @@ #include +#include #include namespace silkworm::db { @@ -32,6 +33,21 @@ namespace detail { dump.append(std::to_string(bool(result.value))); return dump; } + + log::Args log_args_for_commit_latency(const MDBX_commit_latency& commit_latency) { + return { + "preparation", + std::to_string(commit_latency.preparation), + "write", + std::to_string(commit_latency.write), + "sync", + std::to_string(commit_latency.sync), + "ending", + std::to_string(commit_latency.ending), + "whole", + std::to_string(commit_latency.whole), + }; + } } // namespace detail //! \brief Returns data of current cursor position or moves it to the beginning or the end of the table based on @@ -85,18 +101,18 @@ ::mdbx::env_managed open_env(EnvConfig& config) { } fs::path db_file{db::get_datafile_path(db_path)}; - size_t db_ondisk_file_size{fs::exists(db_file) ? fs::file_size(db_file) : 0}; + const size_t db_file_size{fs::exists(db_file) ? fs::file_size(db_file) : 0}; - if (!config.create && !db_ondisk_file_size) { + if (!config.create && !db_file_size) { throw std::runtime_error("Unable to locate " + db_file.string() + ", which is required to exist"); - } else if (config.create && db_ondisk_file_size) { + } else if (config.create && db_file_size) { throw std::runtime_error("File " + db_file.string() + " already exists but create was set"); } // Prevent mapping a file with a smaller map size than the size on disk. // Opening would not fail but only a part of data would be mapped. - if (db_ondisk_file_size > config.max_size) { - throw std::runtime_error("Database map size is too small. Min required " + human_size(db_ondisk_file_size)); + if (db_file_size > config.max_size) { + throw std::runtime_error("Database map size is too small. Min required " + human_size(db_file_size)); } uint32_t flags{MDBX_NOTLS | MDBX_NORDAHEAD | MDBX_COALESCE | MDBX_SYNC_DURABLE}; // Default flags @@ -136,7 +152,7 @@ ::mdbx::env_managed open_env(EnvConfig& config) { auto growth_size = static_cast(config.in_memory ? 8_Mebi : config.growth_size); cp.geometry.make_dynamic(::mdbx::env::geometry::default_value, max_map_size); cp.geometry.growth_step = growth_size; - if (!db_ondisk_file_size) { + if (!db_file_size) { cp.geometry.pagesize = static_cast(config.page_size); } } @@ -144,7 +160,7 @@ ::mdbx::env_managed open_env(EnvConfig& config) { using OP = ::mdbx::env::operate_parameters; OP op{}; // Operational parameters op.mode = OP::mode_from_flags(static_cast(flags)); - op.options = op.options_from_flags(static_cast(flags)); + op.options = OP::options_from_flags(static_cast(flags)); op.durability = OP::durability_from_flags(static_cast(flags)); op.max_maps = config.max_tables; op.max_readers = config.max_readers; @@ -155,7 +171,7 @@ ::mdbx::env_managed open_env(EnvConfig& config) { config.page_size = env.get_pagesize(); if (!config.shared) { - // C++ bindings don't have setoptions + // C++ bindings don't have set_option ::mdbx::error::success_or_throw(::mdbx_env_set_option(env, MDBX_opt_rp_augment_limit, 32_Mebi)); if (!config.readonly) { ::mdbx::error::success_or_throw(::mdbx_env_set_option(env, MDBX_opt_txn_dp_initial, 16_Kibi)); @@ -190,7 +206,7 @@ ::mdbx::cursor_managed open_cursor(::mdbx::txn& tx, const MapConfig& config) { size_t max_value_size_for_leaf_page(const size_t page_size, const size_t key_size) { /* - * On behalf of configured MDBX's page size we need to find + * On behalf of configured MDBX page size we need to find * the size of each shard best fitting in data page without * causing MDBX to write value in overflow pages. * @@ -252,6 +268,54 @@ void RWTxnManaged::commit_and_stop() { } } +RWTxnUnmanaged::~RWTxnUnmanaged() { + if (handle_) { + RWTxnUnmanaged::abort(); + } +} + +void RWTxnUnmanaged::abort() { + const ::mdbx::error err = static_cast(::mdbx_txn_abort(handle_)); + if (err.code() != MDBX_THREAD_MISMATCH) { + handle_ = nullptr; + } + if (err.code() != MDBX_SUCCESS) { + err.throw_exception(); + } +} + +void RWTxnUnmanaged::commit_and_renew() { + if (commit_disabled_) { + return; + } + mdbx::env env = db(); + commit(); + // Renew transaction + ::mdbx::error::success_or_throw( + ::mdbx_txn_begin(env, nullptr, MDBX_TXN_READWRITE, &handle_)); + SILKWORM_ASSERT(handle_); +} + +void RWTxnUnmanaged::commit_and_stop() { + if (commit_disabled_) { + return; + } + commit(); +} + +void RWTxnUnmanaged::commit() { + MDBX_commit_latency commit_latency{}; + const ::mdbx::error err = static_cast(::mdbx_txn_commit_ex(handle_, &commit_latency)); + if (err.code() != MDBX_THREAD_MISMATCH) { + handle_ = nullptr; + } + if (err.code() != MDBX_SUCCESS) { + err.throw_exception(); + } + SILKWORM_ASSERT(!handle_); + SILK_TRACE << "Commit latency" << detail::log_args_for_commit_latency(commit_latency); +} + thread_local ObjectPool PooledCursor::handles_pool_{}; PooledCursor::PooledCursor() { diff --git a/silkworm/node/db/mdbx.hpp b/silkworm/node/db/mdbx.hpp index 1eaf2101a1..41bf3ab5df 100644 --- a/silkworm/node/db/mdbx.hpp +++ b/silkworm/node/db/mdbx.hpp @@ -319,15 +319,18 @@ class RWTxnManaged : public RWTxn { }; //! \brief ROTxnUnmanaged wraps an *unmanaged* read-write transaction, which means the underlying transaction -//! lifecycle is not touched by this class. This implies that this class does not commit nor abort the transaction. +//! is not created by this class, but can be committed or aborted. class RWTxnUnmanaged : public RWTxn, protected ::mdbx::txn { public: explicit RWTxnUnmanaged(MDBX_txn* ptr) : RWTxn{static_cast<::mdbx::txn&>(*this)}, ::mdbx::txn{ptr} {} - ~RWTxnUnmanaged() override = default; + ~RWTxnUnmanaged() override; - void abort() override {} - void commit_and_renew() override {} - void commit_and_stop() override {} + void abort() override; + void commit_and_renew() override; + void commit_and_stop() override; + + private: + void commit(); }; //! \brief This class create ROTxn(s) on demand, it is used to enforce in some method signatures the type of db access @@ -391,12 +394,12 @@ ::mdbx::map_handle open_map(::mdbx::txn& tx, const MapConfig& config); //! \return A handle to the opened cursor ::mdbx::cursor_managed open_cursor(::mdbx::txn& tx, const MapConfig& config); -//! \brief Computes the max size of value data to fit in a leaf data page -//! \param [in] page_size : the actually configured MDBX's page size +//! \brief Computes the max size of single-value data to fit into a leaf data page +//! \param [in] page_size : the actually configured MDBX page size //! \param [in] key_size : the known key size to fit in bundle computed value size size_t max_value_size_for_leaf_page(size_t page_size, size_t key_size); -//! \brief Computes the max size of value data to fit in a leaf data page +//! \brief Computes the max size of single-value data to fit into a leaf data page //! \param [in] txn : the transaction used to derive pagesize from //! \param [in] key_size : the known key size to fit in bundle computed value size size_t max_value_size_for_leaf_page(const ::mdbx::txn& txn, size_t key_size); diff --git a/silkworm/node/db/mdbx_test.cpp b/silkworm/node/db/mdbx_test.cpp index a6cf1d45ca..44c1cc07ff 100644 --- a/silkworm/node/db/mdbx_test.cpp +++ b/silkworm/node/db/mdbx_test.cpp @@ -15,6 +15,7 @@ */ #include +#include #include #include @@ -238,7 +239,7 @@ TEST_CASE("RWTxn") { auto env{db::open_env(db_config)}; static const char* table_name{"GeneticCode"}; - SECTION("Managed") { + SECTION("Managed: commit_and_renew") { { auto tx{db::RWTxnManaged(env)}; db::PooledCursor table_cursor(*tx, {table_name}); @@ -256,7 +257,25 @@ TEST_CASE("RWTxn") { REQUIRE(table_cursor.empty() == false); } - SECTION("External") { + SECTION("Managed: commit_and_stop") { + { + auto tx{db::RWTxnManaged(env)}; + db::PooledCursor table_cursor(*tx, {table_name}); + + // populate table + for (const auto& [key, value] : kGeneticCode) { + table_cursor.upsert(mdbx::slice{key}, mdbx::slice{value}); + } + + tx.commit_and_stop(); + } + + auto tx{env.start_read()}; + db::PooledCursor table_cursor(tx, {table_name}); + REQUIRE(table_cursor.empty() == false); + } + + SECTION("External: commit_and_renew") { RWTxnManaged tx{env}; tx.disable_commit(); { @@ -268,6 +287,18 @@ TEST_CASE("RWTxn") { REQUIRE(db::has_map(tx2, table_name) == false); } + SECTION("External: commit_and_stop") { + RWTxnManaged tx{env}; + tx.disable_commit(); + { + (void)tx->create_map(table_name, mdbx::key_mode::usual, mdbx::value_mode::single); + tx.commit_and_stop(); // Does not have any effect + } + tx.abort(); + RWTxnManaged tx2{env}; + REQUIRE(db::has_map(tx2, table_name) == false); + } + SECTION("Cursor from RWTxn") { auto tx{db::RWTxnManaged(env)}; db::PooledCursor table_cursor(tx, {table_name}); @@ -276,6 +307,46 @@ TEST_CASE("RWTxn") { table_cursor.close(); REQUIRE_THROWS(table_cursor.bind(tx, {table_name})); } + + SECTION("Unmanaged: commit_and_renew") { + { + ::MDBX_txn* rw_txn{nullptr}; + ::mdbx::error::success_or_throw(::mdbx_txn_begin(env, nullptr, MDBX_TXN_READWRITE, &rw_txn)); + + auto tx{db::RWTxnUnmanaged(rw_txn)}; + db::PooledCursor table_cursor(*tx, {table_name}); + + // populate table + for (const auto& [key, value] : kGeneticCode) { + table_cursor.upsert(mdbx::slice{key}, mdbx::slice{value}); + } + + tx.commit_and_renew(); + } + auto ro_txn{env.start_read()}; + db::PooledCursor cursor(ro_txn, {table_name}); + CHECK(cursor.empty() == false); + } + + SECTION("Unmanaged: commit_and_stop") { + { + ::MDBX_txn* rw_txn{nullptr}; + ::mdbx::error::success_or_throw(::mdbx_txn_begin(env, nullptr, MDBX_TXN_READWRITE, &rw_txn)); + + auto tx{db::RWTxnUnmanaged(rw_txn)}; + db::PooledCursor table_cursor(*tx, {table_name}); + + // populate table + for (const auto& [key, value] : kGeneticCode) { + table_cursor.upsert(mdbx::slice{key}, mdbx::slice{value}); + } + + tx.commit_and_stop(); + } + auto ro_txn{env.start_read()}; + db::PooledCursor cursor(ro_txn, {table_name}); + CHECK(cursor.empty() == false); + } } TEST_CASE("Cursor walk") { @@ -494,11 +565,26 @@ TEST_CASE("Cursor walk") { } } +//! Compute the maximum free space in an empty MDBX database page +//! \warning this may change in future versions of MDBX +//! \details see `page_space` function in MDBX core.c +static size_t page_space(const mdbx::env& env) { + constexpr size_t kPageHeaderSize{20}; // size of MDBX page header (PAGEHDRSZ) + const size_t db_page_size{env.get_pagesize()}; + return db_page_size - kPageHeaderSize; +} + +static size_t max_multivalue_size_for_leaf_page(const mdbx::txn& txn) { + const size_t kDupSortNodes{2}; + const size_t kNodeHeaderSize{8}; // size of MDBX node header (NODESIZE) + return page_space(txn.env()) / kDupSortNodes - 2 * kNodeHeaderSize; +} + TEST_CASE("OF pages") { test::Context context; db::RWTxn& txn = context.rw_txn(); - SECTION("No overflow") { + SECTION("Single-value map: No overflow") { db::PooledCursor target(txn, db::table::kAccountHistory); Bytes key(20, '\0'); Bytes value(db::max_value_size_for_leaf_page(*txn, key.size()), '\0'); @@ -506,18 +592,346 @@ TEST_CASE("OF pages") { txn.commit_and_renew(); target.bind(txn, db::table::kAccountHistory); auto stats{target.get_map_stat()}; - REQUIRE(!stats.ms_overflow_pages); + CHECK(stats.ms_overflow_pages == 0); } - SECTION("Let's overflow") { + SECTION("Single-value map: Let's overflow") { db::PooledCursor target(txn, db::table::kAccountHistory); Bytes key(20, '\0'); - Bytes value(db::max_value_size_for_leaf_page(*txn, key.size()) + /*any extra value */ 1, '\0'); + Bytes value(db::max_value_size_for_leaf_page(*txn, key.size()) + /*any extra value*/ 1, '\0'); target.insert(db::to_slice(key), db::to_slice(value)); txn.commit_and_renew(); target.bind(txn, db::table::kAccountHistory); auto stats{target.get_map_stat()}; - REQUIRE(stats.ms_overflow_pages); + CHECK(stats.ms_overflow_pages > 0); + } + + SECTION("Multi-value map: No overflow, value size OK") { + db::PooledCursor target(txn, db::table::kPlainState); + Bytes key(20, '\0'); + Bytes value(db::max_multivalue_size_for_leaf_page(txn), '\0'); + target.insert(db::to_slice(key), db::to_slice(value)); + txn.commit_and_renew(); + target.bind(txn, db::table::kPlainState); + auto stats{target.get_map_stat()}; + CHECK(stats.ms_overflow_pages == 0); + } + + // Skip the following section in debug as too big data size in multi-value map will assert +#ifndef MDBX_DEBUG + SECTION("Multi-value map: No overflow, error for value too big") { + db::PooledCursor target(txn, db::table::kPlainState); + Bytes key(20, '\0'); + Bytes value(db::max_multivalue_size_for_leaf_page(txn) + /*any extra value*/ 1, '\0'); + CHECK_THROWS_AS(target.insert(db::to_slice(key), db::to_slice(value)), ::mdbx::exception); + } +#endif // MDBX_DEBUG +} + +static uint64_t get_free_pages(const ::mdbx::env& env) { + uint64_t free_pages{0}; + + // Use threaded execution because MDBX does not allow overlapping txns in same thread + std::async([&]() { + constexpr MDBX_dbi FREE_DBI{0}; + ::mdbx::map_handle free_map{FREE_DBI}; + + auto ro_txn{env.start_read()}; + auto free_cursor{ro_txn.open_cursor(free_map)}; + auto data = free_cursor.to_first(false); + while (data.done) { + size_t tx_id{0}; + std::memcpy(&tx_id, db::from_slice(data.key).data(), sizeof(size_t)); + uint32_t tx_free_pages{0}; + std::memcpy(&tx_free_pages, db::from_slice(data.value).data(), sizeof(uint32_t)); + free_pages += tx_free_pages; + data = free_cursor.to_next(false); + } + }).get(); + + return free_pages; +} + +TEST_CASE("Single-value erase+upsert w/ same value increases free pages") { + TemporaryDirectory tmp_dir{}; + auto data_directory{std::make_unique(tmp_dir.path(), /*create=*/true)}; + db::EnvConfig env_config{ + .path = data_directory->chaindata().path().string(), + .create = true, + .readonly = false, + .exclusive = false, + .in_memory = true, + }; + auto env{db::open_env(env_config)}; + + constexpr size_t kKeySize{20}; // just to copycat account address size + const Bytes key(kKeySize, '\0'); + + // Initialize the map content w/ one max-size value [scope needed to limit rw_txn lifecycle] + { + auto rw_txn{env.start_write()}; + auto code_map{db::open_map(rw_txn, db::table::kCode)}; + auto code_stats{rw_txn.get_map_stat(code_map)}; + REQUIRE(code_stats.ms_entries == 0); + REQUIRE(code_stats.ms_depth == 0); + REQUIRE(code_stats.ms_branch_pages == 0); + REQUIRE(code_stats.ms_leaf_pages == 0); + REQUIRE(code_stats.ms_overflow_pages == 0); + auto code_cursor{rw_txn.open_cursor(code_map)}; + Bytes value(db::max_value_size_for_leaf_page(rw_txn, key.size()), static_cast(10)); + code_cursor.insert(db::to_slice(key), db::to_slice(value)); // insert or upsert equivalent here + code_stats = rw_txn.get_map_stat(code_map); + REQUIRE(code_stats.ms_entries == 1); // we have 1 value here + REQUIRE(code_stats.ms_depth == 1); + REQUIRE(code_stats.ms_branch_pages == 0); + REQUIRE(code_stats.ms_leaf_pages == 1); // we have 1 max data value + REQUIRE(code_stats.ms_overflow_pages == 0); + rw_txn.commit(); + CHECK(get_free_pages(env) == 0); // No free pages after initialization + } + + SECTION("upsert same value does not cause any free page") { + auto rw_txn{env.start_write()}; + auto code_map{db::open_map(rw_txn, db::table::kCode)}; + auto code_cursor{rw_txn.open_cursor(code_map)}; + auto key_slice{db::to_slice(key)}; + Bytes value(db::max_value_size_for_leaf_page(rw_txn, key.size()), static_cast(10)); + auto value_slice{db::to_slice(value)}; + code_cursor.upsert(key_slice, value_slice); + auto code_stats{rw_txn.get_map_stat(code_map)}; + REQUIRE(code_stats.ms_entries == 1); + REQUIRE(code_stats.ms_depth == 1); + REQUIRE(code_stats.ms_branch_pages == 0); + REQUIRE(code_stats.ms_leaf_pages == 1); + REQUIRE(code_stats.ms_overflow_pages == 0); + rw_txn.commit(); + CHECK(get_free_pages(env) == 0); // no free page produced + } + + SECTION("erase + upsert same value causes two free pages") { + auto rw_txn{env.start_write()}; + auto code_map{db::open_map(rw_txn, db::table::kCode)}; + auto code_cursor{rw_txn.open_cursor(code_map)}; + auto key_slice{db::to_slice(key)}; + CHECK(code_cursor.erase(key_slice)); + Bytes value(db::max_value_size_for_leaf_page(rw_txn, key.size()), static_cast(10)); + auto value_slice{db::to_slice(value)}; + code_cursor.upsert(key_slice, value_slice); + auto code_stats{rw_txn.get_map_stat(code_map)}; + REQUIRE(code_stats.ms_entries == 1); + REQUIRE(code_stats.ms_depth == 1); + REQUIRE(code_stats.ms_branch_pages == 0); + REQUIRE(code_stats.ms_leaf_pages == 1); + REQUIRE(code_stats.ms_overflow_pages == 0); + rw_txn.commit(); + CHECK(get_free_pages(env) == 2); // +2 free pages if we erase+upsert w/ same value => bad pattern + } + + SECTION("upsert different value causes two free pages") { + auto rw_txn{env.start_write()}; + auto code_map{db::open_map(rw_txn, db::table::kCode)}; + auto code_cursor{rw_txn.open_cursor(code_map)}; + auto key_slice{db::to_slice(key)}; + Bytes value(db::max_value_size_for_leaf_page(rw_txn, key.size()), static_cast(11)); + auto value_slice{db::to_slice(value)}; + code_cursor.upsert(key_slice, value_slice); + auto code_stats{rw_txn.get_map_stat(code_map)}; + REQUIRE(code_stats.ms_entries == 1); // we have 1 value here since table is single-value + REQUIRE(code_stats.ms_depth == 1); + REQUIRE(code_stats.ms_branch_pages == 0); + REQUIRE(code_stats.ms_leaf_pages == 1); // we have 1 max data value + REQUIRE(code_stats.ms_overflow_pages == 0); + rw_txn.commit(); + CHECK(get_free_pages(env) == 2); // +2 free pages in any case + } + + SECTION("erase + upsert different value causes two free pages") { + auto rw_txn{env.start_write()}; + auto code_map{db::open_map(rw_txn, db::table::kCode)}; + auto code_cursor{rw_txn.open_cursor(code_map)}; + auto key_slice{db::to_slice(key)}; + CHECK(code_cursor.erase(key_slice, true)); + Bytes value(db::max_value_size_for_leaf_page(rw_txn, key.size()), static_cast(11)); + auto value_slice{db::to_slice(value)}; + code_cursor.upsert(key_slice, value_slice); + auto code_stats{rw_txn.get_map_stat(code_map)}; + REQUIRE(code_stats.ms_entries == 1); + REQUIRE(code_stats.ms_depth == 1); + REQUIRE(code_stats.ms_branch_pages == 0); + REQUIRE(code_stats.ms_leaf_pages == 1); + REQUIRE(code_stats.ms_overflow_pages == 0); + rw_txn.commit(); + CHECK(get_free_pages(env) == 2); // +2 free pages in any case + } + + SECTION("erase causes two free pages") { + auto rw_txn{env.start_write()}; + auto code_map{db::open_map(rw_txn, db::table::kCode)}; + auto code_cursor{rw_txn.open_cursor(code_map)}; + auto key_slice{db::to_slice(key)}; + CHECK(code_cursor.erase(key_slice)); + auto code_stats{rw_txn.get_map_stat(code_map)}; + REQUIRE(code_stats.ms_entries == 0); + REQUIRE(code_stats.ms_depth == 0); + REQUIRE(code_stats.ms_branch_pages == 0); + REQUIRE(code_stats.ms_leaf_pages == 0); + REQUIRE(code_stats.ms_overflow_pages == 0); + rw_txn.commit(); + CHECK(get_free_pages(env) == 2); // +2 free pages if we erase + } + + SECTION("erase + upsert same value w/ 2 commits causes two free pages") { + { + auto rw_txn{env.start_write()}; + auto code_map{db::open_map(rw_txn, db::table::kCode)}; + auto code_cursor{rw_txn.open_cursor(code_map)}; + auto key_slice{db::to_slice(key)}; + CHECK(code_cursor.erase(key_slice)); + auto code_stats{rw_txn.get_map_stat(code_map)}; + REQUIRE(code_stats.ms_entries == 0); + REQUIRE(code_stats.ms_depth == 0); + REQUIRE(code_stats.ms_branch_pages == 0); + REQUIRE(code_stats.ms_leaf_pages == 0); + REQUIRE(code_stats.ms_overflow_pages == 0); + rw_txn.commit(); + CHECK(get_free_pages(env) == 2); + } + { + auto rw_txn{env.start_write()}; + auto code_map{db::open_map(rw_txn, db::table::kCode)}; + auto code_cursor{rw_txn.open_cursor(code_map)}; + auto key_slice{db::to_slice(key)}; + Bytes value(db::max_value_size_for_leaf_page(rw_txn, key.size()), static_cast(10)); + auto value_slice{db::to_slice(value)}; + code_cursor.upsert(key_slice, value_slice); + auto code_stats{rw_txn.get_map_stat(code_map)}; + REQUIRE(code_stats.ms_entries == 1); + REQUIRE(code_stats.ms_depth == 1); + REQUIRE(code_stats.ms_branch_pages == 0); + REQUIRE(code_stats.ms_leaf_pages == 1); + REQUIRE(code_stats.ms_overflow_pages == 0); + rw_txn.commit(); + CHECK(get_free_pages(env) == 2); + } + } +} + +TEST_CASE("Multi-value erase+upsert w/ same value increases free pages") { + TemporaryDirectory tmp_dir{}; + auto data_directory{std::make_unique(tmp_dir.path(), /*create=*/true)}; + db::EnvConfig env_config{ + .path = data_directory->chaindata().path().string(), + .create = true, + .readonly = false, + .exclusive = false, + .in_memory = true, + }; + auto env{db::open_env(env_config)}; + + // We need to split max multi-value data size between key and value + constexpr size_t kKeySize{20}; // just to copycat account address size + const size_t kMaxNonInitialValueSize{[&env]() { + auto ro_txn{env.start_read()}; + return db::max_multivalue_size_for_leaf_page(ro_txn); + }()}; + const size_t kMaxFirstValueSize{kMaxNonInitialValueSize - kKeySize}; // we need to take key size into account once + const Bytes key(kKeySize, '\0'); + + // Initialize the map content w/ one max-size value [scope needed to limit rw_txn lifecycle] + { + auto rw_txn{env.start_write()}; + auto plain_state_map{db::open_map(rw_txn, db::table::kPlainState)}; + auto plain_state_stats{rw_txn.get_map_stat(plain_state_map)}; + REQUIRE(plain_state_stats.ms_entries == 0); + REQUIRE(plain_state_stats.ms_depth == 0); + REQUIRE(plain_state_stats.ms_branch_pages == 0); + REQUIRE(plain_state_stats.ms_leaf_pages == 0); + REQUIRE(plain_state_stats.ms_overflow_pages == 0); + auto plain_state_cursor{rw_txn.open_cursor(plain_state_map)}; + Bytes value(kMaxFirstValueSize, static_cast(10)); + plain_state_cursor.insert(db::to_slice(key), db::to_slice(value)); // insert or upsert equivalent here + plain_state_stats = rw_txn.get_map_stat(plain_state_map); + REQUIRE(plain_state_stats.ms_entries == 1); // we have 1 value here + REQUIRE(plain_state_stats.ms_depth == 1); + REQUIRE(plain_state_stats.ms_branch_pages == 0); + REQUIRE(plain_state_stats.ms_leaf_pages == 1); + REQUIRE(plain_state_stats.ms_overflow_pages == 0); + rw_txn.commit(); + CHECK(get_free_pages(env) == 0); // No free pages after + } + + SECTION("upsert same value does not cause any free page") { + auto rw_txn{env.start_write()}; + auto plain_state_map{db::open_map(rw_txn, db::table::kPlainState)}; + auto plain_state_cursor{rw_txn.open_cursor(plain_state_map)}; + auto key_slice{db::to_slice(key)}; + Bytes value(kMaxFirstValueSize, static_cast(10)); + auto value_slice{db::to_slice(value)}; + plain_state_cursor.upsert(key_slice, value_slice); + auto plain_state_stats{rw_txn.get_map_stat(plain_state_map)}; + REQUIRE(plain_state_stats.ms_entries == 1); + REQUIRE(plain_state_stats.ms_depth == 1); + REQUIRE(plain_state_stats.ms_branch_pages == 0); + REQUIRE(plain_state_stats.ms_leaf_pages == 1); + REQUIRE(plain_state_stats.ms_overflow_pages == 0); + rw_txn.commit(); + CHECK(get_free_pages(env) == 0); // no free page produced + } + + SECTION("erase + upsert same value causes two free pages") { + auto rw_txn{env.start_write()}; + auto plain_state_map{db::open_map(rw_txn, db::table::kPlainState)}; + auto plain_state_cursor{rw_txn.open_cursor(plain_state_map)}; + auto key_slice{db::to_slice(key)}; + CHECK(plain_state_cursor.erase(key_slice, /*whole_multivalue=*/true)); + Bytes value(kMaxFirstValueSize, static_cast(10)); + auto value_slice{db::to_slice(value)}; + plain_state_cursor.upsert(key_slice, value_slice); + auto plain_state_stats{rw_txn.get_map_stat(plain_state_map)}; + REQUIRE(plain_state_stats.ms_entries == 1); + REQUIRE(plain_state_stats.ms_depth == 1); + REQUIRE(plain_state_stats.ms_branch_pages == 0); + REQUIRE(plain_state_stats.ms_leaf_pages == 1); + REQUIRE(plain_state_stats.ms_overflow_pages == 0); + rw_txn.commit(); + CHECK(get_free_pages(env) == 2); // +2 free pages if we erase+upsert w/ same value => bad pattern + } + + SECTION("upsert different value causes two free pages") { + auto rw_txn{env.start_write()}; + auto plain_state_map{db::open_map(rw_txn, db::table::kPlainState)}; + auto plain_state_cursor{rw_txn.open_cursor(plain_state_map)}; + auto key_slice{db::to_slice(key)}; + Bytes value(kMaxNonInitialValueSize, static_cast(11)); + auto value_slice{db::to_slice(value)}; + plain_state_cursor.upsert(key_slice, value_slice); + auto plain_state_stats{rw_txn.get_map_stat(plain_state_map)}; + REQUIRE(plain_state_stats.ms_entries == 2); // we have 2 values here since table is multi-value + REQUIRE(plain_state_stats.ms_depth == 1); + REQUIRE(plain_state_stats.ms_branch_pages == 0); + REQUIRE(plain_state_stats.ms_leaf_pages == 2); // we have 2 max data values hence we need 2 pages + REQUIRE(plain_state_stats.ms_overflow_pages == 0); + rw_txn.commit(); + CHECK(get_free_pages(env) == 2); // +2 free pages in any case + } + + SECTION("erase + upsert different value causes two free pages") { + auto rw_txn{env.start_write()}; + auto plain_state_map{db::open_map(rw_txn, db::table::kPlainState)}; + auto plain_state_cursor{rw_txn.open_cursor(plain_state_map)}; + auto key_slice{db::to_slice(key)}; + CHECK(plain_state_cursor.erase(key_slice, /*whole_multivalue=*/true)); + Bytes value(kMaxFirstValueSize, static_cast(11)); + auto value_slice{db::to_slice(value)}; + plain_state_cursor.upsert(key_slice, value_slice); + auto plain_state_stats{rw_txn.get_map_stat(plain_state_map)}; + REQUIRE(plain_state_stats.ms_entries == 1); + REQUIRE(plain_state_stats.ms_depth == 1); + REQUIRE(plain_state_stats.ms_branch_pages == 0); + REQUIRE(plain_state_stats.ms_leaf_pages == 1); + REQUIRE(plain_state_stats.ms_overflow_pages == 0); + rw_txn.commit(); + CHECK(get_free_pages(env) == 2); // +2 free pages in any case } }