From 1f5e4905c3ff662060e6b67d484779005626b497 Mon Sep 17 00:00:00 2001 From: SpyCheese Date: Mon, 16 Sep 2024 18:01:50 +0300 Subject: [PATCH] Fix estimating block size, repeat collation on error --- crypto/block/block.h | 6 +++ validator/fabric.h | 2 +- validator/impl/collator-impl.h | 6 ++- validator/impl/collator.cpp | 88 ++++++++++++++++++++++++---------- validator/impl/fabric.cpp | 12 +++-- 5 files changed, 83 insertions(+), 31 deletions(-) diff --git a/crypto/block/block.h b/crypto/block/block.h index 5f3dadff4..0247d79cb 100644 --- a/crypto/block/block.h +++ b/crypto/block/block.h @@ -239,6 +239,12 @@ struct ParamLimits { bool deserialize(vm::CellSlice& cs); int classify(td::uint64 value) const; bool fits(unsigned cls, td::uint64 value) const; + void multiply_by(double x) { + CHECK(x > 0.0); + for (td::uint32& y : limits_) { + y = (td::uint32)std::min(y * x, 1e9); + } + } private: std::array limits_; diff --git a/validator/fabric.h b/validator/fabric.h index fabdf8e3c..80d962e09 100644 --- a/validator/fabric.h +++ b/validator/fabric.h @@ -81,7 +81,7 @@ void run_validate_query(ShardIdFull shard, BlockIdExt min_masterchain_block_id, void run_collate_query(ShardIdFull shard, const BlockIdExt& min_masterchain_block_id, std::vector prev, Ed25519_PublicKey creator, td::Ref validator_set, td::Ref collator_opts, td::actor::ActorId manager, - td::Timestamp timeout, td::Promise promise); + td::Timestamp timeout, td::Promise promise, int attempt_idx = 0); void run_collate_hardfork(ShardIdFull shard, const BlockIdExt& min_masterchain_block_id, std::vector prev, td::actor::ActorId manager, td::Timestamp timeout, td::Promise promise); diff --git a/validator/impl/collator-impl.h b/validator/impl/collator-impl.h index 708278545..3bfd53fed 100644 --- a/validator/impl/collator-impl.h +++ b/validator/impl/collator-impl.h @@ -76,6 +76,8 @@ class Collator final : public td::actor::Actor { td::Timestamp timeout; td::Timestamp queue_cleanup_timeout_, soft_timeout_, medium_timeout_; 
td::Promise main_promise; + int attempt_idx_; + bool allow_repeat_collation_ = false; ton::BlockSeqno last_block_seqno{0}; ton::BlockSeqno prev_mc_block_seqno{0}; ton::BlockSeqno new_block_seqno{0}; @@ -90,7 +92,8 @@ class Collator final : public td::actor::Actor { public: Collator(ShardIdFull shard, bool is_hardfork, BlockIdExt min_masterchain_block_id, std::vector prev, Ref validator_set, Ed25519_PublicKey collator_id, Ref collator_opts, - td::actor::ActorId manager, td::Timestamp timeout, td::Promise promise); + td::actor::ActorId manager, td::Timestamp timeout, td::Promise promise, + int attempt_idx); ~Collator() override = default; bool is_busy() const { return busy_; @@ -318,6 +321,7 @@ class Collator final : public td::actor::Actor { bool insert_out_msg(Ref out_msg); bool insert_out_msg(Ref out_msg, td::ConstBitPtr msg_hash); bool register_out_msg_queue_op(bool force = false); + bool register_dispatch_queue_op(bool force = false); bool update_min_mc_seqno(ton::BlockSeqno some_mc_seqno); bool combine_account_transactions(); bool update_public_libraries(); diff --git a/validator/impl/collator.cpp b/validator/impl/collator.cpp index e9d89cd05..016e9fd2f 100644 --- a/validator/impl/collator.cpp +++ b/validator/impl/collator.cpp @@ -45,11 +45,13 @@ using td::Ref; using namespace std::literals::string_literals; // Don't increase MERGE_MAX_QUEUE_LIMIT too much: merging requires cleaning the whole queue in out_msg_queue_cleanup -static const td::uint32 FORCE_SPLIT_QUEUE_SIZE = 4096; -static const td::uint32 SPLIT_MAX_QUEUE_SIZE = 100000; -static const td::uint32 MERGE_MAX_QUEUE_SIZE = 2047; -static const td::uint32 SKIP_EXTERNALS_QUEUE_SIZE = 8000; -static const int HIGH_PRIORITY_EXTERNAL = 10; // don't skip high priority externals when queue is big +static constexpr td::uint32 FORCE_SPLIT_QUEUE_SIZE = 4096; +static constexpr td::uint32 SPLIT_MAX_QUEUE_SIZE = 100000; +static constexpr td::uint32 MERGE_MAX_QUEUE_SIZE = 2047; +static constexpr td::uint32 
SKIP_EXTERNALS_QUEUE_SIZE = 8000; +static constexpr int HIGH_PRIORITY_EXTERNAL = 10; // don't skip high priority externals when queue is big + +static constexpr int MAX_ATTEMPTS = 4; #define DBG(__n) dbg(__n)&& #define DSTART int __dcnt = 0; @@ -74,11 +76,12 @@ static inline bool dbg(int c) { * @param manager The ActorId of the ValidatorManager. * @param timeout The timeout for the collator. * @param promise The promise to return the result. + * @param attempt_idx The index of the attempt, starting from 0. On later attempts collator decreases block limits and skips some steps. */ Collator::Collator(ShardIdFull shard, bool is_hardfork, BlockIdExt min_masterchain_block_id, std::vector prev, td::Ref validator_set, Ed25519_PublicKey collator_id, Ref collator_opts, td::actor::ActorId manager, - td::Timestamp timeout, td::Promise promise) + td::Timestamp timeout, td::Promise promise, int attempt_idx) : shard_(shard) , is_hardfork_(is_hardfork) , min_mc_block_id{min_masterchain_block_id} @@ -93,6 +96,7 @@ Collator::Collator(ShardIdFull shard, bool is_hardfork, BlockIdExt min_mastercha , soft_timeout_(td::Timestamp::at(timeout.at() - 3.0)) , medium_timeout_(td::Timestamp::at(timeout.at() - 1.5)) , main_promise(std::move(promise)) + , attempt_idx_(attempt_idx) , perf_timer_("collate", 0.1, [manager](double duration) { send_closure(manager, &ValidatorManager::add_perf_timer_stat, "collate", duration); }) { @@ -107,7 +111,8 @@ Collator::Collator(ShardIdFull shard, bool is_hardfork, BlockIdExt min_mastercha * The results of these queries are handled by corresponding callback functions. */ void Collator::start_up() { - LOG(WARNING) << "Collator for shard " << shard_.to_str() << " started"; + LOG(WARNING) << "Collator for shard " << shard_.to_str() << " started" + << (attempt_idx_ ? 
PSTRING() << " (attempt #" << attempt_idx_ << ")" : ""); LOG(DEBUG) << "Previous block #1 is " << prev_blocks.at(0).to_str(); if (prev_blocks.size() > 1) { LOG(DEBUG) << "Previous block #2 is " << prev_blocks.at(1).to_str(); @@ -340,7 +345,13 @@ bool Collator::fatal_error(td::Status error) { error.ensure_error(); LOG(ERROR) << "cannot generate block candidate for " << show_shard(shard_) << " : " << error.to_string(); if (busy_) { - main_promise(std::move(error)); + if (allow_repeat_collation_ && attempt_idx_ + 1 < MAX_ATTEMPTS && !is_hardfork_ && !timeout.is_in_past()) { + LOG(WARNING) << "Repeating collation (attempt #" << attempt_idx_ + 1 << ")"; + run_collate_query(shard_, min_mc_block_id, prev_blocks, created_by_, validator_set_, collator_opts_, manager, + td::Timestamp::in(10.0), std::move(main_promise), attempt_idx_ + 1); + } else { + main_promise(std::move(error)); + } busy_ = false; } stop(); @@ -712,6 +723,15 @@ bool Collator::unpack_last_mc_state() { return fatal_error(limits.move_as_error()); } block_limits_ = limits.move_as_ok(); + if (attempt_idx_ == 2) { + LOG(INFO) << "Attempt #2: bytes, gas limits /= 2"; + block_limits_->bytes.multiply_by(0.5); + block_limits_->gas.multiply_by(0.5); + } else if (attempt_idx_ == 3) { + LOG(INFO) << "Attempt #3: bytes, gas limits /= 4"; + block_limits_->bytes.multiply_by(0.25); + block_limits_->gas.multiply_by(0.25); + } LOG(DEBUG) << "block limits: bytes [" << block_limits_->bytes.underload() << ", " << block_limits_->bytes.soft() << ", " << block_limits_->bytes.hard() << "]"; LOG(DEBUG) << "block limits: gas [" << block_limits_->gas.underload() << ", " << block_limits_->gas.soft() << ", " @@ -2093,6 +2113,7 @@ bool Collator::do_collate() { if (max_lt == start_lt) { ++max_lt; } + allow_repeat_collation_ = true; // NB: interchanged 1.2 and 1.1 (is this always correct?) // 1.1. 
re-adjust neighbors' out_msg_queues (for oneself) if (!add_trivial_neighbor()) { @@ -3565,6 +3586,10 @@ bool Collator::process_inbound_external_messages() { LOG(INFO) << "skipping processing of inbound external messages"; return true; } + if (attempt_idx_ >= 2) { + LOG(INFO) << "Attempt #" << attempt_idx_ << ": skip external messages"; + return true; + } if (out_msg_queue_size_ > SKIP_EXTERNALS_QUEUE_SIZE) { LOG(INFO) << "skipping processing of inbound external messages (except for high-priority) because out_msg_queue is " "too big (" @@ -3692,6 +3717,10 @@ bool Collator::process_dispatch_queue() { if (max_per_initiator[iter] == 0 || max_total_count[iter] == 0) { continue; } + if (iter > 0 && attempt_idx_ >= 1) { + LOG(INFO) << "Attempt #" << attempt_idx_ << ": skip process_dispatch_queue"; + break; + } vm::AugmentedDictionary cur_dispatch_queue{dispatch_queue_->get_root(), 256, block::tlb::aug_DispatchQueue}; std::map, size_t> count_per_initiator; size_t total_count = 0; @@ -3704,13 +3733,13 @@ bool Collator::process_dispatch_queue() { stats_.limits_log += PSTRING() << "DISPATCH_QUEUE_STAGE_" << iter << ": " << block_full_comment(*block_limit_status_, block::ParamLimits::cl_normal) << "\n"; - return true; + return register_dispatch_queue_op(true); } if (soft_timeout_.is_in_past(td::Timestamp::now())) { block_full_ = true; LOG(WARNING) << "soft timeout reached, stop processing dispatch queue"; stats_.limits_log += PSTRING() << "DISPATCH_QUEUE_STAGE_" << iter << ": timeout\n"; - return true; + return register_dispatch_queue_op(true); } StdSmcAddress src_addr; td::Ref account_dispatch_queue; @@ -3788,6 +3817,7 @@ bool Collator::process_dispatch_queue() { if (iter == 0) { have_unprocessed_account_dispatch_queue_ = false; } + register_dispatch_queue_op(true); } return true; } @@ -3811,12 +3841,7 @@ bool Collator::process_deferred_message(Ref enq_msg, StdSmcAddres return fatal_error(PSTRING() << "failed to delete message from DispatchQueue: address=" << 
src_addr.to_hex() << ", lt=" << lt); } - ++dispatch_queue_ops_; - if (!(dispatch_queue_ops_ & 63)) { - if (!block_limit_status_->add_proof(dispatch_queue_->get_root_cell())) { - return false; - } - } + register_dispatch_queue_op(); ++sender_generated_messages_count_[src_addr]; LogicalTime enqueued_lt = 0; @@ -3909,6 +3934,7 @@ bool Collator::process_deferred_message(Ref enq_msg, StdSmcAddres ++unprocessed_deferred_messages_[src_addr]; LOG(INFO) << "delivering deferred message from account " << src_addr.to_hex() << ", lt=" << lt << ", emitted_lt=" << emitted_lt; + block_limit_status_->add_cell(msg_env); register_new_msg(std::move(new_msg)); msg_metadata = std::move(env.metadata); return true; @@ -4088,11 +4114,7 @@ bool Collator::enqueue_message(block::NewOutMsg msg, td::RefInt256 fwd_fees_rema } ++dispatch_dict_size; dispatch_queue_->set(src_addr, block::pack_account_dispatch_queue(dispatch_dict, dispatch_dict_size)); - ++dispatch_queue_ops_; - if (!(dispatch_queue_ops_ & 63)) { - return block_limit_status_->add_proof(dispatch_queue_->get_root_cell()); - } - return true; + return register_dispatch_queue_op(); } auto next_hop = block::interpolate_addr(src_prefix, dest_prefix, route_info.second); @@ -4973,6 +4995,23 @@ bool Collator::register_out_msg_queue_op(bool force) { } } +/** + * Registers a dispatch queue operation. + * Adds the proof to the block limit status every 64 operations. + * + * @param force If true, the proof will always be added to the block limit status. + * + * @returns True if the operation was successfully registered, false otherwise. + */ +bool Collator::register_dispatch_queue_op(bool force) { + ++dispatch_queue_ops_; + if (force || !(dispatch_queue_ops_ & 63)) { + return block_limit_status_->add_proof(dispatch_queue_->get_root_cell()); + } else { + return true; + } +} + /** * Creates a new shard state and the Merkle update. 
* @@ -5098,9 +5137,10 @@ bool Collator::compute_out_msg_queue_info(Ref& out_msg_queue_info) { vm::CellSlice maybe_extra = cb.as_cellslice(); cb.reset(); - return register_out_msg_queue_op(true) && out_msg_queue_->append_dict_to_bool(cb) // _ out_queue:OutMsgQueue - && processed_upto_->pack(cb) // proc_info:ProcessedInfo - && cb.append_cellslice_bool(maybe_extra) // extra:(Maybe OutMsgQueueExtra) + return register_out_msg_queue_op(true) && register_dispatch_queue_op(true) && + out_msg_queue_->append_dict_to_bool(cb) // _ out_queue:OutMsgQueue + && processed_upto_->pack(cb) // proc_info:ProcessedInfo + && cb.append_cellslice_bool(maybe_extra) // extra:(Maybe OutMsgQueueExtra) && cb.finalize_to(out_msg_queue_info); } diff --git a/validator/impl/fabric.cpp b/validator/impl/fabric.cpp index bfc25f6ed..fc942cc54 100644 --- a/validator/impl/fabric.cpp +++ b/validator/impl/fabric.cpp @@ -214,16 +214,18 @@ void run_validate_query(ShardIdFull shard, BlockIdExt min_masterchain_block_id, void run_collate_query(ShardIdFull shard, const BlockIdExt& min_masterchain_block_id, std::vector prev, Ed25519_PublicKey creator, td::Ref validator_set, td::Ref collator_opts, td::actor::ActorId manager, - td::Timestamp timeout, td::Promise promise) { + td::Timestamp timeout, td::Promise promise, int attempt_idx) { BlockSeqno seqno = 0; for (auto& p : prev) { if (p.seqno() > seqno) { seqno = p.seqno(); } } - td::actor::create_actor(PSTRING() << "collate" << shard.to_str() << ":" << (seqno + 1), shard, false, - min_masterchain_block_id, std::move(prev), std::move(validator_set), creator, - std::move(collator_opts), std::move(manager), timeout, std::move(promise)) + td::actor::create_actor(PSTRING() << "collate" << shard.to_str() << ":" << (seqno + 1) + << (attempt_idx ? 
"_" + td::to_string(attempt_idx) : ""), + shard, false, min_masterchain_block_id, std::move(prev), std::move(validator_set), + creator, std::move(collator_opts), std::move(manager), timeout, std::move(promise), + attempt_idx) .release(); } @@ -239,7 +241,7 @@ void run_collate_hardfork(ShardIdFull shard, const BlockIdExt& min_masterchain_b td::actor::create_actor(PSTRING() << "collate" << shard.to_str() << ":" << (seqno + 1), shard, true, min_masterchain_block_id, std::move(prev), td::Ref{}, Ed25519_PublicKey{Bits256::zero()}, td::Ref{true}, - std::move(manager), timeout, std::move(promise)) + std::move(manager), timeout, std::move(promise), 0) .release(); }