From dc26c3be671b13cd7d07f232df8be4ccb8cdc2e0 Mon Sep 17 00:00:00 2001 From: SpyCheese Date: Tue, 20 Aug 2024 19:54:16 +0300 Subject: [PATCH] Improve validator session stats (#1117) * Improve validator session stats * Collator stats: block limits, number of processed external messages * Collator and validator work time * Last key block seqno * Approvers and signers * End validator session stats --- catchain/catchain.h | 1 + catchain/catchain.hpp | 9 ++ tdutils/td/utils/Timer.cpp | 43 +++++++++ tdutils/td/utils/Timer.h | 18 ++++ tl/generate/scheme/ton_api.tl | 25 +++-- tl/generate/scheme/ton_api.tlo | Bin 91004 -> 92472 bytes validator-session/validator-session-types.h | 14 +++ validator-session/validator-session.cpp | 29 ++++++ validator-session/validator-session.h | 1 + validator-session/validator-session.hpp | 1 + validator/impl/collator-impl.h | 5 + validator/impl/collator.cpp | 63 ++++++++++++ validator/impl/validate-query.cpp | 21 ++++ validator/impl/validate-query.hpp | 4 + validator/interfaces/validator-manager.h | 17 ++++ validator/manager-disk.hpp | 3 + validator/manager-hardfork.hpp | 3 + validator/manager.cpp | 102 +++++++++++++++++--- validator/manager.hpp | 19 +++- validator/validator-group.cpp | 11 +++ validator/validator-group.hpp | 11 ++- 21 files changed, 373 insertions(+), 27 deletions(-) diff --git a/catchain/catchain.h b/catchain/catchain.h index 912957e56..c5c8af28d 100644 --- a/catchain/catchain.h +++ b/catchain/catchain.h @@ -96,6 +96,7 @@ class CatChain : public td::actor::Actor { virtual void send_query_via(const PublicKeyHash &dst, std::string name, td::Promise promise, td::Timestamp timeout, td::BufferSlice query, td::uint64 max_answer_size, td::actor::ActorId via) = 0; + virtual void get_source_heights(td::Promise> promise) = 0; virtual void destroy() = 0; static td::actor::ActorOwn create(std::unique_ptr callback, const CatChainOptions &opts, diff --git a/catchain/catchain.hpp b/catchain/catchain.hpp index 8c8bb99ae..586cf4744 100644 --- a/catchain/catchain.hpp +++ b/catchain/catchain.hpp @@ -115,6 +115,15 @@ class CatChainImpl : public CatChain { td::actor::send_closure(receiver_, &CatChainReceiverInterface::send_custom_query_data_via, dst, name, std::move(promise), timeout, std::move(query), max_answer_size, via); } + void get_source_heights(td::Promise> promise) override { + std::vector heights(top_source_blocks_.size(), 0); + for (size_t i = 0; i < top_source_blocks_.size(); ++i) { + if (top_source_blocks_[i]) { + heights[i] = top_source_blocks_[i]->height(); + } + } + promise.set_result(std::move(heights)); + } void destroy() override; CatChainImpl(std::unique_ptr callback, const CatChainOptions &opts, td::actor::ActorId keyring, td::actor::ActorId adnl, diff --git a/tdutils/td/utils/Timer.cpp b/tdutils/td/utils/Timer.cpp index 1f72fba96..24de099aa 100644 --- a/tdutils/td/utils/Timer.cpp +++ b/tdutils/td/utils/Timer.cpp @@ -91,4 +91,47 @@ double PerfWarningTimer::elapsed() const { return Time::now() - start_at_; } +static double thread_cpu_clock() { +#if defined(CLOCK_THREAD_CPUTIME_ID) + timespec ts; + int result = clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts); + CHECK(result == 0); + return (double)ts.tv_sec + (double)ts.tv_nsec * 1e-9; +#else + return 0.0; // TODO: MacOS and Windows support (currently cpu timer is used only in validators) +#endif +} + +ThreadCpuTimer::ThreadCpuTimer(bool is_paused) : is_paused_(is_paused) { + if (is_paused_) { + start_time_ = 0; + } else { + start_time_ = thread_cpu_clock(); + } +} + +void ThreadCpuTimer::pause() { + if (is_paused_) { + return; + } + elapsed_ += thread_cpu_clock() - start_time_; + is_paused_ = true; +} + +void ThreadCpuTimer::resume() { + if (!is_paused_) { + return; + } + start_time_ = thread_cpu_clock(); + is_paused_ = false; +} + +double ThreadCpuTimer::elapsed() const { + double res = elapsed_; + if (!is_paused_) { + res += thread_cpu_clock() - start_time_; + } + return res; +} + } // namespace td diff --git a/tdutils/td/utils/Timer.h b/tdutils/td/utils/Timer.h index 3e0cafbf5..a27cac8a7 100644 --- a/tdutils/td/utils/Timer.h +++ b/tdutils/td/utils/Timer.h @@ -62,4 +62,22 @@ class PerfWarningTimer { std::function callback_; }; +class ThreadCpuTimer { + public: + ThreadCpuTimer() : ThreadCpuTimer(false) { + } + explicit ThreadCpuTimer(bool is_paused); + ThreadCpuTimer(const ThreadCpuTimer &other) = default; + ThreadCpuTimer &operator=(const ThreadCpuTimer &other) = default; + + double elapsed() const; + void pause(); + void resume(); + + private: + double elapsed_{0}; + double start_time_; + bool is_paused_{false}; +}; + } // namespace td diff --git a/tl/generate/scheme/ton_api.tl b/tl/generate/scheme/ton_api.tl index b33ca5425..bf919b0fd 100644 --- a/tl/generate/scheme/ton_api.tl +++ b/tl/generate/scheme/ton_api.tl @@ -388,8 +388,8 @@ tonNode.newShardBlock block:tonNode.blockIdExt cc_seqno:int data:bytes = tonNode tonNode.blockBroadcastCompressed.data signatures:(vector tonNode.blockSignature) proof_data:bytes = tonNode.blockBroadcaseCompressed.Data; -tonNode.blockBroadcast id:tonNode.blockIdExt catchain_seqno:int validator_set_hash:int - signatures:(vector tonNode.blockSignature) +tonNode.blockBroadcast id:tonNode.blockIdExt catchain_seqno:int validator_set_hash:int + signatures:(vector tonNode.blockSignature) proof:bytes data:bytes = tonNode.Broadcast; tonNode.blockBroadcastCompressed id:tonNode.blockIdExt catchain_seqno:int validator_set_hash:int flags:# compressed:bytes = tonNode.Broadcast; @@ -769,13 +769,19 @@ http.server.config dhs:(vector http.server.dnsEntry) local_hosts:(vector http.se ---types--- -validatorSession.statsProducer id:int256 candidate_id:int256 block_status:int comment:string - block_timestamp:double is_accepted:Bool is_ours:Bool got_submit_at:double +validatorSession.collationStats bytes:int gas:int lt_delta:int cat_bytes:int cat_gas:int cat_lt_delta:int + limits_log:string ext_msgs_total:int ext_msgs_filtered:int ext_msgs_accepted:int ext_msgs_rejected:int = validadorSession.CollationStats; + +validatorSession.statsProducer id:int256 candidate_id:int256 block_status:int root_hash:int256 file_hash:int256 + comment:string block_timestamp:double is_accepted:Bool is_ours:Bool got_submit_at:double collation_time:double collated_at:double collation_cached:Bool + collation_work_time:double collation_cpu_work_time:double + collation_stats:validatorSession.collationStats validation_time:double validated_at:double validation_cached:Bool + validation_work_time:double validation_cpu_work_time:double gen_utime:double - approved_weight:long approved_33pct_at:double approved_66pct_at:double - signed_weight:long signed_33pct_at:double signed_66pct_at:double + approved_weight:long approved_33pct_at:double approved_66pct_at:double approvers:string + signed_weight:long signed_33pct_at:double signed_66pct_at:double signers:string serialize_time:double deserialize_time:double serialized_size:int = validatorSession.StatsProducer; validatorSession.statsRound timestamp:double producers:(vector validatorSession.statsProducer) = validatorSession.StatsRound; @@ -786,9 +792,14 @@ validatorSession.stats success:Bool id:tonNode.blockIdExt timestamp:double self: first_round:int rounds:(vector validatorSession.statsRound) = validatorSession.Stats; validatorSession.newValidatorGroupStats.node id:int256 weight:long = validatorSession.newValidatorGroupStats.Node; -validatorSession.newValidatorGroupStats session_id:int256 workchain:int shard:long cc_seqno:int timestamp:double +validatorSession.newValidatorGroupStats session_id:int256 workchain:int shard:long cc_seqno:int + last_key_block_seqno:int timestamp:double self_idx:int nodes:(vector validatorSession.newValidatorGroupStats.node) = validatorSession.NewValidatorGroupStats; +validatorSession.endValidatorGroupStats.node id:int256 catchain_blocks:int = validatorSession.endValidatorGroupStats.Node; +validatorSession.endValidatorGroupStats session_id:int256 timestamp:double + nodes:(vector validatorSession.endValidatorGroupStats.node) = validatorSession.EndValidatorGroupStats; + ---functions--- ---types--- diff --git a/tl/generate/scheme/ton_api.tlo b/tl/generate/scheme/ton_api.tlo index da1aa331d2048803f582b99ede705270f28a22ea..337dd071e3f746cd822236e2dcb63c194b379de4 100644 GIT binary patch delta 841 zcmex!jCIE)R^CUm^{p77fNLY~d3}|4min3UWr;bNDTyihMZu}X#hLkedd~SdIf*4e zR&YsTN%6!D5}WVn?_iXi_R1(;9iq4dq1ZJqW%@;aM&Zdn3?(K{F|=S=WW;%J@&&^c zES-~cgeKP*6-c&d`z+GI(3lF;xKM&oWU{2O{Ny)AKNx#AKQ*>L#R_)Q^ouNv!ivbw z014kbCgj5n^nmoU`4q$f(n!LI$;qh&B^bg*sadJX5MlAj2^JhIAonkp_$P=X7(t-{3Pu)C zC{I^#VpN!JAjBvz{e~c;%H$QkDiWYTWQ0auku#ePhi%(Iuv#8|#x<;+`kvg1oKb5kb=%#)B1M@S?Wl%h(?@*^aRL1_o%UXVjUo@boC zRg|$^R0Nt3fXNF`3spvW|5Mycsh;*50=|J@O1?0~Q`Bp4O&r$K2JQ0PuR5U3%*4@@b^8Ht&B@ku%P T$=St|57^3pWtZ1TFnR+3<5Dl9 delta 209 zcmdmSiS^GhR^CUm^{p77fPEwHdHu~220Iuhix^q3>_0DYW^#bh3KmawcJ;}Bj0z_2 zF#f^VwRxk7{i&^27{3TipCZJ#fEA=neezyobxsiL<}o23_UQ*j8MP-@6uC@)Bg&{U zT}Onm0c_g#4I+#mm{>vP$xT-jWfYsfTbz-T6T}DUlbAd)NO}4PamIk{9TJQ>jMFWo p7;C0WNHZ>({y>_sVtR!P&>WEB>3?JxJ0PqMSw@BJE^>@tya3`PN1gxx diff --git a/validator-session/validator-session-types.h b/validator-session/validator-session-types.h index e13c36d24..78a9b2460 100644 --- a/validator-session/validator-session-types.h +++ b/validator-session/validator-session-types.h @@ -77,6 +77,8 @@ struct ValidatorSessionStats { ValidatorSessionCandidateId candidate_id = ValidatorSessionCandidateId::zero(); int block_status = status_none; double block_timestamp = -1.0; + td::Bits256 root_hash = td::Bits256::zero(); + td::Bits256 file_hash = td::Bits256::zero(); std::string comment; bool is_accepted = false; @@ -159,11 +161,23 @@ struct NewValidatorGroupStats { ValidatorSessionId session_id = ValidatorSessionId::zero(); ShardIdFull shard{masterchainId}; CatchainSeqno cc_seqno = 0; + BlockSeqno last_key_block_seqno = 0; double timestamp = -1.0; td::uint32 self_idx = 0; std::vector nodes; }; +struct EndValidatorGroupStats { + struct Node { + PublicKeyHash id = PublicKeyHash::zero(); + td::uint32 catchain_blocks = 0; + }; + + ValidatorSessionId session_id = ValidatorSessionId::zero(); + double timestamp = -1.0; + std::vector nodes; +}; + } // namespace validatorsession } // namespace ton diff --git a/validator-session/validator-session.cpp b/validator-session/validator-session.cpp index 46dd44403..be5443785 100644 --- a/validator-session/validator-session.cpp +++ b/validator-session/validator-session.cpp @@ -270,6 +270,8 @@ void ValidatorSessionImpl::process_broadcast(PublicKeyHash src, td::BufferSlice } stat->deserialize_time = deserialize_time; stat->serialized_size = data.size(); + stat->root_hash = candidate->root_hash_; + stat->file_hash = file_hash; } if ((td::int32)block_round < (td::int32)cur_round_ - MAX_PAST_ROUND_BLOCK || @@ -468,6 +470,8 @@ void ValidatorSessionImpl::generated_block(td::uint32 round, ValidatorSessionCan stat->collated_at = td::Clocks::system(); stat->block_timestamp = td::Clocks::system(); stat->collation_cached = collation_cached; + stat->root_hash = root_hash; + stat->file_hash = file_hash; } if (round != cur_round_) { return; @@ -602,6 +606,8 @@ void ValidatorSessionImpl::try_approve_block(const SentBlock *block) { if (stat->block_timestamp <= 0.0) { stat->block_timestamp = td::Clocks::system(); } + stat->root_hash = B->root_hash_; + stat->file_hash = td::sha256_bits256(B->data_); } auto P = td::PromiseCreator::lambda([round = cur_round_, hash = block_id, root_hash = block->get_root_hash(), @@ -997,6 +1003,29 @@ void ValidatorSessionImpl::get_current_stats(td::Promise promise.set_result(cur_stats_); } +void ValidatorSessionImpl::get_end_stats(td::Promise promise) { + if (!started_) { + promise.set_error(td::Status::Error(ErrorCode::notready, "not started")); + return; + } + EndValidatorGroupStats stats; + stats.session_id = unique_hash_; + stats.timestamp = td::Clocks::system(); + stats.nodes.resize(description().get_total_nodes()); + for (size_t i = 0; i < stats.nodes.size(); ++i) { + stats.nodes[i].id = description().get_source_id(i); + } + td::actor::send_closure(catchain_, &catchain::CatChain::get_source_heights, + [promise = std::move(promise), + stats = std::move(stats)](td::Result> R) mutable { + TRY_RESULT_PROMISE(promise, heights, std::move(R)); + for (size_t i = 0; i < std::min(heights.size(), stats.nodes.size()); ++i) { + stats.nodes[i].catchain_blocks = heights[i]; + } + promise.set_result(std::move(stats)); + }); +} + void ValidatorSessionImpl::get_validator_group_info_for_litequery( td::uint32 cur_round, td::Promise>> promise) { diff --git a/validator-session/validator-session.h b/validator-session/validator-session.h index 0870f6718..2e1ed9b13 100644 --- a/validator-session/validator-session.h +++ b/validator-session/validator-session.h @@ -105,6 +105,7 @@ class ValidatorSession : public td::actor::Actor { virtual void start() = 0; virtual void destroy() = 0; virtual void get_current_stats(td::Promise promise) = 0; + virtual void get_end_stats(td::Promise promise) = 0; virtual void get_validator_group_info_for_litequery( td::uint32 cur_round, td::Promise>> promise) = 0; diff --git a/validator-session/validator-session.hpp b/validator-session/validator-session.hpp index 580582824..2ee4885b9 100644 --- a/validator-session/validator-session.hpp +++ b/validator-session/validator-session.hpp @@ -187,6 +187,7 @@ class ValidatorSessionImpl : public ValidatorSession { void start() override; void destroy() override; void get_current_stats(td::Promise promise) override; + void get_end_stats(td::Promise promise) override; void get_validator_group_info_for_litequery( td::uint32 cur_round, td::Promise>> promise) override; diff --git a/validator/impl/collator-impl.h b/validator/impl/collator-impl.h index 913a0ed87..b8d9e56d3 100644 --- a/validator/impl/collator-impl.h +++ b/validator/impl/collator-impl.h @@ -356,6 +356,11 @@ class Collator final : public td::actor::Actor { public: static td::uint32 get_skip_externals_queue_size(); + + private: + td::Timer work_timer_{true}; + td::ThreadCpuTimer cpu_work_timer_{true}; + CollationStats stats_; }; } // namespace validator diff --git a/validator/impl/collator.cpp b/validator/impl/collator.cpp index c6dd7caf2..f465c0f55 100644 --- a/validator/impl/collator.cpp +++ b/validator/impl/collator.cpp @@ -1772,6 +1772,12 @@ bool Collator::register_shard_block_creators(std::vector creator_li * @returns True if collation is successful, false otherwise. */ bool Collator::try_collate() { + work_timer_.resume(); + cpu_work_timer_.resume(); + SCOPE_EXIT { + work_timer_.pause(); + cpu_work_timer_.pause(); + }; if (!preinit_complete) { LOG(WARNING) << "running do_preinit()"; if (!do_preinit()) { @@ -3481,6 +3487,29 @@ bool Collator::process_inbound_message(Ref enq_msg, ton::LogicalT return true; } +/** + * Creates a string that explains which limit is exceeded. Used for collator stats. + * + * @param block_limit_status Status of block limits. + * @param cls Which limit class is exceeded. + * + * @returns String for collator stats. + */ +static std::string block_full_comment(const block::BlockLimitStatus& block_limit_status, unsigned cls) { + auto bytes = block_limit_status.estimate_block_size(); + if (!block_limit_status.limits.bytes.fits(cls, bytes)) { + return PSTRING() << "block_full bytes " << bytes; + } + if (!block_limit_status.limits.gas.fits(cls, block_limit_status.gas_used)) { + return PSTRING() << "block_full gas " << block_limit_status.gas_used; + } + auto lt_delta = block_limit_status.cur_lt - block_limit_status.limits.start_lt; + if (!block_limit_status.limits.lt_delta.fits(cls, lt_delta)) { + return PSTRING() << "block_full lt_delta " << lt_delta; + } + return ""; +} + /** * Processes inbound internal messages from message queues of the neighbors. * Messages are processed until the normal limit is reached, soft timeout is reached or there are no more messages. @@ -3495,11 +3524,14 @@ bool Collator::process_inbound_internal_messages() { block_full_ = !block_limit_status_->fits(block::ParamLimits::cl_normal); if (block_full_) { LOG(INFO) << "BLOCK FULL, stop processing inbound internal messages"; + stats_.limits_log += PSTRING() << "INBOUND_INT_MESSAGES: " + << block_full_comment(*block_limit_status_, block::ParamLimits::cl_normal) << "\n"; break; } if (soft_timeout_.is_in_past(td::Timestamp::now())) { block_full_ = true; LOG(WARNING) << "soft timeout reached, stop processing inbound internal messages"; + stats_.limits_log += PSTRING() << "INBOUND_INT_MESSAGES: timeout\n"; break; } auto kv = nb_out_msgs_->extract_cur(); @@ -3547,15 +3579,23 @@ bool Collator::process_inbound_external_messages() { } if (full) { LOG(INFO) << "BLOCK FULL, stop processing external messages"; + stats_.limits_log += PSTRING() << "INBOUND_EXT_MESSAGES: " + << block_full_comment(*block_limit_status_, block::ParamLimits::cl_soft) << "\n"; break; } if (medium_timeout_.is_in_past(td::Timestamp::now())) { LOG(WARNING) << "medium timeout reached, stop processing inbound external messages"; + stats_.limits_log += PSTRING() << "INBOUND_EXT_MESSAGES: timeout\n"; break; } auto ext_msg = ext_msg_struct.cell; ton::Bits256 hash{ext_msg->get_hash().bits()}; int r = process_external_message(std::move(ext_msg)); + if (r > 0) { + ++stats_.ext_msgs_accepted; + } else { + ++stats_.ext_msgs_rejected; + } if (r < 0) { bad_ext_msgs_.emplace_back(ext_msg_struct.hash); return false; @@ -3661,11 +3701,15 @@ bool Collator::process_dispatch_queue() { block_full_ = !block_limit_status_->fits(block::ParamLimits::cl_normal); if (block_full_) { LOG(INFO) << "BLOCK FULL, stop processing dispatch queue"; + stats_.limits_log += PSTRING() << "DISPATCH_QUEUE_STAGE_" << iter << ": " + << block_full_comment(*block_limit_status_, block::ParamLimits::cl_normal) + << "\n"; return true; } if (soft_timeout_.is_in_past(td::Timestamp::now())) { block_full_ = true; LOG(WARNING) << "soft timeout reached, stop processing dispatch queue"; + stats_.limits_log += PSTRING() << "DISPATCH_QUEUE_STAGE_" << iter << ": timeout\n"; return true; } StdSmcAddress src_addr; @@ -3715,6 +3759,7 @@ bool Collator::process_dispatch_queue() { ++total_count; if (total_count >= max_total_count[iter]) { dispatch_queue_total_limit_reached_ = true; + stats_.limits_log += PSTRING() << "DISPATCH_QUEUE_STAGE_" << iter << ": total limit reached\n"; break; } } @@ -4064,6 +4109,8 @@ bool Collator::process_new_messages(bool enqueue_only) { if ((block_full_ || have_unprocessed_account_dispatch_queue_) && !enqueue_only) { LOG(INFO) << "BLOCK FULL, enqueue all remaining new messages"; enqueue_only = true; + stats_.limits_log += PSTRING() << "NEW_MESSAGES: " + << block_full_comment(*block_limit_status_, block::ParamLimits::cl_normal) << "\n"; } LOG(DEBUG) << "have message with lt=" << msg.lt; int res = process_one_new_message(std::move(msg), enqueue_only); @@ -4072,6 +4119,8 @@ bool Collator::process_new_messages(bool enqueue_only) { } else if (res == 3) { LOG(INFO) << "All remaining new messages must be enqueued (BLOCK FULL)"; enqueue_only = true; + stats_.limits_log += PSTRING() << "NEW_MESSAGES: " + << block_full_comment(*block_limit_status_, block::ParamLimits::cl_normal) << "\n"; } } return true; @@ -5435,6 +5484,18 @@ bool Collator::create_block_candidate() { td::actor::send_closure_later(manager, &ValidatorManager::complete_external_messages, std::move(delay_ext_msgs_), std::move(bad_ext_msgs_)); } + + double work_time = work_timer_.elapsed(); + double cpu_work_time = cpu_work_timer_.elapsed(); + LOG(WARNING) << "Collate query work time = " << work_time << "s, cpu time = " << cpu_work_time << "s"; + stats_.bytes = block_limit_status_->estimate_block_size(); + stats_.gas = block_limit_status_->gas_used; + stats_.lt_delta = block_limit_status_->cur_lt - block_limit_status_->limits.start_lt; + stats_.cat_bytes = block_limit_status_->limits.classify_size(stats_.bytes); + stats_.cat_gas = block_limit_status_->limits.classify_gas(stats_.gas); + stats_.cat_lt_delta = block_limit_status_->limits.classify_lt(block_limit_status_->cur_lt); + td::actor::send_closure(manager, &ValidatorManager::record_collate_query_stats, block_candidate->id, work_time, + cpu_work_time, std::move(stats_)); return true; } @@ -5539,6 +5600,7 @@ void Collator::after_get_external_messages(td::Result ext_msg_cell = ext_msg->root_cell(); @@ -5550,6 +5612,7 @@ void Collator::after_get_external_messages(td::Resulthash()); } } diff --git a/validator/impl/validate-query.cpp b/validator/impl/validate-query.cpp index 8c39a1ab4..003b7f9f7 100644 --- a/validator/impl/validate-query.cpp +++ b/validator/impl/validate-query.cpp @@ -118,6 +118,7 @@ bool ValidateQuery::reject_query(std::string error, td::BufferSlice reason) { error = error_ctx() + error; LOG(ERROR) << "REJECT: aborting validation of block candidate for " << shard_.to_str() << " : " << error; if (main_promise) { + record_stats(); errorlog::ErrorLog::log(PSTRING() << "REJECT: aborting validation of block candidate for " << shard_.to_str() << " : " << error << ": data=" << block_candidate.id.file_hash.to_hex() << " collated_data=" << block_candidate.collated_file_hash.to_hex()); @@ -155,6 +156,7 @@ bool ValidateQuery::soft_reject_query(std::string error, td::BufferSlice reason) error = error_ctx() + error; LOG(ERROR) << "SOFT REJECT: aborting validation of block candidate for " << shard_.to_str() << " : " << error; if (main_promise) { + record_stats(); errorlog::ErrorLog::log(PSTRING() << "SOFT REJECT: aborting validation of block candidate for " << shard_.to_str() << " : " << error << ": data=" << block_candidate.id.file_hash.to_hex() << " collated_data=" << block_candidate.collated_file_hash.to_hex()); @@ -177,6 +179,7 @@ bool ValidateQuery::fatal_error(td::Status error) { error.ensure_error(); LOG(ERROR) << "aborting validation of block candidate for " << shard_.to_str() << " : " << error.to_string(); if (main_promise) { + record_stats(); auto c = error.code(); if (c <= -667 && c >= -670) { errorlog::ErrorLog::log(PSTRING() << "FATAL ERROR: aborting validation of block candidate for " << shard_.to_str() @@ -234,6 +237,7 @@ bool ValidateQuery::fatal_error(std::string err_msg, int err_code) { */ void ValidateQuery::finish_query() { if (main_promise) { + record_stats(); LOG(WARNING) << "validate query done"; main_promise.set_result(now_); } @@ -6764,6 +6768,12 @@ bool ValidateQuery::try_validate() { if (pending) { return true; } + work_timer_.resume(); + cpu_work_timer_.resume(); + SCOPE_EXIT { + work_timer_.pause(); + cpu_work_timer_.pause(); + }; try { if (!stage_) { LOG(WARNING) << "try_validate stage 0"; @@ -6903,6 +6913,17 @@ void ValidateQuery::written_candidate() { finish_query(); } +/** + * Sends validation work time to manager. + */ +void ValidateQuery::record_stats() { + double work_time = work_timer_.elapsed(); + double cpu_work_time = cpu_work_timer_.elapsed(); + LOG(WARNING) << "Validate query work time = " << work_time << "s, cpu time = " << cpu_work_time << "s"; + td::actor::send_closure(manager, &ValidatorManager::record_validate_query_stats, block_candidate.id, work_time, + cpu_work_time); +} + } // namespace validator } // namespace ton diff --git a/validator/impl/validate-query.hpp b/validator/impl/validate-query.hpp index 824afb49d..104950938 100644 --- a/validator/impl/validate-query.hpp +++ b/validator/impl/validate-query.hpp @@ -398,6 +398,10 @@ class ValidateQuery : public td::actor::Actor { } return true; } + + td::Timer work_timer_{true}; + td::ThreadCpuTimer cpu_work_timer_{true}; + void record_stats(); }; } // namespace validator diff --git a/validator/interfaces/validator-manager.h b/validator/interfaces/validator-manager.h index 0e9fab73b..b6016bc2b 100644 --- a/validator/interfaces/validator-manager.h +++ b/validator/interfaces/validator-manager.h @@ -52,6 +52,16 @@ struct AsyncSerializerState { UnixTime last_written_block_ts; }; +struct CollationStats { + td::uint32 bytes, gas, lt_delta; + int cat_bytes, cat_gas, cat_lt_delta; + std::string limits_log; + td::uint32 ext_msgs_total = 0; + td::uint32 ext_msgs_filtered = 0; + td::uint32 ext_msgs_accepted = 0; + td::uint32 ext_msgs_rejected = 0; +}; + using ValidateCandidateResult = td::Variant; class ValidatorManager : public ValidatorManagerInterface { @@ -173,6 +183,7 @@ class ValidatorManager : public ValidatorManagerInterface { virtual void log_validator_session_stats(BlockIdExt block_id, validatorsession::ValidatorSessionStats stats) = 0; virtual void log_new_validator_group_stats(validatorsession::NewValidatorGroupStats stats) = 0; + virtual void log_end_validator_group_stats(validatorsession::EndValidatorGroupStats stats) = 0; virtual void get_block_handle_for_litequery(BlockIdExt block_id, td::Promise promise) = 0; virtual void get_block_data_for_litequery(BlockIdExt block_id, td::Promise> promise) = 0; @@ -192,6 +203,12 @@ class ValidatorManager : public ValidatorManagerInterface { virtual void add_lite_query_stats(int lite_query_id) { } + virtual void record_collate_query_stats(BlockIdExt block_id, double work_time, double cpu_work_time, + CollationStats stats) { + } + virtual void record_validate_query_stats(BlockIdExt block_id, double work_time, double cpu_work_time) { + } + static bool is_persistent_state(UnixTime ts, UnixTime prev_ts) { return ts / (1 << 17) != prev_ts / (1 << 17); } diff --git a/validator/manager-disk.hpp b/validator/manager-disk.hpp index a77be2725..3a77f2301 100644 --- a/validator/manager-disk.hpp +++ b/validator/manager-disk.hpp @@ -388,6 +388,9 @@ class ValidatorManagerImpl : public ValidatorManager { void log_new_validator_group_stats(validatorsession::NewValidatorGroupStats stats) override { UNREACHABLE(); } + void log_end_validator_group_stats(validatorsession::EndValidatorGroupStats stats) override { + UNREACHABLE(); + } void get_out_msg_queue_size(BlockIdExt block_id, td::Promise promise) override { if (queue_size_counter_.empty()) { queue_size_counter_ = diff --git a/validator/manager-hardfork.hpp b/validator/manager-hardfork.hpp index e7175b77b..cf4d3799f 100644 --- a/validator/manager-hardfork.hpp +++ b/validator/manager-hardfork.hpp @@ -450,6 +450,9 @@ class ValidatorManagerImpl : public ValidatorManager { void log_new_validator_group_stats(validatorsession::NewValidatorGroupStats stats) override { UNREACHABLE(); } + void log_end_validator_group_stats(validatorsession::EndValidatorGroupStats stats) override { + UNREACHABLE(); + } void get_out_msg_queue_size(BlockIdExt block_id, td::Promise promise) override { if (queue_size_counter_.empty()) { queue_size_counter_ = diff --git a/validator/manager.cpp b/validator/manager.cpp index eb082d91e..fa592a788 100644 --- a/validator/manager.cpp +++ b/validator/manager.cpp @@ -44,6 +44,7 @@ #include "td/utils/JsonBuilder.h" #include "common/delay.h" +#include "td/utils/filesystem.h" #include "validator/stats-merger.h" @@ -2044,7 +2045,7 @@ void ValidatorManagerImpl::update_shards() { } new_validator_groups_.emplace(val_group_id, std::move(it2->second)); } else { - auto G = create_validator_group(val_group_id, shard, val_set, opts, started_); + auto G = create_validator_group(val_group_id, shard, val_set, key_seqno, opts, started_); if (!G.empty()) { td::actor::send_closure(G, &ValidatorGroup::start, prev, last_masterchain_block_id_, last_masterchain_state_->get_unix_time()); @@ -2100,7 +2101,7 @@ void ValidatorManagerImpl::update_shards() { } new_validator_groups_.emplace(val_group_id, std::move(it2->second)); } else { - auto G = create_validator_group(val_group_id, shard, val_set, opts, started_); + auto G = create_validator_group(val_group_id, shard, val_set, key_seqno, opts, started_); if (!G.empty()) { td::actor::send_closure(G, &ValidatorGroup::start, prev, last_masterchain_block_id_, last_masterchain_state_->get_unix_time()); @@ -2127,7 +2128,7 @@ void ValidatorManagerImpl::update_shards() { } else { new_next_validator_groups_.emplace( val_group_id, - ValidatorGroupEntry{create_validator_group(val_group_id, shard, val_set, opts, started_), shard}); + ValidatorGroupEntry{create_validator_group(val_group_id, shard, val_set, key_seqno, opts, started_), shard}); } } } @@ -2230,7 +2231,7 @@ ValidatorSessionId ValidatorManagerImpl::get_validator_set_id(ShardIdFull shard, } td::actor::ActorOwn ValidatorManagerImpl::create_validator_group( - ValidatorSessionId session_id, ShardIdFull shard, td::Ref validator_set, + ValidatorSessionId session_id, ShardIdFull shard, td::Ref validator_set, BlockSeqno key_seqno, validatorsession::ValidatorSessionOptions opts, bool init_session) { if (check_gc_list_.count(session_id) == 1) { return td::actor::ActorOwn{}; @@ -2241,8 +2242,8 @@ td::actor::ActorOwn ValidatorManagerImpl::create_validator_group auto validator_id = get_validator(shard, validator_set); CHECK(!validator_id.is_zero()); auto G = td::actor::create_actor( - "validatorgroup", shard, validator_id, session_id, validator_set, opts, keyring_, adnl_, rldp_, overlays_, - db_root_, actor_id(this), init_session, + "validatorgroup", shard, validator_id, session_id, validator_set, key_seqno, opts, keyring_, adnl_, rldp_, + overlays_, db_root_, actor_id(this), init_session, opts_->check_unsafe_resync_allowed(validator_set->get_catchain_seqno()), opts_); return G; } @@ -2831,13 +2832,35 @@ void ValidatorManagerImpl::log_validator_session_stats(BlockIdExt block_id, for (const auto &round : stats.rounds) { std::vector> producers; for (const auto &producer : round.producers) { + BlockIdExt cur_block_id{block_id.id, producer.root_hash, producer.file_hash}; + auto it = recorded_block_stats_.find(cur_block_id); + tl_object_ptr collation_stats; + if (it != recorded_block_stats_.end() && it->second.collator_stats_) { + auto &stats = it->second.collator_stats_.value(); + collation_stats = create_tl_object( + stats.bytes, stats.gas, stats.lt_delta, stats.cat_bytes, stats.cat_gas, stats.cat_lt_delta, + stats.limits_log, stats.ext_msgs_total, stats.ext_msgs_filtered, stats.ext_msgs_accepted, + stats.ext_msgs_rejected); + } + std::string approvers, signers; + for (bool x : producer.approvers) { + approvers += (x ? '1' : '0'); + } + for (bool x : producer.signers) { + signers += (x ? '1' : '0'); + } producers.push_back(create_tl_object( - producer.id.bits256_value(), producer.candidate_id, producer.block_status, producer.comment, - producer.block_timestamp, producer.is_accepted, producer.is_ours, producer.got_submit_at, - producer.collation_time, producer.collated_at, producer.collation_cached, producer.validation_time, - producer.validated_at, producer.validation_cached, producer.gen_utime, producer.approved_weight, - producer.approved_33pct_at, producer.approved_66pct_at, producer.signed_weight, producer.signed_33pct_at, - producer.signed_66pct_at, producer.serialize_time, producer.deserialize_time, producer.serialized_size)); + producer.id.bits256_value(), producer.candidate_id, producer.block_status, producer.root_hash, + producer.file_hash, producer.comment, producer.block_timestamp, producer.is_accepted, producer.is_ours, + producer.got_submit_at, producer.collation_time, producer.collated_at, producer.collation_cached, + it == recorded_block_stats_.end() ? -1.0 : it->second.collator_work_time_, + it == recorded_block_stats_.end() ? -1.0 : it->second.collator_cpu_work_time_, std::move(collation_stats), + producer.validation_time, producer.validated_at, producer.validation_cached, + it == recorded_block_stats_.end() ? -1.0 : it->second.validator_work_time_, + it == recorded_block_stats_.end() ? -1.0 : it->second.validator_cpu_work_time_, producer.gen_utime, + producer.approved_weight, producer.approved_33pct_at, producer.approved_66pct_at, std::move(approvers), + producer.signed_weight, producer.signed_33pct_at, producer.signed_66pct_at, std::move(signers), + producer.serialize_time, producer.deserialize_time, producer.serialized_size)); } rounds.push_back(create_tl_object(round.timestamp, std::move(producers))); } @@ -2869,8 +2892,8 @@ void ValidatorManagerImpl::log_new_validator_group_stats(validatorsession::NewVa create_tl_object(node.id.bits256_value(), node.weight)); } auto obj = create_tl_object( - stats.session_id, stats.shard.workchain, stats.shard.shard, stats.cc_seqno, stats.timestamp, stats.self_idx, - std::move(nodes)); + stats.session_id, stats.shard.workchain, stats.shard.shard, stats.cc_seqno, stats.last_key_block_seqno, + stats.timestamp, stats.self_idx, std::move(nodes)); auto s = td::json_encode(td::ToJson(*obj.get()), false); s.erase(std::remove_if(s.begin(), s.end(), [](char c) { return c == '\n' || c == '\r'; }), s.end()); @@ -2879,7 +2902,31 @@ void ValidatorManagerImpl::log_new_validator_group_stats(validatorsession::NewVa file << s << "\n"; file.close(); - LOG(INFO) << "Writing new validator group stats for " << stats.shard.to_str(); + LOG(INFO) << "Writing new validator group stats for " << stats.session_id << " shard=" << stats.shard.to_str() + << " cc_seqno=" << stats.cc_seqno; +} + +void ValidatorManagerImpl::log_end_validator_group_stats(validatorsession::EndValidatorGroupStats stats) { + std::string fname = opts_->get_session_logs_file(); + if (fname.empty()) { + return; + } + std::vector> nodes; + for (const auto &node : stats.nodes) { + nodes.push_back(create_tl_object(node.id.bits256_value(), + node.catchain_blocks)); + } + auto obj = create_tl_object(stats.session_id, stats.timestamp, + std::move(nodes)); + auto s = td::json_encode(td::ToJson(*obj.get()), false); + s.erase(std::remove_if(s.begin(), s.end(), [](char c) { return c == '\n' || c == '\r'; }), s.end()); + + std::ofstream file; + file.open(fname, std::ios_base::app); + file << s << "\n"; + file.close(); + + LOG(INFO) << "Writing end validator group stats for " << stats.session_id; } void ValidatorManagerImpl::get_block_handle_for_litequery(BlockIdExt block_id, td::Promise promise) { @@ -3165,6 +3212,31 @@ td::actor::ActorOwn ValidatorManagerFactory::create( rldp, overlays); } +void ValidatorManagerImpl::record_collate_query_stats(BlockIdExt block_id, double work_time, double cpu_work_time, + CollationStats stats) { + auto &record = new_block_stats_record(block_id); + record.collator_work_time_ = work_time; + record.collator_cpu_work_time_ = cpu_work_time; + record.collator_stats_ = std::move(stats); +} + +void ValidatorManagerImpl::record_validate_query_stats(BlockIdExt block_id, double work_time, double cpu_work_time) { + auto &record = new_block_stats_record(block_id); + record.validator_work_time_ = work_time; + record.validator_cpu_work_time_ = cpu_work_time; +} + +ValidatorManagerImpl::RecordedBlockStats &ValidatorManagerImpl::new_block_stats_record(BlockIdExt block_id) { + if (!recorded_block_stats_.count(block_id)) { + recorded_block_stats_lru_.push(block_id); + if (recorded_block_stats_lru_.size() > 4096) { + recorded_block_stats_.erase(recorded_block_stats_lru_.front()); + recorded_block_stats_lru_.pop(); + } + } + return recorded_block_stats_[block_id]; +} + size_t ValidatorManagerImpl::CheckedExtMsgCounter::get_msg_count(WorkchainId wc, StdSmcAddress addr) { before_query(); auto it1 = counter_cur_.find({wc, addr}); diff --git a/validator/manager.hpp b/validator/manager.hpp index 12354c634..99aa4e0e1 100644 --- a/validator/manager.hpp +++ b/validator/manager.hpp @@ -38,6 +38,7 @@ #include #include #include +#include namespace ton { @@ -261,7 +262,7 @@ class ValidatorManagerImpl : public ValidatorManager { BlockSeqno last_key_block_seqno, const validatorsession::ValidatorSessionOptions &opts); td::actor::ActorOwn create_validator_group(ValidatorSessionId session_id, ShardIdFull shard, - td::Ref validator_set, + td::Ref validator_set, BlockSeqno key_seqno, validatorsession::ValidatorSessionOptions opts, bool create_catchain); struct ValidatorGroupEntry { @@ -589,6 +590,7 @@ class ValidatorManagerImpl : public ValidatorManager { void log_validator_session_stats(BlockIdExt block_id, validatorsession::ValidatorSessionStats stats) override; void log_new_validator_group_stats(validatorsession::NewValidatorGroupStats stats) override; + void log_end_validator_group_stats(validatorsession::EndValidatorGroupStats stats) override; void update_options(td::Ref opts) override; @@ -708,6 +710,21 @@ class ValidatorManagerImpl : public ValidatorManager { td::uint32 ls_stats_check_ext_messages_{0}; td::actor::ActorOwn candidates_buffer_; + + struct RecordedBlockStats { + double collator_work_time_ = -1.0; + double collator_cpu_work_time_ = -1.0; + td::optional collator_stats_; + double validator_work_time_ = -1.0; + double validator_cpu_work_time_ = -1.0; + }; + std::map recorded_block_stats_; + std::queue recorded_block_stats_lru_; + + void record_collate_query_stats(BlockIdExt block_id, double work_time, double cpu_work_time, + CollationStats stats) override; + void record_validate_query_stats(BlockIdExt block_id, double work_time, double cpu_work_time) override; + RecordedBlockStats &new_block_stats_record(BlockIdExt block_id); }; } // namespace validator diff --git a/validator/validator-group.cpp b/validator/validator-group.cpp index fc3ebe541..4b61c07cd 100644 --- a/validator/validator-group.cpp +++ b/validator/validator-group.cpp @@ -386,6 +386,7 @@ void ValidatorGroup::start(std::vector prev, BlockIdExt min_masterch stats.session_id = session_id_; stats.shard = shard_; stats.cc_seqno = validator_set_->get_catchain_seqno(); + stats.last_key_block_seqno = last_key_block_seqno_; stats.timestamp = td::Clocks::system(); td::uint32 idx = 0; for (const auto& node : validator_set_->export_vector()) { @@ -417,6 +418,16 @@ void ValidatorGroup::destroy() { td::actor::send_closure(manager, &ValidatorManager::log_validator_session_stats, block_id, std::move(stats)); }); + td::actor::send_closure(session_, &validatorsession::ValidatorSession::get_end_stats, + [manager = manager_](td::Result R) { + if (R.is_error()) { + LOG(DEBUG) << "Failed to get validator session end stats: " << R.move_as_error(); + return; + } + auto stats = R.move_as_ok(); + td::actor::send_closure(manager, &ValidatorManager::log_end_validator_group_stats, + std::move(stats)); + }); auto ses = session_.release(); delay_action([ses]() mutable { td::actor::send_closure(ses, &validatorsession::ValidatorSession::destroy); }, td::Timestamp::in(10.0)); diff --git a/validator/validator-group.hpp b/validator/validator-group.hpp index 3499da9d7..936d2fdc7 100644 --- a/validator/validator-group.hpp +++ b/validator/validator-group.hpp @@ -69,15 +69,17 @@ class ValidatorGroup : public td::actor::Actor { } ValidatorGroup(ShardIdFull shard, PublicKeyHash local_id, ValidatorSessionId session_id, - td::Ref validator_set, validatorsession::ValidatorSessionOptions config, - td::actor::ActorId keyring, td::actor::ActorId adnl, - td::actor::ActorId rldp, td::actor::ActorId overlays, - std::string db_root, td::actor::ActorId validator_manager, bool create_session, + td::Ref validator_set, BlockSeqno last_key_block_seqno, + validatorsession::ValidatorSessionOptions config, td::actor::ActorId keyring, + td::actor::ActorId adnl, td::actor::ActorId rldp, + td::actor::ActorId overlays, std::string db_root, + td::actor::ActorId validator_manager, bool create_session, bool allow_unsafe_self_blocks_resync, td::Ref opts) : shard_(shard) , local_id_(std::move(local_id)) , session_id_(session_id) , validator_set_(std::move(validator_set)) + , last_key_block_seqno_(last_key_block_seqno) , config_(std::move(config)) , keyring_(keyring) , adnl_(adnl) @@ -115,6 +117,7 @@ class ValidatorGroup : public td::actor::Actor { UnixTime min_ts_; td::Ref validator_set_; + BlockSeqno last_key_block_seqno_; validatorsession::ValidatorSessionOptions config_; td::actor::ActorId keyring_;