From ad4d1d19e05ce92439c83dc774c661df6dbd4f55 Mon Sep 17 00:00:00 2001 From: Marcelo Diop-Gonzalez Date: Wed, 11 Sep 2024 13:53:22 -0400 Subject: [PATCH 01/70] rename epoch_tail_hash -> sync_hash --- chain/chain/src/chain.rs | 2 +- chain/chain/src/store/mod.rs | 2 +- chain/chain/src/store_validator/validate.rs | 6 +----- chain/client/src/client.rs | 2 +- core/primitives/src/sharding.rs | 12 ++++++++++-- 5 files changed, 14 insertions(+), 10 deletions(-) diff --git a/chain/chain/src/chain.rs b/chain/chain/src/chain.rs index e7af6450e02..8896255f188 100644 --- a/chain/chain/src/chain.rs +++ b/chain/chain/src/chain.rs @@ -782,7 +782,7 @@ impl Chain { debug!(target: "chain", "Downloading state for {:?}, I'm {:?}", shards_to_state_sync, me); let state_sync_info = StateSyncInfo { - epoch_tail_hash: *block.header().hash(), + sync_hash: *block.header().hash(), shards: shards_to_state_sync .iter() .map(|shard_id| { diff --git a/chain/chain/src/store/mod.rs b/chain/chain/src/store/mod.rs index 3e65b53690c..8450c20de2a 100644 --- a/chain/chain/src/store/mod.rs +++ b/chain/chain/src/store/mod.rs @@ -2557,7 +2557,7 @@ impl<'a> ChainStoreUpdate<'a> { for state_sync_info in self.add_state_sync_infos.drain(..) 
{ store_update.set_ser( DBCol::StateDlInfos, - state_sync_info.epoch_tail_hash.as_ref(), + state_sync_info.sync_hash.as_ref(), &state_sync_info, )?; } diff --git a/chain/chain/src/store_validator/validate.rs b/chain/chain/src/store_validator/validate.rs index 743a293e499..de89f4e4edd 100644 --- a/chain/chain/src/store_validator/validate.rs +++ b/chain/chain/src/store_validator/validate.rs @@ -720,11 +720,7 @@ pub(crate) fn state_sync_info_valid( block_hash: &CryptoHash, state_sync_info: &StateSyncInfo, ) -> Result<(), StoreValidatorError> { - check_discrepancy!( - state_sync_info.epoch_tail_hash, - *block_hash, - "Invalid StateSyncInfo stored" - ); + check_discrepancy!(state_sync_info.sync_hash, *block_hash, "Invalid StateSyncInfo stored"); Ok(()) } diff --git a/chain/client/src/client.rs b/chain/client/src/client.rs index 943630212ac..0f332272747 100644 --- a/chain/client/src/client.rs +++ b/chain/client/src/client.rs @@ -2464,7 +2464,7 @@ impl Client { let me = signer.as_ref().map(|x| x.validator_id().clone()); for (sync_hash, state_sync_info) in self.chain.chain_store().iterate_state_sync_infos()? { - assert_eq!(sync_hash, state_sync_info.epoch_tail_hash); + assert_eq!(sync_hash, state_sync_info.sync_hash); let network_adapter = self.network_adapter.clone(); let shards_to_split = self.get_shards_to_split(sync_hash, &state_sync_info, &me)?; diff --git a/core/primitives/src/sharding.rs b/core/primitives/src/sharding.rs index 089635f6aff..ef2c33ca3c1 100644 --- a/core/primitives/src/sharding.rs +++ b/core/primitives/src/sharding.rs @@ -62,8 +62,16 @@ pub struct ShardInfo(pub ShardId, pub ChunkHash); /// Contains the information that is used to sync state for shards as epochs switch #[derive(Debug, PartialEq, BorshSerialize, BorshDeserialize)] pub struct StateSyncInfo { - /// The first block of the epoch for which syncing is happening - pub epoch_tail_hash: CryptoHash, + /// The block we'll use as the "sync_hash" when state syncing. 
Previously, state sync + /// used the first block of an epoch as the "sync_hash", and synced state to the epoch before. + /// Now that state sync downloads the state of the current epoch, we need to wait a few blocks + /// after applying the first block in an epoch to know what "sync_hash" we'll use. + /// In order to avoid the need for a database migration while keeping backward compatibility, + /// this field is no longer equal to the database key associated with this value, and is set to + /// CryptoHash::default() when we apply the first block in an epoch and want to sync the current + /// epoch's state. If we instead want to sync the previous epoch's state, we set this field in the same + /// way it used to be set + pub sync_hash: CryptoHash, /// Shards to fetch state pub shards: Vec, } From 76f93501d277e1683f93b5f71bb8b794a44579d3 Mon Sep 17 00:00:00 2001 From: Marcelo Diop-Gonzalez Date: Fri, 13 Sep 2024 14:44:17 -0400 Subject: [PATCH 02/70] change var names --- chain/client/src/client.rs | 20 +++++++++----------- chain/client/src/client_actor.rs | 20 +++++++++----------- chain/client/src/test_utils/client.rs | 8 +++----- 3 files changed, 21 insertions(+), 27 deletions(-) diff --git a/chain/client/src/client.rs b/chain/client/src/client.rs index 0f332272747..bd5cc3f7c3d 100644 --- a/chain/client/src/client.rs +++ b/chain/client/src/client.rs @@ -2472,7 +2472,7 @@ impl Client { let block_header = self.chain.get_block(&sync_hash)?.header().clone(); let epoch_id = block_header.epoch_id(); - let (state_sync, shards_to_split, blocks_catch_up_state) = + let (state_sync, state_downloads, catchup) = self.catchup_state_syncs.entry(sync_hash).or_insert_with(|| { tracing::debug!(target: "client", ?sync_hash, "inserting new state sync"); notify_state_sync = true; @@ -2491,7 +2491,7 @@ impl Client { }); // For colour decorators to work, they need to printed directly. Otherwise the decorators get escaped, garble output and don't add colours. 
- debug!(target: "catchup", ?me, ?sync_hash, progress_per_shard = ?format_shard_sync_phase_per_shard(&shards_to_split, false), "Catchup"); + debug!(target: "catchup", ?me, ?sync_hash, progress_per_shard = ?format_shard_sync_phase_per_shard(&state_downloads, false), "Catchup"); let use_colour = matches!(self.config.log_summary_style, LogSummaryStyle::Colored); let tracking_shards: Vec = @@ -2520,7 +2520,7 @@ impl Client { // Initialize the new shard sync to contain the shards to split at // first. It will get updated with the shard sync download status // for other shards later. - let new_shard_sync = shards_to_split; + let new_shard_sync = state_downloads; match state_sync.run( &me, sync_hash, @@ -2541,11 +2541,11 @@ impl Client { self.chain.catchup_blocks_step( &me, &sync_hash, - blocks_catch_up_state, + catchup, block_catch_up_task_scheduler, )?; - if blocks_catch_up_state.is_finished() { + if catchup.is_finished() { let mut block_processing_artifacts = BlockProcessingArtifact::default(); self.chain.finish_catchup_blocks( @@ -2553,7 +2553,7 @@ impl Client { &sync_hash, &mut block_processing_artifacts, apply_chunks_done_sender.clone(), - &blocks_catch_up_state.done_blocks, + &catchup.done_blocks, )?; self.process_block_processing_artifact(block_processing_artifacts, &signer); @@ -2791,11 +2791,9 @@ impl Client { impl Client { pub fn get_catchup_status(&self) -> Result, near_chain::Error> { let mut ret = vec![]; - for (sync_hash, (_, shard_sync_state, block_catchup_state)) in - self.catchup_state_syncs.iter() - { + for (sync_hash, (_, state_downloads, catchup)) in self.catchup_state_syncs.iter() { let sync_block_height = self.chain.get_block_header(sync_hash)?.height(); - let shard_sync_status: HashMap<_, _> = shard_sync_state + let shard_sync_status: HashMap<_, _> = state_downloads .iter() .map(|(shard_id, state)| (*shard_id, state.status.to_string())) .collect(); @@ -2803,7 +2801,7 @@ impl Client { sync_block_hash: *sync_hash, sync_block_height, 
shard_sync_status, - blocks_to_catchup: self.chain.get_block_catchup_status(block_catchup_state), + blocks_to_catchup: self.chain.get_block_catchup_status(catchup), }); } Ok(ret) diff --git a/chain/client/src/client_actor.rs b/chain/client/src/client_actor.rs index ca1af1f4513..788f027228d 100644 --- a/chain/client/src/client_actor.rs +++ b/chain/client/src/client_actor.rs @@ -627,10 +627,10 @@ impl Handler for ClientActorInner { } // ... Or one of the catchups - if let Some((state_sync, shards_to_download, _)) = + if let Some((state_sync, state_downloads, _)) = self.client.catchup_state_syncs.get_mut(&hash) { - if let Some(shard_download) = shards_to_download.get_mut(&shard_id) { + if let Some(shard_download) = state_downloads.get_mut(&shard_id) { state_sync.update_download_on_state_response_message( shard_download, hash, @@ -2114,9 +2114,9 @@ impl ClientActorInner { impl Handler for ClientActorInner { fn handle(&mut self, msg: ApplyStatePartsResponse) { tracing::debug!(target: "client", ?msg); - if let Some((sync, _, _)) = self.client.catchup_state_syncs.get_mut(&msg.sync_hash) { + if let Some((state_sync, _, _)) = self.client.catchup_state_syncs.get_mut(&msg.sync_hash) { // We are doing catchup - sync.set_apply_result(msg.shard_id, msg.apply_result); + state_sync.set_apply_result(msg.shard_id, msg.apply_result); } else { self.client.state_sync.set_apply_result(msg.shard_id, msg.apply_result); } @@ -2126,11 +2126,9 @@ impl Handler for ClientActorInner { impl Handler for ClientActorInner { fn handle(&mut self, msg: BlockCatchUpResponse) { tracing::debug!(target: "client", ?msg); - if let Some((_, _, blocks_catch_up_state)) = - self.client.catchup_state_syncs.get_mut(&msg.sync_hash) - { - assert!(blocks_catch_up_state.scheduled_blocks.remove(&msg.block_hash)); - blocks_catch_up_state.processed_blocks.insert( + if let Some((_, _, catchup)) = self.client.catchup_state_syncs.get_mut(&msg.sync_hash) { + assert!(catchup.scheduled_blocks.remove(&msg.block_hash)); + 
catchup.processed_blocks.insert( msg.block_hash, msg.results.into_iter().map(|res| res.1).collect::>(), ); @@ -2148,9 +2146,9 @@ impl Handler for ClientActorInner { #[perf] fn handle(&mut self, msg: LoadMemtrieResponse) { tracing::debug!(target: "client", ?msg); - if let Some((sync, _, _)) = self.client.catchup_state_syncs.get_mut(&msg.sync_hash) { + if let Some((state_sync, _, _)) = self.client.catchup_state_syncs.get_mut(&msg.sync_hash) { // We are doing catchup - sync.set_load_memtrie_result(msg.shard_uid, msg.load_result); + state_sync.set_load_memtrie_result(msg.shard_uid, msg.load_result); } else { // We are doing state sync self.client.state_sync.set_load_memtrie_result(msg.shard_uid, msg.load_result); diff --git a/chain/client/src/test_utils/client.rs b/chain/client/src/test_utils/client.rs index f8e8063738b..2d72c81b896 100644 --- a/chain/client/src/test_utils/client.rs +++ b/chain/client/src/test_utils/client.rs @@ -317,11 +317,9 @@ pub fn run_catchup( .into_iter() .map(|res| res.1) .collect_vec(); - if let Some((_, _, blocks_catch_up_state)) = - client.catchup_state_syncs.get_mut(&msg.sync_hash) - { - assert!(blocks_catch_up_state.scheduled_blocks.remove(&msg.block_hash)); - blocks_catch_up_state.processed_blocks.insert(msg.block_hash, results); + if let Some((_, _, catchup)) = client.catchup_state_syncs.get_mut(&msg.sync_hash) { + assert!(catchup.scheduled_blocks.remove(&msg.block_hash)); + catchup.processed_blocks.insert(msg.block_hash, results); } else { panic!("block catch up processing result from unknown sync hash"); } From a445393cbf6a12a13f024078a999c6b523260909 Mon Sep 17 00:00:00 2001 From: Marcelo Diop-Gonzalez Date: Fri, 13 Sep 2024 14:56:19 -0400 Subject: [PATCH 03/70] organize catchup_state_syncs value into CatchupState --- chain/client/src/client.rs | 25 ++++++++++++++++--------- chain/client/src/client_actor.rs | 16 +++++++++++----- chain/client/src/test_utils/client.rs | 6 ++++-- 3 files changed, 31 insertions(+), 16 deletions(-) diff 
--git a/chain/client/src/client.rs b/chain/client/src/client.rs index bd5cc3f7c3d..645faba2320 100644 --- a/chain/client/src/client.rs +++ b/chain/client/src/client.rs @@ -111,6 +111,12 @@ pub enum AdvProduceBlocksMode { OnlyValid, } +pub struct CatchupState { + pub state_sync: StateSync, + pub state_downloads: HashMap, + pub catchup: BlocksCatchUpState, +} + pub struct Client { /// Adversarial controls - should be enabled only to test disruptive /// behaviour on chain. @@ -149,8 +155,7 @@ pub struct Client { lru::LruCache>, /// A mapping from a block for which a state sync is underway for the next epoch, and the object /// storing the current status of the state sync and blocks catch up - pub catchup_state_syncs: - HashMap, BlocksCatchUpState)>, + pub catchup_state_syncs: HashMap, /// Keeps track of information needed to perform the initial Epoch Sync pub epoch_sync: EpochSync, /// Keeps track of syncing headers. @@ -2472,12 +2477,12 @@ impl Client { let block_header = self.chain.get_block(&sync_hash)?.header().clone(); let epoch_id = block_header.epoch_id(); - let (state_sync, state_downloads, catchup) = + let CatchupState { state_sync, state_downloads, catchup } = self.catchup_state_syncs.entry(sync_hash).or_insert_with(|| { tracing::debug!(target: "client", ?sync_hash, "inserting new state sync"); notify_state_sync = true; - ( - StateSync::new( + CatchupState { + state_sync: StateSync::new( self.clock.clone(), network_adapter, state_sync_timeout, @@ -2485,9 +2490,9 @@ impl Client { &self.config.state_sync.sync, true, ), - shards_to_split, - BlocksCatchUpState::new(sync_hash, *epoch_id), - ) + state_downloads: shards_to_split, + catchup: BlocksCatchUpState::new(sync_hash, *epoch_id), + } }); // For colour decorators to work, they need to printed directly. Otherwise the decorators get escaped, garble output and don't add colours. 
@@ -2791,7 +2796,9 @@ impl Client { impl Client { pub fn get_catchup_status(&self) -> Result, near_chain::Error> { let mut ret = vec![]; - for (sync_hash, (_, state_downloads, catchup)) in self.catchup_state_syncs.iter() { + for (sync_hash, CatchupState { state_downloads, catchup, .. }) in + self.catchup_state_syncs.iter() + { let sync_block_height = self.chain.get_block_header(sync_hash)?.height(); let shard_sync_status: HashMap<_, _> = state_downloads .iter() diff --git a/chain/client/src/client_actor.rs b/chain/client/src/client_actor.rs index 788f027228d..cc5bc42c80f 100644 --- a/chain/client/src/client_actor.rs +++ b/chain/client/src/client_actor.rs @@ -7,7 +7,7 @@ #[cfg(feature = "test_features")] use crate::client::AdvProduceBlocksMode; -use crate::client::{Client, EPOCH_START_INFO_BLOCKS}; +use crate::client::{CatchupState, Client, EPOCH_START_INFO_BLOCKS}; use crate::config_updater::ConfigUpdater; use crate::debug::new_network_info_view; use crate::info::{display_sync_status, InfoHelper}; @@ -627,7 +627,7 @@ impl Handler for ClientActorInner { } // ... Or one of the catchups - if let Some((state_sync, state_downloads, _)) = + if let Some(CatchupState { state_sync, state_downloads, .. }) = self.client.catchup_state_syncs.get_mut(&hash) { if let Some(shard_download) = state_downloads.get_mut(&shard_id) { @@ -2114,7 +2114,9 @@ impl ClientActorInner { impl Handler for ClientActorInner { fn handle(&mut self, msg: ApplyStatePartsResponse) { tracing::debug!(target: "client", ?msg); - if let Some((state_sync, _, _)) = self.client.catchup_state_syncs.get_mut(&msg.sync_hash) { + if let Some(CatchupState { state_sync, .. 
}) = + self.client.catchup_state_syncs.get_mut(&msg.sync_hash) + { // We are doing catchup state_sync.set_apply_result(msg.shard_id, msg.apply_result); } else { @@ -2126,7 +2128,9 @@ impl Handler for ClientActorInner { impl Handler for ClientActorInner { fn handle(&mut self, msg: BlockCatchUpResponse) { tracing::debug!(target: "client", ?msg); - if let Some((_, _, catchup)) = self.client.catchup_state_syncs.get_mut(&msg.sync_hash) { + if let Some(CatchupState { catchup, .. }) = + self.client.catchup_state_syncs.get_mut(&msg.sync_hash) + { assert!(catchup.scheduled_blocks.remove(&msg.block_hash)); catchup.processed_blocks.insert( msg.block_hash, @@ -2146,7 +2150,9 @@ impl Handler for ClientActorInner { #[perf] fn handle(&mut self, msg: LoadMemtrieResponse) { tracing::debug!(target: "client", ?msg); - if let Some((state_sync, _, _)) = self.client.catchup_state_syncs.get_mut(&msg.sync_hash) { + if let Some(CatchupState { state_sync, .. }) = + self.client.catchup_state_syncs.get_mut(&msg.sync_hash) + { // We are doing catchup state_sync.set_load_memtrie_result(msg.shard_uid, msg.load_result); } else { diff --git a/chain/client/src/test_utils/client.rs b/chain/client/src/test_utils/client.rs index 2d72c81b896..31564bb1c04 100644 --- a/chain/client/src/test_utils/client.rs +++ b/chain/client/src/test_utils/client.rs @@ -5,7 +5,7 @@ use std::mem::swap; use std::sync::{Arc, RwLock}; -use crate::client::ProduceChunkResult; +use crate::client::{CatchupState, ProduceChunkResult}; use crate::Client; use actix_rt::{Arbiter, System}; use itertools::Itertools; @@ -317,7 +317,9 @@ pub fn run_catchup( .into_iter() .map(|res| res.1) .collect_vec(); - if let Some((_, _, catchup)) = client.catchup_state_syncs.get_mut(&msg.sync_hash) { + if let Some(CatchupState { catchup, .. 
}) = + client.catchup_state_syncs.get_mut(&msg.sync_hash) + { assert!(catchup.scheduled_blocks.remove(&msg.block_hash)); catchup.processed_blocks.insert(msg.block_hash, results); } else { From 174b62f92e910570ddc3984683fd34870ffa39e7 Mon Sep 17 00:00:00 2001 From: Marcelo Diop-Gonzalez Date: Fri, 13 Sep 2024 15:07:53 -0400 Subject: [PATCH 04/70] use hashmap entry explicitly so i can use question mark --- chain/client/src/client.rs | 34 +++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/chain/client/src/client.rs b/chain/client/src/client.rs index 645faba2320..86b2444c473 100644 --- a/chain/client/src/client.rs +++ b/chain/client/src/client.rs @@ -2478,22 +2478,26 @@ impl Client { let epoch_id = block_header.epoch_id(); let CatchupState { state_sync, state_downloads, catchup } = - self.catchup_state_syncs.entry(sync_hash).or_insert_with(|| { - tracing::debug!(target: "client", ?sync_hash, "inserting new state sync"); - notify_state_sync = true; - CatchupState { - state_sync: StateSync::new( - self.clock.clone(), - network_adapter, - state_sync_timeout, - &self.config.chain_id, - &self.config.state_sync.sync, - true, - ), - state_downloads: shards_to_split, - catchup: BlocksCatchUpState::new(sync_hash, *epoch_id), + match self.catchup_state_syncs.entry(sync_hash) { + std::collections::hash_map::Entry::Occupied(e) => e.into_mut(), + std::collections::hash_map::Entry::Vacant(e) => { + tracing::debug!(target: "client", ?sync_hash, "inserting new state sync"); + notify_state_sync = true; + let state = CatchupState { + state_sync: StateSync::new( + self.clock.clone(), + network_adapter, + state_sync_timeout, + &self.config.chain_id, + &self.config.state_sync.sync, + true, + ), + state_downloads: shards_to_split, + catchup: BlocksCatchUpState::new(sync_hash, *epoch_id), + }; + e.insert(state) } - }); + }; // For colour decorators to work, they need to printed directly. 
Otherwise the decorators get escaped, garble output and don't add colours. debug!(target: "catchup", ?me, ?sync_hash, progress_per_shard = ?format_shard_sync_phase_per_shard(&state_downloads, false), "Catchup"); From a59e59427ed8ffd84cc67b17701718cfd6571c88 Mon Sep 17 00:00:00 2001 From: Marcelo Diop-Gonzalez Date: Mon, 16 Sep 2024 14:36:38 -0400 Subject: [PATCH 05/70] rename sync_hash -> epoch_first_block --- chain/client/src/client.rs | 68 +++++++++++++++++++++----------------- 1 file changed, 38 insertions(+), 30 deletions(-) diff --git a/chain/client/src/client.rs b/chain/client/src/client.rs index 86b2444c473..fa9fbb9c26d 100644 --- a/chain/client/src/client.rs +++ b/chain/client/src/client.rs @@ -2468,39 +2468,44 @@ impl Client { let mut notify_state_sync = false; let me = signer.as_ref().map(|x| x.validator_id().clone()); - for (sync_hash, state_sync_info) in self.chain.chain_store().iterate_state_sync_infos()? { - assert_eq!(sync_hash, state_sync_info.sync_hash); + for (epoch_first_block, state_sync_info) in + self.chain.chain_store().iterate_state_sync_infos()? 
+ { + assert_eq!(epoch_first_block, state_sync_info.sync_hash); let network_adapter = self.network_adapter.clone(); - let shards_to_split = self.get_shards_to_split(sync_hash, &state_sync_info, &me)?; + let shards_to_split = + self.get_shards_to_split(epoch_first_block, &state_sync_info, &me)?; let state_sync_timeout = self.config.state_sync_timeout; - let block_header = self.chain.get_block(&sync_hash)?.header().clone(); + let block_header = self.chain.get_block(&epoch_first_block)?.header().clone(); let epoch_id = block_header.epoch_id(); - let CatchupState { state_sync, state_downloads, catchup } = - match self.catchup_state_syncs.entry(sync_hash) { - std::collections::hash_map::Entry::Occupied(e) => e.into_mut(), - std::collections::hash_map::Entry::Vacant(e) => { - tracing::debug!(target: "client", ?sync_hash, "inserting new state sync"); - notify_state_sync = true; - let state = CatchupState { - state_sync: StateSync::new( - self.clock.clone(), - network_adapter, - state_sync_timeout, - &self.config.chain_id, - &self.config.state_sync.sync, - true, - ), - state_downloads: shards_to_split, - catchup: BlocksCatchUpState::new(sync_hash, *epoch_id), - }; - e.insert(state) - } - }; + let CatchupState { state_sync, state_downloads, catchup } = match self + .catchup_state_syncs + .entry(epoch_first_block) + { + std::collections::hash_map::Entry::Occupied(e) => e.into_mut(), + std::collections::hash_map::Entry::Vacant(e) => { + tracing::debug!(target: "client", ?epoch_first_block, "inserting new state sync"); + notify_state_sync = true; + let state = CatchupState { + state_sync: StateSync::new( + self.clock.clone(), + network_adapter, + state_sync_timeout, + &self.config.chain_id, + &self.config.state_sync.sync, + true, + ), + state_downloads: shards_to_split, + catchup: BlocksCatchUpState::new(epoch_first_block, *epoch_id), + }; + e.insert(state) + } + }; // For colour decorators to work, they need to printed directly. 
Otherwise the decorators get escaped, garble output and don't add colours. - debug!(target: "catchup", ?me, ?sync_hash, progress_per_shard = ?format_shard_sync_phase_per_shard(&state_downloads, false), "Catchup"); + debug!(target: "catchup", ?me, ?epoch_first_block, progress_per_shard = ?format_shard_sync_phase_per_shard(&state_downloads, false), "Catchup"); let use_colour = matches!(self.config.log_summary_style, LogSummaryStyle::Colored); let tracking_shards: Vec = @@ -2517,7 +2522,10 @@ impl Client { match self.state_sync_adapter.clone().read() { Ok(sync_adapter) => sync_adapter.send_sync_message( shard_uid, - SyncMessage::StartSync(SyncShardInfo { shard_uid, sync_hash }), + SyncMessage::StartSync(SyncShardInfo { + shard_uid, + sync_hash: epoch_first_block, + }), ), Err(_) => { error!(target:"catchup", "State sync adapter lock is poisoned.") @@ -2532,7 +2540,7 @@ impl Client { let new_shard_sync = state_downloads; match state_sync.run( &me, - sync_hash, + epoch_first_block, new_shard_sync, &mut self.chain, self.epoch_manager.as_ref(), @@ -2549,7 +2557,7 @@ impl Client { debug!(target: "catchup", "state sync completed now catch up blocks"); self.chain.catchup_blocks_step( &me, - &sync_hash, + &epoch_first_block, catchup, block_catch_up_task_scheduler, )?; @@ -2559,7 +2567,7 @@ impl Client { self.chain.finish_catchup_blocks( &me, - &sync_hash, + &epoch_first_block, &mut block_processing_artifacts, apply_chunks_done_sender.clone(), &catchup.done_blocks, From f8f6fb67df4b3b1ca9689c2d276171346a988d8b Mon Sep 17 00:00:00 2001 From: Marcelo Diop-Gonzalez Date: Mon, 16 Sep 2024 21:51:35 -0400 Subject: [PATCH 06/70] state sync to the current epoch during catchup this just implements the requester side of things, and does not work because there is no support in the state dumper for this. 
But it sets up all the logic to request state headers and parts in the new way --- chain/chain/src/block_processing_utils.rs | 11 ++ chain/chain/src/chain.rs | 11 +- chain/chain/src/chain_update.rs | 2 +- chain/chain/src/store/mod.rs | 14 +- chain/client/src/client.rs | 206 +++++++++++++++++++--- core/primitives/src/sharding.rs | 4 +- 6 files changed, 206 insertions(+), 42 deletions(-) diff --git a/chain/chain/src/block_processing_utils.rs b/chain/chain/src/block_processing_utils.rs index cc96b3d171b..4cc8b3a4600 100644 --- a/chain/chain/src/block_processing_utils.rs +++ b/chain/chain/src/block_processing_utils.rs @@ -18,6 +18,17 @@ pub(crate) const MAX_PROCESSING_BLOCKS: usize = 5; /// Contains information from preprocessing a block pub(crate) struct BlockPreprocessInfo { + /// This field has two related but actually different meanings. For the first block of an + /// epoch, this will be set to false if we need to download state for shards we'll track in + /// the future but don't track currently. This implies the first meaning, which is that if + /// this is true, then we are ready to apply all chunks and update flat state for shards + /// we'll track in this and the next epoch. This comes into play when we decide what ApplyChunksMode + /// to pass to Chain::apply_chunks_preprocessing(). + /// The other meaning is that the catchup code should process this block. When the state sync sync_hash + /// is the first block of the epoch, these two meanings are the same. 
But if the sync_hash is moved forward + /// in order to sync the current epoch's state instead of last epoch's, this field being false no longer implies + /// that we want to apply this block during catchup, so some care is needed to ensure we start catchup at the right + /// point in Client::run_catchup() pub(crate) is_caught_up: bool, pub(crate) state_sync_info: Option, pub(crate) incoming_receipts: HashMap>, diff --git a/chain/chain/src/chain.rs b/chain/chain/src/chain.rs index 8896255f188..d1b555f0aca 100644 --- a/chain/chain/src/chain.rs +++ b/chain/chain/src/chain.rs @@ -781,8 +781,12 @@ impl Chain { } else { debug!(target: "chain", "Downloading state for {:?}, I'm {:?}", shards_to_state_sync, me); + let protocol_version = + self.epoch_manager.get_epoch_protocol_version(block.header().epoch_id())?; + let sync_hash = + if protocol_version < 72 { *block.header().hash() } else { CryptoHash::default() }; let state_sync_info = StateSyncInfo { - sync_hash: *block.header().hash(), + sync_hash, shards: shards_to_state_sync .iter() .map(|shard_id| { @@ -3206,6 +3210,9 @@ impl Chain { &mut self, me: &Option, epoch_first_block: &CryptoHash, + // TODO: remove the ones not in affected_blocks by breadth first searching from `epoch_first_block` and adding + // descendant blocks to the search when they're not equal to this hash, and then removing everything we see in that search + _catchup_start_block: &CryptoHash, block_processing_artifacts: &mut BlockProcessingArtifact, apply_chunks_done_sender: Option>, affected_blocks: &[CryptoHash], @@ -4725,7 +4732,7 @@ pub struct ChunkStateWitnessMessage { /// Otherwise results are committed, block is moved to done blocks and any blocks that /// have this block as previous are added to pending pub struct BlocksCatchUpState { - /// Hash of first block of an epoch + /// Hash of the block where catchup will start from pub first_block_hash: CryptoHash, /// Epoch id pub epoch_id: EpochId, diff --git a/chain/chain/src/chain_update.rs 
b/chain/chain/src/chain_update.rs index 6130e5a4729..4298b2162c8 100644 --- a/chain/chain/src/chain_update.rs +++ b/chain/chain/src/chain_update.rs @@ -237,7 +237,7 @@ impl<'a> ChainUpdate<'a> { ); } if let Some(state_sync_info) = state_sync_info { - self.chain_store_update.add_state_sync_info(state_sync_info); + self.chain_store_update.add_state_sync_info(*block.hash(), state_sync_info); } self.chain_store_update.save_block_extra(block.hash(), BlockExtra { challenges_result }); diff --git a/chain/chain/src/store/mod.rs b/chain/chain/src/store/mod.rs index 8450c20de2a..e28c12147e7 100644 --- a/chain/chain/src/store/mod.rs +++ b/chain/chain/src/store/mod.rs @@ -1387,7 +1387,7 @@ pub struct ChainStoreUpdate<'a> { remove_blocks_to_catchup: Vec<(CryptoHash, CryptoHash)>, // A prev_hash to be removed with all the hashes associated with it remove_prev_blocks_to_catchup: Vec, - add_state_sync_infos: Vec, + add_state_sync_infos: Vec<(CryptoHash, StateSyncInfo)>, remove_state_sync_infos: Vec, challenged_blocks: HashSet, } @@ -2032,8 +2032,8 @@ impl<'a> ChainStoreUpdate<'a> { self.remove_prev_blocks_to_catchup.push(hash); } - pub fn add_state_sync_info(&mut self, info: StateSyncInfo) { - self.add_state_sync_infos.push(info); + pub fn add_state_sync_info(&mut self, block_hash: CryptoHash, info: StateSyncInfo) { + self.add_state_sync_infos.push((block_hash, info)); } pub fn remove_state_sync_info(&mut self, hash: CryptoHash) { @@ -2554,12 +2554,8 @@ impl<'a> ChainStoreUpdate<'a> { } } - for state_sync_info in self.add_state_sync_infos.drain(..) { - store_update.set_ser( - DBCol::StateDlInfos, - state_sync_info.sync_hash.as_ref(), - &state_sync_info, - )?; + for (block_hash, state_sync_info) in self.add_state_sync_infos.drain(..) { + store_update.set_ser(DBCol::StateDlInfos, block_hash.as_ref(), &state_sync_info)?; } for hash in self.remove_state_sync_infos.drain(..) 
{ store_update.delete(DBCol::StateDlInfos, hash.as_ref()); diff --git a/chain/client/src/client.rs b/chain/client/src/client.rs index fa9fbb9c26d..0b44083508c 100644 --- a/chain/client/src/client.rs +++ b/chain/client/src/client.rs @@ -34,8 +34,8 @@ use near_chain::types::{ StorageDataSource, }; use near_chain::{ - BlockProcessingArtifact, BlockStatus, Chain, ChainGenesis, ChainStoreAccess, Doomslug, - DoomslugThresholdMode, Provenance, + BlockProcessingArtifact, BlockStatus, Chain, ChainGenesis, ChainStoreAccess, ChainStoreUpdate, + Doomslug, DoomslugThresholdMode, Provenance, }; use near_chain_configs::{ ClientConfig, LogSummaryStyle, MutableValidatorSigner, UpdateableClientConfig, @@ -153,6 +153,9 @@ pub struct Client { /// Approvals for which we do not have the block yet pub pending_approvals: lru::LruCache>, + /// A mapping from the first block of an epoch that we know needs to be state synced for some shards + /// to a tracker that will find an appropriate sync_hash for state sync + catchup_tracker: HashMap, /// A mapping from a block for which a state sync is underway for the next epoch, and the object /// storing the current status of the state sync and blocks catch up pub catchup_state_syncs: HashMap, @@ -234,6 +237,91 @@ pub struct ProduceChunkResult { pub transactions_storage_proof: Option, } +/// This keeps track of the number of new chunks seen in each shard since the block that was passed to new() +/// This whole thing could be replaced with a much simpler function that just computes the number of new chunks +/// in each shard from scratch every time we call it, but in the unlikely and unfortunate case where a shard +/// hasn't had any chunks for a very long time, it would end up being a nontrivial inefficiency to do that +/// every time run_catchup() is called +pub struct NewChunkTracker { + last_checked_hash: CryptoHash, + last_checked_height: BlockHeight, + num_new_chunks: HashMap, + sync_hash: Option, +} + +impl NewChunkTracker { + fn new(
first_block_hash: CryptoHash, + first_block_height: BlockHeight, + shard_ids: &[ShardId], + ) -> Self { + Self { + last_checked_hash: first_block_hash, + last_checked_height: first_block_height, + num_new_chunks: shard_ids.iter().map(|shard_id| (*shard_id, 0)).collect(), + sync_hash: None, + } + } + + fn record_new_chunks(&mut self, header: &BlockHeader) -> Result { + let mut done = true; + for (shard_id, num_new_chunks) in self.num_new_chunks.iter_mut() { + let included = match header.chunk_mask().get(*shard_id as usize) { + Some(i) => *i, + None => { + return Err(Error::Other(format!( + "can't get shard {} in chunk mask for block {}", + shard_id, + header.hash() + ))); + } + }; + if included { + *num_new_chunks += 1; + } + if *num_new_chunks < 2 { + done = false; + } + } + self.last_checked_hash = *header.hash(); + self.last_checked_height = header.height(); + Ok(done) + } + + fn find_sync_hash(&mut self, chain: &Chain) -> Result, Error> { + if let Some(sync_hash) = self.sync_hash { + return Ok(Some(sync_hash)); + } + + let final_head = chain.final_head()?; + + while self.last_checked_height < final_head.height { + let next_hash = match chain.chain_store().get_next_block_hash(&self.last_checked_hash) { + Ok(h) => h, + Err(near_chain_primitives::Error::DBNotFoundErr(_)) => { + return Err(Error::Other(format!( + "final head is #{} {} but get_next_block_hash(#{} {}) is not found", + final_head.height, + final_head.last_block_hash, + self.last_checked_height, + &self.last_checked_hash + ))); + } + Err(e) => return Err(e.into()), + }; + let next_header = chain.get_block_header(&next_hash)?; + let done = self.record_new_chunks(&next_header)?; + if done { + // TODO: check to make sure the epoch IDs are the same. 
If there are no new chunks in some shard in the epoch, + // this will be for an epoch ahead of this one + self.sync_hash = Some(next_hash); + break; + } + } + Ok(self.sync_hash) + } +} + impl Client { pub fn new( clock: Clock, @@ -386,6 +474,7 @@ impl Client { pending_approvals: lru::LruCache::new( NonZeroUsize::new(num_block_producer_seats).unwrap(), ), + catchup_tracker: HashMap::new(), catchup_state_syncs: HashMap::new(), epoch_sync, header_sync, @@ -2453,6 +2542,23 @@ impl Client { Ok(false) } + fn set_state_sync_hash<'a>( + mut update: ChainStoreUpdate<'a>, + epoch_first_block: CryptoHash, + state_sync_info: &mut StateSyncInfo, + sync_hash: CryptoHash, + ) -> Result<(), Error> { + state_sync_info.sync_hash = sync_hash; + // note that iterate_state_sync_infos() collects everything into a Vec, so we're not + // actually writing to the DB while actively iterating this column + update.add_state_sync_info(epoch_first_block, state_sync_info.clone()); + + // TODO: would be nice to be able to propagate context up the call stack so we can just log + // once at the top with all the info. Otherwise this error will look very cryptic + update.commit()?; + Ok(()) + } + /// Walks through all the ongoing state syncs for future epochs and processes them pub fn run_catchup( &mut self, @@ -2468,48 +2574,91 @@ impl Client { let mut notify_state_sync = false; let me = signer.as_ref().map(|x| x.validator_id().clone()); - for (epoch_first_block, state_sync_info) in + for (epoch_first_block, mut state_sync_info) in self.chain.chain_store().iterate_state_sync_infos()? 
{ - assert_eq!(epoch_first_block, state_sync_info.sync_hash); - let network_adapter = self.network_adapter.clone(); - let shards_to_split = self.get_shards_to_split(epoch_first_block, &state_sync_info, &me)?; let state_sync_timeout = self.config.state_sync_timeout; let block_header = self.chain.get_block(&epoch_first_block)?.header().clone(); let epoch_id = block_header.epoch_id(); - let CatchupState { state_sync, state_downloads, catchup } = match self - .catchup_state_syncs - .entry(epoch_first_block) + let CatchupState { state_sync, state_downloads, catchup } = if state_sync_info.sync_hash + != CryptoHash::default() { - std::collections::hash_map::Entry::Occupied(e) => e.into_mut(), - std::collections::hash_map::Entry::Vacant(e) => { - tracing::debug!(target: "client", ?epoch_first_block, "inserting new state sync"); + match self.catchup_state_syncs.entry(state_sync_info.sync_hash) { + std::collections::hash_map::Entry::Occupied(e) => e.into_mut(), + std::collections::hash_map::Entry::Vacant(e) => { + tracing::debug!(target: "client", ?epoch_first_block, sync_hash = ?&state_sync_info.sync_hash, "inserting new state sync"); + notify_state_sync = true; + e.insert(CatchupState { + state_sync: StateSync::new( + self.clock.clone(), + self.network_adapter.clone(), + state_sync_timeout, + &self.config.chain_id, + &self.config.state_sync.sync, + true, + ), + state_downloads: shards_to_split.clone(), + catchup: BlocksCatchUpState::new(state_sync_info.sync_hash, *epoch_id), + }) + } + } + } else { + let new_chunk_tracker = match self.catchup_tracker.entry(epoch_first_block) { + std::collections::hash_map::Entry::Occupied(e) => e.into_mut(), + std::collections::hash_map::Entry::Vacant(e) => { + let shard_ids = self.epoch_manager.shard_ids(block_header.epoch_id())?; + e.insert(NewChunkTracker::new( + *block_header.hash(), + block_header.height(), + &shard_ids, + )) + } + }; + if let Some(sync_hash) = new_chunk_tracker.find_sync_hash(&self.chain)? 
{ + let update = self.chain.mut_chain_store().store_update(); + Self::set_state_sync_hash( + update, + epoch_first_block, + &mut state_sync_info, + sync_hash, + )?; + tracing::debug!(target: "client", ?epoch_first_block, ?sync_hash, "inserting new state sync"); notify_state_sync = true; - let state = CatchupState { - state_sync: StateSync::new( - self.clock.clone(), - network_adapter, - state_sync_timeout, - &self.config.chain_id, - &self.config.state_sync.sync, - true, - ), - state_downloads: shards_to_split, - catchup: BlocksCatchUpState::new(epoch_first_block, *epoch_id), - }; - e.insert(state) + match self.catchup_state_syncs.entry(sync_hash) { + std::collections::hash_map::Entry::Vacant(e) => e.insert(CatchupState { + state_sync: StateSync::new( + self.clock.clone(), + self.network_adapter.clone(), + state_sync_timeout, + &self.config.chain_id, + &self.config.state_sync.sync, + true, + ), + state_downloads: shards_to_split.clone(), + catchup: BlocksCatchUpState::new(sync_hash, *epoch_id), + }), + std::collections::hash_map::Entry::Occupied(_) => { + panic!("Error occurred during catchup. Already stored state sync info for {}", sync_hash); + } + } + } else { + continue; } }; + // Here if the state sync structs are set, it must mean we've found a sync hash to use + assert_ne!(state_sync_info.sync_hash, CryptoHash::default()); + // For colour decorators to work, they need to printed directly. Otherwise the decorators get escaped, garble output and don't add colours. 
- debug!(target: "catchup", ?me, ?epoch_first_block, progress_per_shard = ?format_shard_sync_phase_per_shard(&state_downloads, false), "Catchup"); + debug!(target: "catchup", ?me, sync_hash = ?&state_sync_info.sync_hash, progress_per_shard = ?format_shard_sync_phase_per_shard(&state_downloads, false), "Catchup"); let use_colour = matches!(self.config.log_summary_style, LogSummaryStyle::Colored); let tracking_shards: Vec = state_sync_info.shards.iter().map(|tuple| tuple.0).collect(); + // Notify each shard to sync. if notify_state_sync { let shard_layout = self @@ -2540,7 +2689,7 @@ impl Client { let new_shard_sync = state_downloads; match state_sync.run( &me, - epoch_first_block, + state_sync_info.sync_hash, new_shard_sync, &mut self.chain, self.epoch_manager.as_ref(), @@ -2557,7 +2706,7 @@ impl Client { debug!(target: "catchup", "state sync completed now catch up blocks"); self.chain.catchup_blocks_step( &me, - &epoch_first_block, + &state_sync_info.sync_hash, catchup, block_catch_up_task_scheduler, )?; @@ -2568,6 +2717,7 @@ impl Client { self.chain.finish_catchup_blocks( &me, &epoch_first_block, + &state_sync_info.sync_hash, &mut block_processing_artifacts, apply_chunks_done_sender.clone(), &catchup.done_blocks, diff --git a/core/primitives/src/sharding.rs b/core/primitives/src/sharding.rs index ef2c33ca3c1..35e6ee36f24 100644 --- a/core/primitives/src/sharding.rs +++ b/core/primitives/src/sharding.rs @@ -56,11 +56,11 @@ impl From for ChunkHash { } } -#[derive(Debug, PartialEq, BorshSerialize, BorshDeserialize)] +#[derive(Clone, Debug, PartialEq, BorshSerialize, BorshDeserialize)] pub struct ShardInfo(pub ShardId, pub ChunkHash); /// Contains the information that is used to sync state for shards as epochs switch -#[derive(Debug, PartialEq, BorshSerialize, BorshDeserialize)] +#[derive(Clone, Debug, PartialEq, BorshSerialize, BorshDeserialize)] pub struct StateSyncInfo { /// The block we'll use as the "sync_hash" when state syncing. 
Previously, state sync /// used the first block of an epoch as the "sync_hash", and synced state to the epoch before. From f4eba89b383a1732a659dc49a498b7ca3a8ea540 Mon Sep 17 00:00:00 2001 From: Marcelo Diop-Gonzalez Date: Tue, 17 Sep 2024 00:59:29 -0400 Subject: [PATCH 07/70] move get_epoch_start_sync_hash to Chain --- chain/chain/src/chain.rs | 20 ++++++++++++++++++++ chain/client/src/client_actor.rs | 5 ++--- chain/client/src/sync/state.rs | 23 ----------------------- nearcore/src/state_sync.rs | 4 ++-- tools/state-viewer/src/state_parts.rs | 7 +++---- 5 files changed, 27 insertions(+), 32 deletions(-) diff --git a/chain/chain/src/chain.rs b/chain/chain/src/chain.rs index d1b555f0aca..a5559df80ee 100644 --- a/chain/chain/src/chain.rs +++ b/chain/chain/src/chain.rs @@ -2411,6 +2411,26 @@ impl Chain { ) } + /// Find the hash of the first block on the same epoch (and chain) of block with hash `sync_hash`. + pub fn get_epoch_start_sync_hash(&self, sync_hash: &CryptoHash) -> Result { + let mut header = self.get_block_header(sync_hash)?; + let mut epoch_id = *header.epoch_id(); + let mut hash = *header.hash(); + let mut prev_hash = *header.prev_hash(); + loop { + if prev_hash == CryptoHash::default() { + return Ok(hash); + } + header = self.get_block_header(&prev_hash)?; + if &epoch_id != header.epoch_id() { + return Ok(hash); + } + epoch_id = *header.epoch_id(); + hash = *header.hash(); + prev_hash = *header.prev_hash(); + } + } + /// Computes ShardStateSyncResponseHeader. 
pub fn compute_state_response_header( &self, diff --git a/chain/client/src/client_actor.rs b/chain/client/src/client_actor.rs index cc5bc42c80f..51ab8ba3ac1 100644 --- a/chain/client/src/client_actor.rs +++ b/chain/client/src/client_actor.rs @@ -13,7 +13,7 @@ use crate::debug::new_network_info_view; use crate::info::{display_sync_status, InfoHelper}; use crate::stateless_validation::partial_witness::partial_witness_actor::PartialWitnessSenderForClient; use crate::sync::adapter::{SyncMessage, SyncShardInfo}; -use crate::sync::state::{StateSync, StateSyncResult}; +use crate::sync::state::StateSyncResult; use crate::sync_jobs_actor::{ClientSenderForSyncJobs, SyncJobsActor}; use crate::{metrics, StatusResponse, SyncAdapter}; use actix::Actor; @@ -1587,8 +1587,7 @@ impl ClientActorInner { fn find_sync_hash(&mut self) -> Result { let header_head = self.client.chain.header_head()?; let sync_hash = header_head.last_block_hash; - let epoch_start_sync_hash = - StateSync::get_epoch_start_sync_hash(&mut self.client.chain, &sync_hash)?; + let epoch_start_sync_hash = self.client.chain.get_epoch_start_sync_hash(&sync_hash)?; let genesis_hash = self.client.chain.genesis().hash(); tracing::debug!( diff --git a/chain/client/src/sync/state.rs b/chain/client/src/sync/state.rs index 0344ab89d5e..f39639d9c72 100644 --- a/chain/client/src/sync/state.rs +++ b/chain/client/src/sync/state.rs @@ -460,29 +460,6 @@ impl StateSync { self.load_memtrie_results.insert(shard_uid, result); } - /// Find the hash of the first block on the same epoch (and chain) of block with hash `sync_hash`. 
- pub fn get_epoch_start_sync_hash( - chain: &Chain, - sync_hash: &CryptoHash, - ) -> Result { - let mut header = chain.get_block_header(sync_hash)?; - let mut epoch_id = *header.epoch_id(); - let mut hash = *header.hash(); - let mut prev_hash = *header.prev_hash(); - loop { - if prev_hash == CryptoHash::default() { - return Ok(hash); - } - header = chain.get_block_header(&prev_hash)?; - if &epoch_id != header.epoch_id() { - return Ok(hash); - } - epoch_id = *header.epoch_id(); - hash = *header.hash(); - prev_hash = *header.prev_hash(); - } - } - // Function called when our node receives the network response with a part. pub fn received_requested_part( &mut self, diff --git a/nearcore/src/state_sync.rs b/nearcore/src/state_sync.rs index 73c3ed5feff..46b79349c87 100644 --- a/nearcore/src/state_sync.rs +++ b/nearcore/src/state_sync.rs @@ -15,7 +15,7 @@ use near_client::sync::external::{ external_storage_location_directory, get_part_id_from_filename, is_part_filename, ExternalConnection, }; -use near_client::sync::state::{StateSync, STATE_DUMP_ITERATION_TIME_LIMIT_SECS}; +use near_client::sync::state::STATE_DUMP_ITERATION_TIME_LIMIT_SECS; use near_epoch_manager::shard_tracker::ShardTracker; use near_epoch_manager::EpochManagerAdapter; use near_primitives::hash::CryptoHash; @@ -662,7 +662,7 @@ fn get_latest_epoch( let hash = head.last_block_hash; let header = chain.get_block_header(&hash)?; let final_hash = header.last_final_block(); - let sync_hash = StateSync::get_epoch_start_sync_hash(chain, final_hash)?; + let sync_hash = chain.get_epoch_start_sync_hash(final_hash)?; let final_block_header = chain.get_block_header(&final_hash)?; let epoch_id = *final_block_header.epoch_id(); let epoch_info = epoch_manager.get_epoch_info(&epoch_id)?; diff --git a/tools/state-viewer/src/state_parts.rs b/tools/state-viewer/src/state_parts.rs index 229a41a92b2..26bf8385118 100644 --- a/tools/state-viewer/src/state_parts.rs +++ b/tools/state-viewer/src/state_parts.rs @@ -6,7 +6,6 @@ use 
near_client::sync::external::{ external_storage_location_directory, get_num_parts_from_filename, ExternalConnection, StateFileType, }; -use near_client::sync::state::StateSync; use near_epoch_manager::shard_tracker::{ShardTracker, TrackedConfig}; use near_epoch_manager::EpochManager; use near_primitives::challenge::PartialState; @@ -339,7 +338,7 @@ async fn load_state_parts( let epoch = chain.epoch_manager.get_epoch_info(&epoch_id).unwrap(); let sync_hash = get_any_block_hash_of_epoch(&epoch, chain); - let sync_hash = StateSync::get_epoch_start_sync_hash(chain, &sync_hash).unwrap(); + let sync_hash = chain.get_epoch_start_sync_hash(&sync_hash).unwrap(); let state_header = chain.get_state_response_header(shard_id, sync_hash).unwrap(); let state_root = state_header.chunk_prev_state_root(); @@ -440,7 +439,7 @@ async fn dump_state_parts( let epoch_id = epoch_selection.to_epoch_id(store, chain); let epoch = chain.epoch_manager.get_epoch_info(&epoch_id).unwrap(); let sync_hash = get_any_block_hash_of_epoch(&epoch, chain); - let sync_hash = StateSync::get_epoch_start_sync_hash(chain, &sync_hash).unwrap(); + let sync_hash = chain.get_epoch_start_sync_hash(&sync_hash).unwrap(); let sync_block_header = chain.get_block_header(&sync_hash).unwrap(); let sync_prev_header = chain.get_previous_header(&sync_block_header).unwrap(); let sync_prev_prev_hash = sync_prev_header.prev_hash(); @@ -542,7 +541,7 @@ fn read_state_header( let epoch = chain.epoch_manager.get_epoch_info(&epoch_id).unwrap(); let sync_hash = get_any_block_hash_of_epoch(&epoch, chain); - let sync_hash = StateSync::get_epoch_start_sync_hash(chain, &sync_hash).unwrap(); + let sync_hash = chain.get_epoch_start_sync_hash(&sync_hash).unwrap(); let state_header = chain.chain_store().get_state_header(shard_id, sync_hash); tracing::info!(target: "state-parts", ?epoch_id, ?sync_hash, ?state_header); From 53e933c4cfe3b0884a374425666ddd70aa0459ea Mon Sep 17 00:00:00 2001 From: Marcelo Diop-Gonzalez Date: Tue, 17 Sep 2024 
11:08:38 -0400 Subject: [PATCH 08/70] dump state for the current epoch --- chain/chain/src/chain.rs | 117 +++++++++++++++++++++++++++++-------- nearcore/src/state_sync.rs | 17 +++++- 2 files changed, 108 insertions(+), 26 deletions(-) diff --git a/chain/chain/src/chain.rs b/chain/chain/src/chain.rs index a5559df80ee..8d1e61cf97e 100644 --- a/chain/chain/src/chain.rs +++ b/chain/chain/src/chain.rs @@ -781,6 +781,9 @@ impl Chain { } else { debug!(target: "chain", "Downloading state for {:?}, I'm {:?}", shards_to_state_sync, me); + // This is not the way protocol features are normally gated, but this is not actually + // a protocol feature/change. The protocol version is just an easy way to coordinate + // among nodes for now let protocol_version = self.epoch_manager.get_epoch_protocol_version(block.header().epoch_id())?; let sync_hash = @@ -1801,7 +1804,7 @@ impl Chain { let (apply_chunk_work, block_preprocess_info) = preprocess_res; // 2) Start creating snapshot if needed. - if let Err(err) = self.process_snapshot() { + if let Err(err) = self.process_snapshot(block.header()) { tracing::error!(target: "state_snapshot", ?err, "Failed to make a state snapshot"); } @@ -2411,17 +2414,18 @@ impl Chain { ) } - /// Find the hash of the first block on the same epoch (and chain) of block with hash `sync_hash`. 
- pub fn get_epoch_start_sync_hash(&self, sync_hash: &CryptoHash) -> Result { - let mut header = self.get_block_header(sync_hash)?; - let mut epoch_id = *header.epoch_id(); - let mut hash = *header.hash(); - let mut prev_hash = *header.prev_hash(); + fn get_epoch_start_sync_hash_impl( + &self, + sync_header: &BlockHeader, + ) -> Result { + let mut epoch_id = *sync_header.epoch_id(); + let mut hash = *sync_header.hash(); + let mut prev_hash = *sync_header.prev_hash(); loop { if prev_hash == CryptoHash::default() { return Ok(hash); } - header = self.get_block_header(&prev_hash)?; + let header = self.get_block_header(&prev_hash)?; if &epoch_id != header.epoch_id() { return Ok(hash); } @@ -2431,6 +2435,63 @@ impl Chain { } } + /// Find the hash of the first block on the same epoch (and chain) of block with hash `sync_hash`. + pub fn get_epoch_start_sync_hash(&self, sync_hash: &CryptoHash) -> Result { + let header = self.get_block_header(sync_hash)?; + self.get_epoch_start_sync_hash_impl(&header) + } + + // Returns the first block for which at least two new chunks have been produced for every shard in the epoch + pub fn get_current_epoch_sync_hash( + &self, + sync_header: &BlockHeader, + ) -> Result, Error> { + let epoch_start = Self::get_epoch_start_sync_hash_impl(self, sync_header)?; + if epoch_start == *sync_header.hash() { + return Ok(None); + } + let mut header = self.get_block_header(&epoch_start)?; + + let shard_ids = self.epoch_manager.shard_ids(header.epoch_id())?; + let mut num_new_chunks: HashMap<_, _> = + shard_ids.iter().map(|shard_id| (*shard_id, 0)).collect(); + + loop { + header = if header.hash() == sync_header.prev_hash() { + sync_header.clone() + } else { + let next_hash = self.chain_store().get_next_block_hash(header.hash())?; + self.get_block_header(&next_hash)? 
+ }; + + let mut done = true; + for (shard_id, num_new_chunks) in num_new_chunks.iter_mut() { + let included = match header.chunk_mask().get(*shard_id as usize) { + Some(i) => *i, + None => { + return Err(Error::Other(format!( + "can't get shard {} in chunk mask for block {}", + shard_id, + header.hash() + ))); + } + }; + if included { + *num_new_chunks += 1; + } + if *num_new_chunks < 2 { + done = false; + } + } + if done { + return Ok(Some(*header.hash())); + } + if header.hash() == sync_header.hash() { + return Ok(None); + } + } + } + /// Computes ShardStateSyncResponseHeader. pub fn compute_state_response_header( &self, @@ -2456,10 +2517,9 @@ impl Chain { // The chunk was applied at height `chunk_header.height_included`. // Getting the `current` state. + // TODO: check that the sync block is what we would expect. So, either the first + // block of an epoch, or the first block where there have been two new chunks in the epoch let sync_prev_block = self.get_block(sync_block_header.prev_hash())?; - if sync_block_epoch_id == sync_prev_block.header().epoch_id() { - return Err(sync_hash_not_first_hash(sync_hash)); - } // Chunk header here is the same chunk header as at the `current` height. let sync_prev_hash = sync_prev_block.hash(); let chunks = sync_prev_block.chunks(); @@ -2663,9 +2723,6 @@ impl Chain { return Err(shard_id_out_of_bounds(shard_id)); } let prev_block = self.get_block(header.prev_hash())?; - if epoch_id == prev_block.header().epoch_id() { - return Err(sync_hash_not_first_hash(sync_hash)); - } let state_root = prev_block .chunks() .get(shard_id as usize) @@ -3803,8 +3860,8 @@ impl Chain { } /// Function to create or delete a snapshot if necessary. 
- fn process_snapshot(&mut self) -> Result<(), Error> { - let (make_snapshot, delete_snapshot) = self.should_make_or_delete_snapshot()?; + fn process_snapshot(&mut self, next_block: &BlockHeader) -> Result<(), Error> { + let (make_snapshot, delete_snapshot) = self.should_make_or_delete_snapshot(next_block)?; if !make_snapshot && !delete_snapshot { return Ok(()); } @@ -3851,7 +3908,10 @@ impl Chain { /// Function to check whether we need to create a new snapshot while processing the current block /// Note that this functions is called as a part of block preprocesing, so the head is not updated to current block - fn should_make_or_delete_snapshot(&mut self) -> Result<(bool, bool), Error> { + fn should_make_or_delete_snapshot( + &mut self, + next_block: &BlockHeader, + ) -> Result<(bool, bool), Error> { // head value is that of the previous block, i.e. curr_block.prev_hash let head = self.head()?; if head.prev_block_hash == CryptoHash::default() { @@ -3863,11 +3923,26 @@ impl Chain { self.epoch_manager.is_next_block_epoch_start(&head.last_block_hash)?; let will_shard_layout_change = self.epoch_manager.will_shard_layout_change(&head.last_block_hash)?; + let next_block_epoch = + self.epoch_manager.get_epoch_id_from_prev_block(&head.last_block_hash)?; + let next_block_protocol_version = + self.epoch_manager.get_epoch_protocol_version(&next_block_epoch)?; + let next_block_is_sync_block = if next_block_protocol_version < 72 { + is_epoch_boundary + } else { + // FIXME: before submitting, this needs to be fixed. can't be iterating over the whole chain inside of preprocess + // block like that if there are many missed chunks + match self.get_current_epoch_sync_hash(next_block)? 
{ + Some(sync_hash) => sync_hash == *next_block.hash(), + None => false, + } + }; + let tries = self.runtime_adapter.get_tries(); let snapshot_config = tries.state_snapshot_config(); let make_snapshot = match snapshot_config.state_snapshot_type { // For every epoch, we snapshot if the next block would be in a different epoch - StateSnapshotType::EveryEpoch => is_epoch_boundary, + StateSnapshotType::EveryEpoch => next_block_is_sync_block, // For resharding only, we snapshot if next block would be in a different shard layout StateSnapshotType::ForReshardingOnly => is_epoch_boundary && will_shard_layout_change, }; @@ -3993,12 +4068,6 @@ fn shard_id_out_of_bounds(shard_id: ShardId) -> Error { Error::InvalidStateRequest(format!("shard_id {shard_id:?} out of bounds").into()) } -fn sync_hash_not_first_hash(sync_hash: CryptoHash) -> Error { - Error::InvalidStateRequest( - format!("sync_hash {sync_hash:?} is not the first hash of the epoch").into(), - ) -} - /// We want to guarantee that transactions are only applied once for each shard, /// even though apply_chunks may be called twice, once with /// ApplyChunksMode::NotCaughtUp once with ApplyChunksMode::CatchingUp. Note diff --git a/nearcore/src/state_sync.rs b/nearcore/src/state_sync.rs index 46b79349c87..b802e33256e 100644 --- a/nearcore/src/state_sync.rs +++ b/nearcore/src/state_sync.rs @@ -265,6 +265,11 @@ fn get_current_state( err })?; + let new_sync_hash = match new_sync_hash { + Some(h) => h, + None => return Ok(StateDumpAction::Wait), + }; + if Some(&new_epoch_id) == was_last_epoch_done.as_ref() { tracing::debug!(target: "state_sync_dump", shard_id, ?was_last_epoch_done, ?new_epoch_id, new_epoch_height, ?new_sync_hash, "latest epoch is done. No new epoch to dump. 
Idle"); Ok(StateDumpAction::Wait) @@ -648,7 +653,7 @@ struct LatestEpochInfo { prev_epoch_id: EpochId, epoch_id: EpochId, epoch_height: EpochHeight, - sync_hash: CryptoHash, + sync_hash: Option, } /// return epoch_id and sync_hash of the latest complete epoch available locally. @@ -662,12 +667,20 @@ fn get_latest_epoch( let hash = head.last_block_hash; let header = chain.get_block_header(&hash)?; let final_hash = header.last_final_block(); - let sync_hash = chain.get_epoch_start_sync_hash(final_hash)?; let final_block_header = chain.get_block_header(&final_hash)?; let epoch_id = *final_block_header.epoch_id(); let epoch_info = epoch_manager.get_epoch_info(&epoch_id)?; let prev_epoch_id = epoch_manager.get_prev_epoch_id_from_prev_block(&head.prev_block_hash)?; let epoch_height = epoch_info.epoch_height(); + + // This is not the way protocol features are normally gated, but this is not actually + // a protocol feature/change. The protocol version is just an easy way to coordinate + // among nodes for now + let sync_hash = if epoch_info.protocol_version() >= 72 { + chain.get_current_epoch_sync_hash(&final_block_header)? + } else { + Some(chain.get_epoch_start_sync_hash(final_hash)?) 
+ }; tracing::debug!(target: "state_sync_dump", ?final_hash, ?sync_hash, ?epoch_id, epoch_height, "get_latest_epoch"); Ok(LatestEpochInfo { prev_epoch_id, epoch_id, epoch_height, sync_hash }) From 5c9055fab45d531651407c2e33c438f9badf47f3 Mon Sep 17 00:00:00 2001 From: Marcelo Diop-Gonzalez Date: Tue, 17 Sep 2024 11:17:13 -0400 Subject: [PATCH 09/70] get rid of store validator check --- chain/chain/src/store_validator.rs | 7 ------- chain/chain/src/store_validator/validate.rs | 9 --------- 2 files changed, 16 deletions(-) diff --git a/chain/chain/src/store_validator.rs b/chain/chain/src/store_validator.rs index 43744ce105d..6a3f0d8471a 100644 --- a/chain/chain/src/store_validator.rs +++ b/chain/chain/src/store_validator.rs @@ -249,13 +249,6 @@ impl StoreValidator { DBCol::StateDlInfos => { let block_hash = CryptoHash::try_from(key_ref)?; let state_sync_info = StateSyncInfo::try_from_slice(value_ref)?; - // StateSyncInfo is valid - self.check( - &validate::state_sync_info_valid, - &block_hash, - &state_sync_info, - col, - ); // Block which can be indexed by StateSyncInfo exists self.check( &validate::state_sync_info_block_exists, diff --git a/chain/chain/src/store_validator/validate.rs b/chain/chain/src/store_validator/validate.rs index de89f4e4edd..1cc352d588c 100644 --- a/chain/chain/src/store_validator/validate.rs +++ b/chain/chain/src/store_validator/validate.rs @@ -715,15 +715,6 @@ pub(crate) fn outcome_indexed_by_block_hash( err!("Outcome id {:?} is not found in DBCol::OutcomeIds", outcome_id) } -pub(crate) fn state_sync_info_valid( - _sv: &mut StoreValidator, - block_hash: &CryptoHash, - state_sync_info: &StateSyncInfo, -) -> Result<(), StoreValidatorError> { - check_discrepancy!(state_sync_info.sync_hash, *block_hash, "Invalid StateSyncInfo stored"); - Ok(()) -} - pub(crate) fn state_sync_info_block_exists( sv: &mut StoreValidator, block_hash: &CryptoHash, From 281c156ecc4776e4e4ebfa5b9810c132491a9adf Mon Sep 17 00:00:00 2001 From: Marcelo Diop-Gonzalez Date: 
Tue, 17 Sep 2024 11:48:14 -0400 Subject: [PATCH 10/70] add TODOs --- chain/chain/src/chain.rs | 1 + chain/client/src/client_actor.rs | 1 + 2 files changed, 2 insertions(+) diff --git a/chain/chain/src/chain.rs b/chain/chain/src/chain.rs index 8d1e61cf97e..e60c51c4dbb 100644 --- a/chain/chain/src/chain.rs +++ b/chain/chain/src/chain.rs @@ -4512,6 +4512,7 @@ impl Chain { } /// Check that sync_hash is the first block of an epoch. + /// TODO: allow the new way of computing the sync hash for syncing to the current epoch pub fn check_sync_hash_validity(&self, sync_hash: &CryptoHash) -> Result { // It's important to check that Block exists because we will sync with it. // Do not replace with `get_block_header()`. diff --git a/chain/client/src/client_actor.rs b/chain/client/src/client_actor.rs index 51ab8ba3ac1..1efcf8c303f 100644 --- a/chain/client/src/client_actor.rs +++ b/chain/client/src/client_actor.rs @@ -1584,6 +1584,7 @@ impl ClientActorInner { /// /// The selected block will always be the first block on a new epoch: /// . 
+ /// TODO: allow the new way of computing the sync hash for syncing to the current epoch fn find_sync_hash(&mut self) -> Result { let header_head = self.client.chain.header_head()?; let sync_hash = header_head.last_block_hash; From 386df33c9d58e02947e3afed430ed2ac2add65b5 Mon Sep 17 00:00:00 2001 From: Marcelo Diop-Gonzalez Date: Wed, 18 Sep 2024 14:31:38 -0400 Subject: [PATCH 11/70] remove test_mock_node_basic --- tools/mock-node/src/setup.rs | 188 ----------------------------------- 1 file changed, 188 deletions(-) diff --git a/tools/mock-node/src/setup.rs b/tools/mock-node/src/setup.rs index 99dba008def..17a5279e15c 100644 --- a/tools/mock-node/src/setup.rs +++ b/tools/mock-node/src/setup.rs @@ -273,191 +273,3 @@ pub fn setup_mock_node( MockNode { target_height, mock_peer, rpc_client } } - -#[cfg(test)] -mod tests { - use crate::setup::{setup_mock_node, MockNode}; - use crate::MockNetworkConfig; - use actix::{Actor, System}; - use futures::{future, FutureExt}; - use near_actix_test_utils::{run_actix, spawn_interruptible}; - use near_chain::{ChainStore, ChainStoreAccess}; - use near_chain_configs::{Genesis, NEAR_BASE}; - use near_client::{GetBlock, ProcessTxRequest}; - use near_crypto::{InMemorySigner, KeyType}; - use near_epoch_manager::{EpochManager, EpochManagerAdapter}; - use near_network::tcp; - use near_network::test_utils::{wait_or_timeout, WaitOrTimeoutActor}; - use near_o11y::testonly::init_integration_logger; - use near_o11y::WithSpanContextExt; - use near_primitives::transaction::SignedTransaction; - use near_store::test_utils::gen_account_from_alphabet; - use nearcore::{load_test_config, start_with_config}; - use rand::thread_rng; - use std::ops::ControlFlow; - use std::sync::{Arc, RwLock}; - use std::time::Duration; - - // Test the basic mocknet setup. - // This test first starts a localnet with one validator node that generates 2 epochs of blocks - // to generate a chain history. 
- // Then start a mock network with this chain history and test that the client in the mock network can catch up these 2 epochs. - // The localnet needs to have state snapshots enabled. It copies state from - // one instance to another by using the state sync mechanism, which relies - // on the flat storage snapshots. - #[test] - fn test_mock_node_basic() { - init_integration_logger(); - - // first set up a network with only one validator and generate some blocks - let mut genesis = - Genesis::test(vec!["test0".parse().unwrap(), "test1".parse().unwrap()], 1); - let epoch_length = 50; - genesis.config.epoch_length = epoch_length; - let mut near_config = - load_test_config("test0", tcp::ListenerAddr::reserve_for_test(), genesis.clone()); - near_config.client_config.min_num_peers = 0; - near_config.config.store.state_snapshot_enabled = true; - near_config.config.tracked_shards = vec![0]; // Track all shards. - - let dir = tempfile::Builder::new().prefix("test0").tempdir().unwrap(); - let path1 = dir.path(); - run_actix(async move { - let nearcore::NearNode { view_client, client, .. 
} = - start_with_config(path1, near_config).expect("start_with_config"); - - let view_client1 = view_client; - let nonce = Arc::new(RwLock::new(10)); - WaitOrTimeoutActor::new( - Box::new(move |_ctx| { - let nonce = nonce.clone(); - let client1 = client.clone(); - let actor = view_client1.send(GetBlock::latest().with_span_context()); - let actor = actor.then(move |res| { - if let Ok(Ok(block)) = res { - let next_nonce = *nonce.read().unwrap(); - if next_nonce < 100 { - WaitOrTimeoutActor::new( - Box::new(move |_ctx| { - let signer0 = InMemorySigner::from_seed( - "test1".parse().unwrap(), - KeyType::ED25519, - "test1", - ); - let mut rng = thread_rng(); - let transaction = SignedTransaction::create_account( - next_nonce, - "test1".parse().unwrap(), - gen_account_from_alphabet(&mut rng, b"abcdefghijklmn"), - 5 * NEAR_BASE, - signer0.public_key.clone(), - &signer0.into(), - block.header.hash, - ); - spawn_interruptible( - client1 - .send( - ProcessTxRequest { - transaction, - is_forwarded: false, - check_only: false, - } - .with_span_context(), - ) - .then(move |_res| future::ready(())), - ); - }), - 100, - 30000, - ) - .start(); - *nonce.write().unwrap() = next_nonce + 1; - } - - // This is the flaky part. - // The node needs to stop as late into an epoch as - // possible without going over into the next epoch. - let expected_height = epoch_length * 3 - 5; - if block.header.height >= expected_height { - tracing::info!( - block_height = block.header.height, - expected_height, - "Time to stop" - ); - System::current().stop() - } - } - future::ready(()) - }); - spawn_interruptible(actor); - }), - // Keep this number low to ensure the node is stopped late in - // the epoch without going into the next epoch. 
- 100, - 60000, - ) - .start(); - }); - - // start the mock network to simulate a new node "test1" to sync up - // start the client at height 10 (end of the first epoch) - let dir1 = tempfile::Builder::new().prefix("test1").tempdir().unwrap(); - let mut near_config1 = load_test_config("", tcp::ListenerAddr::reserve_for_test(), genesis); - near_config1.client_config.min_num_peers = 1; - near_config1.client_config.tracked_shards = vec![0]; // Track all shards. - near_config1.config.store.state_snapshot_enabled = true; - let network_config = MockNetworkConfig::with_delay(Duration::from_millis(10)); - - let client_start_height = { - tracing::info!(target: "mock_node", store_path = ?dir.path(), "Opening the created store to get client_start_height"); - let store = near_store::NodeStorage::opener( - dir.path(), - near_config1.config.archive, - &near_config1.config.store, - None, - ) - .open() - .unwrap() - .get_hot_store(); - let epoch_manager = - EpochManager::new_arc_handle(store.clone(), &near_config1.genesis.config); - let chain_store = ChainStore::new( - store, - near_config1.genesis.config.genesis_height, - near_config1.client_config.save_trie_changes, - ); - let network_head_hash = chain_store.head().unwrap().last_block_hash; - let last_epoch_start_height = - epoch_manager.get_epoch_start_height(&network_head_hash).unwrap(); - tracing::info!(target: "mock_node", ?network_head_hash, last_epoch_start_height); - // Needs to be the last block of an epoch. - last_epoch_start_height - 1 - }; - tracing::info!(target: "mock_node", client_start_height); - - run_actix(async { - let MockNode { rpc_client, .. 
} = setup_mock_node( - dir1.path(), - dir.path(), - near_config1, - &network_config, - client_start_height, - None, - None, - false, - tcp::ListenerAddr::reserve_for_test(), - ); - wait_or_timeout(100, 60000, || async { - if let Ok(status) = rpc_client.status().await { - if status.sync_info.latest_block_height >= client_start_height { - System::current().stop(); - return ControlFlow::Break(()); - } - } - ControlFlow::Continue(()) - }) - .await - .unwrap(); - }) - } -} From 2eda3114b764dd57f8819a88bfa8d915c3a24193 Mon Sep 17 00:00:00 2001 From: Marcelo Diop-Gonzalez Date: Mon, 23 Sep 2024 21:41:26 -0400 Subject: [PATCH 12/70] make it a nightly protocol feature --- chain/chain/src/chain.rs | 28 ++++++++++++++++------------ core/primitives-core/src/version.rs | 6 ++++++ nearcore/src/state_sync.rs | 12 +++++++----- 3 files changed, 29 insertions(+), 17 deletions(-) diff --git a/chain/chain/src/chain.rs b/chain/chain/src/chain.rs index b36d1ddf528..8564db16d9b 100644 --- a/chain/chain/src/chain.rs +++ b/chain/chain/src/chain.rs @@ -788,8 +788,11 @@ impl Chain { // among nodes for now let protocol_version = self.epoch_manager.get_epoch_protocol_version(block.header().epoch_id())?; - let sync_hash = - if protocol_version < 72 { *block.header().hash() } else { CryptoHash::default() }; + let sync_hash = if checked_feature!("stable", StateSyncHashUpdate, protocol_version) { + CryptoHash::default() + } else { + *block.header().hash() + }; let state_sync_info = StateSyncInfo { sync_hash, shards: shards_to_state_sync @@ -4014,16 +4017,17 @@ impl Chain { self.epoch_manager.get_epoch_id_from_prev_block(&head.last_block_hash)?; let next_block_protocol_version = self.epoch_manager.get_epoch_protocol_version(&next_block_epoch)?; - let next_block_is_sync_block = if next_block_protocol_version < 72 { - is_epoch_boundary - } else { - // FIXME: before submitting, this needs to be fixed. 
can't be iterating over the whole chain inside of preprocess - // block like that if there are many missed chunks - match self.get_current_epoch_sync_hash(next_block)? { - Some(sync_hash) => sync_hash == *next_block.hash(), - None => false, - } - }; + let next_block_is_sync_block = + if !checked_feature!("stable", StateSyncHashUpdate, next_block_protocol_version) { + is_epoch_boundary + } else { + // FIXME: before submitting, this needs to be fixed. can't be iterating over the whole chain inside of preprocess + // block like that if there are many missed chunks + match self.get_current_epoch_sync_hash(next_block)? { + Some(sync_hash) => sync_hash == *next_block.hash(), + None => false, + } + }; let tries = self.runtime_adapter.get_tries(); let snapshot_config = tries.state_snapshot_config(); diff --git a/core/primitives-core/src/version.rs b/core/primitives-core/src/version.rs index 63c16f50278..161652a1ded 100644 --- a/core/primitives-core/src/version.rs +++ b/core/primitives-core/src/version.rs @@ -172,6 +172,11 @@ pub enum ProtocolFeature { ChunkEndorsementsInBlockHeader, /// Store receipts in State in the StateStoredReceipt format. StateStoredReceipt, + /// Indicates that the "sync_hash" used to identify the point in the chain to sync state to + /// should no longer be the first block of the epoch, but a couple blocks after that in order + /// to sync the current epoch's state. This is not strictly a protocol feature, but is included + /// here to coordinate among nodes + StateSyncHashUpdate, } impl ProtocolFeature { @@ -248,6 +253,7 @@ impl ProtocolFeature { // TODO(#11201): When stabilizing this feature in mainnet, also remove the temporary code // that always enables this for mocknet (see config_mocknet function). 
ProtocolFeature::ShuffleShardAssignments => 143, + ProtocolFeature::StateSyncHashUpdate => 144, } } diff --git a/nearcore/src/state_sync.rs b/nearcore/src/state_sync.rs index b802e33256e..727bacdd525 100644 --- a/nearcore/src/state_sync.rs +++ b/nearcore/src/state_sync.rs @@ -18,6 +18,7 @@ use near_client::sync::external::{ use near_client::sync::state::STATE_DUMP_ITERATION_TIME_LIMIT_SECS; use near_epoch_manager::shard_tracker::ShardTracker; use near_epoch_manager::EpochManagerAdapter; +use near_primitives::checked_feature; use near_primitives::hash::CryptoHash; use near_primitives::state_part::PartId; use near_primitives::state_sync::{StatePartKey, StateSyncDumpProgress}; @@ -676,11 +677,12 @@ fn get_latest_epoch( // This is not the way protocol features are normally gated, but this is not actually // a protocol feature/change. The protocol version is just an easy way to coordinate // among nodes for now - let sync_hash = if epoch_info.protocol_version() >= 72 { - chain.get_current_epoch_sync_hash(&final_block_header)? - } else { - Some(chain.get_epoch_start_sync_hash(final_hash)?) - }; + let sync_hash = + if checked_feature!("stable", StateSyncHashUpdate, epoch_info.protocol_version()) { + chain.get_current_epoch_sync_hash(&final_block_header)? + } else { + Some(chain.get_epoch_start_sync_hash(final_hash)?) 
+ }; tracing::debug!(target: "state_sync_dump", ?final_hash, ?sync_hash, ?epoch_id, epoch_height, "get_latest_epoch"); Ok(LatestEpochInfo { prev_epoch_id, epoch_id, epoch_height, sync_hash }) From 5900d8c567e263a04d51b742d175a93076b51e24 Mon Sep 17 00:00:00 2001 From: Marcelo Diop-Gonzalez Date: Mon, 23 Sep 2024 22:07:52 -0400 Subject: [PATCH 13/70] comments --- chain/chain/src/chain.rs | 11 +++++++---- chain/client/src/client.rs | 7 ++++++- chain/client/src/client_actor.rs | 2 +- 3 files changed, 14 insertions(+), 6 deletions(-) diff --git a/chain/chain/src/chain.rs b/chain/chain/src/chain.rs index 8564db16d9b..c78b6933d98 100644 --- a/chain/chain/src/chain.rs +++ b/chain/chain/src/chain.rs @@ -2607,7 +2607,7 @@ impl Chain { // The chunk was applied at height `chunk_header.height_included`. // Getting the `current` state. - // TODO: check that the sync block is what we would expect. So, either the first + // TODO(current_epoch_state_sync): check that the sync block is what we would expect. So, either the first // block of an epoch, or the first block where there have been two new chunks in the epoch let sync_prev_block = self.get_block(sync_block_header.prev_hash())?; // Chunk header here is the same chunk header as at the `current` height. @@ -3377,7 +3377,7 @@ impl Chain { &mut self, me: &Option, epoch_first_block: &CryptoHash, - // TODO: remove the ones not in affected_blocks by breadth first searching from `epoch_first_block` and adding + // TODO(current_epoch_state_sync): remove the ones not in affected_blocks by breadth first searching from `epoch_first_block` and adding // descendant blocks to the search when they're not equal to this hash, and then removing everything we see in that search _catchup_start_block: &CryptoHash, block_processing_artifacts: &mut BlockProcessingArtifact, @@ -3950,6 +3950,7 @@ impl Chain { } /// Function to create or delete a snapshot if necessary. 
+ /// TODO: this function calls head() inside of start_process_block_impl(), consider changing to next_block.header().prev_hash() fn process_snapshot(&mut self, next_block: &BlockHeader) -> Result<(), Error> { let (make_snapshot, delete_snapshot) = self.should_make_or_delete_snapshot(next_block)?; if !make_snapshot && !delete_snapshot { @@ -4603,7 +4604,7 @@ impl Chain { } /// Check that sync_hash is the first block of an epoch. - /// TODO: allow the new way of computing the sync hash for syncing to the current epoch + /// TODO(current_epoch_state_sync): allow the new way of computing the sync hash for syncing to the current epoch pub fn check_sync_hash_validity(&self, sync_hash: &CryptoHash) -> Result { // It's important to check that Block exists because we will sync with it. // Do not replace with `get_block_header()`. @@ -4902,7 +4903,9 @@ pub struct ChunkStateWitnessMessage { } /// Helper to track blocks catch up -/// Lifetime of a block_hash is as follows: +/// Starting from the first block we want to apply after syncing state (so either the first block +/// of an epoch, or a couple blocks after that, if syncing the current epoch's state) the lifetime +/// of a block_hash is as follows: /// 1. It is added to pending blocks, either as first block of an epoch or because we (post) /// processed previous block /// 2. Block is preprocessed and scheduled for processing in sync jobs actor. Block hash diff --git a/chain/client/src/client.rs b/chain/client/src/client.rs index 0b44083508c..d98a2344c7f 100644 --- a/chain/client/src/client.rs +++ b/chain/client/src/client.rs @@ -111,9 +111,14 @@ pub enum AdvProduceBlocksMode { OnlyValid, } +/// The state associated with downloading state for a shard this node will track in the +/// future but does not currently. pub struct CatchupState { + /// Manages downloading the state. pub state_sync: StateSync, + /// Keeps track of state downloads, and gets passed to `state_sync`. 
pub state_downloads: HashMap, + /// Manages going back to apply chunks after state has been downloaded. pub catchup: BlocksCatchUpState, } @@ -312,7 +317,7 @@ impl NewChunkTracker { let next_header = chain.get_block_header(&next_hash)?; let done = self.record_new_chunks(&next_header)?; if done { - // TODO: check to make sure the epoch IDs are the same. If there are no new chunks in some shard in the epoch, + // TODO(current_epoch_state_sync): check to make sure the epoch IDs are the same. If there are no new chunks in some shard in the epoch, // this will be for an epoch ahead of this one self.sync_hash = Some(next_hash); break; diff --git a/chain/client/src/client_actor.rs b/chain/client/src/client_actor.rs index 1efcf8c303f..10047e482e9 100644 --- a/chain/client/src/client_actor.rs +++ b/chain/client/src/client_actor.rs @@ -1584,7 +1584,7 @@ impl ClientActorInner { /// /// The selected block will always be the first block on a new epoch: /// . - /// TODO: allow the new way of computing the sync hash for syncing to the current epoch + /// TODO(current_epoch_state_sync): allow the new way of computing the sync hash for syncing to the current epoch fn find_sync_hash(&mut self) -> Result { let header_head = self.client.chain.header_head()?; let sync_hash = header_head.last_block_hash; From 8dea6ad21b89684af4ee1f414f583aadf13e51f5 Mon Sep 17 00:00:00 2001 From: Marcelo Diop-Gonzalez Date: Mon, 23 Sep 2024 22:18:52 -0400 Subject: [PATCH 14/70] map epoch ID to NewChunkTracker --- chain/client/src/client.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/chain/client/src/client.rs b/chain/client/src/client.rs index d98a2344c7f..bde97510464 100644 --- a/chain/client/src/client.rs +++ b/chain/client/src/client.rs @@ -158,9 +158,9 @@ pub struct Client { /// Approvals for which we do not have the block yet pub pending_approvals: lru::LruCache>, - /// A mapping from the first block of an epoch that we know needs to be state synced for some shards - 
/// to a tracker that will find an appropriate sync_hash for state sync - catchup_tracker: HashMap, + /// A mapping from an epoch that we know needs to be state synced for some shards + /// to a tracker that will find an appropriate sync_hash for state sync to that epoch + catchup_tracker: HashMap, /// A mapping from a block for which a state sync is underway for the next epoch, and the object /// storing the current status of the state sync and blocks catch up pub catchup_state_syncs: HashMap, @@ -2611,7 +2611,7 @@ impl Client { } } } else { - let new_chunk_tracker = match self.catchup_tracker.entry(epoch_first_block) { + let new_chunk_tracker = match self.catchup_tracker.entry(*block_header.epoch_id()) { std::collections::hash_map::Entry::Occupied(e) => e.into_mut(), std::collections::hash_map::Entry::Vacant(e) => { let shard_ids = self.epoch_manager.shard_ids(block_header.epoch_id())?; From 78610c73fa1e82f88803d77fc67f027bc425f045 Mon Sep 17 00:00:00 2001 From: Marcelo Diop-Gonzalez Date: Mon, 23 Sep 2024 22:24:44 -0400 Subject: [PATCH 15/70] remove set_state_sync_hash --- chain/client/src/client.rs | 37 +++++++++++-------------------------- 1 file changed, 11 insertions(+), 26 deletions(-) diff --git a/chain/client/src/client.rs b/chain/client/src/client.rs index bde97510464..459e23ebe67 100644 --- a/chain/client/src/client.rs +++ b/chain/client/src/client.rs @@ -34,8 +34,8 @@ use near_chain::types::{ StorageDataSource, }; use near_chain::{ - BlockProcessingArtifact, BlockStatus, Chain, ChainGenesis, ChainStoreAccess, ChainStoreUpdate, - Doomslug, DoomslugThresholdMode, Provenance, + BlockProcessingArtifact, BlockStatus, Chain, ChainGenesis, ChainStoreAccess, Doomslug, + DoomslugThresholdMode, Provenance, }; use near_chain_configs::{ ClientConfig, LogSummaryStyle, MutableValidatorSigner, UpdateableClientConfig, @@ -2547,23 +2547,6 @@ impl Client { Ok(false) } - fn set_state_sync_hash<'a>( - mut update: ChainStoreUpdate<'a>, - epoch_first_block: CryptoHash, - 
state_sync_info: &mut StateSyncInfo, - sync_hash: CryptoHash, - ) -> Result<(), Error> { - state_sync_info.sync_hash = sync_hash; - // note that iterate_state_sync_infos() collects everything into a Vec, so we're not - // actually writing to the DB while actively iterating this column - update.add_state_sync_info(epoch_first_block, state_sync_info.clone()); - - // TODO: would be nice to be able to propagate context up the call stack so we can just log - // once at the top with all the info. Otherwise this error will look very cryptic - update.commit()?; - Ok(()) - } - /// Walks through all the ongoing state syncs for future epochs and processes them pub fn run_catchup( &mut self, @@ -2623,13 +2606,15 @@ impl Client { } }; if let Some(sync_hash) = new_chunk_tracker.find_sync_hash(&self.chain)? { - let update = self.chain.mut_chain_store().store_update(); - Self::set_state_sync_hash( - update, - epoch_first_block, - &mut state_sync_info, - sync_hash, - )?; + state_sync_info.sync_hash = sync_hash; + let mut update = self.chain.mut_chain_store().store_update(); + // note that iterate_state_sync_infos() collects everything into a Vec, so we're not + // actually writing to the DB while actively iterating this column + update.add_state_sync_info(epoch_first_block, state_sync_info.clone()); + // TODO: would be nice to be able to propagate context up the call stack so we can just log + // once at the top with all the info. 
Otherwise this error will look very cryptic + update.commit()?; + tracing::debug!(target: "client", ?epoch_first_block, ?sync_hash, "inserting new state sync"); notify_state_sync = true; match self.catchup_state_syncs.entry(sync_hash) { From e5d428e88eb3c17b0633ae4e1b8be57623eb34d9 Mon Sep 17 00:00:00 2001 From: Marcelo Diop-Gonzalez Date: Tue, 24 Sep 2024 11:17:34 -0400 Subject: [PATCH 16/70] add TODO --- chain/chain/src/chain.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/chain/chain/src/chain.rs b/chain/chain/src/chain.rs index c78b6933d98..69c9a4d7d66 100644 --- a/chain/chain/src/chain.rs +++ b/chain/chain/src/chain.rs @@ -2504,6 +2504,10 @@ impl Chain { ) } + // TODO(current_epoch_state_sync): move state sync related code to state sync files + // when this is no longer needed in this file after we more efficiently + // implement should_make_or_delete_snapshot(). Also, the logic in this function can probably + // be made simpler, as we shouldn't need to iterate over all blocks just to find the epoch start. 
fn get_epoch_start_sync_hash_impl( &self, sync_header: &BlockHeader, From 2379aece6c44a803a058131a81ca9b44e75db8a9 Mon Sep 17 00:00:00 2001 From: Marcelo Diop-Gonzalez Date: Tue, 24 Sep 2024 11:22:37 -0400 Subject: [PATCH 17/70] rename sync_header -> next_header --- chain/chain/src/chain.rs | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/chain/chain/src/chain.rs b/chain/chain/src/chain.rs index 69c9a4d7d66..71ca3b4e92c 100644 --- a/chain/chain/src/chain.rs +++ b/chain/chain/src/chain.rs @@ -2536,12 +2536,13 @@ impl Chain { } // Returns the first block for which at least two new chunks have been produced for every shard in the epoch + // `next_header` is the next block header that is being processed in start_process_block_impl() pub fn get_current_epoch_sync_hash( &self, - sync_header: &BlockHeader, + next_header: &BlockHeader, ) -> Result, Error> { - let epoch_start = Self::get_epoch_start_sync_hash_impl(self, sync_header)?; - if epoch_start == *sync_header.hash() { + let epoch_start = Self::get_epoch_start_sync_hash_impl(self, next_header)?; + if epoch_start == *next_header.hash() { return Ok(None); } let mut header = self.get_block_header(&epoch_start)?; @@ -2551,8 +2552,8 @@ impl Chain { shard_ids.iter().map(|shard_id| (*shard_id, 0)).collect(); loop { - header = if header.hash() == sync_header.prev_hash() { - sync_header.clone() + header = if header.hash() == next_header.prev_hash() { + next_header.clone() } else { let next_hash = self.chain_store().get_next_block_hash(header.hash())?; self.get_block_header(&next_hash)? 
@@ -2580,7 +2581,7 @@ impl Chain { if done { return Ok(Some(*header.hash())); } - if header.hash() == sync_header.hash() { + if header.hash() == next_header.hash() { return Ok(None); } } From 5e2f65446300dfc6dbbe9958a2ade59e810255e8 Mon Sep 17 00:00:00 2001 From: Marcelo Diop-Gonzalez Date: Mon, 30 Sep 2024 22:39:34 -0400 Subject: [PATCH 18/70] add integration test --- core/chain-configs/src/test_genesis.rs | 13 +- integration-tests/src/test_loop/builder.rs | 17 +- .../test_loop/tests/fix_min_stake_ratio.rs | 2 +- integration-tests/src/test_loop/tests/mod.rs | 1 + .../src/test_loop/tests/state_sync.rs | 290 ++++++++++++++++++ .../src/test_loop/utils/network.rs | 78 ++++- .../src/test_loop/utils/transactions.rs | 29 +- 7 files changed, 401 insertions(+), 29 deletions(-) create mode 100644 integration-tests/src/test_loop/tests/state_sync.rs diff --git a/core/chain-configs/src/test_genesis.rs b/core/chain-configs/src/test_genesis.rs index 7f3e826d7d6..7d3fd0900ea 100644 --- a/core/chain-configs/src/test_genesis.rs +++ b/core/chain-configs/src/test_genesis.rs @@ -189,15 +189,18 @@ impl TestGenesisBuilder { pub fn validators_raw( &mut self, validators: Vec, - num_block_and_chunk_producer_seats: NumSeats, + num_block_producer_seats: NumSeats, + num_chunk_producer_seats: NumSeats, num_chunk_validator_only_seats: NumSeats, ) -> &mut Self { + let num_chunk_validator_seats = + std::cmp::max(num_block_producer_seats, num_chunk_producer_seats) + + num_chunk_validator_only_seats; self.validators = Some(ValidatorsSpec::Raw { validators, - num_block_producer_seats: num_block_and_chunk_producer_seats, - num_chunk_producer_seats: num_block_and_chunk_producer_seats, - num_chunk_validator_seats: num_block_and_chunk_producer_seats - + num_chunk_validator_only_seats, + num_block_producer_seats, + num_chunk_producer_seats, + num_chunk_validator_seats, }); self } diff --git a/integration-tests/src/test_loop/builder.rs b/integration-tests/src/test_loop/builder.rs index 
e086d883ded..f0c68abebef 100644 --- a/integration-tests/src/test_loop/builder.rs +++ b/integration-tests/src/test_loop/builder.rs @@ -60,6 +60,12 @@ pub(crate) struct TestLoopBuilder { chunks_storage: Arc>, /// Whether test loop should drop all chunks validated by the given account. drop_chunks_validated_by: Option, + /// Specifies the chunks that should be produced by their appearance in the + /// chain with respect to the start of an epoch. That is, a given chunk at height + /// `height_included` for shard `shard_id` will be produced if + /// chunks_produced[`height_included` - `epoch_start`][`shard_id`] is true, or if + /// `height_included` - `epoch_start` > chunks_produced.len() + chunks_produced: Option>>, /// Whether test loop should drop all endorsements from the given account. drop_endorsements_from: Option, /// Number of latest epochs to keep before garbage collecting associated data. @@ -82,6 +88,7 @@ impl TestLoopBuilder { test_loop_data_dir: None, archival_clients: HashSet::new(), chunks_storage: Default::default(), + chunks_produced: None, drop_chunks_validated_by: None, drop_endorsements_from: None, gc_num_epochs_to_keep: None, @@ -128,6 +135,11 @@ impl TestLoopBuilder { self } + pub(crate) fn chunks_produced(mut self, chunks_produced: Vec>) -> Self { + self.chunks_produced = Some(chunks_produced); + self + } + pub(crate) fn drop_chunks_validated_by(mut self, account_id: &str) -> Self { self.drop_chunks_validated_by = Some(account_id.parse().unwrap()); self @@ -525,11 +537,12 @@ impl TestLoopBuilder { Arc::new(self.test_loop.future_spawner()), ); - if let Some(account_id) = &self.drop_chunks_validated_by { + if self.drop_chunks_validated_by.is_some() || self.chunks_produced.is_some() { peer_manager_actor.register_override_handler(partial_encoded_chunks_dropper( self.chunks_storage.clone(), epoch_manager_adapters[idx].clone(), - account_id.clone(), + self.drop_chunks_validated_by.clone(), + self.chunks_produced.clone(), )); } diff --git 
a/integration-tests/src/test_loop/tests/fix_min_stake_ratio.rs b/integration-tests/src/test_loop/tests/fix_min_stake_ratio.rs index b38941f3951..7f8494ab323 100644 --- a/integration-tests/src/test_loop/tests/fix_min_stake_ratio.rs +++ b/integration-tests/src/test_loop/tests/fix_min_stake_ratio.rs @@ -64,7 +64,7 @@ fn test_fix_min_stake_ratio() { .shard_layout(ShardLayout::get_simple_nightshade_layout_v3()) .protocol_version(ProtocolFeature::FixMinStakeRatio.protocol_version() - 1) .epoch_length(epoch_length) - .validators_raw(validators, 1, 2) + .validators_raw(validators, 1, 1, 2) // For genesis, set high minimum stake ratio so that small validator // will be excluded from the validator set. .minimum_stake_ratio(Rational32::new(1, 6_250)) diff --git a/integration-tests/src/test_loop/tests/mod.rs b/integration-tests/src/test_loop/tests/mod.rs index 29c5ab9348b..3ee67522822 100644 --- a/integration-tests/src/test_loop/tests/mod.rs +++ b/integration-tests/src/test_loop/tests/mod.rs @@ -8,5 +8,6 @@ pub mod max_receipt_size; pub mod multinode_stateless_validators; pub mod multinode_test_loop_example; pub mod simple_test_loop_example; +pub mod state_sync; pub mod syncing; pub mod view_requests_to_archival_node; diff --git a/integration-tests/src/test_loop/tests/state_sync.rs b/integration-tests/src/test_loop/tests/state_sync.rs new file mode 100644 index 00000000000..1789b14d183 --- /dev/null +++ b/integration-tests/src/test_loop/tests/state_sync.rs @@ -0,0 +1,290 @@ +use near_async::messaging::SendAsync; +use near_async::test_loop::TestLoopV2; +use near_async::time::Duration; +use near_chain_configs::test_genesis::TestGenesisBuilder; +use near_network::client::ProcessTxRequest; +use near_o11y::testonly::init_test_logger; +use near_primitives::test_utils::create_user_test_signer; +use near_primitives::transaction::SignedTransaction; +use near_primitives::types::{AccountId, AccountInfo, BlockHeightDelta, Nonce, NumSeats}; + +use 
crate::test_loop::builder::TestLoopBuilder; +use crate::test_loop::env::{TestData, TestLoopEnv}; +use crate::test_loop::utils::transactions::get_anchor_hash; +use crate::test_loop::utils::ONE_NEAR; + +use itertools::Itertools; + +const EPOCH_LENGTH: BlockHeightDelta = 40; + +struct ShardAccounts { + boundary_accounts: Vec, + accounts: Vec>, +} + +fn generate_accounts(num_shards: usize) -> ShardAccounts { + let accounts_per_shard = 5; + + if num_shards > 27 { + todo!("don't know how to include more than 27 shards yet!"); + } + let mut boundary_accounts = Vec::::new(); + for c in b'a'..=b'z' { + if boundary_accounts.len() + 1 >= num_shards { + break; + } + let mut boundary_account = format!("{}", c as char); + while boundary_account.len() < AccountId::MIN_LEN { + boundary_account.push('0'); + } + boundary_accounts.push(boundary_account); + } + + let mut accounts = Vec::new(); + let mut account_base = "0"; + for a in boundary_accounts.iter() { + accounts.push( + (0..accounts_per_shard) + .map(|i| (format!("{}{}", account_base, i).parse().unwrap(), 1)) + .collect::>(), + ); + account_base = a.as_str(); + } + accounts.push( + (0..accounts_per_shard) + .map(|i| (format!("{}{}", account_base, i).parse().unwrap(), 1)) + .collect::>(), + ); + + ShardAccounts { boundary_accounts, accounts } +} + +struct TestState { + env: TestLoopEnv, + accounts: Vec>, +} + +fn setup_initial_blockchain(num_shards: usize, chunks_produced: Vec>) -> TestState { + let builder = TestLoopBuilder::new(); + + let num_block_producer_seats = 1; + let num_chunk_producer_seats = num_shards; + let num_validators = std::cmp::max(num_block_producer_seats, num_chunk_producer_seats); + let validators = (0..num_validators) + .map(|i| { + let account_id = format!("node{}", i); + AccountInfo { + account_id: account_id.parse().unwrap(), + public_key: near_primitives::test_utils::create_test_signer(account_id.as_str()) + .public_key(), + amount: 10000 * ONE_NEAR, + } + }) + .collect::>(); + let clients = 
validators.iter().map(|v| v.account_id.clone()).collect::>(); + + let ShardAccounts { boundary_accounts, accounts } = generate_accounts(num_shards); + + let mut genesis_builder = TestGenesisBuilder::new(); + genesis_builder + .genesis_time_from_clock(&builder.clock()) + .protocol_version_latest() + .genesis_height(10000) + .epoch_length(EPOCH_LENGTH) + .gas_prices_free() + .gas_limit_one_petagas() + .shard_layout_simple_v1(&boundary_accounts.iter().map(|s| s.as_str()).collect::>()) + .transaction_validity_period(1000) + .epoch_length(10) + .validators_raw( + validators, + num_block_producer_seats as NumSeats, + num_chunk_producer_seats as NumSeats, + 0, + ) + // shuffle the shard assignment so that nodes will have to state sync to catch up future tracked shards. + // This part is the only reference to state sync at all in this test, since all we check is that the blockchain + // progresses for a few epochs, meaning that state sync must have been successful. + .shuffle_shard_assignment_for_chunk_producers(true); + for accounts in accounts.iter() { + for (account, _nonce) in accounts.iter() { + genesis_builder.add_user_account_simple(account.clone(), 10000 * ONE_NEAR); + } + } + let genesis = genesis_builder.build(); + + let env = + builder.genesis(genesis.clone()).clients(clients).chunks_produced(chunks_produced).build(); + + TestState { env, accounts } +} + +fn get_wrapped(s: &[T], idx: usize) -> &T { + &s[idx % s.len()] +} + +fn get_wrapped_mut(s: &mut [T], idx: usize) -> &mut T { + &mut s[idx % s.len()] +} + +/// tries to generate transactions between lots of different pairs of shards (accounts for shard i are in accounts[i]) +fn send_txs_between_shards( + test_loop: &mut TestLoopV2, + node_data: &[TestData], + accounts: &mut [Vec<(AccountId, Nonce)>], +) { + let clients = node_data + .iter() + .map(|data| &test_loop.data.get(&data.client_sender.actor_handle()).client) + .collect_vec(); + let block_hash = get_anchor_hash(&clients); + + let num_shards = 
accounts.len(); + + // which client should we send txs to next? + let mut client_idx = 0; + let mut from_shard = 0; + // which account should we choose among all the accounts of a shard? + let mut account_idx = 0; + let mut shard_diff = 1; + + let mut txs_sent = 0; + while txs_sent < 200 { + let to_shard = (from_shard + shard_diff) % num_shards; + let (receiver, _nonce) = get_wrapped(&accounts[to_shard], account_idx); + let receiver = receiver.clone(); + let (sender, nonce) = get_wrapped_mut(&mut accounts[from_shard], account_idx); + + let tx = SignedTransaction::send_money( + *nonce, + sender.clone(), + receiver.clone(), + &create_user_test_signer(sender).into(), + 1000, + block_hash, + ); + *nonce += 1; + + let future = get_wrapped(node_data, client_idx) + .client_sender + .clone() + //.with_delay(Duration::milliseconds(300 * txs_sent as i64)) + .send_async(ProcessTxRequest { + transaction: tx, + is_forwarded: false, + check_only: false, + }); + drop(future); + + txs_sent += 1; + from_shard = (from_shard + 1) % num_shards; + if from_shard == 0 { + shard_diff += 1; + } + account_idx += 1; + client_idx = 1; + } +} + +/// runs the network and sends transactions at the beginning of each epoch. At the end the condition we're +/// looking for is just that a few epochs have passed, because that should only be possible if state sync was successful +/// (which will be required because we enable chunk producer shard shuffling on this chain) +fn produce_chunks(env: &mut TestLoopEnv, mut accounts: Vec>) { + let handle = env.datas[0].client_sender.actor_handle(); + let client = &env.test_loop.data.get(&handle).client; + let mut tip = client.chain.head().unwrap(); + // TODO: make this more precise. We don't have to wait 2 whole seconds, but the amount we wait will + // depend on whether this block is meant to have skipped chunks. 
+ let timeout = client.config.min_block_production_delay + Duration::seconds(2); + + let mut epoch_id_switches = 0; + loop { + env.test_loop.run_until( + |data| { + let client = &data.get(&handle).client; + let new_tip = client.chain.head().unwrap(); + new_tip.height != tip.height + }, + timeout, + ); + + let handle = env.datas[0].client_sender.actor_handle(); + let client = &env.test_loop.data.get(&handle).client; + let new_tip = client.chain.head().unwrap(); + let header = client.chain.get_block_header(&tip.last_block_hash).unwrap(); + tracing::debug!("chunk mask for #{} {:?}", header.height(), header.chunk_mask()); + + if new_tip.epoch_id != tip.epoch_id { + epoch_id_switches += 1; + if epoch_id_switches > 2 { + break; + } + send_txs_between_shards(&mut env.test_loop, &env.datas, &mut accounts); + } + tip = new_tip; + } +} + +fn run_test(state: TestState) { + let TestState { mut env, mut accounts } = state; + let handle = env.datas[0].client_sender.actor_handle(); + let client = &env.test_loop.data.get(&handle).client; + let first_epoch_time = client.config.min_block_production_delay + * u32::try_from(EPOCH_LENGTH).unwrap_or(u32::MAX) + + Duration::seconds(2); + + send_txs_between_shards(&mut env.test_loop, &env.datas, &mut accounts); + + env.test_loop.run_until( + |data| { + let handle = env.datas[0].client_sender.actor_handle(); + let client = &data.get(&handle).client; + let tip = client.chain.head().unwrap(); + tip.epoch_id != Default::default() + }, + first_epoch_time, + ); + + produce_chunks(&mut env, accounts); + env.shutdown_and_drain_remaining_events(Duration::seconds(3)); +} + +#[derive(Debug)] +struct StateSyncTest { + num_shards: usize, + chunks_produced: &'static [&'static [bool]], +} + +static TEST_CASES: &[StateSyncTest] = &[ + // The first two make no modifications to chunks_produced, and all chunks should be produced. 
This is the normal case + StateSyncTest { num_shards: 2, chunks_produced: &[] }, + StateSyncTest { num_shards: 4, chunks_produced: &[] }, + // Now we miss some chunks at the beginning of the epoch + StateSyncTest { num_shards: 4, chunks_produced: &[&[false, true, true, true]] }, + StateSyncTest { + num_shards: 4, + chunks_produced: &[&[true, true, true, true], &[false, false, true, true]], + }, + StateSyncTest { + num_shards: 4, + chunks_produced: &[ + &[false, false, false, false], + &[true, true, true, true], + &[true, false, true, true], + ], + }, +]; + +#[test] +fn test_state_sync_current_epoch() { + init_test_logger(); + + for t in TEST_CASES.iter() { + tracing::info!("run test: {:?}", t); + let state = setup_initial_blockchain( + t.num_shards, + t.chunks_produced.to_vec().into_iter().map(|v| v.to_vec()).collect(), + ); + run_test(state); + } +} diff --git a/integration-tests/src/test_loop/utils/network.rs b/integration-tests/src/test_loop/utils/network.rs index 6181045bb90..099b9e7f3aa 100644 --- a/integration-tests/src/test_loop/utils/network.rs +++ b/integration-tests/src/test_loop/utils/network.rs @@ -1,20 +1,63 @@ use crate::test_loop::env::TestLoopChunksStorage; use near_epoch_manager::EpochManagerAdapter; use near_network::types::NetworkRequests; -use near_primitives::types::AccountId; +use near_primitives::hash::CryptoHash; +use near_primitives::types::{AccountId, BlockHeight, EpochId, ShardId}; use std::sync::{Arc, Mutex}; +/// returns chunks_produced[height_created - epoch_start][shard_id]. +/// If the chunk producer for this height and shard is the same as the block producer +/// for this height, however, we always return true. Because in that case the block producer +/// will include the chunk anyway even if we return false here, and then the other clients +/// will fail to download chunks for that block. 
+fn should_produce_chunk( + chunks_produced: &[Vec], + epoch_id: &EpochId, + prev_block_hash: &CryptoHash, + height_created: BlockHeight, + shard_id: ShardId, + epoch_manager_adapter: &dyn EpochManagerAdapter, +) -> bool { + let block_producer = + epoch_manager_adapter.get_block_producer(epoch_id, height_created).unwrap(); + let chunk_producer = + epoch_manager_adapter.get_chunk_producer(epoch_id, height_created, shard_id).unwrap(); + + if block_producer == chunk_producer { + return true; + } + let height_in_epoch = + if epoch_manager_adapter.is_next_block_epoch_start(prev_block_hash).unwrap() { + 0 + } else { + let epoch_start = + epoch_manager_adapter.get_epoch_start_height(prev_block_hash).unwrap(); + height_created - epoch_start + }; + let chunks_produced = match chunks_produced.get(height_in_epoch as usize) { + Some(s) => s, + None => return true, + }; + match chunks_produced.get(shard_id as usize) { + Some(should_produce) => *should_produce, + None => false, + } +} + /// Handler to drop all network messages relevant to chunk validated by -/// `validator_of_chunks_to_drop`. If number of nodes on chain is significant -/// enough (at least three?), this is enough to prevent chunk from being -/// included. +/// `validator_of_chunks_to_drop` or chunks we shouldn't include on chain as indicated +/// by `chunks_produced`. If number of nodes on chain is significant enough (at least three?), +/// dropping messages by `validator_of_chunks_to_drop` is enough to prevent chunk from being included. /// /// This logic can be easily extended to dropping chunk based on any rule. 
pub fn partial_encoded_chunks_dropper( chunks_storage: Arc>, epoch_manager_adapter: Arc, - validator_of_chunks_to_drop: AccountId, + validator_of_chunks_to_drop: Option, + chunks_produced: Option>>, ) -> Box Option> { + assert!(validator_of_chunks_to_drop.is_some() || chunks_produced.is_some()); + Box::new(move |request| { // Filter out only messages related to distributing chunk in the // network; extract `chunk_hash` from the message. @@ -65,13 +108,28 @@ pub fn partial_encoded_chunks_dropper( // Finally, we drop chunk if the given account is present in the list // of its validators. - let chunk_validators = epoch_manager_adapter - .get_chunk_validator_assignments(&epoch_id, shard_id, height_created) - .unwrap(); - if !chunk_validators.contains(&validator_of_chunks_to_drop) { - return Some(request); + if let Some(validator_of_chunks_to_drop) = &validator_of_chunks_to_drop { + let chunk_validators = epoch_manager_adapter + .get_chunk_validator_assignments(&epoch_id, shard_id, height_created) + .unwrap(); + if !chunk_validators.contains(validator_of_chunks_to_drop) { + return Some(request); + } } + // If that isn't the case, we drop the chunk if this shard and height_created should miss producing a chunk + if let Some(chunks_produced) = &chunks_produced { + if should_produce_chunk( + chunks_produced, + &epoch_id, + prev_block_hash, + height_created, + shard_id, + epoch_manager_adapter.as_ref(), + ) { + return Some(request); + } + } return None; }) } diff --git a/integration-tests/src/test_loop/utils/transactions.rs b/integration-tests/src/test_loop/utils/transactions.rs index 1f95988f16f..76a529ccd27 100644 --- a/integration-tests/src/test_loop/utils/transactions.rs +++ b/integration-tests/src/test_loop/utils/transactions.rs @@ -5,6 +5,7 @@ use near_async::messaging::{CanSend, MessageWithCallback, SendAsync}; use near_async::test_loop::TestLoopV2; use near_async::time::Duration; use near_client::test_utils::test_loop::ClientQueries; +use near_client::Client; use 
near_client::ProcessTxResponse; use near_network::client::ProcessTxRequest; use near_primitives::errors::InvalidTxError; @@ -18,6 +19,21 @@ use std::sync::{Arc, Mutex}; use super::{ONE_NEAR, TGAS}; +// Transactions have to be built on top of some block in chain. To make +// sure all clients accept them, we select the head of the client with +// the smallest height. +pub(crate) fn get_anchor_hash(clients: &[&Client]) -> CryptoHash { + let (_, anchor_hash) = clients + .iter() + .map(|client| { + let head = client.chain.head().unwrap(); + (head.height, head.last_block_hash) + }) + .min_by_key(|&(height, _)| height) + .unwrap(); + anchor_hash +} + /// Execute money transfers within given `TestLoop` between given accounts. /// Runs chain long enough for the transfers to be optimistically executed. /// Used to generate state changes and check that chain is able to update @@ -39,17 +55,8 @@ pub(crate) fn execute_money_transfers( .collect::>(); let num_clients = clients.len(); - // Transactions have to be built on top of some block in chain. To make - // sure all clients accept them, we select the head of the client with - // the smallest height. 
- let (_, anchor_hash) = clients - .iter() - .map(|client| { - let head = client.chain.head().unwrap(); - (head.height, head.last_block_hash) - }) - .min_by_key(|&(height, _)| height) - .unwrap(); + let anchor_hash = get_anchor_hash(&clients); + drop(clients); for i in 0..accounts.len() { From 4e9195a234b736c2a1cbf718f857e5858c3307f9 Mon Sep 17 00:00:00 2001 From: Marcelo Diop-Gonzalez Date: Wed, 2 Oct 2024 20:55:01 -0400 Subject: [PATCH 19/70] update the StateSyncInfo struct and make it a DB migration --- chain/chain/src/chain.rs | 6 ++-- chain/chain/src/store_validator.rs | 7 ++++ chain/chain/src/store_validator/validate.rs | 13 +++++++ chain/client/src/client.rs | 24 +++++++------ core/primitives/src/sharding.rs | 19 +++++++---- core/store/src/metadata.rs | 2 +- core/store/src/migrations.rs | 38 +++++++++++++++++++++ nearcore/src/migrations.rs | 1 + 8 files changed, 90 insertions(+), 20 deletions(-) diff --git a/chain/chain/src/chain.rs b/chain/chain/src/chain.rs index 71ca3b4e92c..6f12ec947ab 100644 --- a/chain/chain/src/chain.rs +++ b/chain/chain/src/chain.rs @@ -789,11 +789,13 @@ impl Chain { let protocol_version = self.epoch_manager.get_epoch_protocol_version(block.header().epoch_id())?; let sync_hash = if checked_feature!("stable", StateSyncHashUpdate, protocol_version) { - CryptoHash::default() + None } else { - *block.header().hash() + Some(*block.header().hash()) }; let state_sync_info = StateSyncInfo { + state_sync_version: 0, + epoch_first_block: *block.header().hash(), sync_hash, shards: shards_to_state_sync .iter() diff --git a/chain/chain/src/store_validator.rs b/chain/chain/src/store_validator.rs index 6a3f0d8471a..43744ce105d 100644 --- a/chain/chain/src/store_validator.rs +++ b/chain/chain/src/store_validator.rs @@ -249,6 +249,13 @@ impl StoreValidator { DBCol::StateDlInfos => { let block_hash = CryptoHash::try_from(key_ref)?; let state_sync_info = StateSyncInfo::try_from_slice(value_ref)?; + // StateSyncInfo is valid + self.check( + 
&validate::state_sync_info_valid, + &block_hash, + &state_sync_info, + col, + ); // Block which can be indexed by StateSyncInfo exists self.check( &validate::state_sync_info_block_exists, diff --git a/chain/chain/src/store_validator/validate.rs b/chain/chain/src/store_validator/validate.rs index 1cc352d588c..e5ee7aae9a0 100644 --- a/chain/chain/src/store_validator/validate.rs +++ b/chain/chain/src/store_validator/validate.rs @@ -715,6 +715,19 @@ pub(crate) fn outcome_indexed_by_block_hash( err!("Outcome id {:?} is not found in DBCol::OutcomeIds", outcome_id) } +pub(crate) fn state_sync_info_valid( + _sv: &mut StoreValidator, + block_hash: &CryptoHash, + state_sync_info: &StateSyncInfo, +) -> Result<(), StoreValidatorError> { + check_discrepancy!( + state_sync_info.epoch_first_block, + *block_hash, + "Invalid StateSyncInfo stored" + ); + Ok(()) +} + pub(crate) fn state_sync_info_block_exists( sv: &mut StoreValidator, block_hash: &CryptoHash, diff --git a/chain/client/src/client.rs b/chain/client/src/client.rs index 459e23ebe67..ac2b8e71360 100644 --- a/chain/client/src/client.rs +++ b/chain/client/src/client.rs @@ -2565,19 +2565,21 @@ impl Client { for (epoch_first_block, mut state_sync_info) in self.chain.chain_store().iterate_state_sync_infos()? 
{ + assert_eq!(epoch_first_block, state_sync_info.epoch_first_block); + let shards_to_split = self.get_shards_to_split(epoch_first_block, &state_sync_info, &me)?; let state_sync_timeout = self.config.state_sync_timeout; let block_header = self.chain.get_block(&epoch_first_block)?.header().clone(); let epoch_id = block_header.epoch_id(); - let CatchupState { state_sync, state_downloads, catchup } = if state_sync_info.sync_hash - != CryptoHash::default() + let CatchupState { state_sync, state_downloads, catchup } = if let Some(sync_hash) = + state_sync_info.sync_hash { - match self.catchup_state_syncs.entry(state_sync_info.sync_hash) { + match self.catchup_state_syncs.entry(sync_hash) { std::collections::hash_map::Entry::Occupied(e) => e.into_mut(), std::collections::hash_map::Entry::Vacant(e) => { - tracing::debug!(target: "client", ?epoch_first_block, sync_hash = ?&state_sync_info.sync_hash, "inserting new state sync"); + tracing::debug!(target: "client", ?epoch_first_block, ?sync_hash, "inserting new state sync"); notify_state_sync = true; e.insert(CatchupState { state_sync: StateSync::new( @@ -2589,7 +2591,7 @@ impl Client { true, ), state_downloads: shards_to_split.clone(), - catchup: BlocksCatchUpState::new(state_sync_info.sync_hash, *epoch_id), + catchup: BlocksCatchUpState::new(sync_hash, *epoch_id), }) } } @@ -2606,7 +2608,7 @@ impl Client { } }; if let Some(sync_hash) = new_chunk_tracker.find_sync_hash(&self.chain)? 
{ - state_sync_info.sync_hash = sync_hash; + state_sync_info.sync_hash = Some(sync_hash); let mut update = self.chain.mut_chain_store().store_update(); // note that iterate_state_sync_infos() collects everything into a Vec, so we're not // actually writing to the DB while actively iterating this column @@ -2640,10 +2642,10 @@ impl Client { }; // Here if the state sync structs are set, it must mean we've found a sync hash to use - assert_ne!(state_sync_info.sync_hash, CryptoHash::default()); + let sync_hash = state_sync_info.sync_hash.unwrap(); // For colour decorators to work, they need to printed directly. Otherwise the decorators get escaped, garble output and don't add colours. - debug!(target: "catchup", ?me, sync_hash = ?&state_sync_info.sync_hash, progress_per_shard = ?format_shard_sync_phase_per_shard(&state_downloads, false), "Catchup"); + debug!(target: "catchup", ?me, ?sync_hash, progress_per_shard = ?format_shard_sync_phase_per_shard(&state_downloads, false), "Catchup"); let use_colour = matches!(self.config.log_summary_style, LogSummaryStyle::Colored); let tracking_shards: Vec = @@ -2679,7 +2681,7 @@ impl Client { let new_shard_sync = state_downloads; match state_sync.run( &me, - state_sync_info.sync_hash, + sync_hash, new_shard_sync, &mut self.chain, self.epoch_manager.as_ref(), @@ -2696,7 +2698,7 @@ impl Client { debug!(target: "catchup", "state sync completed now catch up blocks"); self.chain.catchup_blocks_step( &me, - &state_sync_info.sync_hash, + &sync_hash, catchup, block_catch_up_task_scheduler, )?; @@ -2707,7 +2709,7 @@ impl Client { self.chain.finish_catchup_blocks( &me, &epoch_first_block, - &state_sync_info.sync_hash, + &sync_hash, &mut block_processing_artifacts, apply_chunks_done_sender.clone(), &catchup.done_blocks, diff --git a/core/primitives/src/sharding.rs b/core/primitives/src/sharding.rs index c3641f253df..5a396753bfc 100644 --- a/core/primitives/src/sharding.rs +++ b/core/primitives/src/sharding.rs @@ -62,16 +62,23 @@ pub struct 
ShardInfo(pub ShardId, pub ChunkHash); /// Contains the information that is used to sync state for shards as epochs switch #[derive(Clone, Debug, PartialEq, BorshSerialize, BorshDeserialize)] pub struct StateSyncInfo { + /// For now this is always set to 0. This is included because an improvement we might want to make in the future + /// is that when syncing to the current epoch's state, we currently wait for two new chunks in each shard, but + /// with some changes to the meaning of the "sync_hash", we should only need to wait for one. So this is included + /// in order to allow for this change in the future without needing another database migration. + pub state_sync_version: u32, + /// The first block of the epoch we want to state sync for. This field is not strictly required since + /// this struct is keyed by this hash in the database, but it's a small amount of data that makes + /// the info in this type more complete. + pub epoch_first_block: CryptoHash, /// The block we'll use as the "sync_hash" when state syncing. Previously, state sync /// used the first block of an epoch as the "sync_hash", and synced state to the epoch before. /// Now that state sync downloads the state of the current epoch, we need to wait a few blocks /// after applying the first block in an epoch to know what "sync_hash" we'll use. - /// In order to avoid the need for a database migration while keeping backward compatibility, - /// this field is no longer equal to the database key associated with this value, and is set to - /// CryptoHash::default() when we apply the first block in an epoch and want to sync the current - /// epoch's state. If we instead want to sync the previous epoch's state, we set this field in the same - /// way it used to be set - pub sync_hash: CryptoHash, + /// After we begin state syncing to the current epoch instead of the previous, this field is set to None + /// when we apply the first block. 
Before this change in the state sync protocol, we set this field + /// to Some(self.epoch_first_block). In either case, if it's set, that's the sync hash we want to use. + pub sync_hash: Option, /// Shards to fetch state pub shards: Vec, } diff --git a/core/store/src/metadata.rs b/core/store/src/metadata.rs index a07581e3922..eec060fdd5c 100644 --- a/core/store/src/metadata.rs +++ b/core/store/src/metadata.rs @@ -2,7 +2,7 @@ pub type DbVersion = u32; /// Current version of the database. -pub const DB_VERSION: DbVersion = 40; +pub const DB_VERSION: DbVersion = 41; /// Database version at which point DbKind was introduced. const DB_VERSION_WITH_KIND: DbVersion = 34; diff --git a/core/store/src/migrations.rs b/core/store/src/migrations.rs index e80cd7e6ec6..b8b2876b4d1 100644 --- a/core/store/src/migrations.rs +++ b/core/store/src/migrations.rs @@ -1,9 +1,11 @@ use crate::metadata::DbKind; use crate::{DBCol, Store, StoreUpdate}; +use anyhow::Context; use borsh::{BorshDeserialize, BorshSerialize}; use near_primitives::epoch_manager::EpochSummary; use near_primitives::epoch_manager::AGGREGATOR_KEY; use near_primitives::hash::CryptoHash; +use near_primitives::sharding::{ShardInfo, StateSyncInfo}; use near_primitives::state::FlatStateValue; use near_primitives::transaction::{ExecutionOutcomeWithIdAndProof, ExecutionOutcomeWithProof}; use near_primitives::types::{ @@ -358,3 +360,39 @@ pub fn migrate_39_to_40(store: &Store) -> anyhow::Result<()> { update.commit()?; Ok(()) } + +/// Migrates the database from version 40 to 41. 
+/// +/// This rewrites the contents of the StateDlInfos column +pub fn migrate_40_to_41(store: &Store) -> anyhow::Result<()> { + #[derive(BorshSerialize, BorshDeserialize)] + struct LegacyStateSyncInfo { + sync_hash: CryptoHash, + shards: Vec, + } + + let mut update = store.store_update(); + + for row in store.iter_prefix_ser::(DBCol::StateDlInfos, &[]) { + let (key, LegacyStateSyncInfo { sync_hash, shards }) = + row.context("failed deserializing legacy StateSyncInfo in StateDlInfos")?; + + let epoch_first_block = CryptoHash::try_from_slice(&key) + .context("failed deserializing CryptoHash key in StateDlInfos")?; + + if epoch_first_block != sync_hash { + tracing::warn!(key = %epoch_first_block, %sync_hash, "sync_hash field of legacy StateSyncInfo not equal to the key. Something is wrong with this node's catchup info"); + } + let new_info = StateSyncInfo { + state_sync_version: 0, + epoch_first_block: epoch_first_block, + sync_hash: Some(sync_hash), + shards, + }; + update + .set_ser(DBCol::StateDlInfos, &key, &new_info) + .context("failed writing to StateDlInfos")?; + } + update.commit()?; + Ok(()) +} diff --git a/nearcore/src/migrations.rs b/nearcore/src/migrations.rs index fbf5dc9256f..83c26937779 100644 --- a/nearcore/src/migrations.rs +++ b/nearcore/src/migrations.rs @@ -86,6 +86,7 @@ impl<'a> near_store::StoreMigrator for Migrator<'a> { 37 => near_store::migrations::migrate_37_to_38(store), 38 => near_store::migrations::migrate_38_to_39(store), 39 => near_store::migrations::migrate_39_to_40(store), + 40 => near_store::migrations::migrate_40_to_41(store), DB_VERSION.. 
=> unreachable!(), } } From 78d6a756d5dbb5347b1d57c78cb1170dd356f832 Mon Sep 17 00:00:00 2001 From: Marcelo Diop-Gonzalez Date: Wed, 2 Oct 2024 21:55:27 -0400 Subject: [PATCH 20/70] update comment :) --- chain/chain/src/chain.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/chain/chain/src/chain.rs b/chain/chain/src/chain.rs index 6f12ec947ab..9fdcb0e2a4b 100644 --- a/chain/chain/src/chain.rs +++ b/chain/chain/src/chain.rs @@ -4029,7 +4029,7 @@ impl Chain { if !checked_feature!("stable", StateSyncHashUpdate, next_block_protocol_version) { is_epoch_boundary } else { - // FIXME: before submitting, this needs to be fixed. can't be iterating over the whole chain inside of preprocess + // FIXME: this needs to be fixed. can't be iterating over the whole chain inside of preprocess // block like that if there are many missed chunks match self.get_current_epoch_sync_hash(next_block)? { Some(sync_hash) => sync_hash == *next_block.hash(), From 135921d4afeb595402fabc096d0800553cb7f8fb Mon Sep 17 00:00:00 2001 From: Marcelo Diop-Gonzalez Date: Wed, 2 Oct 2024 22:16:26 -0400 Subject: [PATCH 21/70] remove comment --- chain/chain/src/chain.rs | 3 --- 1 file changed, 3 deletions(-) diff --git a/chain/chain/src/chain.rs b/chain/chain/src/chain.rs index c4747a106b4..cc59f3fd39e 100644 --- a/chain/chain/src/chain.rs +++ b/chain/chain/src/chain.rs @@ -782,9 +782,6 @@ impl Chain { } else { debug!(target: "chain", "Downloading state for {:?}, I'm {:?}", shards_to_state_sync, me); - // This is not the way protocol features are normally gated, but this is not actually - // a protocol feature/change. 
The protocol version is just an easy way to coordinate - // among nodes for now let protocol_version = self.epoch_manager.get_epoch_protocol_version(block.header().epoch_id())?; let sync_hash = if checked_feature!("stable", StateSyncHashUpdate, protocol_version) { From 15d707bf610536789bad1f51b36d6f805a49fc2e Mon Sep 17 00:00:00 2001 From: Marcelo Diop-Gonzalez Date: Wed, 2 Oct 2024 22:16:47 -0400 Subject: [PATCH 22/70] fix test --- integration-tests/src/test_loop/tests/state_sync.rs | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/integration-tests/src/test_loop/tests/state_sync.rs b/integration-tests/src/test_loop/tests/state_sync.rs index 1789b14d183..63ba74bb82f 100644 --- a/integration-tests/src/test_loop/tests/state_sync.rs +++ b/integration-tests/src/test_loop/tests/state_sync.rs @@ -111,10 +111,14 @@ fn setup_initial_blockchain(num_shards: usize, chunks_produced: Vec>) genesis_builder.add_user_account_simple(account.clone(), 10000 * ONE_NEAR); } } - let genesis = genesis_builder.build(); - - let env = - builder.genesis(genesis.clone()).clients(clients).chunks_produced(chunks_produced).build(); + let (genesis, epoch_config_store) = genesis_builder.build(); + + let env = builder + .genesis(genesis) + .epoch_config_store(epoch_config_store) + .clients(clients) + .chunks_produced(chunks_produced) + .build(); TestState { env, accounts } } From 776de8f14a3db993f0141788dd3e837e9ed64470 Mon Sep 17 00:00:00 2001 From: Marcelo Diop-Gonzalez Date: Wed, 2 Oct 2024 22:30:40 -0400 Subject: [PATCH 23/70] nit --- core/store/src/migrations.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/store/src/migrations.rs b/core/store/src/migrations.rs index b8b2876b4d1..59a6e79b623 100644 --- a/core/store/src/migrations.rs +++ b/core/store/src/migrations.rs @@ -385,7 +385,7 @@ pub fn migrate_40_to_41(store: &Store) -> anyhow::Result<()> { } let new_info = StateSyncInfo { state_sync_version: 0, - epoch_first_block: 
epoch_first_block, + epoch_first_block, sync_hash: Some(sync_hash), shards, }; From 9ae019ac99fa6df6e57a50342ba787638df9b550 Mon Sep 17 00:00:00 2001 From: Marcelo Diop-Gonzalez Date: Fri, 11 Oct 2024 16:28:57 +0100 Subject: [PATCH 24/70] move find_sync_hash() to Client so we can call it in tests --- chain/client/src/client.rs | 23 +++++++++++++++++++++++ chain/client/src/client_actor.rs | 25 +------------------------ 2 files changed, 24 insertions(+), 24 deletions(-) diff --git a/chain/client/src/client.rs b/chain/client/src/client.rs index e84517458be..73829bb9fb6 100644 --- a/chain/client/src/client.rs +++ b/chain/client/src/client.rs @@ -2547,6 +2547,29 @@ impl Client { Ok(false) } + /// Select the block hash we are using to sync state. It will sync with the state before applying the + /// content of such block. + /// + /// The selected block will always be the first block on a new epoch: + /// . + /// TODO(current_epoch_state_sync): allow the new way of computing the sync hash for syncing to the current epoch + pub fn find_sync_hash(&self) -> Result { + let header_head = self.chain.header_head()?; + let sync_hash = header_head.last_block_hash; + let epoch_start_sync_hash = self.chain.get_epoch_start_sync_hash(&sync_hash)?; + + let genesis_hash = self.chain.genesis().hash(); + tracing::debug!( + target: "sync", + ?header_head, + ?sync_hash, + ?epoch_start_sync_hash, + ?genesis_hash, + "find_sync_hash"); + assert_ne!(&epoch_start_sync_hash, genesis_hash); + Ok(epoch_start_sync_hash) + } + /// Walks through all the ongoing state syncs for future epochs and processes them pub fn run_catchup( &mut self, diff --git a/chain/client/src/client_actor.rs b/chain/client/src/client_actor.rs index e6c3353137f..58b3ea313f8 100644 --- a/chain/client/src/client_actor.rs +++ b/chain/client/src/client_actor.rs @@ -1579,29 +1579,6 @@ impl ClientActorInner { // Sync loop will be started by check_triggers. } - /// Select the block hash we are using to sync state. 
It will sync with the state before applying the - /// content of such block. - /// - /// The selected block will always be the first block on a new epoch: - /// . - /// TODO(current_epoch_state_sync): allow the new way of computing the sync hash for syncing to the current epoch - fn find_sync_hash(&mut self) -> Result { - let header_head = self.client.chain.header_head()?; - let sync_hash = header_head.last_block_hash; - let epoch_start_sync_hash = self.client.chain.get_epoch_start_sync_hash(&sync_hash)?; - - let genesis_hash = self.client.chain.genesis().hash(); - tracing::debug!( - target: "sync", - ?header_head, - ?sync_hash, - ?epoch_start_sync_hash, - ?genesis_hash, - "find_sync_hash"); - assert_ne!(&epoch_start_sync_hash, genesis_hash); - Ok(epoch_start_sync_hash) - } - /// Runs catchup on repeat, if this client is a validator. /// Schedules itself again if it was not ran as response to state parts job result fn catchup(&mut self, ctx: &mut dyn DelayedActionRunner) { @@ -1938,7 +1915,7 @@ impl ClientActorInner { return Ok(false); } - let sync_hash = self.find_sync_hash()?; + let sync_hash = self.client.find_sync_hash()?; if !self.client.config.archive { self.client.chain.mut_chain_store().reset_data_pre_state_sync( sync_hash, From e4649ec89a75f33269e204a66f629682c5a5c66b Mon Sep 17 00:00:00 2001 From: Marcelo Diop-Gonzalez Date: Tue, 15 Oct 2024 13:23:36 -0400 Subject: [PATCH 25/70] refactor notify_start_sync calculation --- chain/client/src/client_actor.rs | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/chain/client/src/client_actor.rs b/chain/client/src/client_actor.rs index 58b3ea313f8..eb78d85e885 100644 --- a/chain/client/src/client_actor.rs +++ b/chain/client/src/client_actor.rs @@ -1715,13 +1715,15 @@ impl ClientActorInner { if !should_state_sync { return; } + let was_state_syncing = matches!(&self.client.sync_status, SyncStatus::StateSync(_)); let update_sync_status_result = self.update_sync_status(); - let 
notify_start_sync = unwrap_and_report_state_sync_result!(update_sync_status_result); + unwrap_and_report_state_sync_result!(update_sync_status_result); let sync_hash = match &self.client.sync_status { SyncStatus::StateSync(s) => s.sync_hash, _ => unreachable!("Sync status should have been StateSync!"), }; + let notify_start_sync = !was_state_syncing; let me = signer.as_ref().map(|x| x.validator_id().clone()); let block_header = self.client.chain.get_block_header(&sync_hash); @@ -1908,11 +1910,9 @@ impl ClientActorInner { } /// Update sync status to StateSync and reset data if needed. - /// Returns true if this is the first time we run state sync and we should - /// notify shards to start syncing. - fn update_sync_status(&mut self) -> Result { + fn update_sync_status(&mut self) -> Result<(), near_chain::Error> { if let SyncStatus::StateSync(_) = self.client.sync_status { - return Ok(false); + return Ok(()); } let sync_hash = self.client.find_sync_hash()?; @@ -1927,8 +1927,7 @@ impl ClientActorInner { let new_sync_status = SyncStatus::StateSync(new_state_sync_status); self.client.sync_status.update(new_sync_status); self.client.last_time_sync_block_requested.clear(); - // This is the first time we run state sync. - return Ok(true); + return Ok(()); } /// This method returns whether we should move on to state sync. 
It may run From 7f42d30513af6e4095081a63b7c22414fdd367b3 Mon Sep 17 00:00:00 2001 From: Marcelo Diop-Gonzalez Date: Tue, 15 Oct 2024 14:34:01 -0400 Subject: [PATCH 26/70] sync the new way when state syncing due to being behind the chain rather than catching up for chunk production --- chain/chain/src/chain.rs | 1 + chain/chain/src/garbage_collection.rs | 1 + chain/client/src/client.rs | 21 ++++++++++++++------- chain/client/src/client_actor.rs | 9 +++++++-- 4 files changed, 23 insertions(+), 9 deletions(-) diff --git a/chain/chain/src/chain.rs b/chain/chain/src/chain.rs index cc59f3fd39e..2df1744a82f 100644 --- a/chain/chain/src/chain.rs +++ b/chain/chain/src/chain.rs @@ -1660,6 +1660,7 @@ impl Chain { let prev_hash = *header.prev_hash(); let prev_block = self.get_block(&prev_hash)?; + // TODO(current_epoch_state_sync): fix this when syncing to the current epoch's state // The congestion control added a dependency on the prev block when // applying chunks in a block. This means that we need to keep the // blocks at sync hash, prev hash and prev prev hash. diff --git a/chain/chain/src/garbage_collection.rs b/chain/chain/src/garbage_collection.rs index 876617fb357..edc6e3e1cbb 100644 --- a/chain/chain/src/garbage_collection.rs +++ b/chain/chain/src/garbage_collection.rs @@ -318,6 +318,7 @@ impl ChainStore { let header = self.get_block_header(&sync_hash)?; let prev_hash = *header.prev_hash(); let sync_height = header.height(); + // TODO(current_epoch_state_sync): fix this when syncing to the current epoch's state // The congestion control added a dependency on the prev block when // applying chunks in a block. This means that we need to keep the // blocks at sync hash, prev hash and prev prev hash. 
The heigh of that diff --git a/chain/client/src/client.rs b/chain/client/src/client.rs index 73829bb9fb6..a448dc5c20e 100644 --- a/chain/client/src/client.rs +++ b/chain/client/src/client.rs @@ -62,6 +62,7 @@ use near_pool::InsertTransactionResult; use near_primitives::block::{Approval, ApprovalInner, ApprovalMessage, Block, BlockHeader, Tip}; use near_primitives::block_header::ApprovalType; use near_primitives::challenge::{Challenge, ChallengeBody, PartialState}; +use near_primitives::checked_feature; use near_primitives::epoch_info::RngSeed; use near_primitives::errors::EpochError; use near_primitives::hash::CryptoHash; @@ -2552,22 +2553,28 @@ impl Client { /// /// The selected block will always be the first block on a new epoch: /// . - /// TODO(current_epoch_state_sync): allow the new way of computing the sync hash for syncing to the current epoch - pub fn find_sync_hash(&self) -> Result { + pub(crate) fn find_sync_hash(&self) -> Result, near_chain::Error> { let header_head = self.chain.header_head()?; - let sync_hash = header_head.last_block_hash; - let epoch_start_sync_hash = self.chain.get_epoch_start_sync_hash(&sync_hash)?; + let header = self.chain.get_block_header(&header_head.last_block_hash)?; + let protocol_version = self.epoch_manager.get_epoch_protocol_version(header.epoch_id())?; + let sync_hash = if checked_feature!("stable", StateSyncHashUpdate, protocol_version) { + match self.chain.get_current_epoch_sync_hash(&header)? { + Some(h) => h, + None => return Ok(None), + } + } else { + self.chain.get_epoch_start_sync_hash(&header_head.last_block_hash)? 
+ }; let genesis_hash = self.chain.genesis().hash(); tracing::debug!( target: "sync", ?header_head, ?sync_hash, - ?epoch_start_sync_hash, ?genesis_hash, "find_sync_hash"); - assert_ne!(&epoch_start_sync_hash, genesis_hash); - Ok(epoch_start_sync_hash) + assert_ne!(&sync_hash, genesis_hash); + Ok(Some(sync_hash)) } /// Walks through all the ongoing state syncs for future epochs and processes them diff --git a/chain/client/src/client_actor.rs b/chain/client/src/client_actor.rs index eb78d85e885..c93fd3ef21e 100644 --- a/chain/client/src/client_actor.rs +++ b/chain/client/src/client_actor.rs @@ -1721,7 +1721,8 @@ impl ClientActorInner { let sync_hash = match &self.client.sync_status { SyncStatus::StateSync(s) => s.sync_hash, - _ => unreachable!("Sync status should have been StateSync!"), + // sync hash isn't known yet. Return and try again later. + _ => return, }; let notify_start_sync = !was_state_syncing; @@ -1915,7 +1916,11 @@ impl ClientActorInner { return Ok(()); } - let sync_hash = self.client.find_sync_hash()?; + let sync_hash = if let Some(sync_hash) = self.client.find_sync_hash()? { + sync_hash + } else { + return Ok(()); + }; if !self.client.config.archive { self.client.chain.mut_chain_store().reset_data_pre_state_sync( sync_hash, From abb174ddd48f54eeac14e37e69faae4785984d53 Mon Sep 17 00:00:00 2001 From: Marcelo Diop-Gonzalez Date: Tue, 15 Oct 2024 15:52:08 -0400 Subject: [PATCH 27/70] fix check_sync_hash_validity() --- chain/chain/src/chain.rs | 29 +++++++++++++++-------------- chain/client/src/client.rs | 2 +- 2 files changed, 16 insertions(+), 15 deletions(-) diff --git a/chain/chain/src/chain.rs b/chain/chain/src/chain.rs index 2df1744a82f..4eafcd47c61 100644 --- a/chain/chain/src/chain.rs +++ b/chain/chain/src/chain.rs @@ -4529,24 +4529,25 @@ impl Chain { } /// Check that sync_hash is the first block of an epoch. 
- /// TODO(current_epoch_state_sync): allow the new way of computing the sync hash for syncing to the current epoch pub fn check_sync_hash_validity(&self, sync_hash: &CryptoHash) -> Result { // It's important to check that Block exists because we will sync with it. // Do not replace with `get_block_header()`. let sync_block = self.get_block(sync_hash)?; - let prev_hash = *sync_block.header().prev_hash(); - let is_first_block_of_epoch = self.epoch_manager.is_next_block_epoch_start(&prev_hash); - tracing::debug!( - target: "chain", - ?sync_hash, - ?prev_hash, - sync_hash_epoch_id = ?sync_block.header().epoch_id(), - sync_hash_next_epoch_id = ?sync_block.header().next_epoch_id(), - ?is_first_block_of_epoch, - "check_sync_hash_validity"); - - // If sync_hash is not on the Epoch boundary, it's malicious behavior - Ok(is_first_block_of_epoch?) + let protocol_version = + self.epoch_manager.get_epoch_protocol_version(sync_block.header().epoch_id())?; + + if checked_feature!("stable", StateSyncHashUpdate, protocol_version) { + // TODO(current_epoch_state_sync): replace this with a more efficient lookup + match self.get_current_epoch_sync_hash(sync_block.header())? { + Some(h) => Ok(*sync_hash == h), + None => Ok(false), + } + } else { + let prev_hash = *sync_block.header().prev_hash(); + let is_first_block_of_epoch = self.epoch_manager.is_next_block_epoch_start(&prev_hash); + // If sync_hash is not on the Epoch boundary, it's malicious behavior + Ok(is_first_block_of_epoch?) + } } /// Get transaction result for given hash of transaction or receipt id diff --git a/chain/client/src/client.rs b/chain/client/src/client.rs index a448dc5c20e..ee9097e04d3 100644 --- a/chain/client/src/client.rs +++ b/chain/client/src/client.rs @@ -2553,7 +2553,7 @@ impl Client { /// /// The selected block will always be the first block on a new epoch: /// . 
- pub(crate) fn find_sync_hash(&self) -> Result, near_chain::Error> { + pub fn find_sync_hash(&self) -> Result, near_chain::Error> { let header_head = self.chain.header_head()?; let header = self.chain.get_block_header(&header_head.last_block_hash)?; let protocol_version = self.epoch_manager.get_epoch_protocol_version(header.epoch_id())?; From e8e9f3593ac95b6119e3986992180f835c6741c6 Mon Sep 17 00:00:00 2001 From: Marcelo Diop-Gonzalez Date: Tue, 15 Oct 2024 15:58:59 -0400 Subject: [PATCH 28/70] fix test_catchup_gas_price_change --- .../src/tests/client/process_blocks.rs | 29 ++++++++++++++----- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/integration-tests/src/tests/client/process_blocks.rs b/integration-tests/src/tests/client/process_blocks.rs index 11d1bd34229..e1c235b94e1 100644 --- a/integration-tests/src/tests/client/process_blocks.rs +++ b/integration-tests/src/tests/client/process_blocks.rs @@ -2381,7 +2381,9 @@ fn test_catchup_gas_price_change() { assert_eq!(env.clients[0].process_tx(tx, false, false), ProcessTxResponse::ValidTx); } - for i in 3..=6 { + // We go up to height 8 because height 6 is the first block of the new epoch, and we want at least + // two more blocks if syncing to the current epoch's state + for i in 3..=8 { let block = env.clients[0].produce_block(i).unwrap().unwrap(); blocks.push(block.clone()); env.process_block(0, block.clone(), Provenance::PRODUCED); @@ -2397,8 +2399,17 @@ fn test_catchup_gas_price_change() { .is_err()); // Simulate state sync - let sync_hash = *blocks[5].hash(); - assert_ne!(blocks[4].header().epoch_id(), blocks[5].header().epoch_id()); + + let sync_hash = env.clients[0].find_sync_hash().unwrap().unwrap(); + let sync_block_idx = blocks + .iter() + .position(|b| *b.hash() == sync_hash) + .expect("block with hash matching sync hash not found"); + if sync_block_idx == 0 { + panic!("sync block should not be the first block produced"); + } + let sync_prev_block = &blocks[sync_block_idx - 1]; + 
assert!(env.clients[0].chain.check_sync_hash_validity(&sync_hash).unwrap()); let state_sync_header = env.clients[0].chain.get_state_response_header(0, sync_hash).unwrap(); let num_parts = state_sync_header.num_state_parts(); @@ -2440,10 +2451,14 @@ fn test_catchup_gas_price_change() { }); env.clients[1].chain.schedule_apply_state_parts(0, sync_hash, num_parts, &f).unwrap(); env.clients[1].chain.set_state_finalize(0, sync_hash).unwrap(); - let chunk_extra_after_sync = - env.clients[1].chain.get_chunk_extra(blocks[4].hash(), &ShardUId::single_shard()).unwrap(); - let expected_chunk_extra = - env.clients[0].chain.get_chunk_extra(blocks[4].hash(), &ShardUId::single_shard()).unwrap(); + let chunk_extra_after_sync = env.clients[1] + .chain + .get_chunk_extra(sync_prev_block.hash(), &ShardUId::single_shard()) + .unwrap(); + let expected_chunk_extra = env.clients[0] + .chain + .get_chunk_extra(sync_prev_block.hash(), &ShardUId::single_shard()) + .unwrap(); // The chunk extra of the prev block of sync block should be the same as the node that it is syncing from assert_eq!(chunk_extra_after_sync, expected_chunk_extra); } From b45cff187d6bdf6cee02fb45a753cd09a50040a8 Mon Sep 17 00:00:00 2001 From: Marcelo Diop-Gonzalez Date: Tue, 15 Oct 2024 17:37:15 -0400 Subject: [PATCH 29/70] fix test_dump_epoch_missing_chunk_in_last_block --- .../src/tests/client/sync_state_nodes.rs | 72 +++++++++++++------ 1 file changed, 52 insertions(+), 20 deletions(-) diff --git a/integration-tests/src/tests/client/sync_state_nodes.rs b/integration-tests/src/tests/client/sync_state_nodes.rs index 71129748dbe..60f7b4b5bdc 100644 --- a/integration-tests/src/tests/client/sync_state_nodes.rs +++ b/integration-tests/src/tests/client/sync_state_nodes.rs @@ -16,6 +16,7 @@ use near_network::tcp; use near_network::test_utils::{convert_boot_nodes, wait_or_timeout, WaitOrTimeoutActor}; use near_o11y::testonly::{init_integration_logger, init_test_logger}; use near_o11y::WithSpanContextExt; +use 
near_primitives::checked_feature; use near_primitives::shard_layout::ShardUId; use near_primitives::state_part::PartId; use near_primitives::state_sync::{CachedParts, StatePartKey}; @@ -552,12 +553,12 @@ fn test_dump_epoch_missing_chunk_in_last_block() { init_test_logger(); let epoch_length = 10; - for num_last_chunks_missing in 0..6 { - assert!(num_last_chunks_missing < epoch_length); + for num_chunks_missing in 0..6 { + assert!(num_chunks_missing < epoch_length); tracing::info!( target: "test", - ?num_last_chunks_missing, + ?num_chunks_missing, "starting test_dump_epoch_missing_chunk_in_last_block" ); let mut genesis = @@ -575,8 +576,35 @@ fn test_dump_epoch_missing_chunk_in_last_block() { let signer = InMemorySigner::from_seed("test0".parse().unwrap(), KeyType::ED25519, "test0") .into(); - let target_height = epoch_length + 1; - for i in 1..=target_height { + + let next_epoch_start = epoch_length + 1; + let protocol_version = env.clients[0] + .epoch_manager + .get_epoch_protocol_version(&EpochId::default()) + .unwrap(); + // Note that the height to skip here refers to the height at which not to produce chunks for the next block, so really + // one before the block height that will have no chunks. The sync_height is the height of the sync_hash block. + let (start_skipping_chunks, sync_height) = + if checked_feature!("stable", StateSyncHashUpdate, protocol_version) { + // At the beginning of the epoch, produce one block with chunks and then start skipping chunks. + let start_skipping_chunks = next_epoch_start + 1; + // Then we will skip `num_chunks_missing` chunks, and produce one more with chunks, which will be the sync height. + let sync_height = start_skipping_chunks + num_chunks_missing + 1; + (start_skipping_chunks, sync_height) + } else { + // here the sync hash is the first hash of the epoch + let sync_height = next_epoch_start; + // skip chunks before the epoch start, but not including the one right before the epochs start. 
+ let start_skipping_chunks = sync_height - num_chunks_missing - 1; + (start_skipping_chunks, sync_height) + }; + + // produce chunks right before the sync hash block, so that the sync hash block itself will have new chunks. + let stop_skipping_chunks = sync_height - 1; + + assert!(sync_height < 2 * epoch_length + 1); + + for i in 1..=sync_height { tracing::info!( target: "test", height=i, @@ -585,9 +613,7 @@ fn test_dump_epoch_missing_chunk_in_last_block() { let block = env.clients[0].produce_block(i).unwrap().unwrap(); blocks.push(block.clone()); - if (i % epoch_length) != 0 - && epoch_length - (i % epoch_length) <= num_last_chunks_missing - { + if i >= start_skipping_chunks && i < stop_skipping_chunks { // Don't produce chunks for the last blocks of an epoch. env.clients[0] .process_block_test_no_produce_chunk( @@ -622,13 +648,17 @@ fn test_dump_epoch_missing_chunk_in_last_block() { // Simulate state sync tracing::info!(target: "test", "state sync - get parts"); - // No blocks were skipped, therefore we can compute the block height of the first block of the current epoch. 
- let sync_hash_height = ((target_height / epoch_length) * epoch_length + 1) as usize; - let sync_hash = *blocks[sync_hash_height].hash(); - assert_ne!( - blocks[sync_hash_height].header().epoch_id(), - blocks[sync_hash_height - 1].header().epoch_id() - ); + let sync_hash = env.clients[0].find_sync_hash().unwrap().unwrap(); + let sync_block_idx = blocks + .iter() + .position(|b| *b.hash() == sync_hash) + .expect("block with hash matching sync hash not found"); + let sync_block = &blocks[sync_block_idx]; + if sync_block_idx == 0 { + panic!("sync block should not be the first block produced"); + } + let sync_prev_block = &blocks[sync_block_idx - 1]; + let sync_prev_height_included = sync_prev_block.chunks()[0].height_included(); let state_sync_header = env.clients[0].chain.get_state_response_header(0, sync_hash).unwrap(); @@ -644,7 +674,7 @@ fn test_dump_epoch_missing_chunk_in_last_block() { tracing::info!(target: "test", "state sync - apply parts"); env.clients[1].chain.reset_data_pre_state_sync(sync_hash).unwrap(); - let epoch_id = blocks.last().unwrap().header().epoch_id(); + let epoch_id = sync_block.header().epoch_id(); for i in 0..num_parts { env.clients[1] .runtime_adapter @@ -708,9 +738,11 @@ fn test_dump_epoch_missing_chunk_in_last_block() { tracing::info!(target: "test", "state sync - set state finalize"); env.clients[1].chain.set_state_finalize(0, sync_hash).unwrap(); - let last_chunk_height = epoch_length - num_last_chunks_missing; - for height in 1..epoch_length { - if height < last_chunk_height { + // We apply chunks from the block with height `sync_prev_height_included` up to `sync_prev`. So there should + // be chunk extras for those all equal to the chunk extra for `sync_prev_height_included`, and no chunk extras + // for any other height. 
+ for height in 1..sync_height { + if height < sync_prev_height_included || height >= sync_height { assert!(env.clients[1] .chain .get_chunk_extra(blocks[height as usize].hash(), &ShardUId::single_shard()) @@ -723,7 +755,7 @@ fn test_dump_epoch_missing_chunk_in_last_block() { let expected_chunk_extra = env.clients[0] .chain .get_chunk_extra( - blocks[last_chunk_height as usize].hash(), + blocks[sync_prev_height_included as usize].hash(), &ShardUId::single_shard(), ) .unwrap(); From f6d860f72c898bc8e2ec72d5e18e44f47ef23423 Mon Sep 17 00:00:00 2001 From: Marcelo Diop-Gonzalez Date: Tue, 15 Oct 2024 23:28:14 -0400 Subject: [PATCH 30/70] fix test_state_sync_headers --- integration-tests/src/tests/client/sync_state_nodes.rs | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/integration-tests/src/tests/client/sync_state_nodes.rs b/integration-tests/src/tests/client/sync_state_nodes.rs index 60f7b4b5bdc..5d9f7ca14ac 100644 --- a/integration-tests/src/tests/client/sync_state_nodes.rs +++ b/integration-tests/src/tests/client/sync_state_nodes.rs @@ -24,6 +24,7 @@ use near_primitives::transaction::SignedTransaction; use near_primitives::types::{BlockId, BlockReference, EpochId, EpochReference}; use near_primitives::utils::MaybeValidated; use near_primitives_core::types::ShardId; +use near_primitives_core::version::PROTOCOL_VERSION; use near_store::adapter::StoreUpdateAdapter; use near_store::DBCol; use nearcore::test_utils::TestEnvNightshadeSetupExt; @@ -827,9 +828,16 @@ fn test_state_sync_headers() { }; tracing::info!(epoch_start_height, "got epoch_start_height"); + let sync_height = + if checked_feature!("stable", StateSyncHashUpdate, PROTOCOL_VERSION) { + // here since there's only one block/chunk producer, we assume that no blocks will be missing chunks. 
+ epoch_start_height + 2 + } else { + epoch_start_height + }; let sync_hash_and_num_shards = match view_client1 .send( - GetBlock(BlockReference::BlockId(BlockId::Height(epoch_start_height))) + GetBlock(BlockReference::BlockId(BlockId::Height(sync_height))) .with_span_context(), ) .await From 8c58ee5072d50859239891f115f9d104b9b2cf47 Mon Sep 17 00:00:00 2001 From: Marcelo Diop-Gonzalez Date: Tue, 15 Oct 2024 23:31:13 -0400 Subject: [PATCH 31/70] fix test_state_sync_headers_no_tracked_shards --- integration-tests/src/tests/client/sync_state_nodes.rs | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/integration-tests/src/tests/client/sync_state_nodes.rs b/integration-tests/src/tests/client/sync_state_nodes.rs index 5d9f7ca14ac..1769048b997 100644 --- a/integration-tests/src/tests/client/sync_state_nodes.rs +++ b/integration-tests/src/tests/client/sync_state_nodes.rs @@ -1051,9 +1051,16 @@ fn test_state_sync_headers_no_tracked_shards() { return ControlFlow::Continue(()); } + let sync_height = + if checked_feature!("stable", StateSyncHashUpdate, PROTOCOL_VERSION) { + // here since there's only one block/chunk producer, we assume that no blocks will be missing chunks. 
+ epoch_start_height + 2 + } else { + epoch_start_height + }; let sync_hash_and_num_shards = match view_client2 .send( - GetBlock(BlockReference::BlockId(BlockId::Height(epoch_start_height))) + GetBlock(BlockReference::BlockId(BlockId::Height(sync_height))) .with_span_context(), ) .await From 6fa6c3c6e460c786055bced1bf830602c8b723ec Mon Sep 17 00:00:00 2001 From: Marcelo Diop-Gonzalez Date: Wed, 16 Oct 2024 00:24:09 -0400 Subject: [PATCH 32/70] fix test_sync_and_call_cached_contract --- integration-tests/src/tests/client/process_blocks.rs | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/integration-tests/src/tests/client/process_blocks.rs b/integration-tests/src/tests/client/process_blocks.rs index e1c235b94e1..3424387537b 100644 --- a/integration-tests/src/tests/client/process_blocks.rs +++ b/integration-tests/src/tests/client/process_blocks.rs @@ -39,6 +39,7 @@ use near_parameters::{ActionCosts, ExtCosts}; use near_parameters::{RuntimeConfig, RuntimeConfigStore}; use near_primitives::block::Approval; use near_primitives::block_header::BlockHeader; +use near_primitives::checked_feature; use near_primitives::errors::TxExecutionError; use near_primitives::errors::{ActionError, ActionErrorKind, InvalidTxError}; use near_primitives::hash::{hash, CryptoHash}; @@ -3672,8 +3673,15 @@ mod contract_precompilation_tests { start_height, ); + let sync_height = if checked_feature!("stable", StateSyncHashUpdate, PROTOCOL_VERSION) { + // `height` is one more than the start of the epoch. Produce two more blocks with chunks. + produce_blocks_from_height(&mut env, 2, height) - 1 + } else { + height - 1 + }; + // Perform state sync for the second client. - state_sync_on_height(&mut env, height - 1); + state_sync_on_height(&mut env, sync_height); // Check existence of contract in both caches. 
let contract_code = ContractCode::new(wasm_code.clone(), None); @@ -3693,7 +3701,7 @@ mod contract_precompilation_tests { // Check that contract function may be successfully called on the second client. // Note that we can't test that behaviour is the same on two clients, because // compile_module_cached_wasmer0 is cached by contract key via macro. - let block = env.clients[0].chain.get_block_by_height(EPOCH_LENGTH).unwrap(); + let block = env.clients[0].chain.get_block_by_height(sync_height - 1).unwrap(); let chunk_extra = env.clients[0].chain.get_chunk_extra(block.hash(), &ShardUId::single_shard()).unwrap(); let state_root = *chunk_extra.state_root(); From 7be867918d6f0685dce71992c4179ba0dc3426b9 Mon Sep 17 00:00:00 2001 From: Marcelo Diop-Gonzalez Date: Wed, 16 Oct 2024 00:30:44 -0400 Subject: [PATCH 33/70] fix test_two_deployments --- integration-tests/src/tests/client/process_blocks.rs | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/integration-tests/src/tests/client/process_blocks.rs b/integration-tests/src/tests/client/process_blocks.rs index 3424387537b..155b0edfd0c 100644 --- a/integration-tests/src/tests/client/process_blocks.rs +++ b/integration-tests/src/tests/client/process_blocks.rs @@ -3742,6 +3742,7 @@ mod contract_precompilation_tests { #[test] #[cfg_attr(all(target_arch = "aarch64", target_vendor = "apple"), ignore)] fn test_two_deployments() { + init_integration_logger(); let num_clients = 2; let mut genesis = Genesis::test(vec!["test0".parse().unwrap(), "test1".parse().unwrap()], 1); @@ -3782,11 +3783,18 @@ mod contract_precompilation_tests { height, ); + let sync_height = if checked_feature!("stable", StateSyncHashUpdate, PROTOCOL_VERSION) { + // `height` is one more than the start of the epoch. Produce two more blocks with chunks. + produce_blocks_from_height(&mut env, 2, height) - 1 + } else { + height - 1 + }; + // Perform state sync for the second client on the last produced height. 
- state_sync_on_height(&mut env, height - 1); + state_sync_on_height(&mut env, sync_height); let epoch_id = - *env.clients[0].chain.get_block_by_height(height - 1).unwrap().header().epoch_id(); + *env.clients[0].chain.get_block_by_height(sync_height).unwrap().header().epoch_id(); let runtime_config = env.get_runtime_config(0, epoch_id); let tiny_contract_key = get_contract_cache_key( *ContractCode::new(tiny_wasm_code.clone(), None).hash(), From 80174d5aac12771112513b3f5b8bf6471bff1b2a Mon Sep 17 00:00:00 2001 From: Marcelo Diop-Gonzalez Date: Wed, 16 Oct 2024 12:31:01 -0400 Subject: [PATCH 34/70] fix test_sync_after_delete_account --- integration-tests/src/tests/client/process_blocks.rs | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/integration-tests/src/tests/client/process_blocks.rs b/integration-tests/src/tests/client/process_blocks.rs index 155b0edfd0c..58ed9365f19 100644 --- a/integration-tests/src/tests/client/process_blocks.rs +++ b/integration-tests/src/tests/client/process_blocks.rs @@ -3861,7 +3861,14 @@ mod contract_precompilation_tests { env.clients[0].process_tx(delete_account_tx, false, false), ProcessTxResponse::ValidTx ); - height = produce_blocks_from_height(&mut env, EPOCH_LENGTH + 1, height); + // `height` is the first block of a new epoch (which has not been produced yet), + // so if we want to state sync the old way, we produce `EPOCH_LENGTH` + 1 new blocks + // to get to produce the first block of the next epoch. If we want to state sync the new + // way, we produce two more than that + let num_new_blocks_in_epoch = + if checked_feature!("stable", StateSyncHashUpdate, PROTOCOL_VERSION) { 3 } else { 1 }; + height = + produce_blocks_from_height(&mut env, EPOCH_LENGTH + num_new_blocks_in_epoch, height); // Perform state sync for the second client. 
state_sync_on_height(&mut env, height - 1); From 97d3abc68f27ae59dabcaa35ca9bf9b12a0d8e20 Mon Sep 17 00:00:00 2001 From: Marcelo Diop-Gonzalez Date: Fri, 18 Oct 2024 17:17:54 -0400 Subject: [PATCH 35/70] fix test_state_sync_w_dumped_parts --- .../src/tests/client/state_dump.rs | 25 ++++++++++++------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/integration-tests/src/tests/client/state_dump.rs b/integration-tests/src/tests/client/state_dump.rs index 266f426458c..2cb45378801 100644 --- a/integration-tests/src/tests/client/state_dump.rs +++ b/integration-tests/src/tests/client/state_dump.rs @@ -191,6 +191,7 @@ fn run_state_sync_with_dumped_parts( account_creation_at_epoch_height * epoch_length + 1 }; + let mut sync_hash = None; let signer: Signer = signer.into(); for i in 1..=dump_node_head_height { if i == account_creation_at_height { @@ -209,7 +210,21 @@ fn run_state_sync_with_dumped_parts( blocks.push(block.clone()); env.process_block(0, block.clone(), Provenance::PRODUCED); env.process_block(1, block.clone(), Provenance::NONE); + + if block.header().epoch_id() != &Default::default() { + let final_header = env.clients[0] + .chain + .get_block_header(block.header().last_final_block()) + .unwrap(); + if block.header().epoch_id() == final_header.epoch_id() { + if let Some(current_sync_hash) = env.clients[0].find_sync_hash().unwrap() { + sync_hash = Some(current_sync_hash); + } + } + } } + // We must have seen at least one block that was two ahead of the epoch start + let sync_hash = sync_hash.unwrap(); // check that the new account exists let head = env.clients[0].chain.head().unwrap(); @@ -250,14 +265,6 @@ fn run_state_sync_with_dumped_parts( let epoch_info = epoch_manager.get_epoch_info(&epoch_id).unwrap(); let epoch_height = epoch_info.epoch_height(); - let sync_block_height = (epoch_length * epoch_height + 1) as usize; - let sync_hash = *blocks[sync_block_height - 1].hash(); - - // the block at sync_block_height should be the start of an epoch - 
assert_ne!( - blocks[sync_block_height - 1].header().epoch_id(), - blocks[sync_block_height - 2].header().epoch_id() - ); assert!(env.clients[0].chain.check_sync_hash_validity(&sync_hash).unwrap()); let state_sync_header = env.clients[0].chain.get_state_response_header(0, sync_hash).unwrap(); @@ -380,7 +387,7 @@ fn run_state_sync_with_dumped_parts( } #[test] -/// This test verifies that after state sync, the syncing node has the data that corresponds to the state of the epoch previous to the dumping node's final block. +/// This test verifies that after state sync, the syncing node has the data that corresponds to the state of the epoch previous (or current) to the dumping node's final block. /// Specifically, it tests that the above holds true in both conditions: /// - the dumping node's head is in new epoch but final block is not; /// - the dumping node's head and final block are in same epoch From 3566cd033325e5400b837ea87f9ad60f945c69b5 Mon Sep 17 00:00:00 2001 From: Marcelo Diop-Gonzalez Date: Fri, 18 Oct 2024 20:33:38 -0400 Subject: [PATCH 36/70] use ProtocolFeature::enabled() --- chain/chain/src/chain.rs | 6 ++-- chain/client/src/client.rs | 3 +- .../src/tests/client/process_blocks.rs | 7 ++-- .../src/tests/client/sync_state_nodes.rs | 33 +++++++++---------- nearcore/src/state_sync.rs | 13 ++++---- 5 files changed, 29 insertions(+), 33 deletions(-) diff --git a/chain/chain/src/chain.rs b/chain/chain/src/chain.rs index 4eafcd47c61..f2e628fc8ca 100644 --- a/chain/chain/src/chain.rs +++ b/chain/chain/src/chain.rs @@ -784,7 +784,7 @@ impl Chain { let protocol_version = self.epoch_manager.get_epoch_protocol_version(block.header().epoch_id())?; - let sync_hash = if checked_feature!("stable", StateSyncHashUpdate, protocol_version) { + let sync_hash = if ProtocolFeature::StateSyncHashUpdate.enabled(protocol_version) { None } else { Some(*block.header().hash()) @@ -3944,7 +3944,7 @@ impl Chain { let next_block_protocol_version = 
self.epoch_manager.get_epoch_protocol_version(&next_block_epoch)?; let next_block_is_sync_block = - if !checked_feature!("stable", StateSyncHashUpdate, next_block_protocol_version) { + if !ProtocolFeature::StateSyncHashUpdate.enabled(next_block_protocol_version) { is_epoch_boundary } else { // FIXME: this needs to be fixed. can't be iterating over the whole chain inside of preprocess @@ -4536,7 +4536,7 @@ impl Chain { let protocol_version = self.epoch_manager.get_epoch_protocol_version(sync_block.header().epoch_id())?; - if checked_feature!("stable", StateSyncHashUpdate, protocol_version) { + if ProtocolFeature::StateSyncHashUpdate.enabled(protocol_version) { // TODO(current_epoch_state_sync): replace this with a more efficient lookup match self.get_current_epoch_sync_hash(sync_block.header())? { Some(h) => Ok(*sync_hash == h), diff --git a/chain/client/src/client.rs b/chain/client/src/client.rs index ee9097e04d3..5e8eba62759 100644 --- a/chain/client/src/client.rs +++ b/chain/client/src/client.rs @@ -62,7 +62,6 @@ use near_pool::InsertTransactionResult; use near_primitives::block::{Approval, ApprovalInner, ApprovalMessage, Block, BlockHeader, Tip}; use near_primitives::block_header::ApprovalType; use near_primitives::challenge::{Challenge, ChallengeBody, PartialState}; -use near_primitives::checked_feature; use near_primitives::epoch_info::RngSeed; use near_primitives::errors::EpochError; use near_primitives::hash::CryptoHash; @@ -2557,7 +2556,7 @@ impl Client { let header_head = self.chain.header_head()?; let header = self.chain.get_block_header(&header_head.last_block_hash)?; let protocol_version = self.epoch_manager.get_epoch_protocol_version(header.epoch_id())?; - let sync_hash = if checked_feature!("stable", StateSyncHashUpdate, protocol_version) { + let sync_hash = if ProtocolFeature::StateSyncHashUpdate.enabled(protocol_version) { match self.chain.get_current_epoch_sync_hash(&header)? 
{ Some(h) => h, None => return Ok(None), diff --git a/integration-tests/src/tests/client/process_blocks.rs b/integration-tests/src/tests/client/process_blocks.rs index 58ed9365f19..67ae50decc7 100644 --- a/integration-tests/src/tests/client/process_blocks.rs +++ b/integration-tests/src/tests/client/process_blocks.rs @@ -39,7 +39,6 @@ use near_parameters::{ActionCosts, ExtCosts}; use near_parameters::{RuntimeConfig, RuntimeConfigStore}; use near_primitives::block::Approval; use near_primitives::block_header::BlockHeader; -use near_primitives::checked_feature; use near_primitives::errors::TxExecutionError; use near_primitives::errors::{ActionError, ActionErrorKind, InvalidTxError}; use near_primitives::hash::{hash, CryptoHash}; @@ -3673,7 +3672,7 @@ mod contract_precompilation_tests { start_height, ); - let sync_height = if checked_feature!("stable", StateSyncHashUpdate, PROTOCOL_VERSION) { + let sync_height = if ProtocolFeature::StateSyncHashUpdate.enabled(PROTOCOL_VERSION) { // `height` is one more than the start of the epoch. Produce two more blocks with chunks. produce_blocks_from_height(&mut env, 2, height) - 1 } else { @@ -3783,7 +3782,7 @@ mod contract_precompilation_tests { height, ); - let sync_height = if checked_feature!("stable", StateSyncHashUpdate, PROTOCOL_VERSION) { + let sync_height = if ProtocolFeature::StateSyncHashUpdate.enabled(PROTOCOL_VERSION) { // `height` is one more than the start of the epoch. Produce two more blocks with chunks. produce_blocks_from_height(&mut env, 2, height) - 1 } else { @@ -3866,7 +3865,7 @@ mod contract_precompilation_tests { // to get to produce the first block of the next epoch. 
If we want to state sync the new // way, we produce two more than that let num_new_blocks_in_epoch = - if checked_feature!("stable", StateSyncHashUpdate, PROTOCOL_VERSION) { 3 } else { 1 }; + if ProtocolFeature::StateSyncHashUpdate.enabled(PROTOCOL_VERSION) { 3 } else { 1 }; height = produce_blocks_from_height(&mut env, EPOCH_LENGTH + num_new_blocks_in_epoch, height); diff --git a/integration-tests/src/tests/client/sync_state_nodes.rs b/integration-tests/src/tests/client/sync_state_nodes.rs index 1769048b997..5441bc76f72 100644 --- a/integration-tests/src/tests/client/sync_state_nodes.rs +++ b/integration-tests/src/tests/client/sync_state_nodes.rs @@ -16,7 +16,6 @@ use near_network::tcp; use near_network::test_utils::{convert_boot_nodes, wait_or_timeout, WaitOrTimeoutActor}; use near_o11y::testonly::{init_integration_logger, init_test_logger}; use near_o11y::WithSpanContextExt; -use near_primitives::checked_feature; use near_primitives::shard_layout::ShardUId; use near_primitives::state_part::PartId; use near_primitives::state_sync::{CachedParts, StatePartKey}; @@ -24,7 +23,7 @@ use near_primitives::transaction::SignedTransaction; use near_primitives::types::{BlockId, BlockReference, EpochId, EpochReference}; use near_primitives::utils::MaybeValidated; use near_primitives_core::types::ShardId; -use near_primitives_core::version::PROTOCOL_VERSION; +use near_primitives_core::version::{ProtocolFeature, PROTOCOL_VERSION}; use near_store::adapter::StoreUpdateAdapter; use near_store::DBCol; use nearcore::test_utils::TestEnvNightshadeSetupExt; @@ -586,7 +585,7 @@ fn test_dump_epoch_missing_chunk_in_last_block() { // Note that the height to skip here refers to the height at which not to produce chunks for the next block, so really // one before the block height that will have no chunks. The sync_height is the height of the sync_hash block. 
let (start_skipping_chunks, sync_height) = - if checked_feature!("stable", StateSyncHashUpdate, protocol_version) { + if ProtocolFeature::StateSyncHashUpdate.enabled(protocol_version) { // At the beginning of the epoch, produce one block with chunks and then start skipping chunks. let start_skipping_chunks = next_epoch_start + 1; // Then we will skip `num_chunks_missing` chunks, and produce one more with chunks, which will be the sync height. @@ -828,13 +827,13 @@ fn test_state_sync_headers() { }; tracing::info!(epoch_start_height, "got epoch_start_height"); - let sync_height = - if checked_feature!("stable", StateSyncHashUpdate, PROTOCOL_VERSION) { - // here since there's only one block/chunk producer, we assume that no blocks will be missing chunks. - epoch_start_height + 2 - } else { - epoch_start_height - }; + let sync_height = if ProtocolFeature::StateSyncHashUpdate.enabled(PROTOCOL_VERSION) + { + // here since there's only one block/chunk producer, we assume that no blocks will be missing chunks. + epoch_start_height + 2 + } else { + epoch_start_height + }; let sync_hash_and_num_shards = match view_client1 .send( GetBlock(BlockReference::BlockId(BlockId::Height(sync_height))) @@ -1051,13 +1050,13 @@ fn test_state_sync_headers_no_tracked_shards() { return ControlFlow::Continue(()); } - let sync_height = - if checked_feature!("stable", StateSyncHashUpdate, PROTOCOL_VERSION) { - // here since there's only one block/chunk producer, we assume that no blocks will be missing chunks. - epoch_start_height + 2 - } else { - epoch_start_height - }; + let sync_height = if ProtocolFeature::StateSyncHashUpdate.enabled(PROTOCOL_VERSION) + { + // here since there's only one block/chunk producer, we assume that no blocks will be missing chunks. 
+ epoch_start_height + 2 + } else { + epoch_start_height + }; let sync_hash_and_num_shards = match view_client2 .send( GetBlock(BlockReference::BlockId(BlockId::Height(sync_height))) diff --git a/nearcore/src/state_sync.rs b/nearcore/src/state_sync.rs index 727bacdd525..2cbdfe357c0 100644 --- a/nearcore/src/state_sync.rs +++ b/nearcore/src/state_sync.rs @@ -18,11 +18,11 @@ use near_client::sync::external::{ use near_client::sync::state::STATE_DUMP_ITERATION_TIME_LIMIT_SECS; use near_epoch_manager::shard_tracker::ShardTracker; use near_epoch_manager::EpochManagerAdapter; -use near_primitives::checked_feature; use near_primitives::hash::CryptoHash; use near_primitives::state_part::PartId; use near_primitives::state_sync::{StatePartKey, StateSyncDumpProgress}; use near_primitives::types::{AccountId, EpochHeight, EpochId, ShardId, StateRoot}; +use near_primitives::version::ProtocolFeature; use near_store::DBCol; use rand::{thread_rng, Rng}; use std::collections::HashSet; @@ -677,12 +677,11 @@ fn get_latest_epoch( // This is not the way protocol features are normally gated, but this is not actually // a protocol feature/change. The protocol version is just an easy way to coordinate // among nodes for now - let sync_hash = - if checked_feature!("stable", StateSyncHashUpdate, epoch_info.protocol_version()) { - chain.get_current_epoch_sync_hash(&final_block_header)? - } else { - Some(chain.get_epoch_start_sync_hash(final_hash)?) - }; + let sync_hash = if ProtocolFeature::StateSyncHashUpdate.enabled(epoch_info.protocol_version()) { + chain.get_current_epoch_sync_hash(&final_block_header)? + } else { + Some(chain.get_epoch_start_sync_hash(final_hash)?) 
+ }; tracing::debug!(target: "state_sync_dump", ?final_hash, ?sync_hash, ?epoch_id, epoch_height, "get_latest_epoch"); Ok(LatestEpochInfo { prev_epoch_id, epoch_id, epoch_height, sync_hash }) From 105257e33303169920a02a27422ce2378e70206f Mon Sep 17 00:00:00 2001 From: Marcelo Diop-Gonzalez Date: Mon, 21 Oct 2024 14:37:56 -0400 Subject: [PATCH 37/70] delete comment --- nearcore/src/state_sync.rs | 3 --- 1 file changed, 3 deletions(-) diff --git a/nearcore/src/state_sync.rs b/nearcore/src/state_sync.rs index 2cbdfe357c0..2e0fd302a01 100644 --- a/nearcore/src/state_sync.rs +++ b/nearcore/src/state_sync.rs @@ -674,9 +674,6 @@ fn get_latest_epoch( let prev_epoch_id = epoch_manager.get_prev_epoch_id_from_prev_block(&head.prev_block_hash)?; let epoch_height = epoch_info.epoch_height(); - // This is not the way protocol features are normally gated, but this is not actually - // a protocol feature/change. The protocol version is just an easy way to coordinate - // among nodes for now let sync_hash = if ProtocolFeature::StateSyncHashUpdate.enabled(epoch_info.protocol_version()) { chain.get_current_epoch_sync_hash(&final_block_header)? } else { From c008b90e26ef20597c45c68d8dae84cd7464d650 Mon Sep 17 00:00:00 2001 From: Marcelo Diop-Gonzalez Date: Mon, 21 Oct 2024 14:49:56 -0400 Subject: [PATCH 38/70] don't pass block header to should_make_or_delete_snapshot() This makes it simpler overall. Also refactor this function to return an enum saying what to do instead of two bools, which was a little bit confusing before. 
--- chain/chain/src/chain.rs | 177 ++++++++++-------- chain/client/src/client.rs | 2 +- .../src/tests/client/process_blocks.rs | 31 +-- .../src/tests/client/sync_state_nodes.rs | 3 +- nearcore/src/state_sync.rs | 2 +- 5 files changed, 117 insertions(+), 98 deletions(-) diff --git a/chain/chain/src/chain.rs b/chain/chain/src/chain.rs index f2e628fc8ca..8a8560d48e6 100644 --- a/chain/chain/src/chain.rs +++ b/chain/chain/src/chain.rs @@ -304,6 +304,14 @@ pub enum VerifyBlockHashAndSignatureResult { CannotVerifyBecauseBlockIsOrphan, } +/// returned by should_make_or_delete_snapshot(), this type tells what we should do to the state snapshot +enum SnapshotAction { + /// Make a new snapshot. Contains the prev_hash of the sync_hash that is used for state sync + MakeSnapshot(CryptoHash), + DeleteSnapshot, + None, +} + impl Chain { /// Builds genesis block and chunks from the current configuration obtained through the arguments. pub fn make_genesis_block( @@ -1808,7 +1816,7 @@ impl Chain { let (apply_chunk_work, block_preprocess_info) = preprocess_res; // 2) Start creating snapshot if needed. - if let Err(err) = self.process_snapshot(block.header()) { + if let Err(err) = self.process_snapshot() { tracing::error!(target: "state_snapshot", ?err, "Failed to make a state snapshot"); } @@ -2429,13 +2437,12 @@ impl Chain { // when this is no longer needed in this file after we more efficiently // implement should_make_or_delete_snapshot(). Also, the logic in this function can probably // be made simpler, as we shouldn't need to iterate over all blocks just to find the epoch start. - fn get_epoch_start_sync_hash_impl( - &self, - sync_header: &BlockHeader, - ) -> Result { - let mut epoch_id = *sync_header.epoch_id(); - let mut hash = *sync_header.hash(); - let mut prev_hash = *sync_header.prev_hash(); + /// Find the hash of the first block on the same epoch (and chain) of block with hash `sync_hash`. 
+ pub fn get_epoch_start_sync_hash(&self, sync_hash: &CryptoHash) -> Result { + let header = self.get_block_header(sync_hash)?; + let mut epoch_id = *header.epoch_id(); + let mut hash = *header.hash(); + let mut prev_hash = *header.prev_hash(); loop { if prev_hash == CryptoHash::default() { return Ok(hash); @@ -2450,22 +2457,11 @@ impl Chain { } } - /// Find the hash of the first block on the same epoch (and chain) of block with hash `sync_hash`. - pub fn get_epoch_start_sync_hash(&self, sync_hash: &CryptoHash) -> Result { - let header = self.get_block_header(sync_hash)?; - self.get_epoch_start_sync_hash_impl(&header) - } - // Returns the first block for which at least two new chunks have been produced for every shard in the epoch // `next_header` is the next block header that is being processed in start_process_block_impl() - pub fn get_current_epoch_sync_hash( - &self, - next_header: &BlockHeader, - ) -> Result, Error> { - let epoch_start = Self::get_epoch_start_sync_hash_impl(self, next_header)?; - if epoch_start == *next_header.hash() { - return Ok(None); - } + pub fn get_current_epoch_sync_hash(&self) -> Result, Error> { + let head = self.head()?; + let epoch_start = Self::get_epoch_start_sync_hash(self, &head.last_block_hash)?; let mut header = self.get_block_header(&epoch_start)?; let shard_ids = self.epoch_manager.shard_ids(header.epoch_id())?; @@ -2473,12 +2469,12 @@ impl Chain { shard_ids.iter().map(|shard_id| (*shard_id, 0)).collect(); loop { - header = if header.hash() == next_header.prev_hash() { - next_header.clone() - } else { - let next_hash = self.chain_store().get_next_block_hash(header.hash())?; - self.get_block_header(&next_hash)? 
+ let next_hash = match self.chain_store().get_next_block_hash(header.hash()) { + Ok(h) => h, + Err(Error::DBNotFoundErr(_)) => return Ok(None), + Err(e) => return Err(e), }; + header = self.get_block_header(&next_hash)?; let mut done = true; for (shard_id, num_new_chunks) in num_new_chunks.iter_mut() { @@ -2502,9 +2498,6 @@ impl Chain { if done { return Ok(Some(*header.hash())); } - if header.hash() == next_header.hash() { - return Ok(None); - } } } @@ -3875,26 +3868,35 @@ impl Chain { } /// Function to create or delete a snapshot if necessary. - /// TODO: this function calls head() inside of start_process_block_impl(), consider changing to next_block.header().prev_hash() - fn process_snapshot(&mut self, next_block: &BlockHeader) -> Result<(), Error> { - let (make_snapshot, delete_snapshot) = self.should_make_or_delete_snapshot(next_block)?; - if !make_snapshot && !delete_snapshot { - return Ok(()); - } + /// TODO: this function calls head() inside of start_process_block_impl(), consider moving this to be called right after HEAD gets updated + fn process_snapshot(&mut self) -> Result<(), Error> { + let snapshot_action = self.should_make_or_delete_snapshot()?; let Some(snapshot_callbacks) = &self.snapshot_callbacks else { return Ok(()) }; - if make_snapshot { - let head = self.head()?; - let prev_hash = head.prev_block_hash; - let epoch_height = self.epoch_manager.get_epoch_height_from_prev_block(&prev_hash)?; - let shard_layout = &self.epoch_manager.get_shard_layout_from_prev_block(&prev_hash)?; - let shard_uids = shard_layout.shard_uids().collect(); - let last_block = self.get_block(&head.last_block_hash)?; - let make_snapshot_callback = &snapshot_callbacks.make_snapshot_callback; - make_snapshot_callback(prev_hash, epoch_height, shard_uids, last_block); - } else if delete_snapshot { - let delete_snapshot_callback = &snapshot_callbacks.delete_snapshot_callback; - delete_snapshot_callback(); - } + match snapshot_action { + SnapshotAction::MakeSnapshot(prev_hash) 
=> { + let block = self.get_block(&prev_hash)?; + let epoch_height = self + .epoch_manager + .get_epoch_height_from_prev_block(block.header().prev_hash())?; + let shard_layout = &self + .epoch_manager + .get_shard_layout_from_prev_block(block.header().prev_hash())?; + let shard_uids = shard_layout.shard_uids().collect(); + + let make_snapshot_callback = &snapshot_callbacks.make_snapshot_callback; + make_snapshot_callback( + *block.header().prev_hash(), + epoch_height, + shard_uids, + block, + ); + } + SnapshotAction::DeleteSnapshot => { + let delete_snapshot_callback = &snapshot_callbacks.delete_snapshot_callback; + delete_snapshot_callback(); + } + SnapshotAction::None => {} + }; Ok(()) } @@ -3924,51 +3926,64 @@ impl Chain { /// Function to check whether we need to create a new snapshot while processing the current block /// Note that this functions is called as a part of block preprocesing, so the head is not updated to current block - fn should_make_or_delete_snapshot( - &mut self, - next_block: &BlockHeader, - ) -> Result<(bool, bool), Error> { + fn should_make_or_delete_snapshot(&mut self) -> Result { // head value is that of the previous block, i.e. curr_block.prev_hash let head = self.head()?; if head.prev_block_hash == CryptoHash::default() { // genesis block, do not snapshot - return Ok((false, false)); + return Ok(SnapshotAction::None); } let is_epoch_boundary = self.epoch_manager.is_next_block_epoch_start(&head.last_block_hash)?; let will_shard_layout_change = self.epoch_manager.will_shard_layout_change(&head.last_block_hash)?; - let next_block_epoch = - self.epoch_manager.get_epoch_id_from_prev_block(&head.last_block_hash)?; - let next_block_protocol_version = - self.epoch_manager.get_epoch_protocol_version(&next_block_epoch)?; - let next_block_is_sync_block = - if !ProtocolFeature::StateSyncHashUpdate.enabled(next_block_protocol_version) { - is_epoch_boundary - } else { - // FIXME: this needs to be fixed. 
can't be iterating over the whole chain inside of preprocess - // block like that if there are many missed chunks - match self.get_current_epoch_sync_hash(next_block)? { - Some(sync_hash) => sync_hash == *next_block.hash(), - None => false, - } - }; + let protocol_version = self.epoch_manager.get_epoch_protocol_version(&head.epoch_id)?; let tries = self.runtime_adapter.get_tries(); let snapshot_config = tries.state_snapshot_config(); - let make_snapshot = match snapshot_config.state_snapshot_type { - // For every epoch, we snapshot if the next block would be in a different epoch - StateSnapshotType::EveryEpoch => next_block_is_sync_block, + match snapshot_config.state_snapshot_type { + // For every epoch, we snapshot if the next block is the state sync "sync_hash" block + StateSnapshotType::EveryEpoch => { + if !ProtocolFeature::StateSyncHashUpdate.enabled(protocol_version) { + if is_epoch_boundary { + // Here we return head.last_block_hash as the prev_hash of the first block of the next epoch + Ok(SnapshotAction::MakeSnapshot(head.last_block_hash)) + } else { + Ok(SnapshotAction::None) + } + } else { + // FIXME: this needs to be fixed. can't be iterating over the whole chain inside of preprocess + // block like that if there are many missed chunks + match self.get_current_epoch_sync_hash()? 
{ + Some(sync_hash) => { + if sync_hash == head.last_block_hash { + // note that here we're returning prev_block_hash instead of last_block_hash because in this case + // we can't detect the right sync hash until it is actually applied as the head block + Ok(SnapshotAction::MakeSnapshot(head.prev_block_hash)) + } else { + Ok(SnapshotAction::None) + } + } + None => Ok(SnapshotAction::None), + } + } + } // For resharding only, we snapshot if next block would be in a different shard layout - StateSnapshotType::ForReshardingOnly => is_epoch_boundary && will_shard_layout_change, - }; - - // We need to delete the existing snapshot at the epoch boundary if we are not making a new snapshot - // This is useful for the next epoch after resharding where make_snapshot is false but it's an epoch boundary - let delete_snapshot = !make_snapshot && is_epoch_boundary; - - Ok((make_snapshot, delete_snapshot)) + StateSnapshotType::ForReshardingOnly => { + if is_epoch_boundary { + if will_shard_layout_change { + Ok(SnapshotAction::MakeSnapshot(head.last_block_hash)) + } else { + // We need to delete the existing snapshot at the epoch boundary if we are not making a new snapshot + // This is useful for the next epoch after resharding where we don't make a snapshot but it's an epoch boundary + Ok(SnapshotAction::DeleteSnapshot) + } + } else { + Ok(SnapshotAction::None) + } + } + } } /// Returns a description of state parts cached for the given shard of the given epoch. @@ -4538,7 +4553,7 @@ impl Chain { if ProtocolFeature::StateSyncHashUpdate.enabled(protocol_version) { // TODO(current_epoch_state_sync): replace this with a more efficient lookup - match self.get_current_epoch_sync_hash(sync_block.header())? { + match self.get_current_epoch_sync_hash()? 
{ Some(h) => Ok(*sync_hash == h), None => Ok(false), } diff --git a/chain/client/src/client.rs b/chain/client/src/client.rs index 5e8eba62759..9697cfc3378 100644 --- a/chain/client/src/client.rs +++ b/chain/client/src/client.rs @@ -2557,7 +2557,7 @@ impl Client { let header = self.chain.get_block_header(&header_head.last_block_hash)?; let protocol_version = self.epoch_manager.get_epoch_protocol_version(header.epoch_id())?; let sync_hash = if ProtocolFeature::StateSyncHashUpdate.enabled(protocol_version) { - match self.chain.get_current_epoch_sync_hash(&header)? { + match self.chain.get_current_epoch_sync_hash()? { Some(h) => h, None => return Ok(None), } diff --git a/integration-tests/src/tests/client/process_blocks.rs b/integration-tests/src/tests/client/process_blocks.rs index 67ae50decc7..7e23a8904a3 100644 --- a/integration-tests/src/tests/client/process_blocks.rs +++ b/integration-tests/src/tests/client/process_blocks.rs @@ -2381,9 +2381,9 @@ fn test_catchup_gas_price_change() { assert_eq!(env.clients[0].process_tx(tx, false, false), ProcessTxResponse::ValidTx); } - // We go up to height 8 because height 6 is the first block of the new epoch, and we want at least - // two more blocks if syncing to the current epoch's state - for i in 3..=8 { + // We go up to height 9 because height 6 is the first block of the new epoch, and we want at least + // two more blocks (plus one more for nodes to create snapshots) if syncing to the current epoch's state + for i in 3..=9 { let block = env.clients[0].produce_block(i).unwrap().unwrap(); blocks.push(block.clone()); env.process_block(0, block.clone(), Provenance::PRODUCED); @@ -3673,8 +3673,9 @@ mod contract_precompilation_tests { ); let sync_height = if ProtocolFeature::StateSyncHashUpdate.enabled(PROTOCOL_VERSION) { - // `height` is one more than the start of the epoch. Produce two more blocks with chunks. - produce_blocks_from_height(&mut env, 2, height) - 1 + // `height` is one more than the start of the epoch. 
Produce two more blocks with chunks, + // and then one more than that so the node will generate the needed snapshot. + produce_blocks_from_height(&mut env, 3, height) - 2 } else { height - 1 }; @@ -3783,8 +3784,9 @@ mod contract_precompilation_tests { ); let sync_height = if ProtocolFeature::StateSyncHashUpdate.enabled(PROTOCOL_VERSION) { - // `height` is one more than the start of the epoch. Produce two more blocks with chunks. - produce_blocks_from_height(&mut env, 2, height) - 1 + // `height` is one more than the start of the epoch. Produce two more blocks with chunks, + // and then one more than that so the node will generate the needed snapshot. + produce_blocks_from_height(&mut env, 3, height) - 2 } else { height - 1 }; @@ -3863,17 +3865,18 @@ mod contract_precompilation_tests { // `height` is the first block of a new epoch (which has not been produced yet), // so if we want to state sync the old way, we produce `EPOCH_LENGTH` + 1 new blocks // to get to produce the first block of the next epoch. If we want to state sync the new - way, we produce two more than that - let num_new_blocks_in_epoch = - if ProtocolFeature::StateSyncHashUpdate.enabled(PROTOCOL_VERSION) { 3 } else { 1 }; - height = - produce_blocks_from_height(&mut env, EPOCH_LENGTH + num_new_blocks_in_epoch, height); + // way, we produce two more than that, plus one more so that the node will generate the needed snapshot. + let sync_height = if ProtocolFeature::StateSyncHashUpdate.enabled(PROTOCOL_VERSION) { + produce_blocks_from_height(&mut env, EPOCH_LENGTH + 4, height) - 2 + } else { + produce_blocks_from_height(&mut env, EPOCH_LENGTH + 1, height) - 1 + }; // Perform state sync for the second client.
- state_sync_on_height(&mut env, height - 1); + state_sync_on_height(&mut env, sync_height); let epoch_id = - *env.clients[0].chain.get_block_by_height(height - 1).unwrap().header().epoch_id(); + *env.clients[0].chain.get_block_by_height(sync_height).unwrap().header().epoch_id(); let runtime_config = env.get_runtime_config(0, epoch_id); let contract_key = get_contract_cache_key( *ContractCode::new(wasm_code.clone(), None).hash(), diff --git a/integration-tests/src/tests/client/sync_state_nodes.rs b/integration-tests/src/tests/client/sync_state_nodes.rs index 5441bc76f72..44d18da3aef 100644 --- a/integration-tests/src/tests/client/sync_state_nodes.rs +++ b/integration-tests/src/tests/client/sync_state_nodes.rs @@ -604,7 +604,8 @@ fn test_dump_epoch_missing_chunk_in_last_block() { assert!(sync_height < 2 * epoch_length + 1); - for i in 1..=sync_height { + // Produce blocks up to sync_height + 1 to give nodes a chance to create the necessary state snapshot + for i in 1..=sync_height + 1 { tracing::info!( target: "test", height=i, diff --git a/nearcore/src/state_sync.rs b/nearcore/src/state_sync.rs index 2e0fd302a01..edf897ca2d3 100644 --- a/nearcore/src/state_sync.rs +++ b/nearcore/src/state_sync.rs @@ -675,7 +675,7 @@ fn get_latest_epoch( let epoch_height = epoch_info.epoch_height(); let sync_hash = if ProtocolFeature::StateSyncHashUpdate.enabled(epoch_info.protocol_version()) { - chain.get_current_epoch_sync_hash(&final_block_header)? + chain.get_current_epoch_sync_hash()? } else { Some(chain.get_epoch_start_sync_hash(final_hash)?) 
}; From 89bc54700cde60da5978504a3de832b52262c198 Mon Sep 17 00:00:00 2001 From: Marcelo Diop-Gonzalez Date: Mon, 21 Oct 2024 15:21:53 -0400 Subject: [PATCH 39/70] version the state sync struct by enum instead of u32 field --- chain/chain/src/chain.rs | 7 +++---- chain/chain/src/store_validator/validate.rs | 14 +++++++++----- chain/client/src/client.rs | 14 +++++++++----- core/primitives/src/sharding.rs | 17 +++++++++++------ core/store/src/migrations.rs | 7 +++---- 5 files changed, 35 insertions(+), 24 deletions(-) diff --git a/chain/chain/src/chain.rs b/chain/chain/src/chain.rs index 8a8560d48e6..328bdaf98bb 100644 --- a/chain/chain/src/chain.rs +++ b/chain/chain/src/chain.rs @@ -65,7 +65,7 @@ use near_primitives::sandbox::state_patch::SandboxStatePatch; use near_primitives::shard_layout::{account_id_to_shard_id, ShardLayout, ShardUId}; use near_primitives::sharding::{ ChunkHash, ChunkHashHeight, EncodedShardChunk, ReceiptList, ReceiptProof, ShardChunk, - ShardChunkHeader, ShardInfo, ShardProof, StateSyncInfo, + ShardChunkHeader, ShardInfo, ShardProof, StateSyncInfo, StateSyncInfoV0, }; use near_primitives::state_part::PartId; use near_primitives::state_sync::{ @@ -797,8 +797,7 @@ impl Chain { } else { Some(*block.header().hash()) }; - let state_sync_info = StateSyncInfo { - state_sync_version: 0, + let state_sync_info = StateSyncInfo::V0(StateSyncInfoV0 { epoch_first_block: *block.header().hash(), sync_hash, shards: shards_to_state_sync @@ -808,7 +807,7 @@ impl Chain { ShardInfo(*shard_id, chunk.chunk_hash()) }) .collect(), - }; + }); Ok(Some(state_sync_info)) } diff --git a/chain/chain/src/store_validator/validate.rs b/chain/chain/src/store_validator/validate.rs index e5ee7aae9a0..7332e959d0d 100644 --- a/chain/chain/src/store_validator/validate.rs +++ b/chain/chain/src/store_validator/validate.rs @@ -720,11 +720,15 @@ pub(crate) fn state_sync_info_valid( block_hash: &CryptoHash, state_sync_info: &StateSyncInfo, ) -> Result<(), StoreValidatorError> { - 
check_discrepancy!( - state_sync_info.epoch_first_block, - *block_hash, - "Invalid StateSyncInfo stored" - ); + match state_sync_info { + StateSyncInfo::V0(state_sync_info) => { + check_discrepancy!( + state_sync_info.epoch_first_block, + *block_hash, + "Invalid StateSyncInfo stored" + ); + } + }; Ok(()) } diff --git a/chain/client/src/client.rs b/chain/client/src/client.rs index 9697cfc3378..c4900d26829 100644 --- a/chain/client/src/client.rs +++ b/chain/client/src/client.rs @@ -68,9 +68,9 @@ use near_primitives::hash::CryptoHash; use near_primitives::merkle::{merklize, MerklePath, PartialMerkleTree}; use near_primitives::network::PeerId; use near_primitives::receipt::Receipt; -use near_primitives::sharding::StateSyncInfo; use near_primitives::sharding::{ - EncodedShardChunk, PartialEncodedChunk, ShardChunk, ShardChunkHeader, ShardInfo, + EncodedShardChunk, PartialEncodedChunk, ShardChunk, ShardChunkHeader, ShardInfo, StateSyncInfo, + StateSyncInfoV0, }; use near_primitives::transaction::SignedTransaction; use near_primitives::types::chunk_extra::ChunkExtra; @@ -2591,9 +2591,10 @@ impl Client { let mut notify_state_sync = false; let me = signer.as_ref().map(|x| x.validator_id().clone()); - for (epoch_first_block, mut state_sync_info) in + for (epoch_first_block, state_sync_info) in self.chain.chain_store().iterate_state_sync_infos()? 
{ + let StateSyncInfo::V0(mut state_sync_info) = state_sync_info; assert_eq!(epoch_first_block, state_sync_info.epoch_first_block); // I *think* this is not relevant anymore, since we download @@ -2643,7 +2644,10 @@ impl Client { let mut update = self.chain.mut_chain_store().store_update(); // note that iterate_state_sync_infos() collects everything into a Vec, so we're not // actually writing to the DB while actively iterating this column - update.add_state_sync_info(epoch_first_block, state_sync_info.clone()); + update.add_state_sync_info( + epoch_first_block, + StateSyncInfo::V0(state_sync_info.clone()), + ); // TODO: would be nice to be able to propagate context up the call stack so we can just log // once at the top with all the info. Otherwise this error will look very cryptic update.commit()?; @@ -2768,7 +2772,7 @@ impl Client { fn get_shards_to_split( &mut self, sync_hash: CryptoHash, - state_sync_info: &StateSyncInfo, + state_sync_info: &StateSyncInfoV0, me: &Option, ) -> Result, Error> { let prev_hash = *self.chain.get_block(&sync_hash)?.header().prev_hash(); diff --git a/core/primitives/src/sharding.rs b/core/primitives/src/sharding.rs index 5a396753bfc..bb417725643 100644 --- a/core/primitives/src/sharding.rs +++ b/core/primitives/src/sharding.rs @@ -61,12 +61,7 @@ pub struct ShardInfo(pub ShardId, pub ChunkHash); /// Contains the information that is used to sync state for shards as epochs switch #[derive(Clone, Debug, PartialEq, BorshSerialize, BorshDeserialize)] -pub struct StateSyncInfo { - /// For now this is always set to 0. This is included because an improvement we might want to make in the future - /// is that when syncing to the current epoch's state, we currently wait for two new chunks in each shard, but - /// with some changes to the meaning of the "sync_hash", we should only need to wait for one. So this is included - /// in order to allow for this change in the future without needing another database migration. 
- pub state_sync_version: u32, +pub struct StateSyncInfoV0 { /// The first block of the epoch we want to state sync for. This field is not strictly required since /// this struct is keyed by this hash in the database, but it's a small amount of data that makes /// the info in this type more complete. @@ -83,6 +78,16 @@ pub struct StateSyncInfo { pub shards: Vec, } +/// Contains the information that is used to sync state for shards as epochs switch +/// Currently there is only one version possible, but an improvement we might want to make in the future +/// is that when syncing to the current epoch's state, we currently wait for two new chunks in each shard, but +/// with some changes to the meaning of the "sync_hash", we should only need to wait for one. So this is included +/// in order to allow for this change in the future without needing another database migration. +#[derive(Clone, Debug, PartialEq, BorshSerialize, BorshDeserialize)] +pub enum StateSyncInfo { + V0(StateSyncInfoV0), +} + pub mod shard_chunk_header_inner; pub use shard_chunk_header_inner::{ ShardChunkHeaderInner, ShardChunkHeaderInnerV1, ShardChunkHeaderInnerV2, diff --git a/core/store/src/migrations.rs b/core/store/src/migrations.rs index 59a6e79b623..be6874ab1d2 100644 --- a/core/store/src/migrations.rs +++ b/core/store/src/migrations.rs @@ -5,7 +5,7 @@ use borsh::{BorshDeserialize, BorshSerialize}; use near_primitives::epoch_manager::EpochSummary; use near_primitives::epoch_manager::AGGREGATOR_KEY; use near_primitives::hash::CryptoHash; -use near_primitives::sharding::{ShardInfo, StateSyncInfo}; +use near_primitives::sharding::{ShardInfo, StateSyncInfo, StateSyncInfoV0}; use near_primitives::state::FlatStateValue; use near_primitives::transaction::{ExecutionOutcomeWithIdAndProof, ExecutionOutcomeWithProof}; use near_primitives::types::{ @@ -383,12 +383,11 @@ pub fn migrate_40_to_41(store: &Store) -> anyhow::Result<()> { if epoch_first_block != sync_hash { tracing::warn!(key = 
%epoch_first_block, %sync_hash, "sync_hash field of legacy StateSyncInfo not equal to the key. Something is wrong with this node's catchup info"); } - let new_info = StateSyncInfo { - state_sync_version: 0, + let new_info = StateSyncInfo::V0(StateSyncInfoV0 { epoch_first_block, sync_hash: Some(sync_hash), shards, - }; + }); update .set_ser(DBCol::StateDlInfos, &key, &new_info) .context("failed writing to StateDlInfos")?; From 1b81f8c0e6928c34b66112173a93a262564efaa8 Mon Sep 17 00:00:00 2001 From: Marcelo Diop-Gonzalez Date: Mon, 21 Oct 2024 15:29:16 -0400 Subject: [PATCH 40/70] remove first block hash field of add_state_sync_infos again --- chain/chain/src/chain_update.rs | 2 +- chain/chain/src/store/mod.rs | 14 +++++++++----- chain/client/src/client.rs | 5 +---- core/primitives/src/sharding.rs | 9 +++++++++ 4 files changed, 20 insertions(+), 10 deletions(-) diff --git a/chain/chain/src/chain_update.rs b/chain/chain/src/chain_update.rs index 4c10bdaca6b..9d0a089d767 100644 --- a/chain/chain/src/chain_update.rs +++ b/chain/chain/src/chain_update.rs @@ -237,7 +237,7 @@ impl<'a> ChainUpdate<'a> { ); } if let Some(state_sync_info) = state_sync_info { - self.chain_store_update.add_state_sync_info(*block.hash(), state_sync_info); + self.chain_store_update.add_state_sync_info(state_sync_info); } self.chain_store_update.save_block_extra(block.hash(), BlockExtra { challenges_result }); diff --git a/chain/chain/src/store/mod.rs b/chain/chain/src/store/mod.rs index 611a4662d33..88bce08f05d 100644 --- a/chain/chain/src/store/mod.rs +++ b/chain/chain/src/store/mod.rs @@ -1383,7 +1383,7 @@ pub struct ChainStoreUpdate<'a> { remove_blocks_to_catchup: Vec<(CryptoHash, CryptoHash)>, // A prev_hash to be removed with all the hashes associated with it remove_prev_blocks_to_catchup: Vec, - add_state_sync_infos: Vec<(CryptoHash, StateSyncInfo)>, + add_state_sync_infos: Vec, remove_state_sync_infos: Vec, challenged_blocks: HashSet, } @@ -2028,8 +2028,8 @@ impl<'a> ChainStoreUpdate<'a> { 
self.remove_prev_blocks_to_catchup.push(hash); } - pub fn add_state_sync_info(&mut self, block_hash: CryptoHash, info: StateSyncInfo) { - self.add_state_sync_infos.push((block_hash, info)); + pub fn add_state_sync_info(&mut self, info: StateSyncInfo) { + self.add_state_sync_infos.push(info); } pub fn remove_state_sync_info(&mut self, hash: CryptoHash) { @@ -2548,8 +2548,12 @@ impl<'a> ChainStoreUpdate<'a> { } } - for (block_hash, state_sync_info) in self.add_state_sync_infos.drain(..) { - store_update.set_ser(DBCol::StateDlInfos, block_hash.as_ref(), &state_sync_info)?; + for state_sync_info in self.add_state_sync_infos.drain(..) { + store_update.set_ser( + DBCol::StateDlInfos, + state_sync_info.block_hash().as_ref(), + &state_sync_info, + )?; } for hash in self.remove_state_sync_infos.drain(..) { store_update.delete(DBCol::StateDlInfos, hash.as_ref()); diff --git a/chain/client/src/client.rs b/chain/client/src/client.rs index c4900d26829..fda2080126c 100644 --- a/chain/client/src/client.rs +++ b/chain/client/src/client.rs @@ -2644,10 +2644,7 @@ impl Client { let mut update = self.chain.mut_chain_store().store_update(); // note that iterate_state_sync_infos() collects everything into a Vec, so we're not // actually writing to the DB while actively iterating this column - update.add_state_sync_info( - epoch_first_block, - StateSyncInfo::V0(state_sync_info.clone()), - ); + update.add_state_sync_info(StateSyncInfo::V0(state_sync_info.clone())); // TODO: would be nice to be able to propagate context up the call stack so we can just log // once at the top with all the info. 
Otherwise this error will look very cryptic update.commit()?; diff --git a/core/primitives/src/sharding.rs b/core/primitives/src/sharding.rs index bb417725643..806fb06fdf8 100644 --- a/core/primitives/src/sharding.rs +++ b/core/primitives/src/sharding.rs @@ -88,6 +88,15 @@ pub enum StateSyncInfo { V0(StateSyncInfoV0), } +impl StateSyncInfo { + /// Block hash that identifies this state sync struct on disk + pub fn block_hash(&self) -> &CryptoHash { + match self { + Self::V0(info) => &info.epoch_first_block, + } + } +} + pub mod shard_chunk_header_inner; pub use shard_chunk_header_inner::{ ShardChunkHeaderInner, ShardChunkHeaderInnerV1, ShardChunkHeaderInnerV2, From 1c10b15988c25b0d47b25e7c13629919dbae31c5 Mon Sep 17 00:00:00 2001 From: Marcelo Diop-Gonzalez Date: Mon, 21 Oct 2024 15:30:27 -0400 Subject: [PATCH 41/70] use ShardId in state_downloads field --- chain/client/src/client.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/chain/client/src/client.rs b/chain/client/src/client.rs index fda2080126c..e8a1223f6b6 100644 --- a/chain/client/src/client.rs +++ b/chain/client/src/client.rs @@ -117,7 +117,7 @@ pub struct CatchupState { /// Manages downloading the state. pub state_sync: StateSync, /// Keeps track of state downloads, and gets passed to `state_sync`. - pub state_downloads: HashMap, + pub state_downloads: HashMap, /// Manages going back to apply chunks after state has been downloaded. 
pub catchup: BlocksCatchUpState, } From 18d36234cca66f5993a3465bfea29dffbe8b0f0a Mon Sep 17 00:00:00 2001 From: Marcelo Diop-Gonzalez Date: Mon, 21 Oct 2024 15:40:50 -0400 Subject: [PATCH 42/70] rename block -> epoch_first_block in get_state_sync_info() --- chain/chain/src/chain.rs | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/chain/chain/src/chain.rs b/chain/chain/src/chain.rs index 328bdaf98bb..4014f03b883 100644 --- a/chain/chain/src/chain.rs +++ b/chain/chain/src/chain.rs @@ -757,9 +757,9 @@ impl Chain { fn get_state_sync_info( &self, me: &Option, - block: &Block, + epoch_first_block: &Block, ) -> Result, Error> { - let prev_hash = *block.header().prev_hash(); + let prev_hash = *epoch_first_block.header().prev_hash(); let shards_to_state_sync = Chain::get_shards_to_state_sync( self.epoch_manager.as_ref(), &self.shard_tracker, @@ -768,7 +768,9 @@ impl Chain { )?; let prev_block = self.get_block(&prev_hash)?; - if prev_block.chunks().len() != block.chunks().len() && !shards_to_state_sync.is_empty() { + if prev_block.chunks().len() != epoch_first_block.chunks().len() + && !shards_to_state_sync.is_empty() + { // Currently, the state sync algorithm assumes that the number of chunks do not change // between the epoch being synced to and the last epoch. // For example, if shard layout changes at the beginning of epoch T, validators @@ -780,7 +782,7 @@ impl Chain { // the validator will not have the states ready so it will halt. error!( "Cannot download states for epoch {:?} because sharding just changed. 
I'm {:?}", - block.header().epoch_id(), + epoch_first_block.header().epoch_id(), me ); debug_assert!(false); @@ -790,15 +792,18 @@ impl Chain { } else { debug!(target: "chain", "Downloading state for {:?}, I'm {:?}", shards_to_state_sync, me); - let protocol_version = - self.epoch_manager.get_epoch_protocol_version(block.header().epoch_id())?; + let protocol_version = self + .epoch_manager + .get_epoch_protocol_version(epoch_first_block.header().epoch_id())?; let sync_hash = if ProtocolFeature::StateSyncHashUpdate.enabled(protocol_version) { None } else { - Some(*block.header().hash()) + Some(*epoch_first_block.header().hash()) }; + // This block is the first block in an epoch because this function is only called in get_catchup_and_state_sync_infos() + // when that is the case. let state_sync_info = StateSyncInfo::V0(StateSyncInfoV0 { - epoch_first_block: *block.header().hash(), + epoch_first_block: *epoch_first_block.header().hash(), sync_hash, shards: shards_to_state_sync .iter() From eb88ad309540dac1fd4a83494d84977659df8971 Mon Sep 17 00:00:00 2001 From: Marcelo Diop-Gonzalez Date: Mon, 21 Oct 2024 15:41:07 -0400 Subject: [PATCH 43/70] comments --- chain/client/src/client.rs | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/chain/client/src/client.rs b/chain/client/src/client.rs index e8a1223f6b6..18cd97a4417 100644 --- a/chain/client/src/client.rs +++ b/chain/client/src/client.rs @@ -242,8 +242,8 @@ pub struct ProduceChunkResult { pub transactions_storage_proof: Option, } -/// This keeps track of the numbef of new chunks seen in each shard since the block that was passed to new() -/// This whole thing could be replaced with a much simpler function that just computes the numbef of new chunks +/// This keeps track of the number of new chunks seen in each shard since the block that was passed to new() +/// This whole thing could be replaced with a much simpler function that just computes the number of new chunks /// in each shard from 
scratch every time we call it, but in the unlikely and unfortunate case where a shard /// hasn't had any chunks for a very long time, it would end up being a nontrivial inefficiency to do that /// every time run_catchup() is called @@ -268,6 +268,9 @@ impl NewChunkTracker { } } + // TODO(current_epoch_sync_hash): refactor this and use the same logic in get_current_epoch_sync_hash(). Ideally + // that function should go away (at least as it is now) in favor of a more efficient approach that we can call on + // every new block application fn record_new_chunks(&mut self, header: &BlockHeader) -> Result { let mut done = true; for (shard_id, num_new_chunks) in self.num_new_chunks.iter_mut() { From 7be1c8a875e580c64baaea2f169097efa1fd9905 Mon Sep 17 00:00:00 2001 From: Marcelo Diop-Gonzalez Date: Mon, 21 Oct 2024 15:50:08 -0400 Subject: [PATCH 44/70] simplify get_epoch_start_sync_hash() --- chain/chain/src/chain.rs | 23 +++-------------------- 1 file changed, 3 insertions(+), 20 deletions(-) diff --git a/chain/chain/src/chain.rs b/chain/chain/src/chain.rs index 4014f03b883..5e31f7fa647 100644 --- a/chain/chain/src/chain.rs +++ b/chain/chain/src/chain.rs @@ -2439,26 +2439,9 @@ impl Chain { // TODO(current_epoch_state_sync): move state sync related code to state sync files // when this is no longer needed in this file after we more efficiently - // implement should_make_or_delete_snapshot(). Also, the logic in this function can probably - // be made simpler, as we shouldn't need to iterate over all blocks just to find the epoch start. - /// Find the hash of the first block on the same epoch (and chain) of block with hash `sync_hash`. 
- pub fn get_epoch_start_sync_hash(&self, sync_hash: &CryptoHash) -> Result { - let header = self.get_block_header(sync_hash)?; - let mut epoch_id = *header.epoch_id(); - let mut hash = *header.hash(); - let mut prev_hash = *header.prev_hash(); - loop { - if prev_hash == CryptoHash::default() { - return Ok(hash); - } - let header = self.get_block_header(&prev_hash)?; - if &epoch_id != header.epoch_id() { - return Ok(hash); - } - epoch_id = *header.epoch_id(); - hash = *header.hash(); - prev_hash = *header.prev_hash(); - } + // implement should_make_or_delete_snapshot(). + pub fn get_epoch_start_sync_hash(&self, block_hash: &CryptoHash) -> Result { + Ok(*self.epoch_manager.get_block_info(block_hash)?.epoch_first_block()) } // Returns the first block for which at least two new chunks have been produced for every shard in the epoch From 7902bb785443519edf3e08efbf35da26f985ee6c Mon Sep 17 00:00:00 2001 From: Marcelo Diop-Gonzalez Date: Mon, 21 Oct 2024 15:59:11 -0400 Subject: [PATCH 45/70] make get_current_epoch_sync_hash() take a block hash argument like get_epoch_start_sync_hash() --- chain/chain/src/chain.rs | 12 +++++++----- chain/client/src/client.rs | 2 +- nearcore/src/state_sync.rs | 2 +- 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/chain/chain/src/chain.rs b/chain/chain/src/chain.rs index 5e31f7fa647..76fc3aa28d8 100644 --- a/chain/chain/src/chain.rs +++ b/chain/chain/src/chain.rs @@ -2446,9 +2446,11 @@ impl Chain { // Returns the first block for which at least two new chunks have been produced for every shard in the epoch // `next_header` is the next block header that is being processed in start_process_block_impl() - pub fn get_current_epoch_sync_hash(&self) -> Result, Error> { - let head = self.head()?; - let epoch_start = Self::get_epoch_start_sync_hash(self, &head.last_block_hash)?; + pub fn get_current_epoch_sync_hash( + &self, + block_hash: &CryptoHash, + ) -> Result, Error> { + let epoch_start = 
self.get_epoch_start_sync_hash(block_hash)?; let mut header = self.get_block_header(&epoch_start)?; let shard_ids = self.epoch_manager.shard_ids(header.epoch_id())?; @@ -3942,7 +3944,7 @@ impl Chain { } else { // FIXME: this needs to be fixed. can't be iterating over the whole chain inside of preprocess // block like that if there are many missed chunks - match self.get_current_epoch_sync_hash()? { + match self.get_current_epoch_sync_hash(&head.last_block_hash)? { Some(sync_hash) => { if sync_hash == head.last_block_hash { // note that here we're returning prev_block_hash instead of last_block_hash because in this case @@ -4540,7 +4542,7 @@ impl Chain { if ProtocolFeature::StateSyncHashUpdate.enabled(protocol_version) { // TODO(current_epoch_state_sync): replace this with a more efficient lookup - match self.get_current_epoch_sync_hash()? { + match self.get_current_epoch_sync_hash(sync_hash)? { Some(h) => Ok(*sync_hash == h), None => Ok(false), } diff --git a/chain/client/src/client.rs b/chain/client/src/client.rs index 18cd97a4417..df3b4d72df1 100644 --- a/chain/client/src/client.rs +++ b/chain/client/src/client.rs @@ -2560,7 +2560,7 @@ impl Client { let header = self.chain.get_block_header(&header_head.last_block_hash)?; let protocol_version = self.epoch_manager.get_epoch_protocol_version(header.epoch_id())?; let sync_hash = if ProtocolFeature::StateSyncHashUpdate.enabled(protocol_version) { - match self.chain.get_current_epoch_sync_hash()? { + match self.chain.get_current_epoch_sync_hash(&header_head.last_block_hash)? { Some(h) => h, None => return Ok(None), } diff --git a/nearcore/src/state_sync.rs b/nearcore/src/state_sync.rs index edf897ca2d3..78d8580bd23 100644 --- a/nearcore/src/state_sync.rs +++ b/nearcore/src/state_sync.rs @@ -675,7 +675,7 @@ fn get_latest_epoch( let epoch_height = epoch_info.epoch_height(); let sync_hash = if ProtocolFeature::StateSyncHashUpdate.enabled(epoch_info.protocol_version()) { - chain.get_current_epoch_sync_hash()? 
+ chain.get_current_epoch_sync_hash(final_hash)? } else { Some(chain.get_epoch_start_sync_hash(final_hash)?) }; From 7dbf8121d055f20b15072ef4982abc47054e69af Mon Sep 17 00:00:00 2001 From: Marcelo Diop-Gonzalez Date: Mon, 21 Oct 2024 17:06:55 -0400 Subject: [PATCH 46/70] consolidate calls to get_current_epoch_sync_hash() and get_epoch_start_sync_hash() make this just one get_sync_hash() with the protocol feature logic in there, and make the other two private --- chain/chain/src/chain.rs | 38 +++++++++++++-------------- chain/client/src/client.rs | 12 +++------ nearcore/src/state_sync.rs | 7 +---- tools/state-viewer/src/state_parts.rs | 24 ++++++++++++++--- 4 files changed, 43 insertions(+), 38 deletions(-) diff --git a/chain/chain/src/chain.rs b/chain/chain/src/chain.rs index 76fc3aa28d8..a3c77f849fa 100644 --- a/chain/chain/src/chain.rs +++ b/chain/chain/src/chain.rs @@ -2438,15 +2438,23 @@ impl Chain { } // TODO(current_epoch_state_sync): move state sync related code to state sync files - // when this is no longer needed in this file after we more efficiently - // implement should_make_or_delete_snapshot(). 
- pub fn get_epoch_start_sync_hash(&self, block_hash: &CryptoHash) -> Result { + pub fn get_sync_hash(&self, block_hash: &CryptoHash) -> Result, Error> { + let header = self.get_block_header(block_hash)?; + let protocol_version = self.epoch_manager.get_epoch_protocol_version(header.epoch_id())?; + if ProtocolFeature::StateSyncHashUpdate.enabled(protocol_version) { + self.get_current_epoch_sync_hash(block_hash) + } else { + self.get_epoch_start_sync_hash(block_hash).map(Some) + } + } + + fn get_epoch_start_sync_hash(&self, block_hash: &CryptoHash) -> Result { Ok(*self.epoch_manager.get_block_info(block_hash)?.epoch_first_block()) } // Returns the first block for which at least two new chunks have been produced for every shard in the epoch // `next_header` is the next block header that is being processed in start_process_block_impl() - pub fn get_current_epoch_sync_hash( + fn get_current_epoch_sync_hash( &self, block_hash: &CryptoHash, ) -> Result, Error> { @@ -4532,26 +4540,16 @@ impl Chain { self.invalid_blocks.contains(hash) } - /// Check that sync_hash is the first block of an epoch. + /// Check that sync_hash matches the one we expect for the epoch containing that block. pub fn check_sync_hash_validity(&self, sync_hash: &CryptoHash) -> Result { // It's important to check that Block exists because we will sync with it. // Do not replace with `get_block_header()`. - let sync_block = self.get_block(sync_hash)?; - let protocol_version = - self.epoch_manager.get_epoch_protocol_version(sync_block.header().epoch_id())?; + let _sync_block = self.get_block(sync_hash)?; - if ProtocolFeature::StateSyncHashUpdate.enabled(protocol_version) { - // TODO(current_epoch_state_sync): replace this with a more efficient lookup - match self.get_current_epoch_sync_hash(sync_hash)? 
{ - Some(h) => Ok(*sync_hash == h), - None => Ok(false), - } - } else { - let prev_hash = *sync_block.header().prev_hash(); - let is_first_block_of_epoch = self.epoch_manager.is_next_block_epoch_start(&prev_hash); - // If sync_hash is not on the Epoch boundary, it's malicious behavior - Ok(is_first_block_of_epoch?) - } + // TODO(current_epoch_state_sync): replace this with a more efficient lookup. In the case + // we're syncing to the current epoch, this iterates over blocks in the epoch + let good_sync_hash = self.get_sync_hash(sync_hash)?; + Ok(good_sync_hash.as_ref() == Some(sync_hash)) } /// Get transaction result for given hash of transaction or receipt id diff --git a/chain/client/src/client.rs b/chain/client/src/client.rs index df3b4d72df1..7865192582b 100644 --- a/chain/client/src/client.rs +++ b/chain/client/src/client.rs @@ -2557,15 +2557,9 @@ impl Client { /// . pub fn find_sync_hash(&self) -> Result, near_chain::Error> { let header_head = self.chain.header_head()?; - let header = self.chain.get_block_header(&header_head.last_block_hash)?; - let protocol_version = self.epoch_manager.get_epoch_protocol_version(header.epoch_id())?; - let sync_hash = if ProtocolFeature::StateSyncHashUpdate.enabled(protocol_version) { - match self.chain.get_current_epoch_sync_hash(&header_head.last_block_hash)? { - Some(h) => h, - None => return Ok(None), - } - } else { - self.chain.get_epoch_start_sync_hash(&header_head.last_block_hash)? + let sync_hash = match self.chain.get_sync_hash(&header_head.last_block_hash)? 
{ + Some(h) => h, + None => return Ok(None), }; let genesis_hash = self.chain.genesis().hash(); diff --git a/nearcore/src/state_sync.rs b/nearcore/src/state_sync.rs index 78d8580bd23..4f869548511 100644 --- a/nearcore/src/state_sync.rs +++ b/nearcore/src/state_sync.rs @@ -22,7 +22,6 @@ use near_primitives::hash::CryptoHash; use near_primitives::state_part::PartId; use near_primitives::state_sync::{StatePartKey, StateSyncDumpProgress}; use near_primitives::types::{AccountId, EpochHeight, EpochId, ShardId, StateRoot}; -use near_primitives::version::ProtocolFeature; use near_store::DBCol; use rand::{thread_rng, Rng}; use std::collections::HashSet; @@ -674,11 +673,7 @@ fn get_latest_epoch( let prev_epoch_id = epoch_manager.get_prev_epoch_id_from_prev_block(&head.prev_block_hash)?; let epoch_height = epoch_info.epoch_height(); - let sync_hash = if ProtocolFeature::StateSyncHashUpdate.enabled(epoch_info.protocol_version()) { - chain.get_current_epoch_sync_hash(final_hash)? - } else { - Some(chain.get_epoch_start_sync_hash(final_hash)?) 
- }; + let sync_hash = chain.get_sync_hash(final_hash)?; tracing::debug!(target: "state_sync_dump", ?final_hash, ?sync_hash, ?epoch_id, epoch_height, "get_latest_epoch"); Ok(LatestEpochInfo { prev_epoch_id, epoch_id, epoch_height, sync_hash }) diff --git a/tools/state-viewer/src/state_parts.rs b/tools/state-viewer/src/state_parts.rs index 26bf8385118..4cebf09716c 100644 --- a/tools/state-viewer/src/state_parts.rs +++ b/tools/state-viewer/src/state_parts.rs @@ -338,7 +338,13 @@ async fn load_state_parts( let epoch = chain.epoch_manager.get_epoch_info(&epoch_id).unwrap(); let sync_hash = get_any_block_hash_of_epoch(&epoch, chain); - let sync_hash = chain.get_epoch_start_sync_hash(&sync_hash).unwrap(); + let sync_hash = match chain.get_sync_hash(&sync_hash).unwrap() { + Some(h) => h, + None => { + tracing::warn!(target: "state-parts", ?epoch_id, "sync hash not yet known"); + return; + } + }; let state_header = chain.get_state_response_header(shard_id, sync_hash).unwrap(); let state_root = state_header.chunk_prev_state_root(); @@ -439,7 +445,13 @@ async fn dump_state_parts( let epoch_id = epoch_selection.to_epoch_id(store, chain); let epoch = chain.epoch_manager.get_epoch_info(&epoch_id).unwrap(); let sync_hash = get_any_block_hash_of_epoch(&epoch, chain); - let sync_hash = chain.get_epoch_start_sync_hash(&sync_hash).unwrap(); + let sync_hash = match chain.get_sync_hash(&sync_hash).unwrap() { + Some(h) => h, + None => { + tracing::warn!(target: "state-parts", ?epoch_id, "sync hash not yet known"); + return; + } + }; let sync_block_header = chain.get_block_header(&sync_hash).unwrap(); let sync_prev_header = chain.get_previous_header(&sync_block_header).unwrap(); let sync_prev_prev_hash = sync_prev_header.prev_hash(); @@ -541,7 +553,13 @@ fn read_state_header( let epoch = chain.epoch_manager.get_epoch_info(&epoch_id).unwrap(); let sync_hash = get_any_block_hash_of_epoch(&epoch, chain); - let sync_hash = chain.get_epoch_start_sync_hash(&sync_hash).unwrap(); + let 
sync_hash = match chain.get_sync_hash(&sync_hash).unwrap() { + Some(h) => h, + None => { + tracing::warn!(target: "state-parts", ?epoch_id, "sync hash not yet known"); + return; + } + }; let state_header = chain.chain_store().get_state_header(shard_id, sync_hash); tracing::info!(target: "state-parts", ?epoch_id, ?sync_hash, ?state_header); From 741862bebadf06737e95523b9084f438bb7e1e8b Mon Sep 17 00:00:00 2001 From: Marcelo Diop-Gonzalez Date: Mon, 21 Oct 2024 17:08:59 -0400 Subject: [PATCH 47/70] rename get_epoch_start_sync_hash -> get_previous_epoch_sync_hash --- chain/chain/src/chain.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/chain/chain/src/chain.rs b/chain/chain/src/chain.rs index a3c77f849fa..dabda03811b 100644 --- a/chain/chain/src/chain.rs +++ b/chain/chain/src/chain.rs @@ -2444,11 +2444,11 @@ impl Chain { if ProtocolFeature::StateSyncHashUpdate.enabled(protocol_version) { self.get_current_epoch_sync_hash(block_hash) } else { - self.get_epoch_start_sync_hash(block_hash).map(Some) + self.get_previous_epoch_sync_hash(block_hash).map(Some) } } - fn get_epoch_start_sync_hash(&self, block_hash: &CryptoHash) -> Result { + fn get_previous_epoch_sync_hash(&self, block_hash: &CryptoHash) -> Result { Ok(*self.epoch_manager.get_block_info(block_hash)?.epoch_first_block()) } @@ -2458,7 +2458,7 @@ impl Chain { &self, block_hash: &CryptoHash, ) -> Result, Error> { - let epoch_start = self.get_epoch_start_sync_hash(block_hash)?; + let epoch_start = self.get_previous_epoch_sync_hash(block_hash)?; let mut header = self.get_block_header(&epoch_start)?; let shard_ids = self.epoch_manager.shard_ids(header.epoch_id())?; From 3176a89cb00f61c97e0d9f869c45d31143ef6e12 Mon Sep 17 00:00:00 2001 From: Marcelo Diop-Gonzalez Date: Mon, 21 Oct 2024 17:26:55 -0400 Subject: [PATCH 48/70] move find_sync_hash() back to the ClientActor this was moved to the Client so we could call it in tests in 9ae019ac99fa6df6e57a50342ba787638df9b550 but now that we have a 
Chain::get_sync_hash() function, this is no longer needed --- chain/client/src/client.rs | 23 ----------------- chain/client/src/client_actor.rs | 25 ++++++++++++++++++- .../src/tests/client/process_blocks.rs | 3 ++- .../src/tests/client/state_dump.rs | 16 +----------- .../src/tests/client/sync_state_nodes.rs | 3 ++- 5 files changed, 29 insertions(+), 41 deletions(-) diff --git a/chain/client/src/client.rs b/chain/client/src/client.rs index 7865192582b..e1763687b64 100644 --- a/chain/client/src/client.rs +++ b/chain/client/src/client.rs @@ -2550,29 +2550,6 @@ impl Client { Ok(false) } - /// Select the block hash we are using to sync state. It will sync with the state before applying the - /// content of such block. - /// - /// The selected block will always be the first block on a new epoch: - /// . - pub fn find_sync_hash(&self) -> Result, near_chain::Error> { - let header_head = self.chain.header_head()?; - let sync_hash = match self.chain.get_sync_hash(&header_head.last_block_hash)? { - Some(h) => h, - None => return Ok(None), - }; - - let genesis_hash = self.chain.genesis().hash(); - tracing::debug!( - target: "sync", - ?header_head, - ?sync_hash, - ?genesis_hash, - "find_sync_hash"); - assert_ne!(&sync_hash, genesis_hash); - Ok(Some(sync_hash)) - } - /// Walks through all the ongoing state syncs for future epochs and processes them pub fn run_catchup( &mut self, diff --git a/chain/client/src/client_actor.rs b/chain/client/src/client_actor.rs index c93fd3ef21e..9e717ada142 100644 --- a/chain/client/src/client_actor.rs +++ b/chain/client/src/client_actor.rs @@ -1579,6 +1579,29 @@ impl ClientActorInner { // Sync loop will be started by check_triggers. } + /// Select the block hash we are using to sync state. It will sync with the state before applying the + /// content of such block. + /// + /// The selected block will always be the first block on a new epoch: + /// . 
+ pub fn find_sync_hash(&self) -> Result, near_chain::Error> { + let header_head = self.client.chain.header_head()?; + let sync_hash = match self.client.chain.get_sync_hash(&header_head.last_block_hash)? { + Some(h) => h, + None => return Ok(None), + }; + + let genesis_hash = self.client.chain.genesis().hash(); + tracing::debug!( + target: "sync", + ?header_head, + ?sync_hash, + ?genesis_hash, + "find_sync_hash"); + assert_ne!(&sync_hash, genesis_hash); + Ok(Some(sync_hash)) + } + /// Runs catchup on repeat, if this client is a validator. /// Schedules itself again if it was not ran as response to state parts job result fn catchup(&mut self, ctx: &mut dyn DelayedActionRunner) { @@ -1916,7 +1939,7 @@ impl ClientActorInner { return Ok(()); } - let sync_hash = if let Some(sync_hash) = self.client.find_sync_hash()? { + let sync_hash = if let Some(sync_hash) = self.find_sync_hash()? { sync_hash } else { return Ok(()); diff --git a/integration-tests/src/tests/client/process_blocks.rs b/integration-tests/src/tests/client/process_blocks.rs index 7e23a8904a3..761430fabb9 100644 --- a/integration-tests/src/tests/client/process_blocks.rs +++ b/integration-tests/src/tests/client/process_blocks.rs @@ -2400,7 +2400,8 @@ fn test_catchup_gas_price_change() { // Simulate state sync - let sync_hash = env.clients[0].find_sync_hash().unwrap().unwrap(); + let sync_hash = + env.clients[0].chain.get_sync_hash(blocks.last().unwrap().hash()).unwrap().unwrap(); let sync_block_idx = blocks .iter() .position(|b| *b.hash() == sync_hash) diff --git a/integration-tests/src/tests/client/state_dump.rs b/integration-tests/src/tests/client/state_dump.rs index 2cb45378801..9ec2cb855e8 100644 --- a/integration-tests/src/tests/client/state_dump.rs +++ b/integration-tests/src/tests/client/state_dump.rs @@ -191,7 +191,6 @@ fn run_state_sync_with_dumped_parts( account_creation_at_epoch_height * epoch_length + 1 }; - let mut sync_hash = None; let signer: Signer = signer.into(); for i in 
1..=dump_node_head_height { if i == account_creation_at_height { @@ -210,21 +209,7 @@ fn run_state_sync_with_dumped_parts( blocks.push(block.clone()); env.process_block(0, block.clone(), Provenance::PRODUCED); env.process_block(1, block.clone(), Provenance::NONE); - - if block.header().epoch_id() != &Default::default() { - let final_header = env.clients[0] - .chain - .get_block_header(block.header().last_final_block()) - .unwrap(); - if block.header().epoch_id() == final_header.epoch_id() { - if let Some(current_sync_hash) = env.clients[0].find_sync_hash().unwrap() { - sync_hash = Some(current_sync_hash); - } - } - } } - // We must have seen at least one block that was two ahead of the epoch start - let sync_hash = sync_hash.unwrap(); // check that the new account exists let head = env.clients[0].chain.head().unwrap(); @@ -265,6 +250,7 @@ fn run_state_sync_with_dumped_parts( let epoch_info = epoch_manager.get_epoch_info(&epoch_id).unwrap(); let epoch_height = epoch_info.epoch_height(); + let sync_hash = env.clients[0].chain.get_sync_hash(final_block_hash).unwrap().unwrap(); assert!(env.clients[0].chain.check_sync_hash_validity(&sync_hash).unwrap()); let state_sync_header = env.clients[0].chain.get_state_response_header(0, sync_hash).unwrap(); diff --git a/integration-tests/src/tests/client/sync_state_nodes.rs b/integration-tests/src/tests/client/sync_state_nodes.rs index 44d18da3aef..2d5fbd5bcf8 100644 --- a/integration-tests/src/tests/client/sync_state_nodes.rs +++ b/integration-tests/src/tests/client/sync_state_nodes.rs @@ -649,7 +649,8 @@ fn test_dump_epoch_missing_chunk_in_last_block() { // Simulate state sync tracing::info!(target: "test", "state sync - get parts"); - let sync_hash = env.clients[0].find_sync_hash().unwrap().unwrap(); + let sync_hash = + env.clients[0].chain.get_sync_hash(blocks.last().unwrap().hash()).unwrap().unwrap(); let sync_block_idx = blocks .iter() .position(|b| *b.hash() == sync_hash) From 6c645ed55a48e68af863ab58830a92218bbbd4e3 Mon 
Sep 17 00:00:00 2001 From: Marcelo Diop-Gonzalez Date: Mon, 21 Oct 2024 20:46:37 -0400 Subject: [PATCH 49/70] implement Store::iter_ser() and call it in the migration --- core/store/src/lib.rs | 9 +++++++++ core/store/src/migrations.rs | 6 ++---- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/core/store/src/lib.rs b/core/store/src/lib.rs index 3804b60a47b..131d618764d 100644 --- a/core/store/src/lib.rs +++ b/core/store/src/lib.rs @@ -334,6 +334,15 @@ impl Store { self.storage.iter(col) } + pub fn iter_ser<'a, T: BorshDeserialize>( + &'a self, + col: DBCol, + ) -> impl Iterator, T)>> + 'a { + self.storage + .iter(col) + .map(|item| item.and_then(|(key, value)| Ok((key, T::try_from_slice(value.as_ref())?)))) + } + /// Fetches raw key/value pairs from the database. /// /// Practically, this means that for rc columns rc is included in the value. diff --git a/core/store/src/migrations.rs b/core/store/src/migrations.rs index be6874ab1d2..787903885b0 100644 --- a/core/store/src/migrations.rs +++ b/core/store/src/migrations.rs @@ -116,9 +116,7 @@ impl<'a> BatchedStoreUpdate<'a> { /// new blocks. pub fn migrate_32_to_33(store: &Store) -> anyhow::Result<()> { let mut update = BatchedStoreUpdate::new(&store, 10_000_000); - for row in - store.iter_prefix_ser::>(DBCol::_TransactionResult, &[]) - { + for row in store.iter_ser::>(DBCol::_TransactionResult) { let (_, mut outcomes) = row?; // It appears that it was possible that the same entry in the original column contained // duplicate outcomes. 
We remove them here to avoid panicing due to issuing a @@ -373,7 +371,7 @@ pub fn migrate_40_to_41(store: &Store) -> anyhow::Result<()> { let mut update = store.store_update(); - for row in store.iter_prefix_ser::(DBCol::StateDlInfos, &[]) { + for row in store.iter_ser::(DBCol::StateDlInfos) { let (key, LegacyStateSyncInfo { sync_hash, shards }) = row.context("failed deserializing legacy StateSyncInfo in StateDlInfos")?; From f96f76203a4c802466f4c14bfb8bdabc5ca4635b Mon Sep 17 00:00:00 2001 From: Marcelo Diop-Gonzalez Date: Mon, 21 Oct 2024 21:06:06 -0400 Subject: [PATCH 50/70] refactor: extract get_catchup_sync_hash() from run_catchup() --- chain/client/src/client.rs | 123 +++++++++++++++++-------------------- 1 file changed, 58 insertions(+), 65 deletions(-) diff --git a/chain/client/src/client.rs b/chain/client/src/client.rs index e1763687b64..e2c6b50cc24 100644 --- a/chain/client/src/client.rs +++ b/chain/client/src/client.rs @@ -2550,6 +2550,42 @@ impl Client { Ok(false) } + // Find the sync hash. Most of the time it will already be set in `state_sync_info`. If not, try to find it, + // and set the corresponding field in `state_sync_info`. + fn get_catchup_sync_hash( + &mut self, + state_sync_info: &mut StateSyncInfoV0, + epoch_first_block: &BlockHeader, + ) -> Result, Error> { + if state_sync_info.sync_hash.is_some() { + return Ok(state_sync_info.sync_hash); + } + + let new_chunk_tracker = match self.catchup_tracker.entry(*epoch_first_block.epoch_id()) { + std::collections::hash_map::Entry::Occupied(e) => e.into_mut(), + std::collections::hash_map::Entry::Vacant(e) => { + let shard_ids = self.epoch_manager.shard_ids(epoch_first_block.epoch_id())?; + e.insert(NewChunkTracker::new( + *epoch_first_block.hash(), + epoch_first_block.height(), + &shard_ids, + )) + } + }; + + if let Some(sync_hash) = new_chunk_tracker.find_sync_hash(&self.chain)? 
{ + state_sync_info.sync_hash = Some(sync_hash); + let mut update = self.chain.mut_chain_store().store_update(); + // note that iterate_state_sync_infos() collects everything into a Vec, so we're not + // actually writing to the DB while actively iterating this column + update.add_state_sync_info(StateSyncInfo::V0(state_sync_info.clone())); + // TODO: would be nice to be able to propagate context up the call stack so we can just log + // once at the top with all the info. Otherwise this error will look very cryptic + update.commit()?; + } + Ok(state_sync_info.sync_hash) + } + /// Walks through all the ongoing state syncs for future epochs and processes them pub fn run_catchup( &mut self, @@ -2579,77 +2615,34 @@ impl Client { let block_header = self.chain.get_block(&epoch_first_block)?.header().clone(); let epoch_id = block_header.epoch_id(); - let CatchupState { state_sync, state_downloads, catchup } = if let Some(sync_hash) = - state_sync_info.sync_hash - { - match self.catchup_state_syncs.entry(sync_hash) { - std::collections::hash_map::Entry::Occupied(e) => e.into_mut(), - std::collections::hash_map::Entry::Vacant(e) => { - tracing::debug!(target: "client", ?epoch_first_block, ?sync_hash, "inserting new state sync"); - notify_state_sync = true; - e.insert(CatchupState { - state_sync: StateSync::new( - self.clock.clone(), - self.network_adapter.clone(), - state_sync_timeout, - &self.config.chain_id, - &self.config.state_sync.sync, - true, - ), - state_downloads: shards_to_split.clone(), - catchup: BlocksCatchUpState::new(sync_hash, *epoch_id), - }) - } - } - } else { - let new_chunk_tracker = match self.catchup_tracker.entry(*block_header.epoch_id()) { - std::collections::hash_map::Entry::Occupied(e) => e.into_mut(), - std::collections::hash_map::Entry::Vacant(e) => { - let shard_ids = self.epoch_manager.shard_ids(block_header.epoch_id())?; - e.insert(NewChunkTracker::new( - *block_header.hash(), - block_header.height(), - &shard_ids, - )) - } - }; - if let 
Some(sync_hash) = new_chunk_tracker.find_sync_hash(&self.chain)? { - state_sync_info.sync_hash = Some(sync_hash); - let mut update = self.chain.mut_chain_store().store_update(); - // note that iterate_state_sync_infos() collects everything into a Vec, so we're not - // actually writing to the DB while actively iterating this column - update.add_state_sync_info(StateSyncInfo::V0(state_sync_info.clone())); - // TODO: would be nice to be able to propagate context up the call stack so we can just log - // once at the top with all the info. Otherwise this error will look very cryptic - update.commit()?; + let sync_hash = match self.get_catchup_sync_hash(&mut state_sync_info, &block_header)? { + Some(h) => h, + None => continue, + }; + let CatchupState { state_sync, state_downloads, catchup } = match self + .catchup_state_syncs + .entry(sync_hash) + { + std::collections::hash_map::Entry::Occupied(e) => e.into_mut(), + std::collections::hash_map::Entry::Vacant(e) => { tracing::debug!(target: "client", ?epoch_first_block, ?sync_hash, "inserting new state sync"); notify_state_sync = true; - match self.catchup_state_syncs.entry(sync_hash) { - std::collections::hash_map::Entry::Vacant(e) => e.insert(CatchupState { - state_sync: StateSync::new( - self.clock.clone(), - self.network_adapter.clone(), - state_sync_timeout, - &self.config.chain_id, - &self.config.state_sync.sync, - true, - ), - state_downloads: shards_to_split.clone(), - catchup: BlocksCatchUpState::new(sync_hash, *epoch_id), - }), - std::collections::hash_map::Entry::Occupied(_) => { - panic!("Error occurred during catchup. 
Already stored state sync info for {}", sync_hash); - } - } - } else { - continue; + e.insert(CatchupState { + state_sync: StateSync::new( + self.clock.clone(), + self.network_adapter.clone(), + state_sync_timeout, + &self.config.chain_id, + &self.config.state_sync.sync, + true, + ), + state_downloads: shards_to_split.clone(), + catchup: BlocksCatchUpState::new(sync_hash, *epoch_id), + }) } }; - // Here if the state sync structs are set, it must mean we've found a sync hash to use - let sync_hash = state_sync_info.sync_hash.unwrap(); - // For colour decorators to work, they need to printed directly. Otherwise the decorators get escaped, garble output and don't add colours. debug!(target: "catchup", ?me, ?sync_hash, progress_per_shard = ?format_shard_sync_phase_per_shard(&state_downloads, false), "Catchup"); let use_colour = matches!(self.config.log_summary_style, LogSummaryStyle::Colored); From 0628634e62b109ad844c84145aa00041ba857707 Mon Sep 17 00:00:00 2001 From: Marcelo Diop-Gonzalez Date: Mon, 21 Oct 2024 21:07:29 -0400 Subject: [PATCH 51/70] recator: use or_insert_with() --- chain/client/src/client.rs | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/chain/client/src/client.rs b/chain/client/src/client.rs index e2c6b50cc24..7228fc2162b 100644 --- a/chain/client/src/client.rs +++ b/chain/client/src/client.rs @@ -2620,15 +2620,13 @@ impl Client { None => continue, }; - let CatchupState { state_sync, state_downloads, catchup } = match self + let CatchupState { state_sync, state_downloads, catchup } = self .catchup_state_syncs .entry(sync_hash) - { - std::collections::hash_map::Entry::Occupied(e) => e.into_mut(), - std::collections::hash_map::Entry::Vacant(e) => { + .or_insert_with(|| { tracing::debug!(target: "client", ?epoch_first_block, ?sync_hash, "inserting new state sync"); notify_state_sync = true; - e.insert(CatchupState { + CatchupState { state_sync: StateSync::new( self.clock.clone(), self.network_adapter.clone(), @@ -2639,9 
+2637,8 @@ impl Client { ), state_downloads: shards_to_split.clone(), catchup: BlocksCatchUpState::new(sync_hash, *epoch_id), - }) - } - }; + } + }); // For colour decorators to work, they need to printed directly. Otherwise the decorators get escaped, garble output and don't add colours. debug!(target: "catchup", ?me, ?sync_hash, progress_per_shard = ?format_shard_sync_phase_per_shard(&state_downloads, false), "Catchup"); From 7d903ec97290a3c11e4ec97d03ac2ad32c86c537 Mon Sep 17 00:00:00 2001 From: Marcelo Diop-Gonzalez Date: Tue, 22 Oct 2024 14:46:58 -0400 Subject: [PATCH 52/70] add another enum variant to StateSyncInfo for the new version previous epoch -> V0 current epoch -> V1 --- chain/chain/src/chain.rs | 29 +++----- chain/chain/src/store_validator/validate.rs | 10 +-- chain/client/src/client.rs | 36 ++++++--- core/primitives/src/sharding.rs | 82 +++++++++++++++++++-- core/store/src/migrations.rs | 6 +- 5 files changed, 111 insertions(+), 52 deletions(-) diff --git a/chain/chain/src/chain.rs b/chain/chain/src/chain.rs index dabda03811b..d2f04e84ecc 100644 --- a/chain/chain/src/chain.rs +++ b/chain/chain/src/chain.rs @@ -65,7 +65,7 @@ use near_primitives::sandbox::state_patch::SandboxStatePatch; use near_primitives::shard_layout::{account_id_to_shard_id, ShardLayout, ShardUId}; use near_primitives::sharding::{ ChunkHash, ChunkHashHeight, EncodedShardChunk, ReceiptList, ReceiptProof, ShardChunk, - ShardChunkHeader, ShardInfo, ShardProof, StateSyncInfo, StateSyncInfoV0, + ShardChunkHeader, ShardProof, StateSyncInfo, }; use near_primitives::state_part::PartId; use near_primitives::state_sync::{ @@ -795,25 +795,14 @@ impl Chain { let protocol_version = self .epoch_manager .get_epoch_protocol_version(epoch_first_block.header().epoch_id())?; - let sync_hash = if ProtocolFeature::StateSyncHashUpdate.enabled(protocol_version) { - None - } else { - Some(*epoch_first_block.header().hash()) - }; - // This block is the first block in an epoch because this function is only 
called in get_catchup_and_state_sync_infos() - // when that is the case. - let state_sync_info = StateSyncInfo::V0(StateSyncInfoV0 { - epoch_first_block: *epoch_first_block.header().hash(), - sync_hash, - shards: shards_to_state_sync - .iter() - .map(|shard_id| { - let chunk = &prev_block.chunks()[*shard_id as usize]; - ShardInfo(*shard_id, chunk.chunk_hash()) - }) - .collect(), - }); - + // Note that this block is the first block in an epoch because this function is only called + // in get_catchup_and_state_sync_infos() when that is the case. + let state_sync_info = StateSyncInfo::new( + protocol_version, + *epoch_first_block.header().hash(), + &prev_block, + &shards_to_state_sync, + ); Ok(Some(state_sync_info)) } } diff --git a/chain/chain/src/store_validator/validate.rs b/chain/chain/src/store_validator/validate.rs index 7332e959d0d..279931c1019 100644 --- a/chain/chain/src/store_validator/validate.rs +++ b/chain/chain/src/store_validator/validate.rs @@ -720,15 +720,7 @@ pub(crate) fn state_sync_info_valid( block_hash: &CryptoHash, state_sync_info: &StateSyncInfo, ) -> Result<(), StoreValidatorError> { - match state_sync_info { - StateSyncInfo::V0(state_sync_info) => { - check_discrepancy!( - state_sync_info.epoch_first_block, - *block_hash, - "Invalid StateSyncInfo stored" - ); - } - }; + check_discrepancy!(state_sync_info.block_hash(), block_hash, "Invalid StateSyncInfo stored"); Ok(()) } diff --git a/chain/client/src/client.rs b/chain/client/src/client.rs index 7228fc2162b..2b9eb7e5e2d 100644 --- a/chain/client/src/client.rs +++ b/chain/client/src/client.rs @@ -70,7 +70,7 @@ use near_primitives::network::PeerId; use near_primitives::receipt::Receipt; use near_primitives::sharding::{ EncodedShardChunk, PartialEncodedChunk, ShardChunk, ShardChunkHeader, ShardInfo, StateSyncInfo, - StateSyncInfoV0, + StateSyncInfoV1, }; use near_primitives::transaction::SignedTransaction; use near_primitives::types::chunk_extra::ChunkExtra; @@ -2550,11 +2550,11 @@ impl Client { 
Ok(false) } - // Find the sync hash. Most of the time it will already be set in `state_sync_info`. If not, try to find it, - // and set the corresponding field in `state_sync_info`. - fn get_catchup_sync_hash( + /// Find the sync hash. Most of the time it will already be set in `state_sync_info`. If not, try to find it, + /// and set the corresponding field in `state_sync_info`. + fn get_catchup_sync_hash_v1( &mut self, - state_sync_info: &mut StateSyncInfoV0, + state_sync_info: &mut StateSyncInfoV1, epoch_first_block: &BlockHeader, ) -> Result, Error> { if state_sync_info.sync_hash.is_some() { @@ -2578,7 +2578,7 @@ impl Client { let mut update = self.chain.mut_chain_store().store_update(); // note that iterate_state_sync_infos() collects everything into a Vec, so we're not // actually writing to the DB while actively iterating this column - update.add_state_sync_info(StateSyncInfo::V0(state_sync_info.clone())); + update.add_state_sync_info(StateSyncInfo::V1(state_sync_info.clone())); // TODO: would be nice to be able to propagate context up the call stack so we can just log // once at the top with all the info. Otherwise this error will look very cryptic update.commit()?; @@ -2586,6 +2586,19 @@ impl Client { Ok(state_sync_info.sync_hash) } + /// Find the sync hash. If syncing to the old epoch's state, it's always set. If syncing to + /// the current epoch's state, it might not yet be known, in which case we try to find it. 
+ fn get_catchup_sync_hash( + &mut self, + state_sync_info: &mut StateSyncInfo, + epoch_first_block: &BlockHeader, + ) -> Result, Error> { + match state_sync_info { + StateSyncInfo::V0(info) => Ok(Some(info.sync_hash)), + StateSyncInfo::V1(info) => self.get_catchup_sync_hash_v1(info, epoch_first_block), + } + } + /// Walks through all the ongoing state syncs for future epochs and processes them pub fn run_catchup( &mut self, @@ -2601,11 +2614,10 @@ impl Client { let mut notify_state_sync = false; let me = signer.as_ref().map(|x| x.validator_id().clone()); - for (epoch_first_block, state_sync_info) in + for (epoch_first_block, mut state_sync_info) in self.chain.chain_store().iterate_state_sync_infos()? { - let StateSyncInfo::V0(mut state_sync_info) = state_sync_info; - assert_eq!(epoch_first_block, state_sync_info.epoch_first_block); + assert_eq!(&epoch_first_block, state_sync_info.block_hash()); // I *think* this is not relevant anymore, since we download // already the next epoch's state. @@ -2645,7 +2657,7 @@ impl Client { let use_colour = matches!(self.config.log_summary_style, LogSummaryStyle::Colored); let tracking_shards: Vec = - state_sync_info.shards.iter().map(|tuple| tuple.0).collect(); + state_sync_info.shards().iter().map(|tuple| tuple.0).collect(); // Notify each shard to sync. 
if notify_state_sync { @@ -2733,7 +2745,7 @@ impl Client { fn get_shards_to_split( &mut self, sync_hash: CryptoHash, - state_sync_info: &StateSyncInfoV0, + state_sync_info: &StateSyncInfo, me: &Option, ) -> Result, Error> { let prev_hash = *self.chain.get_block(&sync_hash)?.header().prev_hash(); @@ -2746,7 +2758,7 @@ impl Client { // If the client already has the state for this epoch, skip the downloading phase let shards_to_split = state_sync_info - .shards + .shards() .iter() .filter_map(|ShardInfo(shard_id, _)| self.should_split_shard(shard_id, me, prev_hash)) .collect(); diff --git a/core/primitives/src/sharding.rs b/core/primitives/src/sharding.rs index 806fb06fdf8..322b909f001 100644 --- a/core/primitives/src/sharding.rs +++ b/core/primitives/src/sharding.rs @@ -1,3 +1,4 @@ +use crate::block::Block; use crate::congestion_info::CongestionInfo; use crate::hash::{hash, CryptoHash}; use crate::merkle::{combine_hash, merklize, verify_path, MerklePath}; @@ -59,9 +60,28 @@ impl From for ChunkHash { #[derive(Clone, Debug, PartialEq, BorshSerialize, BorshDeserialize)] pub struct ShardInfo(pub ShardId, pub ChunkHash); -/// Contains the information that is used to sync state for shards as epochs switch +impl ShardInfo { + fn new(prev_block: &Block, shard_id: ShardId) -> Self { + let chunk = &prev_block.chunks()[shard_id as usize]; + Self(shard_id, chunk.chunk_hash()) + } +} + +/// This version of the type is used in the old state sync, where we sync to the state right before the new epoch #[derive(Clone, Debug, PartialEq, BorshSerialize, BorshDeserialize)] pub struct StateSyncInfoV0 { + /// The "sync_hash" block referred to in the state sync algorithm. This is the first block of the + /// epoch we want to state sync for. This field is not strictly required since this struct is keyed + /// by this hash in the database, but it's a small amount of data that makes the info in this type more complete. 
+ pub sync_hash: CryptoHash, + /// Shards to fetch state + pub shards: Vec<ShardInfo>, +} + +/// This version of the type is used when syncing to the current epoch's state, and `sync_hash` is an +/// Option because it is not known at the beginning of the epoch, but only after a few more blocks are produced. +#[derive(Clone, Debug, PartialEq, BorshSerialize, BorshDeserialize)] +pub struct StateSyncInfoV1 { /// The first block of the epoch we want to state sync for. This field is not strictly required since /// this struct is keyed by this hash in the database, but it's a small amount of data that makes /// the info in this type more complete. @@ -69,10 +89,8 @@ pub struct StateSyncInfoV0 { /// The block we'll use as the "sync_hash" when state syncing. Previously, state sync /// used the first block of an epoch as the "sync_hash", and synced state to the epoch before. /// Now that state sync downloads the state of the current epoch, we need to wait a few blocks - /// after applying the first block in an epoch to know what "sync_hash" we'll use. - /// After we begin state syncing to the current epoch instead of the previous, this field is set to None - /// when we apply the first block. Before this change in the state sync protocol, we set this field - /// to Some(self.epoch_first_block). In either case, if it's set, that's the sync hash we want to use. + /// after applying the first block in an epoch to know what "sync_hash" we'll use, so this field + /// is first set to None until we find the right "sync_hash". pub sync_hash: Option<CryptoHash>, /// Shards to fetch state pub shards: Vec<ShardInfo>, @@ -85,14 +103,66 @@ pub struct StateSyncInfoV0 { /// in order to allow for this change in the future without needing another database migration. 
#[derive(Clone, Debug, PartialEq, BorshSerialize, BorshDeserialize)] pub enum StateSyncInfo { + /// Old state sync: sync to the state right before the new epoch V0(StateSyncInfoV0), + /// New state sync: sync to the state right after the new epoch + V1(StateSyncInfoV1), +} + +fn shard_infos(prev_block: &Block, shards: &[ShardId]) -> Vec { + shards.iter().map(|shard_id| ShardInfo::new(prev_block, *shard_id)).collect() } impl StateSyncInfo { + pub fn new_previous_epoch( + epoch_first_block: CryptoHash, + prev_block: &Block, + shards: &[ShardId], + ) -> Self { + Self::V0(StateSyncInfoV0 { + sync_hash: epoch_first_block, + shards: shard_infos(prev_block, shards), + }) + } + + fn new_current_epoch( + epoch_first_block: CryptoHash, + prev_block: &Block, + shards: &[ShardId], + ) -> Self { + Self::V1(StateSyncInfoV1 { + epoch_first_block, + sync_hash: None, + shards: shard_infos(prev_block, shards), + }) + } + + pub fn new( + protocol_version: ProtocolVersion, + epoch_first_block: CryptoHash, + prev_block: &Block, + shards: &[ShardId], + ) -> Self { + if ProtocolFeature::StateSyncHashUpdate.enabled(protocol_version) { + Self::new_current_epoch(epoch_first_block, prev_block, shards) + } else { + Self::new_previous_epoch(epoch_first_block, prev_block, shards) + } + } + + // TODO: rename /// Block hash that identifies this state sync struct on disk pub fn block_hash(&self) -> &CryptoHash { match self { - Self::V0(info) => &info.epoch_first_block, + Self::V0(info) => &info.sync_hash, + Self::V1(info) => &info.epoch_first_block, + } + } + + pub fn shards(&self) -> &[ShardInfo] { + match self { + Self::V0(info) => &info.shards, + Self::V1(info) => &info.shards, } } } diff --git a/core/store/src/migrations.rs b/core/store/src/migrations.rs index 787903885b0..6c551dc23ba 100644 --- a/core/store/src/migrations.rs +++ b/core/store/src/migrations.rs @@ -381,11 +381,7 @@ pub fn migrate_40_to_41(store: &Store) -> anyhow::Result<()> { if epoch_first_block != sync_hash { 
tracing::warn!(key = %epoch_first_block, %sync_hash, "sync_hash field of legacy StateSyncInfo not equal to the key. Something is wrong with this node's catchup info"); } - let new_info = StateSyncInfo::V0(StateSyncInfoV0 { - epoch_first_block, - sync_hash: Some(sync_hash), - shards, - }); + let new_info = StateSyncInfo::V0(StateSyncInfoV0 { sync_hash, shards }); update .set_ser(DBCol::StateDlInfos, &key, &new_info) .context("failed writing to StateDlInfos")?; From ba3397a843a7f482a95d2120da5dc36fd954a121 Mon Sep 17 00:00:00 2001 From: Marcelo Diop-Gonzalez Date: Tue, 22 Oct 2024 14:50:28 -0400 Subject: [PATCH 53/70] rename StateSyncInfo::block_hash() -> StateSyncInfo::epoch_first_block() more clear what is being talked about --- chain/chain/src/store/mod.rs | 2 +- chain/chain/src/store_validator/validate.rs | 6 +++++- chain/client/src/client.rs | 2 +- core/primitives/src/sharding.rs | 2 +- 4 files changed, 8 insertions(+), 4 deletions(-) diff --git a/chain/chain/src/store/mod.rs b/chain/chain/src/store/mod.rs index 88bce08f05d..a1589c06254 100644 --- a/chain/chain/src/store/mod.rs +++ b/chain/chain/src/store/mod.rs @@ -2551,7 +2551,7 @@ impl<'a> ChainStoreUpdate<'a> { for state_sync_info in self.add_state_sync_infos.drain(..) 
{ store_update.set_ser( DBCol::StateDlInfos, - state_sync_info.block_hash().as_ref(), + state_sync_info.epoch_first_block().as_ref(), &state_sync_info, )?; } diff --git a/chain/chain/src/store_validator/validate.rs b/chain/chain/src/store_validator/validate.rs index 279931c1019..0f6f3dc634a 100644 --- a/chain/chain/src/store_validator/validate.rs +++ b/chain/chain/src/store_validator/validate.rs @@ -720,7 +720,11 @@ pub(crate) fn state_sync_info_valid( block_hash: &CryptoHash, state_sync_info: &StateSyncInfo, ) -> Result<(), StoreValidatorError> { - check_discrepancy!(state_sync_info.block_hash(), block_hash, "Invalid StateSyncInfo stored"); + check_discrepancy!( + state_sync_info.epoch_first_block(), + block_hash, + "Invalid StateSyncInfo stored" + ); Ok(()) } diff --git a/chain/client/src/client.rs b/chain/client/src/client.rs index 2b9eb7e5e2d..a3f4a315c16 100644 --- a/chain/client/src/client.rs +++ b/chain/client/src/client.rs @@ -2617,7 +2617,7 @@ impl Client { for (epoch_first_block, mut state_sync_info) in self.chain.chain_store().iterate_state_sync_infos()? { - assert_eq!(&epoch_first_block, state_sync_info.block_hash()); + assert_eq!(&epoch_first_block, state_sync_info.epoch_first_block()); // I *think* this is not relevant anymore, since we download // already the next epoch's state. 
diff --git a/core/primitives/src/sharding.rs b/core/primitives/src/sharding.rs index 322b909f001..514527070ec 100644 --- a/core/primitives/src/sharding.rs +++ b/core/primitives/src/sharding.rs @@ -152,7 +152,7 @@ impl StateSyncInfo { // TODO: rename /// Block hash that identifies this state sync struct on disk - pub fn block_hash(&self) -> &CryptoHash { + pub fn epoch_first_block(&self) -> &CryptoHash { match self { Self::V0(info) => &info.sync_hash, Self::V1(info) => &info.epoch_first_block, From 78497d4e9b0873c79a8fc123a630b670419cb088 Mon Sep 17 00:00:00 2001 From: Marcelo Diop-Gonzalez Date: Tue, 22 Oct 2024 14:57:34 -0400 Subject: [PATCH 54/70] add comments --- chain/chain/src/chain.rs | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/chain/chain/src/chain.rs b/chain/chain/src/chain.rs index d2f04e84ecc..5c37839a09d 100644 --- a/chain/chain/src/chain.rs +++ b/chain/chain/src/chain.rs @@ -2427,6 +2427,11 @@ impl Chain { } // TODO(current_epoch_state_sync): move state sync related code to state sync files + /// Find the hash that should be used as the reference point when requesting state sync + /// headers and parts from other nodes for the epoch the block with hash `block_hash` belongs to. + /// If syncing to the state of that epoch (the new way), this block hash might not yet be known, + /// in which case this returns None. If syncing to the state of the previous epoch (the old way), + /// it's the hash of the first block in that epoch. pub fn get_sync_hash(&self, block_hash: &CryptoHash) -> Result, Error> { let header = self.get_block_header(block_hash)?; let protocol_version = self.epoch_manager.get_epoch_protocol_version(header.epoch_id())?; @@ -2437,12 +2442,14 @@ impl Chain { } } + /// Find the hash of the first block in the epoch the block with hash `block_hash` belongs to. 
+ /// This is the "sync_hash" that will be used as a reference point when state syncing the previous epoch's state fn get_previous_epoch_sync_hash(&self, block_hash: &CryptoHash) -> Result { Ok(*self.epoch_manager.get_block_info(block_hash)?.epoch_first_block()) } - // Returns the first block for which at least two new chunks have been produced for every shard in the epoch - // `next_header` is the next block header that is being processed in start_process_block_impl() + /// Returns the first block for which at least two new chunks have been produced for every shard in the epoch + /// This is the "sync_hash" that will be used as a reference point when state syncing the current epoch's state fn get_current_epoch_sync_hash( &self, block_hash: &CryptoHash, From 98b6f6da8410c786fbceb8848141f7019954f8bb Mon Sep 17 00:00:00 2001 From: Marcelo Diop-Gonzalez Date: Tue, 22 Oct 2024 15:00:23 -0400 Subject: [PATCH 55/70] match nit --- chain/chain/src/chain.rs | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/chain/chain/src/chain.rs b/chain/chain/src/chain.rs index 5c37839a09d..8b06f7c8718 100644 --- a/chain/chain/src/chain.rs +++ b/chain/chain/src/chain.rs @@ -2471,17 +2471,14 @@ impl Chain { let mut done = true; for (shard_id, num_new_chunks) in num_new_chunks.iter_mut() { - let included = match header.chunk_mask().get(*shard_id as usize) { - Some(i) => *i, - None => { - return Err(Error::Other(format!( - "can't get shard {} in chunk mask for block {}", - shard_id, - header.hash() - ))); - } + let Some(included) = header.chunk_mask().get(*shard_id as usize) else { + return Err(Error::Other(format!( + "can't get shard {} in chunk mask for block {}", + shard_id, + header.hash() + ))); }; - if included { + if *included { *num_new_chunks += 1; } if *num_new_chunks < 2 { From f4d4866ef9cbb42f6572eca55ec56562b8f5755b Mon Sep 17 00:00:00 2001 From: Marcelo Diop-Gonzalez Date: Tue, 22 Oct 2024 15:02:06 -0400 Subject: [PATCH 56/70] rename block -> 
prev_block --- chain/chain/src/chain.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/chain/chain/src/chain.rs b/chain/chain/src/chain.rs index 8b06f7c8718..d751e394676 100644 --- a/chain/chain/src/chain.rs +++ b/chain/chain/src/chain.rs @@ -3864,21 +3864,21 @@ impl Chain { let Some(snapshot_callbacks) = &self.snapshot_callbacks else { return Ok(()) }; match snapshot_action { SnapshotAction::MakeSnapshot(prev_hash) => { - let block = self.get_block(&prev_hash)?; + let prev_block = self.get_block(&prev_hash)?; let epoch_height = self .epoch_manager - .get_epoch_height_from_prev_block(block.header().prev_hash())?; + .get_epoch_height_from_prev_block(prev_block.header().prev_hash())?; let shard_layout = &self .epoch_manager - .get_shard_layout_from_prev_block(block.header().prev_hash())?; + .get_shard_layout_from_prev_block(prev_block.header().prev_hash())?; let shard_uids = shard_layout.shard_uids().collect(); let make_snapshot_callback = &snapshot_callbacks.make_snapshot_callback; make_snapshot_callback( - *block.header().prev_hash(), + *prev_block.header().prev_hash(), epoch_height, shard_uids, - block, + prev_block, ); } SnapshotAction::DeleteSnapshot => { From dc8e3db424ce8a523d424a5de759dba1574f370c Mon Sep 17 00:00:00 2001 From: Marcelo Diop-Gonzalez Date: Tue, 22 Oct 2024 15:04:22 -0400 Subject: [PATCH 57/70] save prev_prev_hash variable --- chain/chain/src/chain.rs | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/chain/chain/src/chain.rs b/chain/chain/src/chain.rs index d751e394676..8eb92082d07 100644 --- a/chain/chain/src/chain.rs +++ b/chain/chain/src/chain.rs @@ -3865,21 +3865,15 @@ impl Chain { match snapshot_action { SnapshotAction::MakeSnapshot(prev_hash) => { let prev_block = self.get_block(&prev_hash)?; - let epoch_height = self - .epoch_manager - .get_epoch_height_from_prev_block(prev_block.header().prev_hash())?; - let shard_layout = &self - .epoch_manager - 
.get_shard_layout_from_prev_block(prev_block.header().prev_hash())?; + let prev_prev_hash = prev_block.header().prev_hash(); + let epoch_height = + self.epoch_manager.get_epoch_height_from_prev_block(prev_prev_hash)?; + let shard_layout = + &self.epoch_manager.get_shard_layout_from_prev_block(prev_prev_hash)?; let shard_uids = shard_layout.shard_uids().collect(); let make_snapshot_callback = &snapshot_callbacks.make_snapshot_callback; - make_snapshot_callback( - *prev_block.header().prev_hash(), - epoch_height, - shard_uids, - prev_block, - ); + make_snapshot_callback(*prev_prev_hash, epoch_height, shard_uids, prev_block); } SnapshotAction::DeleteSnapshot => { let delete_snapshot_callback = &snapshot_callbacks.delete_snapshot_callback; From 0f79f7fde820901bf3fd59cc7506ce850b7816bf Mon Sep 17 00:00:00 2001 From: Marcelo Diop-Gonzalez Date: Tue, 22 Oct 2024 15:13:37 -0400 Subject: [PATCH 58/70] FIXME -> TODO --- chain/chain/src/chain.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/chain/chain/src/chain.rs b/chain/chain/src/chain.rs index 8eb92082d07..19e21f380e0 100644 --- a/chain/chain/src/chain.rs +++ b/chain/chain/src/chain.rs @@ -3937,7 +3937,7 @@ impl Chain { Ok(SnapshotAction::None) } } else { - // FIXME: this needs to be fixed. can't be iterating over the whole chain inside of preprocess + // TODO(current_epoch_state_sync): this needs to be fixed. can't be iterating over the whole chain inside of preprocess // block like that if there are many missed chunks match self.get_current_epoch_sync_hash(&head.last_block_hash)? 
{ Some(sync_hash) => { From c48db128dd47e9ae3a5ecfb98d404585812544d5 Mon Sep 17 00:00:00 2001 From: Marcelo Diop-Gonzalez Date: Tue, 22 Oct 2024 15:17:16 -0400 Subject: [PATCH 59/70] match nit --- chain/chain/src/chain.rs | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/chain/chain/src/chain.rs b/chain/chain/src/chain.rs index 19e21f380e0..25983ff97c0 100644 --- a/chain/chain/src/chain.rs +++ b/chain/chain/src/chain.rs @@ -3939,17 +3939,17 @@ impl Chain { } else { // TODO(current_epoch_state_sync): this needs to be fixed. can't be iterating over the whole chain inside of preprocess // block like that if there are many missed chunks - match self.get_current_epoch_sync_hash(&head.last_block_hash)? { - Some(sync_hash) => { - if sync_hash == head.last_block_hash { - // note that here we're returning prev_block_hash instead of last_block_hash because in this case - // we can't detect the right sync hash until it is actually applied as the head block - Ok(SnapshotAction::MakeSnapshot(head.prev_block_hash)) - } else { - Ok(SnapshotAction::None) - } - } - None => Ok(SnapshotAction::None), + let Some(sync_hash) = + self.get_current_epoch_sync_hash(&head.last_block_hash)? 
+ else { + return Ok(SnapshotAction::None); + }; + if sync_hash == head.last_block_hash { + // note that here we're returning prev_block_hash instead of last_block_hash because in this case + // we can't detect the right sync hash until it is actually applied as the head block + Ok(SnapshotAction::MakeSnapshot(head.prev_block_hash)) + } else { + Ok(SnapshotAction::None) } } } From 720e2c5cb83d9ae8a9fd27ad481a8d208ac8e5c9 Mon Sep 17 00:00:00 2001 From: Marcelo Diop-Gonzalez Date: Wed, 23 Oct 2024 00:39:28 -0400 Subject: [PATCH 60/70] nits --- chain/client/src/client.rs | 4 ++-- chain/client/src/client_actor.rs | 2 +- core/primitives/src/sharding.rs | 1 - 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/chain/client/src/client.rs b/chain/client/src/client.rs index 8ee2de9b1a0..a45f013b197 100644 --- a/chain/client/src/client.rs +++ b/chain/client/src/client.rs @@ -2887,11 +2887,11 @@ impl Client { impl Client { pub fn get_catchup_status(&self) -> Result, near_chain::Error> { let mut ret = vec![]; - for (sync_hash, CatchupState { sync_status: status, catchup, .. }) in + for (sync_hash, CatchupState { sync_status, catchup, .. }) in self.catchup_state_syncs.iter() { let sync_block_height = self.chain.get_block_header(sync_hash)?.height(); - let shard_sync_status: HashMap<_, _> = status + let shard_sync_status: HashMap<_, _> = sync_status .sync_status .iter() .map(|(shard_id, state)| (*shard_id, state.to_string())) diff --git a/chain/client/src/client_actor.rs b/chain/client/src/client_actor.rs index f7ac36b3e59..e8117fdd9ff 100644 --- a/chain/client/src/client_actor.rs +++ b/chain/client/src/client_actor.rs @@ -1575,7 +1575,7 @@ impl ClientActorInner { /// /// The selected block will always be the first block on a new epoch: /// . 
- pub fn find_sync_hash(&self) -> Result<Option<CryptoHash>, near_chain::Error> { + fn find_sync_hash(&self) -> Result<Option<CryptoHash>, near_chain::Error> { let header_head = self.client.chain.header_head()?; let sync_hash = match self.client.chain.get_sync_hash(&header_head.last_block_hash)? { Some(h) => h, diff --git a/core/primitives/src/sharding.rs b/core/primitives/src/sharding.rs index 0f240747a5e..f259a56f803 100644 --- a/core/primitives/src/sharding.rs +++ b/core/primitives/src/sharding.rs @@ -159,7 +159,6 @@ impl StateSyncInfo { } } - // TODO: rename /// Block hash that identifies this state sync struct on disk pub fn epoch_first_block(&self) -> &CryptoHash { match self { From 920156aa802b90a8d798388637efd37ca44bb9f3 Mon Sep 17 00:00:00 2001 From: Marcelo Diop-Gonzalez Date: Wed, 23 Oct 2024 17:29:12 -0400 Subject: [PATCH 61/70] add TODO --- chain/chain/src/chain.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/chain/chain/src/chain.rs b/chain/chain/src/chain.rs index 28aee3aff7f..7f6fe139642 100644 --- a/chain/chain/src/chain.rs +++ b/chain/chain/src/chain.rs @@ -2511,6 +2511,9 @@ impl Chain { /// Returns the first block for which at least two new chunks have been produced for every shard in the epoch /// This is the "sync_hash" that will be used as a reference point when state syncing the current epoch's state + // TODO(current_epoch_state_sync): remove this and replace it with a single more efficient function that somehow + // keeps track of the number of new chunks per shard so far, like in `NewChunkTracker::find_sync_hash()`, and then + // only implement this in one place. fn get_current_epoch_sync_hash( &self, block_hash: &CryptoHash, From db3e2cb730bbcc496472f7cfa524d4d0fa429687 Mon Sep 17 00:00:00 2001 From: Marcelo Diop-Gonzalez Date: Wed, 23 Oct 2024 20:06:01 -0400 Subject: [PATCH 62/70] migrate test_state_request to test loop This test fails because the mock epoch manager does not implement get_block_info(), which is needed in the new implementation of get_sync_hash(). 
Instead of trying to fix it, just move the test to test loop --- chain/client/src/tests/query_client.rs | 58 ------------------- .../src/test_loop/tests/state_sync.rs | 55 +++++++++++++++++- 2 files changed, 53 insertions(+), 60 deletions(-) diff --git a/chain/client/src/tests/query_client.rs b/chain/client/src/tests/query_client.rs index 12cbd1df494..aca779d24a2 100644 --- a/chain/client/src/tests/query_client.rs +++ b/chain/client/src/tests/query_client.rs @@ -217,61 +217,3 @@ fn test_execution_outcome_for_chunk() { near_network::test_utils::wait_or_panic(5000); }); } - -#[test] -fn test_state_request() { - run_actix(async { - let vs = - ValidatorSchedule::new().block_producers_per_epoch(vec![vec!["test".parse().unwrap()]]); - let view_client = setup_only_view( - Clock::real(), - vs, - 10000000, - "test".parse().unwrap(), - true, - 200, - 400, - false, - true, - true, - MockPeerManagerAdapter::default().into_multi_sender(), - 100, - ); - actix::spawn(async move { - actix::clock::sleep(std::time::Duration::from_millis(500)).await; - let block_hash = view_client - .send(GetBlock(BlockReference::BlockId(BlockId::Height(0))).with_span_context()) - .await - .unwrap() - .unwrap() - .header - .hash; - for _ in 0..30 { - let res = view_client - .send( - StateRequestHeader { shard_id: ShardId::new(0), sync_hash: block_hash } - .with_span_context(), - ) - .await - .unwrap(); - assert!(res.is_some()); - } - - // immediately query again, should be rejected - let shard_id = ShardId::new(0); - let res = view_client - .send(StateRequestHeader { shard_id, sync_hash: block_hash }.with_span_context()) - .await - .unwrap(); - assert!(res.is_none()); - actix::clock::sleep(std::time::Duration::from_secs(40)).await; - let res = view_client - .send(StateRequestHeader { shard_id, sync_hash: block_hash }.with_span_context()) - .await - .unwrap(); - assert!(res.is_some()); - System::current().stop(); - }); - near_network::test_utils::wait_or_panic(50000); - }); -} diff --git 
a/integration-tests/src/test_loop/tests/state_sync.rs b/integration-tests/src/test_loop/tests/state_sync.rs index d108a933e1b..94d79167379 100644 --- a/integration-tests/src/test_loop/tests/state_sync.rs +++ b/integration-tests/src/test_loop/tests/state_sync.rs @@ -1,9 +1,10 @@ -use near_async::messaging::SendAsync; +use near_async::messaging::{Handler, SendAsync}; use near_async::test_loop::TestLoopV2; use near_async::time::Duration; use near_chain_configs::test_genesis::TestGenesisBuilder; -use near_network::client::ProcessTxRequest; +use near_network::client::{ProcessTxRequest, StateRequestHeader}; use near_o11y::testonly::init_test_logger; +use near_primitives::hash::CryptoHash; use near_primitives::test_utils::create_user_test_signer; use near_primitives::transaction::SignedTransaction; use near_primitives::types::{AccountId, AccountInfo, BlockHeightDelta, Nonce, NumSeats, ShardId}; @@ -306,3 +307,53 @@ fn test_state_sync_current_epoch() { run_test(state); } } + +fn spam_state_sync_header_reqs(env: &mut TestLoopEnv, sync_hash: CryptoHash) { + let view_client_handle = env.datas[0].view_client_sender.actor_handle(); + let view_client = env.test_loop.data.get_mut(&view_client_handle); + + for _ in 0..30 { + let res = view_client.handle(StateRequestHeader { shard_id: ShardId::new(0), sync_hash }); + assert!(res.is_some()); + } + + // immediately query again, should be rejected + let shard_id = ShardId::new(0); + let res = view_client.handle(StateRequestHeader { shard_id, sync_hash }); + assert!(res.is_none()); + + env.test_loop.run_for(Duration::seconds(40)); + + let view_client_handle = env.datas[0].view_client_sender.actor_handle(); + let view_client = env.test_loop.data.get_mut(&view_client_handle); + + let res = view_client.handle(StateRequestHeader { shard_id, sync_hash }); + assert!(res.is_some()); +} + +#[test] +fn test_state_request() { + init_test_logger(); + + let TestState { mut env, .. 
} = setup_initial_blockchain(4, HashMap::default()); + + env.test_loop.run_until( + |data| { + let handle = env.datas[0].client_sender.actor_handle(); + let client = &data.get(&handle).client; + let tip = client.chain.head().unwrap(); + if tip.epoch_id == Default::default() { + return false; + } + client.chain.get_sync_hash(&tip.last_block_hash).unwrap().is_some() + }, + Duration::seconds(20), + ); + let client_handle = env.datas[0].client_sender.actor_handle(); + let client = &env.test_loop.data.get(&client_handle).client; + let tip = client.chain.head().unwrap(); + let sync_hash = client.chain.get_sync_hash(&tip.last_block_hash).unwrap().unwrap(); + + spam_state_sync_header_reqs(&mut env, sync_hash); + env.shutdown_and_drain_remaining_events(Duration::seconds(3)); +} From 9991c89c9295874d853de15b48abff1a30d4e072 Mon Sep 17 00:00:00 2001 From: Marcelo Diop-Gonzalez Date: Wed, 23 Oct 2024 22:36:09 -0400 Subject: [PATCH 63/70] return None in get_sync_hash() for the genesis block Otherwise this might return an error when looking up the prev header in get_current_epoch_sync_hash(), and we don't want to sync state from before the genesis anyway --- chain/chain/src/chain.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/chain/chain/src/chain.rs b/chain/chain/src/chain.rs index 7f6fe139642..6037398a228 100644 --- a/chain/chain/src/chain.rs +++ b/chain/chain/src/chain.rs @@ -2494,6 +2494,10 @@ impl Chain { /// in which case this returns None. If syncing to the state of the previous epoch (the old way), /// it's the hash of the first block in that epoch. 
pub fn get_sync_hash(&self, block_hash: &CryptoHash) -> Result<Option<CryptoHash>, Error> { + if block_hash == self.genesis().hash() { + // We shouldn't be trying to sync state from before the genesis block + return Ok(None); + } let header = self.get_block_header(block_hash)?; let protocol_version = self.epoch_manager.get_epoch_protocol_version(header.epoch_id())?; if ProtocolFeature::StateSyncHashUpdate.enabled(protocol_version) { From 99f98a5f64edabb9dfb4bc461f95da93c4569d5e Mon Sep 17 00:00:00 2001 From: Marcelo Diop-Gonzalez Date: Wed, 23 Oct 2024 22:44:16 -0400 Subject: [PATCH 64/70] Don't call check_sync_hash_validity() for the genesis block in test_sync_hash_validity() The behavior has changed to return false for the genesis block, but getting the sync hash corresponding to the genesis block is not something we ever want to do, since we're not going to be syncing that state, and there's an assertion that checks we don't use the genesis hash as a sync hash in ClientActorInner::find_sync_hash() anyway. 
--- integration-tests/src/tests/client/process_blocks.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/integration-tests/src/tests/client/process_blocks.rs b/integration-tests/src/tests/client/process_blocks.rs index 62216609a9c..dc786ede17e 100644 --- a/integration-tests/src/tests/client/process_blocks.rs +++ b/integration-tests/src/tests/client/process_blocks.rs @@ -2151,6 +2151,7 @@ fn test_data_reset_before_state_sync() { #[test] fn test_sync_hash_validity() { + init_test_logger(); let epoch_length = 5; let mut genesis = Genesis::test(vec!["test0".parse().unwrap(), "test1".parse().unwrap()], 1); genesis.config.epoch_length = epoch_length; @@ -2158,11 +2159,11 @@ fn test_sync_hash_validity() { for i in 1..19 { env.produce_block(0, i); } - for i in 0..19 { + for i in 1..19 { let block_hash = *env.clients[0].chain.get_block_header_by_height(i).unwrap().hash(); let res = env.clients[0].chain.check_sync_hash_validity(&block_hash); println!("height {:?} -> {:?}", i, res); - assert_eq!(res.unwrap(), i == 0 || (i % epoch_length) == 1); + assert_eq!(res.unwrap(), (i % epoch_length) == 1); } let bad_hash = CryptoHash::from_str("7tkzFg8RHBmMw1ncRJZCCZAizgq4rwCftTKYLce8RU8t").unwrap(); let res = env.clients[0].chain.check_sync_hash_validity(&bad_hash); From 66f30f1f28fc57491e9ccf56f45da64c5dfd6323 Mon Sep 17 00:00:00 2001 From: Marcelo Diop-Gonzalez Date: Wed, 23 Oct 2024 23:04:40 -0400 Subject: [PATCH 65/70] fix test_sync_hash_validity for nightly builds --- integration-tests/src/tests/client/process_blocks.rs | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/integration-tests/src/tests/client/process_blocks.rs b/integration-tests/src/tests/client/process_blocks.rs index dc786ede17e..3bbca48a129 100644 --- a/integration-tests/src/tests/client/process_blocks.rs +++ b/integration-tests/src/tests/client/process_blocks.rs @@ -2161,9 +2161,15 @@ fn test_sync_hash_validity() { } for i in 1..19 { let block_hash = 
*env.clients[0].chain.get_block_header_by_height(i).unwrap().hash(); - let res = env.clients[0].chain.check_sync_hash_validity(&block_hash); - println!("height {:?} -> {:?}", i, res); - assert_eq!(res.unwrap(), (i % epoch_length) == 1); + let valid = env.clients[0].chain.check_sync_hash_validity(&block_hash).unwrap(); + println!("height {} -> {}", i, valid); + if ProtocolFeature::StateSyncHashUpdate.enabled(PROTOCOL_VERSION) { + // This assumes that all shards have new chunks in every block, which should be true + // with TestEnv::produce_block() + assert_eq!(valid, (i % epoch_length) == 3); + } else { + assert_eq!(valid, (i % epoch_length) == 1); + } } let bad_hash = CryptoHash::from_str("7tkzFg8RHBmMw1ncRJZCCZAizgq4rwCftTKYLce8RU8t").unwrap(); let res = env.clients[0].chain.check_sync_hash_validity(&bad_hash); From 902e90a2a80ca332882f9dc6e36fe09a7de474a0 Mon Sep 17 00:00:00 2001 From: Marcelo Diop-Gonzalez Date: Wed, 23 Oct 2024 23:34:59 -0400 Subject: [PATCH 66/70] rm unused uses --- chain/client/src/tests/query_client.rs | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/chain/client/src/tests/query_client.rs b/chain/client/src/tests/query_client.rs index aca779d24a2..a2ce1c4f997 100644 --- a/chain/client/src/tests/query_client.rs +++ b/chain/client/src/tests/query_client.rs @@ -1,18 +1,13 @@ -use crate::test_utils::{setup_no_network, setup_only_view}; +use crate::test_utils::setup_no_network; use crate::{ GetBlock, GetBlockWithMerkleTree, GetExecutionOutcomesForBlock, Query, Status, TxStatus, }; use actix::System; use futures::{future, FutureExt}; use near_actix_test_utils::run_actix; -use near_async::messaging::IntoMultiSender; use near_async::time::{Clock, Duration}; -use near_chain::test_utils::ValidatorSchedule; use near_crypto::{InMemorySigner, KeyType}; -use near_network::client::{ - BlockResponse, ProcessTxRequest, ProcessTxResponse, StateRequestHeader, -}; -use near_network::test_utils::MockPeerManagerAdapter; +use 
near_network::client::{BlockResponse, ProcessTxRequest, ProcessTxResponse}; use near_network::types::PeerInfo; use near_o11y::testonly::init_test_logger; use near_o11y::WithSpanContextExt; @@ -20,7 +15,7 @@ use near_primitives::block::{Block, BlockHeader}; use near_primitives::merkle::PartialMerkleTree; use near_primitives::test_utils::create_test_signer; use near_primitives::transaction::SignedTransaction; -use near_primitives::types::{BlockId, BlockReference, EpochId, ShardId}; +use near_primitives::types::{BlockReference, EpochId, ShardId}; use near_primitives::version::PROTOCOL_VERSION; use near_primitives::views::{QueryRequest, QueryResponseKind}; use num_rational::Ratio; From 752f9e8af098395d286f0cf585546b561846cedf Mon Sep 17 00:00:00 2001 From: Marcelo Diop-Gonzalez Date: Thu, 24 Oct 2024 22:03:25 -0400 Subject: [PATCH 67/70] fix test_process_block_after_state_sync --- .../src/tests/client/process_blocks.rs | 37 +++++++++++++++---- 1 file changed, 29 insertions(+), 8 deletions(-) diff --git a/integration-tests/src/tests/client/process_blocks.rs b/integration-tests/src/tests/client/process_blocks.rs index 3bbca48a129..38d3899c44d 100644 --- a/integration-tests/src/tests/client/process_blocks.rs +++ b/integration-tests/src/tests/client/process_blocks.rs @@ -1670,12 +1670,33 @@ fn test_process_block_after_state_sync() { .nightshade_runtimes(&genesis) .build(); - let sync_height = epoch_length * 4 + 1; - for i in 1..=sync_height { - env.produce_block(0, i); - } - let sync_block = env.clients[0].chain.get_block_by_height(sync_height).unwrap(); - let sync_hash = *sync_block.hash(); + let mut sync_hash_attempts = 0; + let mut next_height = 1; + let sync_hash = loop { + let block = env.clients[0].produce_block(next_height).unwrap().unwrap(); + let block_hash = *block.hash(); + let prev_hash = *block.header().prev_hash(); + env.process_block(0, block, Provenance::PRODUCED); + next_height += 1; + + let epoch_height = + 
env.clients[0].epoch_manager.get_epoch_height_from_prev_block(&prev_hash).unwrap(); + if epoch_height < 4 { + continue; + } + let Some(sync_hash) = env.clients[0].chain.get_sync_hash(&block_hash).unwrap() else { + sync_hash_attempts += 1; + // This should not happen, but we guard against it defensively so we don't have some infinite loop in + // case of a bug + assert!(sync_hash_attempts <= 2, "sync_hash_attempts: {}", sync_hash_attempts); + continue; + }; + // Produce one more block after the sync hash is found so that the snapshot will be created + if sync_hash != block_hash { + break sync_hash; + } + }; + let sync_block = env.clients[0].chain.get_block(&sync_hash).unwrap(); let shard_id = ShardId::new(0); let header = env.clients[0].chain.compute_state_response_header(shard_id, sync_hash).unwrap(); @@ -1688,7 +1709,7 @@ fn test_process_block_after_state_sync() { .obtain_state_part(shard_id, &sync_prev_prev_hash, &state_root, PartId::new(0, 1)) .unwrap(); // reset cache - for i in epoch_length * 3 - 1..sync_height - 1 { + for i in epoch_length * 3 - 1..sync_block.header().height() - 1 { let block_hash = *env.clients[0].chain.get_block_by_height(i).unwrap().hash(); assert!(env.clients[0].chain.epoch_manager.get_epoch_start_height(&block_hash).is_ok()); } @@ -1698,7 +1719,7 @@ fn test_process_block_after_state_sync() { .runtime_adapter .apply_state_part(shard_id, &state_root, PartId::new(0, 1), &state_part, &epoch_id) .unwrap(); - let block = env.clients[0].produce_block(sync_height + 1).unwrap().unwrap(); + let block = env.clients[0].produce_block(next_height).unwrap().unwrap(); env.clients[0].process_block_test(block.into(), Provenance::PRODUCED).unwrap(); } From 598f49c1795c313df32cf79ddc8e6e0457ee1a56 Mon Sep 17 00:00:00 2001 From: Marcelo Diop-Gonzalez Date: Fri, 25 Oct 2024 20:09:36 -0400 Subject: [PATCH 68/70] don't return a sync hash for the first epoch in the old state sync (previous epoch) --- chain/chain/src/chain.rs | 4 ++++ 1 file changed, 4 
insertions(+) diff --git a/chain/chain/src/chain.rs b/chain/chain/src/chain.rs index 6037398a228..83fd50f13f9 100644 --- a/chain/chain/src/chain.rs +++ b/chain/chain/src/chain.rs @@ -2503,6 +2503,10 @@ impl Chain { if ProtocolFeature::StateSyncHashUpdate.enabled(protocol_version) { self.get_current_epoch_sync_hash(block_hash) } else { + // In the first epoch, it doesn't make sense to sync state to the previous epoch. + if header.epoch_id() == &EpochId::default() { + return Ok(None); + } self.get_previous_epoch_sync_hash(block_hash).map(Some) } } From 5133b31238a6b4cbbf550bfae070b3f6988835a2 Mon Sep 17 00:00:00 2001 From: Marcelo Diop-Gonzalez Date: Fri, 25 Oct 2024 20:15:00 -0400 Subject: [PATCH 69/70] fix state_parts_dump_check.py The node will actually dump state in the first epoch now with current epoch state sync --- pytest/tests/sanity/state_parts_dump_check.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/pytest/tests/sanity/state_parts_dump_check.py b/pytest/tests/sanity/state_parts_dump_check.py index 815dd4881db..77e576b4792 100644 --- a/pytest/tests/sanity/state_parts_dump_check.py +++ b/pytest/tests/sanity/state_parts_dump_check.py @@ -91,14 +91,6 @@ def main(): val for metric, val in metrics.items() if 'state_sync_dump_check_process_is_up' in metric ]) == NUM_SHARDS, f"Dumper process missing for some shards. {metrics}" - assert sum([ - val for metric, val in metrics.items() - if 'state_sync_dump_check_num_parts_dumped' in metric - ]) == 0, f"No node was supposed to dump parts. {metrics}" - assert sum([ - val for metric, val in metrics.items() - if 'state_sync_dump_check_num_header_dumped' in metric - ]) == 0, f"No node was supposed to dump headers. {metrics}" # wait for 10 more blocks. 
list(islice(poll_blocks(boot_node), 10)) From 286693b6bb6da0ed88a41cf946fc5de6911348e5 Mon Sep 17 00:00:00 2001 From: Marcelo Diop-Gonzalez Date: Fri, 25 Oct 2024 21:09:14 -0400 Subject: [PATCH 70/70] fix test_sync_hash_validity again --- integration-tests/src/tests/client/process_blocks.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/integration-tests/src/tests/client/process_blocks.rs b/integration-tests/src/tests/client/process_blocks.rs index fc2474ff64d..c27f3066025 100644 --- a/integration-tests/src/tests/client/process_blocks.rs +++ b/integration-tests/src/tests/client/process_blocks.rs @@ -2187,7 +2187,8 @@ fn test_sync_hash_validity() { env.produce_block(0, i); } for i in 1..19 { - let block_hash = *env.clients[0].chain.get_block_header_by_height(i).unwrap().hash(); + let header = env.clients[0].chain.get_block_header_by_height(i).unwrap(); + let block_hash = *header.hash(); let valid = env.clients[0].chain.check_sync_hash_validity(&block_hash).unwrap(); println!("height {} -> {}", i, valid); if ProtocolFeature::StateSyncHashUpdate.enabled(PROTOCOL_VERSION) { @@ -2195,7 +2196,7 @@ fn test_sync_hash_validity() { // with TestEnv::produce_block() assert_eq!(valid, (i % epoch_length) == 3); } else { - assert_eq!(valid, (i % epoch_length) == 1); + assert_eq!(valid, header.epoch_id() != &EpochId::default() && (i % epoch_length) == 1); } } let bad_hash = CryptoHash::from_str("7tkzFg8RHBmMw1ncRJZCCZAizgq4rwCftTKYLce8RU8t").unwrap();