From 959db0cf1305d618ee7f788796333721178bdad7 Mon Sep 17 00:00:00 2001 From: Marcelo Diop-Gonzalez Date: Wed, 25 Sep 2024 23:17:15 -0400 Subject: [PATCH 1/4] fix(tests): fix state sync test errors These tests create node home dirs in tempfiles, but these are dropped too soon, and the directories are removed while the nodes are still running. This is visible in the logs as snapshot creation failure messages. After https://github.com/near/nearcore/pull/12147, these no longer cause test failures, but the underlying failures to create snapshots are still there. Fix it by keeping them around in an Arc in the enclosing scopes Note that every now and then these tests still fail with many messages showing `Received an invalid block during state sync` followed by a test timeout, but it seems to be for an unrelated reason. --- .../src/tests/client/sync_state_nodes.rs | 76 ++++++++++++------- 1 file changed, 48 insertions(+), 28 deletions(-) diff --git a/integration-tests/src/tests/client/sync_state_nodes.rs b/integration-tests/src/tests/client/sync_state_nodes.rs index d7473ce51fe..f1f79022a04 100644 --- a/integration-tests/src/tests/client/sync_state_nodes.rs +++ b/integration-tests/src/tests/client/sync_state_nodes.rs @@ -45,8 +45,13 @@ fn sync_state_nodes() { let mut near1 = load_test_config("test1", port1, genesis.clone()); near1.network_config.peer_store.boot_nodes = convert_boot_nodes(vec![]); near1.client_config.min_num_peers = 0; + + let dir1 = Arc::new(tempfile::Builder::new().prefix("sync_nodes_1").tempdir().unwrap()); + let dir1 = dir1.clone(); + let dir2 = Arc::new(tempfile::Builder::new().prefix("sync_nodes_2").tempdir().unwrap()); + let dir2 = dir2.clone(); + run_actix(async move { - let dir1 = tempfile::Builder::new().prefix("sync_nodes_1").tempdir().unwrap(); let nearcore::NearNode { view_client: view_client1, .. } = start_with_config(dir1.path(), near1).expect("start_with_config"); @@ -60,6 +65,7 @@ fn sync_state_nodes() { let view_client2_holder2 = view_client2_holder.clone(); let arbiters_holder2 = arbiters_holder2.clone(); let genesis2 = genesis.clone(); + let dir2 = dir2.clone(); let actor = view_client1.send(GetBlock::latest().with_span_context()); let actor = actor.then(move |res| { @@ -77,10 +83,6 @@ fn sync_state_nodes() { near2.network_config.peer_store.boot_nodes = convert_boot_nodes(vec![("test1", *port1)]); - let dir2 = tempfile::Builder::new() - .prefix("sync_nodes_2") - .tempdir() - .unwrap(); let nearcore::NearNode { view_client: view_client2, arbiters, @@ -147,6 +149,15 @@ fn sync_state_nodes_multishard() { ); genesis.config.epoch_length = 150; // so that by the time test2 joins it is not kicked out yet + let dir1 = Arc::new(tempfile::Builder::new().prefix("sync_nodes_1").tempdir().unwrap()); + let dir1 = dir1.clone(); + let dir2 = Arc::new(tempfile::Builder::new().prefix("sync_nodes_2").tempdir().unwrap()); + let dir2 = dir2.clone(); + let dir3 = Arc::new(tempfile::Builder::new().prefix("sync_nodes_3").tempdir().unwrap()); + let dir3 = dir3.clone(); + let dir4 = Arc::new(tempfile::Builder::new().prefix("sync_nodes_4").tempdir().unwrap()); + let dir4 = dir4.clone(); + run_actix(async move { let (port1, port2, port3, port4) = ( tcp::ListenerAddr::reserve_for_test(), @@ -180,14 +191,10 @@ fn sync_state_nodes_multishard() { near4.client_config.max_block_production_delay = near1.client_config.max_block_production_delay; - let dir1 = tempfile::Builder::new().prefix("sync_nodes_1").tempdir().unwrap(); let nearcore::NearNode { view_client: view_client1, .. } = start_with_config(dir1.path(), near1).expect("start_with_config"); - let dir3 = tempfile::Builder::new().prefix("sync_nodes_3").tempdir().unwrap(); start_with_config(dir3.path(), near3).expect("start_with_config"); - - let dir4 = tempfile::Builder::new().prefix("sync_nodes_4").tempdir().unwrap(); start_with_config(dir4.path(), near4).expect("start_with_config"); let view_client2_holder = Arc::new(RwLock::new(None)); @@ -200,6 +207,7 @@ fn sync_state_nodes_multishard() { let view_client2_holder2 = view_client2_holder.clone(); let arbiter_holder2 = arbiter_holder2.clone(); let genesis2 = genesis.clone(); + let dir2 = dir2.clone(); let actor = view_client1.send(GetBlock::latest().with_span_context()); let actor = actor.then(move |res| { @@ -224,10 +232,6 @@ fn sync_state_nodes_multishard() { ("test4", *port4), ]); - let dir2 = tempfile::Builder::new() - .prefix("sync_nodes_2") - .tempdir() - .unwrap(); let nearcore::NearNode { view_client: view_client2, arbiters, @@ -297,9 +301,15 @@ fn sync_empty_state() { ); genesis.config.epoch_length = 20; + let dir1 = Arc::new(tempfile::Builder::new().prefix("sync_nodes_1").tempdir().unwrap()); + let dir1 = dir1.clone(); + let dir2 = Arc::new(tempfile::Builder::new().prefix("sync_nodes_2").tempdir().unwrap()); + let dir2 = dir2.clone(); + run_actix(async move { let (port1, port2) = (tcp::ListenerAddr::reserve_for_test(), tcp::ListenerAddr::reserve_for_test()); + // State sync triggers when header head is two epochs in the future. // Produce more blocks to make sure that state sync gets triggered when the second node starts. let state_sync_horizon = 10; @@ -311,10 +321,8 @@ fn sync_empty_state() { near1.client_config.min_block_production_delay = Duration::milliseconds(200); near1.client_config.max_block_production_delay = Duration::milliseconds(400); - let dir1 = tempfile::Builder::new().prefix("sync_nodes_1").tempdir().unwrap(); let nearcore::NearNode { view_client: view_client1, .. } = start_with_config(dir1.path(), near1).expect("start_with_config"); - let dir2 = Arc::new(tempfile::Builder::new().prefix("sync_nodes_2").tempdir().unwrap()); let view_client2_holder = Arc::new(RwLock::new(None)); let arbiters_holder = Arc::new(RwLock::new(vec![])); @@ -425,6 +433,13 @@ fn sync_state_dump() { // start, sync headers and find a dump of state. genesis.config.epoch_length = 30; + let dump_dir = Arc::new(tempfile::Builder::new().prefix("state_dump_1").tempdir().unwrap()); + let dump_dir = dump_dir.clone(); + let dir1 = Arc::new(tempfile::Builder::new().prefix("sync_nodes_1").tempdir().unwrap()); + let dir1 = dir1.clone(); + let dir2 = Arc::new(tempfile::Builder::new().prefix("sync_nodes_2").tempdir().unwrap()); + let dir2 = dir2.clone(); + run_actix(async move { let (port1, port2) = (tcp::ListenerAddr::reserve_for_test(), tcp::ListenerAddr::reserve_for_test()); @@ -439,7 +454,7 @@ fn sync_state_dump() { near1.client_config.min_block_production_delay = Duration::milliseconds(300); near1.client_config.max_block_production_delay = Duration::milliseconds(600); near1.client_config.tracked_shards = vec![0]; // Track all shards. - let dump_dir = tempfile::Builder::new().prefix("state_dump_1").tempdir().unwrap(); + near1.client_config.state_sync.dump = Some(DumpConfig { location: Filesystem { root_dir: dump_dir.path().to_path_buf() }, restart_dump_for_shards: None, @@ -448,14 +463,12 @@ fn sync_state_dump() { }); near1.config.store.state_snapshot_enabled = true; - let dir1 = tempfile::Builder::new().prefix("sync_nodes_1").tempdir().unwrap(); let nearcore::NearNode { view_client: view_client1, // State sync dumper should be kept in the scope to avoid dropping it, which stops the state dumper loop. state_sync_dumper: _dumper, .. } = start_with_config(dir1.path(), near1).expect("start_with_config"); - let dir2 = tempfile::Builder::new().prefix("sync_nodes_2").tempdir().unwrap(); let view_client2_holder = Arc::new(RwLock::new(None)); let arbiters_holder = Arc::new(RwLock::new(vec![])); @@ -734,6 +747,9 @@ fn test_dump_epoch_missing_chunk_in_last_block() { #[test] // Tests StateRequestHeader and StateRequestPart. fn test_state_sync_headers() { + let dir1 = + Arc::new(tempfile::Builder::new().prefix("test_state_sync_headers").tempdir().unwrap()); + heavy_test(|| { init_test_logger(); @@ -746,8 +762,6 @@ fn test_state_sync_headers() { load_test_config("test1", tcp::ListenerAddr::reserve_for_test(), genesis.clone()); near1.client_config.min_num_peers = 0; near1.client_config.tracked_shards = vec![0]; // Track all shards. - let dir1 = - tempfile::Builder::new().prefix("test_state_sync_headers").tempdir().unwrap(); near1.config.store.state_snapshot_enabled = true; let nearcore::NearNode { view_client: view_client1, .. } = @@ -929,6 +943,20 @@ fn test_state_sync_headers_no_tracked_shards() { heavy_test(|| { init_test_logger(); + let dir1 = Arc::new( + tempfile::Builder::new() + .prefix("test_state_sync_headers_no_tracked_shards_1") + .tempdir() + .unwrap(), + ); + let dir1 = dir1.clone(); + let dir2 = Arc::new( + tempfile::Builder::new() + .prefix("test_state_sync_headers_no_tracked_shards_2") + .tempdir() + .unwrap(), + ); + let dir2 = dir2.clone(); run_actix(async { let mut genesis = Genesis::test(vec!["test1".parse().unwrap()], 1); // Increase epoch_length if the test is flaky. @@ -939,10 +967,6 @@ fn test_state_sync_headers_no_tracked_shards() { let mut near1 = load_test_config("test1", port1, genesis.clone()); near1.client_config.min_num_peers = 0; near1.client_config.tracked_shards = vec![0]; // Track all shards, it is a validator. - let dir1 = tempfile::Builder::new() - .prefix("test_state_sync_headers_no_tracked_shards_1") - .tempdir() - .unwrap(); near1.config.store.state_snapshot_enabled = false; near1.config.state_sync_enabled = false; near1.client_config.state_sync_enabled = false; @@ -955,10 +979,6 @@ fn test_state_sync_headers_no_tracked_shards() { convert_boot_nodes(vec![("test1", *port1)]); near2.client_config.min_num_peers = 0; near2.client_config.tracked_shards = vec![]; // Track no shards. - let dir2 = tempfile::Builder::new() - .prefix("test_state_sync_headers_no_tracked_shards_2") - .tempdir() - .unwrap(); near2.config.store.state_snapshot_enabled = true; near2.config.state_sync_enabled = false; near2.client_config.state_sync_enabled = false; From 9adb4e956133c5a0f455948e5d8e4f8e0abc9556 Mon Sep 17 00:00:00 2001 From: Marcelo Diop-Gonzalez Date: Thu, 26 Sep 2024 11:00:01 -0400 Subject: [PATCH 2/4] one more --- integration-tests/src/tests/client/sync_state_nodes.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/integration-tests/src/tests/client/sync_state_nodes.rs b/integration-tests/src/tests/client/sync_state_nodes.rs index f1f79022a04..d74b71a511a 100644 --- a/integration-tests/src/tests/client/sync_state_nodes.rs +++ b/integration-tests/src/tests/client/sync_state_nodes.rs @@ -749,6 +749,7 @@ fn test_dump_epoch_missing_chunk_in_last_block() { fn test_state_sync_headers() { let dir1 = Arc::new(tempfile::Builder::new().prefix("test_state_sync_headers").tempdir().unwrap()); + let dir1 = dir1.clone(); heavy_test(|| { init_test_logger(); From 822fd239e9e88ad9d54c8932606fffeabc3544d7 Mon Sep 17 00:00:00 2001 From: Marcelo Diop-Gonzalez Date: Thu, 26 Sep 2024 19:50:03 -0400 Subject: [PATCH 3/4] fix clippy --- .../src/tests/client/sync_state_nodes.rs | 75 +++++++++++-------- 1 file changed, 45 insertions(+), 30 deletions(-) diff --git a/integration-tests/src/tests/client/sync_state_nodes.rs b/integration-tests/src/tests/client/sync_state_nodes.rs index d74b71a511a..a5706f8a11b 100644 --- a/integration-tests/src/tests/client/sync_state_nodes.rs +++ b/integration-tests/src/tests/client/sync_state_nodes.rs @@ -46,10 +46,10 @@ fn sync_state_nodes() { near1.network_config.peer_store.boot_nodes = convert_boot_nodes(vec![]); near1.client_config.min_num_peers = 0; - let dir1 = Arc::new(tempfile::Builder::new().prefix("sync_nodes_1").tempdir().unwrap()); - let dir1 = dir1.clone(); - let dir2 = Arc::new(tempfile::Builder::new().prefix("sync_nodes_2").tempdir().unwrap()); - let dir2 = dir2.clone(); + let _dir1 = Arc::new(tempfile::Builder::new().prefix("sync_nodes_1").tempdir().unwrap()); + let dir1 = _dir1.clone(); + let _dir2 = Arc::new(tempfile::Builder::new().prefix("sync_nodes_2").tempdir().unwrap()); + let dir2 = _dir2.clone(); run_actix(async move { let nearcore::NearNode { view_client: view_client1, .. } = @@ -126,6 +126,8 @@ fn sync_state_nodes() { ) .start(); }); + drop(_dir1); + drop(_dir2); }); } @@ -149,14 +151,14 @@ fn sync_state_nodes_multishard() { ); genesis.config.epoch_length = 150; // so that by the time test2 joins it is not kicked out yet - let dir1 = Arc::new(tempfile::Builder::new().prefix("sync_nodes_1").tempdir().unwrap()); - let dir1 = dir1.clone(); - let dir2 = Arc::new(tempfile::Builder::new().prefix("sync_nodes_2").tempdir().unwrap()); - let dir2 = dir2.clone(); - let dir3 = Arc::new(tempfile::Builder::new().prefix("sync_nodes_3").tempdir().unwrap()); - let dir3 = dir3.clone(); - let dir4 = Arc::new(tempfile::Builder::new().prefix("sync_nodes_4").tempdir().unwrap()); - let dir4 = dir4.clone(); + let _dir1 = Arc::new(tempfile::Builder::new().prefix("sync_nodes_1").tempdir().unwrap()); + let dir1 = _dir1.clone(); + let _dir2 = Arc::new(tempfile::Builder::new().prefix("sync_nodes_2").tempdir().unwrap()); + let dir2 = _dir2.clone(); + let _dir3 = Arc::new(tempfile::Builder::new().prefix("sync_nodes_3").tempdir().unwrap()); + let dir3 = _dir3.clone(); + let _dir4 = Arc::new(tempfile::Builder::new().prefix("sync_nodes_4").tempdir().unwrap()); + let dir4 = _dir4.clone(); run_actix(async move { let (port1, port2, port3, port4) = ( @@ -283,6 +285,10 @@ fn sync_state_nodes_multishard() { ) .start(); }); + drop(_dir1); + drop(_dir2); + drop(_dir3); + drop(_dir4); }); } @@ -301,10 +307,10 @@ fn sync_empty_state() { ); genesis.config.epoch_length = 20; - let dir1 = Arc::new(tempfile::Builder::new().prefix("sync_nodes_1").tempdir().unwrap()); - let dir1 = dir1.clone(); - let dir2 = Arc::new(tempfile::Builder::new().prefix("sync_nodes_2").tempdir().unwrap()); - let dir2 = dir2.clone(); + let _dir1 = Arc::new(tempfile::Builder::new().prefix("sync_nodes_1").tempdir().unwrap()); + let dir1 = _dir1.clone(); + let _dir2 = Arc::new(tempfile::Builder::new().prefix("sync_nodes_2").tempdir().unwrap()); + let dir2 = _dir2.clone(); run_actix(async move { let (port1, port2) = @@ -410,6 +416,8 @@ fn sync_empty_state() { ) .start(); }); + drop(_dir1); + drop(_dir2); }); } @@ -433,12 +441,13 @@ fn sync_state_dump() { // start, sync headers and find a dump of state. genesis.config.epoch_length = 30; - let dump_dir = Arc::new(tempfile::Builder::new().prefix("state_dump_1").tempdir().unwrap()); - let dump_dir = dump_dir.clone(); - let dir1 = Arc::new(tempfile::Builder::new().prefix("sync_nodes_1").tempdir().unwrap()); - let dir1 = dir1.clone(); - let dir2 = Arc::new(tempfile::Builder::new().prefix("sync_nodes_2").tempdir().unwrap()); - let dir2 = dir2.clone(); + let _dump_dir = + Arc::new(tempfile::Builder::new().prefix("state_dump_1").tempdir().unwrap()); + let dump_dir = _dump_dir.clone(); + let _dir1 = Arc::new(tempfile::Builder::new().prefix("sync_nodes_1").tempdir().unwrap()); + let dir1 = _dir1.clone(); + let _dir2 = Arc::new(tempfile::Builder::new().prefix("sync_nodes_2").tempdir().unwrap()); + let dir2 = _dir2.clone(); run_actix(async move { let (port1, port2) = @@ -554,6 +563,9 @@ fn sync_state_dump() { .unwrap(); System::current().stop(); }); + drop(_dump_dir); + drop(_dir1); + drop(_dir2); }); } @@ -747,13 +759,13 @@ fn test_dump_epoch_missing_chunk_in_last_block() { #[test] // Tests StateRequestHeader and StateRequestPart. fn test_state_sync_headers() { - let dir1 = - Arc::new(tempfile::Builder::new().prefix("test_state_sync_headers").tempdir().unwrap()); - let dir1 = dir1.clone(); - heavy_test(|| { init_test_logger(); + let _dir1 = + Arc::new(tempfile::Builder::new().prefix("test_state_sync_headers").tempdir().unwrap()); + let dir1 = _dir1.clone(); + run_actix(async { let mut genesis = Genesis::test(vec!["test1".parse().unwrap()], 1); // Increase epoch_length if the test is flaky. @@ -935,6 +947,7 @@ fn test_state_sync_headers() { .unwrap(); System::current().stop(); }); + drop(_dir1); }); } @@ -944,20 +957,20 @@ fn test_state_sync_headers_no_tracked_shards() { heavy_test(|| { init_test_logger(); - let dir1 = Arc::new( + let _dir1 = Arc::new( tempfile::Builder::new() .prefix("test_state_sync_headers_no_tracked_shards_1") .tempdir() .unwrap(), ); - let dir1 = dir1.clone(); - let dir2 = Arc::new( + let dir1 = _dir1.clone(); + let _dir2 = Arc::new( tempfile::Builder::new() .prefix("test_state_sync_headers_no_tracked_shards_2") .tempdir() .unwrap(), ); - let dir2 = dir2.clone(); + let dir2 = _dir2.clone(); run_actix(async { let mut genesis = Genesis::test(vec!["test1".parse().unwrap()], 1); // Increase epoch_length if the test is flaky. @@ -1099,5 +1112,7 @@ fn test_state_sync_headers_no_tracked_shards() { .unwrap(); System::current().stop(); }); + drop(_dir1); + drop(_dir2); }); } From 09884ea0278a3bbab8689afc609842970156beda Mon Sep 17 00:00:00 2001 From: Marcelo Diop-Gonzalez Date: Fri, 4 Oct 2024 00:13:44 -0400 Subject: [PATCH 4/4] add comment --- integration-tests/src/tests/client/sync_state_nodes.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/integration-tests/src/tests/client/sync_state_nodes.rs b/integration-tests/src/tests/client/sync_state_nodes.rs index a5706f8a11b..07f5d3b4c09 100644 --- a/integration-tests/src/tests/client/sync_state_nodes.rs +++ b/integration-tests/src/tests/client/sync_state_nodes.rs @@ -46,6 +46,9 @@ fn sync_state_nodes() { near1.network_config.peer_store.boot_nodes = convert_boot_nodes(vec![]); near1.client_config.min_num_peers = 0; + // In this test and the ones below, we have an Arc, that we make sure to keep alive by cloning it + // and keeping the original one around after we pass the clone to run_actix(). Otherwise it will be dropped early + // and the directories will actually be removed while the nodes are running. let _dir1 = Arc::new(tempfile::Builder::new().prefix("sync_nodes_1").tempdir().unwrap()); let dir1 = _dir1.clone(); let _dir2 = Arc::new(tempfile::Builder::new().prefix("sync_nodes_2").tempdir().unwrap());