From 9cd247ffafbc5c4385e67ae6655e9b3cad475d9c Mon Sep 17 00:00:00 2001 From: Andrew Ross Date: Sat, 15 Jun 2024 09:17:38 -0700 Subject: [PATCH] Fix AwarenessAttributeDecommissionIT.testConcurrentDecommissionAction (#14372) The problem is that this test would decommission one of six nodes. The tear down logic of the test would attempt to assert on the health of the cluster by randomly selecting a node and requesting the cluster health. If this random check happened to select the node that was decommissioned, then the test would fail. The fix is to recommission the node at the end of the test. Also, the "recommission node and assert cluster health" logic was used in multiple places and could be refactored out to a helper method. Resolves #14290 Resolves #12197 Signed-off-by: Andrew Ross --- .../AwarenessAttributeDecommissionIT.java | 74 +++++++------------ 1 file changed, 25 insertions(+), 49 deletions(-) diff --git a/server/src/internalClusterTest/java/org/opensearch/cluster/coordination/AwarenessAttributeDecommissionIT.java b/server/src/internalClusterTest/java/org/opensearch/cluster/coordination/AwarenessAttributeDecommissionIT.java index b33d57ed43189..beed6e6846b46 100644 --- a/server/src/internalClusterTest/java/org/opensearch/cluster/coordination/AwarenessAttributeDecommissionIT.java +++ b/server/src/internalClusterTest/java/org/opensearch/cluster/coordination/AwarenessAttributeDecommissionIT.java @@ -539,18 +539,7 @@ private void assertNodesRemovedAfterZoneDecommission(boolean originalClusterMana assertEquals(originalClusterManager, currentClusterManager); } - // Will wait for all events to complete - client(activeNode).admin().cluster().prepareHealth().setWaitForEvents(Priority.LANGUID).get(); - - // Recommissioning the zone back to gracefully succeed the test once above tests succeeds - DeleteDecommissionStateResponse deleteDecommissionStateResponse = client(currentClusterManager).execute( - DeleteDecommissionStateAction.INSTANCE, - new DeleteDecommissionStateRequest() - ).get(); - assertTrue(deleteDecommissionStateResponse.isAcknowledged()); - - // will wait for cluster to stabilise with a timeout of 2 min as by then all nodes should have joined the cluster - ensureStableCluster(15, TimeValue.timeValueMinutes(2)); + deleteDecommissionStateAndWaitForStableCluster(currentClusterManager, 15); } public void testDecommissionFailedWhenDifferentAttributeAlreadyDecommissioned() throws Exception { @@ -617,18 +606,7 @@ public void testDecommissionFailedWhenDifferentAttributeAlreadyDecommissioned() ) ); - // Will wait for all events to complete - client(node_in_c).admin().cluster().prepareHealth().setWaitForEvents(Priority.LANGUID).get(); - - // Recommissioning the zone back to gracefully succeed the test once above tests succeeds - DeleteDecommissionStateResponse deleteDecommissionStateResponse = client(node_in_c).execute( - DeleteDecommissionStateAction.INSTANCE, - new DeleteDecommissionStateRequest() - ).get(); - assertTrue(deleteDecommissionStateResponse.isAcknowledged()); - - // will wait for cluster to stabilise with a timeout of 2 min as by then all nodes should have joined the cluster - ensureStableCluster(6, TimeValue.timeValueMinutes(2)); + deleteDecommissionStateAndWaitForStableCluster(node_in_c, 6); } public void testDecommissionStatusUpdatePublishedToAllNodes() throws ExecutionException, InterruptedException { @@ -748,20 +726,7 @@ public void testDecommissionStatusUpdatePublishedToAllNodes() throws ExecutionEx ); logger.info("--> Verified the decommissioned node has in_progress state."); - // Will wait for all events to complete - client(activeNode).admin().cluster().prepareHealth().setWaitForEvents(Priority.LANGUID).get(); - logger.info("--> Got LANGUID event"); - // Recommissioning the zone back to gracefully succeed the test once above tests succeeds - DeleteDecommissionStateResponse deleteDecommissionStateResponse = client(activeNode).execute( - DeleteDecommissionStateAction.INSTANCE, - new DeleteDecommissionStateRequest() - ).get(); - assertTrue(deleteDecommissionStateResponse.isAcknowledged()); - logger.info("--> Deleting decommission done."); - - // will wait for cluster to stabilise with a timeout of 2 min (findPeerInterval for decommissioned nodes) - // as by then all nodes should have joined the cluster - ensureStableCluster(6, TimeValue.timeValueSeconds(121)); + deleteDecommissionStateAndWaitForStableCluster(activeNode, 6); } public void testDecommissionFailedWhenAttributeNotWeighedAway() throws Exception { @@ -983,15 +948,7 @@ public void testDecommissionAcknowledgedIfWeightsNotSetForNonRoutingNode() throw assertEquals(clusterState.nodes().getDataNodes().size(), 3); assertEquals(clusterState.nodes().getClusterManagerNodes().size(), 2); - // Recommissioning the zone back to gracefully succeed the test once above tests succeeds - DeleteDecommissionStateResponse deleteDecommissionStateResponse = client(dataNodes.get(0)).execute( - DeleteDecommissionStateAction.INSTANCE, - new DeleteDecommissionStateRequest() - ).get(); - assertTrue(deleteDecommissionStateResponse.isAcknowledged()); - - // will wait for cluster to stabilise with a timeout of 2 min as by then all nodes should have joined the cluster - ensureStableCluster(6, TimeValue.timeValueMinutes(2)); + deleteDecommissionStateAndWaitForStableCluster(dataNodes.get(0), 6); } public void testConcurrentDecommissionAction() throws Exception { @@ -1019,7 +976,7 @@ public void testConcurrentDecommissionAction() throws Exception { .build() ); logger.info("--> start 3 data nodes on zones 'a' & 'b' & 'c'"); - internalCluster().startNodes( + final String bZoneDataNode = internalCluster().startNodes( Settings.builder() .put(commonSettings) .put("node.attr.zone", "a") @@ -1035,7 +992,7 @@ public void testConcurrentDecommissionAction() throws Exception { .put("node.attr.zone", "c") .put(onlyRole(commonSettings, DiscoveryNodeRole.DATA_ROLE)) .build() - ); + ).get(1); ensureStableCluster(6); ClusterHealthResponse health = client().admin() @@ -1100,6 +1057,25 @@ public void testConcurrentDecommissionAction() throws Exception { assertEquals(concurrentRuns, numRequestAcknowledged.get() + numRequestUnAcknowledged.get() + numRequestFailed.get()); assertEquals(concurrentRuns - 1, numRequestFailed.get()); assertEquals(1, numRequestAcknowledged.get() + numRequestUnAcknowledged.get()); + + deleteDecommissionStateAndWaitForStableCluster(bZoneDataNode, 6); + } + + private void deleteDecommissionStateAndWaitForStableCluster(String activeNodeName, int expectedClusterSize) throws ExecutionException, + InterruptedException { + client(activeNodeName).admin().cluster().prepareHealth().setWaitForEvents(Priority.LANGUID).get(); + + // Recommissioning the zone back to gracefully succeed the test once above tests succeeds + DeleteDecommissionStateResponse deleteDecommissionStateResponse = client(activeNodeName).execute( + DeleteDecommissionStateAction.INSTANCE, + new DeleteDecommissionStateRequest() + ).get(); + assertTrue(deleteDecommissionStateResponse.isAcknowledged()); + logger.info("--> Deleting decommission done."); + + // will wait for cluster to stabilise with a timeout of 2 min (findPeerInterval for decommissioned nodes) + // as by then all nodes should have joined the cluster + ensureStableCluster(expectedClusterSize, TimeValue.timeValueSeconds(121)); } private static class WaitForFailedDecommissionState implements ClusterStateObserver.Listener {