From f565b947159fa62f6c631ac1792a3b67da578164 Mon Sep 17 00:00:00 2001 From: Rahul Karajgikar Date: Mon, 22 Jul 2024 10:09:51 +0530 Subject: [PATCH] Fix for hasInitiatedFetching() in batch mode Signed-off-by: Rahul Karajgikar --- .../gateway/RecoveryFromGatewayIT.java | 196 ++++++++++++++++++ .../gateway/AsyncShardBatchFetch.java | 8 + .../gateway/ShardsBatchGatewayAllocator.java | 27 ++- 3 files changed, 230 insertions(+), 1 deletion(-) diff --git a/server/src/internalClusterTest/java/org/opensearch/gateway/RecoveryFromGatewayIT.java b/server/src/internalClusterTest/java/org/opensearch/gateway/RecoveryFromGatewayIT.java index 6296608c64d37..2c5a8dd69b790 100644 --- a/server/src/internalClusterTest/java/org/opensearch/gateway/RecoveryFromGatewayIT.java +++ b/server/src/internalClusterTest/java/org/opensearch/gateway/RecoveryFromGatewayIT.java @@ -925,6 +925,202 @@ public void testMultipleReplicaShardAssignmentWithDelayedAllocationAndDifferentN ensureGreen("test"); } + public void testAllocationExplainReturnsNoWhenExtraReplicaShardInNonBatchMode() throws Exception { + // Non batch mode - This test is to validate that we don't return AWAITING_INFO in allocation explain API when the deciders are + // returning NO + internalCluster().startClusterManagerOnlyNodes( + 1, + Settings.builder().put(ExistingShardsAllocator.EXISTING_SHARDS_ALLOCATOR_BATCH_MODE.getKey(), false).build() + ); + internalCluster().startDataOnlyNodes(5); + createIndex( + "test", + Settings.builder().put(IndexMetadata.SETTING_NUMBER_OF_SHARDS, 1).put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, 4).build() + ); + ensureGreen("test"); + ensureStableCluster(6); + + // Stop one of the nodes to make the cluster yellow + // We cannot directly create an index with replica = data node count because then the whole flow will get skipped due to + // INDEX_CREATED + List nodesWithReplicaShards = findNodesWithShard(false); + Settings replicaNodeDataPathSettings = internalCluster().dataPathSettings(nodesWithReplicaShards.get(0)); + internalCluster().stopRandomNode(InternalTestCluster.nameFilter(nodesWithReplicaShards.get(0))); + + ensureStableCluster(5); + ensureYellow("test"); + + logger.info("--> calling allocation explain API"); + // shard should have decision NO because there is no valid node for the extra replica to go to + assertEquals( + AllocationDecision.NO, + client().admin() + .cluster() + .prepareAllocationExplain() + .setIndex("test") + .setShard(0) + .setPrimary(false) + .get() + .getExplanation() + .getShardAllocationDecision() + .getAllocateDecision() + .getAllocationDecision() + ); + + // Now creating a new index with too many replicas and trying again + createIndex( + "test2", + Settings.builder().put(IndexMetadata.SETTING_NUMBER_OF_SHARDS, 1).put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, 5).build() + ); + + ensureYellowAndNoInitializingShards("test2"); + + logger.info("--> calling allocation explain API again"); + // shard should have decision NO because there are 6 replicas and 4 data nodes + assertEquals( + AllocationDecision.NO, + client().admin() + .cluster() + .prepareAllocationExplain() + .setIndex("test2") + .setShard(0) + .setPrimary(false) + .get() + .getExplanation() + .getShardAllocationDecision() + .getAllocateDecision() + .getAllocationDecision() + ); + + logger.info("--> restarting the stopped node"); + internalCluster().startDataOnlyNode( + Settings.builder().put("node.name", nodesWithReplicaShards.get(0)).put(replicaNodeDataPathSettings).build() + ); + + ensureStableCluster(6); + ensureGreen("test"); + + logger.info("--> calling allocation explain API 3rd time"); + // shard should still have decision NO because there are 6 replicas and 5 data nodes + assertEquals( + AllocationDecision.NO, + client().admin() + .cluster() + .prepareAllocationExplain() + .setIndex("test2") + .setShard(0) + .setPrimary(false) + .get() + .getExplanation() + .getShardAllocationDecision() + .getAllocateDecision() + .getAllocationDecision() + ); + + internalCluster().startDataOnlyNodes(1); + + ensureStableCluster(7); + ensureGreen("test2"); + } + + public void testAllocationExplainReturnsNoWhenExtraReplicaShardInBatchMode() throws Exception { + // Batch mode - This test is to validate that we don't return AWAITING_INFO in allocation explain API when the deciders are + // returning NO + internalCluster().startClusterManagerOnlyNodes( + 1, + Settings.builder().put(ExistingShardsAllocator.EXISTING_SHARDS_ALLOCATOR_BATCH_MODE.getKey(), true).build() + ); + internalCluster().startDataOnlyNodes(5); + createIndex( + "test", + Settings.builder().put(IndexMetadata.SETTING_NUMBER_OF_SHARDS, 1).put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, 4).build() + ); + ensureGreen("test"); + ensureStableCluster(6); + + // Stop one of the nodes to make the cluster yellow + // We cannot directly create an index with replica = data node count because then the whole flow will get skipped due to + // INDEX_CREATED + List nodesWithReplicaShards = findNodesWithShard(false); + Settings replicaNodeDataPathSettings = internalCluster().dataPathSettings(nodesWithReplicaShards.get(0)); + internalCluster().stopRandomNode(InternalTestCluster.nameFilter(nodesWithReplicaShards.get(0))); + + ensureStableCluster(5); + ensureYellow("test"); + + logger.info("--> calling allocation explain API"); + // shard should have decision NO because there is no valid node for the extra replica to go to + assertEquals( + AllocationDecision.NO, + client().admin() + .cluster() + .prepareAllocationExplain() + .setIndex("test") + .setShard(0) + .setPrimary(false) + .get() + .getExplanation() + .getShardAllocationDecision() + .getAllocateDecision() + .getAllocationDecision() + ); + + // Now creating a new index with too many replicas and trying again + createIndex( + "test2", + Settings.builder().put(IndexMetadata.SETTING_NUMBER_OF_SHARDS, 1).put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, 5).build() + ); + + ensureYellowAndNoInitializingShards("test2"); + + logger.info("--> calling allocation explain API again"); + // shard should have decision NO because there are 6 replicas and 4 data nodes + assertEquals( + AllocationDecision.NO, + client().admin() + .cluster() + .prepareAllocationExplain() + .setIndex("test2") + .setShard(0) + .setPrimary(false) + .get() + .getExplanation() + .getShardAllocationDecision() + .getAllocateDecision() + .getAllocationDecision() + ); + + logger.info("--> restarting the stopped node"); + internalCluster().startDataOnlyNode( + Settings.builder().put("node.name", nodesWithReplicaShards.get(0)).put(replicaNodeDataPathSettings).build() + ); + + ensureStableCluster(6); + ensureGreen("test"); + + logger.info("--> calling allocation explain API 3rd time"); + // shard should still have decision NO because there are 6 replicas and 5 data nodes + assertEquals( + AllocationDecision.NO, + client().admin() + .cluster() + .prepareAllocationExplain() + .setIndex("test2") + .setShard(0) + .setPrimary(false) + .get() + .getExplanation() + .getShardAllocationDecision() + .getAllocateDecision() + .getAllocationDecision() + ); + + internalCluster().startDataOnlyNodes(1); + + ensureStableCluster(7); + ensureGreen("test2"); + } + public void testNBatchesCreationAndAssignment() throws Exception { // we will reduce batch size to 5 to make sure we have enough batches to test assignment // Total number of primary shards = 50 (50 indices*1) diff --git a/server/src/main/java/org/opensearch/gateway/AsyncShardBatchFetch.java b/server/src/main/java/org/opensearch/gateway/AsyncShardBatchFetch.java index 4f39a39cea678..df642a9f5a743 100644 --- a/server/src/main/java/org/opensearch/gateway/AsyncShardBatchFetch.java +++ b/server/src/main/java/org/opensearch/gateway/AsyncShardBatchFetch.java @@ -80,6 +80,14 @@ public synchronized void clearShard(ShardId shardId) { this.cache.deleteShard(shardId); } + public boolean hasEmptyCache() { + return this.cache.getCache().isEmpty(); + } + + public AsyncShardFetchCache getCache() { + return this.cache; + } + /** * Cache implementation of transport actions returning batch of shards related data in the response. * Store node level responses of transport actions like {@link TransportNodesListGatewayStartedShardsBatch} or diff --git a/server/src/main/java/org/opensearch/gateway/ShardsBatchGatewayAllocator.java b/server/src/main/java/org/opensearch/gateway/ShardsBatchGatewayAllocator.java index 3c0797cd450d2..6b00b10b5146b 100644 --- a/server/src/main/java/org/opensearch/gateway/ShardsBatchGatewayAllocator.java +++ b/server/src/main/java/org/opensearch/gateway/ShardsBatchGatewayAllocator.java @@ -516,8 +516,33 @@ protected AsyncShardFetch.FetchResult