From 079c4d7c05e6015e744df9889d604c859e4ce0fe Mon Sep 17 00:00:00 2001 From: Ashish Singh Date: Fri, 6 Oct 2023 11:50:24 +0530 Subject: [PATCH 1/5] Disable async trim translog task for remote indexes Signed-off-by: Ashish Singh --- .../src/main/java/org/opensearch/index/shard/IndexShard.java | 3 +++ 1 file changed, 3 insertions(+) diff --git a/server/src/main/java/org/opensearch/index/shard/IndexShard.java b/server/src/main/java/org/opensearch/index/shard/IndexShard.java index 833c91c1766c8..b8a3e04701956 100644 --- a/server/src/main/java/org/opensearch/index/shard/IndexShard.java +++ b/server/src/main/java/org/opensearch/index/shard/IndexShard.java @@ -1465,6 +1465,9 @@ public void flush(FlushRequest request) { * {@link org.opensearch.index.translog.TranslogDeletionPolicy} for details */ public void trimTranslog() { + if (isRemoteTranslogEnabled()) { + return; + } verifyNotClosed(); final Engine engine = getEngine(); engine.translogManager().trimUnreferencedTranslogFiles(); From 2b17e0663e1fbed9bc0c7ca2a2f50a499ee626e6 Mon Sep 17 00:00:00 2001 From: Ashish Singh Date: Fri, 6 Oct 2023 17:22:37 +0530 Subject: [PATCH 2/5] Skip load global checkpoint to replication tracker for remote indexes Signed-off-by: Ashish Singh --- server/src/main/java/org/opensearch/index/shard/IndexShard.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/src/main/java/org/opensearch/index/shard/IndexShard.java b/server/src/main/java/org/opensearch/index/shard/IndexShard.java index b8a3e04701956..251f9a5ae01c0 100644 --- a/server/src/main/java/org/opensearch/index/shard/IndexShard.java +++ b/server/src/main/java/org/opensearch/index/shard/IndexShard.java @@ -2317,7 +2317,7 @@ public void openEngineAndRecoverFromTranslog() throws IOException { }; // Do not load the global checkpoint if this is a remote snapshot index - if (indexSettings.isRemoteSnapshot() == false) { + if (indexSettings.isRemoteSnapshot() == false && indexSettings.isRemoteTranslogStoreEnabled() == 
false) { loadGlobalCheckpointToReplicationTracker(); } From 4657a37975b0ceebcb895daf3445b6d20dc2df14 Mon Sep 17 00:00:00 2001 From: Ashish Singh Date: Sat, 7 Oct 2023 14:52:26 +0530 Subject: [PATCH 3/5] Add Integ Tests Signed-off-by: Ashish Singh --- ...emoteStoreMockRepositoryIntegTestCase.java | 2 +- ...moteStoreBackpressureAndResiliencyIT.java} | 74 ++++++++++++++++++- .../org/opensearch/index/IndexService.java | 2 +- 3 files changed, 75 insertions(+), 3 deletions(-) rename server/src/internalClusterTest/java/org/opensearch/remotestore/{RemoteStoreBackpressureIT.java => RemoteStoreBackpressureAndResiliencyIT.java} (67%) diff --git a/server/src/internalClusterTest/java/org/opensearch/remotestore/AbstractRemoteStoreMockRepositoryIntegTestCase.java b/server/src/internalClusterTest/java/org/opensearch/remotestore/AbstractRemoteStoreMockRepositoryIntegTestCase.java index 9d4d8aa24bd51..2053800504c89 100644 --- a/server/src/internalClusterTest/java/org/opensearch/remotestore/AbstractRemoteStoreMockRepositoryIntegTestCase.java +++ b/server/src/internalClusterTest/java/org/opensearch/remotestore/AbstractRemoteStoreMockRepositoryIntegTestCase.java @@ -160,7 +160,7 @@ private String getLocalSegmentFilename(String remoteFilename) { return remoteFilename.split(RemoteSegmentStoreDirectory.SEGMENT_NAME_UUID_SEPARATOR)[0]; } - private IndexResponse indexSingleDoc() { + protected IndexResponse indexSingleDoc() { return client().prepareIndex(INDEX_NAME) .setId(UUIDs.randomBase64UUID()) .setSource(randomAlphaOfLength(5), randomAlphaOfLength(5)) diff --git a/server/src/internalClusterTest/java/org/opensearch/remotestore/RemoteStoreBackpressureIT.java b/server/src/internalClusterTest/java/org/opensearch/remotestore/RemoteStoreBackpressureAndResiliencyIT.java similarity index 67% rename from server/src/internalClusterTest/java/org/opensearch/remotestore/RemoteStoreBackpressureIT.java rename to 
server/src/internalClusterTest/java/org/opensearch/remotestore/RemoteStoreBackpressureAndResiliencyIT.java index 3462054c23630..2c6db6ae19a9a 100644 --- a/server/src/internalClusterTest/java/org/opensearch/remotestore/RemoteStoreBackpressureIT.java +++ b/server/src/internalClusterTest/java/org/opensearch/remotestore/RemoteStoreBackpressureAndResiliencyIT.java @@ -12,12 +12,18 @@ import org.opensearch.action.admin.cluster.remotestore.stats.RemoteStoreStatsResponse; import org.opensearch.action.admin.cluster.settings.ClusterUpdateSettingsResponse; import org.opensearch.common.settings.Settings; +import org.opensearch.common.unit.TimeValue; +import org.opensearch.common.util.concurrent.AbstractAsyncTask; +import org.opensearch.common.util.concurrent.UncategorizedExecutionException; import org.opensearch.core.common.bytes.BytesArray; import org.opensearch.core.common.bytes.BytesReference; import org.opensearch.core.common.unit.ByteSizeUnit; import org.opensearch.core.concurrency.OpenSearchRejectedExecutionException; import org.opensearch.core.xcontent.MediaTypeRegistry; +import org.opensearch.index.IndexService; import org.opensearch.index.remote.RemoteSegmentTransferTracker; +import org.opensearch.index.shard.IndexShard; +import org.opensearch.indices.IndicesService; import org.opensearch.repositories.RepositoriesService; import org.opensearch.snapshots.mockstore.MockRepository; import org.opensearch.test.OpenSearchIntegTestCase; @@ -33,7 +39,7 @@ import static org.opensearch.index.remote.RemoteStorePressureSettings.REMOTE_REFRESH_SEGMENT_PRESSURE_ENABLED; @OpenSearchIntegTestCase.ClusterScope(scope = OpenSearchIntegTestCase.Scope.TEST, numDataNodes = 0) -public class RemoteStoreBackpressureIT extends AbstractRemoteStoreMockRepositoryIntegTestCase { +public class RemoteStoreBackpressureAndResiliencyIT extends AbstractRemoteStoreMockRepositoryIntegTestCase { public void testWritesRejectedDueToConsecutiveFailureBreach() throws Exception { // Here the doc size of the 
request remains same throughout the test. After initial indexing, all remote store interactions // fail leading to consecutive failure limit getting exceeded and leading to rejections. @@ -156,4 +162,70 @@ private String generateString(int sizeInBytes) { sb.append("}"); return sb.toString(); } + + /** + * Fixes Github#10398 + */ + public void testAsyncTrimTaskSucceeds() { + Path location = randomRepoPath().toAbsolutePath(); + String dataNodeName = setup(location, 0d, "metadata", Long.MAX_VALUE); + + logger.info("Increasing the frequency of async trim task to ensure it runs in background while indexing"); + IndexService indexService = internalCluster().getInstance(IndicesService.class, dataNodeName).iterator().next(); + ((AbstractAsyncTask) indexService.getTrimTranslogTask()).setInterval(TimeValue.timeValueMillis(100)); + + logger.info("--> Indexing data"); + indexData(randomIntBetween(2, 5), true); + logger.info("--> Indexing succeeded"); + + MockRepository translogRepo = (MockRepository) internalCluster().getInstance(RepositoriesService.class, dataNodeName) + .repository(TRANSLOG_REPOSITORY_NAME); + logger.info("--> Failing all remote store interaction"); + translogRepo.setRandomControlIOExceptionRate(1d); + + for (int i = 0; i < randomIntBetween(5, 10); i++) { + UncategorizedExecutionException exception = assertThrows(UncategorizedExecutionException.class, this::indexSingleDoc); + assertEquals("Failed execution", exception.getMessage()); + } + + translogRepo.setRandomControlIOExceptionRate(0d); + indexSingleDoc(); + logger.info("Indexed single doc successfully"); + } + + /** + * Fixes Github#10400 + */ + public void testSkipLoadGlobalCheckpointToReplicationTracker() { + Path location = randomRepoPath().toAbsolutePath(); + String dataNodeName = setup(location, 0d, "metadata", Long.MAX_VALUE); + + logger.info("--> Indexing data"); + indexData(randomIntBetween(1, 2), true); + logger.info("--> Indexing succeeded"); + + IndexService indexService = 
internalCluster().getInstance(IndicesService.class, dataNodeName).iterator().next(); + IndexShard indexShard = indexService.getShard(0); + indexShard.failShard("failing shard", null); + + ensureRed(INDEX_NAME); + + MockRepository translogRepo = (MockRepository) internalCluster().getInstance(RepositoriesService.class, dataNodeName) + .repository(TRANSLOG_REPOSITORY_NAME); + logger.info("--> Failing all remote store interaction"); + translogRepo.setRandomControlIOExceptionRate(1d); + client().admin().cluster().prepareReroute().setRetryFailed(true).get(); + // Cluster still stays red as the remote interactions are still failing + ensureRed(INDEX_NAME); + + logger.info("Retrying to allocate failed shards"); + client().admin().cluster().prepareReroute().setRetryFailed(true).get(); + // Cluster still stays red as the remote interactions are still failing + ensureRed(INDEX_NAME); + + logger.info("Stop failing all remote store interactions"); + translogRepo.setRandomControlIOExceptionRate(0d); + client().admin().cluster().prepareReroute().setRetryFailed(true).get(); + ensureGreen(INDEX_NAME); + } } diff --git a/server/src/main/java/org/opensearch/index/IndexService.java b/server/src/main/java/org/opensearch/index/IndexService.java index df8e8070b8e03..af23145be9f89 100644 --- a/server/src/main/java/org/opensearch/index/IndexService.java +++ b/server/src/main/java/org/opensearch/index/IndexService.java @@ -1286,7 +1286,7 @@ AsyncTranslogFSync getFsyncTask() { // for tests return fsyncTask; } - AsyncTrimTranslogTask getTrimTranslogTask() { // for tests + public AsyncTrimTranslogTask getTrimTranslogTask() { // for tests return trimTranslogTask; } From e0c59a635ee927c697547cdeffde50e87b068dd7 Mon Sep 17 00:00:00 2001 From: Ashish Singh Date: Sat, 7 Oct 2023 16:42:17 +0530 Subject: [PATCH 4/5] Empty-Commit Signed-off-by: Ashish Singh From 97e5b34ada6fd68d48cc2de9e188c0959331d2f0 Mon Sep 17 00:00:00 2001 From: Ashish Singh Date: Sat, 7 Oct 2023 17:27:28 +0530 Subject: [PATCH 
5/5] Empty-Commit Signed-off-by: Ashish Singh