From aea3ec7f6baeee582180e4527507a87932d5e23b Mon Sep 17 00:00:00 2001 From: Jorge Esteban Quilcate Otoya Date: Mon, 15 Jul 2024 16:39:49 +0300 Subject: [PATCH 1/3] fix: handle cached file delete where path does not exist While testing flakiness of disk-based cache delete metrics, it was found that eviction may happen from different reasons and the path may already be deleted. To avoid a runtime exception (that is anyway shallowed by listener execution), this PR introduces some validation before checking size. --- .../tieredstorage/fetch/cache/DiskChunkCache.java | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/core/src/main/java/io/aiven/kafka/tieredstorage/fetch/cache/DiskChunkCache.java b/core/src/main/java/io/aiven/kafka/tieredstorage/fetch/cache/DiskChunkCache.java index 005dca0d4..ab9ffb592 100644 --- a/core/src/main/java/io/aiven/kafka/tieredstorage/fetch/cache/DiskChunkCache.java +++ b/core/src/main/java/io/aiven/kafka/tieredstorage/fetch/cache/DiskChunkCache.java @@ -98,11 +98,16 @@ public RemovalListener removalListener() { return (key, path, cause) -> { try { if (path != null) { - final long fileSize = Files.size(path); - Files.delete(path); - metrics.chunkDeleted(fileSize); - log.trace("Deleted cached file for key {} with path {} from cache directory." - + " The reason of the deletion is {}", key, path, cause); + if (Files.exists(path)) { + final long fileSize = Files.size(path); + Files.delete(path); + metrics.chunkDeleted(fileSize); + log.trace("Deleted cached file for key {} with path {} from cache directory." + + " The reason of the deletion is {}", key, path, cause); + } else { + log.debug("Deletion of cached file for key {} with " + + "path {} is requested by file is not found", key, path); + } } else { log.warn("Path not present when trying to delete cached file for key {} from cache directory." + " The reason of the deletion is {}", key, cause); From 615a0afa581b422237333de1bb820e7de161751c Mon Sep 17 00:00:00 2001 From: Jorge Esteban Quilcate Otoya Date: Mon, 15 Jul 2024 16:42:49 +0300 Subject: [PATCH 2/3] fix: add retention-based eviction to disk-based cache metrics test To reduce flakiness where deletion is not executed in a consistent manner based on size, a time-based eviction configuration is added to have either 1 or 2 deletions happening while test is running to validate results. Before it was only checking either first or second value where deleted. Now it is checking 1 or 2 or both. To validated flakiness, @RepeatedTest(100000) was used, and it's now passing fine. --- .../cache/DiskChunkCacheMetricsTest.java | 27 ++++++++++++++----- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/core/src/test/java/io/aiven/kafka/tieredstorage/fetch/cache/DiskChunkCacheMetricsTest.java b/core/src/test/java/io/aiven/kafka/tieredstorage/fetch/cache/DiskChunkCacheMetricsTest.java index 1c769a606..467ae23fe 100644 --- a/core/src/test/java/io/aiven/kafka/tieredstorage/fetch/cache/DiskChunkCacheMetricsTest.java +++ b/core/src/test/java/io/aiven/kafka/tieredstorage/fetch/cache/DiskChunkCacheMetricsTest.java @@ -89,7 +89,8 @@ void metrics() throws IOException, JMException, StorageBackendException { final DiskChunkCache diskChunkCache = new DiskChunkCache(chunkManager, time); diskChunkCache.configure(Map.of( "size", size1, // enough to put the first, but not both - "path", baseCachePath.toString() + "path", baseCachePath.toString(), + "retention.ms", String.valueOf(Duration.ofSeconds(10).toMillis()) )); diskChunkCache.getChunk(OBJECT_KEY_PATH, SEGMENT_MANIFEST, 0); @@ -118,28 +119,40 @@ void metrics() throws IOException, JMException, StorageBackendException { assertThat(MBEAN_SERVER.getAttribute(objectName, "write-bytes-rate")) .isEqualTo(((double) (size1 + size2)) / METRIC_TIME_WINDOW_SEC); + // because of the retention ms, it may be deleting cached values 1, 2 or both. await("Deletion happens") - .atMost(Duration.ofSeconds(30)) // increase to reduce chance of flakiness + .atMost(Duration.ofSeconds(30)) .pollDelay(Duration.ofMillis(100)) .pollInterval(Duration.ofMillis(100)) .until(() -> (double) MBEAN_SERVER.getAttribute(objectName, "delete-total") > 0); assertThat(MBEAN_SERVER.getAttribute(objectName, "delete-total")) - .isEqualTo(1.0); + .asInstanceOf(DOUBLE) + .satisfiesAnyOf( + deleteTotal -> assertThat(deleteTotal).isEqualTo(1), + deleteTotal -> assertThat(deleteTotal).isEqualTo(2) + ); assertThat(MBEAN_SERVER.getAttribute(objectName, "delete-rate")) - .isEqualTo(1.0 / METRIC_TIME_WINDOW_SEC); + .satisfiesAnyOf( + deleteTotalRate -> assertThat(deleteTotalRate).isEqualTo(1.0 / METRIC_TIME_WINDOW_SEC), + deleteTotalRate -> assertThat(deleteTotalRate).isEqualTo(2.0 / METRIC_TIME_WINDOW_SEC) + ); assertThat(MBEAN_SERVER.getAttribute(objectName, "delete-bytes-total")) + .asInstanceOf(DOUBLE) .satisfiesAnyOf( - deleteBytesTotal -> assertThat(deleteBytesTotal).asInstanceOf(DOUBLE).isEqualTo(size1), - deleteBytesTotal -> assertThat(deleteBytesTotal).asInstanceOf(DOUBLE).isEqualTo(size2) + deleteBytesTotal -> assertThat(deleteBytesTotal).isEqualTo(size1), + deleteBytesTotal -> assertThat(deleteBytesTotal).isEqualTo(size2), + deleteBytesTotal -> assertThat(deleteBytesTotal).isEqualTo(size1 + size2) ); assertThat(MBEAN_SERVER.getAttribute(objectName, "delete-bytes-rate")) .satisfiesAnyOf( deleteBytesRate -> assertThat(deleteBytesRate) .isEqualTo((double) size1 / METRIC_TIME_WINDOW_SEC), deleteBytesRate -> assertThat(deleteBytesRate) - .isEqualTo((double) size2 / METRIC_TIME_WINDOW_SEC) + .isEqualTo((double) size2 / METRIC_TIME_WINDOW_SEC), + deleteBytesRate -> assertThat(deleteBytesRate) + .isEqualTo((double) (size1 + size2) / METRIC_TIME_WINDOW_SEC) ); } } From 01deb757c5b2c7e03943b7ada75ab5584d105fd7 Mon Sep 17 00:00:00 2001 From: Jorge Esteban Quilcate Otoya Date: Thu, 8 Aug 2024 20:42:25 +0300 Subject: [PATCH 3/3] fix: add retention-based eviction to memory-based cache metrics test Similar to disk-based test, as deletion is not happening consistently leading to flakiness, this commit includes time-based retention to force deletion and validate metrics. --- .../fetch/index/MemorySegmentIndexesCacheTest.java | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/core/src/test/java/io/aiven/kafka/tieredstorage/fetch/index/MemorySegmentIndexesCacheTest.java b/core/src/test/java/io/aiven/kafka/tieredstorage/fetch/index/MemorySegmentIndexesCacheTest.java index 83d4ee748..04edb95a6 100644 --- a/core/src/test/java/io/aiven/kafka/tieredstorage/fetch/index/MemorySegmentIndexesCacheTest.java +++ b/core/src/test/java/io/aiven/kafka/tieredstorage/fetch/index/MemorySegmentIndexesCacheTest.java @@ -215,7 +215,7 @@ void timeBasedEviction() throws IOException, StorageBackendException, Interrupte void sizeBasedEviction() throws IOException, StorageBackendException { cache.configure(Map.of( "size", "18", - "retention.ms", "-1" + "retention.ms", String.valueOf(Duration.ofSeconds(10).toMillis()) )); assertThat(cache.cache.asMap()).isEmpty(); @@ -253,13 +253,14 @@ void sizeBasedEviction() throws IOException, StorageBackendException { assertThat(timeIndex).hasBinaryContent(TIME_INDEX); assertThat(cache.cache.asMap()).isNotEmpty(); + // because of the retention ms, it may be deleting cached values 1, 2 or both. await() - .atMost(Duration.ofSeconds(30)) // increase to reduce chance of flakiness + .atMost(Duration.ofSeconds(30)) .pollDelay(Duration.ofSeconds(2)) .pollInterval(Duration.ofMillis(10)) .until(() -> !mockingDetails(removalListener).getInvocations().isEmpty()); - assertThat(cache.cache.asMap()).hasSize(1); + assertThat(cache.cache.asMap().size()).isLessThanOrEqualTo(1); verify(removalListener).onRemoval(any(SegmentIndexKey.class), any(), eq(RemovalCause.SIZE)); } }