From c227fa60e30f73fae13ba63eedccea33991ea5b6 Mon Sep 17 00:00:00 2001 From: Star Poon Date: Thu, 9 Jan 2025 18:54:36 +0900 Subject: [PATCH 1/9] Add health check API endpoints --- docs/installation.md | 2 +- docs/operation.md | 9 ++++ .../io/trino/gateway/baseapp/BaseApp.java | 2 + .../clustermonitor/ActiveClusterMonitor.java | 7 +++ .../resource/GatewayHealthCheckResource.java | 54 +++++++++++++++++++ .../ha/TestGatewayHaMultipleBackend.java | 19 +++++++ 6 files changed, 92 insertions(+), 1 deletion(-) create mode 100644 gateway-ha/src/main/java/io/trino/gateway/ha/resource/GatewayHealthCheckResource.java diff --git a/docs/installation.md b/docs/installation.md index d63d7a823..6f5c04bf4 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -347,7 +347,7 @@ Standard Helm options such as `replicaCount`, `image`, `imagePullSecrets`, More detail about the chart are available in the [values reference documentation](https://github.com/trinodb/charts/blob/main/charts/gateway/README.md) -### Health Checks +### Health Checks on Trino Clusters The Trino Gateway periodically performs health checks and maintains an in-memory TrinoStatus for each backend. If a backend fails a health check, diff --git a/docs/operation.md b/docs/operation.md index 26dd9e07a..955dbc528 100644 --- a/docs/operation.md +++ b/docs/operation.md @@ -36,6 +36,7 @@ documentation](https://trino.io/docs/current/admin/graceful-shutdown.html) for more details. ## Query routing options + - The default router selects the backend randomly to route the queries. - If you want to route the queries to the least loaded backend for a user i.e. backend with the least number of queries running or queued from a particular user, @@ -65,3 +66,11 @@ scrape_configs: - targets: - gateway1.example.com:8080 ``` + +## API health endpoints + +Trino Gateway provides two API endpoints to indicate the current status of the server. +`/trino-gateway/livez` returns status code 200, indicating the server is alive. +`/trino-gateway/readyz` returns status code 200, indicating the server has +completed initialization and is ready to serve requests. Otherwise, status code +503 will be returned. diff --git a/gateway-ha/src/main/java/io/trino/gateway/baseapp/BaseApp.java b/gateway-ha/src/main/java/io/trino/gateway/baseapp/BaseApp.java index 391e284dd..a9635ead8 100644 --- a/gateway-ha/src/main/java/io/trino/gateway/baseapp/BaseApp.java +++ b/gateway-ha/src/main/java/io/trino/gateway/baseapp/BaseApp.java @@ -26,6 +26,7 @@ import io.trino.gateway.ha.module.RouterBaseModule; import io.trino.gateway.ha.module.StochasticRoutingManagerProvider; import io.trino.gateway.ha.resource.EntityEditorResource; +import io.trino.gateway.ha.resource.GatewayHealthCheckResource; import io.trino.gateway.ha.resource.GatewayResource; import io.trino.gateway.ha.resource.GatewayViewResource; import io.trino.gateway.ha.resource.GatewayWebAppResource; @@ -179,6 +180,7 @@ private static void registerResources(Binder binder) jaxrsBinder(binder).bind(PublicResource.class); jaxrsBinder(binder).bind(TrinoResource.class); jaxrsBinder(binder).bind(WebUIStaticResource.class); + jaxrsBinder(binder).bind(GatewayHealthCheckResource.class); } private static void registerAuthFilters(Binder binder) diff --git a/gateway-ha/src/main/java/io/trino/gateway/ha/clustermonitor/ActiveClusterMonitor.java b/gateway-ha/src/main/java/io/trino/gateway/ha/clustermonitor/ActiveClusterMonitor.java index 235c0caec..925a393b4 100644 --- a/gateway-ha/src/main/java/io/trino/gateway/ha/clustermonitor/ActiveClusterMonitor.java +++ b/gateway-ha/src/main/java/io/trino/gateway/ha/clustermonitor/ActiveClusterMonitor.java @@ -37,6 +37,7 @@ public class ActiveClusterMonitor public static final int DEFAULT_THREAD_POOL_SIZE = 20; private static final Logger log = Logger.get(ActiveClusterMonitor.class); + private volatile boolean isInitialized; private final List clusterStatsObservers; private final GatewayBackendManager gatewayBackendManager; @@ -83,6 +84,7 @@ public void start() observer.observe(stats); } } + isInitialized = true; } catch (Exception e) { log.error(e, "Error performing backend monitor tasks"); @@ -96,4 +98,9 @@ public void stop() executorService.shutdownNow(); scheduledExecutor.shutdownNow(); } + + public boolean isInitialized() + { + return isInitialized; + } } diff --git a/gateway-ha/src/main/java/io/trino/gateway/ha/resource/GatewayHealthCheckResource.java b/gateway-ha/src/main/java/io/trino/gateway/ha/resource/GatewayHealthCheckResource.java new file mode 100644 index 000000000..21ad30ebb --- /dev/null +++ b/gateway-ha/src/main/java/io/trino/gateway/ha/resource/GatewayHealthCheckResource.java @@ -0,0 +1,54 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.gateway.ha.resource; + +import com.google.inject.Inject; +import io.trino.gateway.ha.clustermonitor.ActiveClusterMonitor; +import jakarta.ws.rs.GET; +import jakarta.ws.rs.Path; +import jakarta.ws.rs.core.Response; + +import static java.util.Objects.requireNonNull; + +@Path("/trino-gateway") +public class GatewayHealthCheckResource +{ + private final ActiveClusterMonitor activeClusterMonitor; + + @Inject + public GatewayHealthCheckResource(ActiveClusterMonitor activeClusterMonitor) + { + this.activeClusterMonitor = requireNonNull(activeClusterMonitor, "activeClusterMonitor is null"); + } + + @GET + @Path("/livez") + public Response liveness() + { + return Response.ok("ok").build(); + } + + @GET + @Path("/readyz") + public Response readiness() + { + if (!activeClusterMonitor.isInitialized()) { + return Response + .status(Response.Status.SERVICE_UNAVAILABLE) + .entity("Trino Gateway is still initializing") + .build(); + } + return Response.ok("ok").build(); + } +} diff --git a/gateway-ha/src/test/java/io/trino/gateway/ha/TestGatewayHaMultipleBackend.java b/gateway-ha/src/test/java/io/trino/gateway/ha/TestGatewayHaMultipleBackend.java index 56ebfb5eb..635767b07 100644 --- a/gateway-ha/src/test/java/io/trino/gateway/ha/TestGatewayHaMultipleBackend.java +++ b/gateway-ha/src/test/java/io/trino/gateway/ha/TestGatewayHaMultipleBackend.java @@ -47,6 +47,7 @@ import static com.google.common.collect.MoreCollectors.onlyElement; import static com.google.common.net.HttpHeaders.CONTENT_TYPE; import static com.google.common.net.MediaType.JSON_UTF_8; +import static com.google.common.util.concurrent.Uninterruptibles.sleepUninterruptibly; import static org.assertj.core.api.Assertions.assertThat; import static org.testcontainers.utility.MountableFile.forClasspathResource; @@ -359,6 +360,24 @@ void testCookieSigning() assertThat(callbackResponse.code()).isEqualTo(500); } + @Test + void testHealthCheckEndpoints() + throws IOException + { + Request livenessCheck = new Request.Builder() + .url("http://localhost:" + routerPort + "/trino-gateway/livez") + .build(); + Response livenessResponse = httpClient.newCall(livenessCheck).execute(); + assertThat(livenessResponse.code()).isEqualTo(200); + + sleepUninterruptibly(1, TimeUnit.SECONDS); // wait for server initialization + Request readinessCheck = new Request.Builder() + .url("http://localhost:" + routerPort + "/trino-gateway/readyz") + .build(); + Response readinessResponse = httpClient.newCall(readinessCheck).execute(); + assertThat(readinessResponse.code()).isEqualTo(200); + } + @AfterAll void cleanup() { From 034b5f6e29a7066ebeb35ff32687271131f2dfde Mon Sep 17 00:00:00 2001 From: Star Poon Date: Sat, 25 Jan 2025 09:23:02 +0900 Subject: [PATCH 2/9] fixup! Add health check API endpoints --- .../gateway/ha/TestGatewayHaMultipleBackend.java | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/gateway-ha/src/test/java/io/trino/gateway/ha/TestGatewayHaMultipleBackend.java b/gateway-ha/src/test/java/io/trino/gateway/ha/TestGatewayHaMultipleBackend.java index 635767b07..64c2daced 100644 --- a/gateway-ha/src/test/java/io/trino/gateway/ha/TestGatewayHaMultipleBackend.java +++ b/gateway-ha/src/test/java/io/trino/gateway/ha/TestGatewayHaMultipleBackend.java @@ -370,12 +370,18 @@ void testHealthCheckEndpoints() Response livenessResponse = httpClient.newCall(livenessCheck).execute(); assertThat(livenessResponse.code()).isEqualTo(200); - sleepUninterruptibly(1, TimeUnit.SECONDS); // wait for server initialization Request readinessCheck = new Request.Builder() .url("http://localhost:" + routerPort + "/trino-gateway/readyz") .build(); - Response readinessResponse = httpClient.newCall(readinessCheck).execute(); - assertThat(readinessResponse.code()).isEqualTo(200); + for (int i = 0; i < 100; i++) { + try (Response readinessResponse = httpClient.newCall(readinessCheck).execute()) { + if (readinessResponse.code() == 200) { + return; + } + } + sleepUninterruptibly(100, TimeUnit.MILLISECONDS); + } + throw new IllegalStateException("Trino Gateway health check failed"); } @AfterAll From b2b4abc2d931c109b8b7e13d23a3820111d85c00 Mon Sep 17 00:00:00 2001 From: Star Poon Date: Wed, 29 Jan 2025 11:26:52 +0900 Subject: [PATCH 3/9] fixup! Add health check API endpoints --- docs/installation.md | 2 +- docs/operation.md | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/installation.md b/docs/installation.md index 6f5c04bf4..1579248f2 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -347,7 +347,7 @@ Standard Helm options such as `replicaCount`, `image`, `imagePullSecrets`, More detail about the chart are available in the [values reference documentation](https://github.com/trinodb/charts/blob/main/charts/gateway/README.md) -### Health Checks on Trino Clusters +### Health checks on Trino clusters The Trino Gateway periodically performs health checks and maintains an in-memory TrinoStatus for each backend. If a backend fails a health check, diff --git a/docs/operation.md b/docs/operation.md index 955dbc528..beba7ea61 100644 --- a/docs/operation.md +++ b/docs/operation.md @@ -67,10 +67,10 @@ scrape_configs: - gateway1.example.com:8080 ``` -## API health endpoints +## Trino Gateway health endpoints -Trino Gateway provides two API endpoints to indicate the current status of the server. -`/trino-gateway/livez` returns status code 200, indicating the server is alive. -`/trino-gateway/readyz` returns status code 200, indicating the server has +Trino Gateway provides two API endpoints to indicate the current status of the server: +* `/trino-gateway/livez` returns status code 200, indicating the server is alive. +* `/trino-gateway/readyz` returns status code 200, indicating the server has completed initialization and is ready to serve requests. Otherwise, status code -503 will be returned. +503 is returned. From d614e164ec8c319c9a4a855beebf7a8f491fdcb1 Mon Sep 17 00:00:00 2001 From: Star Poon Date: Wed, 29 Jan 2025 16:26:13 +0900 Subject: [PATCH 4/9] fixup! Add health check API endpoints --- docs/operation.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/operation.md b/docs/operation.md index beba7ea61..eee4dba88 100644 --- a/docs/operation.md +++ b/docs/operation.md @@ -72,5 +72,6 @@ scrape_configs: Trino Gateway provides two API endpoints to indicate the current status of the server: * `/trino-gateway/livez` returns status code 200, indicating the server is alive. * `/trino-gateway/readyz` returns status code 200, indicating the server has -completed initialization and is ready to serve requests. Otherwise, status code -503 is returned. +completed initialization and is ready to serve requests. This means the initial +connection to database and the first round of health check on Trino clusters +were completed. Otherwise, status code 503 is returned. From 8a0e89f5cfae31836c167dbd42fde09e91652e23 Mon Sep 17 00:00:00 2001 From: Star Poon Date: Thu, 30 Jan 2025 19:57:16 +0900 Subject: [PATCH 5/9] fixup! Add health check API endpoints --- docs/operation.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/operation.md b/docs/operation.md index eee4dba88..b038451c1 100644 --- a/docs/operation.md +++ b/docs/operation.md @@ -70,7 +70,9 @@ scrape_configs: ## Trino Gateway health endpoints Trino Gateway provides two API endpoints to indicate the current status of the server: -* `/trino-gateway/livez` returns status code 200, indicating the server is alive. +* `/trino-gateway/livez` always returns status code 200, indicating the server is +alive. However, it might not respond if the gateway is too busy, stuck, or +taking a long time for garbage collection. * `/trino-gateway/readyz` returns status code 200, indicating the server has completed initialization and is ready to serve requests. This means the initial connection to database and the first round of health check on Trino clusters From 2eafbfb7e88fd7154e9fffae6c923dbd7b8b9e05 Mon Sep 17 00:00:00 2001 From: oneonestar Date: Thu, 6 Feb 2025 08:06:02 +0900 Subject: [PATCH 6/9] Update docs/operation.md Co-authored-by: Manfred Moser --- docs/operation.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/operation.md b/docs/operation.md index b038451c1..94bab1424 100644 --- a/docs/operation.md +++ b/docs/operation.md @@ -70,6 +70,7 @@ scrape_configs: ## Trino Gateway health endpoints Trino Gateway provides two API endpoints to indicate the current status of the server: + * `/trino-gateway/livez` always returns status code 200, indicating the server is alive. However, it might not respond if the gateway is too busy, stuck, or taking a long time for garbage collection. From 67243b4c33bb4d2b20dcba1a2b9353b9c47bb936 Mon Sep 17 00:00:00 2001 From: oneonestar Date: Thu, 6 Feb 2025 08:06:22 +0900 Subject: [PATCH 7/9] Update docs/operation.md Co-authored-by: Manfred Moser --- docs/operation.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/operation.md b/docs/operation.md index 94bab1424..378a8c73f 100644 --- a/docs/operation.md +++ b/docs/operation.md @@ -72,7 +72,7 @@ scrape_configs: Trino Gateway provides two API endpoints to indicate the current status of the server: * `/trino-gateway/livez` always returns status code 200, indicating the server is -alive. However, it might not respond if the gateway is too busy, stuck, or +alive. However, it might not respond if the Trino Gateway is too busy, stuck, or taking a long time for garbage collection. * `/trino-gateway/readyz` returns status code 200, indicating the server has completed initialization and is ready to serve requests. This means the initial From b209394cb925a7c14f167cc4ddca4c7987732ca8 Mon Sep 17 00:00:00 2001 From: oneonestar Date: Thu, 6 Feb 2025 08:06:31 +0900 Subject: [PATCH 8/9] Update docs/operation.md Co-authored-by: Manfred Moser --- docs/operation.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/operation.md b/docs/operation.md index 378a8c73f..0f55d8a5a 100644 --- a/docs/operation.md +++ b/docs/operation.md @@ -76,5 +76,5 @@ alive. However, it might not respond if the Trino Gateway is too busy, stuck, or taking a long time for garbage collection. * `/trino-gateway/readyz` returns status code 200, indicating the server has completed initialization and is ready to serve requests. This means the initial -connection to database and the first round of health check on Trino clusters +connection to the database and the first round of health check on Trino clusters were completed. Otherwise, status code 503 is returned. From bf11632d7b53e39586269f7c1591f9ad760835a4 Mon Sep 17 00:00:00 2001 From: oneonestar Date: Thu, 6 Feb 2025 08:06:43 +0900 Subject: [PATCH 9/9] Update docs/operation.md Co-authored-by: Manfred Moser --- docs/operation.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/operation.md b/docs/operation.md index 0f55d8a5a..2fcd1114c 100644 --- a/docs/operation.md +++ b/docs/operation.md @@ -77,4 +77,4 @@ taking a long time for garbage collection. * `/trino-gateway/readyz` returns status code 200, indicating the server has completed initialization and is ready to serve requests. This means the initial connection to the database and the first round of health check on Trino clusters -were completed. Otherwise, status code 503 is returned. +are completed. Otherwise, status code 503 is returned.