diff --git a/docs/installation.md b/docs/installation.md index d63d7a823..1579248f2 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -347,7 +347,7 @@ Standard Helm options such as `replicaCount`, `image`, `imagePullSecrets`, More detail about the chart are available in the [values reference documentation](https://github.com/trinodb/charts/blob/main/charts/gateway/README.md) -### Health Checks +### Health checks on Trino clusters The Trino Gateway periodically performs health checks and maintains an in-memory TrinoStatus for each backend. If a backend fails a health check, diff --git a/docs/operation.md b/docs/operation.md index 26dd9e07a..b038451c1 100644 --- a/docs/operation.md +++ b/docs/operation.md @@ -36,6 +36,7 @@ documentation](https://trino.io/docs/current/admin/graceful-shutdown.html) for more details. ## Query routing options + - The default router selects the backend randomly to route the queries. - If you want to route the queries to the least loaded backend for a user i.e. backend with the least number of queries running or queued from a particular user, @@ -65,3 +66,15 @@ scrape_configs: - targets: - gateway1.example.com:8080 ``` + +## Trino Gateway health endpoints + +Trino Gateway provides two API endpoints to indicate the current status of the server: + +* `/trino-gateway/livez` always returns status code 200, indicating the server is +alive. However, it might not respond if the gateway is too busy, stuck, or +taking a long time for garbage collection. +* `/trino-gateway/readyz` returns status code 200 once the server has +completed initialization and is ready to serve requests. This means the initial +connection to the database and the first round of health checks on Trino clusters +have completed. Otherwise, status code 503 is returned. 
diff --git a/gateway-ha/src/main/java/io/trino/gateway/baseapp/BaseApp.java b/gateway-ha/src/main/java/io/trino/gateway/baseapp/BaseApp.java index 391e284dd..a9635ead8 100644 --- a/gateway-ha/src/main/java/io/trino/gateway/baseapp/BaseApp.java +++ b/gateway-ha/src/main/java/io/trino/gateway/baseapp/BaseApp.java @@ -26,6 +26,7 @@ import io.trino.gateway.ha.module.RouterBaseModule; import io.trino.gateway.ha.module.StochasticRoutingManagerProvider; import io.trino.gateway.ha.resource.EntityEditorResource; +import io.trino.gateway.ha.resource.GatewayHealthCheckResource; import io.trino.gateway.ha.resource.GatewayResource; import io.trino.gateway.ha.resource.GatewayViewResource; import io.trino.gateway.ha.resource.GatewayWebAppResource; @@ -179,6 +180,7 @@ private static void registerResources(Binder binder) jaxrsBinder(binder).bind(PublicResource.class); jaxrsBinder(binder).bind(TrinoResource.class); jaxrsBinder(binder).bind(WebUIStaticResource.class); + jaxrsBinder(binder).bind(GatewayHealthCheckResource.class); } private static void registerAuthFilters(Binder binder) diff --git a/gateway-ha/src/main/java/io/trino/gateway/ha/clustermonitor/ActiveClusterMonitor.java b/gateway-ha/src/main/java/io/trino/gateway/ha/clustermonitor/ActiveClusterMonitor.java index 235c0caec..925a393b4 100644 --- a/gateway-ha/src/main/java/io/trino/gateway/ha/clustermonitor/ActiveClusterMonitor.java +++ b/gateway-ha/src/main/java/io/trino/gateway/ha/clustermonitor/ActiveClusterMonitor.java @@ -37,6 +37,7 @@ public class ActiveClusterMonitor public static final int DEFAULT_THREAD_POOL_SIZE = 20; private static final Logger log = Logger.get(ActiveClusterMonitor.class); + private volatile boolean isInitialized; private final List clusterStatsObservers; private final GatewayBackendManager gatewayBackendManager; @@ -83,6 +84,7 @@ public void start() observer.observe(stats); } } + isInitialized = true; } catch (Exception e) { log.error(e, "Error performing backend monitor tasks"); @@ -96,4 +98,13 
@@ public void stop() executorService.shutdownNow(); scheduledExecutor.shutdownNow(); } + + /** + * Returns the flag that {@link #start()} raises at the end of its monitoring + * {@code try} block, after the observers have been notified. + */ + public boolean isInitialized() + { + return isInitialized; + } } diff --git a/gateway-ha/src/main/java/io/trino/gateway/ha/resource/GatewayHealthCheckResource.java b/gateway-ha/src/main/java/io/trino/gateway/ha/resource/GatewayHealthCheckResource.java new file mode 100644 index 000000000..21ad30ebb --- /dev/null +++ b/gateway-ha/src/main/java/io/trino/gateway/ha/resource/GatewayHealthCheckResource.java @@ -0,0 +1,65 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.trino.gateway.ha.resource; + +import com.google.inject.Inject; +import io.trino.gateway.ha.clustermonitor.ActiveClusterMonitor; +import jakarta.ws.rs.GET; +import jakarta.ws.rs.Path; +import jakarta.ws.rs.core.Response; + +import static java.util.Objects.requireNonNull; + +/** + * Liveness and readiness endpoints of the gateway itself, served under + * {@code /trino-gateway}. + */ +@Path("/trino-gateway") +public class GatewayHealthCheckResource +{ + private final ActiveClusterMonitor activeClusterMonitor; + + @Inject + public GatewayHealthCheckResource(ActiveClusterMonitor activeClusterMonitor) + { + this.activeClusterMonitor = requireNonNull(activeClusterMonitor, "activeClusterMonitor is null"); + } + + /** + * Liveness probe: unconditionally answers 200 with the body {@code ok}. + */ + @GET + @Path("/livez") + public Response liveness() + { + return Response.ok("ok").build(); + } + + /** + * Readiness probe: answers 503 while {@link ActiveClusterMonitor#isInitialized()} is false, + * and 200 with the body {@code ok} once it is true. + */ + @GET + @Path("/readyz") + public Response readiness() + { + if (!activeClusterMonitor.isInitialized()) { + return Response + .status(Response.Status.SERVICE_UNAVAILABLE) + .entity("Trino Gateway is still initializing") + .build(); + } + return Response.ok("ok").build(); + } +} diff --git a/gateway-ha/src/test/java/io/trino/gateway/ha/TestGatewayHaMultipleBackend.java b/gateway-ha/src/test/java/io/trino/gateway/ha/TestGatewayHaMultipleBackend.java index 56ebfb5eb..64c2daced 100644 --- a/gateway-ha/src/test/java/io/trino/gateway/ha/TestGatewayHaMultipleBackend.java +++ b/gateway-ha/src/test/java/io/trino/gateway/ha/TestGatewayHaMultipleBackend.java @@ -47,6 +47,7 @@ import static com.google.common.collect.MoreCollectors.onlyElement; import static com.google.common.net.HttpHeaders.CONTENT_TYPE; import static com.google.common.net.MediaType.JSON_UTF_8; +import static com.google.common.util.concurrent.Uninterruptibles.sleepUninterruptibly; import static org.assertj.core.api.Assertions.assertThat; import static org.testcontainers.utility.MountableFile.forClasspathResource; @@ -359,6 +360,32 @@ void testCookieSigning() assertThat(callbackResponse.code()).isEqualTo(500); } + @Test + void testHealthCheckEndpoints() + throws IOException + { + Request livenessCheck = 
new Request.Builder() + .url("http://localhost:" + routerPort + "/trino-gateway/livez") + .build(); + try (Response livenessResponse = httpClient.newCall(livenessCheck).execute()) { + assertThat(livenessResponse.code()).isEqualTo(200); + } + + Request readinessCheck = new Request.Builder() + .url("http://localhost:" + routerPort + "/trino-gateway/readyz") + .build(); + // Readiness only turns 200 once the cluster monitor reports initialized, so poll up to 100 times, 100 ms apart. + for (int i = 0; i < 100; i++) { + try (Response readinessResponse = httpClient.newCall(readinessCheck).execute()) { + if (readinessResponse.code() == 200) { + return; + } + } + sleepUninterruptibly(100, TimeUnit.MILLISECONDS); + } + throw new IllegalStateException("Trino Gateway health check failed"); + } + @AfterAll void cleanup() {