diff --git a/src/main/java/com/google/devtools/build/lib/remote/RemoteModule.java b/src/main/java/com/google/devtools/build/lib/remote/RemoteModule.java index 24f41371232efe..1ed18af9697853 100644 --- a/src/main/java/com/google/devtools/build/lib/remote/RemoteModule.java +++ b/src/main/java/com/google/devtools/build/lib/remote/RemoteModule.java @@ -227,6 +227,17 @@ private static boolean shouldEnableRemoteOutputService(RemoteOptions options) { return retry; }; + public static final Predicate HTTP_SUCCESS_CODES = + e -> { + boolean success = false; + if (e instanceof HttpException) { + int status = ((HttpException) e).response().status().code(); + success = + status == HttpResponseStatus.NOT_FOUND.code(); + } + return success; + }; + private void initHttpAndDiskCache( CommandEnvironment env, Credentials credentials, @@ -247,7 +258,7 @@ private void initHttpAndDiskCache( digestUtil, executorService, new RemoteRetrier( - remoteOptions, RETRIABLE_HTTP_ERRORS, retryScheduler, circuitBreaker)); + remoteOptions, RETRIABLE_HTTP_ERRORS, retryScheduler, circuitBreaker, HTTP_SUCCESS_CODES)); } catch (IOException e) { handleInitFailure(env, e, Code.CACHE_INIT_FAILURE); return; @@ -474,7 +485,7 @@ public void beforeCommand(CommandEnvironment env) throws AbruptExitException { CircuitBreakerFactory.createCircuitBreaker(remoteOptions); RemoteRetrier retrier = new RemoteRetrier( - remoteOptions, RemoteRetrier.RETRIABLE_GRPC_ERRORS, retryScheduler, circuitBreaker); + remoteOptions, RemoteRetrier.RETRIABLE_GRPC_ERRORS, retryScheduler, circuitBreaker, RemoteRetrier.GRPC_SUCCESS_CODES); if (!Strings.isNullOrEmpty(remoteOptions.remoteOutputService)) { var bazelOutputServiceChannel = @@ -648,7 +659,8 @@ public void beforeCommand(CommandEnvironment env) throws AbruptExitException { remoteOptions, RemoteRetrier.RETRIABLE_GRPC_ERRORS, // Handle NOT_FOUND internally retryScheduler, - circuitBreaker); + circuitBreaker, + RemoteRetrier.GRPC_SUCCESS_CODES); remoteExecutor = new ExperimentalGrpcRemoteExecutor( remoteOptions, execChannel.retain(), callCredentialsProvider, execRetrier); @@ -658,7 +670,8 @@ public void beforeCommand(CommandEnvironment env) throws AbruptExitException { remoteOptions, RemoteRetrier.RETRIABLE_GRPC_EXEC_ERRORS, retryScheduler, - circuitBreaker); + circuitBreaker, + RemoteRetrier.GRPC_SUCCESS_CODES); remoteExecutor = new GrpcRemoteExecutor(execChannel.retain(), callCredentialsProvider, execRetrier); } diff --git a/src/main/java/com/google/devtools/build/lib/remote/RemoteRetrier.java b/src/main/java/com/google/devtools/build/lib/remote/RemoteRetrier.java index f9186b85bafa9d..2448cb3df11b4a 100644 --- a/src/main/java/com/google/devtools/build/lib/remote/RemoteRetrier.java +++ b/src/main/java/com/google/devtools/build/lib/remote/RemoteRetrier.java @@ -67,12 +67,30 @@ private static Status fromException(Exception e) { return RemoteRetrierUtils.causedByStatus(e, Status.Code.NOT_FOUND); }; + public static final Predicate GRPC_SUCCESS_CODES = + e -> { + Status s = fromException(e); + if (s == null) { + // It's not a gRPC error. + return false; + } + switch (s.getCode()) { + case INVALID_ARGUMENT: + case NOT_FOUND: + case ALREADY_EXISTS: + case OUT_OF_RANGE: + return true; + default: + return false; + } + }; + public RemoteRetrier( RemoteOptions options, Predicate shouldRetry, ListeningScheduledExecutorService retryScheduler, CircuitBreaker circuitBreaker) { - this( + super( options.remoteMaxRetryAttempts > 0 ? () -> new ExponentialBackoff(options) : () -> RETRIES_DISABLED, @@ -81,6 +99,22 @@ public RemoteRetrier( circuitBreaker); } + public RemoteRetrier( + RemoteOptions options, + Predicate shouldRetry, + ListeningScheduledExecutorService retryScheduler, + CircuitBreaker circuitBreaker, + Predicate isSuccess) { + super( + options.remoteMaxRetryAttempts > 0 + ? () -> new ExponentialBackoff(options) + : () -> RETRIES_DISABLED, + shouldRetry, + retryScheduler, + circuitBreaker, + isSuccess); + } + public RemoteRetrier( Supplier backoff, Predicate shouldRetry, diff --git a/src/main/java/com/google/devtools/build/lib/remote/Retrier.java b/src/main/java/com/google/devtools/build/lib/remote/Retrier.java index 6647b155465cb7..db1df9f71b3adc 100644 --- a/src/main/java/com/google/devtools/build/lib/remote/Retrier.java +++ b/src/main/java/com/google/devtools/build/lib/remote/Retrier.java @@ -176,11 +176,18 @@ public int getRetryAttempts() { } } + /* Treating all retriable errors as successful api call. */ + public static final Predicate ALWAYS_SUCCESS = + e -> { + return true; + }; + private final Supplier backoffSupplier; private final Predicate shouldRetry; private final CircuitBreaker circuitBreaker; private final ListeningScheduledExecutorService retryService; private final Sleeper sleeper; + private final Predicate isSuccess; public Retrier( Supplier backoffSupplier, @@ -191,6 +198,20 @@ public Retrier( backoffSupplier, shouldRetry, retryScheduler, circuitBreaker, TimeUnit.MILLISECONDS::sleep); } + public Retrier( + Supplier backoffSupplier, + Predicate shouldRetry, + ListeningScheduledExecutorService retryScheduler, + CircuitBreaker circuitBreaker, + Predicate isSuccess) { + this.backoffSupplier = backoffSupplier; + this.shouldRetry = shouldRetry; + this.retryService = retryScheduler; + this.circuitBreaker = circuitBreaker; + this.sleeper = TimeUnit.MILLISECONDS::sleep; + this.isSuccess = isSuccess; + } + @VisibleForTesting Retrier( Supplier backoffSupplier, @@ -203,6 +224,7 @@ public Retrier( this.retryService = retryService; this.circuitBreaker = circuitBreaker; this.sleeper = sleeper; + this.isSuccess = ALWAYS_SUCCESS; } ListeningScheduledExecutorService getRetryService() { @@ -248,8 +270,12 @@ public T execute(Callable call, Backoff backoff) throws Exception { } catch (Exception e) { Throwables.throwIfInstanceOf(e, InterruptedException.class); if (!shouldRetry.test(e)) { - // A non-retriable error doesn't represent server failure. - circuitBreaker.recordSuccess(); + // A non-retriable error may represent either success or server failure. + if (isSuccess.test(e)) { + circuitBreaker.recordSuccess(); + } else { + circuitBreaker.recordFailure(); + } throw e; } circuitBreaker.recordFailure(); @@ -321,7 +347,11 @@ private ListenableFuture onExecuteAsyncFailure( // gRPC Errors NOT_FOUND, OUT_OF_RANGE, ALREADY_EXISTS etc. are non-retriable error, and they // don't represent an // issue in Server. So treating these errors as successful api call. - circuitBreaker.recordSuccess(); + if (isSuccess.test(t)) { + circuitBreaker.recordSuccess(); + } else { + circuitBreaker.recordFailure(); + } return Futures.immediateFailedFuture(t); } } diff --git a/src/main/java/com/google/devtools/build/lib/remote/circuitbreaker/CircuitBreakerFactory.java b/src/main/java/com/google/devtools/build/lib/remote/circuitbreaker/CircuitBreakerFactory.java index 7781440880c885..421c57be2dec5d 100644 --- a/src/main/java/com/google/devtools/build/lib/remote/circuitbreaker/CircuitBreakerFactory.java +++ b/src/main/java/com/google/devtools/build/lib/remote/circuitbreaker/CircuitBreakerFactory.java @@ -19,6 +19,7 @@ /** Factory for {@link Retrier.CircuitBreaker} */ public class CircuitBreakerFactory { public static final int DEFAULT_MIN_CALL_COUNT_TO_COMPUTE_FAILURE_RATE = 100; + public static final int DEFAULT_MIN_FAIL_COUNT_TO_COMPUTE_FAILURE_RATE = 12; private CircuitBreakerFactory() {} diff --git a/src/main/java/com/google/devtools/build/lib/remote/circuitbreaker/FailureCircuitBreaker.java b/src/main/java/com/google/devtools/build/lib/remote/circuitbreaker/FailureCircuitBreaker.java index 8baa72bc13d0f1..3fe502daf50b06 100644 --- a/src/main/java/com/google/devtools/build/lib/remote/circuitbreaker/FailureCircuitBreaker.java +++ b/src/main/java/com/google/devtools/build/lib/remote/circuitbreaker/FailureCircuitBreaker.java @@ -35,6 +35,7 @@ public class FailureCircuitBreaker implements Retrier.CircuitBreaker { private final int failureRateThreshold; private final int slidingWindowSize; private final int minCallCountToComputeFailureRate; + private final int minFailCountToComputeFailureRate; private final ScheduledExecutorService scheduledExecutor; /** @@ -52,6 +53,8 @@ public FailureCircuitBreaker(int failureRateThreshold, int slidingWindowSize) { this.slidingWindowSize = slidingWindowSize; this.minCallCountToComputeFailureRate = CircuitBreakerFactory.DEFAULT_MIN_CALL_COUNT_TO_COMPUTE_FAILURE_RATE; + this.minFailCountToComputeFailureRate = + CircuitBreakerFactory.DEFAULT_MIN_FAIL_COUNT_TO_COMPUTE_FAILURE_RATE; this.state = State.ACCEPT_CALLS; this.scheduledExecutor = slidingWindowSize > 0 ? Executors.newSingleThreadScheduledExecutor() : null; @@ -72,7 +75,7 @@ public void recordFailure() { failures::decrementAndGet, slidingWindowSize, TimeUnit.MILLISECONDS); } - if (totalCallCount < minCallCountToComputeFailureRate) { + if (totalCallCount < minCallCountToComputeFailureRate && failureCount < minFailCountToComputeFailureRate) { // The remote call count is below the threshold required to calculate the failure rate. return; } diff --git a/src/test/java/com/google/devtools/build/lib/remote/RetrierTest.java b/src/test/java/com/google/devtools/build/lib/remote/RetrierTest.java index 0889ae9ce52f11..dcb1a8bb578133 100644 --- a/src/test/java/com/google/devtools/build/lib/remote/RetrierTest.java +++ b/src/test/java/com/google/devtools/build/lib/remote/RetrierTest.java @@ -375,6 +375,45 @@ public void testCircuitBreakerFailureAndSuccessCallOnDifferentGrpcError() { assertThat(cb.state).isEqualTo(State.ACCEPT_CALLS); } + @Test + public void testCircuitBreakerFailureAndSuccessCallOnNonRetriableGrpcError() { + Supplier s = () -> new ZeroBackoff(/*maxRetries=*/ 2); + List nonRetriableFailure = + Arrays.asList(Status.PERMISSION_DENIED, Status.UNIMPLEMENTED, Status.DATA_LOSS); + List nonRetriableSuccess = + Arrays.asList(Status.NOT_FOUND, Status.OUT_OF_RANGE, Status.ALREADY_EXISTS); + TripAfterNCircuitBreaker cb = + new TripAfterNCircuitBreaker(nonRetriableFailure.size()); + Retrier r = new Retrier(s, RemoteRetrier.RETRIABLE_GRPC_ERRORS, retryService, cb, RemoteRetrier.GRPC_SUCCESS_CODES); + + int expectedConsecutiveFailures = 0; + + for (Status status : nonRetriableFailure) { + ListenableFuture res = + r.executeAsync( + () -> { + throw new StatusRuntimeException(status); + }); + expectedConsecutiveFailures += 1; + assertThrows(ExecutionException.class, res::get); + assertThat(cb.consecutiveFailures).isEqualTo(expectedConsecutiveFailures); + } + + assertThat(cb.state).isEqualTo(State.REJECT_CALLS); + cb.trialCall(); + + for (Status status : nonRetriableSuccess) { + ListenableFuture res = + r.executeAsync( + () -> { + throw new StatusRuntimeException(status); + }); + assertThat(cb.consecutiveFailures).isEqualTo(0); + assertThrows(ExecutionException.class, res::get); + } + assertThat(cb.state).isEqualTo(State.ACCEPT_CALLS); + } + /** Simple circuit breaker that trips after N consecutive failures. */ @ThreadSafe private static class TripAfterNCircuitBreaker implements CircuitBreaker { diff --git a/src/test/java/com/google/devtools/build/lib/remote/circuitbreaker/FailureCircuitBreakerTest.java b/src/test/java/com/google/devtools/build/lib/remote/circuitbreaker/FailureCircuitBreakerTest.java index 6740d06f9ce6e9..ebe0aae8e47763 100644 --- a/src/test/java/com/google/devtools/build/lib/remote/circuitbreaker/FailureCircuitBreakerTest.java +++ b/src/test/java/com/google/devtools/build/lib/remote/circuitbreaker/FailureCircuitBreakerTest.java @@ -49,8 +49,8 @@ public void testRecordFailure_circuitTrips() throws InterruptedException { listOfSuccessAndFailureCalls.stream().parallel().forEach(Runnable::run); assertThat(failureCircuitBreaker.state()).isEqualTo(State.ACCEPT_CALLS); - // Sleep for windowInterval + 1ms. - Thread.sleep(windowInterval + 1 /*to compensate any delay*/); + // Sleep for windowInterval + 5ms. + Thread.sleep(windowInterval + 5 /*to compensate any delay*/); // make calls equals to threshold number of not ignored failure calls in parallel. listOfSuccessAndFailureCalls.stream().parallel().forEach(Runnable::run); @@ -64,21 +64,40 @@ public void testRecordFailure_circuitTrips() throws InterruptedException { @Test public void testRecordFailure_minCallCriteriaNotMet() throws InterruptedException { - final int failureRateThreshold = 10; + final int failureRateThreshold = 0; final int windowInterval = 100; final int minCallToComputeFailure = CircuitBreakerFactory.DEFAULT_MIN_CALL_COUNT_TO_COMPUTE_FAILURE_RATE; FailureCircuitBreaker failureCircuitBreaker = new FailureCircuitBreaker(failureRateThreshold, windowInterval); - // make half failure call, half success call and number of total call less than + // make success calls, failure call and number of total calls less than // minCallToComputeFailure. - IntStream.range(0, minCallToComputeFailure >> 1) - .parallel() - .forEach(i -> failureCircuitBreaker.recordFailure()); - IntStream.range(0, minCallToComputeFailure >> 1) + IntStream.range(0, minCallToComputeFailure - 2) .parallel() .forEach(i -> failureCircuitBreaker.recordSuccess()); + failureCircuitBreaker.recordFailure(); + assertThat(failureCircuitBreaker.state()).isEqualTo(State.ACCEPT_CALLS); + + // Sleep for less than windowInterval. + Thread.sleep(windowInterval - 50); + failureCircuitBreaker.recordFailure(); + assertThat(failureCircuitBreaker.state()).isEqualTo(State.REJECT_CALLS); + } + + @Test + public void testRecordFailure_minFailCriteriaNotMet() throws InterruptedException { + final int failureRateThreshold = 10; + final int windowInterval = 100; + final int minFailToComputeFailure = + CircuitBreakerFactory.DEFAULT_MIN_FAIL_COUNT_TO_COMPUTE_FAILURE_RATE; + FailureCircuitBreaker failureCircuitBreaker = + new FailureCircuitBreaker(failureRateThreshold, windowInterval); + + // make number of failure calls less than minFailToComputeFailure. + IntStream.range(0, minFailToComputeFailure - 1) + .parallel() + .forEach(i -> failureCircuitBreaker.recordFailure()); assertThat(failureCircuitBreaker.state()).isEqualTo(State.ACCEPT_CALLS); // Sleep for less than windowInterval.