Skip to content

Commit 3ad0b0e

Browse files
authored
Avoid lock up on unexpected ExecutorService errors while executing Local Activities (#2371)
1 parent b187644 commit 3ad0b0e

13 files changed

+248
-36
lines changed

temporal-sdk/src/main/java/io/temporal/internal/worker/ActivityWorker.java

-1
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,6 @@ public boolean start() {
104104
new TaskHandlerImpl(handler),
105105
pollerOptions,
106106
slotSupplier.maximumSlots().orElse(Integer.MAX_VALUE),
107-
true,
108107
options.isUsingVirtualThreads());
109108
poller =
110109
new Poller<>(

temporal-sdk/src/main/java/io/temporal/internal/worker/LocalActivitySlotSupplierQueue.java

+21-6
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222

2323
import io.temporal.worker.tuning.LocalActivitySlotInfo;
2424
import io.temporal.worker.tuning.SlotPermit;
25+
import io.temporal.worker.tuning.SlotReleaseReason;
2526
import io.temporal.workflow.Functions;
2627
import java.util.concurrent.*;
2728
import javax.annotation.Nullable;
@@ -77,10 +78,11 @@ static final class QueuedLARequest {
7778
}
7879

7980
private void processQueue() {
80-
try {
81-
while (running || !requestQueue.isEmpty()) {
82-
QueuedLARequest request = requestQueue.take();
83-
SlotPermit slotPermit;
81+
while (running || !requestQueue.isEmpty()) {
82+
SlotPermit slotPermit = null;
83+
QueuedLARequest request = null;
84+
try {
85+
request = requestQueue.take();
8486
try {
8587
slotPermit = slotSupplier.reserveSlot(request.data);
8688
} catch (InterruptedException e) {
@@ -95,9 +97,22 @@ private void processQueue() {
9597
}
9698
request.task.getExecutionContext().setPermit(slotPermit);
9799
afterReservedCallback.apply(request.task);
100+
} catch (InterruptedException e) {
101+
Thread.currentThread().interrupt();
102+
} catch (Throwable e) {
103+
// Fail the workflow task if something went wrong executing the local activity at the
104+
// executor level (errors raised inside the LA handler should be handled by the handler itself)
105+
log.error("Unexpected error submitting local activity task to worker", e);
106+
if (slotPermit != null) {
107+
slotSupplier.releaseSlot(SlotReleaseReason.error(new RuntimeException(e)), slotPermit);
108+
}
109+
if (request != null) {
110+
LocalActivityExecutionContext executionContext = request.task.getExecutionContext();
111+
executionContext.callback(
112+
LocalActivityResult.processingFailed(
113+
executionContext.getActivityId(), request.task.getAttemptTask().getAttempt(), e));
114+
}
98115
}
99-
} catch (InterruptedException e) {
100-
Thread.currentThread().interrupt();
101116
}
102117
}
103118

temporal-sdk/src/main/java/io/temporal/internal/worker/LocalActivityWorker.java

-1
Original file line numberDiff line numberDiff line change
@@ -688,7 +688,6 @@ public boolean start() {
688688
new AttemptTaskHandlerImpl(handler),
689689
pollerOptions,
690690
slotSupplier.maximumSlots().orElse(Integer.MAX_VALUE),
691-
false,
692691
options.isUsingVirtualThreads());
693692

694693
this.workerMetricsScope.counter(MetricsType.WORKER_START_COUNTER).inc(1);

temporal-sdk/src/main/java/io/temporal/internal/worker/NexusWorker.java

-1
Original file line numberDiff line numberDiff line change
@@ -102,7 +102,6 @@ public boolean start() {
102102
new TaskHandlerImpl(handler),
103103
pollerOptions,
104104
slotSupplier.maximumSlots().orElse(Integer.MAX_VALUE),
105-
true,
106105
options.isUsingVirtualThreads());
107106
poller =
108107
new Poller<>(

temporal-sdk/src/main/java/io/temporal/internal/worker/PollTaskExecutor.java

+6-16
Original file line numberDiff line numberDiff line change
@@ -52,8 +52,7 @@ public interface TaskHandler<TT> {
5252
@Nonnull String identity,
5353
@Nonnull TaskHandler<T> handler,
5454
@Nonnull PollerOptions pollerOptions,
55-
int workerTaskSlots,
56-
boolean synchronousQueue,
55+
int threadPoolMax,
5756
boolean useVirtualThreads) {
5857
this.namespace = Objects.requireNonNull(namespace);
5958
this.taskQueue = Objects.requireNonNull(taskQueue);
@@ -63,8 +62,10 @@ public interface TaskHandler<TT> {
6362

6463
this.pollThreadNamePrefix =
6564
pollerOptions.getPollThreadNamePrefix().replaceFirst("Poller", "Executor");
66-
// If virtual threads are enabled, we use a virtual thread executor.
67-
if (useVirtualThreads) {
65+
if (pollerOptions.getPollerTaskExecutorOverride() != null) {
66+
this.taskExecutor = pollerOptions.getPollerTaskExecutorOverride();
67+
} else if (useVirtualThreads) {
68+
// If virtual threads are enabled, we use a virtual thread executor.
6869
AtomicInteger threadIndex = new AtomicInteger();
6970
this.taskExecutor =
7071
VirtualThreadDelegate.newVirtualThreadExecutor(
@@ -74,18 +75,7 @@ public interface TaskHandler<TT> {
7475
});
7576
} else {
7677
ThreadPoolExecutor threadPoolTaskExecutor =
77-
new ThreadPoolExecutor(
78-
// for SynchronousQueue we can afford to set it to 0, because the queue is always full
79-
// or empty
80-
// for LinkedBlockingQueue we have to set slots to workerTaskSlots to avoid situation
81-
// when the queue grows, but the amount of threads is not, because the queue is not
82-
// (and
83-
// never) full
84-
synchronousQueue ? 0 : workerTaskSlots,
85-
workerTaskSlots,
86-
10,
87-
TimeUnit.SECONDS,
88-
synchronousQueue ? new SynchronousQueue<>() : new LinkedBlockingQueue<>());
78+
new ThreadPoolExecutor(0, threadPoolMax, 10, TimeUnit.SECONDS, new SynchronousQueue<>());
8979
threadPoolTaskExecutor.allowCoreThreadTimeOut(true);
9080
threadPoolTaskExecutor.setThreadFactory(
9181
new ExecutorThreadFactory(

temporal-sdk/src/main/java/io/temporal/internal/worker/Poller.java

+1-1
Original file line numberDiff line numberDiff line change
@@ -342,7 +342,7 @@ private void logPollErrors(Thread t, Throwable e) {
342342

343343
/**
344344
* Some exceptions are considered normal during shutdown {@link #shouldIgnoreDuringShutdown} and
345-
* we log them in the most quite manner.
345+
* we log them in the most quiet manner.
346346
*
347347
* @param t thread where the exception happened
348348
* @param e the exception itself

temporal-sdk/src/main/java/io/temporal/internal/worker/PollerOptions.java

+19-2
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
import io.grpc.Status;
2424
import io.grpc.StatusRuntimeException;
2525
import java.time.Duration;
26+
import java.util.concurrent.ExecutorService;
2627
import org.slf4j.Logger;
2728
import org.slf4j.LoggerFactory;
2829

@@ -63,6 +64,7 @@ public static final class Builder {
6364
private String pollThreadNamePrefix;
6465
private Thread.UncaughtExceptionHandler uncaughtExceptionHandler;
6566
private boolean usingVirtualThreads;
67+
private ExecutorService pollerTaskExecutorOverride;
6668

6769
private Builder() {}
6870

@@ -81,6 +83,7 @@ private Builder(PollerOptions options) {
8183
this.pollThreadNamePrefix = options.getPollThreadNamePrefix();
8284
this.uncaughtExceptionHandler = options.getUncaughtExceptionHandler();
8385
this.usingVirtualThreads = options.isUsingVirtualThreads();
86+
this.pollerTaskExecutorOverride = options.getPollerTaskExecutorOverride();
8487
}
8588

8689
/** Defines the interval for measuring poll rate. The larger the interval, the spikier the load can be. */
@@ -162,6 +165,12 @@ public Builder setUsingVirtualThreads(boolean usingVirtualThreads) {
162165
return this;
163166
}
164167

168+
/** Overrides the ExecutorService used as the task executor; if null, the default executor is used. */
169+
public Builder setPollerTaskExecutorOverride(ExecutorService overrideTaskExecutor) {
170+
this.pollerTaskExecutorOverride = overrideTaskExecutor;
171+
return this;
172+
}
173+
165174
public PollerOptions build() {
166175
if (uncaughtExceptionHandler == null) {
167176
uncaughtExceptionHandler =
@@ -189,7 +198,8 @@ public PollerOptions build() {
189198
pollThreadCount,
190199
uncaughtExceptionHandler,
191200
pollThreadNamePrefix,
192-
usingVirtualThreads);
201+
usingVirtualThreads,
202+
pollerTaskExecutorOverride);
193203
}
194204
}
195205

@@ -206,6 +216,7 @@ public PollerOptions build() {
206216
private final Thread.UncaughtExceptionHandler uncaughtExceptionHandler;
207217
private final String pollThreadNamePrefix;
208218
private final boolean usingVirtualThreads;
219+
private final ExecutorService pollerTaskExecutorOverride;
209220

210221
private PollerOptions(
211222
int maximumPollRateIntervalMilliseconds,
@@ -218,7 +229,8 @@ private PollerOptions(
218229
int pollThreadCount,
219230
Thread.UncaughtExceptionHandler uncaughtExceptionHandler,
220231
String pollThreadNamePrefix,
221-
boolean usingVirtualThreads) {
232+
boolean usingVirtualThreads,
233+
ExecutorService pollerTaskExecutorOverride) {
222234
this.maximumPollRateIntervalMilliseconds = maximumPollRateIntervalMilliseconds;
223235
this.maximumPollRatePerSecond = maximumPollRatePerSecond;
224236
this.backoffCoefficient = backoffCoefficient;
@@ -230,6 +242,7 @@ private PollerOptions(
230242
this.uncaughtExceptionHandler = uncaughtExceptionHandler;
231243
this.pollThreadNamePrefix = pollThreadNamePrefix;
232244
this.usingVirtualThreads = usingVirtualThreads;
245+
this.pollerTaskExecutorOverride = pollerTaskExecutorOverride;
233246
}
234247

235248
public int getMaximumPollRateIntervalMilliseconds() {
@@ -276,6 +289,10 @@ public boolean isUsingVirtualThreads() {
276289
return usingVirtualThreads;
277290
}
278291

292+
public ExecutorService getPollerTaskExecutorOverride() {
293+
return pollerTaskExecutorOverride;
294+
}
295+
279296
@Override
280297
public String toString() {
281298
return "PollerOptions{"

temporal-sdk/src/main/java/io/temporal/internal/worker/WorkflowWorker.java

-1
Original file line numberDiff line numberDiff line change
@@ -120,7 +120,6 @@ public boolean start() {
120120
new TaskHandlerImpl(handler),
121121
pollerOptions,
122122
this.slotSupplier.maximumSlots().orElse(Integer.MAX_VALUE),
123-
true,
124123
options.isUsingVirtualThreads());
125124
stickyQueueBalancer =
126125
new StickyQueueBalancer(

temporal-sdk/src/main/java/io/temporal/worker/Worker.java

+6-1
Original file line numberDiff line numberDiff line change
@@ -632,7 +632,12 @@ private static SingleWorkerOptions toLocalActivityOptions(
632632
List<ContextPropagator> contextPropagators,
633633
Scope metricsScope) {
634634
return toSingleWorkerOptions(factoryOptions, options, clientOptions, contextPropagators)
635-
.setPollerOptions(PollerOptions.newBuilder().setPollThreadCount(1).build())
635+
.setPollerOptions(
636+
PollerOptions.newBuilder()
637+
.setPollThreadCount(1)
638+
.setPollerTaskExecutorOverride(
639+
factoryOptions.getOverrideLocalActivityTaskExecutor())
640+
.build())
636641
.setMetricsScope(metricsScope)
637642
.setUsingVirtualThreads(options.isUsingVirtualThreadsOnLocalActivityWorker())
638643
.build();

temporal-sdk/src/main/java/io/temporal/worker/WorkerFactoryOptions.java

+27
Original file line numberDiff line numberDiff line change
@@ -20,10 +20,12 @@
2020

2121
package io.temporal.worker;
2222

23+
import com.google.common.annotations.VisibleForTesting;
2324
import com.google.common.base.Preconditions;
2425
import io.temporal.common.Experimental;
2526
import io.temporal.common.interceptors.WorkerInterceptor;
2627
import java.time.Duration;
28+
import java.util.concurrent.ExecutorService;
2729
import javax.annotation.Nullable;
2830

2931
public class WorkerFactoryOptions {
@@ -57,6 +59,7 @@ public static class Builder {
5759
private WorkerInterceptor[] workerInterceptors;
5860
private boolean enableLoggingInReplay;
5961
private boolean usingVirtualWorkflowThreads;
62+
private ExecutorService overrideLocalActivityTaskExecutor;
6063

6164
private Builder() {}
6265

@@ -71,6 +74,7 @@ private Builder(WorkerFactoryOptions options) {
7174
this.workerInterceptors = options.workerInterceptors;
7275
this.enableLoggingInReplay = options.enableLoggingInReplay;
7376
this.usingVirtualWorkflowThreads = options.usingVirtualWorkflowThreads;
77+
this.overrideLocalActivityTaskExecutor = options.overrideLocalActivityTaskExecutor;
7478
}
7579

7680
/**
@@ -143,6 +147,14 @@ public Builder setWorkflowHostLocalPollThreadCount(int workflowHostLocalPollThre
143147
return this;
144148
}
145149

150+
/** For internal use only. Overrides the local activity task ExecutorService. */
151+
@VisibleForTesting
152+
Builder setOverrideLocalActivityTaskExecutor(
153+
ExecutorService overrideLocalActivityTaskExecutor) {
154+
this.overrideLocalActivityTaskExecutor = overrideLocalActivityTaskExecutor;
155+
return this;
156+
}
157+
146158
public WorkerFactoryOptions build() {
147159
return new WorkerFactoryOptions(
148160
workflowCacheSize,
@@ -151,6 +163,7 @@ public WorkerFactoryOptions build() {
151163
workerInterceptors,
152164
enableLoggingInReplay,
153165
usingVirtualWorkflowThreads,
166+
overrideLocalActivityTaskExecutor,
154167
false);
155168
}
156169

@@ -162,6 +175,7 @@ public WorkerFactoryOptions validateAndBuildWithDefaults() {
162175
workerInterceptors == null ? new WorkerInterceptor[0] : workerInterceptors,
163176
enableLoggingInReplay,
164177
usingVirtualWorkflowThreads,
178+
overrideLocalActivityTaskExecutor,
165179
true);
166180
}
167181
}
@@ -172,6 +186,7 @@ public WorkerFactoryOptions validateAndBuildWithDefaults() {
172186
private final WorkerInterceptor[] workerInterceptors;
173187
private final boolean enableLoggingInReplay;
174188
private final boolean usingVirtualWorkflowThreads;
189+
private final ExecutorService overrideLocalActivityTaskExecutor;
175190

176191
private WorkerFactoryOptions(
177192
int workflowCacheSize,
@@ -180,6 +195,7 @@ private WorkerFactoryOptions(
180195
WorkerInterceptor[] workerInterceptors,
181196
boolean enableLoggingInReplay,
182197
boolean usingVirtualWorkflowThreads,
198+
ExecutorService overrideLocalActivityTaskExecutor,
183199
boolean validate) {
184200
if (validate) {
185201
Preconditions.checkState(workflowCacheSize >= 0, "negative workflowCacheSize");
@@ -207,6 +223,7 @@ private WorkerFactoryOptions(
207223
this.workerInterceptors = workerInterceptors;
208224
this.enableLoggingInReplay = enableLoggingInReplay;
209225
this.usingVirtualWorkflowThreads = usingVirtualWorkflowThreads;
226+
this.overrideLocalActivityTaskExecutor = overrideLocalActivityTaskExecutor;
210227
}
211228

212229
public int getWorkflowCacheSize() {
@@ -235,6 +252,16 @@ public boolean isUsingVirtualWorkflowThreads() {
235252
return usingVirtualWorkflowThreads;
236253
}
237254

255+
/**
256+
* For internal use only.
257+
*
258+
* @return the ExecutorService to use for local activity tasks, or null if the default should be
259+
* used
260+
*/
261+
ExecutorService getOverrideLocalActivityTaskExecutor() {
262+
return overrideLocalActivityTaskExecutor;
263+
}
264+
238265
/**
239266
* @deprecated not used anymore by JavaSDK, this value doesn't have any effect
240267
*/

temporal-sdk/src/main/java/io/temporal/worker/tuning/SlotSupplier.java

+7-6
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,8 @@
3030
* at once.
3131
*
3232
* @param <SI> The type of information that will be used to reserve a slot. The four info types are
33-
* {@link WorkflowSlotInfo}, {@link ActivitySlotInfo}, and {@link LocalActivitySlotInfo}.
33+
* {@link WorkflowSlotInfo}, {@link ActivitySlotInfo}, {@link LocalActivitySlotInfo}, and {@link
34+
* NexusSlotInfo}.
3435
*/
3536
@Experimental
3637
public interface SlotSupplier<SI extends SlotInfo> {
@@ -77,11 +78,11 @@ public interface SlotSupplier<SI extends SlotInfo> {
7778
void releaseSlot(SlotReleaseContext<SI> ctx);
7879

7980
/**
80-
* Because we currently use thread pools to execute tasks, there must be *some* defined
81-
* upper-limit on the size of the thread pool for each kind of task. You must not hand out more
82-
* permits than this number. If unspecified, the default is {@link Integer#MAX_VALUE}. Be aware
83-
* that if your implementation hands out unreasonable numbers of permits, you could easily
84-
* oversubscribe the worker, and cause it to run out of resources.
81+
* Because we use thread pools to execute tasks when virtual threads are not enabled, there must
82+
* be *some* defined upper-limit on the size of the thread pool for each kind of task. You must
83+
* not hand out more permits than this number. If unspecified, the default is {@link
84+
* Integer#MAX_VALUE}. Be aware that if your implementation hands out unreasonable numbers of
85+
* permits, you could easily oversubscribe the worker, and cause it to run out of resources.
8586
*
8687
* <p>If a non-empty value is returned, it is assumed to be meaningful, and the worker will emit
8788
* {@link io.temporal.worker.MetricsType#WORKER_TASK_SLOTS_AVAILABLE} metrics based on this value.

0 commit comments

Comments
 (0)