Skip to content

Commit

Permalink
Use startup time in async worker thread instead of worker timeout (#3315)

Browse files Browse the repository at this point in the history

* Use startup time in async worker thread instead of worker timeout

* Fix lint

* Update yaml files to use startupTimeout

* Update vllm/lora readme
Loading branch information
mreso authored Sep 18, 2024
1 parent c6dde82 commit e212294
Show file tree
Hide file tree
Showing 6 changed files with 21 additions and 11 deletions.
2 changes: 1 addition & 1 deletion examples/large_models/vllm/llama3/model-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
minWorkers: 1
maxWorkers: 1
maxBatchDelay: 100
responseTimeout: 1200
startupTimeout: 1200
deviceType: "gpu"
asyncCommunication: true

Expand Down
2 changes: 1 addition & 1 deletion examples/large_models/vllm/lora/Readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ The vllm integration uses an OpenAI compatible interface which lets you perform

Curl:
```bash
curl --header "Content-Type: application/json" --request POST --data @prompt.json http://localhost:8080/predictions/llama-8b-lora/1.0/v1
curl --header "Content-Type: application/json" --request POST --data @prompt.json http://localhost:8080/predictions/llama-8b-lora/1.0/v1/completions
```

Python + Request:
Expand Down
2 changes: 1 addition & 1 deletion examples/large_models/vllm/lora/model-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
minWorkers: 1
maxWorkers: 1
maxBatchDelay: 100
responseTimeout: 1200
startupTimeout: 1200
deviceType: "gpu"
asyncCommunication: true

Expand Down
2 changes: 1 addition & 1 deletion examples/large_models/vllm/mistral/model-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
minWorkers: 1
maxWorkers: 1
maxBatchDelay: 100
responseTimeout: 1200
startupTimeout: 1200
deviceType: "gpu"
asyncCommunication: true

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@
public class AsyncWorkerThread extends WorkerThread {
// protected ConcurrentHashMap requestsInBackend;
protected static final Logger logger = LoggerFactory.getLogger(AsyncWorkerThread.class);
protected static final long MODEL_LOAD_TIMEOUT = 10L;
protected static final long WORKER_TIMEOUT = 2L;

protected boolean loadingFinished;
protected CountDownLatch latch;
Expand All @@ -53,6 +53,7 @@ public AsyncWorkerThread(
@Override
public void run() {
responseTimeout = model.getResponseTimeout();
startupTimeout = model.getStartupTimeout();
Thread thread = Thread.currentThread();
thread.setName(getWorkerName());
currentThread.set(thread);
Expand Down Expand Up @@ -80,11 +81,11 @@ public void run() {

if (loadingFinished == false) {
latch = new CountDownLatch(1);
if (!latch.await(MODEL_LOAD_TIMEOUT, TimeUnit.MINUTES)) {
if (!latch.await(startupTimeout, TimeUnit.SECONDS)) {
throw new WorkerInitializationException(
"Worker did not load the model within"
+ MODEL_LOAD_TIMEOUT
+ " mins");
"Worker did not load the model within "
+ startupTimeout
+ " seconds");
}
}

Expand All @@ -99,7 +100,7 @@ public void run() {
logger.debug("Shutting down the thread .. Scaling down.");
} else {
logger.debug(
"Backend worker monitoring thread interrupted or backend worker process died., responseTimeout:"
"Backend worker monitoring thread interrupted or backend worker process died. responseTimeout:"
+ responseTimeout
+ "sec",
e);
Expand Down
11 changes: 10 additions & 1 deletion ts/llm_launcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ def get_model_config(args, model_snapshot_path=None):
"batchSize": 1,
"maxBatchDelay": 100,
"responseTimeout": 1200,
"startupTimeout": args.startup_timeout,
"deviceType": "gpu",
"asyncCommunication": True,
}
Expand Down Expand Up @@ -227,7 +228,7 @@ def main(args):
parser.add_argument(
"--vllm_engine.max_num_seqs",
type=int,
default=16,
default=256,
help="Max sequences in vllm engine",
)

Expand All @@ -245,6 +246,13 @@ def main(args):
help="Cache dir",
)

parser.add_argument(
"--startup_timeout",
type=int,
default=1200,
help="Model startup timeout in seconds",
)

parser.add_argument(
"--engine",
type=str,
Expand Down Expand Up @@ -272,6 +280,7 @@ def main(args):
default=0.1,
help="KV Cache free gpu memory fraction",
)

args = parser.parse_args()

main(args)

0 comments on commit e212294

Please sign in to comment.