diff --git a/requirements-common.txt b/requirements-common.txt
index 11984260c580d..9bf9cafe71840 100644
--- a/requirements-common.txt
+++ b/requirements-common.txt
@@ -19,7 +19,7 @@ prometheus_client >= 0.18.0
 prometheus-fastapi-instrumentator >= 7.0.0
 tiktoken >= 0.6.0 # Required for DBRX tokenizer
 lm-format-enforcer >= 0.10.9, < 0.11
-outlines == 0.1.9
+outlines == 0.1.11
 xgrammar >= 0.1.6; platform_machine == "x86_64"
 typing_extensions >= 4.10
 filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317
diff --git a/vllm/model_executor/guided_decoding/outlines_decoding.py b/vllm/model_executor/guided_decoding/outlines_decoding.py
index 8a7ff38bfeb1a..eb8db882435e6 100644
--- a/vllm/model_executor/guided_decoding/outlines_decoding.py
+++ b/vllm/model_executor/guided_decoding/outlines_decoding.py
@@ -1,5 +1,6 @@
 import asyncio
 import concurrent.futures
+import os
 from enum import Enum
 from json import dumps as json_dumps
 from re import escape as regex_escape
@@ -48,6 +49,11 @@ class GuidedDecodingMode(Enum):
 
 global_thread_pool = None  # used for generating logits processor fsm
 
+# It's not yet clear that using more provides a benefit, and it could
+# potentially starve other processes on the machine. We'll cap this for now and
+# adjust later if testing proves it to help overcome a bottleneck.
+_MAX_THREADPOOL_WORKERS = 16
+
 
 async def get_outlines_guided_decoding_logits_processor(
         guided_params: GuidedDecodingParams, tokenizer: PreTrainedTokenizerBase
@@ -65,8 +71,11 @@ async def get_outlines_guided_decoding_logits_processor(
         return None
 
     if global_thread_pool is None:
+        max_workers = os.cpu_count() or 2
+        if max_workers > _MAX_THREADPOOL_WORKERS:
+            max_workers = _MAX_THREADPOOL_WORKERS
         global_thread_pool = concurrent.futures.ThreadPoolExecutor(
-            max_workers=2)
+            max_workers=max_workers)
     loop = asyncio.get_running_loop()
 
     return await loop.run_in_executor(global_thread_pool,