Skip to content

Commit

Permalink
[Core] Update outlines and increase its threadpool size (vllm-project#11140)
Browse files Browse the repository at this point in the history

Signed-off-by: Russell Bryant <[email protected]>
  • Loading branch information
russellb authored and ZenPuzzle committed Dec 24, 2024
1 parent 712a419 commit 5f1ffaf
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 2 deletions.
2 changes: 1 addition & 1 deletion requirements-common.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ prometheus_client >= 0.18.0
prometheus-fastapi-instrumentator >= 7.0.0
tiktoken >= 0.6.0 # Required for DBRX tokenizer
lm-format-enforcer >= 0.10.9, < 0.11
outlines == 0.1.9
outlines == 0.1.11
xgrammar >= 0.1.6; platform_machine == "x86_64"
typing_extensions >= 4.10
filelock >= 3.10.4 # filelock starts to support `mode` argument from 3.10.4
Expand Down
11 changes: 10 additions & 1 deletion vllm/model_executor/guided_decoding/outlines_decoding.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import asyncio
import concurrent.futures
import os
from enum import Enum
from json import dumps as json_dumps
from re import escape as regex_escape
Expand Down Expand Up @@ -48,6 +49,11 @@ class GuidedDecodingMode(Enum):

global_thread_pool = None # used for generating logits processor fsm

# It's not yet clear that using more workers provides a benefit, and it could
# potentially starve other processes on the machine. We'll cap this for now and
# adjust later if testing shows that a higher limit helps overcome a bottleneck.
_MAX_THREADPOOL_WORKERS = 16


async def get_outlines_guided_decoding_logits_processor(
guided_params: GuidedDecodingParams, tokenizer: PreTrainedTokenizerBase
Expand All @@ -65,8 +71,11 @@ async def get_outlines_guided_decoding_logits_processor(
return None

if global_thread_pool is None:
max_workers = os.cpu_count() or 2
if max_workers > _MAX_THREADPOOL_WORKERS:
max_workers = _MAX_THREADPOOL_WORKERS
global_thread_pool = concurrent.futures.ThreadPoolExecutor(
max_workers=2)
max_workers=max_workers)
loop = asyncio.get_running_loop()

return await loop.run_in_executor(global_thread_pool,
Expand Down

0 comments on commit 5f1ffaf

Please sign in to comment.