diff --git a/tests/model_executor/test_guided_processors.py b/tests/model_executor/test_guided_processors.py
index 2dde6dad85e8f..9f4d81b583141 100644
--- a/tests/model_executor/test_guided_processors.py
+++ b/tests/model_executor/test_guided_processors.py
@@ -36,7 +36,8 @@ def test_guided_logits_processors(sample_regex, sample_json_schema):
 
 
 @pytest.mark.asyncio
-@pytest.mark.parametrize("backend", ["outlines", "lm-format-enforcer", "xgrammar"])
+@pytest.mark.parametrize("backend",
+                         ["outlines", "lm-format-enforcer", "xgrammar"])
 async def test_guided_logits_processor_black_box(backend: str, sample_regex,
                                                  sample_json_schema):
     tokenizer = AutoTokenizer.from_pretrained('HuggingFaceH4/zephyr-7b-beta')
diff --git a/vllm/model_executor/guided_decoding/xgrammar_decoding.py b/vllm/model_executor/guided_decoding/xgrammar_decoding.py
index 94520b3e3adb9..7bd7f5002c326 100644
--- a/vllm/model_executor/guided_decoding/xgrammar_decoding.py
+++ b/vllm/model_executor/guided_decoding/xgrammar_decoding.py
@@ -1,10 +1,11 @@
+# noqa: UP007
 from __future__ import annotations
 
-import json, torch
-
+import json
+import torch
 from transformers import PreTrainedTokenizerFast
 from dataclasses import dataclass, field
-from typing import TYPE_CHECKING, Dict, Any, Optional, List
+from typing import TYPE_CHECKING, Any, Dict, List, Optional
 
 try:
     import xgrammar as xgr
@@ -23,7 +24,7 @@ def get_local_xgrammar_guided_decoding_logits_processor(
         guided_params: GuidedDecodingParams,
         tokenizer: PreTrainedTokenizer,
         model_config: ModelConfig,
-        max_threads=8):
+        max_threads: int = 8):
     config = GrammarConfig.from_guided_params(guided_params=guided_params,
                                               model_config=model_config,
                                               tokenizer=tokenizer,
@@ -57,10 +58,9 @@ def from_guided_params(cls,
                                              key=lambda x: x[1])
             ]
         except AttributeError as e:
-            msg = (
-                f"Cannot get the vocabulary of the tokenizer {type(tokenizer)}. The tokenizer "
-                "should have a get_vocab method.")
-            raise ValueError(msg) from e
+            raise ValueError(
+                f"Cannot get the vocabulary of the tokenizer {type(tokenizer)}. The tokenizer should have a get_vocab method."
+            ) from e
 
         stop_token_ids = None
         backend_str = xgr.VocabType.RAW
@@ -79,11 +79,6 @@ def from_guided_params(cls,
                 tokenizer,
                 "eos_token_id") and tokenizer.eos_token_id is not None:
             stop_token_ids = [tokenizer.eos_token_id]
-        else:
-            logger.warning(
-                "When constructing TokenizerInfo from a huggingface tokenizer, "
-                "stop_token_ids is neither provided by user nor found from the tokenizer. "
-                "It will be automatically detected.")
 
         if guided_params.json:
             if not isinstance(guided_params.json, str):
@@ -124,8 +119,8 @@ class XGrammarLogitsProcessor:
     ctx: Optional[xgr.CompiledGrammar] = None
     matchers: List[xgr.GrammarMatcher] = field(default_factory=list)
     batch_size: int = 1
-    token_bitmask: Optional[torch.Tensor] = None
-    prefilled: boolean = False
+    token_bitmask: torch.Tensor = None  # type: ignore[assignment]
+    prefilled: bool = False
 
     def __getstate__(self) -> Dict[str, Any]:
         return {'config': self.config}
@@ -152,7 +147,9 @@ def _ensure_ctx(self):
 
     def __call__(self, input_ids: List[int],
                  scores: torch.Tensor) -> torch.Tensor:
-        if self.ctx is None: self._ensure_ctx()
+        if self.ctx is None:
+            self._ensure_ctx()
+
         if len(self.matchers) == 0:
             self.matchers = [
                 xgr.GrammarMatcher(self.ctx) for _ in range(self.batch_size)
@@ -174,9 +171,11 @@ def __call__(self, input_ids: List[int],
                 matcher.fill_next_token_bitmask(self.token_bitmask, i)
 
         device_type = scores.device.type
-        if device_type != "cuda": scores = scores.to("cpu")
+        if device_type != "cuda":
+            scores = scores.to("cpu")
         xgr.apply_token_bitmask_inplace(scores,
                                         self.token_bitmask.to(scores.device))
-        if device_type != "cuda": scores = scores.to(device_type)
+        if device_type != "cuda":
+            scores = scores.to(device_type)
 
         return scores