diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py
index 168ba7ba888ef..3783b7cd66a6a 100644
--- a/tests/entrypoints/openai/test_serving_chat.py
+++ b/tests/entrypoints/openai/test_serving_chat.py
@@ -73,7 +73,6 @@ def test_serving_chat_should_set_correct_max_tokens():
     with suppress(Exception):
         asyncio.run(serving_chat.create_chat_completion(req))
 
-    # AsyncLLMEngine.generate(inputs, sampling_params, ...)
     assert mock_engine.generate.call_args.args[1].max_tokens == 93
 
     req.max_tokens = 10
diff --git a/vllm/entrypoints/openai/logits_processors.py b/vllm/entrypoints/openai/logits_processors.py
index c0cd820e30c0d..c470c32c27ede 100644
--- a/vllm/entrypoints/openai/logits_processors.py
+++ b/vllm/entrypoints/openai/logits_processors.py
@@ -71,7 +71,7 @@ def get_logits_processors(
         # Check if token_id is within the vocab size
         for token_id, bias in clamped_logit_bias.items():
             if token_id < 0 or token_id >= tokenizer.vocab_size:
-                raise ValueError("token_id in logit_bias contains "
+                raise ValueError(f"token_id {token_id} in logit_bias contains "
                                  "out-of-vocab token id")
 
         logits_processors.append(
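
For reference, a minimal standalone sketch of the validation touched by the second hunk: the loop and condition mirror the patched check in get_logits_processors, while vocab_size and logit_bias below are hypothetical example values (not taken from vLLM), chosen only to show the message a client now receives.

# Minimal sketch (not part of the patch) of the out-of-vocab check with the
# new f-string message; vocab_size and logit_bias are hypothetical examples.
vocab_size = 32000
logit_bias = {50000: 5.0}  # 50000 is outside [0, vocab_size)

try:
    for token_id, bias in logit_bias.items():
        if token_id < 0 or token_id >= vocab_size:
            raise ValueError(f"token_id {token_id} in logit_bias contains "
                             "out-of-vocab token id")
except ValueError as exc:
    print(exc)  # token_id 50000 in logit_bias contains out-of-vocab token id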