From 8ca4505c343f6ea7ab851579efc693f5c8499cb3 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Mon, 8 Jul 2024 15:36:51 -0700 Subject: [PATCH 1/9] [BugFix][Frontend] Use LoRA tokenizer in OpenAI APIs Currently the LoRA tokenizers aren't used in the OpenAI APIs, meaning the behaviour won't be correct if adapters are used that have custom added tokens. This PR includes changes to address that. It mostly replaces #3512. More work is needed to address remaining inconsistencies in tokenization behaviour between the OpenAI front-end and standalone LLMEngine/AsyncLLMEngine use, including: - Standalone cases don't honor truncation and add_special_tokens request parameters - OpenAI API cases don't make use of TokenizerGroups for possible parallelization of tokenization As well as some other inefficiencies. But these are to be addressed in follow-on PRs. --- vllm/engine/async_llm_engine.py | 13 +- vllm/engine/llm_engine.py | 7 +- vllm/entrypoints/openai/serving_chat.py | 138 ++++++++++-------- vllm/entrypoints/openai/serving_completion.py | 58 +++++--- vllm/entrypoints/openai/serving_embedding.py | 11 +- vllm/entrypoints/openai/serving_engine.py | 30 ++-- vllm/transformers_utils/tokenizer.py | 3 + 7 files changed, 144 insertions(+), 116 deletions(-) diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 33e40c7b3624a..c13ceaf3fa9fc 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -464,11 +464,16 @@ def _error_callback(self, exc: Exception) -> None: self.set_errored(exc) self._request_tracker.propagate_exception(exc) - async def get_tokenizer(self) -> "PreTrainedTokenizer": + async def get_tokenizer( + self, + lora_request: Optional[LoRARequest] = None, + ) -> "PreTrainedTokenizer": if self.engine_use_ray: - return await self.engine.get_tokenizer.remote() # type: ignore - else: - return self.engine.get_tokenizer() + return await self.engine.get_tokenizer.remote( # type: ignore + lora_request) + + return await (self.engine.get_tokenizer_group(). + get_lora_tokenizer_async(lora_request)) def start_background_loop(self) -> None: """Start the background loop.""" diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index de7604ece7c31..244f04fda26c4 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -441,8 +441,11 @@ def get_tokenizer_group( return self.tokenizer - def get_tokenizer(self) -> "PreTrainedTokenizer": - return self.get_tokenizer_group().get_lora_tokenizer(None) + def get_tokenizer( + self, + lora_request: Optional[LoRARequest] = None + ) -> "PreTrainedTokenizer": + return self.get_tokenizer_group().get_lora_tokenizer(lora_request) def get_tokenizer_for_seq(self, sequence: Sequence) -> "PreTrainedTokenizer": diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 415bdbbd7c455..49f9b3580ba07 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -1,7 +1,7 @@ import codecs import time from dataclasses import dataclass, field -from functools import cached_property +from functools import lru_cache from typing import (AsyncGenerator, AsyncIterator, Awaitable, Dict, Iterable, List, Optional) from typing import Sequence as GenericSequence @@ -10,6 +10,7 @@ from fastapi import Request from openai.types.chat import (ChatCompletionContentPartImageParam, ChatCompletionContentPartTextParam) +from transformers import PreTrainedTokenizer from vllm.config import ModelConfig from vllm.engine.async_llm_engine import AsyncLLMEngine @@ -66,39 +67,36 @@ def __init__(self, lora_modules=lora_modules) self.response_role = response_role - self._load_chat_template(chat_template) - def _load_chat_template(self, chat_template: Optional[str]): - tokenizer = self.tokenizer + # If this is None we use the tokenizer's default chat template + self.chat_template = self._load_chat_template(chat_template) - if chat_template is not None: - try: - with open(chat_template, "r") as f: - tokenizer.chat_template = f.read() - except OSError as e: - JINJA_CHARS = "{}\n" - if not any(c in chat_template for c in JINJA_CHARS): - msg = (f"The supplied chat template ({chat_template}) " - f"looks like a file path, but it failed to be " - f"opened. Reason: {e}") - raise ValueError(msg) from e - - # If opening a file fails, set chat template to be args to - # ensure we decode so our escape are interpreted correctly - tokenizer.chat_template = codecs.decode( - chat_template, "unicode_escape") - - logger.info("Using supplied chat template:\n%s", - tokenizer.chat_template) - elif tokenizer.chat_template is not None: - logger.info("Using default chat template:\n%s", - tokenizer.chat_template) - else: - logger.warning( - "No chat template provided. Chat API will not work.") - - @cached_property - def image_token_str(self) -> Optional[str]: + @staticmethod + def _load_chat_template(chat_template: Optional[str]) -> Optional[str]: + if chat_template is None: + return None + try: + with open(chat_template, "r") as f: + resolved_chat_template = f.read() + except OSError as e: + JINJA_CHARS = "{}\n" + if not any(c in chat_template for c in JINJA_CHARS): + msg = (f"The supplied chat template ({chat_template}) " + f"looks like a file path, but it failed to be " + f"opened. Reason: {e}") + raise ValueError(msg) from e + + # If opening a file fails, set chat template to be args to + # ensure we decode so our escape are interpreted correctly + resolved_chat_template = codecs.decode(chat_template, + "unicode_escape") + + logger.info("Using supplied chat template:\n%s", + resolved_chat_template) + return resolved_chat_template + + @lru_cache(maxsize=32) # noqa: B019 + def image_token_str(self, tokenizer: PreTrainedTokenizer) -> Optional[str]: # TODO: Let user specify how to insert image tokens into prompt # (similar to chat template) model_type = self.model_config.hf_config.model_type @@ -110,7 +108,7 @@ def image_token_str(self) -> Optional[str]: # These models do not use image tokens in the prompt return None if model_type.startswith("llava"): - return self.tokenizer.decode( + return tokenizer.decode( self.model_config.hf_config.image_token_index) else: @@ -130,6 +128,7 @@ def _parse_chat_message_content_parts( self, role: str, parts: Iterable[ChatCompletionContentPartParam], + tokenizer: PreTrainedTokenizer, ) -> ChatMessageParseResult: texts: List[str] = [] mm_futures: List[Awaitable[MultiModalDataDict]] = [] @@ -161,7 +160,7 @@ def _parse_chat_message_content_parts( text_prompt = "\n".join(texts) if mm_futures: - image_token_str = self.image_token_str + image_token_str = self.image_token_str(tokenizer) if image_token_str is not None: if image_token_str in text_prompt: logger.warning( @@ -180,6 +179,7 @@ def _parse_chat_message_content_parts( def _parse_chat_message_content( self, message: ChatCompletionMessageParam, + tokenizer: PreTrainedTokenizer, ) -> ChatMessageParseResult: role = message["role"] content = message.get("content") @@ -190,7 +190,7 @@ def _parse_chat_message_content( messages = [ConversationMessage(role=role, content=content)] return ChatMessageParseResult(messages=messages, mm_futures=[]) - return self._parse_chat_message_content_parts(role, content) + return self._parse_chat_message_content_parts(role, content, tokenizer) async def create_chat_completion( self, @@ -212,11 +212,15 @@ async def create_chat_completion( return error_check_ret try: + lora_request = self._maybe_get_lora(request) + tokenizer = await self.engine.get_tokenizer(lora_request) + conversation: List[ConversationMessage] = [] mm_futures: List[Awaitable[MultiModalDataDict]] = [] for msg in request.messages: - chat_parsed_result = self._parse_chat_message_content(msg) + chat_parsed_result = self._parse_chat_message_content( + msg, tokenizer) conversation.extend(chat_parsed_result.messages) mm_futures.extend(chat_parsed_result.mm_futures) @@ -225,7 +229,9 @@ async def create_chat_completion( tool.model_dump() for tool in request.tools ] - prompt = self.tokenizer.apply_chat_template( + if self.chat_template is not None: + tokenizer.chat_template = self.chat_template + prompt = tokenizer.apply_chat_template( conversation=conversation, tokenize=False, add_generation_prompt=request.add_generation_prompt, @@ -253,19 +259,19 @@ async def create_chat_completion( request_id = f"cmpl-{random_uuid()}" try: # Tokenize/detokenize depending on prompt format (string/token list) - prompt_ids, prompt_text = self._validate_prompt_and_tokenize( + prompt_ids, prompt_text = await self._validate_prompt_and_tokenize( request, + tokenizer, prompt=prompt, add_special_tokens=request.add_special_tokens) sampling_params = request.to_sampling_params() - lora_request = self._maybe_get_lora(request) decoding_config = await self.engine.get_decoding_config() guided_decoding_backend = request.guided_decoding_backend \ or decoding_config.guided_decoding_backend guided_decode_logits_processor = ( - await get_guided_decoding_logits_processor( - guided_decoding_backend, request, await - self.engine.get_tokenizer())) + await + get_guided_decoding_logits_processor(guided_decoding_backend, + request, tokenizer)) if guided_decode_logits_processor: if sampling_params.logits_processors is None: sampling_params.logits_processors = [] @@ -299,12 +305,12 @@ async def create_chat_completion( # Streaming response if request.stream: return self.chat_completion_stream_generator( - request, result_generator, request_id, conversation) + request, result_generator, request_id, conversation, tokenizer) else: try: return await self.chat_completion_full_generator( request, raw_request, result_generator, request_id, - conversation) + conversation, tokenizer) except ValueError as e: # TODO: Use a vllm-specific Validation Error return self.create_error_response(str(e)) @@ -316,9 +322,12 @@ def get_chat_request_role(self, request: ChatCompletionRequest) -> str: return request.messages[-1]["role"] async def chat_completion_stream_generator( - self, request: ChatCompletionRequest, - result_generator: AsyncIterator[RequestOutput], request_id: str, - conversation: List[ConversationMessage] + self, + request: ChatCompletionRequest, + result_generator: AsyncIterator[RequestOutput], + request_id: str, + conversation: List[ConversationMessage], + tokenizer: PreTrainedTokenizer, ) -> AsyncGenerator[str, None]: model_name = self.served_model_names[0] created_time = int(time.time()) @@ -405,6 +414,7 @@ async def chat_completion_stream_generator( logprobs = self._create_chat_logprobs( token_ids=delta_token_ids, top_logprobs=out_logprobs, + tokenizer=tokenizer, num_output_top_logprobs=request.top_logprobs, ) else: @@ -493,9 +503,13 @@ async def chat_completion_stream_generator( yield "data: [DONE]\n\n" async def chat_completion_full_generator( - self, request: ChatCompletionRequest, raw_request: Optional[Request], - result_generator: AsyncIterator[RequestOutput], request_id: str, - conversation: List[ConversationMessage] + self, + request: ChatCompletionRequest, + raw_request: Optional[Request], + result_generator: AsyncIterator[RequestOutput], + request_id: str, + conversation: List[ConversationMessage], + tokenizer: PreTrainedTokenizer, ) -> Union[ErrorResponse, ChatCompletionResponse]: model_name = self.served_model_names[0] @@ -523,6 +537,7 @@ async def chat_completion_full_generator( token_ids=token_ids, top_logprobs=out_logprobs, num_output_top_logprobs=request.top_logprobs, + tokenizer=tokenizer, ) else: logprobs = None @@ -577,16 +592,14 @@ async def chat_completion_full_generator( return response def _get_top_logprobs( - self, logprobs: Dict[int, Logprob], - top_logprobs: Optional[int]) -> List[ChatCompletionLogProb]: + self, logprobs: Dict[int, Logprob], top_logprobs: Optional[int], + tokenizer: PreTrainedTokenizer) -> List[ChatCompletionLogProb]: return [ ChatCompletionLogProb( - token=self._get_decoded_token(p[1], p[0]), + token=(token := self._get_decoded_token(p[1], p[0], + tokenizer)), logprob=max(p[1].logprob, -9999.0), - bytes=list( - self._get_decoded_token(p[1], - p[0]).encode("utf-8", - errors="replace"))) + bytes=list(token.encode("utf-8", errors="replace"))) for i, p in enumerate(logprobs.items()) if top_logprobs and i < top_logprobs ] @@ -595,6 +608,7 @@ def _create_chat_logprobs( self, token_ids: GenericSequence[int], top_logprobs: GenericSequence[Optional[Dict[int, Logprob]]], + tokenizer: PreTrainedTokenizer, num_output_top_logprobs: Optional[int] = None, ) -> ChatCompletionLogProbs: """Create OpenAI-style logprobs.""" @@ -604,12 +618,11 @@ def _create_chat_logprobs( for i, token_id in enumerate(token_ids): step_top_logprobs = top_logprobs[i] if step_top_logprobs is None: + token = tokenizer.decode(token_id) logprobs_content.append( ChatCompletionLogProbsContent( - token=self.tokenizer.decode(token_id), - bytes=list( - self.tokenizer.decode(token_id).encode( - "utf-8", errors="replace")))) + token=token, + bytes=list(token.encode("utf-8", errors="replace")))) else: logprobs_content.append( ChatCompletionLogProbsContent( @@ -620,6 +633,7 @@ def _create_chat_logprobs( step_top_logprobs[token_id].decoded_token.encode( "utf-8", errors="replace")), top_logprobs=self._get_top_logprobs( - step_top_logprobs, num_output_top_logprobs))) + step_top_logprobs, num_output_top_logprobs, + tokenizer))) return ChatCompletionLogProbs(content=logprobs_content) diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 9c719d634ac7d..a6a31814ae3fd 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -5,6 +5,7 @@ from typing import Tuple from fastapi import Request +from transformers import PreTrainedTokenizer from vllm.config import ModelConfig from vllm.engine.async_llm_engine import AsyncLLMEngine @@ -100,15 +101,17 @@ async def create_completion(self, request: CompletionRequest, # Schedule the request and get the result generator. generators: List[AsyncIterator[RequestOutput]] = [] try: - sampling_params = request.to_sampling_params() lora_request = self._maybe_get_lora(request) + tokenizer = await self.engine.get_tokenizer(lora_request) + + sampling_params = request.to_sampling_params() decoding_config = await self.engine.get_decoding_config() guided_decoding_backend = request.guided_decoding_backend \ or decoding_config.guided_decoding_backend guided_decode_logit_processor = ( - await get_guided_decoding_logits_processor( - guided_decoding_backend, request, await - self.engine.get_tokenizer())) + await + get_guided_decoding_logits_processor(guided_decoding_backend, + request, tokenizer)) if guided_decode_logit_processor is not None: if sampling_params.logits_processors is None: sampling_params.logits_processors = [] @@ -117,18 +120,13 @@ async def create_completion(self, request: CompletionRequest, prompt_is_tokens, prompts = parse_prompt_format(request.prompt) for i, prompt in enumerate(prompts): - if prompt_is_tokens: - prompt_formats = self._validate_prompt_and_tokenize( - request, - prompt_ids=prompt, - truncate_prompt_tokens=sampling_params. - truncate_prompt_tokens) - else: - prompt_formats = self._validate_prompt_and_tokenize( - request, - prompt=prompt, - truncate_prompt_tokens=sampling_params. - truncate_prompt_tokens) + prompt_arg = "prompt_ids" if prompt_is_tokens else "prompt" + prompt_formats = await self._validate_prompt_and_tokenize( + request, + tokenizer, + truncate_prompt_tokens=sampling_params. + truncate_prompt_tokens, + **{prompt_arg: prompt}) prompt_ids, prompt_text = prompt_formats is_tracing_enabled = await self.engine.is_tracing_enabled() @@ -173,7 +171,8 @@ async def create_completion(self, request: CompletionRequest, request_id, created_time, model_name, - num_prompts=len(prompts)) + num_prompts=len(prompts), + tokenizer=tokenizer) # Non-streaming response final_res_batch: List[Optional[RequestOutput]] = [None] * len(prompts) @@ -185,7 +184,8 @@ async def create_completion(self, request: CompletionRequest, return self.create_error_response("Client disconnected") final_res_batch[i] = res response = self.request_output_to_completion_response( - final_res_batch, request, request_id, created_time, model_name) + final_res_batch, request, request_id, created_time, model_name, + tokenizer) except ValueError as e: # TODO: Use a vllm-specific Validation Error return self.create_error_response(str(e)) @@ -212,6 +212,7 @@ async def completion_stream_generator( created_time: int, model_name: str, num_prompts: int, + tokenizer: PreTrainedTokenizer, ) -> AsyncGenerator[str, None]: assert request.n is not None previous_texts = [""] * request.n * num_prompts @@ -262,6 +263,7 @@ async def completion_stream_generator( token_ids=delta_token_ids, top_logprobs=out_logprobs, num_output_top_logprobs=request.logprobs, + tokenizer=tokenizer, initial_text_offset=len(previous_texts[i]), ) else: @@ -330,6 +332,7 @@ def request_output_to_completion_response( request_id: str, created_time: int, model_name: str, + tokenizer: PreTrainedTokenizer, ) -> CompletionResponse: choices: List[CompletionResponseChoice] = [] num_prompt_tokens = 0 @@ -361,6 +364,7 @@ def request_output_to_completion_response( logprobs = self._create_completion_logprobs( token_ids=token_ids, top_logprobs=out_logprobs, + tokenizer=tokenizer, num_output_top_logprobs=request.logprobs, ) else: @@ -398,6 +402,7 @@ def _create_completion_logprobs( token_ids: GenericSequence[int], top_logprobs: GenericSequence[Optional[Dict[int, Logprob]]], num_output_top_logprobs: int, + tokenizer: PreTrainedTokenizer, initial_text_offset: int = 0, ) -> CompletionLogProbs: """Create logprobs for OpenAI Completion API.""" @@ -411,13 +416,13 @@ def _create_completion_logprobs( for i, token_id in enumerate(token_ids): step_top_logprobs = top_logprobs[i] if step_top_logprobs is None: - token = self.tokenizer.decode(token_id) + token = tokenizer.decode(token_id) out_tokens.append(token) out_token_logprobs.append(None) out_top_logprobs.append(None) else: token = self._get_decoded_token(step_top_logprobs[token_id], - token_id) + token_id, tokenizer) token_logprob = max(step_top_logprobs[token_id].logprob, -9999.0) out_tokens.append(token) @@ -430,7 +435,7 @@ def _create_completion_logprobs( out_top_logprobs.append({ # Convert float("-inf") to the # JSON-serializable float that OpenAI uses - self._get_decoded_token(top_lp[1], top_lp[0]): + self._get_decoded_token(top_lp[1], top_lp[0], tokenizer): max(top_lp[1].logprob, -9999.0) for i, top_lp in enumerate(step_top_logprobs.items()) if num_output_top_logprobs >= i @@ -455,8 +460,11 @@ async def create_tokenize(self, if error_check_ret is not None: return error_check_ret - (input_ids, input_text) = self._validate_prompt_and_tokenize( + lora_request = self._maybe_get_lora(request) + tokenizer = await self.engine.get_tokenizer(lora_request) + (input_ids, input_text) = await self._validate_prompt_and_tokenize( request, + tokenizer, prompt=request.prompt, add_special_tokens=request.add_special_tokens) @@ -470,7 +478,9 @@ async def create_detokenize( if error_check_ret is not None: return error_check_ret - (input_ids, input_text) = self._validate_prompt_and_tokenize( - request, prompt_ids=request.tokens) + lora_request = self._maybe_get_lora(request) + tokenizer = await self.engine.get_tokenizer(lora_request) + (input_ids, input_text) = await self._validate_prompt_and_tokenize( + request, tokenizer, prompt_ids=request.tokens) return DetokenizeResponse(prompt=input_text) diff --git a/vllm/entrypoints/openai/serving_embedding.py b/vllm/entrypoints/openai/serving_embedding.py index 4838cb7d0255a..19e4288f5aa1c 100644 --- a/vllm/entrypoints/openai/serving_embedding.py +++ b/vllm/entrypoints/openai/serving_embedding.py @@ -89,14 +89,11 @@ async def create_embedding(self, request: EmbeddingRequest, prompt_is_tokens, prompts = parse_prompt_format(request.input) pooling_params = request.to_pooling_params() + tokenizer = await self.engine.get_tokenizer() for i, prompt in enumerate(prompts): - if prompt_is_tokens: - prompt_formats = self._validate_prompt_and_tokenize( - request, prompt_ids=prompt) - else: - prompt_formats = self._validate_prompt_and_tokenize( - request, prompt=prompt) - + prompt_arg = "prompt_ids" if prompt_is_tokens else "prompt" + prompt_formats = await self._validate_prompt_and_tokenize( + request, tokenizer, **{prompt_arg: prompt}) prompt_ids, prompt_text = prompt_formats generator = self.engine.encode( diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index 8d281c51f02bc..833373b4d2c14 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -4,6 +4,7 @@ from typing import Any, Dict, List, Optional, Tuple, Union from pydantic import Field +from transformers import PreTrainedTokenizer from typing_extensions import Annotated from vllm.config import ModelConfig @@ -17,7 +18,6 @@ from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.sequence import Logprob -from vllm.transformers_utils.tokenizer import get_tokenizer logger = init_logger(__name__) @@ -39,14 +39,6 @@ def __init__(self, engine: AsyncLLMEngine, model_config: ModelConfig, self.model_config = model_config self.max_model_len = model_config.max_model_len - # A separate tokenizer to map token IDs to strings. - self.tokenizer = get_tokenizer( - model_config.tokenizer, - tokenizer_mode=model_config.tokenizer_mode, - tokenizer_revision=model_config.tokenizer_revision, - trust_remote_code=model_config.trust_remote_code, - truncation_side="left") - self.served_model_names = served_model_names if lora_modules is None: @@ -116,7 +108,8 @@ async def _check_model( def _maybe_get_lora( self, request: Union[CompletionRequest, ChatCompletionRequest, - EmbeddingRequest] + EmbeddingRequest, TokenizeRequest, + DetokenizeRequest] ) -> Optional[LoRARequest]: if request.model in self.served_model_names: return None @@ -126,11 +119,12 @@ def _maybe_get_lora( # if _check_model has been called earlier, this will be unreachable raise ValueError(f"The model `{request.model}` does not exist.") - def _validate_prompt_and_tokenize( + async def _validate_prompt_and_tokenize( self, request: Union[ChatCompletionRequest, CompletionRequest, DetokenizeRequest, EmbeddingRequest, TokenizeRequest], + tokenizer: "PreTrainedTokenizer", prompt: Optional[str] = None, prompt_ids: Optional[List[int]] = None, truncate_prompt_tokens: Optional[Annotated[int, @@ -139,7 +133,7 @@ def _validate_prompt_and_tokenize( ) -> Tuple[List[int], str]: if not (prompt or prompt_ids): raise ValueError("Either prompt or prompt_ids should be provided.") - if (prompt and prompt_ids): + if prompt and prompt_ids: raise ValueError( "Only one of prompt or prompt_ids should be provided.") @@ -158,14 +152,14 @@ def _validate_prompt_and_tokenize( "truncation": True, "max_length": truncate_prompt_tokens, }) - input_ids = self.tokenizer(prompt, **tokenizer_kwargs).input_ids + input_ids = tokenizer(prompt, **tokenizer_kwargs).input_ids elif truncate_prompt_tokens is not None: input_ids = prompt_ids[-truncate_prompt_tokens:] else: input_ids = prompt_ids - input_text = prompt if prompt is not None else self.tokenizer.decode( - prompt_ids) + input_text = prompt if prompt is not None else tokenizer.decode( + input_ids) token_num = len(input_ids) # Note: EmbeddingRequest doesn't have max_tokens @@ -203,7 +197,9 @@ def _validate_prompt_and_tokenize( else: return input_ids, input_text - def _get_decoded_token(self, logprob: Logprob, token_id: int) -> str: + @staticmethod + def _get_decoded_token(logprob: Logprob, token_id: int, + tokenizer: PreTrainedTokenizer) -> str: if logprob.decoded_token is not None: return logprob.decoded_token - return self.tokenizer.decode(token_id) + return tokenizer.decode(token_id) diff --git a/vllm/transformers_utils/tokenizer.py b/vllm/transformers_utils/tokenizer.py index f5684dbf1271c..7553249544211 100644 --- a/vllm/transformers_utils/tokenizer.py +++ b/vllm/transformers_utils/tokenizer.py @@ -88,6 +88,9 @@ def get_tokenizer( "Cannot use the fast tokenizer in slow tokenizer mode.") kwargs["use_fast"] = False + if "truncation_side" not in kwargs: + kwargs["truncation_side"] = "left" + try: tokenizer = AutoTokenizer.from_pretrained( tokenizer_name, From cc536cdc55265a26990a43ffdf630348d2462992 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Mon, 8 Jul 2024 16:39:20 -0700 Subject: [PATCH 2/9] Fix tests --- tests/async_engine/test_chat_template.py | 28 ++++++------------- tests/entrypoints/openai/test_serving_chat.py | 3 +- 2 files changed, 10 insertions(+), 21 deletions(-) diff --git a/tests/async_engine/test_chat_template.py b/tests/async_engine/test_chat_template.py index 55b730812ea94..7751c92e668f3 100644 --- a/tests/async_engine/test_chat_template.py +++ b/tests/async_engine/test_chat_template.py @@ -62,12 +62,8 @@ class MockServingChat: def test_load_chat_template(): # Testing chatml template - tokenizer = MockTokenizer() - mock_serving_chat = MockServingChat(tokenizer) - OpenAIServingChat._load_chat_template(mock_serving_chat, - chat_template=chatml_jinja_path) - - template_content = tokenizer.chat_template + template_content = OpenAIServingChat._load_chat_template( + chat_template=chatml_jinja_path) # Test assertions assert template_content is not None @@ -79,24 +75,17 @@ def test_load_chat_template(): def test_no_load_chat_template_filelike(): # Testing chatml template template = "../../examples/does_not_exist" - tokenizer = MockTokenizer() - - mock_serving_chat = MockServingChat(tokenizer) with pytest.raises(ValueError, match="looks like a file path"): - OpenAIServingChat._load_chat_template(mock_serving_chat, - chat_template=template) + OpenAIServingChat._load_chat_template(chat_template=template) def test_no_load_chat_template_literallike(): # Testing chatml template template = "{{ messages }}" - tokenizer = MockTokenizer() - mock_serving_chat = MockServingChat(tokenizer) - OpenAIServingChat._load_chat_template(mock_serving_chat, - chat_template=template) - template_content = tokenizer.chat_template + template_content = OpenAIServingChat._load_chat_template( + chat_template=template) assert template_content == template @@ -108,9 +97,8 @@ def test_get_gen_prompt(model, template, add_generation_prompt, expected_output): # Initialize the tokenizer tokenizer = get_tokenizer(tokenizer_name=model) - mock_serving_chat = MockServingChat(tokenizer) - OpenAIServingChat._load_chat_template(mock_serving_chat, - chat_template=template) + template_content = OpenAIServingChat._load_chat_template( + chat_template=template) # Create a mock request object using keyword arguments mock_request = ChatCompletionRequest( @@ -119,6 +107,8 @@ def test_get_gen_prompt(model, template, add_generation_prompt, add_generation_prompt=add_generation_prompt) # Call the function and get the result + if template_content is not None: + tokenizer.chat_template = template_content result = tokenizer.apply_chat_template( conversation=mock_request.messages, tokenize=False, diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py index 74b49726734b5..9a7abcfe5e590 100644 --- a/tests/entrypoints/openai/test_serving_chat.py +++ b/tests/entrypoints/openai/test_serving_chat.py @@ -38,5 +38,4 @@ async def _async_serving_chat_init(): def test_async_serving_chat_init(): serving_completion = asyncio.run(_async_serving_chat_init()) - assert serving_completion.tokenizer is not None - assert serving_completion.tokenizer.chat_template == CHAT_TEMPLATE + assert serving_completion.chat_template == CHAT_TEMPLATE From 6db473add0b46f555288dad8c270df8b37f7f580 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Wed, 10 Jul 2024 19:10:10 -0700 Subject: [PATCH 3/9] Address comments from @DarkLight1337 --- tests/async_engine/test_chat_template.py | 16 ++-------------- vllm/entrypoints/openai/serving_chat.py | 12 +++++------- 2 files changed, 7 insertions(+), 21 deletions(-) diff --git a/tests/async_engine/test_chat_template.py b/tests/async_engine/test_chat_template.py index 7751c92e668f3..e7cb4dffb4b9d 100644 --- a/tests/async_engine/test_chat_template.py +++ b/tests/async_engine/test_chat_template.py @@ -1,6 +1,5 @@ import os import pathlib -from dataclasses import dataclass import pytest @@ -50,16 +49,6 @@ ] -@dataclass -class MockTokenizer: - chat_template = None - - -@dataclass -class MockServingChat: - tokenizer: MockTokenizer - - def test_load_chat_template(): # Testing chatml template template_content = OpenAIServingChat._load_chat_template( @@ -107,12 +96,11 @@ def test_get_gen_prompt(model, template, add_generation_prompt, add_generation_prompt=add_generation_prompt) # Call the function and get the result - if template_content is not None: - tokenizer.chat_template = template_content result = tokenizer.apply_chat_template( conversation=mock_request.messages, tokenize=False, - add_generation_prompt=mock_request.add_generation_prompt) + add_generation_prompt=mock_request.add_generation_prompt, + chat_template=mock_request.chat_template or template_content) # Test assertion assert result == expected_output, ( diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 611c8e3e223ab..68f6654d54c86 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -96,7 +96,8 @@ def _load_chat_template(chat_template: Optional[str]) -> Optional[str]: return resolved_chat_template @lru_cache(maxsize=32) # noqa: B019 - def image_token_str(self, tokenizer: PreTrainedTokenizer) -> Optional[str]: + def get_image_token_str(self, + tokenizer: PreTrainedTokenizer) -> Optional[str]: # TODO: Let user specify how to insert image tokens into prompt # (similar to chat template) model_type = self.model_config.hf_config.model_type @@ -111,8 +112,7 @@ def image_token_str(self, tokenizer: PreTrainedTokenizer) -> Optional[str]: return tokenizer.decode( self.model_config.hf_config.image_token_index) - else: - raise TypeError("Unknown model type: {model_type}") + raise TypeError("Unknown model type: {model_type}") # TODO: Let user specify how to insert image tokens into prompt # (similar to chat template) @@ -160,7 +160,7 @@ def _parse_chat_message_content_parts( text_prompt = "\n".join(texts) if mm_futures: - image_token_str = self.image_token_str(tokenizer) + image_token_str = self.get_image_token_str(tokenizer) if image_token_str is not None: if image_token_str in text_prompt: logger.warning( @@ -229,15 +229,13 @@ async def create_chat_completion( tool.model_dump() for tool in request.tools ] - if self.chat_template is not None: - tokenizer.chat_template = self.chat_template prompt = tokenizer.apply_chat_template( conversation=conversation, tokenize=False, add_generation_prompt=request.add_generation_prompt, tools=tool_dicts, documents=request.documents, - chat_template=request.chat_template, + chat_template=request.chat_template or self.chat_template, **(request.chat_template_kwargs or {}), ) except Exception as e: From eef9a8c083da80fb6e128dd633e22e88eccac44a Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Tue, 16 Jul 2024 12:21:19 -0700 Subject: [PATCH 4/9] test wip --- tests/entrypoints/openai/test_completion.py | 66 +++++++++++++++++---- 1 file changed, 54 insertions(+), 12 deletions(-) diff --git a/tests/entrypoints/openai/test_completion.py b/tests/entrypoints/openai/test_completion.py index 52a848b7831d5..e732cd26c8a55 100644 --- a/tests/entrypoints/openai/test_completion.py +++ b/tests/entrypoints/openai/test_completion.py @@ -1,6 +1,8 @@ # imports for guided decoding tests import json import re +import shutil +from tempfile import TemporaryDirectory from typing import List import jsonschema @@ -13,6 +15,7 @@ # downloading lora to test lora requests from huggingface_hub import snapshot_download from openai import BadRequestError +from transformers import AutoTokenizer from vllm.transformers_utils.tokenizer import get_tokenizer @@ -77,6 +80,20 @@ def zephyr_lora_files(): return snapshot_download(repo_id=LORA_NAME) +@pytest.fixture(scope="module") +def zephyr_lora_added_tokens_files(zephyr_lora_files): + tmp_dir = TemporaryDirectory() + tmp_model_dir = f"{tmp_dir.name}/zephyr" + shutil.copytree(zephyr_lora_files, tmp_model_dir) + tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME) + # 32000, 32001, 32002 + tokenizer.add_tokens(["vllm1", "vllm2", "vllm3"], special_tokens=True) + tokenizer.save_pretrained(tmp_model_dir) + #TODO added_embeddings.safetensors? + yield tmp_model_dir + tmp_dir.cleanup() + + @pytest.fixture(scope="module") def ray_ctx(): ray.init(runtime_env={"working_dir": VLLM_PATH}) @@ -85,7 +102,7 @@ def ray_ctx(): @pytest.fixture(scope="module") -def server(zephyr_lora_files, ray_ctx): +def server(zephyr_lora_files, zephyr_lora_added_tokens_files, ray_ctx): return RemoteOpenAIServer([ "--model", MODEL_NAME, @@ -99,7 +116,7 @@ def server(zephyr_lora_files, ray_ctx): "--enable-lora", "--lora-modules", f"zephyr-lora={zephyr_lora_files}", - f"zephyr-lora2={zephyr_lora_files}", + f"zephyr-lora2={zephyr_lora_added_tokens_files}", "--max-lora-rank", "64", "--max-cpu-loras", @@ -137,7 +154,7 @@ async def test_single_completion(client: openai.AsyncOpenAI, model_name: str): # test using token IDs completion = await client.completions.create( - model=MODEL_NAME, + model=model_name, prompt=[0, 0, 0, 0, 0], max_tokens=5, temperature=0.0, @@ -145,6 +162,31 @@ async def test_single_completion(client: openai.AsyncOpenAI, model_name: str): assert len(completion.choices[0].text) >= 5 +@pytest.mark.asyncio +async def test_added_lora_tokens(client: openai.AsyncOpenAI): + # test using token IDs + completion = await client.completions.create( + model="zephyr-lora2", + prompt=[0, 0, 32000, 32001, 32002], + max_tokens=5, + temperature=0.0, + ) + assert len(completion.choices[0].text) >= 5 + + +@pytest.mark.asyncio +async def test_added_lora_tokens_base_model(client: openai.AsyncOpenAI): + with pytest.raises( + (openai.BadRequestError, openai.APIError)): # test using token IDs + completion = await client.completions.create( + model=MODEL_NAME, + prompt=[0, 0, 32000, 32001, 32002], + max_tokens=5, + temperature=0.0, + ) + assert len(completion.choices[0].text) >= 5 + + @pytest.mark.asyncio @pytest.mark.parametrize( # first test base model, then test loras @@ -154,7 +196,7 @@ async def test_single_completion(client: openai.AsyncOpenAI, model_name: str): async def test_no_logprobs(client: openai.AsyncOpenAI, model_name: str): # test using token IDs completion = await client.completions.create( - model=MODEL_NAME, + model=model_name, prompt=[0, 0, 0, 0, 0], max_tokens=5, temperature=0.0, @@ -173,7 +215,7 @@ async def test_no_logprobs(client: openai.AsyncOpenAI, model_name: str): async def test_zero_logprobs(client: openai.AsyncOpenAI, model_name: str): # test using token IDs completion = await client.completions.create( - model=MODEL_NAME, + model=model_name, prompt=[0, 0, 0, 0, 0], max_tokens=5, temperature=0.0, @@ -194,7 +236,7 @@ async def test_zero_logprobs(client: openai.AsyncOpenAI, model_name: str): async def test_some_logprobs(client: openai.AsyncOpenAI, model_name: str): # test using token IDs completion = await client.completions.create( - model=MODEL_NAME, + model=model_name, prompt=[0, 0, 0, 0, 0], max_tokens=5, temperature=0.0, @@ -218,7 +260,7 @@ async def test_too_many_completion_logprobs(client: openai.AsyncOpenAI, with pytest.raises( (openai.BadRequestError, openai.APIError)): # test using token IDs await client.completions.create( - model=MODEL_NAME, + model=model_name, prompt=[0, 0, 0, 0, 0], max_tokens=5, temperature=0.0, @@ -230,7 +272,7 @@ async def test_too_many_completion_logprobs(client: openai.AsyncOpenAI, with pytest.raises( (openai.BadRequestError, openai.APIError)): # test using token IDs stream = await client.completions.create( - model=MODEL_NAME, + model=model_name, prompt=[0, 0, 0, 0, 0], max_tokens=5, temperature=0.0, @@ -679,11 +721,11 @@ async def test_guided_decoding_type_error(client: openai.AsyncOpenAI, @pytest.mark.asyncio @pytest.mark.parametrize( "model_name", - [MODEL_NAME], + [MODEL_NAME, "zephyr-lora2"], ) async def test_tokenize(client: openai.AsyncOpenAI, model_name: str): base_url = str(client.base_url)[:-3].strip("/") - tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME, tokenizer_mode="fast") + tokenizer = get_tokenizer(tokenizer_name=model_name, tokenizer_mode="fast") for add_special in [False, True]: prompt = "This is a test prompt." @@ -706,11 +748,11 @@ async def test_tokenize(client: openai.AsyncOpenAI, model_name: str): @pytest.mark.asyncio @pytest.mark.parametrize( "model_name", - [MODEL_NAME], + [MODEL_NAME, "zephyr-lora2"], ) async def test_detokenize(client: openai.AsyncOpenAI, model_name: str): base_url = str(client.base_url)[:-3] - tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME, tokenizer_mode="fast") + tokenizer = get_tokenizer(tokenizer_name=model_name, tokenizer_mode="fast") prompt = "This is a test prompt." tokens = tokenizer.encode(prompt, add_special_tokens=False) From abdd2f944afab87d5b5266fbf4016c6970c84ccc Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Wed, 17 Jul 2024 16:46:32 -0700 Subject: [PATCH 5/9] Fixes and test updates --- tests/entrypoints/openai/test_chat.py | 13 ++--- tests/entrypoints/openai/test_completion.py | 29 ++++++---- tests/entrypoints/openai/test_tokenization.py | 56 +++++++++++++------ vllm/entrypoints/openai/api_server.py | 3 +- .../openai/serving_tokenization.py | 6 +- vllm/transformers_utils/detokenizer.py | 8 +++ 6 files changed, 75 insertions(+), 40 deletions(-) diff --git a/tests/entrypoints/openai/test_chat.py b/tests/entrypoints/openai/test_chat.py index d370c63c0c7ba..32e2d29f2aec5 100644 --- a/tests/entrypoints/openai/test_chat.py +++ b/tests/entrypoints/openai/test_chat.py @@ -7,11 +7,11 @@ import openai # use the official client for correctness check import pytest import torch -# downloading lora to test lora requests -from huggingface_hub import snapshot_download from openai import BadRequestError from ...utils import RemoteOpenAIServer +from .test_completion import zephyr_lora_added_tokens_files # noqa: F401 +from .test_completion import zephyr_lora_files # noqa: F401 # any model with a chat template should work here MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" @@ -21,12 +21,7 @@ @pytest.fixture(scope="module") -def zephyr_lora_files(): - return snapshot_download(repo_id=LORA_NAME) - - -@pytest.fixture(scope="module") -def server(zephyr_lora_files): +def server(zephyr_lora_files, zephyr_lora_added_tokens_files): # noqa: F811 with RemoteOpenAIServer([ "--model", MODEL_NAME, @@ -40,7 +35,7 @@ def server(zephyr_lora_files): "--enable-lora", "--lora-modules", f"zephyr-lora={zephyr_lora_files}", - f"zephyr-lora2={zephyr_lora_files}", + f"zephyr-lora2={zephyr_lora_added_tokens_files}", "--max-lora-rank", "64", "--max-cpu-loras", diff --git a/tests/entrypoints/openai/test_completion.py b/tests/entrypoints/openai/test_completion.py index 0d6c0cd91a448..fc5c301f5d536 100644 --- a/tests/entrypoints/openai/test_completion.py +++ b/tests/entrypoints/openai/test_completion.py @@ -39,10 +39,12 @@ def zephyr_lora_added_tokens_files(zephyr_lora_files): tmp_model_dir = f"{tmp_dir.name}/zephyr" shutil.copytree(zephyr_lora_files, tmp_model_dir) tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME) + # Copy tokenizer to adapter and add some unique tokens # 32000, 32001, 32002 - tokenizer.add_tokens(["vllm1", "vllm2", "vllm3"], special_tokens=True) + added = tokenizer.add_tokens(["vllm1", "vllm2", "vllm3"], + special_tokens=True) + assert added == 3 tokenizer.save_pretrained(tmp_model_dir) - #TODO added_embeddings.safetensors? yield tmp_model_dir tmp_dir.cleanup() @@ -134,23 +136,26 @@ async def test_added_lora_tokens(client: openai.AsyncOpenAI): completion = await client.completions.create( model="zephyr-lora2", prompt=[0, 0, 32000, 32001, 32002], + echo=True, max_tokens=5, temperature=0.0, ) - assert len(completion.choices[0].text) >= 1 + # Added tokens should appear in tokenized prompt + assert completion.choices[0].text.startswith("vllm1vllm2vllm3") @pytest.mark.asyncio async def test_added_lora_tokens_base_model(client: openai.AsyncOpenAI): - with pytest.raises( - (openai.BadRequestError, openai.APIError)): # test using token IDs - completion = await client.completions.create( - model=MODEL_NAME, - prompt=[0, 0, 32000, 32001, 32002], - max_tokens=5, - temperature=0.0, - ) - assert len(completion.choices[0].text) >= 1 + # test using token IDs + completion = await client.completions.create( + model=MODEL_NAME, + prompt=[0, 0, 32000, 32001, 32002], + echo=True, + max_tokens=5, + temperature=0.0, + ) + # Added tokens should not appear in tokenized prompt + assert "vllm" not in completion.choices[0].text @pytest.mark.asyncio diff --git a/tests/entrypoints/openai/test_tokenization.py b/tests/entrypoints/openai/test_tokenization.py index d33fd222ee150..64f5df50a0eaf 100644 --- a/tests/entrypoints/openai/test_tokenization.py +++ b/tests/entrypoints/openai/test_tokenization.py @@ -5,13 +5,15 @@ from vllm.transformers_utils.tokenizer import get_tokenizer from ...utils import RemoteOpenAIServer +from .test_completion import zephyr_lora_added_tokens_files # noqa: F401 +from .test_completion import zephyr_lora_files # noqa: F401 # any model with a chat template should work here MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" @pytest.fixture(scope="module") -def server(): +def server(zephyr_lora_added_tokens_files: str): # noqa: F811 with RemoteOpenAIServer([ "--model", MODEL_NAME, @@ -23,10 +25,23 @@ def server(): "--enforce-eager", "--max-num-seqs", "128", + # lora config + "--enable-lora", + "--lora-modules", + f"zephyr-lora2={zephyr_lora_added_tokens_files}", + "--max-lora-rank", + "64", ]) as remote_server: yield remote_server +@pytest.fixture(scope="module") +def tokenizer_name(model_name: str, + zephyr_lora_added_tokens_files: str): # noqa: F811 + return zephyr_lora_added_tokens_files if ( + model_name == "zephyr-lora2") else model_name + + @pytest.fixture(scope="module") def client(server): return server.get_async_client() @@ -34,16 +49,18 @@ def client(server): @pytest.mark.asyncio @pytest.mark.parametrize( - "model_name", - [MODEL_NAME], + "model_name,tokenizer_name", + [(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")], + indirect=["tokenizer_name"], ) async def test_tokenize_completions(client: openai.AsyncOpenAI, - model_name: str): + model_name: str, tokenizer_name: str): base_url = str(client.base_url)[:-3].strip("/") - tokenizer = get_tokenizer(tokenizer_name=model_name, tokenizer_mode="fast") + tokenizer = get_tokenizer(tokenizer_name=tokenizer_name, + tokenizer_mode="fast") for add_special in [False, True]: - prompt = "This is a test prompt." + prompt = "vllm1 This is a test prompt." tokens = tokenizer.encode(prompt, add_special_tokens=add_special) response = requests.post(base_url + "/tokenize", @@ -63,12 +80,15 @@ async def test_tokenize_completions(client: openai.AsyncOpenAI, @pytest.mark.asyncio @pytest.mark.parametrize( - "model_name", - [MODEL_NAME], + "model_name,tokenizer_name", + [(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")], + indirect=["tokenizer_name"], ) -async def test_tokenize_chat(client: openai.AsyncOpenAI, model_name: str): +async def test_tokenize_chat(client: openai.AsyncOpenAI, model_name: str, + tokenizer_name: str): base_url = str(client.base_url)[:-3].strip("/") - tokenizer = get_tokenizer(tokenizer_name=model_name, tokenizer_mode="fast") + tokenizer = get_tokenizer(tokenizer_name=tokenizer_name, + tokenizer_mode="fast") for add_generation in [False, True]: for add_special in [False, True]: @@ -80,7 +100,7 @@ async def test_tokenize_chat(client: openai.AsyncOpenAI, model_name: str): "content": "Nice to meet you!" }, { "role": "user", - "content": "Can I ask a question?" + "content": "Can I ask a question? vllm1" }] prompt = tokenizer.apply_chat_template( @@ -108,16 +128,20 @@ async def test_tokenize_chat(client: openai.AsyncOpenAI, model_name: str): @pytest.mark.asyncio @pytest.mark.parametrize( - "model_name", - [MODEL_NAME], + "model_name,tokenizer_name", + [(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")], + indirect=["tokenizer_name"], ) -async def test_detokenize(client: openai.AsyncOpenAI, model_name: str): +async def test_detokenize(client: openai.AsyncOpenAI, model_name: str, + tokenizer_name: str): base_url = str(client.base_url)[:-3].strip("/") - tokenizer = get_tokenizer(tokenizer_name=model_name, tokenizer_mode="fast") + tokenizer = get_tokenizer(tokenizer_name=tokenizer_name, + tokenizer_mode="fast") - prompt = "This is a test prompt." + prompt = "This is a test prompt. vllm1" tokens = tokenizer.encode(prompt, add_special_tokens=False) + print(f"CALLING {base_url} FOR {model_name}") response = requests.post(base_url + "/detokenize", json={ "model": model_name, diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index a35dcbbd6545e..b6bf08e5fae60 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -257,7 +257,8 @@ def run_server(args, llm_engine=None): openai_serving_embedding = OpenAIServingEmbedding(engine, model_config, served_model_names) openai_serving_tokenization = OpenAIServingTokenization( - engine, model_config, served_model_names, args.chat_template) + engine, model_config, served_model_names, args.lora_modules, + args.chat_template) app.root_path = args.root_path logger.info("Available routes are:") diff --git a/vllm/entrypoints/openai/serving_tokenization.py b/vllm/entrypoints/openai/serving_tokenization.py index e291064158448..28a344c2d176e 100644 --- a/vllm/entrypoints/openai/serving_tokenization.py +++ b/vllm/entrypoints/openai/serving_tokenization.py @@ -9,7 +9,8 @@ DetokenizeResponse, TokenizeRequest, TokenizeResponse) -from vllm.entrypoints.openai.serving_engine import OpenAIServing +from vllm.entrypoints.openai.serving_engine import (LoRAModulePath, + OpenAIServing) class OpenAIServingTokenization(OpenAIServing): @@ -18,11 +19,12 @@ def __init__(self, engine: AsyncLLMEngine, model_config: ModelConfig, served_model_names: List[str], + lora_modules: Optional[List[LoRAModulePath]] = None, chat_template: Optional[str] = None): super().__init__(engine=engine, model_config=model_config, served_model_names=served_model_names, - lora_modules=None) + lora_modules=lora_modules) # If this is None we use the tokenizer's default chat template self.chat_template = load_chat_template(chat_template) diff --git a/vllm/transformers_utils/detokenizer.py b/vllm/transformers_utils/detokenizer.py index cc9a971301afc..0a45028e7759b 100644 --- a/vllm/transformers_utils/detokenizer.py +++ b/vllm/transformers_utils/detokenizer.py @@ -165,6 +165,12 @@ def decode_sequence_inplace(self, seq: Sequence, return len(new_decoded_token_text) +def _replace_none_with_empty(tokens: List[Optional[str]]): + for i, token in enumerate(tokens): + if token is None: + tokens[i] = "" + + def _convert_tokens_to_string_with_added_encoders( tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], output_tokens: List[str], @@ -223,6 +229,8 @@ def convert_prompt_ids_to_tokens( read_offset = len(new_tokens) prefix_offset = max( read_offset - INITIAL_INCREMENTAL_DETOKENIZATION_OFFSET, 0) + # This is required to guard against out-of-vocab prompt token ids + _replace_none_with_empty(new_tokens) return new_tokens, prefix_offset, read_offset From 9b980683a7575240745223c9c676deec46db9976 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Wed, 17 Jul 2024 17:23:10 -0700 Subject: [PATCH 6/9] yapf --- tests/entrypoints/openai/test_tokenization.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/entrypoints/openai/test_tokenization.py b/tests/entrypoints/openai/test_tokenization.py index ebf2dbfbb2b4b..18c51c560b511 100644 --- a/tests/entrypoints/openai/test_tokenization.py +++ b/tests/entrypoints/openai/test_tokenization.py @@ -34,6 +34,7 @@ def server(zephyr_lora_added_tokens_files: str): # noqa: F811 with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: yield remote_server + @pytest.fixture(scope="module") def tokenizer_name(model_name: str, zephyr_lora_added_tokens_files: str): # noqa: F811 From 89c96e970bf9d4db23120fe32d6c143d51ba8ab3 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Wed, 17 Jul 2024 20:48:45 -0700 Subject: [PATCH 7/9] Fix image token string caching --- vllm/entrypoints/openai/chat_utils.py | 17 +++++++++-------- vllm/entrypoints/openai/serving_chat.py | 2 +- vllm/entrypoints/openai/serving_tokenization.py | 4 ++-- 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/vllm/entrypoints/openai/chat_utils.py b/vllm/entrypoints/openai/chat_utils.py index a3ee686460cbe..c0dddaeed51f9 100644 --- a/vllm/entrypoints/openai/chat_utils.py +++ b/vllm/entrypoints/openai/chat_utils.py @@ -3,9 +3,10 @@ from functools import lru_cache from typing import Awaitable, Iterable, List, Optional, TypedDict, cast, final +from config import ModelConfig from openai.types.chat import (ChatCompletionContentPartImageParam, ChatCompletionContentPartTextParam) -from transformers import PretrainedConfig, PreTrainedTokenizer +from transformers import PreTrainedTokenizer from vllm.entrypoints.openai.protocol import (ChatCompletionContentPartParam, ChatCompletionMessageParam) @@ -52,11 +53,11 @@ def load_chat_template(chat_template: Optional[str]) -> Optional[str]: @lru_cache(maxsize=None) -def _image_token_str(hf_config: PretrainedConfig, +def _image_token_str(model_config: ModelConfig, tokenizer: PreTrainedTokenizer) -> Optional[str]: # TODO: Let user specify how to insert image tokens into prompt # (similar to chat template) - model_type = hf_config.model_type + model_type = model_config.hf_config.model_type if model_type == "phi3_v": # Workaround since this token is not defined in the tokenizer return "<|image_1|>" @@ -64,7 +65,7 @@ def _image_token_str(hf_config: PretrainedConfig, # These models do not use image tokens in the prompt return None if model_type.startswith("llava"): - return tokenizer.decode(hf_config.image_token_index) + return tokenizer.decode(model_config.hf_config.image_token_index) raise TypeError("Unknown model type: {model_type}") @@ -82,7 +83,7 @@ def _get_full_image_text_prompt(image_token_str: str, text_prompt: str) -> str: def _parse_chat_message_content_parts( role: str, parts: Iterable[ChatCompletionContentPartParam], - hf_config: PretrainedConfig, + model_config: ModelConfig, tokenizer: PreTrainedTokenizer, ) -> ChatMessageParseResult: texts: List[str] = [] @@ -114,7 +115,7 @@ def _parse_chat_message_content_parts( text_prompt = "\n".join(texts) if mm_futures: - image_token_str = _image_token_str(hf_config, tokenizer) + image_token_str = _image_token_str(model_config, tokenizer) if image_token_str is not None: if image_token_str in text_prompt: logger.warning( @@ -133,7 +134,7 @@ def _parse_chat_message_content_parts( def parse_chat_message_content( message: ChatCompletionMessageParam, - hf_config: PretrainedConfig, + model_config: ModelConfig, tokenizer: PreTrainedTokenizer, ) -> ChatMessageParseResult: role = message["role"] @@ -145,5 +146,5 @@ def parse_chat_message_content( messages = [ConversationMessage(role=role, content=content)] return ChatMessageParseResult(messages=messages, mm_futures=[]) - return _parse_chat_message_content_parts(role, content, hf_config, + return _parse_chat_message_content_parts(role, content, model_config, tokenizer) diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index e5208dfd35bae..0d7eede377ce5 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -82,7 +82,7 @@ async def create_chat_completion( for msg in request.messages: chat_parsed_result = parse_chat_message_content( - msg, self.model_config.hf_config, tokenizer) + msg, self.model_config, tokenizer) conversation.extend(chat_parsed_result.messages) mm_futures.extend(chat_parsed_result.mm_futures) diff --git a/vllm/entrypoints/openai/serving_tokenization.py b/vllm/entrypoints/openai/serving_tokenization.py index 28a344c2d176e..94367bd3a6048 100644 --- a/vllm/entrypoints/openai/serving_tokenization.py +++ b/vllm/entrypoints/openai/serving_tokenization.py @@ -49,8 +49,8 @@ async def create_tokenize(self, conversation: List[ConversationMessage] = [] for message in request.messages: - result = parse_chat_message_content( - message, self.model_config.hf_config, tokenizer) + result = parse_chat_message_content(message, self.model_config, + tokenizer) conversation.extend(result.messages) request.prompt = tokenizer.apply_chat_template( From e0c5e39e2880aa6531e53eacaa36cf07ad4b287c Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Thu, 18 Jul 2024 12:39:51 +0800 Subject: [PATCH 8/9] Fix import --- vllm/entrypoints/openai/chat_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/entrypoints/openai/chat_utils.py b/vllm/entrypoints/openai/chat_utils.py index c0dddaeed51f9..ddc5c869f4cb1 100644 --- a/vllm/entrypoints/openai/chat_utils.py +++ b/vllm/entrypoints/openai/chat_utils.py @@ -3,7 +3,7 @@ from functools import lru_cache from typing import Awaitable, Iterable, List, Optional, TypedDict, cast, final -from config import ModelConfig +from vllm.config import ModelConfig from openai.types.chat import (ChatCompletionContentPartImageParam, ChatCompletionContentPartTextParam) from transformers import PreTrainedTokenizer From 17f0b63bd33962a17bc7cc4ba1e9c5df075ecf0c Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Thu, 18 Jul 2024 12:53:39 +0800 Subject: [PATCH 9/9] Format --- vllm/entrypoints/openai/chat_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/entrypoints/openai/chat_utils.py b/vllm/entrypoints/openai/chat_utils.py index ddc5c869f4cb1..b3d5ca77ac16d 100644 --- a/vllm/entrypoints/openai/chat_utils.py +++ b/vllm/entrypoints/openai/chat_utils.py @@ -3,11 +3,11 @@ from functools import lru_cache from typing import Awaitable, Iterable, List, Optional, TypedDict, cast, final -from vllm.config import ModelConfig from openai.types.chat import (ChatCompletionContentPartImageParam, ChatCompletionContentPartTextParam) from transformers import PreTrainedTokenizer +from vllm.config import ModelConfig from vllm.entrypoints.openai.protocol import (ChatCompletionContentPartParam, ChatCompletionMessageParam) from vllm.logger import init_logger