diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py
index a82212677f63a..9db5951e5fe5b 100644
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -760,22 +760,6 @@ class EmbeddingChatRequest(OpenAIBaseModel):
     # doc: end-chat-embedding-pooling-params
 
     # doc: begin-chat-embedding-extra-params
-    add_generation_prompt: bool = Field(
-        default=True,
-        description=
-        ("If true, the generation prompt will be added to the chat template. "
-         "This is a parameter used by chat template in tokenizer config of the "
-         "model."),
-    )
-    continue_final_message: bool = Field(
-        default=False,
-        description=
-        ("If this is set, the chat will be formatted so that the final "
-         "message in the chat is open-ended, without any EOS tokens. The "
-         "model will continue this message rather than starting a new one. "
-         "This allows you to \"prefill\" part of the model's response for it. "
-         "Cannot be used at the same time as `add_generation_prompt`."),
-    )
     add_special_tokens: bool = Field(
         default=False,
         description=(
diff --git a/vllm/entrypoints/openai/serving_embedding.py b/vllm/entrypoints/openai/serving_embedding.py
index 74ad7389784fc..c84a7d2d8e13e 100644
--- a/vllm/entrypoints/openai/serving_embedding.py
+++ b/vllm/entrypoints/openai/serving_embedding.py
@@ -148,8 +148,10 @@ async def create_embedding(
                     chat_template=request.chat_template or self.chat_template,
                     chat_template_content_format=self.
                     chat_template_content_format,
-                    add_generation_prompt=request.add_generation_prompt,
-                    continue_final_message=request.continue_final_message,
+                    # In embedding requests, we are not generating tokens,
+                    # so there is no need to append extra tokens to the input
+                    add_generation_prompt=False,
+                    continue_final_message=False,
                     truncate_prompt_tokens=truncate_prompt_tokens,
                     add_special_tokens=request.add_special_tokens,
                 )
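
To illustrate what hard-coding `add_generation_prompt=False` changes, here is a minimal sketch (not vLLM internals) that renders a chat template with and without a generation prompt using the standard Hugging Face `apply_chat_template` API. The model name is only an assumed example of a tokenizer that ships a ChatML-style template; any chat-template-bearing tokenizer works the same way.

```python
# Sketch only: demonstrates the difference between rendering a chat for
# embedding (no generation prompt) and for generation (prompt appended).
from transformers import AutoTokenizer

# Assumed example model; substitute any tokenizer with a chat template.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-1.5B-Instruct")

messages = [{"role": "user", "content": "Represent this sentence for retrieval."}]

# What an embedding request now effectively uses: the rendered conversation
# itself, with no trailing assistant header appended.
prompt_for_embedding = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=False)

# What a completion-style request would use: the template appends the tokens
# that cue the model to start generating a new assistant turn.
prompt_for_generation = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True)

# The suffix below is the extra text an embedding input no longer carries.
print(repr(prompt_for_generation.removeprefix(prompt_for_embedding)))
```

Since an embeddings request never asks the model to continue the conversation, those extra generation-cue tokens would only perturb the pooled representation, which is why the server now pins both flags to `False` instead of reading them from the request.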