diff --git a/tests/entrypoints/openai/test_chat.py b/tests/entrypoints/openai/test_chat.py
index d370c63c0c7ba..32e2d29f2aec5 100644
--- a/tests/entrypoints/openai/test_chat.py
+++ b/tests/entrypoints/openai/test_chat.py
@@ -7,11 +7,11 @@
 import openai  # use the official client for correctness check
 import pytest
 import torch
-# downloading lora to test lora requests
-from huggingface_hub import snapshot_download
 from openai import BadRequestError

 from ...utils import RemoteOpenAIServer
+from .test_completion import zephyr_lora_added_tokens_files  # noqa: F401
+from .test_completion import zephyr_lora_files  # noqa: F401

 # any model with a chat template should work here
 MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
@@ -21,12 +21,7 @@


 @pytest.fixture(scope="module")
-def zephyr_lora_files():
-    return snapshot_download(repo_id=LORA_NAME)
-
-
-@pytest.fixture(scope="module")
-def server(zephyr_lora_files):
+def server(zephyr_lora_files, zephyr_lora_added_tokens_files):  # noqa: F811
     with RemoteOpenAIServer([
         "--model",
         MODEL_NAME,
@@ -40,7 +35,7 @@ def server(zephyr_lora_files):
         "--enable-lora",
         "--lora-modules",
         f"zephyr-lora={zephyr_lora_files}",
-        f"zephyr-lora2={zephyr_lora_files}",
+        f"zephyr-lora2={zephyr_lora_added_tokens_files}",
         "--max-lora-rank",
         "64",
         "--max-cpu-loras",
diff --git a/tests/entrypoints/openai/test_completion.py b/tests/entrypoints/openai/test_completion.py
index 0d6c0cd91a448..fc5c301f5d536 100644
--- a/tests/entrypoints/openai/test_completion.py
+++ b/tests/entrypoints/openai/test_completion.py
@@ -39,10 +39,12 @@ def zephyr_lora_added_tokens_files(zephyr_lora_files):
     tmp_model_dir = f"{tmp_dir.name}/zephyr"
     shutil.copytree(zephyr_lora_files, tmp_model_dir)
     tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+    # Copy tokenizer to adapter and add some unique tokens
     # 32000, 32001, 32002
-    tokenizer.add_tokens(["vllm1", "vllm2", "vllm3"], special_tokens=True)
+    added = tokenizer.add_tokens(["vllm1", "vllm2", "vllm3"],
+                                 special_tokens=True)
+    assert added == 3
     tokenizer.save_pretrained(tmp_model_dir)
-    #TODO added_embeddings.safetensors?
     yield tmp_model_dir
     tmp_dir.cleanup()

@@ -134,23 +136,26 @@ async def test_added_lora_tokens(client: openai.AsyncOpenAI):
     completion = await client.completions.create(
         model="zephyr-lora2",
         prompt=[0, 0, 32000, 32001, 32002],
+        echo=True,
         max_tokens=5,
         temperature=0.0,
     )
-    assert len(completion.choices[0].text) >= 1
+    # Added tokens should appear in tokenized prompt
+    assert completion.choices[0].text.startswith("vllm1vllm2vllm3")


 @pytest.mark.asyncio
 async def test_added_lora_tokens_base_model(client: openai.AsyncOpenAI):
-    with pytest.raises(
-            (openai.BadRequestError, openai.APIError)):  # test using token IDs
-        completion = await client.completions.create(
-            model=MODEL_NAME,
-            prompt=[0, 0, 32000, 32001, 32002],
-            max_tokens=5,
-            temperature=0.0,
-        )
-        assert len(completion.choices[0].text) >= 1
+    # test using token IDs
+    completion = await client.completions.create(
+        model=MODEL_NAME,
+        prompt=[0, 0, 32000, 32001, 32002],
+        echo=True,
+        max_tokens=5,
+        temperature=0.0,
+    )
+    # Added tokens should not appear in tokenized prompt
+    assert "vllm" not in completion.choices[0].text


 @pytest.mark.asyncio
diff --git a/tests/entrypoints/openai/test_tokenization.py b/tests/entrypoints/openai/test_tokenization.py
index d33fd222ee150..64f5df50a0eaf 100644
--- a/tests/entrypoints/openai/test_tokenization.py
+++ b/tests/entrypoints/openai/test_tokenization.py
@@ -5,13 +5,15 @@
 from vllm.transformers_utils.tokenizer import get_tokenizer

 from ...utils import RemoteOpenAIServer
+from .test_completion import zephyr_lora_added_tokens_files  # noqa: F401
+from .test_completion import zephyr_lora_files  # noqa: F401

 # any model with a chat template should work here
 MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"


 @pytest.fixture(scope="module")
-def server():
+def server(zephyr_lora_added_tokens_files: str):  # noqa: F811
     with RemoteOpenAIServer([
         "--model",
         MODEL_NAME,
@@ -23,10 +25,23 @@ def server():
         "--enforce-eager",
         "--max-num-seqs",
         "128",
+        # lora config
+        "--enable-lora",
+        "--lora-modules",
+        f"zephyr-lora2={zephyr_lora_added_tokens_files}",
+        "--max-lora-rank",
+        "64",
     ]) as remote_server:
         yield remote_server


+@pytest.fixture(scope="module")
+def tokenizer_name(model_name: str,
+                   zephyr_lora_added_tokens_files: str):  # noqa: F811
+    return zephyr_lora_added_tokens_files if (
+        model_name == "zephyr-lora2") else model_name
+
+
 @pytest.fixture(scope="module")
 def client(server):
     return server.get_async_client()
@@ -34,16 +49,18 @@ def client(server):

 @pytest.mark.asyncio
 @pytest.mark.parametrize(
-    "model_name",
-    [MODEL_NAME],
+    "model_name,tokenizer_name",
+    [(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")],
+    indirect=["tokenizer_name"],
 )
 async def test_tokenize_completions(client: openai.AsyncOpenAI,
-                                    model_name: str):
+                                    model_name: str, tokenizer_name: str):
     base_url = str(client.base_url)[:-3].strip("/")
-    tokenizer = get_tokenizer(tokenizer_name=model_name, tokenizer_mode="fast")
+    tokenizer = get_tokenizer(tokenizer_name=tokenizer_name,
+                              tokenizer_mode="fast")

     for add_special in [False, True]:
-        prompt = "This is a test prompt."
+        prompt = "vllm1 This is a test prompt."
         tokens = tokenizer.encode(prompt, add_special_tokens=add_special)

         response = requests.post(base_url + "/tokenize",
@@ -63,12 +80,15 @@ async def test_tokenize_completions(client: openai.AsyncOpenAI,

 @pytest.mark.asyncio
 @pytest.mark.parametrize(
-    "model_name",
-    [MODEL_NAME],
+    "model_name,tokenizer_name",
+    [(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")],
+    indirect=["tokenizer_name"],
 )
-async def test_tokenize_chat(client: openai.AsyncOpenAI, model_name: str):
+async def test_tokenize_chat(client: openai.AsyncOpenAI, model_name: str,
+                             tokenizer_name: str):
     base_url = str(client.base_url)[:-3].strip("/")
-    tokenizer = get_tokenizer(tokenizer_name=model_name, tokenizer_mode="fast")
+    tokenizer = get_tokenizer(tokenizer_name=tokenizer_name,
+                              tokenizer_mode="fast")

     for add_generation in [False, True]:
         for add_special in [False, True]:
@@ -80,7 +100,7 @@ async def test_tokenize_chat(client: openai.AsyncOpenAI, model_name: str):
                 "content": "Nice to meet you!"
             }, {
                 "role": "user",
-                "content": "Can I ask a question?"
+                "content": "Can I ask a question? vllm1"
             }]

             prompt = tokenizer.apply_chat_template(
@@ -108,16 +128,20 @@ async def test_tokenize_chat(client: openai.AsyncOpenAI, model_name: str):

 @pytest.mark.asyncio
 @pytest.mark.parametrize(
-    "model_name",
-    [MODEL_NAME],
+    "model_name,tokenizer_name",
+    [(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")],
+    indirect=["tokenizer_name"],
 )
-async def test_detokenize(client: openai.AsyncOpenAI, model_name: str):
+async def test_detokenize(client: openai.AsyncOpenAI, model_name: str,
+                          tokenizer_name: str):
     base_url = str(client.base_url)[:-3].strip("/")
-    tokenizer = get_tokenizer(tokenizer_name=model_name, tokenizer_mode="fast")
+    tokenizer = get_tokenizer(tokenizer_name=tokenizer_name,
+                              tokenizer_mode="fast")

-    prompt = "This is a test prompt."
+    prompt = "This is a test prompt. vllm1"
     tokens = tokenizer.encode(prompt, add_special_tokens=False)

+    print(f"CALLING {base_url} FOR {model_name}")
     response = requests.post(base_url + "/detokenize",
                              json={
                                  "model": model_name,
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index a35dcbbd6545e..b6bf08e5fae60 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -257,7 +257,8 @@ def run_server(args, llm_engine=None):
     openai_serving_embedding = OpenAIServingEmbedding(engine, model_config,
                                                       served_model_names)
     openai_serving_tokenization = OpenAIServingTokenization(
-        engine, model_config, served_model_names, args.chat_template)
+        engine, model_config, served_model_names, args.lora_modules,
+        args.chat_template)
     app.root_path = args.root_path

     logger.info("Available routes are:")
diff --git a/vllm/entrypoints/openai/serving_tokenization.py b/vllm/entrypoints/openai/serving_tokenization.py
index e291064158448..28a344c2d176e 100644
--- a/vllm/entrypoints/openai/serving_tokenization.py
+++ b/vllm/entrypoints/openai/serving_tokenization.py
@@ -9,7 +9,8 @@
                                               DetokenizeResponse,
                                               TokenizeRequest,
                                               TokenizeResponse)
-from vllm.entrypoints.openai.serving_engine import OpenAIServing
+from vllm.entrypoints.openai.serving_engine import (LoRAModulePath,
+                                                    OpenAIServing)


 class OpenAIServingTokenization(OpenAIServing):
@@ -18,11 +19,12 @@ def __init__(self,
                  engine: AsyncLLMEngine,
                  model_config: ModelConfig,
                  served_model_names: List[str],
+                 lora_modules: Optional[List[LoRAModulePath]] = None,
                  chat_template: Optional[str] = None):
         super().__init__(engine=engine,
                          model_config=model_config,
                          served_model_names=served_model_names,
-                         lora_modules=None)
+                         lora_modules=lora_modules)

         # If this is None we use the tokenizer's default chat template
         self.chat_template = load_chat_template(chat_template)
diff --git a/vllm/transformers_utils/detokenizer.py b/vllm/transformers_utils/detokenizer.py
index cc9a971301afc..0a45028e7759b 100644
--- a/vllm/transformers_utils/detokenizer.py
+++ b/vllm/transformers_utils/detokenizer.py
@@ -165,6 +165,12 @@ def decode_sequence_inplace(self, seq: Sequence,
         return len(new_decoded_token_text)


+def _replace_none_with_empty(tokens: List[Optional[str]]):
+    for i, token in enumerate(tokens):
+        if token is None:
+            tokens[i] = ""
+
+
 def _convert_tokens_to_string_with_added_encoders(
         tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
         output_tokens: List[str],
@@ -223,6 +229,8 @@ def convert_prompt_ids_to_tokens(
     read_offset = len(new_tokens)
     prefix_offset = max(
         read_offset - INITIAL_INCREMENTAL_DETOKENIZATION_OFFSET, 0)
+    # This is required to guard against out-of-vocab prompt token ids
+    _replace_none_with_empty(new_tokens)
     return new_tokens, prefix_offset, read_offset