Commit abdd2f9
Fixes and test updates
njhill committed Jul 18, 2024
1 parent 197f2cb commit abdd2f9
Showing 6 changed files with 75 additions and 40 deletions.
13 changes: 4 additions & 9 deletions tests/entrypoints/openai/test_chat.py
@@ -7,11 +7,11 @@
 import openai  # use the official client for correctness check
 import pytest
 import torch
-# downloading lora to test lora requests
-from huggingface_hub import snapshot_download
 from openai import BadRequestError

 from ...utils import RemoteOpenAIServer
+from .test_completion import zephyr_lora_added_tokens_files  # noqa: F401
+from .test_completion import zephyr_lora_files  # noqa: F401

 # any model with a chat template should work here
 MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
@@ -21,12 +21,7 @@


 @pytest.fixture(scope="module")
-def zephyr_lora_files():
-    return snapshot_download(repo_id=LORA_NAME)
-
-
-@pytest.fixture(scope="module")
-def server(zephyr_lora_files):
+def server(zephyr_lora_files, zephyr_lora_added_tokens_files):  # noqa: F811
     with RemoteOpenAIServer([
         "--model",
         MODEL_NAME,
@@ -40,7 +35,7 @@ def server(zephyr_lora_files):
         "--enable-lora",
         "--lora-modules",
         f"zephyr-lora={zephyr_lora_files}",
-        f"zephyr-lora2={zephyr_lora_files}",
+        f"zephyr-lora2={zephyr_lora_added_tokens_files}",
         "--max-lora-rank",
         "64",
         "--max-cpu-loras",
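Note: the "# noqa: F401" / "# noqa: F811" comments above come from pytest's import-based fixture sharing: importing a fixture function from another test module registers that fixture with pytest in the importing module, even though linters flag the name as unused (F401) or as shadowing the fixture parameter (F811). A minimal single-file sketch of the same pattern, with hypothetical names that are not part of this commit:

import pytest


@pytest.fixture(scope="module")
def lora_like_files():
    # Stand-in for zephyr_lora_files / zephyr_lora_added_tokens_files,
    # which in the real tests live in test_completion.py.
    return "/tmp/fake-lora-dir"


# In a separate consumer module (e.g. test_chat.py) the provider fixture is
# pulled in with an import that exists only for pytest's benefit:
#     from .test_completion import zephyr_lora_files  # noqa: F401
def test_consumes_shared_fixture(lora_like_files):
    # pytest injects the fixture by argument name, whether it was defined
    # locally or imported from another test module.
    assert lora_like_files.startswith("/tmp/")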
29 changes: 17 additions & 12 deletions tests/entrypoints/openai/test_completion.py
@@ -39,10 +39,12 @@ def zephyr_lora_added_tokens_files(zephyr_lora_files):
     tmp_model_dir = f"{tmp_dir.name}/zephyr"
     shutil.copytree(zephyr_lora_files, tmp_model_dir)
     tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
     # Copy tokenizer to adapter and add some unique tokens
     # 32000, 32001, 32002
-    tokenizer.add_tokens(["vllm1", "vllm2", "vllm3"], special_tokens=True)
+    added = tokenizer.add_tokens(["vllm1", "vllm2", "vllm3"],
+                                 special_tokens=True)
+    assert added == 3
     tokenizer.save_pretrained(tmp_model_dir)
     #TODO added_embeddings.safetensors?
     yield tmp_model_dir
     tmp_dir.cleanup()

@@ -134,23 +136,26 @@ async def test_added_lora_tokens(client: openai.AsyncOpenAI):
     completion = await client.completions.create(
         model="zephyr-lora2",
         prompt=[0, 0, 32000, 32001, 32002],
+        echo=True,
         max_tokens=5,
         temperature=0.0,
     )
-    assert len(completion.choices[0].text) >= 1
+    # Added tokens should appear in tokenized prompt
+    assert completion.choices[0].text.startswith("<unk><unk>vllm1vllm2vllm3")


 @pytest.mark.asyncio
 async def test_added_lora_tokens_base_model(client: openai.AsyncOpenAI):
-    with pytest.raises(
-            (openai.BadRequestError, openai.APIError)):  # test using token IDs
-        completion = await client.completions.create(
-            model=MODEL_NAME,
-            prompt=[0, 0, 32000, 32001, 32002],
-            max_tokens=5,
-            temperature=0.0,
-        )
-    assert len(completion.choices[0].text) >= 1
+    # test using token IDs
+    completion = await client.completions.create(
+        model=MODEL_NAME,
+        prompt=[0, 0, 32000, 32001, 32002],
+        echo=True,
+        max_tokens=5,
+        temperature=0.0,
+    )
+    # Added tokens should not appear in tokenized prompt
+    assert "vllm" not in completion.choices[0].text


 @pytest.mark.asyncio
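For context on the assertions above: the adapter fixture extends the base Zephyr tokenizer, whose vocabulary ends at 32000, so the three added tokens land at IDs 32000-32002 and only the adapter's tokenizer can decode them. A sketch of that asymmetry, not part of the commit (it downloads the tokenizer; the expected outputs in the comments follow the assumptions made by the tests above):

from transformers import AutoTokenizer

MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"

base = AutoTokenizer.from_pretrained(MODEL_NAME)
adapter = AutoTokenizer.from_pretrained(MODEL_NAME)
added = adapter.add_tokens(["vllm1", "vllm2", "vllm3"], special_tokens=True)
assert added == 3

# The adapter tokenizer resolves the new IDs to the added tokens...
print(adapter.convert_ids_to_tokens([32000, 32001, 32002]))  # ['vllm1', 'vllm2', 'vllm3']
# ...while for the base (fast) tokenizer the same IDs are out of vocabulary and
# come back as None -- the case the detokenizer change below guards against.
print(base.convert_ids_to_tokens([32000, 32001, 32002]))     # [None, None, None]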
56 changes: 40 additions & 16 deletions tests/entrypoints/openai/test_tokenization.py
@@ -5,13 +5,15 @@
 from vllm.transformers_utils.tokenizer import get_tokenizer

 from ...utils import RemoteOpenAIServer
+from .test_completion import zephyr_lora_added_tokens_files  # noqa: F401
+from .test_completion import zephyr_lora_files  # noqa: F401

 # any model with a chat template should work here
 MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"


 @pytest.fixture(scope="module")
-def server():
+def server(zephyr_lora_added_tokens_files: str):  # noqa: F811
     with RemoteOpenAIServer([
         "--model",
         MODEL_NAME,
@@ -23,27 +25,42 @@ def server():
         "--enforce-eager",
         "--max-num-seqs",
         "128",
+        # lora config
+        "--enable-lora",
+        "--lora-modules",
+        f"zephyr-lora2={zephyr_lora_added_tokens_files}",
+        "--max-lora-rank",
+        "64",
     ]) as remote_server:
         yield remote_server


+@pytest.fixture(scope="module")
+def tokenizer_name(model_name: str,
+                   zephyr_lora_added_tokens_files: str):  # noqa: F811
+    return zephyr_lora_added_tokens_files if (
+        model_name == "zephyr-lora2") else model_name
+
+
 @pytest.fixture(scope="module")
 def client(server):
     return server.get_async_client()


 @pytest.mark.asyncio
 @pytest.mark.parametrize(
-    "model_name",
-    [MODEL_NAME],
+    "model_name,tokenizer_name",
+    [(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")],
+    indirect=["tokenizer_name"],
 )
 async def test_tokenize_completions(client: openai.AsyncOpenAI,
-                                    model_name: str):
+                                    model_name: str, tokenizer_name: str):
     base_url = str(client.base_url)[:-3].strip("/")
-    tokenizer = get_tokenizer(tokenizer_name=model_name, tokenizer_mode="fast")
+    tokenizer = get_tokenizer(tokenizer_name=tokenizer_name,
+                              tokenizer_mode="fast")

     for add_special in [False, True]:
-        prompt = "This is a test prompt."
+        prompt = "vllm1 This is a test prompt."
         tokens = tokenizer.encode(prompt, add_special_tokens=add_special)

         response = requests.post(base_url + "/tokenize",
@@ -63,12 +80,15 @@ async def test_tokenize_completions(client: openai.AsyncOpenAI,

 @pytest.mark.asyncio
 @pytest.mark.parametrize(
-    "model_name",
-    [MODEL_NAME],
+    "model_name,tokenizer_name",
+    [(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")],
+    indirect=["tokenizer_name"],
 )
-async def test_tokenize_chat(client: openai.AsyncOpenAI, model_name: str):
+async def test_tokenize_chat(client: openai.AsyncOpenAI, model_name: str,
+                             tokenizer_name: str):
     base_url = str(client.base_url)[:-3].strip("/")
-    tokenizer = get_tokenizer(tokenizer_name=model_name, tokenizer_mode="fast")
+    tokenizer = get_tokenizer(tokenizer_name=tokenizer_name,
+                              tokenizer_mode="fast")

     for add_generation in [False, True]:
         for add_special in [False, True]:
@@ -80,7 +100,7 @@ async def test_tokenize_chat(client: openai.AsyncOpenAI, model_name: str):
                 "content": "Nice to meet you!"
             }, {
                 "role": "user",
-                "content": "Can I ask a question?"
+                "content": "Can I ask a question? vllm1"
             }]

             prompt = tokenizer.apply_chat_template(
@@ -108,16 +128,20 @@ async def test_tokenize_chat(client: openai.AsyncOpenAI, model_name: str):

 @pytest.mark.asyncio
 @pytest.mark.parametrize(
-    "model_name",
-    [MODEL_NAME],
+    "model_name,tokenizer_name",
+    [(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")],
+    indirect=["tokenizer_name"],
 )
-async def test_detokenize(client: openai.AsyncOpenAI, model_name: str):
+async def test_detokenize(client: openai.AsyncOpenAI, model_name: str,
+                          tokenizer_name: str):
     base_url = str(client.base_url)[:-3].strip("/")
-    tokenizer = get_tokenizer(tokenizer_name=model_name, tokenizer_mode="fast")
+    tokenizer = get_tokenizer(tokenizer_name=tokenizer_name,
+                              tokenizer_mode="fast")

-    prompt = "This is a test prompt."
+    prompt = "This is a test prompt. vllm1"
     tokens = tokenizer.encode(prompt, add_special_tokens=False)

+    print(f"CALLING {base_url} FOR {model_name}")
     response = requests.post(base_url + "/detokenize",
                              json={
                                  "model": model_name,
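The endpoints exercised above can also be hit directly. A rough sketch against a locally running server started with the same flags as the fixture; the localhost URL is assumed, and the JSON and response field names follow the tests and vLLM's TokenizeRequest/DetokenizeRequest protocol at this commit, so verify them against the server in use:

import requests

base_url = "http://localhost:8000"  # assumed address of the RemoteOpenAIServer

# Tokenize with the adapter's tokenizer, so "vllm1" maps to its added-token ID.
resp = requests.post(base_url + "/tokenize",
                     json={
                         "model": "zephyr-lora2",
                         "prompt": "vllm1 This is a test prompt.",
                         "add_special_tokens": False,
                     })
token_ids = resp.json()["tokens"]

# Round-trip the IDs through /detokenize using the same adapter name.
resp = requests.post(base_url + "/detokenize",
                     json={
                         "model": "zephyr-lora2",
                         "tokens": token_ids,
                     })
print(resp.json()["prompt"])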
3 changes: 2 additions & 1 deletion vllm/entrypoints/openai/api_server.py
@@ -257,7 +257,8 @@ def run_server(args, llm_engine=None):
     openai_serving_embedding = OpenAIServingEmbedding(engine, model_config,
                                                       served_model_names)
     openai_serving_tokenization = OpenAIServingTokenization(
-        engine, model_config, served_model_names, args.chat_template)
+        engine, model_config, served_model_names, args.lora_modules,
+        args.chat_template)
     app.root_path = args.root_path

     logger.info("Available routes are:")
6 changes: 4 additions & 2 deletions vllm/entrypoints/openai/serving_tokenization.py
@@ -9,7 +9,8 @@
                                               DetokenizeResponse,
                                               TokenizeRequest,
                                               TokenizeResponse)
-from vllm.entrypoints.openai.serving_engine import OpenAIServing
+from vllm.entrypoints.openai.serving_engine import (LoRAModulePath,
+                                                    OpenAIServing)


 class OpenAIServingTokenization(OpenAIServing):
@@ -18,11 +19,12 @@ def __init__(self,
                  engine: AsyncLLMEngine,
                  model_config: ModelConfig,
                  served_model_names: List[str],
+                 lora_modules: Optional[List[LoRAModulePath]] = None,
                  chat_template: Optional[str] = None):
         super().__init__(engine=engine,
                          model_config=model_config,
                          served_model_names=served_model_names,
-                         lora_modules=None)
+                         lora_modules=lora_modules)

         # If this is None we use the tokenizer's default chat template
         self.chat_template = load_chat_template(chat_template)
8 changes: 8 additions & 0 deletions vllm/transformers_utils/detokenizer.py
@@ -165,6 +165,12 @@ def decode_sequence_inplace(self, seq: Sequence,
         return len(new_decoded_token_text)


+def _replace_none_with_empty(tokens: List[Optional[str]]):
+    for i, token in enumerate(tokens):
+        if token is None:
+            tokens[i] = ""
+
+
 def _convert_tokens_to_string_with_added_encoders(
     tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
     output_tokens: List[str],
@@ -223,6 +229,8 @@ def convert_prompt_ids_to_tokens(
     read_offset = len(new_tokens)
     prefix_offset = max(
         read_offset - INITIAL_INCREMENTAL_DETOKENIZATION_OFFSET, 0)
+    # This is required to guard against out-of-vocab prompt token ids
+    _replace_none_with_empty(new_tokens)
     return new_tokens, prefix_offset, read_offset
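A small standalone sketch of the new guard: a fast tokenizer's convert_ids_to_tokens returns None for prompt token IDs outside its vocabulary (for example, the adapter-only IDs above sent to the base model), and the helper normalizes those entries to empty strings before incremental detokenization. The helper body is copied from the diff; the sample data is illustrative:

from typing import List, Optional


def _replace_none_with_empty(tokens: List[Optional[str]]):
    for i, token in enumerate(tokens):
        if token is None:
            tokens[i] = ""


# What convert_prompt_ids_to_tokens might see for prompt [0, 0, 32000, 32001, 32002]
# on the base model: <unk> for ID 0, None for the out-of-vocab adapter IDs.
new_tokens: List[Optional[str]] = ["<unk>", "<unk>", None, None, None]
_replace_none_with_empty(new_tokens)
assert new_tokens == ["<unk>", "<unk>", "", "", ""]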
