From bc177d348b298474c99ccb88752ac73191b4a2d4 Mon Sep 17 00:00:00 2001
From: Joe G
Date: Sat, 13 Jul 2024 13:36:55 -0700
Subject: [PATCH 1/7] Add prompt adapters to openai entrypoint tests

---
 tests/entrypoints/openai/test_completion.py | 48 +++++++++++++--------
 1 file changed, 31 insertions(+), 17 deletions(-)

diff --git a/tests/entrypoints/openai/test_completion.py b/tests/entrypoints/openai/test_completion.py
index 6e5fdebe786e1..7d3d4fd4b4e97 100644
--- a/tests/entrypoints/openai/test_completion.py
+++ b/tests/entrypoints/openai/test_completion.py
@@ -17,9 +17,10 @@
 # any model with a chat template should work here
 MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
-# technically this needs Mistral-7B-v0.1 as base, but we're not testing
-# generation quality here
+# technically these adapters use a different base model,
+# but we're not testing generation quality here
 LORA_NAME = "typeof/zephyr-7b-beta-lora"
+PA_NAME = "swapnilbp/llama_tweet_ptune"


 @pytest.fixture(scope="module")
@@ -28,7 +29,12 @@ def zephyr_lora_files():


 @pytest.fixture(scope="module")
-def server(zephyr_lora_files):
+def zephyr_pa_files():
+    return snapshot_download(repo_id=PA_NAME)
+
+
+@pytest.fixture(scope="module")
+def server(zephyr_lora_files, zephyr_pa_files):
     with RemoteOpenAIServer([
         "--model",
         MODEL_NAME,
@@ -37,8 +43,10 @@ def server(zephyr_lora_files):
         "bfloat16",
         "--max-model-len",
         "8192",
+        "--max-num-seqs"
+        "128",
         "--enforce-eager",
-        # lora config below
+        # lora config
         "--enable-lora",
         "--lora-modules",
         f"zephyr-lora={zephyr_lora_files}",
@@ -47,7 +55,14 @@ def server(zephyr_lora_files):
         "64",
         "--max-cpu-loras",
         "2",
-        "--max-num-seqs",
+        # pa config
+        "--enable-prompt-adapter",
+        "--prompt-adapters",
+        f"zephyr-pa={zephyr_pa_files}",
+        f"zephyr-pa2={zephyr_pa_files}",
+        "--max-prompt-adapters",
+        "2",
+        "--max-prompt-adapter-token",
         "128",
     ]) as remote_server:
         yield remote_server
@@ -60,9 +75,9 @@ def client(server):

 @pytest.mark.asyncio
 @pytest.mark.parametrize(
-    # first test base model, then test loras
+    # first test base model, then test loras, then test prompt adapters
     "model_name",
-    [MODEL_NAME, "zephyr-lora", "zephyr-lora2"],
+    [MODEL_NAME, "zephyr-lora", "zephyr-lora2", "zephyr-pa", "zephyr-pa2"],
 )
 async def test_single_completion(client: openai.AsyncOpenAI, model_name: str):
     completion = await client.completions.create(model=model_name,
@@ -91,9 +106,9 @@ async def test_single_completion(client: openai.AsyncOpenAI, model_name: str):

 @pytest.mark.asyncio
 @pytest.mark.parametrize(
-    # first test base model, then test loras
+    # first test base model, then test loras, then test prompt adapters
     "model_name",
-    [MODEL_NAME, "zephyr-lora", "zephyr-lora2"],
+    [MODEL_NAME, "zephyr-lora", "zephyr-lora2", "zephyr-pa", "zephyr-pa2"],
 )
 async def test_no_logprobs(client: openai.AsyncOpenAI, model_name: str):
     # test using token IDs
@@ -110,9 +125,9 @@ async def test_no_logprobs(client: openai.AsyncOpenAI, model_name: str):

 @pytest.mark.asyncio
 @pytest.mark.parametrize(
-    # just test 1 lora hereafter
+    # just test 1 lora and 1 pa hereafter
     "model_name",
-    [MODEL_NAME, "zephyr-lora"],
+    [MODEL_NAME, "zephyr-lora", "zephyr-pa"],
 )
 async def test_zero_logprobs(client: openai.AsyncOpenAI, model_name: str):
     # test using token IDs
@@ -133,7 +148,7 @@ async def test_zero_logprobs(client: openai.AsyncOpenAI, model_name: str):
 @pytest.mark.asyncio
 @pytest.mark.parametrize(
     "model_name",
-    [MODEL_NAME, "zephyr-lora"],
+    [MODEL_NAME, "zephyr-lora", "zephyr-pa"],
 )
 async def test_some_logprobs(client: openai.AsyncOpenAI, model_name: str):
     # test using token IDs
@@ -154,7 +169,7 @@ async def test_some_logprobs(client: openai.AsyncOpenAI, model_name: str):
 @pytest.mark.asyncio
 @pytest.mark.parametrize(
     "model_name",
-    [MODEL_NAME, "zephyr-lora"],
+    [MODEL_NAME, "zephyr-lora", "zephyr-pa"],
 )
 async def test_too_many_completion_logprobs(client: openai.AsyncOpenAI,
                                             model_name: str):
@@ -199,7 +214,7 @@ async def test_too_many_completion_logprobs(client: openai.AsyncOpenAI,
 @pytest.mark.asyncio
 @pytest.mark.parametrize(
     "model_name",
-    [MODEL_NAME, "zephyr-lora"],
+    [MODEL_NAME, "zephyr-lora", "zephyr-pa"],
 )
 async def test_completion_streaming(client: openai.AsyncOpenAI,
                                     model_name: str):
@@ -233,7 +248,7 @@ async def test_completion_streaming(client: openai.AsyncOpenAI,
 @pytest.mark.asyncio
 @pytest.mark.parametrize(
     "model_name",
-    ["HuggingFaceH4/zephyr-7b-beta", "zephyr-lora"],
+    [MODEL_NAME, "zephyr-lora", "zephyr-pa"],
 )
 async def test_completion_stream_options(client: openai.AsyncOpenAI,
                                          model_name: str):
@@ -369,9 +384,8 @@ async def test_completion_stream_options(client: openai.AsyncOpenAI,

 @pytest.mark.asyncio
 @pytest.mark.parametrize(
-    # just test 1 lora hereafter
     "model_name",
-    [MODEL_NAME, "zephyr-lora"],
+    [MODEL_NAME, "zephyr-lora", "zephyr-pa"],
 )
 async def test_batch_completions(client: openai.AsyncOpenAI, model_name: str):
     # test both text and token IDs
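Patch 1 registers two prompt adapters, "zephyr-pa" and "zephyr-pa2", alongside the existing LoRA modules, so the completion tests can select them purely by model name. Below is a minimal sketch of the kind of request the new parametrizations exercise, assuming the fixture's server is reachable at a local base_url; the address and placeholder API key are assumptions, not part of the patch.

import asyncio

import openai


async def main() -> None:
    # Assumed local endpoint for the RemoteOpenAIServer started by the fixture;
    # the fixture does not pass --api-key, so any placeholder key works.
    client = openai.AsyncOpenAI(base_url="http://localhost:8000/v1",
                                api_key="EMPTY")
    # "zephyr-pa" is one of the names registered via --prompt-adapters.
    completion = await client.completions.create(model="zephyr-pa",
                                                 prompt="Hello, my name is",
                                                 max_tokens=5,
                                                 temperature=0.0)
    print(completion.choices[0].text)


asyncio.run(main())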
From a9c0e441da8dff070c786201378b1770fd79a0cd Mon Sep 17 00:00:00 2001
From: Joe G
Date: Sat, 13 Jul 2024 16:19:16 -0700
Subject: [PATCH 2/7] Fix model_name args and typo

---
 tests/entrypoints/openai/test_completion.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/tests/entrypoints/openai/test_completion.py b/tests/entrypoints/openai/test_completion.py
index 7d3d4fd4b4e97..00de9bf39e6e2 100644
--- a/tests/entrypoints/openai/test_completion.py
+++ b/tests/entrypoints/openai/test_completion.py
@@ -43,7 +43,7 @@ def server(zephyr_lora_files, zephyr_pa_files):
         "bfloat16",
         "--max-model-len",
         "8192",
-        "--max-num-seqs"
+        "--max-num-seqs",
         "128",
         "--enforce-eager",
         # lora config
@@ -96,7 +96,7 @@ async def test_single_completion(client: openai.AsyncOpenAI, model_name: str):

     # test using token IDs
     completion = await client.completions.create(
-        model=MODEL_NAME,
+        model=model_name,
         prompt=[0, 0, 0, 0, 0],
         max_tokens=5,
         temperature=0.0,
@@ -113,7 +113,7 @@ async def test_no_logprobs(client: openai.AsyncOpenAI, model_name: str):
     # test using token IDs
     completion = await client.completions.create(
-        model=MODEL_NAME,
+        model=model_name,
         prompt=[0, 0, 0, 0, 0],
         max_tokens=5,
         temperature=0.0,
@@ -132,7 +132,7 @@ async def test_zero_logprobs(client: openai.AsyncOpenAI, model_name: str):
     # test using token IDs
     completion = await client.completions.create(
-        model=MODEL_NAME,
+        model=model_name,
         prompt=[0, 0, 0, 0, 0],
         max_tokens=5,
         temperature=0.0,
@@ -153,7 +153,7 @@ async def test_some_logprobs(client: openai.AsyncOpenAI, model_name: str):
     # test using token IDs
     completion = await client.completions.create(
-        model=MODEL_NAME,
+        model=model_name,
         prompt=[0, 0, 0, 0, 0],
         max_tokens=5,
         temperature=0.0,
@@ -177,7 +177,7 @@ async def test_too_many_completion_logprobs(client: openai.AsyncOpenAI,
     with pytest.raises(
        (openai.BadRequestError, openai.APIError)):  # test using token IDs
         await client.completions.create(
-            model=MODEL_NAME,
+            model=model_name,
             prompt=[0, 0, 0, 0, 0],
             max_tokens=5,
             temperature=0.0,
@@ -189,7 +189,7 @@ async def test_too_many_completion_logprobs(client: openai.AsyncOpenAI,
     with pytest.raises(
        (openai.BadRequestError, openai.APIError)):  # test using token IDs
         stream = await client.completions.create(
-            model=MODEL_NAME,
+            model=model_name,
             prompt=[0, 0, 0, 0, 0],
             max_tokens=5,
             temperature=0.0,
@@ -637,7 +637,7 @@ async def test_guided_decoding_type_error(client: openai.AsyncOpenAI,
 )
 async def test_tokenize(client: openai.AsyncOpenAI, model_name: str):
     base_url = str(client.base_url)[:-3].strip("/")
-    tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME, tokenizer_mode="fast")
+    tokenizer = get_tokenizer(tokenizer_name=model_name, tokenizer_mode="fast")

     for add_special in [False, True]:
         prompt = "This is a test prompt."
@@ -664,7 +664,7 @@ async def test_tokenize(client: openai.AsyncOpenAI, model_name: str):
 )
 async def test_detokenize(client: openai.AsyncOpenAI, model_name: str):
     base_url = str(client.base_url)[:-3]
-    tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME, tokenizer_mode="fast")
+    tokenizer = get_tokenizer(tokenizer_name=model_name, tokenizer_mode="fast")

     prompt = "This is a test prompt."
     tokens = tokenizer.encode(prompt, add_special_tokens=False)
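Patch 2 is small but load-bearing: it restores the comma after "--max-num-seqs" (without it, Python's adjacent string concatenation collapses the two arguments into the single flag "--max-num-seqs128"), and it switches the token-ID requests to the parametrized model_name, since the hardcoded MODEL_NAME meant those calls never went through the LoRA or prompt-adapter paths. A hedged way to sanity-check which names the server actually exposes, using the same assumed base_url as above; whether prompt adapters appear in the model list alongside the LoRA modules depends on the server's model registry:

import asyncio

import openai


async def main() -> None:
    client = openai.AsyncOpenAI(base_url="http://localhost:8000/v1",
                                api_key="EMPTY")
    models = await client.models.list()
    # Expect the base model plus the registered adapter names,
    # e.g. zephyr-lora, zephyr-lora2, zephyr-pa, zephyr-pa2.
    print([model.id for model in models.data])


asyncio.run(main())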
From 9082eefce461aeeeaad2062f77ddcc0ffcc3792e Mon Sep 17 00:00:00 2001
From: Joe G
Date: Sun, 14 Jul 2024 23:01:11 -0700
Subject: [PATCH 3/7] Use abs path when loading adapter config

---
 vllm/entrypoints/openai/serving_engine.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py
index 58e6571d310e6..47bbcc37115ca 100644
--- a/vllm/entrypoints/openai/serving_engine.py
+++ b/vllm/entrypoints/openai/serving_engine.py
@@ -1,4 +1,5 @@
 import json
+import pathlib
 from dataclasses import dataclass
 from http import HTTPStatus
 from typing import Any, Dict, List, Optional, Tuple, Union
@@ -74,8 +75,8 @@ def __init__(
         self.prompt_adapter_requests = []
         if prompt_adapters is not None:
             for i, prompt_adapter in enumerate(prompt_adapters, start=1):
-                with open(f"./{prompt_adapter.local_path}"
-                          f"/adapter_config.json") as f:
+                with open(pathlib.Path(
+                        prompt_adapter.local_path).resolve()) as f:
                     adapter_config = json.load(f)
                     num_virtual_tokens = adapter_config["num_virtual_tokens"]
                     self.prompt_adapter_requests.append(

From 8503d2eb055331286db6518e8f5aec386b25c35b Mon Sep 17 00:00:00 2001
From: Joe G
Date: Mon, 15 Jul 2024 05:08:15 -0700
Subject: [PATCH 4/7] Fix missing adapter_config.json

- Switched to pathlib open
---
 vllm/entrypoints/openai/serving_engine.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py
index 47bbcc37115ca..14c1df89e064f 100644
--- a/vllm/entrypoints/openai/serving_engine.py
+++ b/vllm/entrypoints/openai/serving_engine.py
@@ -75,8 +75,8 @@ def __init__(
         self.prompt_adapter_requests = []
         if prompt_adapters is not None:
             for i, prompt_adapter in enumerate(prompt_adapters, start=1):
-                with open(pathlib.Path(
-                        prompt_adapter.local_path).resolve()) as f:
+                with pathlib.Path(prompt_adapter.local_path,
+                                  "adapter_config.json").open() as f:
                     adapter_config = json.load(f)
                     num_virtual_tokens = adapter_config["num_virtual_tokens"]
                     self.prompt_adapter_requests.append(
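Patches 3 and 4 are two attempts at the same problem: locating adapter_config.json regardless of the server's working directory. The original f-string built a relative "./<path>/adapter_config.json", patch 3's resolve() call appears to have pointed at the adapter directory rather than the file inside it, and patch 4 finally joins the directory with the file name before calling open(). A standalone sketch of the final pattern follows; the helper name and example path are illustrative, not part of the patch:

import json
import pathlib


def read_num_virtual_tokens(local_path: str) -> int:
    # Join the downloaded adapter directory with adapter_config.json and read
    # the PEFT prompt-tuning field that sizes the virtual token prefix.
    config_path = pathlib.Path(local_path, "adapter_config.json")
    with config_path.open() as f:
        adapter_config = json.load(f)
    return adapter_config["num_virtual_tokens"]


# Example call (the path is a placeholder for wherever snapshot_download
# put the adapter):
# read_num_virtual_tokens("/path/to/llama_tweet_ptune")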
From 6b03c49e70f5eaf1127714e93f7f6de73cb93bd2 Mon Sep 17 00:00:00 2001
From: Joe G
Date: Mon, 15 Jul 2024 07:11:14 -0700
Subject: [PATCH 5/7] Account for virtual tokens in token counts

---
 tests/entrypoints/openai/test_completion.py | 22 ++++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/tests/entrypoints/openai/test_completion.py b/tests/entrypoints/openai/test_completion.py
index 00de9bf39e6e2..5e215b25bcee6 100644
--- a/tests/entrypoints/openai/test_completion.py
+++ b/tests/entrypoints/openai/test_completion.py
@@ -1,5 +1,6 @@
 # imports for guided decoding tests
 import json
+import pathlib
 import re
 from typing import List

@@ -33,6 +34,14 @@ def zephyr_pa_files():
     return snapshot_download(repo_id=PA_NAME)


+@pytest.fixture(scope="module")
+def zephyr_pa_num_virtual_tokens(zephyr_pa_files):
+    with pathlib.Path(zephyr_pa_files, "adapter_config.json").open() as f:
+        adapter_config = json.load(f)
+        num_virtual_tokens = adapter_config["num_virtual_tokens"]
+    return num_virtual_tokens
+
+
 @pytest.fixture(scope="module")
 def server(zephyr_lora_files, zephyr_pa_files):
     with RemoteOpenAIServer([
@@ -76,10 +85,13 @@ def client(server):
 @pytest.mark.asyncio
 @pytest.mark.parametrize(
     # first test base model, then test loras, then test prompt adapters
-    "model_name",
-    [MODEL_NAME, "zephyr-lora", "zephyr-lora2", "zephyr-pa", "zephyr-pa2"],
+    "model_name,num_virtual_tokens",
+    [(MODEL_NAME, 0), ("zephyr-lora", 0), ("zephyr-lora2", 0),
+     ("zephyr-pa", zephyr_pa_num_virtual_tokens),
+     ("zephyr-pa2", zephyr_pa_num_virtual_tokens)],
 )
-async def test_single_completion(client: openai.AsyncOpenAI, model_name: str):
+async def test_single_completion(client: openai.AsyncOpenAI, model_name: str,
+                                 num_virtual_tokens: int):
     completion = await client.completions.create(model=model_name,
                                                  prompt="Hello, my name is",
                                                  max_tokens=5,
@@ -92,7 +104,9 @@ async def test_single_completion(client: openai.AsyncOpenAI, model_name: str,
     assert len(choice.text) >= 5
     assert choice.finish_reason == "length"
     assert completion.usage == openai.types.CompletionUsage(
-        completion_tokens=5, prompt_tokens=6, total_tokens=11)
+        completion_tokens=5,
+        prompt_tokens=6 + num_virtual_tokens,
+        total_tokens=11 + num_virtual_tokens)

     # test using token IDs
     completion = await client.completions.create(
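Patch 5 captures the accounting rule the prompt-adapter tests need: the virtual tokens the adapter prepends are counted by the server as prompt tokens, so the expected usage is the visible prompt length plus num_virtual_tokens while completion_tokens is unchanged. A tiny worked check of the values the assertion expects, assuming the 8 virtual tokens that patch 6 later pins down:

# The test's base-model expectation is 6 prompt tokens for "Hello, my name is"
# and 5 completion tokens; a prompt adapter with 8 virtual tokens shifts the
# prompt and total counts by 8.
visible_prompt_tokens = 6
completion_tokens = 5
num_virtual_tokens = 8  # adapter_config["num_virtual_tokens"], later hardcoded as PA_NUM_VIRTUAL_TOKENS

prompt_tokens = visible_prompt_tokens + num_virtual_tokens   # 14
total_tokens = prompt_tokens + completion_tokens             # 19
assert (prompt_tokens, total_tokens) == (6 + 8, 11 + 8)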
From 6d1e428d8aefefe7d04ffad14f7a3583bcc03963 Mon Sep 17 00:00:00 2001
From: Joe G
Date: Mon, 15 Jul 2024 10:54:43 -0700
Subject: [PATCH 6/7] Hardcode virtual token count for tests

---
 tests/entrypoints/openai/test_completion.py | 16 +++++-----------
 1 file changed, 5 insertions(+), 11 deletions(-)

diff --git a/tests/entrypoints/openai/test_completion.py b/tests/entrypoints/openai/test_completion.py
index 5e215b25bcee6..955a7a7a98dcd 100644
--- a/tests/entrypoints/openai/test_completion.py
+++ b/tests/entrypoints/openai/test_completion.py
@@ -1,6 +1,5 @@
 # imports for guided decoding tests
 import json
-import pathlib
 import re
 from typing import List

@@ -22,6 +21,9 @@
 # but we're not testing generation quality here
 LORA_NAME = "typeof/zephyr-7b-beta-lora"
 PA_NAME = "swapnilbp/llama_tweet_ptune"
+# if PA_NAME changes, PA_NUM_VIRTUAL_TOKENS might also
+# need to change to match the prompt adapter
+PA_NUM_VIRTUAL_TOKENS = 8


 @pytest.fixture(scope="module")
@@ -34,14 +36,6 @@ def zephyr_pa_files():
     return snapshot_download(repo_id=PA_NAME)


-@pytest.fixture(scope="module")
-def zephyr_pa_num_virtual_tokens(zephyr_pa_files):
-    with pathlib.Path(zephyr_pa_files, "adapter_config.json").open() as f:
-        adapter_config = json.load(f)
-        num_virtual_tokens = adapter_config["num_virtual_tokens"]
-    return num_virtual_tokens
-
-
 @pytest.fixture(scope="module")
 def server(zephyr_lora_files, zephyr_pa_files):
     with RemoteOpenAIServer([
@@ -87,8 +81,8 @@ def client(server):
     # first test base model, then test loras, then test prompt adapters
     "model_name,num_virtual_tokens",
     [(MODEL_NAME, 0), ("zephyr-lora", 0), ("zephyr-lora2", 0),
-     ("zephyr-pa", zephyr_pa_num_virtual_tokens),
-     ("zephyr-pa2", zephyr_pa_num_virtual_tokens)],
+     ("zephyr-pa", PA_NUM_VIRTUAL_TOKENS),
+     ("zephyr-pa2", PA_NUM_VIRTUAL_TOKENS)],
 )
 async def test_single_completion(client: openai.AsyncOpenAI, model_name: str,
                                  num_virtual_tokens: int):

From a265a7176d347127a012e5bbe0e5038c99f17823 Mon Sep 17 00:00:00 2001
From: Joe G
Date: Mon, 15 Jul 2024 12:13:43 -0700
Subject: [PATCH 7/7] Don't assume len over 5 when all token ids are 0

---
 tests/entrypoints/openai/test_completion.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/entrypoints/openai/test_completion.py b/tests/entrypoints/openai/test_completion.py
index 955a7a7a98dcd..f9dbf69c2eaab 100644
--- a/tests/entrypoints/openai/test_completion.py
+++ b/tests/entrypoints/openai/test_completion.py
@@ -109,7 +109,7 @@ async def test_single_completion(client: openai.AsyncOpenAI, model_name: str,
         max_tokens=5,
         temperature=0.0,
     )
-    assert len(completion.choices[0].text) >= 5
+    assert len(completion.choices[0].text) >= 1


 @pytest.mark.asyncio
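Patch 6 swaps the zephyr_pa_num_virtual_tokens fixture for a hardcoded PA_NUM_VIRTUAL_TOKENS, presumably because a fixture name referenced inside @pytest.mark.parametrize is not resolved at collection time: the parameter would carry the fixture function object rather than the value 8. If reading the count from the downloaded adapter were still wanted, one alternative is to resolve the fixture lazily inside the test via request.getfixturevalue. The sketch below assumes the fixture from patch 5 is kept and reuses the existing client fixture; patch 7's looser length assertion applies here too, since an all-zero token prompt gives no guarantee about how many characters the generated tokens decode to.

import openai
import pytest


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", ["zephyr-pa", "zephyr-pa2"])
async def test_pa_usage_sketch(client: openai.AsyncOpenAI, request,
                               model_name: str):
    # Resolve the module-scoped fixture at runtime instead of naming it in the
    # parametrize decorator, which would only see the function object.
    num_virtual_tokens = request.getfixturevalue("zephyr_pa_num_virtual_tokens")
    completion = await client.completions.create(model=model_name,
                                                 prompt="Hello, my name is",
                                                 max_tokens=5,
                                                 temperature=0.0)
    assert completion.usage.prompt_tokens == 6 + num_virtual_tokens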