Added Support for guided decoding in offline interface #4130

Closed
wants to merge 37 commits
Changes from 4 commits
Commits
37 commits
a5aeec2
first commit, added extra parameters for offline LLM, issue with work…
kevinbu233 Apr 16, 2024
dcbfc69
clean up code
kevinbu233 Apr 16, 2024
bd4d84c
add skip cleanup
simon-mo Apr 17, 2024
76c0924
Changed model and sampling parameters, cleaned up naming
kevinbu233 Apr 17, 2024
4709a98
cleaned up code and created helper functions
kevinbu233 Apr 18, 2024
aab046d
first merge resolved
kevinbu233 Apr 18, 2024
84b1442
fix format
kevinbu233 Apr 19, 2024
1d6abe4
fix merge conflict
kevinbu233 Apr 19, 2024
daa2c8f
added docstrings for sampling params guided options
kevinbu233 Apr 23, 2024
35c73ee
fix merge conflict with main
kevinbu233 Apr 23, 2024
9e454c5
Merge remote-tracking branch 'upstream/main' into yihuan_issue3536
kevinbu233 Apr 23, 2024
ef4cf6f
fixed support for multiple sampling params for LLM
kevinbu233 Apr 23, 2024
945125d
Merge remote-tracking branch 'upstream/main' into yihuan_issue3536
kevinbu233 Apr 23, 2024
1c0769d
added noqa for extra long line
kevinbu233 Apr 24, 2024
062f0bc
Update tests/entrypoints/test_local_LLM.py
simon-mo May 1, 2024
05a2512
Merge branch 'main' of github.com:vllm-project/vllm into yihuan_issue…
simon-mo May 2, 2024
438ab37
fix typing
simon-mo May 2, 2024
06c2205
fix test and more refactoring
simon-mo May 3, 2024
4158d78
use x2
simon-mo May 3, 2024
0d9e5a5
lint
simon-mo May 3, 2024
ff9ba7f
fix isort
simon-mo May 3, 2024
bbb59bf
merge with main
kevinbu233 May 27, 2024
d779f86
fixing merge issues
kevinbu233 May 30, 2024
f923677
Merge remote-tracking branch 'upstream/main' into yihuan_issue3536
kevinbu233 May 30, 2024
a15b511
fixed merge
kevinbu233 May 30, 2024
292264a
Merge remote-tracking branch 'upstream/main' into yihuan_issue3536
kevinbu233 May 30, 2024
77d42a8
format
kevinbu233 May 30, 2024
60ab6f6
finished merge first draft
kevinbu233 Jun 14, 2024
ae23772
merged main
kevinbu233 Jun 14, 2024
3fb6258
Merge remote-tracking branch 'upstream/main' into yihuan_issue3536
kevinbu233 Jun 19, 2024
75cf9a7
fixed merge conflict and fixed suggestions
kevinbu233 Jun 19, 2024
c429ef8
Merge remote-tracking branch 'upstream/main' into yihuan_issue3536
kevinbu233 Jun 21, 2024
da89c1b
Merge remote-tracking branch 'upstream/main' into yihuan_issue3536
kevinbu233 Jun 22, 2024
6c8b82a
fix test_openai error
kevinbu233 Jun 23, 2024
4e759d9
Merge remote-tracking branch 'upstream/main' into yihuan_issue3536
kevinbu233 Jun 24, 2024
4ac8abb
fixing response_format test
kevinbu233 Jun 24, 2024
2eedeba
temporary push
kevinbu233 Jun 25, 2024
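For orientation before the diffs, here is a minimal sketch of the offline guided-decoding usage this PR adds, based on the tests in tests/entrypoints/test_local_LLM.py below; the guided_options field on SamplingParams is the new hook introduced by this change, and the model name and regex are taken from those tests:

from vllm import LLM, SamplingParams

# Constrain generation to strings matching an IPv4-address regex.
ipv4_regex = (r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}"
              r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)")

llm = LLM(model="HuggingFaceH4/zephyr-7b-beta")
sampling_params = SamplingParams(temperature=0.8,
                                 top_p=0.95,
                                 guided_options=dict(guided_regex=ipv4_regex))
outputs = llm.generate(
    prompts=["Give an example IPv4 address:"],
    sampling_params=sampling_params,
)
# Every completion is constrained to match the regex, e.g. "192.168.0.1".
print(outputs[0].outputs[0].text)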
223 changes: 223 additions & 0 deletions tests/entrypoints/test_local_LLM.py
@@ -0,0 +1,223 @@
# imports for guided decoding tests
import json
import os
import re

import jsonschema
import pytest

# downloading lora to test lora requests
from huggingface_hub import snapshot_download

from vllm.entrypoints.llm import LLM
from vllm.sampling_params import SamplingParams

from vllm.transformers_utils.tokenizer import get_tokenizer
from vllm.outputs import RequestOutput

MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"

TEST_SCHEMA = {
    "type": "object",
    "properties": {
        "name": {
            "type": "string"
        },
        "age": {
            "type": "integer"
        },
        "skills": {
            "type": "array",
            "items": {
                "type": "string",
                "maxLength": 10
            },
            "minItems": 3
        },
        "work history": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "company": {
                        "type": "string"
                    },
                    "duration": {
                        "type": "string"
                    },
                    "position": {
                        "type": "string"
                    }
                },
                "required": ["company", "position"]
            }
        }
    },
    "required": ["name", "age", "skills", "work history"]
}

TEST_REGEX = (r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}"
              r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)")

TEST_CHOICE = [
    "Python", "Java", "JavaScript", "C++", "C#", "PHP", "TypeScript", "Ruby",
    "Swift", "Kotlin"
]

prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]


@pytest.fixture(scope="session")
def llm():
    return LLM(model=MODEL_NAME, max_model_len=15000)


@pytest.mark.skip_global_cleanup
def test_simple_prompts(llm):
    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
    outputs = llm.generate(
        prompts=prompts,
        sampling_params=sampling_params,
        use_tqdm=True,
    )

    assert outputs is not None
    for output in outputs:
        assert output is not None
        assert isinstance(output, RequestOutput)
        prompt = output.prompt
        generated_text = output.outputs[0].text
        assert generated_text is not None
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")


@pytest.mark.skip_global_cleanup
def test_guided_regex_(llm):
    sampling_params = SamplingParams(temperature=0.8,
                                     top_p=0.95,
                                     guided_options=dict(guided_regex=TEST_REGEX))
    outputs = llm.generate(
        prompts=[
            f"Give an example IPv4 address with this regex: {TEST_REGEX}"
        ],
        sampling_params=sampling_params,
        use_tqdm=True,
    )

    assert outputs is not None
    for output in outputs:
        assert output is not None
        assert isinstance(output, RequestOutput)
        prompt = output.prompt
        generated_text = output.outputs[0].text
        assert generated_text is not None
        assert re.fullmatch(TEST_REGEX, generated_text) is not None
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")


@pytest.mark.skip_global_cleanup
def test_guided_json_completion(llm):
    sampling_params = SamplingParams(temperature=0.8,
                                     top_p=0.95,
                                     guided_options=dict(guided_json=TEST_SCHEMA),
                                     max_tokens=1000)
    outputs = llm.generate(
        prompts=[
            f"Give an example JSON for an employee profile "
            f"that fits this schema: {TEST_SCHEMA}"
        ],
        sampling_params=sampling_params,
        use_tqdm=True,
    )

    assert outputs is not None
    print(outputs)
    for output in outputs:
        assert output is not None
        assert isinstance(output, RequestOutput)
        prompt = output.prompt

        generated_text = output.outputs[0].text
        assert generated_text is not None
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
        output_json = json.loads(generated_text)
        jsonschema.validate(instance=output_json, schema=TEST_SCHEMA)

@pytest.mark.skip_global_cleanup
def test_guided_choice_completion(llm):
    sampling_params = SamplingParams(
        temperature=0.8,
        top_p=0.95,
        guided_options=dict(guided_choice=TEST_CHOICE))
    outputs = llm.generate(
        prompts="The best language for type-safe systems programming is ",
        sampling_params=sampling_params,
        use_tqdm=True,
    )

    assert outputs is not None
    for output in outputs:
        assert output is not None
        assert isinstance(output, RequestOutput)
        prompt = output.prompt
        generated_text = output.outputs[0].text
        assert generated_text is not None
        assert generated_text in TEST_CHOICE
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")


@pytest.mark.skip_global_cleanup
def test_guided_grammar(llm):
simple_sql_grammar = """
start: select_statement

select_statement: "SELECT" column "from" table "where" condition

column: "col_1" | "col_2"
table: "table_1" | "table_2"
condition: column "=" number

number: "1" | "2"
"""

    sampling_params = SamplingParams(
        temperature=0.8,
        top_p=0.95,
        guided_options=dict(guided_grammar=simple_sql_grammar))
    outputs = llm.generate(
        prompts=("Generate a sql state that select col_1 from "
                 "table_1 where it is equals to 1"),
        sampling_params=sampling_params,
        use_tqdm=True,
    )

    assert outputs is not None
    for output in outputs:
        assert output is not None
        assert isinstance(output, RequestOutput)
        prompt = output.prompt

        generated_text = output.outputs[0].text
        assert generated_text is not None

        # use Lark to parse the output, and make sure it's a valid parse tree
        from lark import Lark
        parser = Lark(simple_sql_grammar)
        parser.parse(generated_text)

        # remove spaces for comparison b/c we removed them in the grammar
        ground_truth = "SELECT col_1 from table_1 where col_1 = 1".replace(
            " ", "")

        assert generated_text.strip() == ground_truth

        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")


if __name__ == "__main__":
    pytest.main([__file__])
5 changes: 3 additions & 2 deletions vllm/core/block_manager_v1.py
@@ -1,9 +1,10 @@
"""A block manager that manages token blocks."""
from abc import ABC, abstractmethod
from collections.abc import Sequence as GenericSequence
from itertools import count, takewhile
from os.path import commonprefix
from typing import Dict, List, Optional, Set
from typing import Dict, List, Optional
from typing import Sequence as GenericSequence
from typing import Set

from vllm.block import BlockTable, PhysicalTokenBlock
from vllm.core.evictor import EvictionPolicy, Evictor, make_evictor
3 changes: 1 addition & 2 deletions vllm/core/block_manager_v2.py
@@ -1,7 +1,6 @@
"""A block manager that manages token blocks."""
from collections.abc import Sequence as GenericSequence
from typing import Dict, List, Optional

from typing import Sequence as GenericSequence
from vllm.core.block.block_table import BlockTable
from vllm.core.block.cpu_gpu_block_allocator import CpuGpuBlockAllocator
from vllm.core.interfaces import AllocStatus, BlockSpaceManager
3 changes: 2 additions & 1 deletion vllm/core/interfaces.py
@@ -1,7 +1,8 @@
from __future__ import annotations
import enum
from abc import ABC, abstractmethod
from collections.abc import Sequence as GenericSequence
from typing import Dict, List
from typing import Sequence as GenericSequence

from vllm.sequence import Sequence, SequenceGroup

9 changes: 9 additions & 0 deletions vllm/entrypoints/llm.py
@@ -13,6 +13,8 @@
from vllm.usage.usage_lib import UsageContext
from vllm.utils import Counter

from vllm.model_executor.guided_decoding import get_local_guided_decoding_logits_processor


class LLM:
"""An LLM for generating texts from given prompts and sampling parameters.
@@ -176,10 +178,17 @@ def generate(
        assert prompt_token_ids is not None
        num_requests = len(prompt_token_ids)

        guided_decode_logits_processor = get_local_guided_decoding_logits_processor(sampling_params, self.get_tokenizer())
        if guided_decode_logits_processor:
            if sampling_params.logits_processors is None:
                sampling_params.logits_processors = []
            sampling_params.logits_processors.append(
                guided_decode_logits_processor)
        for i in range(num_requests):
            prompt = prompts[i] if prompts is not None else None
            token_ids = None if prompt_token_ids is None else prompt_token_ids[
                i]

            self._add_request(
                prompt,
                sampling_params,
57 changes: 56 additions & 1 deletion vllm/model_executor/guided_decoding.py
@@ -5,7 +5,7 @@
from functools import lru_cache
from json import dumps as json_dumps
from re import escape as regex_escape
from typing import Tuple, Union
from typing import Tuple, Union, Dict

from pydantic import BaseModel
from transformers import PreTrainedTokenizerBase
@@ -16,6 +16,7 @@
JSONLogitsProcessor,
RegexLogitsProcessor)

from vllm.sampling_params import SamplingParams

class GuidedDecodingMode(Enum):
JSON = "json"
@@ -82,6 +83,60 @@ async def get_guided_decoding_logits_processor(
    logits_processor.init_state()
    return logits_processor

def get_local_guided_decoding_logits_processor(sampling_params, tokenizer):
    """
    Given guided decoding options from SamplingParams, check for guided
    decoding parameters and get the necessary logits processor for the guide.
    We cache logit processors by (guide, tokenizer), and on cache hit
    we make a shallow copy to reuse the same underlying FSM.
    """
    # global global_thread_pool

    guide, mode = _get_guide_and_mode_from_sampling_params(sampling_params.guided_options)
    if not guide:
        return None

    # if global_thread_pool is None:
    #     global_thread_pool = concurrent.futures.ThreadPoolExecutor(
    #         max_workers=2)

    result = _get_cached_logits_processor(guide, tokenizer, mode)

    logits_processor = copy(result)
    # reset logits processor's internal state
    logits_processor.init_state()
    return logits_processor


def _get_guide_and_mode_from_sampling_params(
        guided_options: Dict[str, str]
) -> Tuple[str, GuidedDecodingMode]:
    if not guided_options:
        return None, None

    if "guided_json" in guided_options:
        json = guided_options["guided_json"]
        if isinstance(json, dict):
            # turn dict into hashable string
            json = json_dumps(json)
        elif isinstance(json, BaseModel):
            # use pydantic signature so that different model classes
            # with the same fields will get hashed the same
            json = str(json.__signature__)
        return json, GuidedDecodingMode.JSON
    elif "guided_regex" in guided_options:
        return guided_options["guided_regex"], GuidedDecodingMode.REGEX
    elif "guided_choice" in guided_options:
        # choice just uses regex
        choices = [
            regex_escape(str(choice)) for choice in guided_options["guided_choice"]
        ]
        choices_regex = "(" + "|".join(choices) + ")"
        return choices_regex, GuidedDecodingMode.CHOICE
    elif "guided_grammar" in guided_options:
        return guided_options["guided_grammar"], GuidedDecodingMode.GRAMMAR
    else:
        return None, None

def _get_guide_and_mode(
        request: Union[CompletionRequest, ChatCompletionRequest]
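As a standalone illustration (not part of the diff) of the caching strategy described in the docstring above — compile a logits processor once per (guide, tokenizer) pair, then hand each request a shallow copy so the expensive FSM is reused while per-request state is reset — here is a hedged sketch with a hypothetical ToyProcessor standing in for the outlines-backed processors:

from copy import copy
from functools import lru_cache


class ToyProcessor:
    """Hypothetical stand-in for JSONLogitsProcessor / RegexLogitsProcessor."""

    def __init__(self, guide: str):
        self.fsm = {"guide": guide}  # imagine an expensively compiled FSM here
        self.state = None

    def init_state(self):
        self.state = 0  # per-request decoding state


@lru_cache(maxsize=32)
def _cached_processor(guide: str) -> ToyProcessor:
    # Expensive step: runs once per distinct guide (vLLM also keys on the tokenizer).
    return ToyProcessor(guide)


def get_processor(guide: str) -> ToyProcessor:
    # Cheap step per request: the shallow copy shares the compiled FSM,
    # while init_state() resets only the per-request bookkeeping.
    processor = copy(_cached_processor(guide))
    processor.init_state()
    return processor


a, b = get_processor("a|b"), get_processor("a|b")
assert a is not b      # each request gets its own processor object
assert a.fsm is b.fsm  # but the compiled FSM is shared via the cache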
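Similarly, to make the guided_choice branch above concrete — each choice is regex-escaped and the escaped choices are joined into a single alternation, which then drives the same regex-guided path — a small worked example (standalone, not part of the diff):

from re import escape as regex_escape

choices = ["C++", "C#", "Python"]
choices_regex = "(" + "|".join(regex_escape(str(c)) for c in choices) + ")"
print(choices_regex)  # -> (C\+\+|C\#|Python)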