Merge branch 'v1_logprobs' into sample
afeldman-nm committed Dec 13, 2024
2 parents 1b163ab + e962aa7 · commit 7cd9a24
Showing 74 changed files with 1,528 additions and 829 deletions.
26 changes: 25 additions & 1 deletion docs/source/models/supported_models.rst
@@ -495,7 +495,7 @@ Text Generation
---------------

.. list-table::
- :widths: 25 25 15 25 5 5
+ :widths: 25 25 15 20 5 5 5
:header-rows: 1

* - Architecture
@@ -504,144 +504,168 @@ Text Generation
- Example HF Models
- :ref:`LoRA <lora>`
- :ref:`PP <distributed_serving>`
+ - V1
* - :code:`AriaForConditionalGeneration`
- Aria
- T + I
- :code:`rhymes-ai/Aria`
-
- ✅︎
+ -
* - :code:`Blip2ForConditionalGeneration`
- BLIP-2
- T + I\ :sup:`E`
- :code:`Salesforce/blip2-opt-2.7b`, :code:`Salesforce/blip2-opt-6.7b`, etc.
-
- ✅︎
+ -
* - :code:`ChameleonForConditionalGeneration`
- Chameleon
- T + I
- :code:`facebook/chameleon-7b` etc.
-
- ✅︎
+ -
* - :code:`FuyuForCausalLM`
- Fuyu
- T + I
- :code:`adept/fuyu-8b` etc.
-
- ✅︎
+ -
* - :code:`ChatGLMModel`
- GLM-4V
- T + I
- :code:`THUDM/glm-4v-9b` etc.
- ✅︎
- ✅︎
+ -
* - :code:`H2OVLChatModel`
- H2OVL
- T + I\ :sup:`E+`
- :code:`h2oai/h2ovl-mississippi-800m`, :code:`h2oai/h2ovl-mississippi-2b`, etc.
-
- ✅︎
+ -
* - :code:`Idefics3ForConditionalGeneration`
- Idefics3
- T + I
- :code:`HuggingFaceM4/Idefics3-8B-Llama3` etc.
- ✅︎
-
+ -
* - :code:`InternVLChatModel`
- InternVL 2.5, Mono-InternVL, InternVL 2.0
- T + I\ :sup:`E+`
- :code:`OpenGVLab/InternVL2_5-4B`, :code:`OpenGVLab/Mono-InternVL-2B`, :code:`OpenGVLab/InternVL2-4B`, etc.
-
- ✅︎
+ - ✅︎
* - :code:`LlavaForConditionalGeneration`
- LLaVA-1.5
- T + I\ :sup:`E+`
- :code:`llava-hf/llava-1.5-7b-hf`, :code:`TIGER-Lab/Mantis-8B-siglip-llama3` (see note), etc.
-
- ✅︎
+ - ✅︎
* - :code:`LlavaNextForConditionalGeneration`
- LLaVA-NeXT
- T + I\ :sup:`E+`
- :code:`llava-hf/llava-v1.6-mistral-7b-hf`, :code:`llava-hf/llava-v1.6-vicuna-7b-hf`, etc.
-
- ✅︎
+ -
* - :code:`LlavaNextVideoForConditionalGeneration`
- LLaVA-NeXT-Video
- T + V
- :code:`llava-hf/LLaVA-NeXT-Video-7B-hf`, etc.
-
- ✅︎
+ -
* - :code:`LlavaOnevisionForConditionalGeneration`
- LLaVA-Onevision
- T + I\ :sup:`+` + V\ :sup:`+`
- :code:`llava-hf/llava-onevision-qwen2-7b-ov-hf`, :code:`llava-hf/llava-onevision-qwen2-0.5b-ov-hf`, etc.
-
- ✅︎
+ -
* - :code:`MiniCPMV`
- MiniCPM-V
- T + I\ :sup:`E+`
- :code:`openbmb/MiniCPM-V-2` (see note), :code:`openbmb/MiniCPM-Llama3-V-2_5`, :code:`openbmb/MiniCPM-V-2_6`, etc.
- ✅︎
- ✅︎
+ -
* - :code:`MllamaForConditionalGeneration`
- Llama 3.2
- T + I\ :sup:`+`
- :code:`meta-llama/Llama-3.2-90B-Vision-Instruct`, :code:`meta-llama/Llama-3.2-11B-Vision`, etc.
-
-
+ -
* - :code:`MolmoForCausalLM`
- Molmo
- T + I
- :code:`allenai/Molmo-7B-D-0924`, :code:`allenai/Molmo-72B-0924`, etc.
-
- ✅︎
+ - ✅︎
* - :code:`NVLM_D_Model`
- NVLM-D 1.0
- T + I\ :sup:`E+`
- :code:`nvidia/NVLM-D-72B`, etc.
-
- ✅︎
+ - ✅︎
* - :code:`PaliGemmaForConditionalGeneration`
- PaliGemma
- T + I\ :sup:`E`
- :code:`google/paligemma-3b-pt-224`, :code:`google/paligemma-3b-mix-224`, etc.
-
- ✅︎
+ -
* - :code:`Phi3VForCausalLM`
- Phi-3-Vision, Phi-3.5-Vision
- T + I\ :sup:`E+`
- :code:`microsoft/Phi-3-vision-128k-instruct`, :code:`microsoft/Phi-3.5-vision-instruct` etc.
-
- ✅︎
+ - ✅︎
* - :code:`PixtralForConditionalGeneration`
- Pixtral
- T + I\ :sup:`+`
- :code:`mistralai/Pixtral-12B-2409`, :code:`mistral-community/pixtral-12b` etc.
-
- ✅︎
+ - ✅︎
* - :code:`QWenLMHeadModel`
- Qwen-VL
- T + I\ :sup:`E+`
- :code:`Qwen/Qwen-VL`, :code:`Qwen/Qwen-VL-Chat`, etc.
- ✅︎
- ✅︎
+ -
* - :code:`Qwen2AudioForConditionalGeneration`
- Qwen2-Audio
- T + A\ :sup:`+`
- :code:`Qwen/Qwen2-Audio-7B-Instruct`
-
- ✅︎
+ -
* - :code:`Qwen2VLForConditionalGeneration`
- Qwen2-VL
- T + I\ :sup:`E+` + V\ :sup:`E+`
- :code:`Qwen/Qwen2-VL-2B-Instruct`, :code:`Qwen/Qwen2-VL-7B-Instruct`, :code:`Qwen/Qwen2-VL-72B-Instruct`, etc.
- ✅︎
- ✅︎
+ -
* - :code:`UltravoxModel`
- Ultravox
- T + A\ :sup:`E+`
- :code:`fixie-ai/ultravox-v0_3`
-
- ✅︎
+ -

| :sup:`E` Pre-computed embeddings can be inputted for this modality.
| :sup:`+` Multiple items can be inputted per text prompt for this modality.
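The modality codes above correspond to vLLM's multimodal prompt format. As a minimal sketch (not part of this commit; the model name, chat template, and image path are illustrative assumptions), a "T + I" model from the table can be fed text plus an image like so:

```python
# A sketch of text + image input for a "T + I" model from the table above.
# Model name, prompt template, and image path are placeholder assumptions.
from PIL import Image
from vllm import LLM, SamplingParams

llm = LLM(model="llava-hf/llava-1.5-7b-hf")
image = Image.open("example.jpg")  # placeholder image path

outputs = llm.generate(
    {
        "prompt": "USER: <image>\nWhat is in this picture? ASSISTANT:",
        "multi_modal_data": {"image": image},
    },
    SamplingParams(max_tokens=64),
)
print(outputs[0].outputs[0].text)
```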
4 changes: 1 addition & 3 deletions examples/llm_engine_example.py
@@ -9,9 +9,7 @@ def create_test_prompts() -> List[Tuple[str, SamplingParams]]:
"""Create a list of test prompts with their sampling parameters."""
return [
("A robot may not injure a human being",
- SamplingParams(temperature=0.0,
- request_sample_logprobs=1,
- request_prompt_logprobs=1)),
+ SamplingParams(temperature=0.0, logprobs=1, prompt_logprobs=1)),
("To be or not to be,",
SamplingParams(temperature=0.8, top_k=5, presence_penalty=0.2)),
("What is the meaning of life?",
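The change above is representative of the rename applied throughout this branch: request_sample_logprobs and request_prompt_logprobs become the SamplingParams fields logprobs and prompt_logprobs. A minimal sketch of requesting both (model name and prompt are placeholders, not from this commit):

```python
# A sketch of the renamed logprobs parameters in the offline API.
from vllm import LLM, SamplingParams

llm = LLM(model="facebook/opt-125m")  # placeholder; any small model works
params = SamplingParams(
    temperature=0.0,
    logprobs=1,         # top-1 logprob for each generated token
    prompt_logprobs=1,  # top-1 logprob for each prompt token
    max_tokens=16,
)

out = llm.generate(["A robot may not injure a human being"], params)[0]
print(out.outputs[0].text)
print(out.outputs[0].logprobs)  # list of {token_id: Logprob} dicts
print(out.prompt_logprobs)      # first entry is None (no preceding context)
```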
16 changes: 8 additions & 8 deletions examples/lora_with_quantization_inference.py
@@ -22,26 +22,26 @@ def create_test_prompts(
# this is an example of using quantization without LoRA
("My name is",
SamplingParams(temperature=0.0,
- request_sample_logprobs=1,
- request_prompt_logprobs=1,
+ logprobs=1,
+ prompt_logprobs=1,
max_tokens=128), None),
# the next three examples use quantization with LoRA
("my name is",
SamplingParams(temperature=0.0,
- request_sample_logprobs=1,
- request_prompt_logprobs=1,
+ logprobs=1,
+ prompt_logprobs=1,
max_tokens=128),
LoRARequest("lora-test-1", 1, lora_path)),
("The capital of USA is",
SamplingParams(temperature=0.0,
- request_sample_logprobs=1,
- request_prompt_logprobs=1,
+ logprobs=1,
+ prompt_logprobs=1,
max_tokens=128),
LoRARequest("lora-test-2", 1, lora_path)),
("The capital of France is",
SamplingParams(temperature=0.0,
- request_sample_logprobs=1,
- request_prompt_logprobs=1,
+ logprobs=1,
+ prompt_logprobs=1,
max_tokens=128),
LoRARequest("lora-test-3", 1, lora_path)),
]
12 changes: 6 additions & 6 deletions examples/multilora_inference.py
@@ -27,8 +27,8 @@ def create_test_prompts(
return [
("A robot may not injure a human being",
SamplingParams(temperature=0.0,
- request_sample_logprobs=1,
- request_prompt_logprobs=1,
+ logprobs=1,
+ prompt_logprobs=1,
max_tokens=128), None),
("To be or not to be,",
SamplingParams(temperature=0.8,
@@ -38,16 +38,16 @@ def create_test_prompts(
(
"[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", # noqa: E501
SamplingParams(temperature=0.0,
- request_sample_logprobs=1,
- request_prompt_logprobs=1,
+ logprobs=1,
+ prompt_logprobs=1,
max_tokens=128,
stop_token_ids=[32003]),
LoRARequest("sql-lora", 1, lora_path)),
(
"[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", # noqa: E501
SamplingParams(temperature=0.0,
- request_sample_logprobs=1,
- request_prompt_logprobs=1,
+ logprobs=1,
+ prompt_logprobs=1,
max_tokens=128,
stop_token_ids=[32003]),
LoRARequest("sql-lora2", 2, lora_path)),
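The renamed parameters compose with LoRA in the same way. A sketch, assuming a LoRA-enabled base model and an adapter on disk (the model, adapter name, and path are placeholders):

```python
# A sketch combining logprobs/prompt_logprobs with a LoRA adapter.
# Base model, adapter name, and adapter path are placeholder assumptions.
from vllm import LLM, SamplingParams
from vllm.lora.request import LoRARequest

llm = LLM(model="meta-llama/Llama-2-7b-hf", enable_lora=True)
params = SamplingParams(temperature=0.0,
                        logprobs=1,
                        prompt_logprobs=1,
                        max_tokens=128)

outputs = llm.generate(
    ["The capital of France is"],
    params,
    lora_request=LoRARequest("sql-lora", 1, "/path/to/sql-lora-adapter"),
)
print(outputs[0].outputs[0].text)
```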
2 changes: 1 addition & 1 deletion requirements-common.txt
@@ -19,7 +19,7 @@ prometheus-fastapi-instrumentator >= 7.0.0
tiktoken >= 0.6.0 # Required for DBRX tokenizer
lm-format-enforcer >= 0.10.9, < 0.11
outlines >= 0.0.43, < 0.1
- xgrammar >= 0.1.5; platform_machine == "x86_64"
+ xgrammar >= 0.1.6; platform_machine == "x86_64"
typing_extensions >= 4.10
filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317
partial-json-parser # used for parsing partial JSON outputs
14 changes: 7 additions & 7 deletions tests/conftest.py
@@ -794,7 +794,7 @@ def generate_w_logprobs(
self._final_steps_generate_w_logprobs(req_outputs))
# Omit prompt logprobs if not required by sampling params
return ([x[0:-1] for x in toks_str_logsprobs_prompt_logprobs]
- if sampling_params.request_prompt_logprobs is None else
+ if sampling_params.prompt_logprobs is None else
toks_str_logsprobs_prompt_logprobs)

def generate_encoder_decoder_w_logprobs(
Expand All @@ -807,14 +807,14 @@ def generate_encoder_decoder_w_logprobs(
Logprobs generation for vLLM encoder/decoder models
'''

- assert sampling_params.request_sample_logprobs is not None
+ assert sampling_params.logprobs is not None
req_outputs = self.model.generate(encoder_decoder_prompts,
sampling_params=sampling_params)
toks_str_logsprobs_prompt_logprobs = (
self._final_steps_generate_w_logprobs(req_outputs))
# Omit prompt logprobs if not required by sampling params
return ([x[0:-1] for x in toks_str_logsprobs_prompt_logprobs]
- if sampling_params.request_prompt_logprobs is None else
+ if sampling_params.prompt_logprobs is None else
toks_str_logsprobs_prompt_logprobs)

def generate_greedy(
@@ -850,8 +850,8 @@ def generate_greedy_logprobs(
greedy_logprobs_params = SamplingParams(
temperature=0.0,
max_tokens=max_tokens,
- request_sample_logprobs=num_logprobs,
- request_prompt_logprobs=num_prompt_logprobs,
+ logprobs=num_logprobs,
+ prompt_logprobs=num_prompt_logprobs,
stop_token_ids=stop_token_ids,
stop=stop)

@@ -872,8 +872,8 @@ def generate_encoder_decoder_greedy_logprobs(
greedy_logprobs_params = SamplingParams(
temperature=0.0,
max_tokens=max_tokens,
- request_sample_logprobs=num_logprobs,
- request_prompt_logprobs=(num_prompt_logprobs),
+ logprobs=num_logprobs,
+ prompt_logprobs=(num_prompt_logprobs),
)
'''
Greedy logprobs generation for vLLM encoder/decoder models
8 changes: 4 additions & 4 deletions tests/engine/test_arg_utils.py
@@ -50,12 +50,12 @@ def test_compilation_config():
args = parser.parse_args(["-O=3"])
assert args.compilation_config.level == 3

- # set to json
- args = parser.parse_args(["--compilation-config", '{"level": 3}'])
+ # set to string form of a dict
+ args = parser.parse_args(["--compilation-config", "{'level': 3}"])
assert args.compilation_config.level == 3

- # set to json
- args = parser.parse_args(['--compilation-config={"level": 3}'])
+ # set to string form of a dict
+ args = parser.parse_args(["--compilation-config={'level': 3}"])
assert args.compilation_config.level == 3


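The updated test passes the string form of a Python dict ({'level': 3}) instead of strict JSON ({"level": 3}). As a standalone illustration of the idea (an assumption about the approach, not vLLM's actual parser), a flag value can accept both forms by falling back from json.loads to ast.literal_eval:

```python
# A standalone sketch, not vLLM's implementation: accept either a JSON object
# or a Python dict literal for a CLI flag such as --compilation-config.
import ast
import json
from typing import Any, Dict


def parse_dict_flag(value: str) -> Dict[str, Any]:
    """Parse '{"level": 3}' (JSON) or "{'level': 3}" (dict literal)."""
    try:
        return json.loads(value)
    except json.JSONDecodeError:
        parsed = ast.literal_eval(value)  # literals only, nothing is executed
        if not isinstance(parsed, dict):
            raise ValueError(f"expected a dict, got {type(parsed).__name__}")
        return parsed


assert parse_dict_flag('{"level": 3}') == {"level": 3}
assert parse_dict_flag("{'level': 3}") == {"level": 3}
```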
3 changes: 1 addition & 2 deletions tests/engine/test_skip_tokenizer_init.py
@@ -10,8 +10,7 @@ def test_skip_tokenizer_initialization(model: str):
# of tokenizer and detokenizer. The generated output is expected to contain
# token ids.
llm = LLM(model=model, skip_tokenizer_init=True)
- sampling_params = SamplingParams(request_prompt_logprobs=True,
- detokenize=True)
+ sampling_params = SamplingParams(prompt_logprobs=True, detokenize=True)

with pytest.raises(ValueError, match="cannot pass text prompts when"):
llm.generate("abc", sampling_params)
4 changes: 1 addition & 3 deletions tests/models/decoder_only/language/test_mistral.py
@@ -24,9 +24,7 @@
# "mistralai/Mistral-Nemo-Instruct-2407"
]

- SAMPLING_PARAMS = SamplingParams(max_tokens=512,
- temperature=0.0,
- request_sample_logprobs=5)
+ SAMPLING_PARAMS = SamplingParams(max_tokens=512, temperature=0.0, logprobs=5)
SYMBOLIC_LANG_PROMPTS = [
"勇敢な船乗りについての詩を書く", # japanese
"寫一首關於勇敢的水手的詩", # chinese
4 changes: 1 addition & 3 deletions tests/models/decoder_only/vision_language/test_pixtral.py
@@ -116,9 +116,7 @@ def _create_engine_inputs_hf(urls: List[str]) -> TextPrompt:
_create_engine_inputs(IMG_URLS),
]

- SAMPLING_PARAMS = SamplingParams(max_tokens=512,
- temperature=0.0,
- request_sample_logprobs=5)
+ SAMPLING_PARAMS = SamplingParams(max_tokens=512, temperature=0.0, logprobs=5)
LIMIT_MM_PER_PROMPT = dict(image=4)

MAX_MODEL_LEN = [8192, 65536]
4 changes: 2 additions & 2 deletions tests/samplers/test_logits_processor.py
@@ -29,7 +29,7 @@ def pick_vllm(token_ids, logits):

params_with_logprobs = SamplingParams(
logits_processors=[pick_vllm],
- request_prompt_logprobs=3,
+ prompt_logprobs=3,
max_tokens=max_tokens,
)

Expand All @@ -43,7 +43,7 @@ def pick_vllm(token_ids, logits):
vllm_model.model._add_request(
example_prompts[1],
params=SamplingParams(
- request_prompt_logprobs=3,
+ prompt_logprobs=3,
max_tokens=max_tokens,
),
)
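For context, pick_vllm above is a logits processor: SamplingParams.logits_processors takes callables of the form (token_ids, logits) -> logits that can rewrite the next-token distribution before sampling. A minimal sketch of one that forces a single token (the token id is a placeholder, not taken from this test):

```python
# A sketch of a logits processor that forces one token at every step.
# FORCED_TOKEN_ID is a placeholder, not taken from this test.
from typing import List

import torch
from vllm import SamplingParams

FORCED_TOKEN_ID = 42  # hypothetical token id


def force_token(token_ids: List[int], logits: torch.Tensor) -> torch.Tensor:
    # Suppress every vocabulary entry except FORCED_TOKEN_ID.
    mask = torch.full_like(logits, float("-inf"))
    mask[FORCED_TOKEN_ID] = 0.0
    return logits + mask


params = SamplingParams(
    logits_processors=[force_token],
    prompt_logprobs=3,  # also request top-3 logprobs per prompt token
    max_tokens=8,
)
```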
(Diffs for the remaining 63 of the 74 changed files are not shown.)
