[Frontend] Separate pooling APIs in offline inference #11129

Merged (19 commits) on Dec 13, 2024

Changes from 8 commits
7 changes: 5 additions & 2 deletions .buildkite/test-pipeline.yaml
@@ -181,14 +181,14 @@ steps:
commands:
- VLLM_USE_V1=1 pytest -v -s v1

- label: Examples Test # 15min
- label: Examples Test # 20min
working_dir: "/vllm-workspace/examples"
#mirror_hardwares: [amd]
source_file_dependencies:
- vllm/entrypoints
- examples/
commands:
- pip install awscli tensorizer # for llava example and tensorizer test
- pip install tensorizer # for tensorizer test
- python3 offline_inference.py
- python3 cpu_offload.py
- python3 offline_inference_chat.py
@@ -198,6 +198,9 @@ steps:
- python3 offline_inference_vision_language_multi_image.py
- python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 offline_inference_encoder_decoder.py
- python3 offline_inference_classification.py
- python3 offline_inference_embedding.py
- python3 offline_inference_scoring.py
- python3 offline_profile.py --model facebook/opt-125m

- label: Prefix Caching Test # 9min
53 changes: 45 additions & 8 deletions docs/source/models/pooling_models.rst
@@ -6,7 +6,7 @@ Pooling Models
vLLM also supports pooling models, including embedding, reranking and reward models.

In vLLM, pooling models implement the :class:`~vllm.model_executor.models.VllmModelForPooling` interface.
These models use a :class:`~vllm.model_executor.layers.Pooler` to aggregate the final hidden states of the input
These models use a :class:`~vllm.model_executor.layers.Pooler` to extract the final hidden states of the input
before returning them.

.. note::
@@ -45,20 +45,48 @@ which takes priority over both the model's and Sentence Transformers's defaults.
^^^^^^^^^^^^^^

The :class:`~vllm.LLM.encode` method is available to all pooling models in vLLM.
It returns the aggregated hidden states directly.
It returns the extracted hidden states directly, which is useful for reward models.

.. code-block:: python

llm = LLM(model="Qwen/Qwen2.5-Math-RM-72B", task="reward")
output, = llm.encode("Hello, my name is")

data = output.outputs.data
print(f"Prompt: {prompt!r} | Data: {data!r}")

``LLM.embed``
^^^^^^^^^^^^^

The :class:`~vllm.LLM.embed` method outputs an embedding vector for each prompt.
It is primarily designed for embedding models.

.. code-block:: python

llm = LLM(model="intfloat/e5-mistral-7b-instruct", task="embed")
outputs = llm.encode("Hello, my name is")
output, = llm.embed("Hello, my name is")

outputs = model.encode(prompts)
for output in outputs:
embeddings = output.outputs.embedding
print(f"Prompt: {prompt!r}, Embeddings (size={len(embeddings)}: {embeddings!r}")
embeds = output.outputs.embedding
print(f"Embeddings: {embeds!r} (size={len(embeds)})")

A code example can be found in `examples/offline_inference_embedding.py <https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_embedding.py>`_.

``LLM.classify``
^^^^^^^^^^^^^^^^

The :class:`~vllm.LLM.classify` method outputs a probability vector for each prompt.
It is primarily designed for classification models.

.. code-block:: python

llm = LLM(model="jason9693/Qwen2.5-1.5B-apeach", task="classify")
output, = llm.classify("Hello, my name is")

probs = output.outputs.probs
print(f"Class Probabilities: {probs!r} (size={len(probs)})")

A code example can be found in `examples/offline_inference_classification.py <https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_classification.py>`_.

``LLM.score``
^^^^^^^^^^^^^

@@ -71,7 +99,16 @@ These types of models serve as rerankers between candidate query-document pairs
vLLM can only perform the model inference component (e.g. embedding, reranking) of RAG.
To handle RAG at a higher level, you should use integration frameworks such as `LangChain <https://github.com/langchain-ai/langchain>`_.

You can use `these tests <https://github.com/vllm-project/vllm/blob/main/tests/models/embedding/language/test_scoring.py>`_ as reference.
.. code-block:: python

llm = LLM(model="BAAI/bge-reranker-v2-m3", task="score")
output, = llm.score("What is the capital of France?",
"The capital of Brazil is Brasilia.")

probs = output.outputs.probs
print(f"Scores: {probs!r} (size={len(probs)})")

A code example can be found in `examples/offline_inference_scoring.py <https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_scoring.py>`_.

Online Inference
----------------
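The docs hunk above describes the Pooler as extracting the final hidden states of the input before returning them. The snippet below is a minimal, illustrative sketch of that idea (last-token pooling over a packed batch), assuming hidden states arrive as a [total_tokens, hidden_size] tensor; it is not vLLM's actual Pooler implementation.

# Illustrative only: a toy last-token pooler, not vLLM's Pooler class.
# Assumes hidden_states is [total_tokens, hidden_size] for a packed batch
# and prompt_lens gives the token count of each prompt, in order.
from typing import List

import torch


def last_token_pool(hidden_states: torch.Tensor,
                    prompt_lens: List[int]) -> torch.Tensor:
    """Extract the final hidden state of each prompt in a packed batch."""
    # Index of the last token of each prompt within the packed tensor.
    last_indices = torch.cumsum(torch.tensor(prompt_lens), dim=0) - 1
    return hidden_states[last_indices]


# Example: 3 prompts of lengths 4, 2 and 5 packed into one tensor.
hidden = torch.randn(11, 8)
pooled = last_token_pool(hidden, [4, 2, 5])
print(pooled.shape)  # torch.Size([3, 8])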
28 changes: 28 additions & 0 deletions examples/offline_inference_classification.py
@@ -0,0 +1,28 @@
from vllm import LLM

# Sample prompts.
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]

# Create an LLM.
# You should pass task="classify" for classification models
model = LLM(
model="jason9693/Qwen2.5-1.5B-apeach",
task="classify",
enforce_eager=True,
)

# Generate class probabilities. The output is a list of ClassificationRequestOutputs.
outputs = model.classify(prompts)

# Print the outputs.
for prompt, output in zip(prompts, outputs):
probs = output.outputs.probs
probs_trimmed = ((str(probs[:16])[:-1] +
", ...]") if len(probs) > 16 else probs)
print(f"Prompt: {prompt!r} | "
f"Class Probabilities: {probs_trimmed} (size={len(probs)})")
16 changes: 11 additions & 5 deletions examples/offline_inference_embedding.py
@@ -9,14 +9,20 @@
]

# Create an LLM.
# You should pass task="embed" for embedding models
model = LLM(
model="intfloat/e5-mistral-7b-instruct",
task="embed", # You should pass task="embed" for embedding models
task="embed",
enforce_eager=True,
)

# Generate embedding. The output is a list of PoolingRequestOutputs.
outputs = model.encode(prompts)
# Generate embedding. The output is a list of EmbeddingRequestOutputs.
outputs = model.embed(prompts)

# Print the outputs.
for output in outputs:
print(output.outputs.embedding) # list of 4096 floats
for prompt, output in zip(prompts, outputs):
embeds = output.outputs.embedding
embeds_trimmed = ((str(embeds[:16])[:-1] +
", ...]") if len(embeds) > 16 else embeds)
print(f"Prompt: {prompt!r} | "
f"Embeddings: {embeds_trimmed} (size={len(embeds)})")
26 changes: 26 additions & 0 deletions examples/offline_inference_scoring.py
@@ -0,0 +1,26 @@
from vllm import LLM

# Sample prompts.
text_1 = "What is the capital of France?"
texts_2 = [
"The capital of Brazil is Brasilia.", "The capital of France is Paris."
]

# Create an LLM.
# You should pass task="score" for cross-encoder models
model = LLM(
model="BAAI/bge-reranker-v2-m3",
task="score",
enforce_eager=True,
)

# Generate scores. The output is a list of ClassificationRequestOutputs.
outputs = model.score(text_1, texts_2)

# Print the outputs.
for text_2, output in zip(texts_2, outputs):
scores = output.outputs.probs
scores_trimmed = ((str(scores[:16])[:-1] +
", ...]") if len(scores) > 16 else scores)
print(f"Pair: {[text_1, text_2]!r} | "
f"Scores: {scores_trimmed} (size={len(scores)})")
2 changes: 1 addition & 1 deletion examples/offline_inference_vision_language_embedding.py
@@ -133,7 +133,7 @@ def run_encode(model: str, modality: QueryModality):
if req_data.image is not None:
mm_data["image"] = req_data.image

outputs = req_data.llm.encode({
outputs = req_data.llm.embed({
"prompt": req_data.prompt,
"multi_modal_data": mm_data,
})
16 changes: 6 additions & 10 deletions tests/conftest.py
@@ -719,14 +719,6 @@ def get_inputs(

return inputs

def classify(self, prompts: List[str]) -> List[str]:
req_outputs = self.model.encode(prompts)
outputs = []
for req_output in req_outputs:
embedding = req_output.outputs.embedding
outputs.append(embedding)
return outputs

def generate(
self,
prompts: List[str],
@@ -897,6 +889,10 @@ def generate_beam_search(
returned_outputs.append((token_ids, texts))
return returned_outputs

def classify(self, prompts: List[str]) -> List[List[float]]:
req_outputs = self.model.classify(prompts)
return [req_output.outputs.probs for req_output in req_outputs]

def encode(
self,
prompts: List[str],
@@ -909,7 +905,7 @@ def encode(
videos=videos,
audios=audios)

req_outputs = self.model.encode(inputs)
req_outputs = self.model.embed(inputs)
return [req_output.outputs.embedding for req_output in req_outputs]

def score(
@@ -918,7 +914,7 @@
text_2: Union[str, List[str]],
) -> List[List[float]]:
req_outputs = self.model.score(text_1, text_2)
return [req_output.outputs.embedding for req_output in req_outputs]
return [req_output.outputs.probs for req_output in req_outputs]

def __enter__(self):
return self
5 changes: 2 additions & 3 deletions tests/models/test_oot_registration.py
@@ -2,7 +2,7 @@

import pytest

from vllm import LLM, PoolingParams, SamplingParams
from vllm import LLM, SamplingParams
from vllm.assets.image import ImageAsset

from ..utils import fork_new_process_for_each_test
@@ -36,9 +36,8 @@ def test_oot_registration_text_generation(dummy_opt_path):
def test_oot_registration_embedding(dummy_gemma2_embedding_path):
os.environ["VLLM_PLUGINS"] = "register_dummy_model"
prompts = ["Hello, my name is", "The text does not matter"]
sampling_params = PoolingParams()
llm = LLM(model=dummy_gemma2_embedding_path, load_format="dummy")
outputs = llm.encode(prompts, sampling_params)
outputs = llm.embed(prompts)

for output in outputs:
assert all(v == 0 for v in output.outputs.embedding)
31 changes: 7 additions & 24 deletions vllm/__init__.py
@@ -7,7 +7,9 @@
from vllm.executor.ray_utils import initialize_ray_cluster
from vllm.inputs import PromptType, TextPrompt, TokensPrompt
from vllm.model_executor.models import ModelRegistry
from vllm.outputs import (CompletionOutput, PoolingOutput,
from vllm.outputs import (ClassificationOutput, ClassificationRequestOutput,
CompletionOutput, EmbeddingOutput,
EmbeddingRequestOutput, PoolingOutput,
PoolingRequestOutput, RequestOutput)
from vllm.pooling_params import PoolingParams
from vllm.sampling_params import SamplingParams
@@ -27,33 +29,14 @@
"CompletionOutput",
"PoolingOutput",
"PoolingRequestOutput",
"EmbeddingOutput",
"EmbeddingRequestOutput",
"ClassificationOutput",
"ClassificationRequestOutput",
"LLMEngine",
"EngineArgs",
"AsyncLLMEngine",
"AsyncEngineArgs",
"initialize_ray_cluster",
"PoolingParams",
]


def __getattr__(name: str):
import warnings

if name == "EmbeddingOutput":
msg = ("EmbeddingOutput has been renamed to PoolingOutput. "
"The original name will be removed in an upcoming version.")

warnings.warn(DeprecationWarning(msg), stacklevel=2)

return PoolingOutput

if name == "EmbeddingRequestOutput":
msg = ("EmbeddingRequestOutput has been renamed to "
"PoolingRequestOutput. "
"The original name will be removed in an upcoming version.")

warnings.warn(DeprecationWarning(msg), stacklevel=2)

return PoolingRequestOutput

raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
17 changes: 8 additions & 9 deletions vllm/engine/llm_engine.py
@@ -46,11 +46,10 @@
from vllm.pooling_params import PoolingParams
from vllm.prompt_adapter.request import PromptAdapterRequest
from vllm.sampling_params import RequestOutputKind, SamplingParams
from vllm.sequence import (EmbeddingSequenceGroupOutput, ExecuteModelRequest,
ParallelSampleSequenceGroup, Sequence,
SequenceGroup, SequenceGroupBase,
SequenceGroupMetadata, SequenceGroupOutput,
SequenceStatus)
from vllm.sequence import (ExecuteModelRequest, ParallelSampleSequenceGroup,
PoolingSequenceGroupOutput, Sequence, SequenceGroup,
SequenceGroupBase, SequenceGroupMetadata,
SequenceGroupOutput, SequenceStatus)
from vllm.tracing import (SpanAttributes, SpanKind, extract_trace_context,
init_tracer)
from vllm.transformers_utils.config import try_get_generation_config
@@ -966,9 +965,9 @@ def has_unfinished_requests_for_virtual_engine(
@staticmethod
def _process_sequence_group_outputs(
seq_group: SequenceGroup,
outputs: List[EmbeddingSequenceGroupOutput],
outputs: List[PoolingSequenceGroupOutput],
) -> None:
seq_group.embeddings = outputs[0].embeddings
seq_group.pooled_data = outputs[0].data

for seq in seq_group.get_seqs():
seq.status = SequenceStatus.FINISHED_STOPPED
@@ -1784,8 +1783,8 @@ def _get_stats(self,
num_prompt_tokens_iter)
# Spec decode, if enabled, emits specialized metrics from the worker in
# sampler output.
if model_output and (model_output[0].spec_decode_worker_metrics
is not None):
if model_output and isinstance(model_output[0], SamplerOutput) and (
model_output[0].spec_decode_worker_metrics is not None):
Comment on lines +1786 to +1787 by @DarkLight1337 (Member, Author), Dec 12, 2024:

Since spec decode isn't applicable to pooling models, I have removed spec_decode_worker_metrics from PoolerOutput. The type annotation that model_output is a list of SamplerOutputs is actually incorrect here (it can be a list of PoolerOutput), but I haven't bothered to fix it since we will probably rework this in V1 anyway.

spec_decode_metrics = model_output[0].spec_decode_worker_metrics
else:
spec_decode_metrics = None
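As a minimal, self-contained illustration of the isinstance-based narrowing discussed in the comment above, the sketch below uses simplified SamplerOut/PoolerOut stand-ins (not vLLM's real SamplerOutput/PoolerOutput classes): only sampler outputs can carry spec decode metrics, so the attribute is accessed only after the type check.

# Illustrative sketch of isinstance-based narrowing; the classes here are
# simplified stand-ins, not vLLM's actual output types.
from dataclasses import dataclass
from typing import List, Optional, Union


@dataclass
class SamplerOut:
    spec_decode_worker_metrics: Optional[str] = None


@dataclass
class PoolerOut:
    data: Optional[list] = None  # pooled hidden states; no spec decode fields


def get_spec_decode_metrics(
        model_output: List[Union[SamplerOut, PoolerOut]]) -> Optional[str]:
    # Only sampler outputs can carry spec decode metrics; pooling models
    # never produce them, so narrow the type before touching the attribute.
    if model_output and isinstance(model_output[0], SamplerOut) and (
            model_output[0].spec_decode_worker_metrics is not None):
        return model_output[0].spec_decode_worker_metrics
    return None


print(get_spec_decode_metrics([SamplerOut("draft-acceptance-rate: 0.8")]))
print(get_spec_decode_metrics([PoolerOut([0.1, 0.9])]))  # None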