[Doc] Update docs to refer to pooling models (vllm-project#11093)
Signed-off-by: DarkLight1337 <[email protected]>
DarkLight1337 authored and weilong.yu committed Dec 13, 2024
1 parent 8737cde commit 36153a1
Showing 14 changed files with 26 additions and 21 deletions.
7 changes: 6 additions & 1 deletion docs/source/usage/faq.rst
@@ -11,7 +11,12 @@ A: Assuming that you're referring to using OpenAI compatible server to serve mul

Q: Which model to use for offline inference embedding?

A: If you want to use an embedding model, try: https://huggingface.co/intfloat/e5-mistral-7b-instruct. Instead models, such as Llama-3-8b, Mistral-7B-Instruct-v0.3, are generation models rather than an embedding model
A: You can try `e5-mistral-7b-instruct <https://huggingface.co/intfloat/e5-mistral-7b-instruct>`__ and `BAAI/bge-base-en-v1.5 <https://huggingface.co/BAAI/bge-base-en-v1.5>`__;
more are listed :ref:`here <supported_models>`.

By extracting hidden states, vLLM can automatically convert text generation models like `Llama-3-8B <https://huggingface.co/meta-llama/Meta-Llama-3-8B>`__,
`Mistral-7B-Instruct-v0.3 <https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3>`__ into embedding models,
but they are expected to be inferior to models that are specifically trained on embedding tasks.
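
As a quick reference, a minimal offline sketch along these lines (the ``task`` value and the ``outputs.embedding`` field name are assumptions and may differ between vLLM versions):

.. code-block:: python

   from vllm import LLM

   # Load a dedicated embedding model; the task name may be "embedding" or
   # "embed" depending on the vLLM version.
   llm = LLM(model="intfloat/e5-mistral-7b-instruct", task="embedding")

   prompts = ["What is the capital of France?", "Paris is the capital of France."]
   outputs = llm.encode(prompts)

   for prompt, output in zip(prompts, outputs):
       # The pooled vector; in some versions this field is named differently
       # (e.g. ``output.outputs.data``).
       embedding = output.outputs.embedding
       print(f"{prompt!r} -> vector of dimension {len(embedding)}")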

----------------------------------------

2 changes: 1 addition & 1 deletion vllm/attention/backends/placeholder_attn.py
@@ -14,7 +14,7 @@
from vllm.worker.model_runner import (ModelInputForGPUBuilder,
ModelInputForGPUWithSamplingMetadata)

# Placeholder attention backend for models like Mamba and embedding models that
# Placeholder attention backend for models like Mamba and pooling models that
# lack attention.


8 changes: 4 additions & 4 deletions vllm/config.py
@@ -152,7 +152,7 @@ class ModelConfig:
this argument will be used to configure the neuron config that
can not be gathered from the vllm arguments.
override_pooler_config: Initialize non default pooling config or
override default pooling config for the embedding model.
override default pooling config for the pooling model.
"""

def __init__(
@@ -576,7 +576,7 @@ def verify_async_output_proc(self, parallel_config, speculative_config,
self.use_async_output_proc = False
return

# Async postprocessor is not necessary with embedding mode
# Async postprocessor is not necessary for pooling models
# since there is no token generation
if self.runner_type == "pooling":
self.use_async_output_proc = False
@@ -1825,11 +1825,11 @@ class MultiModalConfig:

@dataclass
class PoolerConfig:
"""Controls the behavior of output pooling in embedding models."""
"""Controls the behavior of output pooling in pooling models."""

pooling_type: Optional[str] = None
"""
The pooling method of the embedding model. This should be a key in
The pooling method of the pooling model. This should be a key in
:class:`vllm.model_executor.layers.pooler.PoolingType`.
"""

2 changes: 1 addition & 1 deletion vllm/core/placeholder_block_space_manager.py
@@ -8,7 +8,7 @@
class PlaceholderBlockSpaceManager(BlockSpaceManager):
"""A version of BlockSpaceManager for use in environments
where block management is not required.
For example: embedding models or attention-free models like Mamba.
For example: pooling models or attention-free models like Mamba.
This class provides the same interface as BlockSpaceManager, but its
methods perform no actions or return simple values like True in specific
4 changes: 2 additions & 2 deletions vllm/engine/arg_utils.py
@@ -893,7 +893,7 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
'--override-pooler-config',
type=PoolerConfig.from_json,
default=None,
help="Override or set the pooling method in the embedding model. "
help="Override or set the pooling method for pooling models. "
"e.g. {\"pooling_type\": \"mean\", \"normalize\": false}.'")

parser.add_argument('--compilation-config',
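
For reference, a rough sketch of what the flag does with its JSON argument (PoolerConfig.from_json is taken from the argparse wiring above; the accepted keys are assumed to mirror the PoolerConfig fields):

from vllm.config import PoolerConfig

# Equivalent of passing:
#   --override-pooler-config '{"pooling_type": "mean", "normalize": false}'
cfg = PoolerConfig.from_json('{"pooling_type": "mean", "normalize": false}')
print(cfg.pooling_type, cfg.normalize)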
@@ -1085,7 +1085,7 @@ def create_engine_config(self,
"setting --max-model-len to a smaller value.", max_model_len)
elif (self.enable_chunked_prefill
and model_config.runner_type == "pooling"):
msg = "Chunked prefill is not supported for embedding models"
msg = "Chunked prefill is not supported for pooling models"
raise ValueError(msg)


2 changes: 1 addition & 1 deletion vllm/engine/async_llm_engine.py
@@ -1085,7 +1085,7 @@ async def encode(
trace_headers: Optional[Mapping[str, str]] = None,
priority: int = 0,
) -> AsyncGenerator[PoolingRequestOutput, None]:
"""Generate outputs for a request from an embedding model.
"""Generate outputs for a request from a pooling model.
Generate outputs for a request. This method is a coroutine. It adds the
request into the waiting queue of the LLMEngine and streams the outputs
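
A rough usage sketch for this coroutine (the prompt/pooling_params/request_id argument order and the task name are assumptions inferred from this signature and may vary between vLLM versions):

import asyncio

from vllm import AsyncEngineArgs, AsyncLLMEngine, PoolingParams


async def main():
    # Build an async engine around a pooling model.
    engine = AsyncLLMEngine.from_engine_args(
        AsyncEngineArgs(model="intfloat/e5-mistral-7b-instruct", task="embedding"))

    # encode() is an async generator; the last yielded item is the finished
    # PoolingRequestOutput for the request.
    final = None
    async for output in engine.encode("Hello, world!", PoolingParams(),
                                      request_id="emb-0"):
        final = output
    print(final)


asyncio.run(main())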
2 changes: 1 addition & 1 deletion vllm/engine/multiprocessing/client.py
@@ -527,7 +527,7 @@ def encode(
*,
inputs: Optional[PromptType] = None # DEPRECATED
) -> AsyncGenerator[PoolingRequestOutput, None]:
"""Generate outputs for a request from an embedding model.
"""Generate outputs for a request from a pooling model.
Generate outputs for a request. This method is a coroutine. It adds the
request into the waiting queue of the LLMEngine and streams the outputs
2 changes: 1 addition & 1 deletion vllm/engine/protocol.py
@@ -209,7 +209,7 @@ def encode(
trace_headers: Optional[Mapping[str, str]] = None,
priority: int = 0,
) -> AsyncGenerator[PoolingRequestOutput, None]:
"""Generate outputs for a request from an embedding model."""
"""Generate outputs for a request from a pooling model."""
...

@abstractmethod
2 changes: 1 addition & 1 deletion vllm/entrypoints/openai/serving_score.py
@@ -119,7 +119,7 @@ async def create_score(

if prompt_adapter_request is not None:
raise NotImplementedError("Prompt adapter is not supported "
"for embedding models")
"for scoring models")

if isinstance(tokenizer, MistralTokenizer):
raise ValueError(
6 changes: 3 additions & 3 deletions vllm/sequence.py
@@ -618,9 +618,9 @@ class SequenceGroup:
arrival_time: The arrival time of the request.
lora_request: LoRA request.
embeddings: The embeddings vectors of the prompt of the sequence group
for an embedding model.
for a pooling model.
pooling_params: The pooling parameters used to generate the pooling
for an embedding model.
for a pooling model.
encoder_seq: Optional, the single encoder sequence. Should be None
unless you are working with an encoder/decoder model.
trace_headers: OpenTelemetry trace headers.
@@ -1102,7 +1102,7 @@ class PoolerOutput(
msgspec.Struct,
omit_defaults=True, # type: ignore[call-arg]
array_like=True): # type: ignore[call-arg]
"""The output from a pooling operation in the embedding model."""
"""The output from a pooling operation in the pooling model."""
outputs: List[EmbeddingSequenceGroupOutput]

# lazy import to avoid circular import
2 changes: 1 addition & 1 deletion vllm/v1/engine/processor.py
@@ -59,7 +59,7 @@ def process_inputs(
priority: int = 0,
) -> Tuple[DetokenizerRequest, EngineCoreRequest]:

# TODO(woosuk): Support embedding mode.
# TODO(woosuk): Support pooling models.
# TODO(woosuk): Check max_logprobs
# TODO(woosuk): Support encoder-decoder models.

2 changes: 1 addition & 1 deletion vllm/worker/cpu_worker.py
@@ -178,7 +178,7 @@ def __init__(
# Uninitialized cache engine. Will be initialized by
# initialize_cache.
self.cache_engine: List[CPUCacheEngine]
# Initialize cpu_cache as embedding models don't initialize kv_caches
# Initialize cpu_cache as pooling models don't initialize kv_caches
self.cpu_cache: Optional[List[List[torch.Tensor]]] = None

# Torch profiler. Enabled and configured through env vars:
4 changes: 2 additions & 2 deletions vllm/worker/hpu_worker.py
@@ -65,8 +65,8 @@ def __init__(
# Uninitialized cache engine. Will be initialized by
# initialize_cache.
self.cache_engine: List[HPUCacheEngine]
# Initialize gpu_cache as embedding models don't initialize kv_caches
self.hpu_cache: Optional[List[List[torch.tensor]]] = None
# Initialize gpu_cache as pooling models don't initialize kv_caches
self.hpu_cache: Optional[List[List[torch.Tensor]]] = None
# Torch profiler. Enabled and configured through env vars:
# VLLM_TORCH_PROFILER_DIR=/path/to/save/trace
if envs.VLLM_TORCH_PROFILER_DIR:
2 changes: 1 addition & 1 deletion vllm/worker/worker.py
@@ -91,7 +91,7 @@ def __init__(
# Uninitialized cache engine. Will be initialized by
# initialize_cache.
self.cache_engine: List[CacheEngine]
# Initialize gpu_cache as embedding models don't initialize kv_caches
# Initialize gpu_cache as pooling models don't initialize kv_caches
self.gpu_cache: Optional[List[List[torch.Tensor]]] = None
self._seq_group_metadata_cache: Dict[str, SequenceGroupMetadata] = {}

