Skip to content

Commit

Permalink
[V1] Use pickle for serializing EngineCoreRequest & Add multimodal inputs to EngineCoreRequest (vllm-project#10245)
Browse files Browse the repository at this point in the history

Signed-off-by: Woosuk Kwon <[email protected]>
  • Loading branch information
WoosukKwon authored Nov 12, 2024
1 parent 47db6ec commit 7c65527
Show file tree
Hide file tree
Showing 5 changed files with 25 additions and 5 deletions.
9 changes: 7 additions & 2 deletions vllm/v1/engine/__init__.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
import enum
from dataclasses import dataclass
from typing import List, Optional, Union
from typing import Any, Dict, List, Optional, Union

import msgspec

from vllm.lora.request import LoRARequest
from vllm.multimodal import MultiModalDataDict, MultiModalPlaceholderDict
from vllm.sampling_params import RequestOutputKind, SamplingParams


Expand All @@ -22,7 +23,8 @@ class DetokenizerRequest:
include_stop_str_in_output: bool


class EngineCoreRequest(msgspec.Struct, omit_defaults=True):
@dataclass
class EngineCoreRequest:

# NOTE: prompt and prompt_token_ids should be DecoderOnlyInput,
# but this object is currently not playing well with msgspec
Expand All @@ -33,6 +35,9 @@ class EngineCoreRequest(msgspec.Struct, omit_defaults=True):
# always be tokenized?
prompt: Optional[str]
prompt_token_ids: List[int]
mm_data: Optional[MultiModalDataDict]
mm_placeholders: Optional[MultiModalPlaceholderDict]
mm_processor_kwargs: Optional[Dict[str, Any]]
sampling_params: SamplingParams
eos_token_id: Optional[int]
arrival_time: float
Expand Down
3 changes: 2 additions & 1 deletion vllm/v1/engine/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
EngineCoreRequest, EngineCoreRequestType)
from vllm.v1.executor.gpu_executor import GPUExecutor
from vllm.v1.request import Request, RequestStatus
from vllm.v1.serial_utils import PickleEncoder
from vllm.version import __version__ as VLLM_VERSION

logger = init_logger(__name__)
Expand Down Expand Up @@ -315,7 +316,7 @@ def process_input_socket(self, input_path: str):
"""Input socket IO thread."""

# Msgpack serialization decoding.
decoder_add_req = msgpack.Decoder(EngineCoreRequest)
decoder_add_req = PickleEncoder()
decoder_abort_req = msgpack.Decoder(list[str])

with self.make_socket(input_path, zmq.constants.PULL) as socket:
Expand Down
3 changes: 2 additions & 1 deletion vllm/v1/engine/core_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from vllm.v1.engine import (EngineCoreOutput, EngineCoreOutputs,
EngineCoreRequest, EngineCoreRequestType)
from vllm.v1.engine.core import EngineCore, EngineCoreProc
from vllm.v1.serial_utils import PickleEncoder

logger = init_logger(__name__)

Expand Down Expand Up @@ -115,7 +116,7 @@ def __init__(
**kwargs,
):
# Serialization setup.
self.encoder = msgspec.msgpack.Encoder()
self.encoder = PickleEncoder()
self.decoder = msgspec.msgpack.Decoder(EngineCoreOutputs)

# ZMQ setup.
Expand Down
5 changes: 4 additions & 1 deletion vllm/v1/engine/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,10 @@ def process_inputs(
# Make Request for EngineCore.
engine_core_request = EngineCoreRequest(
request_id, processed_inputs.get("prompt"),
processed_inputs.get("prompt_token_ids"), sampling_params,
processed_inputs.get("prompt_token_ids"),
processed_inputs.get("multi_modal_data"),
processed_inputs.get("multi_modal_placeholders"),
processed_inputs.get("mm_processor_kwargs"), sampling_params,
eos_token_id, arrival_time, lora_request)

return detokenizer_request, engine_core_request
Expand Down
10 changes: 10 additions & 0 deletions vllm/v1/serial_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
import pickle


class PickleEncoder:
    """Serializer that round-trips arbitrary Python objects via pickle.

    Mirrors the small encode/decode interface of msgspec's
    Encoder/Decoder so it can be swapped in for IPC payloads
    (e.g. EngineCoreRequest) that msgpack cannot represent.
    """

    def encode(self, obj) -> bytes:
        """Serialize *obj* to a bytes payload."""
        payload = pickle.dumps(obj)
        return payload

    def decode(self, data) -> object:
        """Deserialize a bytes payload back into the original object.

        NOTE(security): pickle.loads executes arbitrary code if fed
        untrusted bytes — this must only ever receive data produced by
        a trusted in-process/IPC peer, never external input.
        """
        obj = pickle.loads(data)
        return obj

0 comments on commit 7c65527

Please sign in to comment.