From acb1639f5b86e522798c993d8256a5ab4689bd7c Mon Sep 17 00:00:00 2001 From: Daniel Sola Date: Fri, 29 Nov 2024 15:24:47 -0800 Subject: [PATCH 1/2] vllm inference plugin Signed-off-by: Daniel Sola --- plugins/flytekit-inference/README.md | 63 ++++++++++++++ .../flytekitplugins/inference/__init__.py | 1 + .../inference/vllm/__init__.py | 0 .../flytekitplugins/inference/vllm/serve.py | 85 +++++++++++++++++++ plugins/flytekit-inference/setup.py | 1 + plugins/flytekit-inference/tests/test_vllm.py | 60 +++++++++++++ 6 files changed, 210 insertions(+) create mode 100644 plugins/flytekit-inference/flytekitplugins/inference/vllm/__init__.py create mode 100644 plugins/flytekit-inference/flytekitplugins/inference/vllm/serve.py create mode 100644 plugins/flytekit-inference/tests/test_vllm.py diff --git a/plugins/flytekit-inference/README.md b/plugins/flytekit-inference/README.md index 1bc5c8475e..646200c111 100644 --- a/plugins/flytekit-inference/README.md +++ b/plugins/flytekit-inference/README.md @@ -126,3 +126,66 @@ def model_serving(questions: list[str], gguf: FlyteFile) -> list[str]: return responses ``` + +## vLLM + +The vLLM plugin allows you to serve an LLM hosted on HuggingFace. + +```python +import flytekit as fl +from openai import OpenAI + +model_name = "google/gemma-2b-it" +hf_token_key = "vllm_hf_token" + +vllm_args = { + "model": model_name, + "dtype": "half", + "max-model-len": 2000, +} + +hf_secrets = HFSecret( + secrets_prefix="_FSEC_", + hf_token_key=hf_token_key +) + +vllm_instance = VLLM( + hf_secret=hf_secrets, + arg_dict=vllm_args +) + +image = fl.ImageSpec( + name="vllm_serve", + registry="...", + packages=["flytekitplugins-inference"], +) + + +@fl.task( + pod_template=vllm_instance.pod_template, + container_image=image, + secret_requests=[ + fl.Secret( + key=hf_token_key, mount_requirement=fl.Secret.MountType.ENV_VAR # must be mounted as an env var + ) + ], +) +def model_serving() -> str: + client = OpenAI( + base_url=f"{vllm_instance.base_url}/v1", api_key="vllm" # api key required but ignored + ) + + completion = client.chat.completions.create( + model=model_name, + messages=[ + { + "role": "user", + "content": "Compose a haiku about the power of AI.", + } + ], + temperature=0.5, + top_p=1, + max_tokens=1024, + ) + return completion.choices[0].message.content +``` diff --git a/plugins/flytekit-inference/flytekitplugins/inference/__init__.py b/plugins/flytekit-inference/flytekitplugins/inference/__init__.py index cfd14b09a8..8b43dd16a8 100644 --- a/plugins/flytekit-inference/flytekitplugins/inference/__init__.py +++ b/plugins/flytekit-inference/flytekitplugins/inference/__init__.py @@ -14,3 +14,4 @@ from .nim.serve import NIM, NIMSecrets from .ollama.serve import Model, Ollama +from .vllm.serve import VLLM, HFSecret diff --git a/plugins/flytekit-inference/flytekitplugins/inference/vllm/__init__.py b/plugins/flytekit-inference/flytekitplugins/inference/vllm/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/plugins/flytekit-inference/flytekitplugins/inference/vllm/serve.py b/plugins/flytekit-inference/flytekitplugins/inference/vllm/serve.py new file mode 100644 index 0000000000..82b7c0fe51 --- /dev/null +++ b/plugins/flytekit-inference/flytekitplugins/inference/vllm/serve.py @@ -0,0 +1,85 @@ +from dataclasses import dataclass +from typing import Optional + +from ..sidecar_template import ModelInferenceTemplate + + +@dataclass +class HFSecret: + """ + :param secrets_prefix: The secrets prefix that Flyte appends to all mounted secrets. 
+    :param hf_token_group: The group name for the HuggingFace token.
+    :param hf_token_key: The key name for the HuggingFace token.
+    """
+
+    secrets_prefix: str  # _UNION_ or _FSEC_
+    hf_token_key: str
+    hf_token_group: Optional[str] = None
+
+
+class VLLM(ModelInferenceTemplate):
+    def __init__(
+        self,
+        hf_secret: HFSecret,
+        arg_dict: Optional[dict] = None,
+        image: str = "vllm/vllm-openai",
+        health_endpoint: str = "/health",
+        port: int = 8000,
+        cpu: int = 2,
+        gpu: int = 1,
+        mem: str = "10Gi",
+    ):
+        """
+        Initialize VLLM class for managing a Kubernetes pod template.
+
+        :param hf_secret: Instance of HFSecret for managing hugging face secrets.
+        :param arg_dict: A dictionary of arguments for the VLLM model server (https://docs.vllm.ai/en/stable/models/engine_args.html).
+        :param image: The Docker image to be used for the model server container. Default is "ç".
+        :param health_endpoint: The health endpoint for the model server container. Default is "/health".
+        :param port: The port number for the model server container. Default is 8000.
+        :param cpu: The number of CPU cores requested for the model server container. Default is 2.
+        :param gpu: The number of GPU cores requested for the model server container. Default is 1.
+        :param mem: The amount of memory requested for the model server container. Default is "10Gi".
+        """
+        if hf_secret.hf_token_key is None:
+            raise ValueError("HuggingFace token key must be provided.")
+        if hf_secret.secrets_prefix is None:
+            raise ValueError("Secrets prefix must be provided.")
+
+        self._hf_secret = hf_secret
+        self._arg_dict = arg_dict
+
+        super().__init__(
+            image=image,
+            health_endpoint=health_endpoint,
+            port=port,
+            cpu=cpu,
+            gpu=gpu,
+            mem=mem,
+        )
+
+        self.setup_vllm_pod_template()
+
+    def setup_vllm_pod_template(self):
+        from kubernetes.client.models import V1EnvVar
+
+        model_server_container = self.pod_template.pod_spec.init_containers[0]
+
+        if self._hf_secret.hf_token_group:
+            hf_key = f"$({self._hf_secret.secrets_prefix}{self._hf_secret.hf_token_group}_{self._hf_secret.hf_token_key})".upper()
+        else:
+            hf_key = f"$({self._hf_secret.secrets_prefix}{self._hf_secret.hf_token_key})".upper()
+
+        model_server_container.env = [
+            V1EnvVar(name="HUGGING_FACE_HUB_TOKEN", value=hf_key),
+        ]
+        model_server_container.args = self.build_vllm_args()
+
+    def build_vllm_args(self) -> list:
+        args = []
+        if self._arg_dict:
+            for key, value in self._arg_dict.items():
+                args.append(f"--{key}")
+                if value is not None:
+                    args.append(str(value))
+        return args
diff --git a/plugins/flytekit-inference/setup.py b/plugins/flytekit-inference/setup.py
index c0f42a2e41..ef46849726 100644
--- a/plugins/flytekit-inference/setup.py
+++ b/plugins/flytekit-inference/setup.py
@@ -19,6 +19,7 @@
         f"flytekitplugins.{PLUGIN_NAME}",
         f"flytekitplugins.{PLUGIN_NAME}.nim",
         f"flytekitplugins.{PLUGIN_NAME}.ollama",
+        f"flytekitplugins.{PLUGIN_NAME}.vllm",
     ],
     install_requires=plugin_requires,
     license="apache2",
diff --git a/plugins/flytekit-inference/tests/test_vllm.py b/plugins/flytekit-inference/tests/test_vllm.py
new file mode 100644
index 0000000000..e1a7901de5
--- /dev/null
+++ b/plugins/flytekit-inference/tests/test_vllm.py
@@ -0,0 +1,60 @@
+from flytekitplugins.inference import VLLM, HFSecret
+
+
+def test_vllm_init_valid_params():
+    vllm_args = {
+        "model": "google/gemma-2b-it",
+        "dtype": "half",
+        "max-model-len": 2000,
+    }
+
+    hf_secrets = HFSecret(
+        secrets_prefix="_UNION_",
+        hf_token_key="vllm_hf_token"
+    )
+
+    vllm_instance = VLLM(
+        hf_secret=hf_secrets,
+        arg_dict=vllm_args,
+        image='vllm/vllm-openai:my-tag',
+        cpu='10',
+        gpu='2',
+        mem='50Gi',
+        port=8080,
+    )
+
+    assert len(vllm_instance.pod_template.pod_spec.init_containers) == 1
+    assert (
+        vllm_instance.pod_template.pod_spec.init_containers[0].image
+        == 'vllm/vllm-openai:my-tag'
+    )
+    assert (
+        vllm_instance.pod_template.pod_spec.init_containers[0].resources.requests[
+            "memory"
+        ]
+        == "50Gi"
+    )
+    assert (
+        vllm_instance.pod_template.pod_spec.init_containers[0].ports[0].container_port
+        == 8080
+    )
+    assert vllm_instance.pod_template.pod_spec.init_containers[0].args == ['--model', 'google/gemma-2b-it', '--dtype', 'half', '--max-model-len', '2000']
+    assert vllm_instance.pod_template.pod_spec.init_containers[0].env[0].name == 'HUGGING_FACE_HUB_TOKEN'
+    assert vllm_instance.pod_template.pod_spec.init_containers[0].env[0].value == '$(_UNION_VLLM_HF_TOKEN)'
+
+
+
+def test_vllm_default_params():
+    vllm_instance = VLLM(hf_secret=HFSecret(secrets_prefix="_FSEC_", hf_token_key="test_token"))
+
+    assert vllm_instance.base_url == "http://localhost:8000"
+    assert vllm_instance._image == 'vllm/vllm-openai'
+    assert vllm_instance._port == 8000
+    assert vllm_instance._cpu == 2
+    assert vllm_instance._gpu == 1
+    assert vllm_instance._health_endpoint == "/health"
+    assert vllm_instance._mem == "10Gi"
+    assert vllm_instance._arg_dict == None
+    assert vllm_instance._hf_secret.secrets_prefix == '_FSEC_'
+    assert vllm_instance._hf_secret.hf_token_key == 'test_token'
+    assert vllm_instance._hf_secret.hf_token_group == None

From 9a65e7e1c0246fec86beffa8ce40e2f97cf1f808 Mon Sep 17 00:00:00 2001
From: Daniel Sola
Date: Thu, 12 Dec 2024 16:16:43 -0800
Subject: [PATCH 2/2] fixed default value

Signed-off-by: Daniel Sola
---
 .../flytekit-inference/flytekitplugins/inference/vllm/serve.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/plugins/flytekit-inference/flytekitplugins/inference/vllm/serve.py b/plugins/flytekit-inference/flytekitplugins/inference/vllm/serve.py
index 82b7c0fe51..f353aabda4 100644
--- a/plugins/flytekit-inference/flytekitplugins/inference/vllm/serve.py
+++ b/plugins/flytekit-inference/flytekitplugins/inference/vllm/serve.py
@@ -34,7 +34,7 @@ def __init__(
 
         :param hf_secret: Instance of HFSecret for managing hugging face secrets.
         :param arg_dict: A dictionary of arguments for the VLLM model server (https://docs.vllm.ai/en/stable/models/engine_args.html).
-        :param image: The Docker image to be used for the model server container. Default is "ç".
+        :param image: The Docker image to be used for the model server container. Default is "vllm/vllm-openai".
         :param health_endpoint: The health endpoint for the model server container. Default is "/health".
         :param port: The port number for the model server container. Default is 8000.
         :param cpu: The number of CPU cores requested for the model server container. Default is 2.
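
A quick way to check what the plugin above actually wires into the task pod is to build a `VLLM` instance and inspect the generated init container, the same way the new tests do. The sketch below assumes `flytekitplugins-inference` (which pulls in the `kubernetes` client) is installed, and it reuses the illustrative model name and secret key from the README example; note that the README snippet also relies on `from flytekitplugins.inference import VLLM, HFSecret`.

```python
from flytekitplugins.inference import VLLM, HFSecret

# Same illustrative values as the README example above.
vllm_instance = VLLM(
    hf_secret=HFSecret(secrets_prefix="_FSEC_", hf_token_key="vllm_hf_token"),
    arg_dict={"model": "google/gemma-2b-it", "dtype": "half", "max-model-len": 2000},
)

# The vLLM server runs as the first init container of the task pod.
container = vllm_instance.pod_template.pod_spec.init_containers[0]

# build_vllm_args flattens arg_dict into CLI flags for the vLLM server:
# ['--model', 'google/gemma-2b-it', '--dtype', 'half', '--max-model-len', '2000']
print(container.args)

# The HuggingFace token is injected by referencing the Flyte-mounted secret,
# e.g. HUGGING_FACE_HUB_TOKEN=$(_FSEC_VLLM_HF_TOKEN) for the key used here.
print(container.env[0].name, container.env[0].value)
```

Because `build_vllm_args` only appends a value when it is not `None`, an entry such as `{"trust-remote-code": None}` is emitted as the bare flag `--trust-remote-code`, which is how boolean vLLM engine arguments can be passed through `arg_dict`.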
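
When the HuggingFace token is stored under a secret group, `hf_token_group` only changes the environment variable reference that `setup_vllm_pod_template` emits. A minimal sketch with hypothetical group and key names:

```python
from flytekitplugins.inference import VLLM, HFSecret

# "hf" and "token" are hypothetical group/key names used only for illustration.
grouped = VLLM(hf_secret=HFSecret(secrets_prefix="_FSEC_", hf_token_group="hf", hf_token_key="token"))

# The whole reference is upper-cased: $(_FSEC_HF_TOKEN)
print(grouped.pod_template.pod_spec.init_containers[0].env[0].value)
```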