From acb1639f5b86e522798c993d8256a5ab4689bd7c Mon Sep 17 00:00:00 2001 From: Daniel Sola Date: Fri, 29 Nov 2024 15:24:47 -0800 Subject: [PATCH 1/2] vllm inference plugin Signed-off-by: Daniel Sola --- plugins/flytekit-inference/README.md | 63 ++++++++++++++ .../flytekitplugins/inference/__init__.py | 1 + .../inference/vllm/__init__.py | 0 .../flytekitplugins/inference/vllm/serve.py | 85 +++++++++++++++++++ plugins/flytekit-inference/setup.py | 1 + plugins/flytekit-inference/tests/test_vllm.py | 60 +++++++++++++ 6 files changed, 210 insertions(+) create mode 100644 plugins/flytekit-inference/flytekitplugins/inference/vllm/__init__.py create mode 100644 plugins/flytekit-inference/flytekitplugins/inference/vllm/serve.py create mode 100644 plugins/flytekit-inference/tests/test_vllm.py diff --git a/plugins/flytekit-inference/README.md b/plugins/flytekit-inference/README.md index 1bc5c8475e..646200c111 100644 --- a/plugins/flytekit-inference/README.md +++ b/plugins/flytekit-inference/README.md @@ -126,3 +126,66 @@ def model_serving(questions: list[str], gguf: FlyteFile) -> list[str]: return responses ``` + +## vLLM + +The vLLM plugin allows you to serve an LLM hosted on HuggingFace. + +```python +import flytekit as fl +from openai import OpenAI + +model_name = "google/gemma-2b-it" +hf_token_key = "vllm_hf_token" + +vllm_args = { + "model": model_name, + "dtype": "half", + "max-model-len": 2000, +} + +hf_secrets = HFSecret( + secrets_prefix="_FSEC_", + hf_token_key=hf_token_key +) + +vllm_instance = VLLM( + hf_secret=hf_secrets, + arg_dict=vllm_args +) + +image = fl.ImageSpec( + name="vllm_serve", + registry="...", + packages=["flytekitplugins-inference"], +) + + +@fl.task( + pod_template=vllm_instance.pod_template, + container_image=image, + secret_requests=[ + fl.Secret( + key=hf_token_key, mount_requirement=fl.Secret.MountType.ENV_VAR # must be mounted as an env var + ) + ], +) +def model_serving() -> str: + client = OpenAI( + base_url=f"{vllm_instance.base_url}/v1", api_key="vllm" # api key required but ignored + ) + + completion = client.chat.completions.create( + model=model_name, + messages=[ + { + "role": "user", + "content": "Compose a haiku about the power of AI.", + } + ], + temperature=0.5, + top_p=1, + max_tokens=1024, + ) + return completion.choices[0].message.content +``` diff --git a/plugins/flytekit-inference/flytekitplugins/inference/__init__.py b/plugins/flytekit-inference/flytekitplugins/inference/__init__.py index cfd14b09a8..8b43dd16a8 100644 --- a/plugins/flytekit-inference/flytekitplugins/inference/__init__.py +++ b/plugins/flytekit-inference/flytekitplugins/inference/__init__.py @@ -14,3 +14,4 @@ from .nim.serve import NIM, NIMSecrets from .ollama.serve import Model, Ollama +from .vllm.serve import VLLM, HFSecret diff --git a/plugins/flytekit-inference/flytekitplugins/inference/vllm/__init__.py b/plugins/flytekit-inference/flytekitplugins/inference/vllm/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/plugins/flytekit-inference/flytekitplugins/inference/vllm/serve.py b/plugins/flytekit-inference/flytekitplugins/inference/vllm/serve.py new file mode 100644 index 0000000000..82b7c0fe51 --- /dev/null +++ b/plugins/flytekit-inference/flytekitplugins/inference/vllm/serve.py @@ -0,0 +1,85 @@ +from dataclasses import dataclass +from typing import Optional + +from ..sidecar_template import ModelInferenceTemplate + + +@dataclass +class HFSecret: + """ + :param secrets_prefix: The secrets prefix that Flyte appends to all mounted secrets. 
+    :param hf_token_group: The group name for the HuggingFace token.
+    :param hf_token_key: The key name for the HuggingFace token.
+    """
+
+    secrets_prefix: str  # _UNION_ or _FSEC_
+    hf_token_key: str
+    hf_token_group: Optional[str] = None
+
+
+class VLLM(ModelInferenceTemplate):
+    def __init__(
+        self,
+        hf_secret: HFSecret,
+        arg_dict: Optional[dict] = None,
+        image: str = "vllm/vllm-openai",
+        health_endpoint: str = "/health",
+        port: int = 8000,
+        cpu: int = 2,
+        gpu: int = 1,
+        mem: str = "10Gi",
+    ):
+        """
+        Initialize VLLM class for managing a Kubernetes pod template.
+
+        :param hf_secret: Instance of HFSecret for managing hugging face secrets.
+        :param arg_dict: A dictionary of arguments for the VLLM model server (https://docs.vllm.ai/en/stable/models/engine_args.html).
+        :param image: The Docker image to be used for the model server container. Default is "ç".
+        :param health_endpoint: The health endpoint for the model server container. Default is "/health".
+        :param port: The port number for the model server container. Default is 8000.
+        :param cpu: The number of CPU cores requested for the model server container. Default is 2.
+        :param gpu: The number of GPU cores requested for the model server container. Default is 1.
+        :param mem: The amount of memory requested for the model server container. Default is "10Gi".
+        """
+        if hf_secret.hf_token_key is None:
+            raise ValueError("HuggingFace token key must be provided.")
+        if hf_secret.secrets_prefix is None:
+            raise ValueError("Secrets prefix must be provided.")
+
+        self._hf_secret = hf_secret
+        self._arg_dict = arg_dict
+
+        super().__init__(
+            image=image,
+            health_endpoint=health_endpoint,
+            port=port,
+            cpu=cpu,
+            gpu=gpu,
+            mem=mem,
+        )
+
+        self.setup_vllm_pod_template()
+
+    def setup_vllm_pod_template(self):
+        from kubernetes.client.models import V1EnvVar
+
+        model_server_container = self.pod_template.pod_spec.init_containers[0]
+
+        if self._hf_secret.hf_token_group:
+            hf_key = f"$({self._hf_secret.secrets_prefix}{self._hf_secret.hf_token_group}_{self._hf_secret.hf_token_key})".upper()
+        else:
+            hf_key = f"$({self._hf_secret.secrets_prefix}{self._hf_secret.hf_token_key})".upper()
+
+        model_server_container.env = [
+            V1EnvVar(name="HUGGING_FACE_HUB_TOKEN", value=hf_key),
+        ]
+        model_server_container.args = self.build_vllm_args()
+
+    def build_vllm_args(self) -> list:
+        args = []
+        if self._arg_dict:
+            for key, value in self._arg_dict.items():
+                args.append(f"--{key}")
+                if value is not None:
+                    args.append(str(value))
+        return args
diff --git a/plugins/flytekit-inference/setup.py b/plugins/flytekit-inference/setup.py
index c0f42a2e41..ef46849726 100644
--- a/plugins/flytekit-inference/setup.py
+++ b/plugins/flytekit-inference/setup.py
@@ -19,6 +19,7 @@
         f"flytekitplugins.{PLUGIN_NAME}",
         f"flytekitplugins.{PLUGIN_NAME}.nim",
         f"flytekitplugins.{PLUGIN_NAME}.ollama",
+        f"flytekitplugins.{PLUGIN_NAME}.vllm",
     ],
     install_requires=plugin_requires,
     license="apache2",
diff --git a/plugins/flytekit-inference/tests/test_vllm.py b/plugins/flytekit-inference/tests/test_vllm.py
new file mode 100644
index 0000000000..e1a7901de5
--- /dev/null
+++ b/plugins/flytekit-inference/tests/test_vllm.py
@@ -0,0 +1,60 @@
+from flytekitplugins.inference import VLLM, HFSecret
+
+
+def test_vllm_init_valid_params():
+    vllm_args = {
+        "model": "google/gemma-2b-it",
+        "dtype": "half",
+        "max-model-len": 2000,
+    }
+
+    hf_secrets = HFSecret(
+        secrets_prefix="_UNION_",
+        hf_token_key="vllm_hf_token"
+    )
+
+    vllm_instance = VLLM(
+        hf_secret=hf_secrets,
+        arg_dict=vllm_args,
+        image='vllm/vllm-openai:my-tag',
+        cpu='10',
+        gpu='2',
+        mem='50Gi',
+        port=8080,
+    )
+
+    assert len(vllm_instance.pod_template.pod_spec.init_containers) == 1
+    assert (
+        vllm_instance.pod_template.pod_spec.init_containers[0].image
+        == 'vllm/vllm-openai:my-tag'
+    )
+    assert (
+        vllm_instance.pod_template.pod_spec.init_containers[0].resources.requests[
+            "memory"
+        ]
+        == "50Gi"
+    )
+    assert (
+        vllm_instance.pod_template.pod_spec.init_containers[0].ports[0].container_port
+        == 8080
+    )
+    assert vllm_instance.pod_template.pod_spec.init_containers[0].args == ['--model', 'google/gemma-2b-it', '--dtype', 'half', '--max-model-len', '2000']
+    assert vllm_instance.pod_template.pod_spec.init_containers[0].env[0].name == 'HUGGING_FACE_HUB_TOKEN'
+    assert vllm_instance.pod_template.pod_spec.init_containers[0].env[0].value == '$(_UNION_VLLM_HF_TOKEN)'
+
+
+
+def test_vllm_default_params():
+    vllm_instance = VLLM(hf_secret=HFSecret(secrets_prefix="_FSEC_", hf_token_key="test_token"))
+
+    assert vllm_instance.base_url == "http://localhost:8000"
+    assert vllm_instance._image == 'vllm/vllm-openai'
+    assert vllm_instance._port == 8000
+    assert vllm_instance._cpu == 2
+    assert vllm_instance._gpu == 1
+    assert vllm_instance._health_endpoint == "/health"
+    assert vllm_instance._mem == "10Gi"
+    assert vllm_instance._arg_dict == None
+    assert vllm_instance._hf_secret.secrets_prefix == '_FSEC_'
+    assert vllm_instance._hf_secret.hf_token_key == 'test_token'
+    assert vllm_instance._hf_secret.hf_token_group == None

From 9a65e7e1c0246fec86beffa8ce40e2f97cf1f808 Mon Sep 17 00:00:00 2001
From: Daniel Sola
Date: Thu, 12 Dec 2024 16:16:43 -0800
Subject: [PATCH 2/2] fixed default value

Signed-off-by: Daniel Sola
---
 .../flytekit-inference/flytekitplugins/inference/vllm/serve.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/plugins/flytekit-inference/flytekitplugins/inference/vllm/serve.py b/plugins/flytekit-inference/flytekitplugins/inference/vllm/serve.py
index 82b7c0fe51..f353aabda4 100644
--- a/plugins/flytekit-inference/flytekitplugins/inference/vllm/serve.py
+++ b/plugins/flytekit-inference/flytekitplugins/inference/vllm/serve.py
@@ -34,7 +34,7 @@ def __init__(
 
         :param hf_secret: Instance of HFSecret for managing hugging face secrets.
         :param arg_dict: A dictionary of arguments for the VLLM model server (https://docs.vllm.ai/en/stable/models/engine_args.html).
-        :param image: The Docker image to be used for the model server container. Default is "ç".
+        :param image: The Docker image to be used for the model server container. Default is "vllm/vllm-openai".
         :param health_endpoint: The health endpoint for the model server container. Default is "/health".
         :param port: The port number for the model server container. Default is 8000.
         :param cpu: The number of CPU cores requested for the model server container. Default is 2.
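
A quick way to check what the plugin above actually wires into the task pod is to build a `VLLM` instance and inspect the generated init container, the same way the new tests do. The sketch below assumes `flytekitplugins-inference` (which pulls in the `kubernetes` client) is installed, and it reuses the illustrative model name and secret key from the README example; note that the README snippet also relies on `from flytekitplugins.inference import VLLM, HFSecret`.

```python
from flytekitplugins.inference import VLLM, HFSecret

# Same illustrative values as the README example above.
vllm_instance = VLLM(
    hf_secret=HFSecret(secrets_prefix="_FSEC_", hf_token_key="vllm_hf_token"),
    arg_dict={"model": "google/gemma-2b-it", "dtype": "half", "max-model-len": 2000},
)

# The vLLM server runs as the first init container of the task pod.
container = vllm_instance.pod_template.pod_spec.init_containers[0]

# build_vllm_args flattens arg_dict into CLI flags for the vLLM server:
# ['--model', 'google/gemma-2b-it', '--dtype', 'half', '--max-model-len', '2000']
print(container.args)

# The HuggingFace token is injected by referencing the Flyte-mounted secret,
# e.g. HUGGING_FACE_HUB_TOKEN=$(_FSEC_VLLM_HF_TOKEN) for the key used here.
print(container.env[0].name, container.env[0].value)
```

Because `build_vllm_args` only appends a value when it is not `None`, an entry such as `{"trust-remote-code": None}` is emitted as the bare flag `--trust-remote-code`, which is how boolean vLLM engine arguments can be passed through `arg_dict`.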
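
When the HuggingFace token is stored under a secret group, `hf_token_group` only changes the environment variable reference that `setup_vllm_pod_template` emits. A minimal sketch with hypothetical group and key names:

```python
from flytekitplugins.inference import VLLM, HFSecret

# "hf" and "token" are hypothetical group/key names used only for illustration.
grouped = VLLM(hf_secret=HFSecret(secrets_prefix="_FSEC_", hf_token_group="hf", hf_token_key="token"))

# The whole reference is upper-cased: $(_FSEC_HF_TOKEN)
print(grouped.pod_template.pod_spec.init_containers[0].env[0].value)
```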