From dfd04ddffba8063ab264979c855d489730e9d9da Mon Sep 17 00:00:00 2001 From: Ryan O'Leary Date: Wed, 4 Sep 2024 20:58:47 +0000 Subject: [PATCH 01/16] Add RayService vLLM TPU Inference script Signed-off-by: Ryan O'Leary bug fixes Signed-off-by: Ryan O'Leary remove extra ray init Signed-off-by: Ryan O'Leary Read hf token from os Signed-off-by: Ryan O'Leary Fix bugs Signed-off-by: Ryan O'Leary Remove hf token logic Signed-off-by: Ryan O'Leary Fix serve script Signed-off-by: Ryan O'Leary --- ai-ml/gke-ray/rayserve/llm/serve_tpu.py | 105 ++++++++++++++++++++++++ 1 file changed, 105 insertions(+) create mode 100644 ai-ml/gke-ray/rayserve/llm/serve_tpu.py diff --git a/ai-ml/gke-ray/rayserve/llm/serve_tpu.py b/ai-ml/gke-ray/rayserve/llm/serve_tpu.py new file mode 100644 index 0000000000..e42383e0d1 --- /dev/null +++ b/ai-ml/gke-ray/rayserve/llm/serve_tpu.py @@ -0,0 +1,105 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# NOTE: this file was inspired from: https://github.com/richardsliu/vllm/blob/rayserve/examples/rayserve_tpu.py + +import os + +import json +import logging +from typing import Dict, List, Optional + +import ray +from fastapi import FastAPI +from ray import serve +from starlette.requests import Request +from starlette.responses import Response + +from vllm import LLM, SamplingParams + +logger = logging.getLogger("ray.serve") + +model_id = "meta-llama/Meta-Llama-3-70B" + +app = FastAPI() + +@serve.deployment(name="VLLMDeployment") +@serve.ingress(app) +class VLLMDeployment: + def __init__( + self, + num_tpu_chips, + ): + self.llm = LLM( + model=model_id, + tensor_parallel_size=num_tpu_chips, + enforce_eager=True, + ) + + @app.post("/v1/generate") + async def generate(self, request: Request): + request_dict = await request.json() + prompts = request_dict.pop("prompt") + print("Processing prompt ", prompts) + sampling_params = SamplingParams(temperature=0.7, + top_p=1.0, + n=1, + max_tokens=1000) + + outputs = self.llm.generate(prompts, sampling_params) + for output in outputs: + prompt = output.prompt + generated_text = "" + token_ids = [] + for completion_output in output.outputs: + generated_text += completion_output.text + token_ids.extend(list(completion_output.token_ids)) + + print("Generated text: ", generated_text) + ret = { + "prompt": prompt, + "text": generated_text, + "token_ids": token_ids, + } + + return Response(content=json.dumps(ret)) + +def get_num_tpu_chips() -> int: + if "TPU" not in ray.cluster_resources(): + # Pass in TPU chips when the current Ray cluster resources can't be auto-detected (i.e for autoscaling). + if os.environ.get('TPU_CHIPS') is not None: + return int(os.environ.get('TPU_CHIPS')) + return 0 + return int(ray.cluster_resources()["TPU"]) + +def build_app(cli_args: Dict[str, str]) -> serve.Application: + """Builds the Serve app based on CLI arguments.""" + ray.init(ignore_reinit_error=True) + + # Set the model to use, defaults to Llama-3-70B. 
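+    # NOTE: this assignment rebinds a function-local model_id rather than
+    # the module-level one read by VLLMDeployment (there is no
+    # `global model_id` declaration), so the override never reaches the
+    # deployment; patch 02 below removes this block and reads MODEL_ID
+    # from the environment directly in __init__.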
+ if 'MODEL_ID' in os.environ: + model_id = os.environ.get('MODEL_ID') + + num_tpu_chips = get_num_tpu_chips() + pg_resources = [] + pg_resources.append({"CPU": 1}) # for the deployment replica + for i in range(num_tpu_chips): + pg_resources.append({"CPU": 1, "TPU": 1}) # for the vLLM actors + + # Use PACK strategy since the deployment may use more than one TPU node. + return VLLMDeployment.options( + placement_group_bundles=pg_resources, + placement_group_strategy="PACK").bind(num_tpu_chips) + +model = build_app({}) \ No newline at end of file From 335e75c1656947c45eee2b26a99605e8e12211b8 Mon Sep 17 00:00:00 2001 From: Ryan O'Leary Date: Fri, 27 Sep 2024 08:11:27 +0000 Subject: [PATCH 02/16] Fix inference script Signed-off-by: Ryan O'Leary --- ai-ml/gke-ray/rayserve/llm/serve_tpu.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/ai-ml/gke-ray/rayserve/llm/serve_tpu.py b/ai-ml/gke-ray/rayserve/llm/serve_tpu.py index e42383e0d1..d4d1a5624e 100644 --- a/ai-ml/gke-ray/rayserve/llm/serve_tpu.py +++ b/ai-ml/gke-ray/rayserve/llm/serve_tpu.py @@ -30,8 +30,6 @@ logger = logging.getLogger("ray.serve") -model_id = "meta-llama/Meta-Llama-3-70B" - app = FastAPI() @serve.deployment(name="VLLMDeployment") @@ -39,10 +37,10 @@ class VLLMDeployment: def __init__( self, - num_tpu_chips, + num_tpu_chips, ): self.llm = LLM( - model=model_id, + model=os.environ['MODEL_ID'], tensor_parallel_size=num_tpu_chips, enforce_eager=True, ) @@ -87,10 +85,6 @@ def build_app(cli_args: Dict[str, str]) -> serve.Application: """Builds the Serve app based on CLI arguments.""" ray.init(ignore_reinit_error=True) - # Set the model to use, defaults to Llama-3-70B. - if 'MODEL_ID' in os.environ: - model_id = os.environ.get('MODEL_ID') - num_tpu_chips = get_num_tpu_chips() pg_resources = [] pg_resources.append({"CPU": 1}) # for the deployment replica @@ -102,4 +96,4 @@ def build_app(cli_args: Dict[str, str]) -> serve.Application: placement_group_bundles=pg_resources, placement_group_strategy="PACK").bind(num_tpu_chips) -model = build_app({}) \ No newline at end of file +model = build_app({}) From fe6440c4329383dc20e2c3c2d65303c363b32ee1 Mon Sep 17 00:00:00 2001 From: Ryan O'Leary Date: Fri, 27 Sep 2024 09:57:16 +0000 Subject: [PATCH 03/16] Add RayCluster and RayService CRs Signed-off-by: Ryan O'Leary --- .../llm/llama-3-8b-it/ray-cluster-tpu.yaml | 94 ++++++++++++++++++ .../llm/llama-3-8b-it/ray-service-tpu.yaml | 98 +++++++++++++++++++ .../llm/llama-3.1-70b/ray-cluster-tpu.yaml | 94 ++++++++++++++++++ .../llm/llama-3.1-70b/ray-service-tpu.yaml | 98 +++++++++++++++++++ 4 files changed, 384 insertions(+) create mode 100644 ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-cluster-tpu.yaml create mode 100644 ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-service-tpu.yaml create mode 100644 ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-cluster-tpu.yaml create mode 100644 ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-service-tpu.yaml diff --git a/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-cluster-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-cluster-tpu.yaml new file mode 100644 index 0000000000..b40a79cfb5 --- /dev/null +++ b/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-cluster-tpu.yaml @@ -0,0 +1,94 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# [START gke_ai_ml_gke_ray_rayserve_llm_llama_3_8b_it_raycluster_tpu] +apiVersion: ray.io/v1 +kind: RayCluster +metadata: + name: vllm-tpu +spec: + headGroupSpec: + rayStartParams: {} + template: + spec: + containers: + - name: ray-head + image: $VLLM_IMAGE + imagePullPolicy: IfNotPresent + resources: + limits: + cpu: "8" + ephemeral-storage: 5Gi + memory: 40G + requests: + cpu: "8" + ephemeral-storage: 5Gi + memory: 40G + env: + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-secret + key: hf_api_token + - name: MODEL_ID + value: "meta-llama/Llama-3-8B-Instruct" + ports: + - containerPort: 6379 + name: gcs + - containerPort: 8265 + name: dashboard + - containerPort: 10001 + name: client + - containerPort: 8000 + name: serve + - containerPort: 8471 + name: slicebuilder + - containerPort: 8081 + name: mxla + workerGroupSpecs: + - groupName: tpu-group + replicas: 1 + minReplicas: 0 + maxReplicas: 1 + numOfHosts: 2 + rayStartParams: {} + template: + spec: + containers: + - name: ray-worker + image: $VLLM_IMAGE + imagePullPolicy: IfNotPresent + resources: + limits: + cpu: "100" + google.com/tpu: "4" + ephemeral-storage: 150Gi + memory: 200G + requests: + cpu: "100" + google.com/tpu: "4" + ephemeral-storage: 150Gi + memory: 200G + env: + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-secret + key: hf_api_token + - name: MODEL_ID + value: "meta-llama/Llama-3-8B-Instruct" + nodeSelector: + cloud.google.com/gke-tpu-accelerator: tpu-v4-podslice + cloud.google.com/gke-tpu-topology: 2x2x2 +# [END gke_ai_ml_gke_ray_rayserve_llm_llama_3_8b_it_raycluster_tpu] diff --git a/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-service-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-service-tpu.yaml new file mode 100644 index 0000000000..a9e57033a1 --- /dev/null +++ b/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-service-tpu.yaml @@ -0,0 +1,98 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
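+# The Serve app below is fetched from the repository zip named in
+# runtime_env.working_dir, and import_path resolves to the `model`
+# object in ai-ml/gke-ray/rayserve/llm/serve_tpu.py. TPU_CHIPS tells
+# that script how many chips to shard across when Ray cannot
+# auto-detect TPU resources, e.g. while the TPU worker group is
+# scaled down (minReplicas: 0).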
+ +# [START gke_ai_ml_gke_ray_rayserve_llm_llama_3_8b_it_rayservice_tpu] +apiVersion: ray.io/v1 +kind: RayService +metadata: + name: vllm-tpu +spec: + serveConfigV2: | + applications: + - name: llm + import_path: ai-ml.gke-ray.rayserve.llm.serve_tpu:model + deployments: + - name: VLLMDeployment + num_replicas: 1 + runtime_env: + working_dir: "https://github.com/ryanaoleary/kubernetes-engine-samples/archive/refs/heads/multihost-example.zip" + env_vars: + MODEL_ID: "meta-llama/Meta-Llama-3-8B-Instruct" + TPU_CHIPS: "8" + rayClusterConfig: + rayVersion: 2.34.0 + headGroupSpec: + rayStartParams: {} + template: + spec: + containers: + - name: ray-head + image: ryanaoleary/vllm:tpu + imagePullPolicy: IfNotPresent + ports: + - containerPort: 6379 + name: gcs + - containerPort: 8265 + name: dashboard + - containerPort: 10001 + name: client + - containerPort: 8000 + name: serve + env: + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-secret + key: hf_api_token + resources: + limits: + cpu: "8" + memory: 40G + requests: + cpu: "8" + memory: 40G + workerGroupSpecs: + - groupName: tpu-group + replicas: 1 + minReplicas: 0 + maxReplicas: 2 + numOfHosts: 2 + rayStartParams: {} + template: + spec: + containers: + - name: ray-worker + image: ryanaoleary/vllm:tpu + imagePullPolicy: IfNotPresent + resources: + limits: + cpu: "100" + google.com/tpu: "4" + ephemeral-storage: 150Gi + memory: 100G + requests: + cpu: "100" + google.com/tpu: "4" + ephemeral-storage: 150Gi + memory: 100G + env: + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-secret + key: hf_api_token + nodeSelector: + cloud.google.com/gke-tpu-accelerator: tpu-v4-podslice + cloud.google.com/gke-tpu-topology: 2x2x2 +# [END gke_ai_ml_gke_ray_rayserve_llm_llama_3_8b_it_rayservice_tpu] diff --git a/ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-cluster-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-cluster-tpu.yaml new file mode 100644 index 0000000000..9198f6b393 --- /dev/null +++ b/ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-cluster-tpu.yaml @@ -0,0 +1,94 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
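+# A v4 2x2x2 slice has 8 chips spread over two hosts, so the worker
+# group below pairs numOfHosts: 2 with google.com/tpu: "4" per pod;
+# serve_tpu.py then sets vLLM's tensor_parallel_size to the total
+# chip count.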
+ +# [START gke_ai_ml_gke_ray_rayserve_llm_llama_3_1_70b_raycluster_tpu] +apiVersion: ray.io/v1 +kind: RayCluster +metadata: + name: vllm-tpu +spec: + headGroupSpec: + rayStartParams: {} + template: + spec: + containers: + - name: ray-head + image: $VLLM_IMAGE + imagePullPolicy: IfNotPresent + resources: + limits: + cpu: "8" + ephemeral-storage: 5Gi + memory: 40G + requests: + cpu: "8" + ephemeral-storage: 5Gi + memory: 40G + env: + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-secret + key: hf_api_token + - name: MODEL_ID + value: "meta-llama/Llama-3.1-70B" + ports: + - containerPort: 6379 + name: gcs + - containerPort: 8265 + name: dashboard + - containerPort: 10001 + name: client + - containerPort: 8000 + name: serve + - containerPort: 8471 + name: slicebuilder + - containerPort: 8081 + name: mxla + workerGroupSpecs: + - groupName: tpu-group + replicas: 1 + minReplicas: 0 + maxReplicas: 1 + numOfHosts: 2 + rayStartParams: {} + template: + spec: + containers: + - name: ray-worker + image: $VLLM_IMAGE + imagePullPolicy: IfNotPresent + resources: + limits: + cpu: "100" + google.com/tpu: "4" + ephemeral-storage: 150Gi + memory: 200G + requests: + cpu: "100" + google.com/tpu: "4" + ephemeral-storage: 150Gi + memory: 200G + env: + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-secret + key: hf_api_token + - name: MODEL_ID + value: "meta-llama/Llama-3.1-70B" + nodeSelector: + cloud.google.com/gke-tpu-accelerator: tpu-v4-podslice + cloud.google.com/gke-tpu-topology: 2x2x2 +# [END gke_ai_ml_gke_ray_rayserve_llm_llama_3_1_70b_raycluster_tpu] diff --git a/ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-service-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-service-tpu.yaml new file mode 100644 index 0000000000..b6406df6f6 --- /dev/null +++ b/ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-service-tpu.yaml @@ -0,0 +1,98 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
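+# serve_tpu.py packs one bundle for the deployment replica plus one
+# {CPU, TPU} bundle per chip into a PACK placement group, letting a
+# single VLLMDeployment span every host in the TPU worker group.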
+ +# [START gke_ai_ml_gke_ray_rayserve_llm_llama_3_1_70b_rayservice_tpu] +apiVersion: ray.io/v1 +kind: RayService +metadata: + name: vllm-tpu +spec: + serveConfigV2: | + applications: + - name: llm + import_path: ai-ml.gke-ray.rayserve.llm.serve_tpu:model + deployments: + - name: VLLMDeployment + num_replicas: 1 + runtime_env: + working_dir: "https://github.com/ryanaoleary/kubernetes-engine-samples/archive/refs/heads/multihost-example.zip" + env_vars: + MODEL_ID: "meta-llama/Meta-Llama-3.1-70B" + TPU_CHIPS: "8" + rayClusterConfig: + rayVersion: 2.34.0 + headGroupSpec: + rayStartParams: {} + template: + spec: + containers: + - name: ray-head + image: ryanaoleary/vllm:tpu + imagePullPolicy: IfNotPresent + ports: + - containerPort: 6379 + name: gcs + - containerPort: 8265 + name: dashboard + - containerPort: 10001 + name: client + - containerPort: 8000 + name: serve + env: + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-secret + key: hf_api_token + resources: + limits: + cpu: "8" + memory: 40G + requests: + cpu: "8" + memory: 40G + workerGroupSpecs: + - groupName: tpu-group + replicas: 1 + minReplicas: 0 + maxReplicas: 2 + numOfHosts: 2 + rayStartParams: {} + template: + spec: + containers: + - name: ray-worker + image: ryanaoleary/vllm:tpu + imagePullPolicy: IfNotPresent + resources: + limits: + cpu: "100" + google.com/tpu: "4" + ephemeral-storage: 150Gi + memory: 100G + requests: + cpu: "100" + google.com/tpu: "4" + ephemeral-storage: 150Gi + memory: 100G + env: + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-secret + key: hf_api_token + nodeSelector: + cloud.google.com/gke-tpu-accelerator: tpu-v4-podslice + cloud.google.com/gke-tpu-topology: 2x2x2 +# [END gke_ai_ml_gke_ray_rayserve_llm_llama_3_1_70b_rayservice_tpu] From ee804fd7c72c1a87a220b938e5bb19fa9f6f2547 Mon Sep 17 00:00:00 2001 From: Ryan O'Leary Date: Fri, 27 Sep 2024 10:09:47 +0000 Subject: [PATCH 04/16] Fix working_dir link Signed-off-by: Ryan O'Leary --- ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-service-tpu.yaml | 2 +- ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-service-tpu.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-service-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-service-tpu.yaml index a9e57033a1..df90c0bf5e 100644 --- a/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-service-tpu.yaml +++ b/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-service-tpu.yaml @@ -26,7 +26,7 @@ spec: - name: VLLMDeployment num_replicas: 1 runtime_env: - working_dir: "https://github.com/ryanaoleary/kubernetes-engine-samples/archive/refs/heads/multihost-example.zip" + working_dir: "https://github.com/GoogleCloudPlatform/kubernetes-engine-samples/archive/main.zip" env_vars: MODEL_ID: "meta-llama/Meta-Llama-3-8B-Instruct" TPU_CHIPS: "8" diff --git a/ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-service-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-service-tpu.yaml index b6406df6f6..dd48a67ccc 100644 --- a/ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-service-tpu.yaml +++ b/ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-service-tpu.yaml @@ -26,7 +26,7 @@ spec: - name: VLLMDeployment num_replicas: 1 runtime_env: - working_dir: "https://github.com/ryanaoleary/kubernetes-engine-samples/archive/refs/heads/multihost-example.zip" + working_dir: "htthttps://github.com/GoogleCloudPlatform/kubernetes-engine-samples/archive/main.zip" env_vars: MODEL_ID: "meta-llama/Meta-Llama-3.1-70B" TPU_CHIPS: "8" From 
37f028079f92370c62e1988fc3b1d4343d52644e Mon Sep 17 00:00:00 2001 From: Ryan O'Leary Date: Fri, 4 Oct 2024 00:15:36 +0000 Subject: [PATCH 05/16] Set max model length Signed-off-by: Ryan O'Leary --- ai-ml/gke-ray/rayserve/llm/serve_tpu.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ai-ml/gke-ray/rayserve/llm/serve_tpu.py b/ai-ml/gke-ray/rayserve/llm/serve_tpu.py index d4d1a5624e..65d7a4c039 100644 --- a/ai-ml/gke-ray/rayserve/llm/serve_tpu.py +++ b/ai-ml/gke-ray/rayserve/llm/serve_tpu.py @@ -42,6 +42,7 @@ def __init__( self.llm = LLM( model=os.environ['MODEL_ID'], tensor_parallel_size=num_tpu_chips, + max_model_len=1024, enforce_eager=True, ) From 11c7ee6b7ab7693ee26efca22cc7ca3ec4e8c4ad Mon Sep 17 00:00:00 2001 From: Ryan O'Leary Date: Fri, 4 Oct 2024 02:48:25 +0000 Subject: [PATCH 06/16] pass in max model len as env var Signed-off-by: Ryan O'Leary --- ai-ml/gke-ray/rayserve/llm/serve_tpu.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ai-ml/gke-ray/rayserve/llm/serve_tpu.py b/ai-ml/gke-ray/rayserve/llm/serve_tpu.py index 65d7a4c039..1ac693d520 100644 --- a/ai-ml/gke-ray/rayserve/llm/serve_tpu.py +++ b/ai-ml/gke-ray/rayserve/llm/serve_tpu.py @@ -42,7 +42,7 @@ def __init__( self.llm = LLM( model=os.environ['MODEL_ID'], tensor_parallel_size=num_tpu_chips, - max_model_len=1024, + max_model_len=int(os.environ.get('MAX_MODEL_LEN')), enforce_eager=True, ) @@ -50,11 +50,12 @@ def __init__( async def generate(self, request: Request): request_dict = await request.json() prompts = request_dict.pop("prompt") + max_toks = int(request_dict.pop("max_tokens")) print("Processing prompt ", prompts) sampling_params = SamplingParams(temperature=0.7, top_p=1.0, n=1, - max_tokens=1000) + max_tokens=max_toks) outputs = self.llm.generate(prompts, sampling_params) for output in outputs: From deeee5653cc072206b96015039c5d9fd663d3f63 Mon Sep 17 00:00:00 2001 From: Ryan O'Leary Date: Mon, 7 Oct 2024 19:20:48 +0000 Subject: [PATCH 07/16] Add Ray CR manifests and fix inference script Signed-off-by: Ryan O'Leary --- ...uster-tpu.yaml => ray-cluster-v4-tpu.yaml} | 16 +-- .../llama-3-8b-it/ray-cluster-v5e-tpu.yaml | 96 ++++++++++++++++ ...rvice-tpu.yaml => ray-service-v4-tpu.yaml} | 14 ++- .../llama-3-8b-it/ray-service-v5e-tpu.yaml | 100 +++++++++++++++++ ...uster-tpu.yaml => ray-cluster-v4-tpu.yaml} | 28 +++-- .../llama-3.1-70b/ray-cluster-v5e-tpu.yaml | 100 +++++++++++++++++ ...rvice-tpu.yaml => ray-service-v4-tpu.yaml} | 29 +++-- .../llama-3.1-70b/ray-service-v5e-tpu.yaml | 103 ++++++++++++++++++ .../llm/llava-1.5-13b/ray-cluster-v4-tpu.yaml | 100 +++++++++++++++++ .../llava-1.5-13b/ray-cluster-v5e-tpu.yaml | 100 +++++++++++++++++ .../llm/llava-1.5-13b/ray-service-v4-tpu.yaml | 103 ++++++++++++++++++ .../llava-1.5-13b/ray-service-v5e-tpu.yaml | 103 ++++++++++++++++++ .../llm/mistral-7b/ray-cluster-v4-tpu.yaml | 100 +++++++++++++++++ .../llm/mistral-7b/ray-cluster-v5e-tpu.yaml | 100 +++++++++++++++++ .../llm/mistral-7b/ray-service-v4-tpu.yaml | 103 ++++++++++++++++++ .../llm/mistral-7b/ray-service-v5e-tpu.yaml | 103 ++++++++++++++++++ ai-ml/gke-ray/rayserve/llm/serve_tpu.py | 26 ++++- 17 files changed, 1285 insertions(+), 39 deletions(-) rename ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/{ray-cluster-tpu.yaml => ray-cluster-v4-tpu.yaml} (90%) create mode 100644 ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-cluster-v5e-tpu.yaml rename ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/{ray-service-tpu.yaml => ray-service-v4-tpu.yaml} (92%) create mode 100644 
ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-service-v5e-tpu.yaml rename ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/{ray-cluster-tpu.yaml => ray-cluster-v4-tpu.yaml} (83%) create mode 100644 ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-cluster-v5e-tpu.yaml rename ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/{ray-service-tpu.yaml => ray-service-v4-tpu.yaml} (81%) create mode 100644 ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-service-v5e-tpu.yaml create mode 100644 ai-ml/gke-ray/rayserve/llm/llava-1.5-13b/ray-cluster-v4-tpu.yaml create mode 100644 ai-ml/gke-ray/rayserve/llm/llava-1.5-13b/ray-cluster-v5e-tpu.yaml create mode 100644 ai-ml/gke-ray/rayserve/llm/llava-1.5-13b/ray-service-v4-tpu.yaml create mode 100644 ai-ml/gke-ray/rayserve/llm/llava-1.5-13b/ray-service-v5e-tpu.yaml create mode 100644 ai-ml/gke-ray/rayserve/llm/mistral-7b/ray-cluster-v4-tpu.yaml create mode 100644 ai-ml/gke-ray/rayserve/llm/mistral-7b/ray-cluster-v5e-tpu.yaml create mode 100644 ai-ml/gke-ray/rayserve/llm/mistral-7b/ray-service-v4-tpu.yaml create mode 100644 ai-ml/gke-ray/rayserve/llm/mistral-7b/ray-service-v5e-tpu.yaml diff --git a/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-cluster-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-cluster-v4-tpu.yaml similarity index 90% rename from ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-cluster-tpu.yaml rename to ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-cluster-v4-tpu.yaml index b40a79cfb5..eb54cc59e8 100644 --- a/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-cluster-tpu.yaml +++ b/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-cluster-v4-tpu.yaml @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -# [START gke_ai_ml_gke_ray_rayserve_llm_llama_3_8b_it_raycluster_tpu] +# [START gke_ai_ml_gke_ray_rayserve_llm_llama_3_8b_it_raycluster_tpu_v4] apiVersion: ray.io/v1 kind: RayCluster metadata: @@ -73,22 +73,24 @@ spec: limits: cpu: "100" google.com/tpu: "4" - ephemeral-storage: 150Gi - memory: 200G + ephemeral-storage: 50Gi + memory: 100G requests: cpu: "100" google.com/tpu: "4" - ephemeral-storage: 150Gi - memory: 200G + ephemeral-storage: 50Gi + memory: 100G env: + - name: JAX_PLATFORMS + value: "tpu" - name: HUGGING_FACE_HUB_TOKEN valueFrom: secretKeyRef: name: hf-secret key: hf_api_token - name: MODEL_ID - value: "meta-llama/Llama-3-8B-Instruct" + value: "meta-llama/Meta-Llama-3-8B-Instruct" nodeSelector: cloud.google.com/gke-tpu-accelerator: tpu-v4-podslice cloud.google.com/gke-tpu-topology: 2x2x2 -# [END gke_ai_ml_gke_ray_rayserve_llm_llama_3_8b_it_raycluster_tpu] +# [END gke_ai_ml_gke_ray_rayserve_llm_llama_3_8b_it_raycluster_tpu_v4] diff --git a/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-cluster-v5e-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-cluster-v5e-tpu.yaml new file mode 100644 index 0000000000..48f510296e --- /dev/null +++ b/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-cluster-v5e-tpu.yaml @@ -0,0 +1,96 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
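+# The v5e equivalent of the 8-chip v4 slice above: a
+# tpu-v5-lite-podslice with 2x4 topology is also served by two hosts
+# with four chips each. JAX_PLATFORMS is set to "tpu" on the workers,
+# restricting JAX to the TPU backend.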
+ +# [START gke_ai_ml_gke_ray_rayserve_llm_llama_3_8b_it_raycluster_tpu_v5e] +apiVersion: ray.io/v1 +kind: RayCluster +metadata: + name: vllm-tpu +spec: + headGroupSpec: + rayStartParams: {} + template: + spec: + containers: + - name: ray-head + image: $VLLM_IMAGE + imagePullPolicy: IfNotPresent + resources: + limits: + cpu: "8" + ephemeral-storage: 5Gi + memory: 40G + requests: + cpu: "8" + ephemeral-storage: 5Gi + memory: 40G + env: + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-secret + key: hf_api_token + - name: MODEL_ID + value: "meta-llama/Meta-Llama-3-8B-Instruct" + ports: + - containerPort: 6379 + name: gcs + - containerPort: 8265 + name: dashboard + - containerPort: 10001 + name: client + - containerPort: 8000 + name: serve + - containerPort: 8471 + name: slicebuilder + - containerPort: 8081 + name: mxla + workerGroupSpecs: + - groupName: tpu-group + replicas: 1 + minReplicas: 0 + maxReplicas: 1 + numOfHosts: 2 + rayStartParams: {} + template: + spec: + containers: + - name: ray-worker + image: $VLLM_IMAGE + imagePullPolicy: IfNotPresent + resources: + limits: + cpu: "100" + google.com/tpu: "4" + ephemeral-storage: 50Gi + memory: 100G + requests: + cpu: "100" + google.com/tpu: "4" + ephemeral-storage: 50Gi + memory: 100G + env: + - name: JAX_PLATFORMS + value: "tpu" + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-secret + key: hf_api_token + - name: MODEL_ID + value: "meta-llama/Meta-Llama-3-8B-Instruct" + nodeSelector: + cloud.google.com/gke-tpu-accelerator: tpu-v5-lite-podslice + cloud.google.com/gke-tpu-topology: 2x4 +# [END gke_ai_ml_gke_ray_rayserve_llm_llama_3_8b_it_raycluster_tpu_v5e] diff --git a/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-service-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-service-v4-tpu.yaml similarity index 92% rename from ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-service-tpu.yaml rename to ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-service-v4-tpu.yaml index df90c0bf5e..6e95df89f7 100644 --- a/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-service-tpu.yaml +++ b/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-service-v4-tpu.yaml @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-# [START gke_ai_ml_gke_ray_rayserve_llm_llama_3_8b_it_rayservice_tpu] +# [START gke_ai_ml_gke_ray_rayserve_llm_llama_3_8b_it_rayservice_tpu_v4] apiVersion: ray.io/v1 kind: RayService metadata: @@ -38,7 +38,7 @@ spec: spec: containers: - name: ray-head - image: ryanaoleary/vllm:tpu + image: $VLLM_IMAGE imagePullPolicy: IfNotPresent ports: - containerPort: 6379 @@ -73,20 +73,22 @@ spec: spec: containers: - name: ray-worker - image: ryanaoleary/vllm:tpu + image: $VLLM_IMAGE imagePullPolicy: IfNotPresent resources: limits: cpu: "100" google.com/tpu: "4" - ephemeral-storage: 150Gi + ephemeral-storage: 50Gi memory: 100G requests: cpu: "100" google.com/tpu: "4" - ephemeral-storage: 150Gi + ephemeral-storage: 50Gi memory: 100G env: + - name: JAX_PLATFORMS + value: "tpu" - name: HUGGING_FACE_HUB_TOKEN valueFrom: secretKeyRef: @@ -95,4 +97,4 @@ spec: nodeSelector: cloud.google.com/gke-tpu-accelerator: tpu-v4-podslice cloud.google.com/gke-tpu-topology: 2x2x2 -# [END gke_ai_ml_gke_ray_rayserve_llm_llama_3_8b_it_rayservice_tpu] +# [END gke_ai_ml_gke_ray_rayserve_llm_llama_3_8b_it_rayservice_tpu_v4] diff --git a/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-service-v5e-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-service-v5e-tpu.yaml new file mode 100644 index 0000000000..70aaf5938c --- /dev/null +++ b/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-service-v5e-tpu.yaml @@ -0,0 +1,100 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
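+# A minimal client sketch for this service, assuming the Serve port
+# (8000) has been forwarded to localhost; the request shape matches
+# serve_tpu.py's /v1/generate handler, which requires "max_tokens"
+# as of patch 06:
+#
+#   import requests
+#
+#   resp = requests.post(
+#       "http://localhost:8000/v1/generate",
+#       json={"prompt": "What is a TPU?", "max_tokens": 128},
+#   )
+#   print(resp.json()["text"])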
+ +# [START gke_ai_ml_gke_ray_rayserve_llm_llama_3_8b_it_rayservice_tpu_v5e] +apiVersion: ray.io/v1 +kind: RayService +metadata: + name: vllm-tpu +spec: + serveConfigV2: | + applications: + - name: llm + import_path: ai-ml.gke-ray.rayserve.llm.serve_tpu:model + deployments: + - name: VLLMDeployment + num_replicas: 1 + runtime_env: + working_dir: "https://github.com/GoogleCloudPlatform/kubernetes-engine-samples/archive/main.zip" + env_vars: + MODEL_ID: "meta-llama/Meta-Llama-3-8B-Instruct" + TPU_CHIPS: "8" + rayClusterConfig: + rayVersion: 2.34.0 + headGroupSpec: + rayStartParams: {} + template: + spec: + containers: + - name: ray-head + image: $VLLM_IMAGE + imagePullPolicy: IfNotPresent + ports: + - containerPort: 6379 + name: gcs + - containerPort: 8265 + name: dashboard + - containerPort: 10001 + name: client + - containerPort: 8000 + name: serve + env: + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-secret + key: hf_api_token + resources: + limits: + cpu: "8" + memory: 40G + requests: + cpu: "8" + memory: 40G + workerGroupSpecs: + - groupName: tpu-group + replicas: 1 + minReplicas: 0 + maxReplicas: 2 + numOfHosts: 2 + rayStartParams: {} + template: + spec: + containers: + - name: ray-worker + image: $VLLM_IMAGE + imagePullPolicy: IfNotPresent + resources: + limits: + cpu: "100" + google.com/tpu: "4" + ephemeral-storage: 50Gi + memory: 100G + requests: + cpu: "100" + google.com/tpu: "4" + ephemeral-storage: 50Gi + memory: 100G + env: + - name: JAX_PLATFORMS + value: "tpu" + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-secret + key: hf_api_token + nodeSelector: + cloud.google.com/gke-tpu-accelerator: tpu-v5-lite-podslice + cloud.google.com/gke-tpu-topology: 2x4 +# [END gke_ai_ml_gke_ray_rayserve_llm_llama_3_8b_it_rayservice_tpu_v5e] diff --git a/ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-cluster-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-cluster-v4-tpu.yaml similarity index 83% rename from ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-cluster-tpu.yaml rename to ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-cluster-v4-tpu.yaml index 9198f6b393..ea469084e9 100644 --- a/ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-cluster-tpu.yaml +++ b/ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-cluster-v4-tpu.yaml @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-# [START gke_ai_ml_gke_ray_rayserve_llm_llama_3_1_70b_raycluster_tpu] +# [START gke_ai_ml_gke_ray_rayserve_llm_llama_3_1_70b_raycluster_tpu_v4] apiVersion: ray.io/v1 kind: RayCluster metadata: @@ -29,11 +29,11 @@ spec: resources: limits: cpu: "8" - ephemeral-storage: 5Gi + ephemeral-storage: 8Gi memory: 40G requests: cpu: "8" - ephemeral-storage: 5Gi + ephemeral-storage: 8Gi memory: 40G env: - name: HUGGING_FACE_HUB_TOKEN @@ -43,6 +43,8 @@ spec: key: hf_api_token - name: MODEL_ID value: "meta-llama/Llama-3.1-70B" + - name: MAX_MODEL_LEN + value: "4096" ports: - containerPort: 6379 name: gcs @@ -60,8 +62,8 @@ spec: - groupName: tpu-group replicas: 1 minReplicas: 0 - maxReplicas: 1 - numOfHosts: 2 + maxReplicas: 2 + numOfHosts: 4 rayStartParams: {} template: spec: @@ -73,14 +75,16 @@ spec: limits: cpu: "100" google.com/tpu: "4" - ephemeral-storage: 150Gi - memory: 200G + ephemeral-storage: 200Gi + memory: 150G requests: cpu: "100" google.com/tpu: "4" - ephemeral-storage: 150Gi - memory: 200G + ephemeral-storage: 200Gi + memory: 150G env: + - name: JAX_PLATFORMS + value: "tpu" - name: HUGGING_FACE_HUB_TOKEN valueFrom: secretKeyRef: @@ -88,7 +92,9 @@ spec: key: hf_api_token - name: MODEL_ID value: "meta-llama/Llama-3.1-70B" + - name: MAX_MODEL_LEN + value: "4096" nodeSelector: cloud.google.com/gke-tpu-accelerator: tpu-v4-podslice - cloud.google.com/gke-tpu-topology: 2x2x2 -# [END gke_ai_ml_gke_ray_rayserve_llm_llama_3_1_70b_raycluster_tpu] + cloud.google.com/gke-tpu-topology: 2x2x4 +# [END gke_ai_ml_gke_ray_rayserve_llm_llama_3_1_70b_raycluster_tpu_v4] diff --git a/ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-cluster-v5e-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-cluster-v5e-tpu.yaml new file mode 100644 index 0000000000..d54cb0985b --- /dev/null +++ b/ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-cluster-v5e-tpu.yaml @@ -0,0 +1,100 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
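+# Serving the 70B model on v5e uses a 16-chip 4x4 slice: four hosts
+# with four chips each. MAX_MODEL_LEN: "4096" is forwarded to vLLM's
+# max_model_len, which caps the model's context length.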
+ +# [START gke_ai_ml_gke_ray_rayserve_llm_llama_3_1_70b_raycluster_tpu_v5e] +apiVersion: ray.io/v1 +kind: RayCluster +metadata: + name: vllm-tpu +spec: + headGroupSpec: + rayStartParams: {} + template: + spec: + containers: + - name: ray-head + image: $VLLM_IMAGE + imagePullPolicy: IfNotPresent + resources: + limits: + cpu: "8" + ephemeral-storage: 8Gi + memory: 40G + requests: + cpu: "8" + ephemeral-storage: 8Gi + memory: 40G + env: + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-secret + key: hf_api_token + - name: MODEL_ID + value: "meta-llama/Llama-3.1-70B" + - name: MAX_MODEL_LEN + value: "4096" + ports: + - containerPort: 6379 + name: gcs + - containerPort: 8265 + name: dashboard + - containerPort: 10001 + name: client + - containerPort: 8000 + name: serve + - containerPort: 8471 + name: slicebuilder + - containerPort: 8081 + name: mxla + workerGroupSpecs: + - groupName: tpu-group + replicas: 1 + minReplicas: 0 + maxReplicas: 2 + numOfHosts: 4 + rayStartParams: {} + template: + spec: + containers: + - name: ray-worker + image: $VLLM_IMAGE + imagePullPolicy: IfNotPresent + resources: + limits: + cpu: "100" + google.com/tpu: "4" + ephemeral-storage: 200Gi + memory: 150G + requests: + cpu: "100" + google.com/tpu: "4" + ephemeral-storage: 200Gi + memory: 150G + env: + - name: JAX_PLATFORMS + value: "tpu" + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-secret + key: hf_api_token + - name: MODEL_ID + value: "meta-llama/Llama-3.1-70B" + - name: MAX_MODEL_LEN + value: "4096" + nodeSelector: + cloud.google.com/gke-tpu-accelerator: tpu-v5-lite-podslice + cloud.google.com/gke-tpu-topology: 4x4 +# [END gke_ai_ml_gke_ray_rayserve_llm_llama_3_1_70b_raycluster_tpu_v5e] diff --git a/ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-service-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-service-v4-tpu.yaml similarity index 81% rename from ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-service-tpu.yaml rename to ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-service-v4-tpu.yaml index dd48a67ccc..04fdb8056c 100644 --- a/ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-service-tpu.yaml +++ b/ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-service-v4-tpu.yaml @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-# [START gke_ai_ml_gke_ray_rayserve_llm_llama_3_1_70b_rayservice_tpu] +# [START gke_ai_ml_gke_ray_rayserve_llm_llama_3_1_70b_rayservice_tpu_v4] apiVersion: ray.io/v1 kind: RayService metadata: @@ -26,10 +26,11 @@ spec: - name: VLLMDeployment num_replicas: 1 runtime_env: - working_dir: "htthttps://github.com/GoogleCloudPlatform/kubernetes-engine-samples/archive/main.zip" + working_dir: "https://github.com/GoogleCloudPlatform/kubernetes-engine-samples/archive/main.zip" env_vars: MODEL_ID: "meta-llama/Meta-Llama-3.1-70B" - TPU_CHIPS: "8" + MAX_MODEL_LEN: "4096" + TPU_CHIPS: "16" rayClusterConfig: rayVersion: 2.34.0 headGroupSpec: @@ -38,7 +39,7 @@ spec: spec: containers: - name: ray-head - image: ryanaoleary/vllm:tpu + image: $VLLM_IMAGE imagePullPolicy: IfNotPresent ports: - containerPort: 6379 @@ -58,35 +59,39 @@ spec: resources: limits: cpu: "8" + ephemeral-storage: 8Gi memory: 40G requests: cpu: "8" + ephemeral-storage: 8Gi memory: 40G workerGroupSpecs: - groupName: tpu-group replicas: 1 minReplicas: 0 maxReplicas: 2 - numOfHosts: 2 + numOfHosts: 4 rayStartParams: {} template: spec: containers: - name: ray-worker - image: ryanaoleary/vllm:tpu + image: $VLLM_IMAGE imagePullPolicy: IfNotPresent resources: limits: cpu: "100" google.com/tpu: "4" - ephemeral-storage: 150Gi - memory: 100G + ephemeral-storage: 200Gi + memory: 150G requests: cpu: "100" google.com/tpu: "4" - ephemeral-storage: 150Gi - memory: 100G + ephemeral-storage: 200Gi + memory: 150G env: + - name: JAX_PLATFORMS + value: "tpu" - name: HUGGING_FACE_HUB_TOKEN valueFrom: secretKeyRef: @@ -94,5 +99,5 @@ spec: key: hf_api_token nodeSelector: cloud.google.com/gke-tpu-accelerator: tpu-v4-podslice - cloud.google.com/gke-tpu-topology: 2x2x2 -# [END gke_ai_ml_gke_ray_rayserve_llm_llama_3_1_70b_rayservice_tpu] + cloud.google.com/gke-tpu-topology: 2x2x4 +# [END gke_ai_ml_gke_ray_rayserve_llm_llama_3_1_70b_rayservice_tpu_v4] diff --git a/ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-service-v5e-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-service-v5e-tpu.yaml new file mode 100644 index 0000000000..0d05821316 --- /dev/null +++ b/ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-service-v5e-tpu.yaml @@ -0,0 +1,103 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
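+# TPU_CHIPS: "16" matches numOfHosts: 4 x 4 chips per worker;
+# serve_tpu.py falls back to this value when the TPU workers are
+# scaled to zero and Ray reports no "TPU" resource to detect.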
+ +# [START gke_ai_ml_gke_ray_rayserve_llm_llama_3_1_70b_rayservice_tpu_v5e] +apiVersion: ray.io/v1 +kind: RayService +metadata: + name: vllm-tpu +spec: + serveConfigV2: | + applications: + - name: llm + import_path: ai-ml.gke-ray.rayserve.llm.serve_tpu:model + deployments: + - name: VLLMDeployment + num_replicas: 1 + runtime_env: + working_dir: "https://github.com/GoogleCloudPlatform/kubernetes-engine-samples/archive/main.zip" + env_vars: + MODEL_ID: "meta-llama/Meta-Llama-3.1-70B" + MAX_MODEL_LEN: "4096" + TPU_CHIPS: "16" + rayClusterConfig: + rayVersion: 2.34.0 + headGroupSpec: + rayStartParams: {} + template: + spec: + containers: + - name: ray-head + image: $VLLM_IMAGE + imagePullPolicy: IfNotPresent + ports: + - containerPort: 6379 + name: gcs + - containerPort: 8265 + name: dashboard + - containerPort: 10001 + name: client + - containerPort: 8000 + name: serve + env: + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-secret + key: hf_api_token + resources: + limits: + cpu: "8" + ephemeral-storage: 8Gi + memory: 40G + requests: + cpu: "8" + ephemeral-storage: 8Gi + memory: 40G + workerGroupSpecs: + - groupName: tpu-group + replicas: 1 + minReplicas: 0 + maxReplicas: 2 + numOfHosts: 4 + rayStartParams: {} + template: + spec: + containers: + - name: ray-worker + image: $VLLM_IMAGE + imagePullPolicy: IfNotPresent + resources: + limits: + cpu: "100" + google.com/tpu: "4" + ephemeral-storage: 200Gi + memory: 150G + requests: + cpu: "100" + google.com/tpu: "4" + ephemeral-storage: 200Gi + memory: 150G + env: + - name: JAX_PLATFORMS + value: "tpu" + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-secret + key: hf_api_token + nodeSelector: + cloud.google.com/gke-tpu-accelerator: tpu-v5-lite-podslice + cloud.google.com/gke-tpu-topology: 4x4 +# [END gke_ai_ml_gke_ray_rayserve_llm_llama_3_1_70b_rayservice_tpu_v5e] diff --git a/ai-ml/gke-ray/rayserve/llm/llava-1.5-13b/ray-cluster-v4-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/llava-1.5-13b/ray-cluster-v4-tpu.yaml new file mode 100644 index 0000000000..76b95c4b4d --- /dev/null +++ b/ai-ml/gke-ray/rayserve/llm/llava-1.5-13b/ray-cluster-v4-tpu.yaml @@ -0,0 +1,100 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
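+# LLaVA is served with DTYPE: "bfloat16", which serve_tpu.py forwards
+# to vLLM's dtype argument (the script defaults to "auto" when the
+# variable is unset).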
+ +# [START gke_ai_ml_gke_ray_rayserve_llm_llava_1_5_13b_raycluster_tpu_v4] +apiVersion: ray.io/v1 +kind: RayCluster +metadata: + name: vllm-tpu +spec: + headGroupSpec: + rayStartParams: {} + template: + spec: + containers: + - name: ray-head + image: $VLLM_IMAGE + imagePullPolicy: IfNotPresent + resources: + limits: + cpu: "8" + ephemeral-storage: 8Gi + memory: 40G + requests: + cpu: "8" + ephemeral-storage: 8Gi + memory: 40G + env: + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-secret + key: hf_api_token + - name: MODEL_ID + value: "llava-hf/llava-1.5-13b-hf" + - name: DTYPE + value: "bfloat16" + ports: + - containerPort: 6379 + name: gcs + - containerPort: 8265 + name: dashboard + - containerPort: 10001 + name: client + - containerPort: 8000 + name: serve + - containerPort: 8471 + name: slicebuilder + - containerPort: 8081 + name: mxla + workerGroupSpecs: + - groupName: tpu-group + replicas: 1 + minReplicas: 0 + maxReplicas: 2 + numOfHosts: 2 + rayStartParams: {} + template: + spec: + containers: + - name: ray-worker + image: $VLLM_IMAGE + imagePullPolicy: IfNotPresent + resources: + limits: + cpu: "100" + google.com/tpu: "4" + ephemeral-storage: 200Gi + memory: 150G + requests: + cpu: "100" + google.com/tpu: "4" + ephemeral-storage: 200Gi + memory: 150G + env: + - name: JAX_PLATFORMS + value: "tpu" + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-secret + key: hf_api_token + - name: MODEL_ID + value: "llava-hf/llava-1.5-13b-hf" + - name: DTYPE + value: "bfloat16" + nodeSelector: + cloud.google.com/gke-tpu-accelerator: tpu-v4-podslice + cloud.google.com/gke-tpu-topology: 2x2x2 +# [END gke_ai_ml_gke_ray_rayserve_llm_llava_1_5_13b_raycluster_tpu_v4] diff --git a/ai-ml/gke-ray/rayserve/llm/llava-1.5-13b/ray-cluster-v5e-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/llava-1.5-13b/ray-cluster-v5e-tpu.yaml new file mode 100644 index 0000000000..1aad70911b --- /dev/null +++ b/ai-ml/gke-ray/rayserve/llm/llava-1.5-13b/ray-cluster-v5e-tpu.yaml @@ -0,0 +1,100 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# [START gke_ai_ml_gke_ray_rayserve_llm_llava_1_5_13b_raycluster_tpu_v5e] +apiVersion: ray.io/v1 +kind: RayCluster +metadata: + name: vllm-tpu +spec: + headGroupSpec: + rayStartParams: {} + template: + spec: + containers: + - name: ray-head + image: $VLLM_IMAGE + imagePullPolicy: IfNotPresent + resources: + limits: + cpu: "8" + ephemeral-storage: 8Gi + memory: 40G + requests: + cpu: "8" + ephemeral-storage: 8Gi + memory: 40G + env: + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-secret + key: hf_api_token + - name: MODEL_ID + value: "llava-hf/llava-1.5-13b-hf" + - name: DTYPE + value: "bfloat16" + ports: + - containerPort: 6379 + name: gcs + - containerPort: 8265 + name: dashboard + - containerPort: 10001 + name: client + - containerPort: 8000 + name: serve + - containerPort: 8471 + name: slicebuilder + - containerPort: 8081 + name: mxla + workerGroupSpecs: + - groupName: tpu-group + replicas: 1 + minReplicas: 0 + maxReplicas: 2 + numOfHosts: 2 + rayStartParams: {} + template: + spec: + containers: + - name: ray-worker + image: $VLLM_IMAGE + imagePullPolicy: IfNotPresent + resources: + limits: + cpu: "100" + google.com/tpu: "4" + ephemeral-storage: 200Gi + memory: 150G + requests: + cpu: "100" + google.com/tpu: "4" + ephemeral-storage: 200Gi + memory: 150G + env: + - name: JAX_PLATFORMS + value: "tpu" + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-secret + key: hf_api_token + - name: MODEL_ID + value: "llava-hf/llava-1.5-13b-hf" + - name: DTYPE + value: "bfloat16" + nodeSelector: + cloud.google.com/gke-tpu-accelerator: tpu-v5-lite-podslice + cloud.google.com/gke-tpu-topology: 2x4 +# [END gke_ai_ml_gke_ray_rayserve_llm_llava_1_5_13b_raycluster_tpu_v5e] diff --git a/ai-ml/gke-ray/rayserve/llm/llava-1.5-13b/ray-service-v4-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/llava-1.5-13b/ray-service-v4-tpu.yaml new file mode 100644 index 0000000000..e9cf9dec32 --- /dev/null +++ b/ai-ml/gke-ray/rayserve/llm/llava-1.5-13b/ray-service-v4-tpu.yaml @@ -0,0 +1,103 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# [START gke_ai_ml_gke_ray_rayserve_llm_llava_1_5_13b_rayservice_tpu_v4] +apiVersion: ray.io/v1 +kind: RayService +metadata: + name: vllm-tpu +spec: + serveConfigV2: | + applications: + - name: llm + import_path: ai-ml.gke-ray.rayserve.llm.serve_tpu:model + deployments: + - name: VLLMDeployment + num_replicas: 1 + runtime_env: + working_dir: "https://github.com/GoogleCloudPlatform/kubernetes-engine-samples/archive/main.zip" + env_vars: + MODEL_ID: "llava-hf/llava-1.5-13b-hf" + TPU_CHIPS: "8" + DTYPE: "bfloat16" + rayClusterConfig: + rayVersion: 2.34.0 + headGroupSpec: + rayStartParams: {} + template: + spec: + containers: + - name: ray-head + image: $VLLM_IMAGE + imagePullPolicy: IfNotPresent + ports: + - containerPort: 6379 + name: gcs + - containerPort: 8265 + name: dashboard + - containerPort: 10001 + name: client + - containerPort: 8000 + name: serve + env: + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-secret + key: hf_api_token + resources: + limits: + cpu: "8" + ephemeral-storage: 8Gi + memory: 40G + requests: + cpu: "8" + ephemeral-storage: 8Gi + memory: 40G + workerGroupSpecs: + - groupName: tpu-group + replicas: 1 + minReplicas: 0 + maxReplicas: 2 + numOfHosts: 2 + rayStartParams: {} + template: + spec: + containers: + - name: ray-worker + image: $VLLM_IMAGE + imagePullPolicy: IfNotPresent + resources: + limits: + cpu: "100" + google.com/tpu: "4" + ephemeral-storage: 200Gi + memory: 150G + requests: + cpu: "100" + google.com/tpu: "4" + ephemeral-storage: 200Gi + memory: 150G + env: + - name: JAX_PLATFORMS + value: "tpu" + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-secret + key: hf_api_token + nodeSelector: + cloud.google.com/gke-tpu-accelerator: tpu-v4-podslice + cloud.google.com/gke-tpu-topology: 2x2x2 +# [END gke_ai_ml_gke_ray_rayserve_llm_llava_1_5_13b_rayservice_tpu_v4] diff --git a/ai-ml/gke-ray/rayserve/llm/llava-1.5-13b/ray-service-v5e-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/llava-1.5-13b/ray-service-v5e-tpu.yaml new file mode 100644 index 0000000000..ef1db10dee --- /dev/null +++ b/ai-ml/gke-ray/rayserve/llm/llava-1.5-13b/ray-service-v5e-tpu.yaml @@ -0,0 +1,103 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# [START gke_ai_ml_gke_ray_rayserve_llm_llava_1_5_13b_rayservice_tpu_v5e] +apiVersion: ray.io/v1 +kind: RayService +metadata: + name: vllm-tpu +spec: + serveConfigV2: | + applications: + - name: llm + import_path: ai-ml.gke-ray.rayserve.llm.serve_tpu:model + deployments: + - name: VLLMDeployment + num_replicas: 1 + runtime_env: + working_dir: "https://github.com/GoogleCloudPlatform/kubernetes-engine-samples/archive/main.zip" + env_vars: + MODEL_ID: "llava-hf/llava-1.5-13b-hf" + TPU_CHIPS: "8" + DTYPE: "bfloat16" + rayClusterConfig: + rayVersion: 2.34.0 + headGroupSpec: + rayStartParams: {} + template: + spec: + containers: + - name: ray-head + image: $VLLM_IMAGE + imagePullPolicy: IfNotPresent + ports: + - containerPort: 6379 + name: gcs + - containerPort: 8265 + name: dashboard + - containerPort: 10001 + name: client + - containerPort: 8000 + name: serve + env: + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-secret + key: hf_api_token + resources: + limits: + cpu: "8" + ephemeral-storage: 8Gi + memory: 40G + requests: + cpu: "8" + ephemeral-storage: 8Gi + memory: 40G + workerGroupSpecs: + - groupName: tpu-group + replicas: 1 + minReplicas: 0 + maxReplicas: 2 + numOfHosts: 2 + rayStartParams: {} + template: + spec: + containers: + - name: ray-worker + image: $VLLM_IMAGE + imagePullPolicy: IfNotPresent + resources: + limits: + cpu: "100" + google.com/tpu: "4" + ephemeral-storage: 200Gi + memory: 150G + requests: + cpu: "100" + google.com/tpu: "4" + ephemeral-storage: 200Gi + memory: 150G + env: + - name: JAX_PLATFORMS + value: "tpu" + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-secret + key: hf_api_token + nodeSelector: + cloud.google.com/gke-tpu-accelerator: tpu-v5-lite-podslice + cloud.google.com/gke-tpu-topology: 2x4 +# [END gke_ai_ml_gke_ray_rayserve_llm_llava_1_5_13b_rayservice_tpu_v5e] diff --git a/ai-ml/gke-ray/rayserve/llm/mistral-7b/ray-cluster-v4-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/mistral-7b/ray-cluster-v4-tpu.yaml new file mode 100644 index 0000000000..85165c6ca2 --- /dev/null +++ b/ai-ml/gke-ray/rayserve/llm/mistral-7b/ray-cluster-v4-tpu.yaml @@ -0,0 +1,100 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
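+# Mistral sets TOKENIZER_MODE: "mistral", which serve_tpu.py passes
+# through to vLLM's tokenizer_mode argument (defaulting to "auto"
+# when unset).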
+ +# [START gke_ai_ml_gke_ray_rayserve_mistral_7b_raycluster_tpu_v4] +apiVersion: ray.io/v1 +kind: RayCluster +metadata: + name: vllm-tpu +spec: + headGroupSpec: + rayStartParams: {} + template: + spec: + containers: + - name: ray-head + image: $VLLM_IMAGE + imagePullPolicy: IfNotPresent + resources: + limits: + cpu: "8" + ephemeral-storage: 8Gi + memory: 40G + requests: + cpu: "8" + ephemeral-storage: 8Gi + memory: 40G + env: + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-secret + key: hf_api_token + - name: MODEL_ID + value: "mistralai/Mistral-7B-Instruct-v0.3" + - name: TOKENIZER_MODE + value: "mistral" + ports: + - containerPort: 6379 + name: gcs + - containerPort: 8265 + name: dashboard + - containerPort: 10001 + name: client + - containerPort: 8000 + name: serve + - containerPort: 8471 + name: slicebuilder + - containerPort: 8081 + name: mxla + workerGroupSpecs: + - groupName: tpu-group + replicas: 1 + minReplicas: 0 + maxReplicas: 2 + numOfHosts: 2 + rayStartParams: {} + template: + spec: + containers: + - name: ray-worker + image: $VLLM_IMAGE + imagePullPolicy: IfNotPresent + resources: + limits: + cpu: "100" + google.com/tpu: "4" + ephemeral-storage: 50Gi + memory: 100G + requests: + cpu: "100" + google.com/tpu: "4" + ephemeral-storage: 50Gi + memory: 100G + env: + - name: JAX_PLATFORMS + value: "tpu" + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-secret + key: hf_api_token + - name: MODEL_ID + value: "mistralai/Mistral-7B-Instruct-v0.3" + - name: TOKENIZER_MODE + value: "mistral" + nodeSelector: + cloud.google.com/gke-tpu-accelerator: tpu-v4-podslice + cloud.google.com/gke-tpu-topology: 2x2x2 +# [END gke_ai_ml_gke_ray_rayserve_mistral_7b_raycluster_tpu_v4] diff --git a/ai-ml/gke-ray/rayserve/llm/mistral-7b/ray-cluster-v5e-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/mistral-7b/ray-cluster-v5e-tpu.yaml new file mode 100644 index 0000000000..cd0e3a1746 --- /dev/null +++ b/ai-ml/gke-ray/rayserve/llm/mistral-7b/ray-cluster-v5e-tpu.yaml @@ -0,0 +1,100 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# [START gke_ai_ml_gke_ray_rayserve_mistral_7b_raycluster_tpu_v5e] +apiVersion: ray.io/v1 +kind: RayCluster +metadata: + name: vllm-tpu +spec: + headGroupSpec: + rayStartParams: {} + template: + spec: + containers: + - name: ray-head + image: $VLLM_IMAGE + imagePullPolicy: IfNotPresent + resources: + limits: + cpu: "8" + ephemeral-storage: 8Gi + memory: 40G + requests: + cpu: "8" + ephemeral-storage: 8Gi + memory: 40G + env: + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-secret + key: hf_api_token + - name: MODEL_ID + value: "mistralai/Mistral-7B-Instruct-v0.3" + - name: TOKENIZER_MODE + value: "mistral" + ports: + - containerPort: 6379 + name: gcs + - containerPort: 8265 + name: dashboard + - containerPort: 10001 + name: client + - containerPort: 8000 + name: serve + - containerPort: 8471 + name: slicebuilder + - containerPort: 8081 + name: mxla + workerGroupSpecs: + - groupName: tpu-group + replicas: 1 + minReplicas: 0 + maxReplicas: 2 + numOfHosts: 2 + rayStartParams: {} + template: + spec: + containers: + - name: ray-worker + image: $VLLM_IMAGE + imagePullPolicy: IfNotPresent + resources: + limits: + cpu: "100" + google.com/tpu: "4" + ephemeral-storage: 50Gi + memory: 100G + requests: + cpu: "100" + google.com/tpu: "4" + ephemeral-storage: 50Gi + memory: 100G + env: + - name: JAX_PLATFORMS + value: "tpu" + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-secret + key: hf_api_token + - name: MODEL_ID + value: "mistralai/Mistral-7B-Instruct-v0.3" + - name: TOKENIZER_MODE + value: "mistral" + nodeSelector: + cloud.google.com/gke-tpu-accelerator: tpu-v5-lite-podslice + cloud.google.com/gke-tpu-topology: 2x4 +# [END gke_ai_ml_gke_ray_rayserve_mistral_7b_raycluster_tpu_v5e] diff --git a/ai-ml/gke-ray/rayserve/llm/mistral-7b/ray-service-v4-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/mistral-7b/ray-service-v4-tpu.yaml new file mode 100644 index 0000000000..65a3958b52 --- /dev/null +++ b/ai-ml/gke-ray/rayserve/llm/mistral-7b/ray-service-v4-tpu.yaml @@ -0,0 +1,103 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# [START gke_ai_ml_gke_ray_rayserve_llm_mistral_7b_rayservice_tpu_v4] +apiVersion: ray.io/v1 +kind: RayService +metadata: + name: vllm-tpu +spec: + serveConfigV2: | + applications: + - name: llm + import_path: ai-ml.gke-ray.rayserve.llm.serve_tpu:model + deployments: + - name: VLLMDeployment + num_replicas: 1 + runtime_env: + working_dir: "https://github.com/GoogleCloudPlatform/kubernetes-engine-samples/archive/main.zip" + env_vars: + MODEL_ID: "mistralai/Mistral-7B-Instruct-v0.3" + TPU_CHIPS: "8" + TOKENIZER_MODE: "mistral" + rayClusterConfig: + rayVersion: 2.34.0 + headGroupSpec: + rayStartParams: {} + template: + spec: + containers: + - name: ray-head + image: $VLLM_IMAGE + imagePullPolicy: IfNotPresent + ports: + - containerPort: 6379 + name: gcs + - containerPort: 8265 + name: dashboard + - containerPort: 10001 + name: client + - containerPort: 8000 + name: serve + env: + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-secret + key: hf_api_token + resources: + limits: + cpu: "8" + ephemeral-storage: 8Gi + memory: 40G + requests: + cpu: "8" + ephemeral-storage: 8Gi + memory: 40G + workerGroupSpecs: + - groupName: tpu-group + replicas: 1 + minReplicas: 0 + maxReplicas: 2 + numOfHosts: 2 + rayStartParams: {} + template: + spec: + containers: + - name: ray-worker + image: $VLLM_IMAGE + imagePullPolicy: IfNotPresent + resources: + limits: + cpu: "100" + google.com/tpu: "4" + ephemeral-storage: 50Gi + memory: 100G + requests: + cpu: "100" + google.com/tpu: "4" + ephemeral-storage: 50Gi + memory: 100G + env: + - name: JAX_PLATFORMS + value: "tpu" + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-secret + key: hf_api_token + nodeSelector: + cloud.google.com/gke-tpu-accelerator: tpu-v4-podslice + cloud.google.com/gke-tpu-topology: 2x2x2 +# [END gke_ai_ml_gke_ray_rayserve_llm_mistral_7b_rayservice_tpu_v4] diff --git a/ai-ml/gke-ray/rayserve/llm/mistral-7b/ray-service-v5e-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/mistral-7b/ray-service-v5e-tpu.yaml new file mode 100644 index 0000000000..184c99ccf1 --- /dev/null +++ b/ai-ml/gke-ray/rayserve/llm/mistral-7b/ray-service-v5e-tpu.yaml @@ -0,0 +1,103 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
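+#
+# Sizing note: the TPU_CHIPS value in serveConfigV2 should match the slice
+# capacity, i.e. numOfHosts x google.com/tpu per host (2 x 4 = 8 here). The
+# serve script falls back to this value when Ray's TPU resources can't be
+# auto-detected, e.g. while the worker group is still scaling up from zero.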
+ +# [START gke_ai_ml_gke_ray_rayserve_mistral_7b_rayservice_tpu_v5e] +apiVersion: ray.io/v1 +kind: RayService +metadata: + name: vllm-tpu +spec: + serveConfigV2: | + applications: + - name: llm + import_path: ai-ml.gke-ray.rayserve.llm.serve_tpu:model + deployments: + - name: VLLMDeployment + num_replicas: 1 + runtime_env: + working_dir: "https://github.com/GoogleCloudPlatform/kubernetes-engine-samples/archive/main.zip" + env_vars: + MODEL_ID: "mistralai/Mistral-7B-Instruct-v0.3" + TPU_CHIPS: "8" + TOKENIZER_MODE: "mistral" + rayClusterConfig: + rayVersion: 2.34.0 + headGroupSpec: + rayStartParams: {} + template: + spec: + containers: + - name: ray-head + image: $VLLM_IMAGE + imagePullPolicy: IfNotPresent + ports: + - containerPort: 6379 + name: gcs + - containerPort: 8265 + name: dashboard + - containerPort: 10001 + name: client + - containerPort: 8000 + name: serve + env: + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-secret + key: hf_api_token + resources: + limits: + cpu: "8" + ephemeral-storage: 8Gi + memory: 40G + requests: + cpu: "8" + ephemeral-storage: 8Gi + memory: 40G + workerGroupSpecs: + - groupName: tpu-group + replicas: 1 + minReplicas: 0 + maxReplicas: 2 + numOfHosts: 2 + rayStartParams: {} + template: + spec: + containers: + - name: ray-worker + image: $VLLM_IMAGE + imagePullPolicy: IfNotPresent + resources: + limits: + cpu: "100" + google.com/tpu: "4" + ephemeral-storage: 50Gi + memory: 100G + requests: + cpu: "100" + google.com/tpu: "4" + ephemeral-storage: 50Gi + memory: 100G + env: + - name: JAX_PLATFORMS + value: "tpu" + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-secret + key: hf_api_token + nodeSelector: + cloud.google.com/gke-tpu-accelerator: tpu-v5-lite-podslice + cloud.google.com/gke-tpu-topology: 2x4 +# [END gke_ai_ml_gke_ray_rayserve_mistral_7b_rayservice_tpu_v5e] diff --git a/ai-ml/gke-ray/rayserve/llm/serve_tpu.py b/ai-ml/gke-ray/rayserve/llm/serve_tpu.py index 1ac693d520..347a402e05 100644 --- a/ai-ml/gke-ray/rayserve/llm/serve_tpu.py +++ b/ai-ml/gke-ray/rayserve/llm/serve_tpu.py @@ -38,11 +38,16 @@ class VLLMDeployment: def __init__( self, num_tpu_chips, + max_model_len, + tokenizer_mode, + dtype, ): self.llm = LLM( - model=os.environ['MODEL_ID'], + model=os.environ['MODEL_ID'], # Error if not provided. tensor_parallel_size=num_tpu_chips, - max_model_len=int(os.environ.get('MAX_MODEL_LEN')), + max_model_len=max_model_len, + dtype=dtype, + tokenizer_mode=tokenizer_mode, enforce_eager=True, ) @@ -83,6 +88,21 @@ def get_num_tpu_chips() -> int: return 0 return int(ray.cluster_resources()["TPU"]) +def get_max_model_len() -> Optional[int]: + if 'MAX_MODEL_LEN' in os.environ: + return int(os.environ['MAX_MODEL_LEN']) + return None + +def get_tokenizer_mode() -> str: + if 'TOKENIZER_MODE' in os.environ: + return os.environ['TOKENIZER_MODE'] + return "auto" + +def get_dtype() -> str: + if 'DTYPE' in os.environ: + return os.environ['DTYPE'] + return "auto" + def build_app(cli_args: Dict[str, str]) -> serve.Application: """Builds the Serve app based on CLI arguments.""" ray.init(ignore_reinit_error=True) @@ -96,6 +116,6 @@ def build_app(cli_args: Dict[str, str]) -> serve.Application: # Use PACK strategy since the deployment may use more than one TPU node. 
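    # Illustrative bundle shape (assuming 8 chips): [{"CPU": 1}] followed by
    # eight {"CPU": 1, "TPU": 1} bundles, packed so the deployment replica and
    # its vLLM workers share the same TPU slice.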
return VLLMDeployment.options( placement_group_bundles=pg_resources, - placement_group_strategy="PACK").bind(num_tpu_chips) + placement_group_strategy="PACK").bind(num_tpu_chips, get_max_model_len(), get_tokenizer_mode(), get_dtype()) model = build_app({}) From b12ffb7476d5205a24dd8ce3b2877edce63eb80d Mon Sep 17 00:00:00 2001 From: Ryan O'Leary Date: Mon, 7 Oct 2024 19:22:27 +0000 Subject: [PATCH 08/16] Fix model id Signed-off-by: Ryan O'Leary --- .../gke-ray/rayserve/llm/llama-3-8b-it/ray-cluster-v4-tpu.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-cluster-v4-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-cluster-v4-tpu.yaml index eb54cc59e8..b53039590a 100644 --- a/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-cluster-v4-tpu.yaml +++ b/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-cluster-v4-tpu.yaml @@ -42,7 +42,7 @@ spec: name: hf-secret key: hf_api_token - name: MODEL_ID - value: "meta-llama/Llama-3-8B-Instruct" + value: "meta-llama/Meta-Llama-3-8B-Instruct" ports: - containerPort: 6379 name: gcs From cbce28377a55670af3dc62be8870994d5f760dde Mon Sep 17 00:00:00 2001 From: Ryan O'Leary Date: Mon, 7 Oct 2024 21:33:16 +0000 Subject: [PATCH 09/16] Don't specify mxla and slicebuilder ports Signed-off-by: Ryan O'Leary --- .../rayserve/llm/llama-3-8b-it/ray-cluster-v4-tpu.yaml | 4 ---- .../rayserve/llm/llama-3-8b-it/ray-cluster-v5e-tpu.yaml | 4 ---- .../rayserve/llm/llama-3.1-70b/ray-cluster-v4-tpu.yaml | 4 ---- .../rayserve/llm/llama-3.1-70b/ray-cluster-v5e-tpu.yaml | 4 ---- .../rayserve/llm/llava-1.5-13b/ray-cluster-v4-tpu.yaml | 4 ---- .../rayserve/llm/llava-1.5-13b/ray-cluster-v5e-tpu.yaml | 4 ---- ai-ml/gke-ray/rayserve/llm/mistral-7b/ray-cluster-v4-tpu.yaml | 4 ---- .../gke-ray/rayserve/llm/mistral-7b/ray-cluster-v5e-tpu.yaml | 4 ---- 8 files changed, 32 deletions(-) diff --git a/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-cluster-v4-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-cluster-v4-tpu.yaml index b53039590a..6e16216651 100644 --- a/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-cluster-v4-tpu.yaml +++ b/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-cluster-v4-tpu.yaml @@ -52,10 +52,6 @@ spec: name: client - containerPort: 8000 name: serve - - containerPort: 8471 - name: slicebuilder - - containerPort: 8081 - name: mxla workerGroupSpecs: - groupName: tpu-group replicas: 1 diff --git a/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-cluster-v5e-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-cluster-v5e-tpu.yaml index 48f510296e..f09d16a3db 100644 --- a/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-cluster-v5e-tpu.yaml +++ b/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-cluster-v5e-tpu.yaml @@ -52,10 +52,6 @@ spec: name: client - containerPort: 8000 name: serve - - containerPort: 8471 - name: slicebuilder - - containerPort: 8081 - name: mxla workerGroupSpecs: - groupName: tpu-group replicas: 1 diff --git a/ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-cluster-v4-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-cluster-v4-tpu.yaml index ea469084e9..9b3df8452a 100644 --- a/ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-cluster-v4-tpu.yaml +++ b/ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-cluster-v4-tpu.yaml @@ -54,10 +54,6 @@ spec: name: client - containerPort: 8000 name: serve - - containerPort: 8471 - name: slicebuilder - - containerPort: 8081 - name: mxla workerGroupSpecs: - groupName: tpu-group replicas: 1 diff --git a/ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-cluster-v5e-tpu.yaml 
b/ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-cluster-v5e-tpu.yaml index d54cb0985b..939a395d56 100644 --- a/ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-cluster-v5e-tpu.yaml +++ b/ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-cluster-v5e-tpu.yaml @@ -54,10 +54,6 @@ spec: name: client - containerPort: 8000 name: serve - - containerPort: 8471 - name: slicebuilder - - containerPort: 8081 - name: mxla workerGroupSpecs: - groupName: tpu-group replicas: 1 diff --git a/ai-ml/gke-ray/rayserve/llm/llava-1.5-13b/ray-cluster-v4-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/llava-1.5-13b/ray-cluster-v4-tpu.yaml index 76b95c4b4d..0d64de23ab 100644 --- a/ai-ml/gke-ray/rayserve/llm/llava-1.5-13b/ray-cluster-v4-tpu.yaml +++ b/ai-ml/gke-ray/rayserve/llm/llava-1.5-13b/ray-cluster-v4-tpu.yaml @@ -54,10 +54,6 @@ spec: name: client - containerPort: 8000 name: serve - - containerPort: 8471 - name: slicebuilder - - containerPort: 8081 - name: mxla workerGroupSpecs: - groupName: tpu-group replicas: 1 diff --git a/ai-ml/gke-ray/rayserve/llm/llava-1.5-13b/ray-cluster-v5e-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/llava-1.5-13b/ray-cluster-v5e-tpu.yaml index 1aad70911b..d7214d7a7c 100644 --- a/ai-ml/gke-ray/rayserve/llm/llava-1.5-13b/ray-cluster-v5e-tpu.yaml +++ b/ai-ml/gke-ray/rayserve/llm/llava-1.5-13b/ray-cluster-v5e-tpu.yaml @@ -54,10 +54,6 @@ spec: name: client - containerPort: 8000 name: serve - - containerPort: 8471 - name: slicebuilder - - containerPort: 8081 - name: mxla workerGroupSpecs: - groupName: tpu-group replicas: 1 diff --git a/ai-ml/gke-ray/rayserve/llm/mistral-7b/ray-cluster-v4-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/mistral-7b/ray-cluster-v4-tpu.yaml index 85165c6ca2..3f539689c8 100644 --- a/ai-ml/gke-ray/rayserve/llm/mistral-7b/ray-cluster-v4-tpu.yaml +++ b/ai-ml/gke-ray/rayserve/llm/mistral-7b/ray-cluster-v4-tpu.yaml @@ -54,10 +54,6 @@ spec: name: client - containerPort: 8000 name: serve - - containerPort: 8471 - name: slicebuilder - - containerPort: 8081 - name: mxla workerGroupSpecs: - groupName: tpu-group replicas: 1 diff --git a/ai-ml/gke-ray/rayserve/llm/mistral-7b/ray-cluster-v5e-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/mistral-7b/ray-cluster-v5e-tpu.yaml index cd0e3a1746..27983bd6eb 100644 --- a/ai-ml/gke-ray/rayserve/llm/mistral-7b/ray-cluster-v5e-tpu.yaml +++ b/ai-ml/gke-ray/rayserve/llm/mistral-7b/ray-cluster-v5e-tpu.yaml @@ -54,10 +54,6 @@ spec: name: client - containerPort: 8000 name: serve - - containerPort: 8471 - name: slicebuilder - - containerPort: 8081 - name: mxla workerGroupSpecs: - groupName: tpu-group replicas: 1 From aa21d2ca1c031e91a8a1b5ec192478cf19beb6a5 Mon Sep 17 00:00:00 2001 From: Ryan O'Leary Date: Mon, 7 Oct 2024 21:37:09 +0000 Subject: [PATCH 10/16] Pass env vars to runtime env Signed-off-by: Ryan O'Leary --- .../rayserve/llm/llama-3-8b-it/ray-cluster-v4-tpu.yaml | 4 ---- .../rayserve/llm/llama-3-8b-it/ray-cluster-v5e-tpu.yaml | 4 ---- .../rayserve/llm/llama-3.1-70b/ray-cluster-v4-tpu.yaml | 8 -------- .../rayserve/llm/llama-3.1-70b/ray-cluster-v5e-tpu.yaml | 8 -------- .../rayserve/llm/llava-1.5-13b/ray-cluster-v4-tpu.yaml | 8 -------- .../rayserve/llm/llava-1.5-13b/ray-cluster-v5e-tpu.yaml | 8 -------- .../rayserve/llm/mistral-7b/ray-cluster-v4-tpu.yaml | 8 -------- .../rayserve/llm/mistral-7b/ray-cluster-v5e-tpu.yaml | 8 -------- 8 files changed, 56 deletions(-) diff --git a/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-cluster-v4-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-cluster-v4-tpu.yaml index 6e16216651..1faecb6860 100644 --- 
a/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-cluster-v4-tpu.yaml +++ b/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-cluster-v4-tpu.yaml @@ -41,8 +41,6 @@ spec: secretKeyRef: name: hf-secret key: hf_api_token - - name: MODEL_ID - value: "meta-llama/Meta-Llama-3-8B-Instruct" ports: - containerPort: 6379 name: gcs @@ -84,8 +82,6 @@ spec: secretKeyRef: name: hf-secret key: hf_api_token - - name: MODEL_ID - value: "meta-llama/Meta-Llama-3-8B-Instruct" nodeSelector: cloud.google.com/gke-tpu-accelerator: tpu-v4-podslice cloud.google.com/gke-tpu-topology: 2x2x2 diff --git a/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-cluster-v5e-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-cluster-v5e-tpu.yaml index f09d16a3db..a41106e3aa 100644 --- a/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-cluster-v5e-tpu.yaml +++ b/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-cluster-v5e-tpu.yaml @@ -41,8 +41,6 @@ spec: secretKeyRef: name: hf-secret key: hf_api_token - - name: MODEL_ID - value: "meta-llama/Meta-Llama-3-8B-Instruct" ports: - containerPort: 6379 name: gcs @@ -84,8 +82,6 @@ spec: secretKeyRef: name: hf-secret key: hf_api_token - - name: MODEL_ID - value: "meta-llama/Meta-Llama-3-8B-Instruct" nodeSelector: cloud.google.com/gke-tpu-accelerator: tpu-v5-lite-podslice cloud.google.com/gke-tpu-topology: 2x4 diff --git a/ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-cluster-v4-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-cluster-v4-tpu.yaml index 9b3df8452a..b7b3f42643 100644 --- a/ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-cluster-v4-tpu.yaml +++ b/ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-cluster-v4-tpu.yaml @@ -41,10 +41,6 @@ spec: secretKeyRef: name: hf-secret key: hf_api_token - - name: MODEL_ID - value: "meta-llama/Llama-3.1-70B" - - name: MAX_MODEL_LEN - value: "4096" ports: - containerPort: 6379 name: gcs @@ -86,10 +82,6 @@ spec: secretKeyRef: name: hf-secret key: hf_api_token - - name: MODEL_ID - value: "meta-llama/Llama-3.1-70B" - - name: MAX_MODEL_LEN - value: "4096" nodeSelector: cloud.google.com/gke-tpu-accelerator: tpu-v4-podslice cloud.google.com/gke-tpu-topology: 2x2x4 diff --git a/ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-cluster-v5e-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-cluster-v5e-tpu.yaml index 939a395d56..7e5aa17b97 100644 --- a/ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-cluster-v5e-tpu.yaml +++ b/ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-cluster-v5e-tpu.yaml @@ -41,10 +41,6 @@ spec: secretKeyRef: name: hf-secret key: hf_api_token - - name: MODEL_ID - value: "meta-llama/Llama-3.1-70B" - - name: MAX_MODEL_LEN - value: "4096" ports: - containerPort: 6379 name: gcs @@ -86,10 +82,6 @@ spec: secretKeyRef: name: hf-secret key: hf_api_token - - name: MODEL_ID - value: "meta-llama/Llama-3.1-70B" - - name: MAX_MODEL_LEN - value: "4096" nodeSelector: cloud.google.com/gke-tpu-accelerator: tpu-v5-lite-podslice cloud.google.com/gke-tpu-topology: 4x4 diff --git a/ai-ml/gke-ray/rayserve/llm/llava-1.5-13b/ray-cluster-v4-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/llava-1.5-13b/ray-cluster-v4-tpu.yaml index 0d64de23ab..2ff2780fea 100644 --- a/ai-ml/gke-ray/rayserve/llm/llava-1.5-13b/ray-cluster-v4-tpu.yaml +++ b/ai-ml/gke-ray/rayserve/llm/llava-1.5-13b/ray-cluster-v4-tpu.yaml @@ -41,10 +41,6 @@ spec: secretKeyRef: name: hf-secret key: hf_api_token - - name: MODEL_ID - value: "llava-hf/llava-1.5-13b-hf" - - name: DTYPE - value: "bfloat16" ports: - containerPort: 6379 name: gcs @@ -86,10 +82,6 @@ spec: secretKeyRef: name: hf-secret key: hf_api_token - - name: 
MODEL_ID - value: "llava-hf/llava-1.5-13b-hf" - - name: DTYPE - value: "bfloat16" nodeSelector: cloud.google.com/gke-tpu-accelerator: tpu-v4-podslice cloud.google.com/gke-tpu-topology: 2x2x2 diff --git a/ai-ml/gke-ray/rayserve/llm/llava-1.5-13b/ray-cluster-v5e-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/llava-1.5-13b/ray-cluster-v5e-tpu.yaml index d7214d7a7c..a1c10c73d7 100644 --- a/ai-ml/gke-ray/rayserve/llm/llava-1.5-13b/ray-cluster-v5e-tpu.yaml +++ b/ai-ml/gke-ray/rayserve/llm/llava-1.5-13b/ray-cluster-v5e-tpu.yaml @@ -41,10 +41,6 @@ spec: secretKeyRef: name: hf-secret key: hf_api_token - - name: MODEL_ID - value: "llava-hf/llava-1.5-13b-hf" - - name: DTYPE - value: "bfloat16" ports: - containerPort: 6379 name: gcs @@ -86,10 +82,6 @@ spec: secretKeyRef: name: hf-secret key: hf_api_token - - name: MODEL_ID - value: "llava-hf/llava-1.5-13b-hf" - - name: DTYPE - value: "bfloat16" nodeSelector: cloud.google.com/gke-tpu-accelerator: tpu-v5-lite-podslice cloud.google.com/gke-tpu-topology: 2x4 diff --git a/ai-ml/gke-ray/rayserve/llm/mistral-7b/ray-cluster-v4-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/mistral-7b/ray-cluster-v4-tpu.yaml index 3f539689c8..afa639fe35 100644 --- a/ai-ml/gke-ray/rayserve/llm/mistral-7b/ray-cluster-v4-tpu.yaml +++ b/ai-ml/gke-ray/rayserve/llm/mistral-7b/ray-cluster-v4-tpu.yaml @@ -41,10 +41,6 @@ spec: secretKeyRef: name: hf-secret key: hf_api_token - - name: MODEL_ID - value: "mistralai/Mistral-7B-Instruct-v0.3" - - name: TOKENIZER_MODE - value: "mistral" ports: - containerPort: 6379 name: gcs @@ -86,10 +82,6 @@ spec: secretKeyRef: name: hf-secret key: hf_api_token - - name: MODEL_ID - value: "mistralai/Mistral-7B-Instruct-v0.3" - - name: TOKENIZER_MODE - value: "mistral" nodeSelector: cloud.google.com/gke-tpu-accelerator: tpu-v4-podslice cloud.google.com/gke-tpu-topology: 2x2x2 diff --git a/ai-ml/gke-ray/rayserve/llm/mistral-7b/ray-cluster-v5e-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/mistral-7b/ray-cluster-v5e-tpu.yaml index 27983bd6eb..290c6cd9a8 100644 --- a/ai-ml/gke-ray/rayserve/llm/mistral-7b/ray-cluster-v5e-tpu.yaml +++ b/ai-ml/gke-ray/rayserve/llm/mistral-7b/ray-cluster-v5e-tpu.yaml @@ -41,10 +41,6 @@ spec: secretKeyRef: name: hf-secret key: hf_api_token - - name: MODEL_ID - value: "mistralai/Mistral-7B-Instruct-v0.3" - - name: TOKENIZER_MODE - value: "mistral" ports: - containerPort: 6379 name: gcs @@ -86,10 +82,6 @@ spec: secretKeyRef: name: hf-secret key: hf_api_token - - name: MODEL_ID - value: "mistralai/Mistral-7B-Instruct-v0.3" - - name: TOKENIZER_MODE - value: "mistral" nodeSelector: cloud.google.com/gke-tpu-accelerator: tpu-v5-lite-podslice cloud.google.com/gke-tpu-topology: 2x4 From 5b42cb8dd9c2bdcd2f6b1439a844462ec004d08d Mon Sep 17 00:00:00 2001 From: Ryan O'Leary Date: Thu, 10 Oct 2024 17:59:45 +0000 Subject: [PATCH 11/16] Add model composition example Signed-off-by: Ryan O'Leary --- .../model-composition/ray-service-tpu.yaml | 97 ++++++++ .../llm/model-composition/serve_tpu.py | 218 ++++++++++++++++++ 2 files changed, 315 insertions(+) create mode 100644 ai-ml/gke-ray/rayserve/llm/model-composition/ray-service-tpu.yaml create mode 100644 ai-ml/gke-ray/rayserve/llm/model-composition/serve_tpu.py diff --git a/ai-ml/gke-ray/rayserve/llm/model-composition/ray-service-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/model-composition/ray-service-tpu.yaml new file mode 100644 index 0000000000..42c8d3723a --- /dev/null +++ b/ai-ml/gke-ray/rayserve/llm/model-composition/ray-service-tpu.yaml @@ -0,0 +1,97 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, 
Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# [START gke_ai_ml_gke_ray_rayserve_llm_model_composition_tpu] +apiVersion: ray.io/v1 +kind: RayService +metadata: + name: model-composition +spec: + serveConfigV2: | + applications: + - name: llm + route_prefix: / + import_path: ai-ml.gke-ray.rayserve.llm.model-composition.serve_tpu:multi_model + deployments: + - name: MultiModelDeployment + num_replicas: 1 + runtime_env: + working_dir: "https://github.com/GoogleCloudPlatform/kubernetes-engine-samples/archive/main.zip" + env_vars: + ASSIST_MODEL_ID: "meta-llama/Meta-Llama-3-8B-Instruct" + SUMMARIZER_MODEL_ID: "mistralai/Mistral-7B-Instruct-v0.3" + rayClusterConfig: + headGroupSpec: + rayStartParams: + dashboard-host: '0.0.0.0' + template: + spec: + containers: + - name: ray-head + image: $VLLM_IMAGE + resources: + limits: + cpu: "8" + memory: 40G + requests: + cpu: "8" + memory: 40G + ports: + - containerPort: 6379 + name: gcs-server + - containerPort: 8265 + name: dashboard + - containerPort: 10001 + name: client + - containerPort: 8000 + name: serve + env: + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-secret + key: hf_api_token + workerGroupSpecs: + - replicas: 2 + minReplicas: 0 + maxReplicas: 4 + numOfHosts: 2 + groupName: tpu-group + rayStartParams: {} + template: + spec: + containers: + - name: llm + image: $VLLM_IMAGE + env: + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-secret + key: hf_api_token + resources: + limits: + cpu: "100" + google.com/tpu: "4" + ephemeral-storage: 50Gi + memory: 100G + requests: + cpu: "100" + google.com/tpu: "4" + ephemeral-storage: 50Gi + memory: 100G + nodeSelector: + cloud.google.com/gke-tpu-accelerator: tpu-v5-lite-podslice + cloud.google.com/gke-tpu-topology: 2x4 +# [END gke_ai_ml_gke_ray_rayserve_llm_model_composition_tpu] diff --git a/ai-ml/gke-ray/rayserve/llm/model-composition/serve_tpu.py b/ai-ml/gke-ray/rayserve/llm/model-composition/serve_tpu.py new file mode 100644 index 0000000000..ac138fad12 --- /dev/null +++ b/ai-ml/gke-ray/rayserve/llm/model-composition/serve_tpu.py @@ -0,0 +1,218 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
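+#
+# Flow sketch: MultiModelDeployment takes the HTTP request, the assistant
+# model generates a full response, and the summarizer model condenses it to
+# one sentence. A hypothetical end-to-end call (route_prefix "/" per the CR):
+#
+#   curl -X POST http://localhost:8000/ \
+#     -d '{"prompt": "Explain Kubernetes in one paragraph.", "max_tokens": 200}'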
+
+# NOTE: this file was inspired by: https://github.com/ray-project/ray/blob//master/doc/source/serve/doc_code/vllm_example.py
+
+import json
+import os
+from typing import AsyncGenerator, Dict, List, Optional
+import random
+
+from fastapi import BackgroundTasks
+from starlette.requests import Request
+from starlette.responses import Response, StreamingResponse
+from vllm.engine.arg_utils import AsyncEngineArgs
+from vllm.engine.async_llm_engine import AsyncLLMEngine
+from vllm.sampling_params import SamplingParams
+from vllm.utils import random_uuid
+
+import ray
+from ray import serve
+from ray.serve.handle import DeploymentHandle
+
+
+@serve.deployment(name="VLLMDeployment")
+class VLLMDeployment:
+    def __init__(self, **kwargs):
+        """
+        Construct a VLLM deployment.
+
+        Refer to https://github.com/vllm-project/vllm/blob/main/vllm/engine/arg_utils.py
+        for the full list of arguments.
+
+        Args:
+            model: name or path of the huggingface model to use
+            download_dir: directory to download and load the weights,
+                defaults to the default cache dir of huggingface.
+            use_np_weights: save a numpy copy of model weights for
+                faster loading. This can increase the disk usage by up to 2x.
+            use_dummy_weights: use dummy values for model weights.
+            dtype: data type for model weights and activations.
+                The "auto" option will use FP16 precision
+                for FP32 and FP16 models, and BF16 precision
+                for BF16 models.
+            seed: random seed.
+            worker_use_ray: use Ray for distributed serving, will be
+                automatically set when using more than 1 GPU.
+            pipeline_parallel_size: number of pipeline stages.
+            tensor_parallel_size: number of tensor parallel replicas.
+            block_size: token block size.
+            swap_space: CPU swap space size (GiB) per GPU.
+            gpu_memory_utilization: the percentage of GPU memory to be used for
+                the model executor
+            max_num_batched_tokens: maximum number of batched tokens per iteration
+            max_num_seqs: maximum number of sequences per iteration.
+            disable_log_stats: disable logging statistics.
+            engine_use_ray: use Ray to start the LLM engine in a separate
+                process as the server process.
+            disable_log_requests: disable logging requests.
+        """
+        args = AsyncEngineArgs(**kwargs)
+        self.engine = AsyncLLMEngine.from_engine_args(args)
+
+    async def stream_results(self, results_generator) -> AsyncGenerator[bytes, None]:
+        num_returned = 0
+        async for request_output in results_generator:
+            text_outputs = [output.text for output in request_output.outputs]
+            assert len(text_outputs) == 1
+            text_output = text_outputs[0][num_returned:]
+            ret = {"text": text_output}
+            yield (json.dumps(ret) + "\n").encode("utf-8")
+            num_returned += len(text_output)
+
+    async def may_abort_request(self, request_id) -> None:
+        await self.engine.abort(request_id)
+
+    async def __call__(self, request_dict: dict) -> str:
+        """Generate completion for the request.
+
+        The request should be a JSON object with the following fields:
+        - prompt: the prompt to use for the generation.
+        - stream: whether to stream the results or not.
+        - other fields: the sampling parameters (See `SamplingParams` for details).
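+
+        A hypothetical example body combining these fields:
+        {"prompt": "Tell me a story", "stream": false, "max_tokens": 200, "temperature": 0.7}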
+        """
+        # request_dict = await request.json()
+        prompt = request_dict.pop("prompt")
+        stream = request_dict.pop("stream", False)
+        max_tokens = request_dict.pop("max_tokens", 1000)
+        sampling_params = SamplingParams(**request_dict)
+        request_id = random_uuid()
+        results_generator = self.engine.generate(
+            prompt, sampling_params, request_id)
+        if stream:
+            background_tasks = BackgroundTasks()
+            # Using background_tasks to abort the request
+            # if the client disconnects.
+            background_tasks.add_task(self.may_abort_request, request_id)
+            return StreamingResponse(
+                self.stream_results(results_generator), background=background_tasks
+            )
+
+        final_output = None
+        async for request_output in results_generator:
+            final_output = request_output
+
+        assert final_output is not None
+        prompt = final_output.prompt
+        text_outputs = [
+            output.text for output in final_output.outputs]
+        ret = {"text": text_outputs, "max_tokens": max_tokens}
+        return json.dumps(ret)
+
+
+@serve.deployment
+class VLLMSummarizerDeployment:
+    def __init__(self, **kwargs):
+        args = AsyncEngineArgs(**kwargs)
+        self.engine = AsyncLLMEngine.from_engine_args(args)
+
+    async def __call__(self, response: str) -> str:
+        """Generates summarization of a response from another model.
+
+        The response should be a JSON object with the following fields:
+        - text: the response returned from another model to summarize
+        """
+        request_dict = json.loads(response)
+        text = request_dict.pop("text")
+        prompt = f"Summarize the following text into a single sentence: {text}"
+        sampling_params = SamplingParams(**request_dict)
+        request_id = random_uuid()
+        results_generator = self.engine.generate(
+            prompt, sampling_params, request_id)
+
+        final_output = None
+        async for request_output in results_generator:
+            final_output = request_output
+
+        assert final_output is not None
+        prompt = final_output.prompt
+        text_outputs = [
+            output.text for output in final_output.outputs]
+        ret = {"text": text_outputs}
+        return json.dumps(ret)
+
+
+@serve.deployment
+class MultiModelDeployment:
+    def __init__(self, assist_model: DeploymentHandle, summarizer_model: DeploymentHandle):
+        self.assistant_model = assist_model
+        self.summarizer_model = summarizer_model
+
+    async def __call__(self, request: Request) -> Response:
+        model_request = await request.json()
+        assistant_response = self.assistant_model.remote(model_request)
+        summarizer_response = await self.summarizer_model.remote(assistant_response)
+        return Response(content=summarizer_response)
+
+def get_num_tpu_chips() -> int:
+    if "TPU" not in ray.cluster_resources():
+        # Pass in TPU chips when the current Ray cluster resources can't be auto-detected (i.e for autoscaling).
+        if os.environ.get('TPU_CHIPS') is not None:
+            return int(os.environ.get('TPU_CHIPS'))
+        return 0
+    return int(ray.cluster_resources()["TPU"])
+
+def get_tpu_head() -> Optional[str]:
+    # return the TPU-{accelerator}-head resource
+    for key, _ in ray.cluster_resources().items():
+        if key.endswith("head"):
+            return key
+    return None
+
+def build_app(cli_args: Dict[str, str]) -> serve.Application:
+    """Builds the Serve app based on CLI arguments."""
+    ray.init(ignore_reinit_error=True)
+
+    num_tpu_chips = get_num_tpu_chips()
+    tpu_head = get_tpu_head()
+    tpu_slices = 1
+    if tpu_head is not None:
+        tpu_slices = ray.cluster_resources()[tpu_head]
+    num_tpu_chips_per_slice = int(num_tpu_chips/tpu_slices)
+    # Construct a placement group for 1 TPU slice. Each model should run on its own slice.
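+    # Worked example (assuming the values from the sample RayService CR,
+    # TPU_CHIPS=16 across TPU_HEADS=2 slices): 8 chips per slice, so each
+    # model binds one {"CPU": 1} bundle plus eight {"CPU": 1, "TPU": 1}
+    # bundles and one TPU-head bundle, all packed onto a single slice.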
+    pg_resources = []
+    pg_resources.append({"CPU": 1}) # for the deployment replica
+    for i in range(num_tpu_chips_per_slice):
+        pg_resources.append({"CPU": 1, "TPU": 1}) # for the vLLM actors
+    # Add a TPU head to the placement group to ensure Ray workers are not placed across slices.
+    pg_resources.append({tpu_head: 1})
+
+    return MultiModelDeployment.bind(
+        VLLMDeployment.options(
+            placement_group_bundles=pg_resources,
+            placement_group_strategy="PACK").bind(
+            model=os.environ['ASSIST_MODEL_ID'],
+            tensor_parallel_size=num_tpu_chips_per_slice,
+            enforce_eager=True,
+        ),
+        VLLMSummarizerDeployment.options(
+            placement_group_bundles=pg_resources,
+            placement_group_strategy="PACK").bind(
+            model=os.environ['SUMMARIZER_MODEL_ID'],
+            tensor_parallel_size=num_tpu_chips_per_slice,
+            enforce_eager=True,
+        ),
+    )
+
+multi_model = build_app({})

From ced331d74a7eb6a9490a0cfebe0265c2e4a6069f Mon Sep 17 00:00:00 2001
From: Ryan O'Leary
Date: Thu, 10 Oct 2024 18:02:43 +0000
Subject: [PATCH 12/16] Update name

Signed-off-by: Ryan O'Leary
---
 .../gke-ray/rayserve/llm/model-composition/ray-service-tpu.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ai-ml/gke-ray/rayserve/llm/model-composition/ray-service-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/model-composition/ray-service-tpu.yaml
index 42c8d3723a..cc194af015 100644
--- a/ai-ml/gke-ray/rayserve/llm/model-composition/ray-service-tpu.yaml
+++ b/ai-ml/gke-ray/rayserve/llm/model-composition/ray-service-tpu.yaml
@@ -16,7 +16,7 @@
 apiVersion: ray.io/v1
 kind: RayService
 metadata:
-  name: model-composition
+  name: vllm-tpu
 spec:
   serveConfigV2: |
     applications:

From 9e9d46602e8f9a05725532b93fb7348821e07565 Mon Sep 17 00:00:00 2001
From: Ryan O'Leary
Date: Thu, 10 Oct 2024 18:17:17 +0000
Subject: [PATCH 13/16] Support passing TPU_HEADS as env var

Signed-off-by: Ryan O'Leary
---
 ai-ml/gke-ray/rayserve/llm/model-composition/serve_tpu.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/ai-ml/gke-ray/rayserve/llm/model-composition/serve_tpu.py b/ai-ml/gke-ray/rayserve/llm/model-composition/serve_tpu.py
index ac138fad12..36caadb663 100644
--- a/ai-ml/gke-ray/rayserve/llm/model-composition/serve_tpu.py
+++ b/ai-ml/gke-ray/rayserve/llm/model-composition/serve_tpu.py
@@ -174,6 +174,10 @@ def get_num_tpu_chips() -> int:
     return int(ray.cluster_resources()["TPU"])
 
 def get_tpu_head() -> Optional[str]:
+    if "TPU" not in ray.cluster_resources():
+        # Pass in the number of TPU heads when the current Ray cluster resources can't be auto-detected.
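+        # (e.g. TPU_HEADS=2 for a deployment spread over two multi-host slices)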
+ if os.environ.get('TPU_HEADS') is not None: + return int(os.environ.get('TPU_HEADS')) # return the TPU-{accelerator}-head resource for key, _ in ray.cluster_resources().items(): if key.endswith("head"): From 06ed4fcb1d3d9bad089e24f89a01a29ff982b908 Mon Sep 17 00:00:00 2001 From: Ryan O'Leary Date: Thu, 10 Oct 2024 18:38:46 +0000 Subject: [PATCH 14/16] Update RayService TPU CR Signed-off-by: Ryan O'Leary --- .../rayserve/llm/model-composition/ray-service-tpu.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ai-ml/gke-ray/rayserve/llm/model-composition/ray-service-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/model-composition/ray-service-tpu.yaml index cc194af015..b69b8deb27 100644 --- a/ai-ml/gke-ray/rayserve/llm/model-composition/ray-service-tpu.yaml +++ b/ai-ml/gke-ray/rayserve/llm/model-composition/ray-service-tpu.yaml @@ -30,7 +30,9 @@ spec: working_dir: "https://github.com/GoogleCloudPlatform/kubernetes-engine-samples/archive/main.zip" env_vars: ASSIST_MODEL_ID: "meta-llama/Meta-Llama-3-8B-Instruct" - SUMMARIZER_MODEL_ID: "mistralai/Mistral-7B-Instruct-v0.3" + SUMMARIZER_MODEL_ID: "google/gemma-7b-it" + TPU_CHIPS: "16" + TPU_HEADS: "2" rayClusterConfig: headGroupSpec: rayStartParams: From 41ed84a1ce0963e08297ee1ba84fb0de1c27bcc6 Mon Sep 17 00:00:00 2001 From: ryanaoleary Date: Thu, 14 Nov 2024 01:43:05 +0000 Subject: [PATCH 15/16] Rescope PR to v5e and v6e for Llama-3.1-405B Signed-off-by: ryanaoleary --- .../llm/llama-3-8b-it/ray-cluster-v4-tpu.yaml | 88 ------- .../llama-3-8b-it/ray-cluster-v5e-tpu.yaml | 88 ------- .../llm/llama-3-8b-it/ray-service-v4-tpu.yaml | 100 -------- .../llama-3-8b-it/ray-service-v5e-tpu.yaml | 100 -------- .../llm/llama-3.1-70b/ray-cluster-v4-tpu.yaml | 88 ------- .../llama-3.1-70b/ray-cluster-v5e-tpu.yaml | 88 ------- .../llm/llama-3.1-70b/ray-service-v4-tpu.yaml | 103 -------- .../llama-3.1-70b/ray-service-v5e-tpu.yaml | 103 -------- .../llm/llava-1.5-13b/ray-cluster-v4-tpu.yaml | 88 ------- .../llava-1.5-13b/ray-cluster-v5e-tpu.yaml | 88 ------- .../llm/llava-1.5-13b/ray-service-v4-tpu.yaml | 103 -------- .../llava-1.5-13b/ray-service-v5e-tpu.yaml | 103 -------- .../llm/mistral-7b/ray-cluster-v4-tpu.yaml | 88 ------- .../llm/mistral-7b/ray-cluster-v5e-tpu.yaml | 88 ------- .../llm/mistral-7b/ray-service-v4-tpu.yaml | 103 -------- .../llm/mistral-7b/ray-service-v5e-tpu.yaml | 103 -------- .../model-composition/ray-service-tpu.yaml | 99 -------- .../llm/model-composition/serve_tpu.py | 222 ------------------ ai-ml/gke-ray/rayserve/llm/serve_tpu.py | 3 +- .../tpu/ray-cluster.tpu-v5e-multihost.yaml | 150 ++++++++++++ .../tpu/ray-cluster.tpu-v6e-multihost.yaml | 146 ++++++++++++ 21 files changed, 298 insertions(+), 1844 deletions(-) delete mode 100644 ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-cluster-v4-tpu.yaml delete mode 100644 ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-cluster-v5e-tpu.yaml delete mode 100644 ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-service-v4-tpu.yaml delete mode 100644 ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-service-v5e-tpu.yaml delete mode 100644 ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-cluster-v4-tpu.yaml delete mode 100644 ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-cluster-v5e-tpu.yaml delete mode 100644 ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-service-v4-tpu.yaml delete mode 100644 ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-service-v5e-tpu.yaml delete mode 100644 ai-ml/gke-ray/rayserve/llm/llava-1.5-13b/ray-cluster-v4-tpu.yaml delete mode 100644 
ai-ml/gke-ray/rayserve/llm/llava-1.5-13b/ray-cluster-v5e-tpu.yaml delete mode 100644 ai-ml/gke-ray/rayserve/llm/llava-1.5-13b/ray-service-v4-tpu.yaml delete mode 100644 ai-ml/gke-ray/rayserve/llm/llava-1.5-13b/ray-service-v5e-tpu.yaml delete mode 100644 ai-ml/gke-ray/rayserve/llm/mistral-7b/ray-cluster-v4-tpu.yaml delete mode 100644 ai-ml/gke-ray/rayserve/llm/mistral-7b/ray-cluster-v5e-tpu.yaml delete mode 100644 ai-ml/gke-ray/rayserve/llm/mistral-7b/ray-service-v4-tpu.yaml delete mode 100644 ai-ml/gke-ray/rayserve/llm/mistral-7b/ray-service-v5e-tpu.yaml delete mode 100644 ai-ml/gke-ray/rayserve/llm/model-composition/ray-service-tpu.yaml delete mode 100644 ai-ml/gke-ray/rayserve/llm/model-composition/serve_tpu.py create mode 100644 ai-ml/gke-ray/rayserve/llm/tpu/ray-cluster.tpu-v5e-multihost.yaml create mode 100644 ai-ml/gke-ray/rayserve/llm/tpu/ray-cluster.tpu-v6e-multihost.yaml diff --git a/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-cluster-v4-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-cluster-v4-tpu.yaml deleted file mode 100644 index 1faecb6860..0000000000 --- a/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-cluster-v4-tpu.yaml +++ /dev/null @@ -1,88 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# [START gke_ai_ml_gke_ray_rayserve_llm_llama_3_8b_it_raycluster_tpu_v4] -apiVersion: ray.io/v1 -kind: RayCluster -metadata: - name: vllm-tpu -spec: - headGroupSpec: - rayStartParams: {} - template: - spec: - containers: - - name: ray-head - image: $VLLM_IMAGE - imagePullPolicy: IfNotPresent - resources: - limits: - cpu: "8" - ephemeral-storage: 5Gi - memory: 40G - requests: - cpu: "8" - ephemeral-storage: 5Gi - memory: 40G - env: - - name: HUGGING_FACE_HUB_TOKEN - valueFrom: - secretKeyRef: - name: hf-secret - key: hf_api_token - ports: - - containerPort: 6379 - name: gcs - - containerPort: 8265 - name: dashboard - - containerPort: 10001 - name: client - - containerPort: 8000 - name: serve - workerGroupSpecs: - - groupName: tpu-group - replicas: 1 - minReplicas: 0 - maxReplicas: 1 - numOfHosts: 2 - rayStartParams: {} - template: - spec: - containers: - - name: ray-worker - image: $VLLM_IMAGE - imagePullPolicy: IfNotPresent - resources: - limits: - cpu: "100" - google.com/tpu: "4" - ephemeral-storage: 50Gi - memory: 100G - requests: - cpu: "100" - google.com/tpu: "4" - ephemeral-storage: 50Gi - memory: 100G - env: - - name: JAX_PLATFORMS - value: "tpu" - - name: HUGGING_FACE_HUB_TOKEN - valueFrom: - secretKeyRef: - name: hf-secret - key: hf_api_token - nodeSelector: - cloud.google.com/gke-tpu-accelerator: tpu-v4-podslice - cloud.google.com/gke-tpu-topology: 2x2x2 -# [END gke_ai_ml_gke_ray_rayserve_llm_llama_3_8b_it_raycluster_tpu_v4] diff --git a/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-cluster-v5e-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-cluster-v5e-tpu.yaml deleted file mode 100644 index a41106e3aa..0000000000 --- a/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-cluster-v5e-tpu.yaml +++ /dev/null @@ -1,88 +0,0 @@ -# Copyright 
2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# [START gke_ai_ml_gke_ray_rayserve_llm_llama_3_8b_it_raycluster_tpu_v5e] -apiVersion: ray.io/v1 -kind: RayCluster -metadata: - name: vllm-tpu -spec: - headGroupSpec: - rayStartParams: {} - template: - spec: - containers: - - name: ray-head - image: $VLLM_IMAGE - imagePullPolicy: IfNotPresent - resources: - limits: - cpu: "8" - ephemeral-storage: 5Gi - memory: 40G - requests: - cpu: "8" - ephemeral-storage: 5Gi - memory: 40G - env: - - name: HUGGING_FACE_HUB_TOKEN - valueFrom: - secretKeyRef: - name: hf-secret - key: hf_api_token - ports: - - containerPort: 6379 - name: gcs - - containerPort: 8265 - name: dashboard - - containerPort: 10001 - name: client - - containerPort: 8000 - name: serve - workerGroupSpecs: - - groupName: tpu-group - replicas: 1 - minReplicas: 0 - maxReplicas: 1 - numOfHosts: 2 - rayStartParams: {} - template: - spec: - containers: - - name: ray-worker - image: $VLLM_IMAGE - imagePullPolicy: IfNotPresent - resources: - limits: - cpu: "100" - google.com/tpu: "4" - ephemeral-storage: 50Gi - memory: 100G - requests: - cpu: "100" - google.com/tpu: "4" - ephemeral-storage: 50Gi - memory: 100G - env: - - name: JAX_PLATFORMS - value: "tpu" - - name: HUGGING_FACE_HUB_TOKEN - valueFrom: - secretKeyRef: - name: hf-secret - key: hf_api_token - nodeSelector: - cloud.google.com/gke-tpu-accelerator: tpu-v5-lite-podslice - cloud.google.com/gke-tpu-topology: 2x4 -# [END gke_ai_ml_gke_ray_rayserve_llm_llama_3_8b_it_raycluster_tpu_v5e] diff --git a/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-service-v4-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-service-v4-tpu.yaml deleted file mode 100644 index 6e95df89f7..0000000000 --- a/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-service-v4-tpu.yaml +++ /dev/null @@ -1,100 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -# [START gke_ai_ml_gke_ray_rayserve_llm_llama_3_8b_it_rayservice_tpu_v4] -apiVersion: ray.io/v1 -kind: RayService -metadata: - name: vllm-tpu -spec: - serveConfigV2: | - applications: - - name: llm - import_path: ai-ml.gke-ray.rayserve.llm.serve_tpu:model - deployments: - - name: VLLMDeployment - num_replicas: 1 - runtime_env: - working_dir: "https://github.com/GoogleCloudPlatform/kubernetes-engine-samples/archive/main.zip" - env_vars: - MODEL_ID: "meta-llama/Meta-Llama-3-8B-Instruct" - TPU_CHIPS: "8" - rayClusterConfig: - rayVersion: 2.34.0 - headGroupSpec: - rayStartParams: {} - template: - spec: - containers: - - name: ray-head - image: $VLLM_IMAGE - imagePullPolicy: IfNotPresent - ports: - - containerPort: 6379 - name: gcs - - containerPort: 8265 - name: dashboard - - containerPort: 10001 - name: client - - containerPort: 8000 - name: serve - env: - - name: HUGGING_FACE_HUB_TOKEN - valueFrom: - secretKeyRef: - name: hf-secret - key: hf_api_token - resources: - limits: - cpu: "8" - memory: 40G - requests: - cpu: "8" - memory: 40G - workerGroupSpecs: - - groupName: tpu-group - replicas: 1 - minReplicas: 0 - maxReplicas: 2 - numOfHosts: 2 - rayStartParams: {} - template: - spec: - containers: - - name: ray-worker - image: $VLLM_IMAGE - imagePullPolicy: IfNotPresent - resources: - limits: - cpu: "100" - google.com/tpu: "4" - ephemeral-storage: 50Gi - memory: 100G - requests: - cpu: "100" - google.com/tpu: "4" - ephemeral-storage: 50Gi - memory: 100G - env: - - name: JAX_PLATFORMS - value: "tpu" - - name: HUGGING_FACE_HUB_TOKEN - valueFrom: - secretKeyRef: - name: hf-secret - key: hf_api_token - nodeSelector: - cloud.google.com/gke-tpu-accelerator: tpu-v4-podslice - cloud.google.com/gke-tpu-topology: 2x2x2 -# [END gke_ai_ml_gke_ray_rayserve_llm_llama_3_8b_it_rayservice_tpu_v4] diff --git a/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-service-v5e-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-service-v5e-tpu.yaml deleted file mode 100644 index 70aaf5938c..0000000000 --- a/ai-ml/gke-ray/rayserve/llm/llama-3-8b-it/ray-service-v5e-tpu.yaml +++ /dev/null @@ -1,100 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -# [START gke_ai_ml_gke_ray_rayserve_llm_llama_3_8b_it_rayservice_tpu_v5e] -apiVersion: ray.io/v1 -kind: RayService -metadata: - name: vllm-tpu -spec: - serveConfigV2: | - applications: - - name: llm - import_path: ai-ml.gke-ray.rayserve.llm.serve_tpu:model - deployments: - - name: VLLMDeployment - num_replicas: 1 - runtime_env: - working_dir: "https://github.com/GoogleCloudPlatform/kubernetes-engine-samples/archive/main.zip" - env_vars: - MODEL_ID: "meta-llama/Meta-Llama-3-8B-Instruct" - TPU_CHIPS: "8" - rayClusterConfig: - rayVersion: 2.34.0 - headGroupSpec: - rayStartParams: {} - template: - spec: - containers: - - name: ray-head - image: $VLLM_IMAGE - imagePullPolicy: IfNotPresent - ports: - - containerPort: 6379 - name: gcs - - containerPort: 8265 - name: dashboard - - containerPort: 10001 - name: client - - containerPort: 8000 - name: serve - env: - - name: HUGGING_FACE_HUB_TOKEN - valueFrom: - secretKeyRef: - name: hf-secret - key: hf_api_token - resources: - limits: - cpu: "8" - memory: 40G - requests: - cpu: "8" - memory: 40G - workerGroupSpecs: - - groupName: tpu-group - replicas: 1 - minReplicas: 0 - maxReplicas: 2 - numOfHosts: 2 - rayStartParams: {} - template: - spec: - containers: - - name: ray-worker - image: $VLLM_IMAGE - imagePullPolicy: IfNotPresent - resources: - limits: - cpu: "100" - google.com/tpu: "4" - ephemeral-storage: 50Gi - memory: 100G - requests: - cpu: "100" - google.com/tpu: "4" - ephemeral-storage: 50Gi - memory: 100G - env: - - name: JAX_PLATFORMS - value: "tpu" - - name: HUGGING_FACE_HUB_TOKEN - valueFrom: - secretKeyRef: - name: hf-secret - key: hf_api_token - nodeSelector: - cloud.google.com/gke-tpu-accelerator: tpu-v5-lite-podslice - cloud.google.com/gke-tpu-topology: 2x4 -# [END gke_ai_ml_gke_ray_rayserve_llm_llama_3_8b_it_rayservice_tpu_v5e] diff --git a/ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-cluster-v4-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-cluster-v4-tpu.yaml deleted file mode 100644 index b7b3f42643..0000000000 --- a/ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-cluster-v4-tpu.yaml +++ /dev/null @@ -1,88 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -# [START gke_ai_ml_gke_ray_rayserve_llm_llama_3_1_70b_raycluster_tpu_v4] -apiVersion: ray.io/v1 -kind: RayCluster -metadata: - name: vllm-tpu -spec: - headGroupSpec: - rayStartParams: {} - template: - spec: - containers: - - name: ray-head - image: $VLLM_IMAGE - imagePullPolicy: IfNotPresent - resources: - limits: - cpu: "8" - ephemeral-storage: 8Gi - memory: 40G - requests: - cpu: "8" - ephemeral-storage: 8Gi - memory: 40G - env: - - name: HUGGING_FACE_HUB_TOKEN - valueFrom: - secretKeyRef: - name: hf-secret - key: hf_api_token - ports: - - containerPort: 6379 - name: gcs - - containerPort: 8265 - name: dashboard - - containerPort: 10001 - name: client - - containerPort: 8000 - name: serve - workerGroupSpecs: - - groupName: tpu-group - replicas: 1 - minReplicas: 0 - maxReplicas: 2 - numOfHosts: 4 - rayStartParams: {} - template: - spec: - containers: - - name: ray-worker - image: $VLLM_IMAGE - imagePullPolicy: IfNotPresent - resources: - limits: - cpu: "100" - google.com/tpu: "4" - ephemeral-storage: 200Gi - memory: 150G - requests: - cpu: "100" - google.com/tpu: "4" - ephemeral-storage: 200Gi - memory: 150G - env: - - name: JAX_PLATFORMS - value: "tpu" - - name: HUGGING_FACE_HUB_TOKEN - valueFrom: - secretKeyRef: - name: hf-secret - key: hf_api_token - nodeSelector: - cloud.google.com/gke-tpu-accelerator: tpu-v4-podslice - cloud.google.com/gke-tpu-topology: 2x2x4 -# [END gke_ai_ml_gke_ray_rayserve_llm_llama_3_1_70b_raycluster_tpu_v4] diff --git a/ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-cluster-v5e-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-cluster-v5e-tpu.yaml deleted file mode 100644 index 7e5aa17b97..0000000000 --- a/ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-cluster-v5e-tpu.yaml +++ /dev/null @@ -1,88 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -# [START gke_ai_ml_gke_ray_rayserve_llm_llama_3_1_70b_raycluster_tpu_v5e] -apiVersion: ray.io/v1 -kind: RayCluster -metadata: - name: vllm-tpu -spec: - headGroupSpec: - rayStartParams: {} - template: - spec: - containers: - - name: ray-head - image: $VLLM_IMAGE - imagePullPolicy: IfNotPresent - resources: - limits: - cpu: "8" - ephemeral-storage: 8Gi - memory: 40G - requests: - cpu: "8" - ephemeral-storage: 8Gi - memory: 40G - env: - - name: HUGGING_FACE_HUB_TOKEN - valueFrom: - secretKeyRef: - name: hf-secret - key: hf_api_token - ports: - - containerPort: 6379 - name: gcs - - containerPort: 8265 - name: dashboard - - containerPort: 10001 - name: client - - containerPort: 8000 - name: serve - workerGroupSpecs: - - groupName: tpu-group - replicas: 1 - minReplicas: 0 - maxReplicas: 2 - numOfHosts: 4 - rayStartParams: {} - template: - spec: - containers: - - name: ray-worker - image: $VLLM_IMAGE - imagePullPolicy: IfNotPresent - resources: - limits: - cpu: "100" - google.com/tpu: "4" - ephemeral-storage: 200Gi - memory: 150G - requests: - cpu: "100" - google.com/tpu: "4" - ephemeral-storage: 200Gi - memory: 150G - env: - - name: JAX_PLATFORMS - value: "tpu" - - name: HUGGING_FACE_HUB_TOKEN - valueFrom: - secretKeyRef: - name: hf-secret - key: hf_api_token - nodeSelector: - cloud.google.com/gke-tpu-accelerator: tpu-v5-lite-podslice - cloud.google.com/gke-tpu-topology: 4x4 -# [END gke_ai_ml_gke_ray_rayserve_llm_llama_3_1_70b_raycluster_tpu_v5e] diff --git a/ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-service-v4-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-service-v4-tpu.yaml deleted file mode 100644 index 04fdb8056c..0000000000 --- a/ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-service-v4-tpu.yaml +++ /dev/null @@ -1,103 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -# [START gke_ai_ml_gke_ray_rayserve_llm_llama_3_1_70b_rayservice_tpu_v4] -apiVersion: ray.io/v1 -kind: RayService -metadata: - name: vllm-tpu -spec: - serveConfigV2: | - applications: - - name: llm - import_path: ai-ml.gke-ray.rayserve.llm.serve_tpu:model - deployments: - - name: VLLMDeployment - num_replicas: 1 - runtime_env: - working_dir: "https://github.com/GoogleCloudPlatform/kubernetes-engine-samples/archive/main.zip" - env_vars: - MODEL_ID: "meta-llama/Meta-Llama-3.1-70B" - MAX_MODEL_LEN: "4096" - TPU_CHIPS: "16" - rayClusterConfig: - rayVersion: 2.34.0 - headGroupSpec: - rayStartParams: {} - template: - spec: - containers: - - name: ray-head - image: $VLLM_IMAGE - imagePullPolicy: IfNotPresent - ports: - - containerPort: 6379 - name: gcs - - containerPort: 8265 - name: dashboard - - containerPort: 10001 - name: client - - containerPort: 8000 - name: serve - env: - - name: HUGGING_FACE_HUB_TOKEN - valueFrom: - secretKeyRef: - name: hf-secret - key: hf_api_token - resources: - limits: - cpu: "8" - ephemeral-storage: 8Gi - memory: 40G - requests: - cpu: "8" - ephemeral-storage: 8Gi - memory: 40G - workerGroupSpecs: - - groupName: tpu-group - replicas: 1 - minReplicas: 0 - maxReplicas: 2 - numOfHosts: 4 - rayStartParams: {} - template: - spec: - containers: - - name: ray-worker - image: $VLLM_IMAGE - imagePullPolicy: IfNotPresent - resources: - limits: - cpu: "100" - google.com/tpu: "4" - ephemeral-storage: 200Gi - memory: 150G - requests: - cpu: "100" - google.com/tpu: "4" - ephemeral-storage: 200Gi - memory: 150G - env: - - name: JAX_PLATFORMS - value: "tpu" - - name: HUGGING_FACE_HUB_TOKEN - valueFrom: - secretKeyRef: - name: hf-secret - key: hf_api_token - nodeSelector: - cloud.google.com/gke-tpu-accelerator: tpu-v4-podslice - cloud.google.com/gke-tpu-topology: 2x2x4 -# [END gke_ai_ml_gke_ray_rayserve_llm_llama_3_1_70b_rayservice_tpu_v4] diff --git a/ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-service-v5e-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-service-v5e-tpu.yaml deleted file mode 100644 index 0d05821316..0000000000 --- a/ai-ml/gke-ray/rayserve/llm/llama-3.1-70b/ray-service-v5e-tpu.yaml +++ /dev/null @@ -1,103 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -# [START gke_ai_ml_gke_ray_rayserve_llm_llama_3_1_70b_rayservice_tpu_v5e] -apiVersion: ray.io/v1 -kind: RayService -metadata: - name: vllm-tpu -spec: - serveConfigV2: | - applications: - - name: llm - import_path: ai-ml.gke-ray.rayserve.llm.serve_tpu:model - deployments: - - name: VLLMDeployment - num_replicas: 1 - runtime_env: - working_dir: "https://github.com/GoogleCloudPlatform/kubernetes-engine-samples/archive/main.zip" - env_vars: - MODEL_ID: "meta-llama/Meta-Llama-3.1-70B" - MAX_MODEL_LEN: "4096" - TPU_CHIPS: "16" - rayClusterConfig: - rayVersion: 2.34.0 - headGroupSpec: - rayStartParams: {} - template: - spec: - containers: - - name: ray-head - image: $VLLM_IMAGE - imagePullPolicy: IfNotPresent - ports: - - containerPort: 6379 - name: gcs - - containerPort: 8265 - name: dashboard - - containerPort: 10001 - name: client - - containerPort: 8000 - name: serve - env: - - name: HUGGING_FACE_HUB_TOKEN - valueFrom: - secretKeyRef: - name: hf-secret - key: hf_api_token - resources: - limits: - cpu: "8" - ephemeral-storage: 8Gi - memory: 40G - requests: - cpu: "8" - ephemeral-storage: 8Gi - memory: 40G - workerGroupSpecs: - - groupName: tpu-group - replicas: 1 - minReplicas: 0 - maxReplicas: 2 - numOfHosts: 4 - rayStartParams: {} - template: - spec: - containers: - - name: ray-worker - image: $VLLM_IMAGE - imagePullPolicy: IfNotPresent - resources: - limits: - cpu: "100" - google.com/tpu: "4" - ephemeral-storage: 200Gi - memory: 150G - requests: - cpu: "100" - google.com/tpu: "4" - ephemeral-storage: 200Gi - memory: 150G - env: - - name: JAX_PLATFORMS - value: "tpu" - - name: HUGGING_FACE_HUB_TOKEN - valueFrom: - secretKeyRef: - name: hf-secret - key: hf_api_token - nodeSelector: - cloud.google.com/gke-tpu-accelerator: tpu-v5-lite-podslice - cloud.google.com/gke-tpu-topology: 4x4 -# [END gke_ai_ml_gke_ray_rayserve_llm_llama_3_1_70b_rayservice_tpu_v5e] diff --git a/ai-ml/gke-ray/rayserve/llm/llava-1.5-13b/ray-cluster-v4-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/llava-1.5-13b/ray-cluster-v4-tpu.yaml deleted file mode 100644 index 2ff2780fea..0000000000 --- a/ai-ml/gke-ray/rayserve/llm/llava-1.5-13b/ray-cluster-v4-tpu.yaml +++ /dev/null @@ -1,88 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -# [START gke_ai_ml_gke_ray_rayserve_llm_llava_1_5_13b_raycluster_tpu_v4] -apiVersion: ray.io/v1 -kind: RayCluster -metadata: - name: vllm-tpu -spec: - headGroupSpec: - rayStartParams: {} - template: - spec: - containers: - - name: ray-head - image: $VLLM_IMAGE - imagePullPolicy: IfNotPresent - resources: - limits: - cpu: "8" - ephemeral-storage: 8Gi - memory: 40G - requests: - cpu: "8" - ephemeral-storage: 8Gi - memory: 40G - env: - - name: HUGGING_FACE_HUB_TOKEN - valueFrom: - secretKeyRef: - name: hf-secret - key: hf_api_token - ports: - - containerPort: 6379 - name: gcs - - containerPort: 8265 - name: dashboard - - containerPort: 10001 - name: client - - containerPort: 8000 - name: serve - workerGroupSpecs: - - groupName: tpu-group - replicas: 1 - minReplicas: 0 - maxReplicas: 2 - numOfHosts: 2 - rayStartParams: {} - template: - spec: - containers: - - name: ray-worker - image: $VLLM_IMAGE - imagePullPolicy: IfNotPresent - resources: - limits: - cpu: "100" - google.com/tpu: "4" - ephemeral-storage: 200Gi - memory: 150G - requests: - cpu: "100" - google.com/tpu: "4" - ephemeral-storage: 200Gi - memory: 150G - env: - - name: JAX_PLATFORMS - value: "tpu" - - name: HUGGING_FACE_HUB_TOKEN - valueFrom: - secretKeyRef: - name: hf-secret - key: hf_api_token - nodeSelector: - cloud.google.com/gke-tpu-accelerator: tpu-v4-podslice - cloud.google.com/gke-tpu-topology: 2x2x2 -# [END gke_ai_ml_gke_ray_rayserve_llm_llava_1_5_13b_raycluster_tpu_v4] diff --git a/ai-ml/gke-ray/rayserve/llm/llava-1.5-13b/ray-cluster-v5e-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/llava-1.5-13b/ray-cluster-v5e-tpu.yaml deleted file mode 100644 index a1c10c73d7..0000000000 --- a/ai-ml/gke-ray/rayserve/llm/llava-1.5-13b/ray-cluster-v5e-tpu.yaml +++ /dev/null @@ -1,88 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -# [START gke_ai_ml_gke_ray_rayserve_llm_llava_1_5_13b_raycluster_tpu_v5e] -apiVersion: ray.io/v1 -kind: RayCluster -metadata: - name: vllm-tpu -spec: - headGroupSpec: - rayStartParams: {} - template: - spec: - containers: - - name: ray-head - image: $VLLM_IMAGE - imagePullPolicy: IfNotPresent - resources: - limits: - cpu: "8" - ephemeral-storage: 8Gi - memory: 40G - requests: - cpu: "8" - ephemeral-storage: 8Gi - memory: 40G - env: - - name: HUGGING_FACE_HUB_TOKEN - valueFrom: - secretKeyRef: - name: hf-secret - key: hf_api_token - ports: - - containerPort: 6379 - name: gcs - - containerPort: 8265 - name: dashboard - - containerPort: 10001 - name: client - - containerPort: 8000 - name: serve - workerGroupSpecs: - - groupName: tpu-group - replicas: 1 - minReplicas: 0 - maxReplicas: 2 - numOfHosts: 2 - rayStartParams: {} - template: - spec: - containers: - - name: ray-worker - image: $VLLM_IMAGE - imagePullPolicy: IfNotPresent - resources: - limits: - cpu: "100" - google.com/tpu: "4" - ephemeral-storage: 200Gi - memory: 150G - requests: - cpu: "100" - google.com/tpu: "4" - ephemeral-storage: 200Gi - memory: 150G - env: - - name: JAX_PLATFORMS - value: "tpu" - - name: HUGGING_FACE_HUB_TOKEN - valueFrom: - secretKeyRef: - name: hf-secret - key: hf_api_token - nodeSelector: - cloud.google.com/gke-tpu-accelerator: tpu-v5-lite-podslice - cloud.google.com/gke-tpu-topology: 2x4 -# [END gke_ai_ml_gke_ray_rayserve_llm_llava_1_5_13b_raycluster_tpu_v5e] diff --git a/ai-ml/gke-ray/rayserve/llm/llava-1.5-13b/ray-service-v4-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/llava-1.5-13b/ray-service-v4-tpu.yaml deleted file mode 100644 index e9cf9dec32..0000000000 --- a/ai-ml/gke-ray/rayserve/llm/llava-1.5-13b/ray-service-v4-tpu.yaml +++ /dev/null @@ -1,103 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -# [START gke_ai_ml_gke_ray_rayserve_llm_llava_1_5_13b_rayservice_tpu_v4] -apiVersion: ray.io/v1 -kind: RayService -metadata: - name: vllm-tpu -spec: - serveConfigV2: | - applications: - - name: llm - import_path: ai-ml.gke-ray.rayserve.llm.serve_tpu:model - deployments: - - name: VLLMDeployment - num_replicas: 1 - runtime_env: - working_dir: "https://github.com/GoogleCloudPlatform/kubernetes-engine-samples/archive/main.zip" - env_vars: - MODEL_ID: "llava-hf/llava-1.5-13b-hf" - TPU_CHIPS: "8" - DTYPE: "bfloat16" - rayClusterConfig: - rayVersion: 2.34.0 - headGroupSpec: - rayStartParams: {} - template: - spec: - containers: - - name: ray-head - image: $VLLM_IMAGE - imagePullPolicy: IfNotPresent - ports: - - containerPort: 6379 - name: gcs - - containerPort: 8265 - name: dashboard - - containerPort: 10001 - name: client - - containerPort: 8000 - name: serve - env: - - name: HUGGING_FACE_HUB_TOKEN - valueFrom: - secretKeyRef: - name: hf-secret - key: hf_api_token - resources: - limits: - cpu: "8" - ephemeral-storage: 8Gi - memory: 40G - requests: - cpu: "8" - ephemeral-storage: 8Gi - memory: 40G - workerGroupSpecs: - - groupName: tpu-group - replicas: 1 - minReplicas: 0 - maxReplicas: 2 - numOfHosts: 2 - rayStartParams: {} - template: - spec: - containers: - - name: ray-worker - image: $VLLM_IMAGE - imagePullPolicy: IfNotPresent - resources: - limits: - cpu: "100" - google.com/tpu: "4" - ephemeral-storage: 200Gi - memory: 150G - requests: - cpu: "100" - google.com/tpu: "4" - ephemeral-storage: 200Gi - memory: 150G - env: - - name: JAX_PLATFORMS - value: "tpu" - - name: HUGGING_FACE_HUB_TOKEN - valueFrom: - secretKeyRef: - name: hf-secret - key: hf_api_token - nodeSelector: - cloud.google.com/gke-tpu-accelerator: tpu-v4-podslice - cloud.google.com/gke-tpu-topology: 2x2x2 -# [END gke_ai_ml_gke_ray_rayserve_llm_llava_1_5_13b_rayservice_tpu_v4] diff --git a/ai-ml/gke-ray/rayserve/llm/llava-1.5-13b/ray-service-v5e-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/llava-1.5-13b/ray-service-v5e-tpu.yaml deleted file mode 100644 index ef1db10dee..0000000000 --- a/ai-ml/gke-ray/rayserve/llm/llava-1.5-13b/ray-service-v5e-tpu.yaml +++ /dev/null @@ -1,103 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -# [START gke_ai_ml_gke_ray_rayserve_llm_llava_1_5_13b_rayservice_tpu_v5e] -apiVersion: ray.io/v1 -kind: RayService -metadata: - name: vllm-tpu -spec: - serveConfigV2: | - applications: - - name: llm - import_path: ai-ml.gke-ray.rayserve.llm.serve_tpu:model - deployments: - - name: VLLMDeployment - num_replicas: 1 - runtime_env: - working_dir: "https://github.com/GoogleCloudPlatform/kubernetes-engine-samples/archive/main.zip" - env_vars: - MODEL_ID: "llava-hf/llava-1.5-13b-hf" - TPU_CHIPS: "8" - DTYPE: "bfloat16" - rayClusterConfig: - rayVersion: 2.34.0 - headGroupSpec: - rayStartParams: {} - template: - spec: - containers: - - name: ray-head - image: $VLLM_IMAGE - imagePullPolicy: IfNotPresent - ports: - - containerPort: 6379 - name: gcs - - containerPort: 8265 - name: dashboard - - containerPort: 10001 - name: client - - containerPort: 8000 - name: serve - env: - - name: HUGGING_FACE_HUB_TOKEN - valueFrom: - secretKeyRef: - name: hf-secret - key: hf_api_token - resources: - limits: - cpu: "8" - ephemeral-storage: 8Gi - memory: 40G - requests: - cpu: "8" - ephemeral-storage: 8Gi - memory: 40G - workerGroupSpecs: - - groupName: tpu-group - replicas: 1 - minReplicas: 0 - maxReplicas: 2 - numOfHosts: 2 - rayStartParams: {} - template: - spec: - containers: - - name: ray-worker - image: $VLLM_IMAGE - imagePullPolicy: IfNotPresent - resources: - limits: - cpu: "100" - google.com/tpu: "4" - ephemeral-storage: 200Gi - memory: 150G - requests: - cpu: "100" - google.com/tpu: "4" - ephemeral-storage: 200Gi - memory: 150G - env: - - name: JAX_PLATFORMS - value: "tpu" - - name: HUGGING_FACE_HUB_TOKEN - valueFrom: - secretKeyRef: - name: hf-secret - key: hf_api_token - nodeSelector: - cloud.google.com/gke-tpu-accelerator: tpu-v5-lite-podslice - cloud.google.com/gke-tpu-topology: 2x4 -# [END gke_ai_ml_gke_ray_rayserve_llm_llava_1_5_13b_rayservice_tpu_v5e] diff --git a/ai-ml/gke-ray/rayserve/llm/mistral-7b/ray-cluster-v4-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/mistral-7b/ray-cluster-v4-tpu.yaml deleted file mode 100644 index afa639fe35..0000000000 --- a/ai-ml/gke-ray/rayserve/llm/mistral-7b/ray-cluster-v4-tpu.yaml +++ /dev/null @@ -1,88 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -# [START gke_ai_ml_gke_ray_rayserve_mistral_7b_raycluster_tpu_v4] -apiVersion: ray.io/v1 -kind: RayCluster -metadata: - name: vllm-tpu -spec: - headGroupSpec: - rayStartParams: {} - template: - spec: - containers: - - name: ray-head - image: $VLLM_IMAGE - imagePullPolicy: IfNotPresent - resources: - limits: - cpu: "8" - ephemeral-storage: 8Gi - memory: 40G - requests: - cpu: "8" - ephemeral-storage: 8Gi - memory: 40G - env: - - name: HUGGING_FACE_HUB_TOKEN - valueFrom: - secretKeyRef: - name: hf-secret - key: hf_api_token - ports: - - containerPort: 6379 - name: gcs - - containerPort: 8265 - name: dashboard - - containerPort: 10001 - name: client - - containerPort: 8000 - name: serve - workerGroupSpecs: - - groupName: tpu-group - replicas: 1 - minReplicas: 0 - maxReplicas: 2 - numOfHosts: 2 - rayStartParams: {} - template: - spec: - containers: - - name: ray-worker - image: $VLLM_IMAGE - imagePullPolicy: IfNotPresent - resources: - limits: - cpu: "100" - google.com/tpu: "4" - ephemeral-storage: 50Gi - memory: 100G - requests: - cpu: "100" - google.com/tpu: "4" - ephemeral-storage: 50Gi - memory: 100G - env: - - name: JAX_PLATFORMS - value: "tpu" - - name: HUGGING_FACE_HUB_TOKEN - valueFrom: - secretKeyRef: - name: hf-secret - key: hf_api_token - nodeSelector: - cloud.google.com/gke-tpu-accelerator: tpu-v4-podslice - cloud.google.com/gke-tpu-topology: 2x2x2 -# [END gke_ai_ml_gke_ray_rayserve_mistral_7b_raycluster_tpu_v4] diff --git a/ai-ml/gke-ray/rayserve/llm/mistral-7b/ray-cluster-v5e-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/mistral-7b/ray-cluster-v5e-tpu.yaml deleted file mode 100644 index 290c6cd9a8..0000000000 --- a/ai-ml/gke-ray/rayserve/llm/mistral-7b/ray-cluster-v5e-tpu.yaml +++ /dev/null @@ -1,88 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -# [START gke_ai_ml_gke_ray_rayserve_mistral_7b_raycluster_tpu_v5e] -apiVersion: ray.io/v1 -kind: RayCluster -metadata: - name: vllm-tpu -spec: - headGroupSpec: - rayStartParams: {} - template: - spec: - containers: - - name: ray-head - image: $VLLM_IMAGE - imagePullPolicy: IfNotPresent - resources: - limits: - cpu: "8" - ephemeral-storage: 8Gi - memory: 40G - requests: - cpu: "8" - ephemeral-storage: 8Gi - memory: 40G - env: - - name: HUGGING_FACE_HUB_TOKEN - valueFrom: - secretKeyRef: - name: hf-secret - key: hf_api_token - ports: - - containerPort: 6379 - name: gcs - - containerPort: 8265 - name: dashboard - - containerPort: 10001 - name: client - - containerPort: 8000 - name: serve - workerGroupSpecs: - - groupName: tpu-group - replicas: 1 - minReplicas: 0 - maxReplicas: 2 - numOfHosts: 2 - rayStartParams: {} - template: - spec: - containers: - - name: ray-worker - image: $VLLM_IMAGE - imagePullPolicy: IfNotPresent - resources: - limits: - cpu: "100" - google.com/tpu: "4" - ephemeral-storage: 50Gi - memory: 100G - requests: - cpu: "100" - google.com/tpu: "4" - ephemeral-storage: 50Gi - memory: 100G - env: - - name: JAX_PLATFORMS - value: "tpu" - - name: HUGGING_FACE_HUB_TOKEN - valueFrom: - secretKeyRef: - name: hf-secret - key: hf_api_token - nodeSelector: - cloud.google.com/gke-tpu-accelerator: tpu-v5-lite-podslice - cloud.google.com/gke-tpu-topology: 2x4 -# [END gke_ai_ml_gke_ray_rayserve_mistral_7b_raycluster_tpu_v5e] diff --git a/ai-ml/gke-ray/rayserve/llm/mistral-7b/ray-service-v4-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/mistral-7b/ray-service-v4-tpu.yaml deleted file mode 100644 index 65a3958b52..0000000000 --- a/ai-ml/gke-ray/rayserve/llm/mistral-7b/ray-service-v4-tpu.yaml +++ /dev/null @@ -1,103 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -# [START gke_ai_ml_gke_ray_rayserve_llm_mistral_7b_rayservice_tpu_v4] -apiVersion: ray.io/v1 -kind: RayService -metadata: - name: vllm-tpu -spec: - serveConfigV2: | - applications: - - name: llm - import_path: ai-ml.gke-ray.rayserve.llm.serve_tpu:model - deployments: - - name: VLLMDeployment - num_replicas: 1 - runtime_env: - working_dir: "https://github.com/GoogleCloudPlatform/kubernetes-engine-samples/archive/main.zip" - env_vars: - MODEL_ID: "mistralai/Mistral-7B-Instruct-v0.3" - TPU_CHIPS: "8" - TOKENIZER_MODE: "mistral" - rayClusterConfig: - rayVersion: 2.34.0 - headGroupSpec: - rayStartParams: {} - template: - spec: - containers: - - name: ray-head - image: $VLLM_IMAGE - imagePullPolicy: IfNotPresent - ports: - - containerPort: 6379 - name: gcs - - containerPort: 8265 - name: dashboard - - containerPort: 10001 - name: client - - containerPort: 8000 - name: serve - env: - - name: HUGGING_FACE_HUB_TOKEN - valueFrom: - secretKeyRef: - name: hf-secret - key: hf_api_token - resources: - limits: - cpu: "8" - ephemeral-storage: 8Gi - memory: 40G - requests: - cpu: "8" - ephemeral-storage: 8Gi - memory: 40G - workerGroupSpecs: - - groupName: tpu-group - replicas: 1 - minReplicas: 0 - maxReplicas: 2 - numOfHosts: 2 - rayStartParams: {} - template: - spec: - containers: - - name: ray-worker - image: $VLLM_IMAGE - imagePullPolicy: IfNotPresent - resources: - limits: - cpu: "100" - google.com/tpu: "4" - ephemeral-storage: 50Gi - memory: 100G - requests: - cpu: "100" - google.com/tpu: "4" - ephemeral-storage: 50Gi - memory: 100G - env: - - name: JAX_PLATFORMS - value: "tpu" - - name: HUGGING_FACE_HUB_TOKEN - valueFrom: - secretKeyRef: - name: hf-secret - key: hf_api_token - nodeSelector: - cloud.google.com/gke-tpu-accelerator: tpu-v4-podslice - cloud.google.com/gke-tpu-topology: 2x2x2 -# [END gke_ai_ml_gke_ray_rayserve_llm_mistral_7b_rayservice_tpu_v4] diff --git a/ai-ml/gke-ray/rayserve/llm/mistral-7b/ray-service-v5e-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/mistral-7b/ray-service-v5e-tpu.yaml deleted file mode 100644 index 184c99ccf1..0000000000 --- a/ai-ml/gke-ray/rayserve/llm/mistral-7b/ray-service-v5e-tpu.yaml +++ /dev/null @@ -1,103 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -# [START gke_ai_ml_gke_ray_rayserve_mistral_7b_rayservice_tpu_v5e] -apiVersion: ray.io/v1 -kind: RayService -metadata: - name: vllm-tpu -spec: - serveConfigV2: | - applications: - - name: llm - import_path: ai-ml.gke-ray.rayserve.llm.serve_tpu:model - deployments: - - name: VLLMDeployment - num_replicas: 1 - runtime_env: - working_dir: "https://github.com/GoogleCloudPlatform/kubernetes-engine-samples/archive/main.zip" - env_vars: - MODEL_ID: "mistralai/Mistral-7B-Instruct-v0.3" - TPU_CHIPS: "8" - TOKENIZER_MODE: "mistral" - rayClusterConfig: - rayVersion: 2.34.0 - headGroupSpec: - rayStartParams: {} - template: - spec: - containers: - - name: ray-head - image: $VLLM_IMAGE - imagePullPolicy: IfNotPresent - ports: - - containerPort: 6379 - name: gcs - - containerPort: 8265 - name: dashboard - - containerPort: 10001 - name: client - - containerPort: 8000 - name: serve - env: - - name: HUGGING_FACE_HUB_TOKEN - valueFrom: - secretKeyRef: - name: hf-secret - key: hf_api_token - resources: - limits: - cpu: "8" - ephemeral-storage: 8Gi - memory: 40G - requests: - cpu: "8" - ephemeral-storage: 8Gi - memory: 40G - workerGroupSpecs: - - groupName: tpu-group - replicas: 1 - minReplicas: 0 - maxReplicas: 2 - numOfHosts: 2 - rayStartParams: {} - template: - spec: - containers: - - name: ray-worker - image: $VLLM_IMAGE - imagePullPolicy: IfNotPresent - resources: - limits: - cpu: "100" - google.com/tpu: "4" - ephemeral-storage: 50Gi - memory: 100G - requests: - cpu: "100" - google.com/tpu: "4" - ephemeral-storage: 50Gi - memory: 100G - env: - - name: JAX_PLATFORMS - value: "tpu" - - name: HUGGING_FACE_HUB_TOKEN - valueFrom: - secretKeyRef: - name: hf-secret - key: hf_api_token - nodeSelector: - cloud.google.com/gke-tpu-accelerator: tpu-v5-lite-podslice - cloud.google.com/gke-tpu-topology: 2x4 -# [END gke_ai_ml_gke_ray_rayserve_mistral_7b_rayservice_tpu_v5e] diff --git a/ai-ml/gke-ray/rayserve/llm/model-composition/ray-service-tpu.yaml b/ai-ml/gke-ray/rayserve/llm/model-composition/ray-service-tpu.yaml deleted file mode 100644 index b69b8deb27..0000000000 --- a/ai-ml/gke-ray/rayserve/llm/model-composition/ray-service-tpu.yaml +++ /dev/null @@ -1,99 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -# [START gke_ai_ml_gke_ray_rayserve_llm_model_composition_tpu] -apiVersion: ray.io/v1 -kind: RayService -metadata: - name: vllm-tpu -spec: - serveConfigV2: | - applications: - - name: llm - route_prefix: / - import_path: ai-ml.gke-ray.rayserve.llm.model-composition.serve_tpu:multi_model - deployments: - - name: MultiModelDeployment - num_replicas: 1 - runtime_env: - working_dir: "https://github.com/GoogleCloudPlatform/kubernetes-engine-samples/archive/main.zip" - env_vars: - ASSIST_MODEL_ID: "meta-llama/Meta-Llama-3-8B-Instruct" - SUMMARIZER_MODEL_ID: "google/gemma-7b-it" - TPU_CHIPS: "16" - TPU_HEADS: "2" - rayClusterConfig: - headGroupSpec: - rayStartParams: - dashboard-host: '0.0.0.0' - template: - spec: - containers: - - name: ray-head - image: $VLLM_IMAGE - resources: - limits: - cpu: "8" - memory: 40G - requests: - cpu: "8" - memory: 40G - ports: - - containerPort: 6379 - name: gcs-server - - containerPort: 8265 - name: dashboard - - containerPort: 10001 - name: client - - containerPort: 8000 - name: serve - env: - - name: HUGGING_FACE_HUB_TOKEN - valueFrom: - secretKeyRef: - name: hf-secret - key: hf_api_token - workerGroupSpecs: - - replicas: 2 - minReplicas: 0 - maxReplicas: 4 - numOfHosts: 2 - groupName: tpu-group - rayStartParams: {} - template: - spec: - containers: - - name: llm - image: $VLLM_IMAGE - env: - - name: HUGGING_FACE_HUB_TOKEN - valueFrom: - secretKeyRef: - name: hf-secret - key: hf_api_token - resources: - limits: - cpu: "100" - google.com/tpu: "4" - ephemeral-storage: 50Gi - memory: 100G - requests: - cpu: "100" - google.com/tpu: "4" - ephemeral-storage: 50Gi - memory: 100G - nodeSelector: - cloud.google.com/gke-tpu-accelerator: tpu-v5-lite-podslice - cloud.google.com/gke-tpu-topology: 2x4 -# [END gke_ai_ml_gke_ray_rayserve_llm_model_composition_tpu] diff --git a/ai-ml/gke-ray/rayserve/llm/model-composition/serve_tpu.py b/ai-ml/gke-ray/rayserve/llm/model-composition/serve_tpu.py deleted file mode 100644 index 36caadb663..0000000000 --- a/ai-ml/gke-ray/rayserve/llm/model-composition/serve_tpu.py +++ /dev/null @@ -1,222 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# NOTE: this file was inspired from: https://github.com/ray-project/ray/blob//master/doc/source/serve/doc_code/vllm_example.py - -import json -import os -from typing import AsyncGenerator, Dict, List, Optional -import random - -from fastapi import BackgroundTasks -from starlette.requests import Request -from starlette.responses import Response, StreamingResponse -from vllm.engine.arg_utils import AsyncEngineArgs -from vllm.engine.async_llm_engine import AsyncLLMEngine -from vllm.sampling_params import SamplingParams -from vllm.utils import random_uuid - -import ray -from ray import serve -from ray.serve.handle import DeploymentHandle - - -@serve.deployment(name="VLLMDeployment") -class VLLMDeployment: - def __init__(self, **kwargs): - """ - Construct a VLLM deployment. 
-
-        Refer to https://github.com/vllm-project/vllm/blob/main/vllm/engine/arg_utils.py
-        for the full list of arguments.
-
-        Args:
-            model: name or path of the huggingface model to use
-            download_dir: directory to download and load the weights,
-                defaults to the default cache dir of huggingface.
-            use_np_weights: save a numpy copy of model weights for
-                faster loading. This can increase the disk usage by up to 2x.
-            use_dummy_weights: use dummy values for model weights.
-            dtype: data type for model weights and activations.
-                The "auto" option will use FP16 precision
-                for FP32 and FP16 models, and BF16 precision
-                for BF16 models.
-            seed: random seed.
-            worker_use_ray: use Ray for distributed serving; will be
-                set automatically when using more than 1 GPU.
-            pipeline_parallel_size: number of pipeline stages.
-            tensor_parallel_size: number of tensor parallel replicas.
-            block_size: token block size.
-            swap_space: CPU swap space size (GiB) per GPU.
-            gpu_memory_utilization: the percentage of GPU memory to be used for
-                the model executor.
-            max_num_batched_tokens: maximum number of batched tokens per iteration.
-            max_num_seqs: maximum number of sequences per iteration.
-            disable_log_stats: disable logging statistics.
-            engine_use_ray: use Ray to start the LLM engine in a separate
-                process from the server process.
-            disable_log_requests: disable logging requests.
-        """
-        args = AsyncEngineArgs(**kwargs)
-        self.engine = AsyncLLMEngine.from_engine_args(args)
-
-    async def stream_results(self, results_generator) -> AsyncGenerator[bytes, None]:
-        num_returned = 0
-        async for request_output in results_generator:
-            text_outputs = [output.text for output in request_output.outputs]
-            assert len(text_outputs) == 1
-            text_output = text_outputs[0][num_returned:]
-            ret = {"text": text_output}
-            yield (json.dumps(ret) + "\n").encode("utf-8")
-            num_returned += len(text_output)
-
-    async def may_abort_request(self, request_id) -> None:
-        await self.engine.abort(request_id)
-
-    async def __call__(self, request_dict: dict) -> str:
-        """Generate completion for the request.
-
-        The request should be a JSON object with the following fields:
-        - prompt: the prompt to use for the generation.
-        - stream: whether to stream the results or not.
-        - other fields: the sampling parameters (See `SamplingParams` for details).
-        """
-        # request_dict = await request.json()
-        prompt = request_dict.pop("prompt")
-        stream = request_dict.pop("stream", False)
-        max_tokens = request_dict.pop("max_tokens", 1000)
-        sampling_params = SamplingParams(**request_dict)
-        request_id = random_uuid()
-        results_generator = self.engine.generate(
-            prompt, sampling_params, request_id)
-        if stream:
-            background_tasks = BackgroundTasks()
-            # Use background_tasks to abort the request
-            # if the client disconnects.
- background_tasks.add_task(self.may_abort_request, request_id) - return StreamingResponse( - self.stream_results(results_generator), background=background_tasks - ) - - final_output = None - async for request_output in results_generator: - final_output = request_output - - assert final_output is not None - prompt = final_output.prompt - text_outputs = [ - output.text for output in final_output.outputs] - ret = {"text": text_outputs, "max_tokens": max_tokens} - return json.dumps(ret) - - -@serve.deployment -class VLLMSummarizerDeployment: - def __init__(self, **kwargs): - args = AsyncEngineArgs(**kwargs) - self.engine = AsyncLLMEngine.from_engine_args(args) - - async def __call__(self, response: str) -> str: - """Generates summarization of a response from another model. - - The response should be a JSON object with the following fields: - - text: the response returned from another model to summarize - """ - request_dict = json.loads(response) - text = request_dict.pop("text") - prompt = f"Summarize the following text into a single sentence: {text}" - sampling_params = SamplingParams(**request_dict) - request_id = random_uuid() - results_generator = self.engine.generate( - prompt, sampling_params, request_id) - - final_output = None - async for request_output in results_generator: - final_output = request_output - - assert final_output is not None - prompt = final_output.prompt - text_outputs = [ - output.text for output in final_output.outputs] - ret = {"text": text_outputs} - return json.dumps(ret) - - -@serve.deployment -class MultiModelDeployment: - def __init__(self, assist_model: DeploymentHandle, summarizer_model: DeploymentHandle): - self.assistant_model = assist_model - self.summarizer_model = summarizer_model - - async def __call__(self, request: Request) -> Response: - model_request = await request.json() - assistant_response = self.assistant_model.remote(model_request) - summarizer_response = await self.summarizer_model.remote(assistant_response) - return Response(content=summarizer_response) - -def get_num_tpu_chips() -> int: - if "TPU" not in ray.cluster_resources(): - # Pass in TPU chips when the current Ray cluster resources can't be auto-detected (i.e for autoscaling). - if os.environ.get('TPU_CHIPS') is not None: - return int(os.environ.get('TPU_CHIPS')) - return 0 - return int(ray.cluster_resources()["TPU"]) - -def get_tpu_head() -> Optional[str]: - if "TPU" not in ray.cluster_resources(): - # Pass in # TPU heads when the current Ray cluster resources can't be auto-detected. - if os.environ.get('TPU_HEADS') is not None: - return int(os.environ.get('TPU_HEADS')) - # return the TPU-{accelerator}-head resource - for key, _ in ray.cluster_resources().items(): - if key.endswith("head"): - return key - return None - -def build_app(cli_args: Dict[str, str]) -> serve.Application: - """Builds the Serve app based on CLI arguments.""" - ray.init(ignore_reinit_error=True) - - num_tpu_chips = get_num_tpu_chips() - tpu_head = get_tpu_head() - tpu_slices = 1 - if tpu_head is not None: - tpu_slices = ray.cluster_resources()[tpu_head] - num_tpu_chips_per_slice = int(num_tpu_chips/tpu_slices) - # Construct a placement group for 1 TPU slice. Each model should run on its own slice. - pg_resources = [] - pg_resources.append({"CPU": 1}) # for the deployment replica - for i in range(num_tpu_chips_per_slice): - pg_resources.append({"CPU": 1, "TPU": 1}) # for the vLLM actors - # Add a TPU head to the placement group to ensure Ray workers are not placed across slices. 
-
-    pg_resources.append({tpu_head: 1})
-
-    return MultiModelDeployment.bind(
-        VLLMDeployment.options(
-            placement_group_bundles=pg_resources,
-            placement_group_strategy="PACK").bind(
-                model=os.environ['ASSIST_MODEL_ID'],
-                tensor_parallel_size=num_tpu_chips_per_slice,
-                enforce_eager=True,
-        ),
-        VLLMSummarizerDeployment.options(
-            placement_group_bundles=pg_resources,
-            placement_group_strategy="PACK").bind(
-                model=os.environ['SUMMARIZER_MODEL_ID'],
-                tensor_parallel_size=num_tpu_chips_per_slice,
-                enforce_eager=True,
-        ),
-    )
-
-multi_model = build_app({})
diff --git a/ai-ml/gke-ray/rayserve/llm/serve_tpu.py b/ai-ml/gke-ray/rayserve/llm/serve_tpu.py
index 347a402e05..1042097fc6 100644
--- a/ai-ml/gke-ray/rayserve/llm/serve_tpu.py
+++ b/ai-ml/gke-ray/rayserve/llm/serve_tpu.py
@@ -43,7 +43,8 @@ def __init__(
         dtype,
     ):
         self.llm = LLM(
-            model=os.environ['MODEL_ID'],  # Error if not provided.
+            model="/data/Meta-Llama-3.1-405B-Instruct",  # Local path on the GCS FUSE volume mounted at /data.
+            served_model_name="meta-llama/Meta-Llama-3.1-405B-Instruct",
             tensor_parallel_size=num_tpu_chips,
             max_model_len=max_model_len,
             dtype=dtype,
diff --git a/ai-ml/gke-ray/rayserve/llm/tpu/ray-cluster.tpu-v5e-multihost.yaml b/ai-ml/gke-ray/rayserve/llm/tpu/ray-cluster.tpu-v5e-multihost.yaml
new file mode 100644
index 0000000000..5547110fae
--- /dev/null
+++ b/ai-ml/gke-ray/rayserve/llm/tpu/ray-cluster.tpu-v5e-multihost.yaml
@@ -0,0 +1,150 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
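+
+# This manifest defines a multi-host TPU v5e RayCluster for vLLM: a CPU-only
+# Ray head pod plus one worker group of 16 hosts with 4 TPU chips each (an
+# 8x8 v5e slice, 64 chips in total). The $GSBUCKET bucket is mounted at /data
+# through the GCS FUSE CSI driver and holds the vLLM XLA compilation cache
+# (VLLM_XLA_CACHE_PATH), so compiled artifacts can be reused across pod
+# restarts.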
+ +# [START gke_ai_ml_gke_ray_rayserve_tpu_raycluster_v5e_multihost] +apiVersion: ray.io/v1 +kind: RayCluster +metadata: + name: vllm-tpu +spec: + headGroupSpec: + rayStartParams: {} + template: + metadata: + annotations: + gke-gcsfuse/volumes: "true" + gke-gcsfuse/cpu-limit: "0" + gke-gcsfuse/memory-limit: "0" + gke-gcsfuse/ephemeral-storage-limit: "0" + spec: + serviceAccountName: $KSA_NAME + containers: + - name: ray-head + image: $VLLM_IMAGE + imagePullPolicy: IfNotPresent + resources: + limits: + cpu: "8" + ephemeral-storage: 5Gi + memory: 40G + requests: + cpu: "8" + ephemeral-storage: 5Gi + memory: 40G + env: + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-secret + key: hf_api_token + - name: VLLM_XLA_CACHE_PATH + value: "/data" + - name: MODEL_ID + value: "meta-llama/Meta-Llama-3.1-405B-Instruct" + - name: MAX_MODEL_LEN + value: "4096" + ports: + - containerPort: 6379 + name: gcs + - containerPort: 8265 + name: dashboard + - containerPort: 10001 + name: client + - containerPort: 8000 + name: serve + - containerPort: 8471 + name: slicebuilder + - containerPort: 8081 + name: mxla + volumeMounts: + - name: gcs-fuse-csi-ephemeral + mountPath: /data + - name: dshm + mountPath: /dev/shm + volumes: + - name: gke-gcsfuse-cache + emptyDir: + medium: Memory + - name: dshm + emptyDir: + medium: Memory + - name: gcs-fuse-csi-ephemeral + csi: + driver: gcsfuse.csi.storage.gke.io + volumeAttributes: + bucketName: $GSBUCKET + mountOptions: "implicit-dirs" + workerGroupSpecs: + - groupName: tpu-group + replicas: 1 + minReplicas: 1 + maxReplicas: 1 + numOfHosts: 16 + rayStartParams: {} + template: + metadata: + annotations: + gke-gcsfuse/volumes: "true" + gke-gcsfuse/cpu-limit: "0" + gke-gcsfuse/memory-limit: "0" + gke-gcsfuse/ephemeral-storage-limit: "0" + spec: + serviceAccountName: $KSA_NAME + containers: + - name: ray-worker + image: $VLLM_IMAGE + imagePullPolicy: IfNotPresent + resources: + limits: + cpu: "100" + google.com/tpu: "4" + ephemeral-storage: 40G + memory: 150G + requests: + cpu: "100" + google.com/tpu: "4" + ephemeral-storage: 40G + memory: 150G + env: + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-secret + key: hf_api_token + - name: VLLM_XLA_CACHE_PATH + value: "/data" + securityContext: + privileged: true + volumeMounts: + - name: gcs-fuse-csi-ephemeral + mountPath: /data + - name: dshm + mountPath: /dev/shm + volumes: + - name: gke-gcsfuse-cache + emptyDir: + medium: Memory + - name: dshm + emptyDir: + medium: Memory + - name: gcs-fuse-csi-ephemeral + csi: + driver: gcsfuse.csi.storage.gke.io + volumeAttributes: + bucketName: $GSBUCKET + mountOptions: "implicit-dirs" + nodeSelector: + cloud.google.com/gke-tpu-accelerator: tpu-v5-lite-podslice + cloud.google.com/gke-tpu-topology: 8x8 +# [END gke_ai_ml_gke_ray_rayserve_tpu_raycluster_v5e_multihost] diff --git a/ai-ml/gke-ray/rayserve/llm/tpu/ray-cluster.tpu-v6e-multihost.yaml b/ai-ml/gke-ray/rayserve/llm/tpu/ray-cluster.tpu-v6e-multihost.yaml new file mode 100644 index 0000000000..9e015cf9c1 --- /dev/null +++ b/ai-ml/gke-ray/rayserve/llm/tpu/ray-cluster.tpu-v6e-multihost.yaml @@ -0,0 +1,146 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# [START gke_ai_ml_gke_ray_rayserve_tpu_raycluster_v6e_multihost] +apiVersion: ray.io/v1 +kind: RayCluster +metadata: + name: vllm-tpu +spec: + headGroupSpec: + rayStartParams: {} + template: + metadata: + annotations: + gke-gcsfuse/volumes: "true" + gke-gcsfuse/cpu-limit: "0" + gke-gcsfuse/memory-limit: "0" + gke-gcsfuse/ephemeral-storage-limit: "0" + spec: + serviceAccountName: $KSA_NAME + containers: + - name: ray-head + image: $VLLM_IMAGE + imagePullPolicy: IfNotPresent + resources: + limits: + cpu: "8" + ephemeral-storage: 5Gi + memory: 40G + requests: + cpu: "8" + ephemeral-storage: 5Gi + memory: 40G + env: + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-secret + key: hf_api_token + - name: VLLM_XLA_CACHE_PATH + value: "/data" + ports: + - containerPort: 6379 + name: gcs + - containerPort: 8265 + name: dashboard + - containerPort: 10001 + name: client + - containerPort: 8000 + name: serve + - containerPort: 8471 + name: slicebuilder + - containerPort: 8081 + name: mxla + volumeMounts: + - name: gcs-fuse-csi-ephemeral + mountPath: /data + - name: dshm + mountPath: /dev/shm + volumes: + - name: gke-gcsfuse-cache + emptyDir: + medium: Memory + - name: dshm + emptyDir: + medium: Memory + - name: gcs-fuse-csi-ephemeral + csi: + driver: gcsfuse.csi.storage.gke.io + volumeAttributes: + bucketName: $GSBUCKET + mountOptions: "implicit-dirs" + workerGroupSpecs: + - groupName: tpu-group + replicas: 1 + minReplicas: 1 + maxReplicas: 1 + numOfHosts: 8 + rayStartParams: {} + template: + metadata: + annotations: + gke-gcsfuse/volumes: "true" + gke-gcsfuse/cpu-limit: "0" + gke-gcsfuse/memory-limit: "0" + gke-gcsfuse/ephemeral-storage-limit: "0" + spec: + serviceAccountName: $KSA_NAME + containers: + - name: ray-worker + image: $VLLM_IMAGE + imagePullPolicy: IfNotPresent + resources: + limits: + cpu: "100" + google.com/tpu: "4" + ephemeral-storage: 60G + memory: 200G + requests: + cpu: "100" + google.com/tpu: "4" + ephemeral-storage: 60G + memory: 200G + env: + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-secret + key: hf_api_token + - name: VLLM_XLA_CACHE_PATH + value: "/data" + securityContext: + privileged: true + volumeMounts: + - name: gcs-fuse-csi-ephemeral + mountPath: /data + - name: dshm + mountPath: /dev/shm + volumes: + - name: gke-gcsfuse-cache + emptyDir: + medium: Memory + - name: dshm + emptyDir: + medium: Memory + - name: gcs-fuse-csi-ephemeral + csi: + driver: gcsfuse.csi.storage.gke.io + volumeAttributes: + bucketName: $GSBUCKET + mountOptions: "implicit-dirs" + nodeSelector: + cloud.google.com/gke-tpu-accelerator: tpu-v6e-slice + cloud.google.com/gke-tpu-topology: 4x8 +# [END gke_ai_ml_gke_ray_rayserve_tpu_raycluster_v6e_multihost] From 53bbfeb22eaf5fe9cf232d2713e8090a9fc35e4e Mon Sep 17 00:00:00 2001 From: ryanaoleary Date: Thu, 14 Nov 2024 01:48:30 +0000 Subject: [PATCH 16/16] Add v5e and v6e RayServices Signed-off-by: ryanaoleary --- .../tpu/ray-service.tpu-v5e-multihost.yaml | 156 ++++++++++++++++++ .../tpu/ray-service.tpu-v6e-multihost.yaml | 156 ++++++++++++++++++ 2 files changed, 312 insertions(+) create mode 
100644 ai-ml/gke-ray/rayserve/llm/tpu/ray-service.tpu-v5e-multihost.yaml
 create mode 100644 ai-ml/gke-ray/rayserve/llm/tpu/ray-service.tpu-v6e-multihost.yaml

diff --git a/ai-ml/gke-ray/rayserve/llm/tpu/ray-service.tpu-v5e-multihost.yaml b/ai-ml/gke-ray/rayserve/llm/tpu/ray-service.tpu-v5e-multihost.yaml
new file mode 100644
index 0000000000..b09d6afa4b
--- /dev/null
+++ b/ai-ml/gke-ray/rayserve/llm/tpu/ray-service.tpu-v5e-multihost.yaml
@@ -0,0 +1,156 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# [START gke_ai_ml_gke_ray_rayserve_tpu_rayservice_v5e_multihost]
+apiVersion: ray.io/v1
+kind: RayService
+metadata:
+  name: vllm-tpu
+spec:
+  serveConfigV2: |
+    applications:
+      - name: llm
+        import_path: ai-ml.gke-ray.rayserve.llm.tpu.serve_tpu:model
+        deployments:
+        - name: VLLMDeployment
+          num_replicas: 1
+        runtime_env:
+          working_dir: "https://github.com/ryanaoleary/kubernetes-engine-samples/archive/refs/heads/multihost-example.zip"
+          env_vars:
+            MODEL_ID: "$MODEL_ID"
+            MAX_MODEL_LEN: "$MAX_MODEL_LEN"
+            TPU_CHIPS: "64"
+  rayClusterConfig:
+    headGroupSpec:
+      rayStartParams: {}
+      template:
+        metadata:
+          annotations:
+            gke-gcsfuse/volumes: "true"
+            gke-gcsfuse/cpu-limit: "0"
+            gke-gcsfuse/memory-limit: "0"
+            gke-gcsfuse/ephemeral-storage-limit: "0"
+        spec:
+          serviceAccountName: $KSA_NAME
+          containers:
+          - name: ray-head
+            image: $VLLM_IMAGE
+            imagePullPolicy: IfNotPresent
+            ports:
+            - containerPort: 6379
+              name: gcs
+            - containerPort: 8265
+              name: dashboard
+            - containerPort: 10001
+              name: client
+            - containerPort: 8000
+              name: serve
+            env:
+            - name: HUGGING_FACE_HUB_TOKEN
+              valueFrom:
+                secretKeyRef:
+                  name: hf-secret
+                  key: hf_api_token
+            - name: VLLM_XLA_CACHE_PATH
+              value: "/data"
+            resources:
+              limits:
+                cpu: "8"
+                ephemeral-storage: 5Gi
+                memory: 40G
+              requests:
+                cpu: "8"
+                ephemeral-storage: 5Gi
+                memory: 40G
+            volumeMounts:
+            - name: gcs-fuse-csi-ephemeral
+              mountPath: /data
+            - name: dshm
+              mountPath: /dev/shm
+          volumes:
+          - name: gke-gcsfuse-cache
+            emptyDir:
+              medium: Memory
+          - name: dshm
+            emptyDir:
+              medium: Memory
+          - name: gcs-fuse-csi-ephemeral
+            csi:
+              driver: gcsfuse.csi.storage.gke.io
+              volumeAttributes:
+                bucketName: $GSBUCKET
+                mountOptions: "implicit-dirs,file-cache:enable-parallel-downloads:true,file-cache:parallel-downloads-per-file:100,file-cache:max-parallel-downloads:-1,file-cache:download-chunk-size-mb:10,file-cache:max-size-mb:-1"
+    workerGroupSpecs:
+    - groupName: tpu-group
+      replicas: 1
+      minReplicas: 1
+      maxReplicas: 1
+      numOfHosts: 16
+      rayStartParams: {}
+      template:
+        metadata:
+          annotations:
+            gke-gcsfuse/volumes: "true"
+            gke-gcsfuse/cpu-limit: "0"
+            gke-gcsfuse/memory-limit: "0"
+            gke-gcsfuse/ephemeral-storage-limit: "0"
+        spec:
+          serviceAccountName: $KSA_NAME
+          containers:
+          - name: ray-worker
+            image: $VLLM_IMAGE
+            imagePullPolicy: IfNotPresent
+            resources:
+              limits:
+                cpu: "100"
+                google.com/tpu: "4"
+                ephemeral-storage: 40G
+                memory: 200G
+              requests:
+                cpu: "100"
+                google.com/tpu: "4"
+                ephemeral-storage: 40G
+                memory: 
200G + env: + - name: JAX_PLATFORMS + value: "tpu" + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-secret + key: hf_api_token + - name: VLLM_XLA_CACHE_PATH + value: "/data" + volumeMounts: + - name: gcs-fuse-csi-ephemeral + mountPath: /data + - name: dshm + mountPath: /dev/shm + volumes: + - name: gke-gcsfuse-cache + emptyDir: + medium: Memory + - name: dshm + emptyDir: + medium: Memory + - name: gcs-fuse-csi-ephemeral + csi: + driver: gcsfuse.csi.storage.gke.io + volumeAttributes: + bucketName: $GSBUCKET + mountOptions: "implicit-dirs,file-cache:enable-parallel-downloads:true,file-cache:parallel-downloads-per-file:100,file-cache:max-parallel-downloads:-1,file-cache:download-chunk-size-mb:10,file-cache:max-size-mb:-1" + nodeSelector: + cloud.google.com/gke-tpu-accelerator: tpu-v5-lite-podslice + cloud.google.com/gke-tpu-topology: 8x8 +# [END gke_ai_ml_gke_ray_rayserve_tpu_rayservice_v5e_multihost] diff --git a/ai-ml/gke-ray/rayserve/llm/tpu/ray-service.tpu-v6e-multihost.yaml b/ai-ml/gke-ray/rayserve/llm/tpu/ray-service.tpu-v6e-multihost.yaml new file mode 100644 index 0000000000..6c34728bf3 --- /dev/null +++ b/ai-ml/gke-ray/rayserve/llm/tpu/ray-service.tpu-v6e-multihost.yaml @@ -0,0 +1,156 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
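+
+# This manifest defines a RayService that deploys the serve_tpu.py application
+# (fetched through runtime_env.working_dir) onto a multi-host TPU v6e slice:
+# one worker group of 8 hosts forming a 4x8 topology, matching the
+# TPU_CHIPS: "32" value passed to the app. MODEL_ID and MAX_MODEL_LEN are
+# expected to be substituted from the environment (for example with envsubst)
+# before the manifest is applied.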
+
+# [START gke_ai_ml_gke_ray_rayserve_tpu_rayservice_v6e_multihost]
+apiVersion: ray.io/v1
+kind: RayService
+metadata:
+  name: vllm-tpu
+spec:
+  serveConfigV2: |
+    applications:
+      - name: llm
+        import_path: ai-ml.gke-ray.rayserve.llm.tpu.serve_tpu:model
+        deployments:
+        - name: VLLMDeployment
+          num_replicas: 1
+        runtime_env:
+          working_dir: "https://github.com/ryanaoleary/kubernetes-engine-samples/archive/refs/heads/multihost-example.zip"
+          env_vars:
+            MODEL_ID: "$MODEL_ID"
+            MAX_MODEL_LEN: "$MAX_MODEL_LEN"
+            TPU_CHIPS: "32"
+  rayClusterConfig:
+    headGroupSpec:
+      rayStartParams: {}
+      template:
+        metadata:
+          annotations:
+            gke-gcsfuse/volumes: "true"
+            gke-gcsfuse/cpu-limit: "0"
+            gke-gcsfuse/memory-limit: "0"
+            gke-gcsfuse/ephemeral-storage-limit: "0"
+        spec:
+          serviceAccountName: $KSA_NAME
+          containers:
+          - name: ray-head
+            image: $VLLM_IMAGE
+            imagePullPolicy: IfNotPresent
+            ports:
+            - containerPort: 6379
+              name: gcs
+            - containerPort: 8265
+              name: dashboard
+            - containerPort: 10001
+              name: client
+            - containerPort: 8000
+              name: serve
+            env:
+            - name: HUGGING_FACE_HUB_TOKEN
+              valueFrom:
+                secretKeyRef:
+                  name: hf-secret
+                  key: hf_api_token
+            - name: VLLM_XLA_CACHE_PATH
+              value: "/data"
+            resources:
+              limits:
+                cpu: "8"
+                ephemeral-storage: 5Gi
+                memory: 40G
+              requests:
+                cpu: "8"
+                ephemeral-storage: 5Gi
+                memory: 40G
+            volumeMounts:
+            - name: gcs-fuse-csi-ephemeral
+              mountPath: /data
+            - name: dshm
+              mountPath: /dev/shm
+          volumes:
+          - name: gke-gcsfuse-cache
+            emptyDir:
+              medium: Memory
+          - name: dshm
+            emptyDir:
+              medium: Memory
+          - name: gcs-fuse-csi-ephemeral
+            csi:
+              driver: gcsfuse.csi.storage.gke.io
+              volumeAttributes:
+                bucketName: $GSBUCKET
+                mountOptions: "implicit-dirs,file-cache:enable-parallel-downloads:true,file-cache:parallel-downloads-per-file:100,file-cache:max-parallel-downloads:-1,file-cache:download-chunk-size-mb:10,file-cache:max-size-mb:-1"
+    workerGroupSpecs:
+    - groupName: tpu-group
+      replicas: 1
+      minReplicas: 1
+      maxReplicas: 1
+      numOfHosts: 8
+      rayStartParams: {}
+      template:
+        metadata:
+          annotations:
+            gke-gcsfuse/volumes: "true"
+            gke-gcsfuse/cpu-limit: "0"
+            gke-gcsfuse/memory-limit: "0"
+            gke-gcsfuse/ephemeral-storage-limit: "0"
+        spec:
+          serviceAccountName: $KSA_NAME
+          containers:
+          - name: ray-worker
+            image: $VLLM_IMAGE
+            imagePullPolicy: IfNotPresent
+            resources:
+              limits:
+                cpu: "100"
+                google.com/tpu: "4"
+                ephemeral-storage: 40G
+                memory: 200G
+              requests:
+                cpu: "100"
+                google.com/tpu: "4"
+                ephemeral-storage: 40G
+                memory: 200G
+            env:
+            - name: JAX_PLATFORMS
+              value: "tpu"
+            - name: HUGGING_FACE_HUB_TOKEN
+              valueFrom:
+                secretKeyRef:
+                  name: hf-secret
+                  key: hf_api_token
+            - name: VLLM_XLA_CACHE_PATH
+              value: "/data"
+            volumeMounts:
+            - name: gcs-fuse-csi-ephemeral
+              mountPath: /data
+            - name: dshm
+              mountPath: /dev/shm
+          volumes:
+          - name: gke-gcsfuse-cache
+            emptyDir:
+              medium: Memory
+          - name: dshm
+            emptyDir:
+              medium: Memory
+          - name: gcs-fuse-csi-ephemeral
+            csi:
+              driver: gcsfuse.csi.storage.gke.io
+              volumeAttributes:
+                bucketName: $GSBUCKET
+                mountOptions: "implicit-dirs,file-cache:enable-parallel-downloads:true,file-cache:parallel-downloads-per-file:100,file-cache:max-parallel-downloads:-1,file-cache:download-chunk-size-mb:10,file-cache:max-size-mb:-1"
+          nodeSelector:
+            cloud.google.com/gke-tpu-accelerator: tpu-v6e-slice
+            cloud.google.com/gke-tpu-topology: 4x8
+# [END gke_ai_ml_gke_ray_rayserve_tpu_rayservice_v6e_multihost]
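
The RayServices above route requests to the FastAPI app defined in serve_tpu.py, which exposes POST /v1/generate on the Serve port (8000). The following is a minimal client sketch, not part of the samples: it assumes the Serve port has been forwarded locally (for example with `kubectl port-forward svc/vllm-tpu-serve-svc 8000:8000`; the exact Service name depends on how KubeRay names the RayService's serve Service) and that the third-party `requests` package is installed.

# query_vllm_tpu.py -- minimal client sketch for the serve_tpu.py Serve app.
# The host, port, and Service name above are assumptions for illustration.
import json

import requests


def generate(prompt: str, host: str = "http://localhost:8000") -> dict:
    """POST a prompt to the /v1/generate route and return the decoded reply.

    serve_tpu.py responds with a JSON body of the form
    {"prompt": ..., "text": ..., "token_ids": [...]}.
    """
    resp = requests.post(f"{host}/v1/generate", json={"prompt": prompt}, timeout=600)
    resp.raise_for_status()
    return json.loads(resp.text)


if __name__ == "__main__":
    result = generate("What are the top 5 most popular programming languages?")
    print(result["text"])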