Map resource profiles to images & add resource profile doc (#154)
Define imageNames for given model servers and reference those imageNames in
resourceProfiles.

Switch the reference resource profiles to lowercase names to support the
possibility of introducing a ResourceProfile CRD in the future (which would
require a lowercase .metadata.name).

Fixes #152 via a different technique.
nstogner authored Sep 1, 2024
1 parent f453476 commit 0b2b2c8
Showing 14 changed files with 187 additions and 58 deletions.
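In short, per-engine images move from one-off fields to a named `images` map, and resource profiles select an entry from that map via `imageName`. Condensed from the `charts/kubeai/values.yaml` diff below:

```yaml
# Before: ad-hoc image fields per engine.
modelServers:
  vLLM:
    gpuImage: "vllm/vllm-openai:v0.5.5"
    cpuImage: "substratusai/vllm:v0.5.5-cpu"

# After: a named image map per engine, referenced from resourceProfiles.
modelServers:
  VLLM:
    images:
      default: "vllm/vllm-openai:v0.5.5"
      cpu: "substratusai/vllm-openai-cpu:v0.5.5"
resourceProfiles:
  cpu:
    imageName: "cpu"
```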
4 changes: 4 additions & 0 deletions api/v1/model_types.go
@@ -38,6 +38,10 @@ type ModelSpec struct {
// ResourceProfile maps to specific pre-configured resources.
ResourceProfile string `json:"resourceProfile,omitempty"`

// Image to be used for the server process.
// Will be set from the ResourceProfile if provided.
Image string `json:"image,omitempty"`

// Resources to be allocated to the server process.
// Will be set from the ResourceProfile if provided.
Resources *corev1.ResourceRequirements `json:"resources,omitempty"`
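The new `image` field also lets a Model pin its server image directly. A hypothetical example reusing the `opt-125m-cpu` catalog entry from this commit (an explicit `image` presumably takes precedence over whatever the resource profile would resolve):

```yaml
apiVersion: kubeai.org/v1
kind: Model
metadata:
  name: opt-125m-cpu
spec:
  engine: VLLM
  features: ["TextGeneration"]
  owner: facebook
  url: "hf://facebook/opt-125m"
  resourceProfile: cpu:1
  # Explicit image; otherwise it would be set from the resource profile.
  image: "substratusai/vllm-openai-cpu:v0.5.5"
```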
2 changes: 1 addition & 1 deletion charts/kubeai/Chart.yaml
@@ -7,7 +7,7 @@ type: application
# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
version: 0.2.1
version: 0.2.2

# This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application. Versions are not expected to
5 changes: 5 additions & 0 deletions charts/kubeai/charts/crds/crds/kubeai.org_models.yaml
@@ -61,6 +61,11 @@ spec:
- TextEmbedding
type: string
type: array
image:
description: |-
Image to be used for the server process.
Will be set from the ResourceProfile if provided.
type: string
maxReplicas:
format: int32
type: integer
17 changes: 9 additions & 8 deletions charts/kubeai/charts/models/values.yaml
@@ -9,7 +9,8 @@ catalog:
features: ["TextEmbedding"]
owner: intfloat
url: "hf://intfloat/e5-mistral-7b-instruct"
resourceProfile: CPU:1
engine: VLLM
resourceProfile: cpu:1
args:
- --gpu-memory-utilization=0.9
# Gemma #
@@ -19,15 +20,15 @@ catalog:
owner: google
url: "ollama://gemma2:2b"
engine: OLlama
resourceProfile: CPU:2
resourceProfile: cpu:2
# Llama #
llama-3.1-8b-instruct-cpu:
enabled: false
features: ["TextGeneration"]
owner: "meta-llama"
url: "hf://meta-llama/Meta-Llama-3.1-8B-Instruct"
engine: VLLM
resourceProfile: CPU:6
resourceProfile: cpu:6
env:
VLLM_CPU_KVCACHE_SPACE: "4"
args:
@@ -39,7 +40,7 @@
owner: "neuralmagic"
url: "hf://neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"
engine: VLLM
resourceProfile: L4:1
resourceProfile: nvidia-gpu-l4:1
args:
- --max-model-len=16384
- --max-num-batched-tokens=16384
@@ -51,28 +52,28 @@
owner: nomic
url: "ollama://nomic-embed-text"
engine: OLlama
resourceProfile: CPU:1
resourceProfile: cpu:1
# Opt #
opt-125m-cpu:
enabled: false
features: ["TextGeneration"]
owner: facebook
url: "hf://facebook/opt-125m"
engine: VLLM
resourceProfile: CPU:1
resourceProfile: cpu:1
opt-125m-l4:
enabled: false
features: ["TextGeneration"]
owner: facebook
url: "hf://facebook/opt-125m"
engine: VLLM
resourceProfile: L4:1
resourceProfile: nvidia-gpu-l4:1
# Qwen #
qwen2-500m-cpu:
enabled: false
features: ["TextGeneration"]
owner: alibaba
url: "ollama://qwen2:0.5b"
engine: OLlama
resourceProfile: CPU:1
resourceProfile: cpu:1
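
These catalog entries ship with `enabled: false`; a model is switched on per install through Helm values, the same mechanism `hack/dev-gke-helm-values.yaml` uses later in this commit. A hypothetical one-liner:

```bash
helm upgrade --install kubeai ./charts/kubeai \
  --set models.catalog.qwen2-500m-cpu.enabled=true
```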

33 changes: 26 additions & 7 deletions charts/kubeai/values.yaml
@@ -12,24 +12,43 @@ secrets:
name: ""

modelServers:
vLLM:
gpuImage: "vllm/vllm-openai:v0.5.5"
cpuImage: "substratusai/vllm:v0.5.5-cpu"
ollama:
image: "ollama/ollama:latest"
VLLM:
images:
# The key is the image name (referenced from resourceProfiles) and the value is the image.
# The "default" image should always be specified.
# "default" is used when no imageName is specified or if a specific image is not found.
default: "vllm/vllm-openai:v0.5.5"
cpu: "substratusai/vllm-openai-cpu:v0.5.5"
nvidia-gpu: "vllm/vllm-openai:v0.5.5"
google-tpu: "substratusai/vllm-openai-tpu:v0.5.5"
OLlama:
images:
default: "ollama/ollama:latest"

resourceProfiles:
CPU:
cpu:
imageName: "cpu"
requests:
cpu: 1
memory: "2Gi"
L4:
nvidia-gpu-l4:
imageName: "nvidia-gpu"
limits:
nvidia.com/gpu: "1"
requests:
nvidia.com/gpu: "1"
cpu: "6"
memory: "24Gi"
# nvidia-gpu-v100:
# nvidia-gpu-a100:
# google-tpu-v5e:
# imageName: "google-tpu"
# requests:
# google.com/tpu: 4
# limits:
# google.com/tpu: 4
# nodeSelector:
# cloud.google.com/gke-accelerator-type: tpu-v5-lite-podslice

messaging:
errorMaxBackoff: 30s
51 changes: 51 additions & 0 deletions docs/concepts/resource-profiles.md
@@ -0,0 +1,51 @@
# Resource Profiles

A resource profile maps a type of compute resource (e.g., an NVIDIA L4 GPU) to a collection of Kubernetes settings that are applied to inference server Pods. These profiles are defined in the KubeAI `config.yaml` file (via a ConfigMap). Each model specifies the resource profile that it requires.

Kubernetes Model resources specify the resource profile and the count of that resource that they require:

```yaml
# model.yaml
apiVersion: kubeai.org/v1
kind: Model
metadata:
name: llama-3.1-8b-instruct-fp8-l4
spec:
engine: VLLM
resourceProfile: nvidia-gpu-l4:1 # Specified as <profile>:<count>
# ...
```
A given profile might need slightly different settings depending on the cluster/cloud that KubeAI is deployed on.

Example: A resource profile named `nvidia-gpu-l4` might contain the following settings on a GKE Kubernetes cluster:

```yaml
# KubeAI config.yaml
resourceProfiles:
nvidia-gpu-l4:
limits:
# Typical across most Kubernetes clusters:
nvidia.com/gpu: "1"
requests:
nvidia.com/gpu: "1"
nodeSelector:
# Specific to GKE:
cloud.google.com/gke-accelerator: "nvidia-l4"
cloud.google.com/gke-spot: "true"
imageName: "nvidia-gpu"
```
In addition to node selectors and resource requirements, a resource profile may optionally specify an image name. This name maps to the container image that is selected when serving a model on that resource. If a profile sets no `imageName`, or the named entry is not defined for a given engine, the engine's `default` image is used:
```yaml
# KubeAI config.yaml
modelServers:
VLLM:
images:
default: "vllm/vllm-openai:v0.5.5"
nvidia-gpu: "vllm/vllm-openai:v0.5.5" # <--
cpu: "vllm/vllm-openai-cpu:v0.5.5"
OLlama:
images:
# ...
```
26 changes: 19 additions & 7 deletions docs/development.md
@@ -1,43 +1,55 @@
# Development
This document provides instructions for setting up a development environment for KubeAI.

## Cloud Setup
## Optional: Cloud Setup

### GCP PubSub

If you are developing the PubSub messaging integration on GCP, set up test topics and subscriptions and uncomment `.messaging.streams` in `./hack/dev-config.yaml`.

```bash
gcloud auth login --update-adc

gcloud pubsub topics create test-kubeai-requests
gcloud pubsub subscriptions create test-kubeai-requests-sub --topic test-kubeai-requests
gcloud pubsub topics create test-kubeai-responses
gcloud pubsub subscriptions create test-kubeai-responses-sub --topic test-kubeai-responses
```

## Local Cluster
## Run in Local Cluster

```bash
kind create cluster
# OR
#./hack/create-dev-gke-cluster.yaml

# Generate CRDs from Go code.
make generate && make manifests

# When CRDs are changed reapply using kubectl:
kubectl apply -f ./charts/kubeai/charts/crds/crds

# Model with special address annotations:
kubectl apply -f ./hack/dev-model.yaml

# For developing in-cluster features:
# OPTION A #
# Run KubeAI inside cluster
# Change `-f` based on the cluster environment.
helm upgrade --install kubeai ./charts/kubeai \
--set openwebui.enabled=true \
--set image.tag=latest \
--set image.pullPolicy=Always \
--set image.repository=us-central1-docker.pkg.dev/substratus-dev/default/kubeai \
--set replicaCount=1 # 0 if running out-of-cluster (using "go run")

# -f ./helm-values.yaml \
--set secrets.huggingface.token=$HUGGING_FACE_HUB_TOKEN \
--set replicaCount=1 -f ./hack/dev-gke-helm-values.yaml

# Run in development mode.
# OPTION B #
# For quick local iteration (run KubeAI outside of the cluster)
CONFIG_PATH=./hack/dev-config.yaml POD_NAMESPACE=default go run ./cmd/main.go --allow-pod-address-override

# In another terminal:
while true; do kubectl port-forward service/dev-model 7000:7000; done
############
```

## Running
2 changes: 1 addition & 1 deletion docs/installation/gke.md
@@ -45,7 +45,7 @@ models:
enabled: true
resourceProfiles:
L4:
nvidia-gpu-l4:
nodeSelector:
cloud.google.com/gke-accelerator: "nvidia-l4"
cloud.google.com/gke-spot: "true"
2 changes: 1 addition & 1 deletion docs/model-management.md
@@ -20,7 +20,7 @@ spec:
- --gpu-memory-utilization=0.9
minReplicas: 0
maxReplicas: 3
resourceProfile: L4:1
resourceProfile: nvidia-gpu-l4:1
```
### Listing Models
21 changes: 12 additions & 9 deletions hack/dev-config.yaml
@@ -2,22 +2,25 @@ secretNames:
huggingface: huggingface
modelServers:
vLLM:
gpuImage: "vllm/vllm-openai:latest"
cpuImage: "us-central1-docker.pkg.dev/substratus-dev/default/vllm-cpu:v0.5.4-118-gfc93e561"
images:
default: "vllm/vllm-openai:latest"
cpu: "us-central1-docker.pkg.dev/substratus-dev/default/vllm-cpu:v0.5.4-118-gfc93e561"
ollama:
image: "ollama/ollama:latest"
images:
default: "ollama/ollama:latest"
cpu: "ollama/ollama:0.3.8"
messaging:
errorMaxBackoff: 30s
streams:
- requestsURL: gcppubsub://projects/substratus-dev/subscriptions/test-kubeai-requests-sub
responsesURL: gcppubsub://projects/substratus-dev/topics/test-kubeai-responses
maxHandlers: 1
streams: []
#- requestsURL: gcppubsub://projects/substratus-dev/subscriptions/test-kubeai-requests-sub
# responsesURL: gcppubsub://projects/substratus-dev/topics/test-kubeai-responses
# maxHandlers: 1
resourceProfiles:
CPU:
cpu:
requests:
cpu: 1
memory: 2Gi
L4:
nvidia-gpu-l4:
limits:
nvidia.com/gpu: "1"
requests:
10 changes: 10 additions & 0 deletions hack/dev-gke-helm-values.yaml
@@ -0,0 +1,10 @@
models:
catalog:
llama-3.1-8b-instruct-fp8-l4:
enabled: true

resourceProfiles:
nvidia-gpu-l4:
nodeSelector:
cloud.google.com/gke-accelerator: "nvidia-l4"
cloud.google.com/gke-spot: "true"
2 changes: 1 addition & 1 deletion hack/dev-model.yaml
@@ -12,7 +12,7 @@ spec:
owner: alibaba
url: "ollama://qwen2:0.5b"
engine: OLlama
resourceProfile: CPU:1
resourceProfile: cpu:1
minReplicas: 1
maxReplicas: 3
---
14 changes: 7 additions & 7 deletions internal/config/system.go
@@ -53,6 +53,7 @@ func (d *Duration) UnmarshalJSON(b []byte) error {
}

type ResourceProfile struct {
ImageName string `json:"imageName"`
Requests corev1.ResourceList `json:"requests,omitempty"`
Limits corev1.ResourceList `json:"limits,omitempty"`
NodeSelector map[string]string `json:"nodeSelector,omitempty"`
@@ -65,11 +66,10 @@ type MessageStream struct {
}

type ModelServers struct {
Ollama struct {
Image string `json:"image"`
} `json:"ollama"`
VLLM struct {
CPUImage string `json:"cpuImage"`
GPUImage string `json:"gpuImage"`
}
OLlama ModelServer `json:"OLlama"`
VLLM ModelServer `json:"VLLM"`
}

type ModelServer struct {
Images map[string]string `json:"images"`
}
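
The diff stops at the config types, but given the comment in `values.yaml` ("default" is used when no imageName is specified or a specific image is not found), the image lookup presumably behaves like this hypothetical helper:

```go
package config

// ModelServer mirrors the struct added in this commit.
type ModelServer struct {
	Images map[string]string `json:"images"`
}

// resolveImage picks the container image for a server Pod: prefer the
// resource profile's imageName, and fall back to the "default" entry when
// the name is empty or missing from the engine's images map.
// Hypothetical sketch; the real lookup lives elsewhere in KubeAI.
func (s ModelServer) resolveImage(imageName string) string {
	if img, ok := s.Images[imageName]; ok {
		return img
	}
	return s.Images["default"]
}
```

For example, on the VLLM server configured above, `resolveImage("cpu")` would return `substratusai/vllm-openai-cpu:v0.5.5`, while `resolveImage("")` would fall through to the `default` image.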
