Map resource profiles to images & add resource profile doc (#154)
Define imageNames for given model servers and reference those imageNames in
resourceProfiles.

Switch the reference resource profiles to lowercase names to support the
possibility of introducing a ResourceProfile CRD in the future (which would
require a lowercase .metadata.name).

Fixes #152 via a different technique.
nstogner authored Sep 1, 2024
1 parent f453476 commit 0b2b2c8
Showing 14 changed files with 187 additions and 58 deletions.
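In short, per-engine images move from one-off fields to a named `images` map, and resource profiles select an entry from that map via `imageName`. Condensed from the `charts/kubeai/values.yaml` diff below:

```yaml
# Before: ad-hoc image fields per engine.
modelServers:
  vLLM:
    gpuImage: "vllm/vllm-openai:v0.5.5"
    cpuImage: "substratusai/vllm:v0.5.5-cpu"

# After: a named image map per engine, referenced from resourceProfiles.
modelServers:
  VLLM:
    images:
      default: "vllm/vllm-openai:v0.5.5"
      cpu: "substratusai/vllm-openai-cpu:v0.5.5"
resourceProfiles:
  cpu:
    imageName: "cpu"
```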
4 changes: 4 additions & 0 deletions api/v1/model_types.go
@@ -38,6 +38,10 @@ type ModelSpec struct {
// ResourceProfile maps to specific pre-configured resources.
ResourceProfile string `json:"resourceProfile,omitempty"`

// Image to be used for the server process.
// Will be set from the ResourceProfile if provided.
Image string `json:"image,omitempty"`

// Resources to be allocated to the server process.
// Will be set from the ResourceProfile if provided.
Resources *corev1.ResourceRequirements `json:"resources,omitempty"`
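The new `image` field also lets a Model pin its server image directly. A hypothetical example reusing the `opt-125m-cpu` catalog entry from this commit (an explicit `image` presumably takes precedence over whatever the resource profile would resolve):

```yaml
apiVersion: kubeai.org/v1
kind: Model
metadata:
  name: opt-125m-cpu
spec:
  engine: VLLM
  features: ["TextGeneration"]
  owner: facebook
  url: "hf://facebook/opt-125m"
  resourceProfile: cpu:1
  # Explicit image; otherwise it would be set from the resource profile.
  image: "substratusai/vllm-openai-cpu:v0.5.5"
```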
2 changes: 1 addition & 1 deletion charts/kubeai/Chart.yaml
@@ -7,7 +7,7 @@ type: application
# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
version: 0.2.1
version: 0.2.2

# This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application. Versions are not expected to
5 changes: 5 additions & 0 deletions charts/kubeai/charts/crds/crds/kubeai.org_models.yaml
@@ -61,6 +61,11 @@ spec:
- TextEmbedding
type: string
type: array
image:
description: |-
Image to be used for the server process.
Will be set from the ResourceProfile if provided.
type: string
maxReplicas:
format: int32
type: integer
17 changes: 9 additions & 8 deletions charts/kubeai/charts/models/values.yaml
@@ -9,7 +9,8 @@ catalog:
features: ["TextEmbedding"]
owner: intfloat
url: "hf://intfloat/e5-mistral-7b-instruct"
resourceProfile: CPU:1
engine: VLLM
resourceProfile: cpu:1
args:
- --gpu-memory-utilization=0.9
# Gemma #
@@ -19,15 +20,15 @@ catalog:
owner: google
url: "ollama://gemma2:2b"
engine: OLlama
resourceProfile: CPU:2
resourceProfile: cpu:2
# Llama #
llama-3.1-8b-instruct-cpu:
enabled: false
features: ["TextGeneration"]
owner: "meta-llama"
url: "hf://meta-llama/Meta-Llama-3.1-8B-Instruct"
engine: VLLM
resourceProfile: CPU:6
resourceProfile: cpu:6
env:
VLLM_CPU_KVCACHE_SPACE: "4"
args:
@@ -39,7 +40,7 @@
owner: "neuralmagic"
url: "hf://neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"
engine: VLLM
resourceProfile: L4:1
resourceProfile: nvidia-gpu-l4:1
args:
- --max-model-len=16384
- --max-num-batched-tokens=16384
@@ -51,28 +52,28 @@
owner: nomic
url: "ollama://nomic-embed-text"
engine: OLlama
resourceProfile: CPU:1
resourceProfile: cpu:1
# Opt #
opt-125m-cpu:
enabled: false
features: ["TextGeneration"]
owner: facebook
url: "hf://facebook/opt-125m"
engine: VLLM
resourceProfile: CPU:1
resourceProfile: cpu:1
opt-125m-l4:
enabled: false
features: ["TextGeneration"]
owner: facebook
url: "hf://facebook/opt-125m"
engine: VLLM
resourceProfile: L4:1
resourceProfile: nvidia-gpu-l4:1
# Qwen #
qwen2-500m-cpu:
enabled: false
features: ["TextGeneration"]
owner: alibaba
url: "ollama://qwen2:0.5b"
engine: OLlama
resourceProfile: CPU:1
resourceProfile: cpu:1
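
These catalog entries ship with `enabled: false`; a model is switched on per install through Helm values, the same mechanism `hack/dev-gke-helm-values.yaml` uses later in this commit. A hypothetical one-liner:

```bash
helm upgrade --install kubeai ./charts/kubeai \
  --set models.catalog.qwen2-500m-cpu.enabled=true
```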

33 changes: 26 additions & 7 deletions charts/kubeai/values.yaml
@@ -12,24 +12,43 @@ secrets:
name: ""

modelServers:
vLLM:
gpuImage: "vllm/vllm-openai:v0.5.5"
cpuImage: "substratusai/vllm:v0.5.5-cpu"
ollama:
image: "ollama/ollama:latest"
VLLM:
images:
# The key is the image name (referenced from resourceProfiles) and the value is the image.
# The "default" image should always be specified.
# "default" is used when no imageName is specified or if a specific image is not found.
default: "vllm/vllm-openai:v0.5.5"
cpu: "substratusai/vllm-openai-cpu:v0.5.5"
nvidia-gpu: "vllm/vllm-openai:v0.5.5"
google-tpu: "substratusai/vllm-openai-tpu:v0.5.5"
OLlama:
images:
default: "ollama/ollama:latest"

resourceProfiles:
CPU:
cpu:
imageName: "cpu"
requests:
cpu: 1
memory: "2Gi"
L4:
nvidia-gpu-l4:
imageName: "nvidia-gpu"
limits:
nvidia.com/gpu: "1"
requests:
nvidia.com/gpu: "1"
cpu: "6"
memory: "24Gi"
# nvidia-gpu-v100:
# nvidia-gpu-a100:
# google-tpu-v5e:
# imageName: "google-tpu"
# requests:
# google.com/tpu: 4
# limits:
# google.com/tpu: 4
# nodeSelector:
# cloud.google.com/gke-accelerator-type: tpu-v5-lite-podslice

messaging:
errorMaxBackoff: 30s
51 changes: 51 additions & 0 deletions docs/concepts/resource-profiles.md
@@ -0,0 +1,51 @@
# Resource Profiles

A resource profile maps a type of compute resource (e.g., an NVIDIA L4 GPU) to a collection of Kubernetes settings that are applied to inference server Pods. These profiles are defined in the KubeAI `config.yaml` file (via a ConfigMap). Each model specifies the resource profile that it requires.

Kubernetes Model resources specify the resource profile and the count of that resource that they require:

```yaml
# model.yaml
apiVersion: kubeai.org/v1
kind: Model
metadata:
name: llama-3.1-8b-instruct-fp8-l4
spec:
engine: VLLM
resourceProfile: nvidia-gpu-l4:1 # Specified as <profile>:<count>
# ...
```
A given profile might need slightly different settings depending on the cluster/cloud that KubeAI is deployed on.

Example: A resource profile named `nvidia-gpu-l4` might contain the following settings on a GKE Kubernetes cluster:

```yaml
# KubeAI config.yaml
resourceProfiles:
nvidia-gpu-l4:
limits:
# Typical across most Kubernetes clusters:
nvidia.com/gpu: "1"
requests:
nvidia.com/gpu: "1"
nodeSelector:
# Specific to GKE:
cloud.google.com/gke-accelerator: "nvidia-l4"
cloud.google.com/gke-spot: "true"
imageName: "nvidia-gpu"
```
In addition to node selectors and resource requirements, a resource profile may optionally specify an image name. This name maps to the container image that is selected when serving a model on that resource. If a profile sets no `imageName`, or the named entry is not defined for a given engine, the engine's `default` image is used:
```yaml
# KubeAI config.yaml
modelServers:
VLLM:
images:
default: "vllm/vllm-openai:v0.5.5"
nvidia-gpu: "vllm/vllm-openai:v0.5.5" # <--
cpu: "vllm/vllm-openai-cpu:v0.5.5"
OLlama:
images:
# ...
```
26 changes: 19 additions & 7 deletions docs/development.md
@@ -1,43 +1,55 @@
# Development
This document provides instructions for setting up a development environment for KubeAI.

## Cloud Setup
## Optional: Cloud Setup

### GCP PubSub

If you are developing the PubSub messaging integration on GCP, set up test topics and subscriptions and uncomment `.messaging.streams` in `./hack/dev-config.yaml`.

```bash
gcloud auth login --update-adc

gcloud pubsub topics create test-kubeai-requests
gcloud pubsub subscriptions create test-kubeai-requests-sub --topic test-kubeai-requests
gcloud pubsub topics create test-kubeai-responses
gcloud pubsub subscriptions create test-kubeai-responses-sub --topic test-kubeai-responses
```

## Local Cluster
## Run in Local Cluster

```bash
kind create cluster
# OR
#./hack/create-dev-gke-cluster.yaml

# Generate CRDs from Go code.
make generate && make manifests

# When CRDs are changed reapply using kubectl:
kubectl apply -f ./charts/kubeai/charts/crds/crds

# Model with special address annotations:
kubectl apply -f ./hack/dev-model.yaml

# For developing in-cluster features:
# OPTION A #
# Run KubeAI inside cluster
# Change `-f` based on the cluster environment.
helm upgrade --install kubeai ./charts/kubeai \
--set openwebui.enabled=true \
--set image.tag=latest \
--set image.pullPolicy=Always \
--set image.repository=us-central1-docker.pkg.dev/substratus-dev/default/kubeai \
--set replicaCount=1 # 0 if running out-of-cluster (using "go run")

# -f ./helm-values.yaml \
--set secrets.huggingface.token=$HUGGING_FACE_HUB_TOKEN \
--set replicaCount=1 -f ./hack/dev-gke-helm-values.yaml

# Run in development mode.
# OPTION B #
# For quick local iteration (run KubeAI outside of the cluster)
CONFIG_PATH=./hack/dev-config.yaml POD_NAMESPACE=default go run ./cmd/main.go --allow-pod-address-override

# In another terminal:
while true; do kubectl port-forward service/dev-model 7000:7000; done
############
```

## Running
2 changes: 1 addition & 1 deletion docs/installation/gke.md
@@ -45,7 +45,7 @@ models:
enabled: true
resourceProfiles:
L4:
nvidia-gpu-l4:
nodeSelector:
cloud.google.com/gke-accelerator: "nvidia-l4"
cloud.google.com/gke-spot: "true"
2 changes: 1 addition & 1 deletion docs/model-management.md
@@ -20,7 +20,7 @@ spec:
- --gpu-memory-utilization=0.9
minReplicas: 0
maxReplicas: 3
resourceProfile: L4:1
resourceProfile: nvidia-gpu-l4:1
```
### Listing Models
21 changes: 12 additions & 9 deletions hack/dev-config.yaml
@@ -2,22 +2,25 @@ secretNames:
huggingface: huggingface
modelServers:
vLLM:
gpuImage: "vllm/vllm-openai:latest"
cpuImage: "us-central1-docker.pkg.dev/substratus-dev/default/vllm-cpu:v0.5.4-118-gfc93e561"
images:
default: "vllm/vllm-openai:latest"
cpu: "us-central1-docker.pkg.dev/substratus-dev/default/vllm-cpu:v0.5.4-118-gfc93e561"
ollama:
image: "ollama/ollama:latest"
images:
default: "ollama/ollama:latest"
cpu: "ollama/ollama:0.3.8"
messaging:
errorMaxBackoff: 30s
streams:
- requestsURL: gcppubsub://projects/substratus-dev/subscriptions/test-kubeai-requests-sub
responsesURL: gcppubsub://projects/substratus-dev/topics/test-kubeai-responses
maxHandlers: 1
streams: []
#- requestsURL: gcppubsub://projects/substratus-dev/subscriptions/test-kubeai-requests-sub
# responsesURL: gcppubsub://projects/substratus-dev/topics/test-kubeai-responses
# maxHandlers: 1
resourceProfiles:
CPU:
cpu:
requests:
cpu: 1
memory: 2Gi
L4:
nvidia-gpu-l4:
limits:
nvidia.com/gpu: "1"
requests:
10 changes: 10 additions & 0 deletions hack/dev-gke-helm-values.yaml
@@ -0,0 +1,10 @@
models:
catalog:
llama-3.1-8b-instruct-fp8-l4:
enabled: true

resourceProfiles:
nvidia-gpu-l4:
nodeSelector:
cloud.google.com/gke-accelerator: "nvidia-l4"
cloud.google.com/gke-spot: "true"
2 changes: 1 addition & 1 deletion hack/dev-model.yaml
@@ -12,7 +12,7 @@ spec:
owner: alibaba
url: "ollama://qwen2:0.5b"
engine: OLlama
resourceProfile: CPU:1
resourceProfile: cpu:1
minReplicas: 1
maxReplicas: 3
---
14 changes: 7 additions & 7 deletions internal/config/system.go
@@ -53,6 +53,7 @@ func (d *Duration) UnmarshalJSON(b []byte) error {
}

type ResourceProfile struct {
ImageName string `json:"imageName"`
Requests corev1.ResourceList `json:"requests,omitempty"`
Limits corev1.ResourceList `json:"limits,omitempty"`
NodeSelector map[string]string `json:"nodeSelector,omitempty"`
@@ -65,11 +66,10 @@ type MessageStream struct {
}

type ModelServers struct {
Ollama struct {
Image string `json:"image"`
} `json:"ollama"`
VLLM struct {
CPUImage string `json:"cpuImage"`
GPUImage string `json:"gpuImage"`
}
OLlama ModelServer `json:"OLlama"`
VLLM ModelServer `json:"VLLM"`
}

type ModelServer struct {
Images map[string]string `json:"images"`
}
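
The diff stops at the config types, but given the comment in `values.yaml` ("default" is used when no imageName is specified or a specific image is not found), the image lookup presumably behaves like this hypothetical helper:

```go
package config

// ModelServer mirrors the struct added in this commit.
type ModelServer struct {
	Images map[string]string `json:"images"`
}

// resolveImage picks the container image for a server Pod: prefer the
// resource profile's imageName, and fall back to the "default" entry when
// the name is empty or missing from the engine's images map.
// Hypothetical sketch; the real lookup lives elsewhere in KubeAI.
func (s ModelServer) resolveImage(imageName string) string {
	if img, ok := s.Images[imageName]; ok {
		return img
	}
	return s.Images["default"]
}
```

For example, on the VLLM server configured above, `resolveImage("cpu")` would return `substratusai/vllm-openai-cpu:v0.5.5`, while `resolveImage("")` would fall through to the `default` image.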
