Skip to content

Commit

Permalink
Add nomic embedding model and add model feature filtering (#130)
Browse files Browse the repository at this point in the history
Add Nomic Embedding model:

> nomic-embed-text is a large context length text encoder that surpasses
OpenAI text-embedding-ada-002 and text-embedding-3-small performance on
short and long context tasks.

 See: https://ollama.com/library/nomic-embed-text

Also: Add query string filtering in the model listing endpoint to avoid
OpenWebUI adding embedding models in the dropdown list of chat-able
models.

```go
// Example (default):  /v1/models
// Example (single):   /v1/models?feature=TextEmbedding
// Example (multiple): /v1/models?feature=TextGeneration&feature=TextEmbedding
```
  • Loading branch information
nstogner authored Aug 26, 2024
1 parent 1b4a5d0 commit aa33110
Show file tree
Hide file tree
Showing 12 changed files with 143 additions and 23 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,8 @@ models:
minReplicas: 1
qwen2-500m-cpu:
enabled: true
nomic-embed-text-cpu:
enabled: true
EOF

helm upgrade --install kubeai ./charts/kubeai \
Expand Down
2 changes: 2 additions & 0 deletions api/v1/constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@ package v1
const (
PodModelLabel = "model"

ModelFeatureLabelDomain = "features.kubeai.org"

// ModelPodIPAnnotation is the annotation key used to specify an IP
// to use for the model Pod instead of the IP address in the status of the Pod.
// Use in conjunction with --allow-pod-address-override for development purposes.
Expand Down
15 changes: 12 additions & 3 deletions api/v1/model_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,10 @@ import (

// ModelSpec defines the desired state of Model
type ModelSpec struct {
Owner string `json:"owner"`
URL string `json:"url"`
Features []string `json:"features,omitempty"`
Owner string `json:"owner"`
URL string `json:"url"`

Features []ModelFeature `json:"features"`

// +kubebuilder:validation:Enum=OLlama;VLLM
Engine string `json:"engine"`
Expand All @@ -52,6 +53,14 @@ type ModelSpec struct {
Env map[string]string `json:"env,omitempty"`
}

// ModelFeature describes a capability of a Model, e.g. whether it generates
// text or produces embeddings. Features are used to filter Models in the
// OpenAI-compatible /v1/models endpoint (?feature=... query parameter) and are
// mirrored onto Model labels under the "features.kubeai.org" domain by the
// controller so they can be matched with label selectors.
// +kubebuilder:validation:Enum=TextGeneration;TextEmbedding
type ModelFeature string

// Known model features.
// NOTE(review): these constants are deliberately left untyped so they can be
// used where plain strings are expected (e.g. the []string of query values in
// the models endpoint) while still assigning implicitly to ModelFeature.
const (
ModelFeatureTextGeneration = "TextGeneration"
ModelFeatureTextEmbedding = "TextEmbedding"
)

const (
OLlamaEngine = "OLlama"
VLLMEngine = "VLLM"
Expand Down
2 changes: 1 addition & 1 deletion api/v1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 4 additions & 0 deletions charts/kubeai/charts/crds/crds/kubeai.org_models.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,9 @@ spec:
type: object
features:
items:
enum:
- TextGeneration
- TextEmbedding
type: string
type: array
maxReplicas:
Expand Down Expand Up @@ -140,6 +143,7 @@ spec:
type: string
required:
- engine
- features
- maxReplicas
- minReplicas
- owner
Expand Down
1 change: 1 addition & 0 deletions charts/kubeai/charts/models/templates/models.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ kind: Model
metadata:
name: {{ $name }}
spec:
features: {{ $model.features }}
owner: {{ $model.owner }}
url: {{ $model.url }}
{{- with $model.engine }}
Expand Down
19 changes: 18 additions & 1 deletion charts/kubeai/charts/models/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,21 +5,25 @@ defaults:
catalog:
# Mistral #
e5-mistral-7b-instruct-cpu:
enabled: false
features: ["TextEmbedding"]
owner: intfloat
url: "hf://intfloat/e5-mistral-7b-instruct"
features: ["embeddings"]
resourceProfile: CPU:1
args:
- --gpu-memory-utilization=0.9
# Gemma #
gemma2-2b-cpu:
enabled: false
features: ["TextGeneration"]
owner: google
url: "ollama://gemma2:2b"
engine: OLlama
resourceProfile: CPU:2
# Llama #
llama-3.1-8b-instruct-cpu:
enabled: false
features: ["TextGeneration"]
owner: "meta-llama"
url: "hf://meta-llama/Meta-Llama-3.1-8B-Instruct"
engine: VLLM
Expand All @@ -30,6 +34,8 @@ catalog:
- --max-model-len=32768
- --max-num-batched-token=32768
llama-3.1-8b-instruct-fp8-l4:
enabled: false
features: ["TextGeneration"]
owner: "neuralmagic"
url: "hf://neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"
engine: VLLM
Expand All @@ -38,22 +44,33 @@ catalog:
- --max-model-len=16384
- --max-num-batched-token=16384
- --gpu-memory-utilization=0.9
# Nomic #
nomic-embed-text-cpu:
enabled: false
features: ["TextEmbedding"]
owner: nomic
url: "ollama://nomic-embed-text"
engine: OLlama
resourceProfile: CPU:1
# Opt #
opt-125m-cpu:
enabled: false
features: ["TextGeneration"]
owner: facebook
url: "hf://facebook/opt-125m"
engine: VLLM
resourceProfile: CPU:1
opt-125m-l4:
enabled: false
features: ["TextGeneration"]
owner: facebook
url: "hf://facebook/opt-125m"
engine: VLLM
resourceProfile: L4:1
# Qwen #
qwen2-500m-cpu:
enabled: false
features: ["TextGeneration"]
owner: alibaba
url: "ollama://qwen2:0.5b"
engine: OLlama
Expand Down
7 changes: 5 additions & 2 deletions docs/development.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,13 @@ kubectl apply -f ./hack/dev-model.yaml

# For developing in-cluster features:
helm upgrade --install kubeai ./charts/kubeai \
--set openwebui.enabled=false \
--set openwebui.enabled=true \
--set image.tag=latest \
--set image.pullPolicy=Always \
--set replicaCount=1 # 0 if running locally
--set image.repository=us-central1-docker.pkg.dev/substratus-dev/default/kubeai \
--set replicaCount=1 # 0 if running out-of-cluster (using "go run")

# -f ./helm-values.yaml \

# Run in development mode.
CONFIG_PATH=./hack/dev-config.yaml POD_NAMESPACE=default go run ./cmd/main.go --allow-pod-address-override
Expand Down
2 changes: 1 addition & 1 deletion hack/dev-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ resourceProfiles:
CPU:
requests:
cpu: 1
memory: 2
memory: 2Gi
L4:
limits:
nvidia.com/gpu: "1"
Expand Down
1 change: 1 addition & 0 deletions hack/dev-model.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ metadata:
model-pod-ip: "127.0.0.1"
model-pod-port: "7000"
spec:
features: ["TextGeneration"]
owner: alibaba
url: "ollama://qwen2:0.5b"
engine: OLlama
Expand Down
72 changes: 64 additions & 8 deletions internal/modelcontroller/model_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -87,9 +87,14 @@ func (r *ModelReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl
return ctrl.Result{}, nil
}
if changed {
log.Info("applied resource profile")
shouldUpdate = true
}
}
// Apply self labels based on features so that we can easily filter models.
if changed := r.applySelfLabels(model); changed {
shouldUpdate = true
}
if shouldUpdate {
if err := r.Update(ctx, model); err != nil {
return ctrl.Result{}, fmt.Errorf("updating model: %w", err)
Expand Down Expand Up @@ -359,6 +364,29 @@ func (r *ModelReconciler) oLlamaPodForModel(m *kubeaiv1.Model, index int32) *cor
}

ollamaModelRef := strings.TrimPrefix(m.Spec.URL, "ollama://")

featuresMap := map[kubeaiv1.ModelFeature]struct{}{}
for _, f := range m.Spec.Features {
featuresMap[f] = struct{}{}
}

// Pull model and copy to rename it to Model.metadata.name.
// See Ollama issue for rename/copy workaround: https://github.com/ollama/ollama/issues/5914
// NOTE: The cp command should just create a pointer to the old model, not copy data
// (see https://github.com/ollama/ollama/issues/5914#issuecomment-2248168474).
// Use `ollama run` to send a single prompt to ollama to load the model into memory
// before the Pod becomes Ready. (by default it will load on the first prompt request).
startupProbeScript := fmt.Sprintf("/bin/ollama pull %s && /bin/ollama cp %s %s",
ollamaModelRef, ollamaModelRef, m.Name)
if _, ok := featuresMap[kubeaiv1.ModelFeatureTextEmbedding]; ok {
// NOTE: Embedding text models do not support "ollama pull":
//
// ollama run nomic-embed-text hey
// Error: "nomic-embed-text" does not support generate
//
startupProbeScript += fmt.Sprintf(" && /bin/ollama run %s hi", m.Name)
}

pod := &corev1.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: fmt.Sprintf("model-%s-%d", m.Name, index),
Expand Down Expand Up @@ -396,14 +424,7 @@ func (r *ModelReconciler) oLlamaPodForModel(m *kubeaiv1.Model, index int32) *cor
Exec: &corev1.ExecAction{
Command: []string{
"bash", "-c",
// Pull model and copy to rename it to Model.metadata.name.
// See Ollama issue for rename/copy workaround: https://github.com/ollama/ollama/issues/5914
// NOTE: The cp command should just create a pointer to the old model, not copy data
// (see https://github.com/ollama/ollama/issues/5914#issuecomment-2248168474).
// Use `ollama run` to send a single prompt to ollama to load the model into memory
// before the Pod becomes Ready. (by default it will load on the first prompt request).
fmt.Sprintf("/bin/ollama pull %s && /bin/ollama cp %s %s && /bin/ollama run %s hi",
ollamaModelRef, ollamaModelRef, m.Name, m.Name),
startupProbeScript,
},
},
},
Expand Down Expand Up @@ -543,6 +564,41 @@ func (r *ModelReconciler) applyResourceProfile(model *kubeaiv1.Model) (bool, err
return changed, nil
}

// applySelfLabels makes the Model's labels mirror its .spec.features: each
// feature is represented as a "features.kubeai.org/<feature>": "true" label,
// and stale feature labels (for features no longer present in the spec) are
// removed. These labels let the OpenAI-compatible models endpoint filter
// Models by feature using a label selector.
//
// It returns true if the label set was modified and the object needs an update.
func (r *ModelReconciler) applySelfLabels(model *kubeaiv1.Model) bool {
	specFeatures := make(map[kubeaiv1.ModelFeature]struct{}, len(model.Spec.Features))
	for _, f := range model.Spec.Features {
		specFeatures[f] = struct{}{}
	}

	if model.GetLabels() == nil {
		model.SetLabels(map[string]string{})
	}
	labels := model.GetLabels()

	// Match on the domain including the "/" separator so that only keys in
	// the feature label domain are touched — a bare HasPrefix on the domain
	// alone would also match (and wrongly delete) keys from any other domain
	// that merely starts with the same string.
	prefix := kubeaiv1.ModelFeatureLabelDomain + "/"

	var changed bool

	// Delete feature labels that no longer correspond to a spec feature.
	// (Deleting map entries while ranging over the map is safe in Go.)
	for key := range labels {
		if !strings.HasPrefix(key, prefix) {
			continue
		}
		feat := kubeaiv1.ModelFeature(strings.TrimPrefix(key, prefix))
		if _, ok := specFeatures[feat]; !ok {
			delete(labels, key)
			changed = true
		}
	}

	// Add labels for spec features that are missing.
	for feat := range specFeatures {
		key := prefix + string(feat)
		if _, ok := labels[key]; !ok {
			labels[key] = "true"
			changed = true
		}
	}

	return changed
}

func resourcesEqual(a, b corev1.ResourceList) bool {
if len(a) != len(b) {
return false
Expand Down
39 changes: 32 additions & 7 deletions internal/openaiserver/models.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,19 +5,44 @@ import (
"net/http"

kubeaiv1 "github.com/substratusai/kubeai/api/v1"
"sigs.k8s.io/controller-runtime/pkg/client"
)

func (h *Handler) getModels(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "application/json")

list := &kubeaiv1.ModelList{}
if err := h.K8sClient.List(r.Context(), list); err != nil {
sendErrorResponse(w, http.StatusInternalServerError, "failed to list models: %v", err)
return
// List models based on the "feature" query parameter.
// Example (default): /v1/models
// Example (single): /v1/models?feature=TextEmbedding
// Example (multiple): /v1/models?feature=TextGeneration&feature=TextEmbedding
features := r.URL.Query()["feature"]
if len(features) == 0 {
// Default to listing text generation models.
// Do this to play nicely with chat UIs like OpenWebUI.
features = []string{kubeaiv1.ModelFeatureTextGeneration}
}

var k8sModels []kubeaiv1.Model
k8sModelNames := map[string]struct{}{}
for _, feature := range features {
// NOTE(nstogner): Could not find a way to do an OR query with the client,
// so we just do multiple queries and merge the results.
labelSelector := client.MatchingLabels{kubeaiv1.ModelFeatureLabelDomain + "/" + feature: "true"}
list := &kubeaiv1.ModelList{}
if err := h.K8sClient.List(r.Context(), list, labelSelector); err != nil {
sendErrorResponse(w, http.StatusInternalServerError, "failed to list models: %v", err)
return
}
for _, model := range list.Items {
if _, ok := k8sModelNames[model.Name]; !ok {
k8sModels = append(k8sModels, model)
k8sModelNames[model.Name] = struct{}{}
}
}
}

models := make([]Model, len(list.Items))
for i, k8sModel := range list.Items {
models := make([]Model, len(k8sModels))
for i, k8sModel := range k8sModels {
model := Model{}
model.FromK8sModel(&k8sModel)
models[i] = model
Expand All @@ -39,7 +64,7 @@ type Model struct {

// Additional (non-OpenAI) fields

Features []string `json:"features,omitempty"`
Features []kubeaiv1.ModelFeature `json:"features,omitempty"`
}

func (m *Model) FromK8sModel(model *kubeaiv1.Model) {
Expand Down

0 comments on commit aa33110

Please sign in to comment.