Skip to content

Commit

Permalink
Add nomic embedding model and add model feature filtering (#130)
Browse files Browse the repository at this point in the history
Add Nomic Embedding model:

> nomic-embed-text is a large context length text encoder that surpasses
OpenAI text-embedding-ada-002 and text-embedding-3-small performance on
short and long context tasks.

 See: https://ollama.com/library/nomic-embed-text

Also: Add query string filtering in the model listing endpoint to avoid
OpenWebUI adding embedding models in the dropdown list of chat-able
models.

```go
// Example (default):  /v1/models
// Example (single):   /v1/models?feature=TextEmbedding
// Example (multiple): /v1/models?feature=TextGeneration&feature=TextEmbedding
```
  • Loading branch information
nstogner authored Aug 26, 2024
1 parent 1b4a5d0 commit aa33110
Show file tree
Hide file tree
Showing 12 changed files with 143 additions and 23 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,8 @@ models:
minReplicas: 1
qwen2-500m-cpu:
enabled: true
nomic-embed-text-cpu:
enabled: true
EOF

helm upgrade --install kubeai ./charts/kubeai \
Expand Down
2 changes: 2 additions & 0 deletions api/v1/constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@ package v1
const (
PodModelLabel = "model"

ModelFeatureLabelDomain = "features.kubeai.org"

// ModelPodIPAnnotation is the annotation key used to specify an IP
// to use for the model Pod instead of the IP address in the status of the Pod.
// Use in conjunction with --allow-pod-address-override for development purposes.
Expand Down
15 changes: 12 additions & 3 deletions api/v1/model_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,10 @@ import (

// ModelSpec defines the desired state of Model
type ModelSpec struct {
Owner string `json:"owner"`
URL string `json:"url"`
Features []string `json:"features,omitempty"`
Owner string `json:"owner"`
URL string `json:"url"`

Features []ModelFeature `json:"features"`

// +kubebuilder:validation:Enum=OLlama;VLLM
Engine string `json:"engine"`
Expand All @@ -52,6 +53,14 @@ type ModelSpec struct {
Env map[string]string `json:"env,omitempty"`
}

// ModelFeature describes a capability of a Model, e.g. whether it generates
// text or produces embeddings. Features are used to filter Models in the
// OpenAI-compatible /v1/models endpoint (?feature=... query parameter) and are
// mirrored onto Model labels under the "features.kubeai.org" domain by the
// controller so they can be matched with label selectors.
// +kubebuilder:validation:Enum=TextGeneration;TextEmbedding
type ModelFeature string

// Known model features.
// NOTE(review): these constants are deliberately left untyped so they can be
// used where plain strings are expected (e.g. the []string of query values in
// the models endpoint) while still assigning implicitly to ModelFeature.
const (
ModelFeatureTextGeneration = "TextGeneration"
ModelFeatureTextEmbedding = "TextEmbedding"
)

const (
OLlamaEngine = "OLlama"
VLLMEngine = "VLLM"
Expand Down
2 changes: 1 addition & 1 deletion api/v1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 4 additions & 0 deletions charts/kubeai/charts/crds/crds/kubeai.org_models.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,9 @@ spec:
type: object
features:
items:
enum:
- TextGeneration
- TextEmbedding
type: string
type: array
maxReplicas:
Expand Down Expand Up @@ -140,6 +143,7 @@ spec:
type: string
required:
- engine
- features
- maxReplicas
- minReplicas
- owner
Expand Down
1 change: 1 addition & 0 deletions charts/kubeai/charts/models/templates/models.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ kind: Model
metadata:
name: {{ $name }}
spec:
features: {{ $model.features }}
owner: {{ $model.owner }}
url: {{ $model.url }}
{{- with $model.engine }}
Expand Down
19 changes: 18 additions & 1 deletion charts/kubeai/charts/models/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,21 +5,25 @@ defaults:
catalog:
# Mistral #
e5-mistral-7b-instruct-cpu:
enabled: false
features: ["TextEmbedding"]
owner: intfloat
url: "hf://intfloat/e5-mistral-7b-instruct"
features: ["embeddings"]
resourceProfile: CPU:1
args:
- --gpu-memory-utilization=0.9
# Gemma #
gemma2-2b-cpu:
enabled: false
features: ["TextGeneration"]
owner: google
url: "ollama://gemma2:2b"
engine: OLlama
resourceProfile: CPU:2
# Llama #
llama-3.1-8b-instruct-cpu:
enabled: false
features: ["TextGeneration"]
owner: "meta-llama"
url: "hf://meta-llama/Meta-Llama-3.1-8B-Instruct"
engine: VLLM
Expand All @@ -30,6 +34,8 @@ catalog:
- --max-model-len=32768
- --max-num-batched-token=32768
llama-3.1-8b-instruct-fp8-l4:
enabled: false
features: ["TextGeneration"]
owner: "neuralmagic"
url: "hf://neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"
engine: VLLM
Expand All @@ -38,22 +44,33 @@ catalog:
- --max-model-len=16384
- --max-num-batched-token=16384
- --gpu-memory-utilization=0.9
# Nomic #
nomic-embed-text-cpu:
enabled: false
features: ["TextEmbedding"]
owner: nomic
url: "ollama://nomic-embed-text"
engine: OLlama
resourceProfile: CPU:1
# Opt #
opt-125m-cpu:
enabled: false
features: ["TextGeneration"]
owner: facebook
url: "hf://facebook/opt-125m"
engine: VLLM
resourceProfile: CPU:1
opt-125m-l4:
enabled: false
features: ["TextGeneration"]
owner: facebook
url: "hf://facebook/opt-125m"
engine: VLLM
resourceProfile: L4:1
# Qwen #
qwen2-500m-cpu:
enabled: false
features: ["TextGeneration"]
owner: alibaba
url: "ollama://qwen2:0.5b"
engine: OLlama
Expand Down
7 changes: 5 additions & 2 deletions docs/development.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,13 @@ kubectl apply -f ./hack/dev-model.yaml

# For developing in-cluster features:
helm upgrade --install kubeai ./charts/kubeai \
--set openwebui.enabled=false \
--set openwebui.enabled=true \
--set image.tag=latest \
--set image.pullPolicy=Always \
--set replicaCount=1 # 0 if running locally
--set image.repository=us-central1-docker.pkg.dev/substratus-dev/default/kubeai \
--set replicaCount=1 # 0 if running out-of-cluster (using "go run")

# -f ./helm-values.yaml \

# Run in development mode.
CONFIG_PATH=./hack/dev-config.yaml POD_NAMESPACE=default go run ./cmd/main.go --allow-pod-address-override
Expand Down
2 changes: 1 addition & 1 deletion hack/dev-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ resourceProfiles:
CPU:
requests:
cpu: 1
memory: 2
memory: 2Gi
L4:
limits:
nvidia.com/gpu: "1"
Expand Down
1 change: 1 addition & 0 deletions hack/dev-model.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ metadata:
model-pod-ip: "127.0.0.1"
model-pod-port: "7000"
spec:
features: ["TextGeneration"]
owner: alibaba
url: "ollama://qwen2:0.5b"
engine: OLlama
Expand Down
72 changes: 64 additions & 8 deletions internal/modelcontroller/model_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -87,9 +87,14 @@ func (r *ModelReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl
return ctrl.Result{}, nil
}
if changed {
log.Info("applied resource profile")
shouldUpdate = true
}
}
// Apply self labels based on features so that we can easily filter models.
if changed := r.applySelfLabels(model); changed {
shouldUpdate = true
}
if shouldUpdate {
if err := r.Update(ctx, model); err != nil {
return ctrl.Result{}, fmt.Errorf("updating model: %w", err)
Expand Down Expand Up @@ -359,6 +364,29 @@ func (r *ModelReconciler) oLlamaPodForModel(m *kubeaiv1.Model, index int32) *cor
}

ollamaModelRef := strings.TrimPrefix(m.Spec.URL, "ollama://")

featuresMap := map[kubeaiv1.ModelFeature]struct{}{}
for _, f := range m.Spec.Features {
featuresMap[f] = struct{}{}
}

// Pull model and copy to rename it to Model.metadata.name.
// See Ollama issue for rename/copy workaround: https://github.com/ollama/ollama/issues/5914
// NOTE: The cp command should just create a pointer to the old model, not copy data
// (see https://github.com/ollama/ollama/issues/5914#issuecomment-2248168474).
// Use `ollama run` to send a single prompt to ollama to load the model into memory
// before the Pod becomes Ready. (by default it will load on the first prompt request).
startupProbeScript := fmt.Sprintf("/bin/ollama pull %s && /bin/ollama cp %s %s",
ollamaModelRef, ollamaModelRef, m.Name)
if _, ok := featuresMap[kubeaiv1.ModelFeatureTextEmbedding]; ok {
// NOTE: Embedding text models do not support "ollama pull":
//
// ollama run nomic-embed-text hey
// Error: "nomic-embed-text" does not support generate
//
startupProbeScript += fmt.Sprintf(" && /bin/ollama run %s hi", m.Name)
}

pod := &corev1.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: fmt.Sprintf("model-%s-%d", m.Name, index),
Expand Down Expand Up @@ -396,14 +424,7 @@ func (r *ModelReconciler) oLlamaPodForModel(m *kubeaiv1.Model, index int32) *cor
Exec: &corev1.ExecAction{
Command: []string{
"bash", "-c",
// Pull model and copy to rename it to Model.metadata.name.
// See Ollama issue for rename/copy workaround: https://github.com/ollama/ollama/issues/5914
// NOTE: The cp command should just create a pointer to the old model, not copy data
// (see https://github.com/ollama/ollama/issues/5914#issuecomment-2248168474).
// Use `ollama run` to send a single prompt to ollama to load the model into memory
// before the Pod becomes Ready. (by default it will load on the first prompt request).
fmt.Sprintf("/bin/ollama pull %s && /bin/ollama cp %s %s && /bin/ollama run %s hi",
ollamaModelRef, ollamaModelRef, m.Name, m.Name),
startupProbeScript,
},
},
},
Expand Down Expand Up @@ -543,6 +564,41 @@ func (r *ModelReconciler) applyResourceProfile(model *kubeaiv1.Model) (bool, err
return changed, nil
}

// applySelfLabels makes the Model's labels mirror its .spec.features: each
// feature is represented as a "features.kubeai.org/<feature>": "true" label,
// and stale feature labels (for features no longer present in the spec) are
// removed. These labels let the OpenAI-compatible models endpoint filter
// Models by feature using a label selector.
//
// It returns true if the label set was modified and the object needs an update.
func (r *ModelReconciler) applySelfLabels(model *kubeaiv1.Model) bool {
	specFeatures := make(map[kubeaiv1.ModelFeature]struct{}, len(model.Spec.Features))
	for _, f := range model.Spec.Features {
		specFeatures[f] = struct{}{}
	}

	if model.GetLabels() == nil {
		model.SetLabels(map[string]string{})
	}
	labels := model.GetLabels()

	// Match on the domain including the "/" separator so that only keys in
	// the feature label domain are touched — a bare HasPrefix on the domain
	// alone would also match (and wrongly delete) keys from any other domain
	// that merely starts with the same string.
	prefix := kubeaiv1.ModelFeatureLabelDomain + "/"

	var changed bool

	// Delete feature labels that no longer correspond to a spec feature.
	// (Deleting map entries while ranging over the map is safe in Go.)
	for key := range labels {
		if !strings.HasPrefix(key, prefix) {
			continue
		}
		feat := kubeaiv1.ModelFeature(strings.TrimPrefix(key, prefix))
		if _, ok := specFeatures[feat]; !ok {
			delete(labels, key)
			changed = true
		}
	}

	// Add labels for spec features that are missing.
	for feat := range specFeatures {
		key := prefix + string(feat)
		if _, ok := labels[key]; !ok {
			labels[key] = "true"
			changed = true
		}
	}

	return changed
}

func resourcesEqual(a, b corev1.ResourceList) bool {
if len(a) != len(b) {
return false
Expand Down
39 changes: 32 additions & 7 deletions internal/openaiserver/models.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,19 +5,44 @@ import (
"net/http"

kubeaiv1 "github.com/substratusai/kubeai/api/v1"
"sigs.k8s.io/controller-runtime/pkg/client"
)

func (h *Handler) getModels(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "application/json")

list := &kubeaiv1.ModelList{}
if err := h.K8sClient.List(r.Context(), list); err != nil {
sendErrorResponse(w, http.StatusInternalServerError, "failed to list models: %v", err)
return
// List models based on the "feature" query parameter.
// Example (default): /v1/models
// Example (single): /v1/models?feature=TextEmbedding
// Example (multiple): /v1/models?feature=TextGeneration&feature=TextEmbedding
features := r.URL.Query()["feature"]
if len(features) == 0 {
// Default to listing text generation models.
// Do this to play nicely with chat UIs like OpenWebUI.
features = []string{kubeaiv1.ModelFeatureTextGeneration}
}

var k8sModels []kubeaiv1.Model
k8sModelNames := map[string]struct{}{}
for _, feature := range features {
// NOTE(nstogner): Could not find a way to do an OR query with the client,
// so we just do multiple queries and merge the results.
labelSelector := client.MatchingLabels{kubeaiv1.ModelFeatureLabelDomain + "/" + feature: "true"}
list := &kubeaiv1.ModelList{}
if err := h.K8sClient.List(r.Context(), list, labelSelector); err != nil {
sendErrorResponse(w, http.StatusInternalServerError, "failed to list models: %v", err)
return
}
for _, model := range list.Items {
if _, ok := k8sModelNames[model.Name]; !ok {
k8sModels = append(k8sModels, model)
k8sModelNames[model.Name] = struct{}{}
}
}
}

models := make([]Model, len(list.Items))
for i, k8sModel := range list.Items {
models := make([]Model, len(k8sModels))
for i, k8sModel := range k8sModels {
model := Model{}
model.FromK8sModel(&k8sModel)
models[i] = model
Expand All @@ -39,7 +64,7 @@ type Model struct {

// Additional (non-OpenAI) fields

Features []string `json:"features,omitempty"`
Features []kubeaiv1.ModelFeature `json:"features,omitempty"`
}

func (m *Model) FromK8sModel(model *kubeaiv1.Model) {
Expand Down

0 comments on commit aa33110

Please sign in to comment.