Skip to content

Commit

Permalink
faster whisper implementation (#177)
Browse files Browse the repository at this point in the history
Co-authored-by: Nick Stogner <[email protected]>
  • Loading branch information
samos123 and nstogner authored Sep 5, 2024
1 parent c062fb7 commit 604966d
Show file tree
Hide file tree
Showing 15 changed files with 395 additions and 34 deletions.
11 changes: 7 additions & 4 deletions api/v1/model_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ type ModelSpec struct {

Features []ModelFeature `json:"features"`

// +kubebuilder:validation:Enum=OLlama;VLLM
// +kubebuilder:validation:Enum=OLlama;VLLM;FasterWhisper
Engine string `json:"engine"`

Replicas *int32 `json:"replicas,omitempty"`
Expand Down Expand Up @@ -57,17 +57,20 @@ type ModelSpec struct {
Env map[string]string `json:"env,omitempty"`
}

// +kubebuilder:validation:Enum=TextGeneration;TextEmbedding
// +kubebuilder:validation:Enum=TextGeneration;TextEmbedding;SpeechToText
type ModelFeature string

const (
ModelFeatureTextGeneration = "TextGeneration"
ModelFeatureTextEmbedding = "TextEmbedding"
// TODO (samos123): Add validation that Speech to Text only supports Faster Whisper.
ModelFeatureSpeechToText = "SpeechToText"
)

const (
OLlamaEngine = "OLlama"
VLLMEngine = "VLLM"
OLlamaEngine = "OLlama"
VLLMEngine = "VLLM"
FasterWhisperEngine = "FasterWhisper"
)

// ModelStatus defines the observed state of Model
Expand Down
6 changes: 3 additions & 3 deletions charts/kubeai/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ type: application
# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
version: 0.2.7
version: 0.3.0

# This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application. Versions are not expected to
Expand All @@ -26,12 +26,12 @@ dependencies:
# https://kubernetes.io/docs/concepts/extend-kubernetes/api-extension/custom-resources/
- name: crds
condition: crds.enabled
version: 0.1.0
version: 0.1.1
# Models are the machine learning models that kubeai will serve.
# These are instances of the Model custom resource.
- name: models
condition: models.enabled
version: 0.1.0
version: 0.1.1


keywords: ["LLM", "AI"]
Expand Down
2 changes: 1 addition & 1 deletion charts/kubeai/charts/crds/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ type: application
# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
version: 0.1.0
version: 0.1.1

# This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application. Versions are not expected to
Expand Down
2 changes: 2 additions & 0 deletions charts/kubeai/charts/crds/crds/kubeai.org_models.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ spec:
enum:
- OLlama
- VLLM
- FasterWhisper
type: string
env:
additionalProperties:
Expand All @@ -59,6 +60,7 @@ spec:
enum:
- TextGeneration
- TextEmbedding
- SpeechToText
type: string
type: array
image:
Expand Down
2 changes: 1 addition & 1 deletion charts/kubeai/charts/models/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ type: application
# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
version: 0.1.0
version: 0.1.1

# This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application. Versions are not expected to
Expand Down
7 changes: 7 additions & 0 deletions charts/kubeai/charts/models/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -76,4 +76,11 @@ catalog:
url: "ollama://qwen2:0.5b"
engine: OLlama
resourceProfile: cpu:1
faster-whisper-medium-en-cpu:
enabled: false
features: ["SpeechToText"]
owner: Systran
url: "hf://Systran/faster-whisper-medium.en"
engine: FasterWhisper
resourceProfile: cpu:1

4 changes: 4 additions & 0 deletions charts/kubeai/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,10 @@ modelServers:
OLlama:
images:
default: "ollama/ollama:latest"
FasterWhisper:
images:
default: "fedirz/faster-whisper-server:latest-cpu"
nvidia-gpu: "fedirz/faster-whisper-server:latest-cuda"

resourceProfiles:
cpu:
Expand Down
10 changes: 6 additions & 4 deletions docs/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,13 @@ The simple AI platform that runs on Kubernetes.
> \- Some Google Engineer
✅️ Drop-in replacement for OpenAI with API compatibility
🚀 Serve OSS LLMs on CPUs or GPUs
⚖️ Scale from zero, autoscale based on load
🚀 Serve OSS LLMs on CPUs or GPUs
💬 Fast speech to text on CPU or GPU
🛠️ Zero dependencies (no Istio, Knative, etc.)
🤖 Operates OSS model servers (vLLM and Ollama)
🔋 Additional OSS addons included ([OpenWebUI](https://github.com/open-webui/open-webui) i.e. ChatGPT UI)
✉️ Plug-n-play with cloud messaging systems (Kafka, PubSub, etc.)
🤖 Operates OSS model servers (vLLM, Ollama, FasterWhisper)
🔋 Chat UI included ([OpenWebUI](https://github.com/open-webui/open-webui))
✉️ Batch inference with messaging systems (Kafka, PubSub, etc.)

## Architecture

Expand Down Expand Up @@ -122,6 +123,7 @@ Checkout our documenation on [kubeai.org](https://www.kubeai.org) to find info o
/v1/completions
/v1/embeddings
/v1/models
/v1/audio/transcriptions

# Planned #
# /v1/assistants/*
Expand Down
48 changes: 48 additions & 0 deletions docs/how-to/configure-speech-to-text.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# Configure Speech To Text

KubeAI provides a Speech to Text endpoint that can be used to transcribe audio files. This guide will walk you through the steps to enable this feature.

## Enable Speech to Text model
You can create new models by creating a Model CRD object or by enabling a model from the model catalog.

### Enable from model catalog
KubeAI provides predefined models in the model catalog. To enable the Speech to Text model, you can set the `enabled` flag to `true` in the `helm-values.yaml` file.

```yaml
models:
catalog:
faster-whisper-medium-en-cpu:
enabled: true
minReplicas: 1
```
### Enable by creating Model CRD
You can also create a Model CRD object to enable the Speech to Text model. Here is an example of a Model CRD object for the Speech to Text model:
```yaml
apiVersion: kubeai.org/v1
kind: Model
metadata:
name: faster-whisper-medium-en-cpu
spec:
features: [SpeechToText]
owner: Systran
url: hf://Systran/faster-whisper-medium.en
engine: FasterWhisper
minReplicas: 0
maxReplicas: 3
resourceProfile: cpu:1
```
## Usage
The Speech to Text endpoint is available at `/openai/v1/audio/transcriptions`.

Example usage using curl:

```bash
curl -L -o kubeai.mp4 https://github.com/user-attachments/assets/711d1279-6af9-4c6c-a052-e59e7730b757
curl http://localhost:8000/openai/v1/audio/transcriptions \
-F "[email protected]" \
-F "language=en" \
-F "model=faster-whisper-medium-en-cpu"
```
5 changes: 3 additions & 2 deletions internal/config/system.go
Original file line number Diff line number Diff line change
Expand Up @@ -66,8 +66,9 @@ type MessageStream struct {
}

type ModelServers struct {
OLlama ModelServer `json:"OLlama"`
VLLM ModelServer `json:"VLLM"`
OLlama ModelServer `json:"OLlama"`
VLLM ModelServer `json:"VLLM"`
FasterWhisper ModelServer `json:"FasterWhisper"`
}

type ModelServer struct {
Expand Down
134 changes: 134 additions & 0 deletions internal/modelcontroller/model_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,8 @@ func (r *ModelReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl
switch model.Spec.Engine {
case kubeaiv1.OLlamaEngine:
podForModel = r.oLlamaPodForModel
case kubeaiv1.FasterWhisperEngine:
podForModel = r.fasterWhisperPodForModel
default:
podForModel = r.vLLMPodForModel
}
Expand Down Expand Up @@ -481,6 +483,136 @@ func (r *ModelReconciler) oLlamaPodForModel(m *kubeaiv1.Model, index int32) *cor

}

// fasterWhisperPodForModel builds the Pod used to serve a Model with the
// FasterWhisper engine. The server container listens on port 8000 and exposes
// a /health endpoint, which is used by the startup, readiness, and liveness
// probes.
func (r *ModelReconciler) fasterWhisperPodForModel(m *kubeaiv1.Model, index int32) *corev1.Pod {
	lbs := labelsForModel(m)
	ann := r.annotationsForModel(m)
	// Default the routed pod port to the server's listen port unless the user
	// overrode it via annotation.
	if _, ok := ann[kubeaiv1.ModelPodPortAnnotation]; !ok {
		ann[kubeaiv1.ModelPodPortAnnotation] = "8000"
	}

	args := append([]string{}, m.Spec.Args...)

	env := []corev1.EnvVar{
		{
			// The server reads the model ID from WHISPER__MODEL; strip the
			// "hf://" scheme so it is a plain HuggingFace repo ID.
			Name:  "WHISPER__MODEL",
			Value: strings.TrimPrefix(m.Spec.URL, "hf://"),
		},
		{
			// The server's bundled UI is not needed when fronted by KubeAI.
			Name:  "ENABLE_UI",
			Value: "false",
		},
		{
			// TODO: Conditionally set this token based on whether
			// huggingface is the model source.
			Name: "HUGGING_FACE_HUB_TOKEN",
			ValueFrom: &corev1.EnvVarSource{
				SecretKeyRef: &corev1.SecretKeySelector{
					LocalObjectReference: corev1.LocalObjectReference{
						Name: r.HuggingfaceSecretName,
					},
					Key:      "token",
					Optional: ptr.To(true),
				},
			},
		},
	}
	// Append user-supplied env vars in sorted key order so the generated Pod
	// spec is deterministic across reconciles (map iteration order is random).
	envKeys := make([]string, 0, len(m.Spec.Env))
	for key := range m.Spec.Env {
		envKeys = append(envKeys, key)
	}
	sort.Strings(envKeys)
	for _, key := range envKeys {
		env = append(env, corev1.EnvVar{
			Name:  key,
			Value: m.Spec.Env[key],
		})
	}

	// All three probes hit the same HTTP health endpoint on the named "http"
	// port; define the handler once instead of repeating the literal.
	healthProbeHandler := corev1.ProbeHandler{
		HTTPGet: &corev1.HTTPGetAction{
			Path: "/health",
			Port: intstr.FromString("http"),
		},
	}

	pod := &corev1.Pod{
		ObjectMeta: metav1.ObjectMeta{
			Name:        fmt.Sprintf("model-%s-%d", m.Name, index),
			Namespace:   m.Namespace,
			Labels:      lbs,
			Annotations: ann,
		},
		Spec: corev1.PodSpec{
			NodeSelector: m.Spec.NodeSelector,
			Containers: []corev1.Container{
				{
					Name:      "server",
					Image:     m.Spec.Image,
					Args:      args,
					Env:       env,
					Resources: *m.Spec.Resources,
					Ports: []corev1.ContainerPort{
						{
							ContainerPort: 8000,
							Protocol:      corev1.ProtocolTCP,
							Name:          "http",
						},
					},
					StartupProbe: &corev1.Probe{
						// Give the model up to 30 minutes to start up
						// (900 failures * 2s period = 1800s).
						FailureThreshold: 900,
						PeriodSeconds:    2,
						TimeoutSeconds:   2,
						SuccessThreshold: 1,
						ProbeHandler:     healthProbeHandler,
					},
					ReadinessProbe: &corev1.Probe{
						FailureThreshold: 3,
						PeriodSeconds:    10,
						TimeoutSeconds:   2,
						SuccessThreshold: 1,
						ProbeHandler:     healthProbeHandler,
					},
					LivenessProbe: &corev1.Probe{
						FailureThreshold: 3,
						PeriodSeconds:    30,
						TimeoutSeconds:   3,
						SuccessThreshold: 1,
						ProbeHandler:     healthProbeHandler,
					},
					VolumeMounts: []corev1.VolumeMount{
						{
							// In-memory /dev/shm mount — presumably for the
							// server's shared-memory use; mirrors the other
							// engine pod specs. TODO confirm it is required
							// for FasterWhisper.
							Name:      "dshm",
							MountPath: "/dev/shm",
						},
					},
				},
			},
			Volumes: []corev1.Volume{
				{
					Name: "dshm",
					VolumeSource: corev1.VolumeSource{
						EmptyDir: &corev1.EmptyDirVolumeSource{
							Medium: corev1.StorageMediumMemory,
						},
					},
				},
			},
		},
	}

	return pod
}

// labelsForModel returns the canonical label set applied to every Pod created
// for the given Model.
func labelsForModel(m *kubeaiv1.Model) map[string]string {
	labels := make(map[string]string, 2)
	labels["app"] = "model"
	labels["model"] = m.Name
	return labels
}
Expand Down Expand Up @@ -571,6 +703,8 @@ func (r *ModelReconciler) lookupServerImage(model *kubeaiv1.Model, profile confi
switch model.Spec.Engine {
case kubeaiv1.OLlamaEngine:
serverImgs = r.ModelServers.OLlama.Images
case kubeaiv1.FasterWhisperEngine:
serverImgs = r.ModelServers.FasterWhisper.Images
default:
serverImgs = r.ModelServers.VLLM.Images
}
Expand Down
Loading

0 comments on commit 604966d

Please sign in to comment.