Skip to content

Commit

Permalink
faster whisper implementation (#177)
Browse files Browse the repository at this point in the history
Co-authored-by: Nick Stogner <[email protected]>
  • Loading branch information
samos123 and nstogner authored Sep 5, 2024
1 parent c062fb7 commit 604966d
Show file tree
Hide file tree
Showing 15 changed files with 395 additions and 34 deletions.
11 changes: 7 additions & 4 deletions api/v1/model_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ type ModelSpec struct {

Features []ModelFeature `json:"features"`

// +kubebuilder:validation:Enum=OLlama;VLLM
// +kubebuilder:validation:Enum=OLlama;VLLM;FasterWhisper
Engine string `json:"engine"`

Replicas *int32 `json:"replicas,omitempty"`
Expand Down Expand Up @@ -57,17 +57,20 @@ type ModelSpec struct {
Env map[string]string `json:"env,omitempty"`
}

// +kubebuilder:validation:Enum=TextGeneration;TextEmbedding
// +kubebuilder:validation:Enum=TextGeneration;TextEmbedding;SpeechToText
type ModelFeature string

const (
ModelFeatureTextGeneration = "TextGeneration"
ModelFeatureTextEmbedding = "TextEmbedding"
// TODO (samos123): Add validation that Speech to Text only supports Faster Whisper.
ModelFeatureSpeechToText = "SpeechToText"
)

const (
OLlamaEngine = "OLlama"
VLLMEngine = "VLLM"
OLlamaEngine = "OLlama"
VLLMEngine = "VLLM"
FasterWhisperEngine = "FasterWhisper"
)

// ModelStatus defines the observed state of Model
Expand Down
6 changes: 3 additions & 3 deletions charts/kubeai/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ type: application
# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
version: 0.2.7
version: 0.3.0

# This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application. Versions are not expected to
Expand All @@ -26,12 +26,12 @@ dependencies:
# https://kubernetes.io/docs/concepts/extend-kubernetes/api-extension/custom-resources/
- name: crds
condition: crds.enabled
version: 0.1.0
version: 0.1.1
# Models are the machine learning models that kubeai will serve.
# These are instances of the Model custom resource.
- name: models
condition: models.enabled
version: 0.1.0
version: 0.1.1


keywords: ["LLM", "AI"]
Expand Down
2 changes: 1 addition & 1 deletion charts/kubeai/charts/crds/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ type: application
# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
version: 0.1.0
version: 0.1.1

# This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application. Versions are not expected to
Expand Down
2 changes: 2 additions & 0 deletions charts/kubeai/charts/crds/crds/kubeai.org_models.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ spec:
enum:
- OLlama
- VLLM
- FasterWhisper
type: string
env:
additionalProperties:
Expand All @@ -59,6 +60,7 @@ spec:
enum:
- TextGeneration
- TextEmbedding
- SpeechToText
type: string
type: array
image:
Expand Down
2 changes: 1 addition & 1 deletion charts/kubeai/charts/models/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ type: application
# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
version: 0.1.0
version: 0.1.1

# This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application. Versions are not expected to
Expand Down
7 changes: 7 additions & 0 deletions charts/kubeai/charts/models/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -76,4 +76,11 @@ catalog:
url: "ollama://qwen2:0.5b"
engine: OLlama
resourceProfile: cpu:1
faster-whisper-medium-en-cpu:
enabled: false
features: ["SpeechToText"]
owner: Systran
url: "hf://Systran/faster-whisper-medium.en"
engine: FasterWhisper
resourceProfile: cpu:1

4 changes: 4 additions & 0 deletions charts/kubeai/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,10 @@ modelServers:
OLlama:
images:
default: "ollama/ollama:latest"
FasterWhisper:
images:
default: "fedirz/faster-whisper-server:latest-cpu"
nvidia-gpu: "fedirz/faster-whisper-server:latest-cuda"

resourceProfiles:
cpu:
Expand Down
10 changes: 6 additions & 4 deletions docs/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,13 @@ The simple AI platform that runs on Kubernetes.
> \- Some Google Engineer
✅️ Drop-in replacement for OpenAI with API compatibility
🚀 Serve OSS LLMs on CPUs or GPUs
⚖️ Scale from zero, autoscale based on load
🚀 Serve OSS LLMs on CPUs or GPUs
💬 Fast speech to text on CPU or GPU
🛠️ Zero dependencies (no Istio, Knative, etc.)
🤖 Operates OSS model servers (vLLM and Ollama)
🔋 Additional OSS addons included ([OpenWebUI](https://github.com/open-webui/open-webui) i.e. ChatGPT UI)
✉️ Plug-n-play with cloud messaging systems (Kafka, PubSub, etc.)
🤖 Operates OSS model servers (vLLM, Ollama, FasterWhisper)
🔋 Chat UI included ([OpenWebUI](https://github.com/open-webui/open-webui))
✉️ Batch inference with messaging systems (Kafka, PubSub, etc.)

## Architecture

Expand Down Expand Up @@ -122,6 +123,7 @@ Checkout our documenation on [kubeai.org](https://www.kubeai.org) to find info o
/v1/completions
/v1/embeddings
/v1/models
/v1/audio/transcriptions

# Planned #
# /v1/assistants/*
Expand Down
48 changes: 48 additions & 0 deletions docs/how-to/configure-speech-to-text.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# Configure Speech To Text

KubeAI provides a Speech to Text endpoint that can be used to transcribe audio files. This guide will walk you through the steps to enable this feature.

## Enable Speech to Text model
You can create new models by creating a Model CRD object or by enabling a model from the model catalog.

### Enable from model catalog
KubeAI provides predefined models in the model catalog. To enable the Speech to Text model, you can set the `enabled` flag to `true` in the `helm-values.yaml` file.

```yaml
models:
catalog:
faster-whisper-medium-en-cpu:
enabled: true
minReplicas: 1
```
### Enable by creating Model CRD
You can also create a Model CRD object to enable the Speech to Text model. Here is an example of a Model CRD object for the Speech to Text model:
```yaml
apiVersion: kubeai.org/v1
kind: Model
metadata:
name: faster-whisper-medium-en-cpu
spec:
features: [SpeechToText]
owner: Systran
url: hf://Systran/faster-whisper-medium.en
engine: FasterWhisper
minReplicas: 0
maxReplicas: 3
resourceProfile: cpu:1
```
## Usage
The Speech to Text endpoint is available at `/openai/v1/audio/transcriptions`.

Example usage using curl:

```bash
curl -L -o kubeai.mp4 https://github.com/user-attachments/assets/711d1279-6af9-4c6c-a052-e59e7730b757
curl http://localhost:8000/openai/v1/audio/transcriptions \
-F "[email protected]" \
-F "language=en" \
-F "model=faster-whisper-medium-en-cpu"
```
5 changes: 3 additions & 2 deletions internal/config/system.go
Original file line number Diff line number Diff line change
Expand Up @@ -66,8 +66,9 @@ type MessageStream struct {
}

type ModelServers struct {
OLlama ModelServer `json:"OLlama"`
VLLM ModelServer `json:"VLLM"`
OLlama ModelServer `json:"OLlama"`
VLLM ModelServer `json:"VLLM"`
FasterWhisper ModelServer `json:"FasterWhisper"`
}

type ModelServer struct {
Expand Down
134 changes: 134 additions & 0 deletions internal/modelcontroller/model_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,8 @@ func (r *ModelReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl
switch model.Spec.Engine {
case kubeaiv1.OLlamaEngine:
podForModel = r.oLlamaPodForModel
case kubeaiv1.FasterWhisperEngine:
podForModel = r.fasterWhisperPodForModel
default:
podForModel = r.vLLMPodForModel
}
Expand Down Expand Up @@ -481,6 +483,136 @@ func (r *ModelReconciler) oLlamaPodForModel(m *kubeaiv1.Model, index int32) *cor

}

// fasterWhisperPodForModel builds the Pod used to serve a Model with the
// FasterWhisper engine. The server container listens on port 8000 and exposes
// a /health endpoint, which is used by the startup, readiness, and liveness
// probes.
func (r *ModelReconciler) fasterWhisperPodForModel(m *kubeaiv1.Model, index int32) *corev1.Pod {
	lbs := labelsForModel(m)
	ann := r.annotationsForModel(m)
	// Default the routed pod port to the server's listen port unless the user
	// overrode it via annotation.
	if _, ok := ann[kubeaiv1.ModelPodPortAnnotation]; !ok {
		ann[kubeaiv1.ModelPodPortAnnotation] = "8000"
	}

	args := append([]string{}, m.Spec.Args...)

	env := []corev1.EnvVar{
		{
			// The server reads the model ID from WHISPER__MODEL; strip the
			// "hf://" scheme so it is a plain HuggingFace repo ID.
			Name:  "WHISPER__MODEL",
			Value: strings.TrimPrefix(m.Spec.URL, "hf://"),
		},
		{
			// The server's bundled UI is not needed when fronted by KubeAI.
			Name:  "ENABLE_UI",
			Value: "false",
		},
		{
			// TODO: Conditionally set this token based on whether
			// huggingface is the model source.
			Name: "HUGGING_FACE_HUB_TOKEN",
			ValueFrom: &corev1.EnvVarSource{
				SecretKeyRef: &corev1.SecretKeySelector{
					LocalObjectReference: corev1.LocalObjectReference{
						Name: r.HuggingfaceSecretName,
					},
					Key:      "token",
					Optional: ptr.To(true),
				},
			},
		},
	}
	// Append user-supplied env vars in sorted key order so the generated Pod
	// spec is deterministic across reconciles (map iteration order is random).
	envKeys := make([]string, 0, len(m.Spec.Env))
	for key := range m.Spec.Env {
		envKeys = append(envKeys, key)
	}
	sort.Strings(envKeys)
	for _, key := range envKeys {
		env = append(env, corev1.EnvVar{
			Name:  key,
			Value: m.Spec.Env[key],
		})
	}

	// All three probes hit the same HTTP health endpoint on the named "http"
	// port; define the handler once instead of repeating the literal.
	healthProbeHandler := corev1.ProbeHandler{
		HTTPGet: &corev1.HTTPGetAction{
			Path: "/health",
			Port: intstr.FromString("http"),
		},
	}

	pod := &corev1.Pod{
		ObjectMeta: metav1.ObjectMeta{
			Name:        fmt.Sprintf("model-%s-%d", m.Name, index),
			Namespace:   m.Namespace,
			Labels:      lbs,
			Annotations: ann,
		},
		Spec: corev1.PodSpec{
			NodeSelector: m.Spec.NodeSelector,
			Containers: []corev1.Container{
				{
					Name:      "server",
					Image:     m.Spec.Image,
					Args:      args,
					Env:       env,
					Resources: *m.Spec.Resources,
					Ports: []corev1.ContainerPort{
						{
							ContainerPort: 8000,
							Protocol:      corev1.ProtocolTCP,
							Name:          "http",
						},
					},
					StartupProbe: &corev1.Probe{
						// Give the model up to 30 minutes to start up
						// (900 failures * 2s period = 1800s).
						FailureThreshold: 900,
						PeriodSeconds:    2,
						TimeoutSeconds:   2,
						SuccessThreshold: 1,
						ProbeHandler:     healthProbeHandler,
					},
					ReadinessProbe: &corev1.Probe{
						FailureThreshold: 3,
						PeriodSeconds:    10,
						TimeoutSeconds:   2,
						SuccessThreshold: 1,
						ProbeHandler:     healthProbeHandler,
					},
					LivenessProbe: &corev1.Probe{
						FailureThreshold: 3,
						PeriodSeconds:    30,
						TimeoutSeconds:   3,
						SuccessThreshold: 1,
						ProbeHandler:     healthProbeHandler,
					},
					VolumeMounts: []corev1.VolumeMount{
						{
							// In-memory /dev/shm mount — presumably for the
							// server's shared-memory use; mirrors the other
							// engine pod specs. TODO confirm it is required
							// for FasterWhisper.
							Name:      "dshm",
							MountPath: "/dev/shm",
						},
					},
				},
			},
			Volumes: []corev1.Volume{
				{
					Name: "dshm",
					VolumeSource: corev1.VolumeSource{
						EmptyDir: &corev1.EmptyDirVolumeSource{
							Medium: corev1.StorageMediumMemory,
						},
					},
				},
			},
		},
	}

	return pod
}

// labelsForModel returns the canonical label set applied to every Pod created
// for the given Model.
func labelsForModel(m *kubeaiv1.Model) map[string]string {
	labels := make(map[string]string, 2)
	labels["app"] = "model"
	labels["model"] = m.Name
	return labels
}
Expand Down Expand Up @@ -571,6 +703,8 @@ func (r *ModelReconciler) lookupServerImage(model *kubeaiv1.Model, profile confi
switch model.Spec.Engine {
case kubeaiv1.OLlamaEngine:
serverImgs = r.ModelServers.OLlama.Images
case kubeaiv1.FasterWhisperEngine:
serverImgs = r.ModelServers.FasterWhisper.Images
default:
serverImgs = r.ModelServers.VLLM.Images
}
Expand Down
Loading

0 comments on commit 604966d

Please sign in to comment.