text-embeddings-inference updated example trussless #386

Open
wants to merge 9 commits into base: main
Changes from 7 commits
50 changes: 50 additions & 0 deletions internal/config.yaml
Contributor
stray file?
@@ -0,0 +1,50 @@

model_metadata:
  tags:
  - openai-compatible
model_name: briton-spec-dec
python_version: py310
requirements: []
resources:
  accelerator: A10G
  cpu: '1'
  memory: 24Gi
  use_gpu: true
runtime:
  predict_concurrency: 1000
secrets:
  hf_access_token: None
trt_llm:
  draft:
    build:
      base_model: deepseek
      checkpoint_repository:
        repo: deepseek-ai/deepseek-coder-1.3b-instruct
        source: HF
      max_seq_len: 10000
      plugin_configuration:
        use_paged_context_fmha: true
      tensor_parallel_count: 1
    runtime:
      batch_scheduler_policy: max_utilization
      enable_chunked_context: true
      kv_cache_free_gpu_mem_fraction: 0.6
      num_draft_tokens: 4
  target:
    build:
      base_model: deepseek
      checkpoint_repository:
        repo: deepseek-ai/deepseek-coder-1.3b-instruct
        source: HF
      max_draft_len: 10
      max_seq_len: 10000
      plugin_configuration:
        use_paged_context_fmha: true
        speculative_decoding_mode: DRAFT_TOKENS_EXTERNAL
      tensor_parallel_count: 1
    runtime:
      batch_scheduler_policy: max_utilization
      enable_chunked_context: true
      kv_cache_free_gpu_mem_fraction: 0.65
      request_default_max_tokens: 1000
      total_token_limit: 500000
9 changes: 9 additions & 0 deletions text-embeddings-inference/.internal/Dockerfile
@@ -0,0 +1,9 @@
ARG TAG=1.6
# This builds a Truss-compatible image on top of the text-embeddings-inference base image.
# It mainly requires python3.
# Optionally, git and git-lfs are installed to allow easy cloning of Hugging Face model repos.
FROM ghcr.io/huggingface/text-embeddings-inference:${TAG}
RUN apt-get update && apt-get install -y python3 python3-pip git git-lfs
RUN git lfs install
ENTRYPOINT ["text-embeddings-router"]
CMD ["--json-output"]
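
For a one-off local build outside the roll-out script below, a minimal sketch; the image name is illustrative, and the `89-1.6` tag mirrors the Ada Lovelace prefix used by that script:

```bash
# Build a Truss-compatible TEI image against the Ada Lovelace (89-) upstream tag.
# The output image name is a placeholder; pick any tag prefix listed in roll_out_docker.sh.
docker build --build-arg TAG=89-1.6 -t my-tei-truss-base:89-1.6 .
```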
28 changes: 28 additions & 0 deletions text-embeddings-inference/.internal/roll_out_docker.sh
@@ -0,0 +1,28 @@
#!/bin/bash
set -e

# Map architectures to prefixes
declare -A ARCHES=(
["cpu"]="cpu-"
["turing"]="turing-"
["ampere80"]=""
["ampere86"]="86-"
["adalovelace"]="89-"
["hopper"]="hopper-"
)

# Define version and target
VERSION="1.6"
TARGET="baseten/text-embeddings-inference-mirror"

# Build and push images
for ARCH in "${!ARCHES[@]}"; do
  ARCH_PREFIX=${ARCHES[$ARCH]}
  TAG="${TARGET}:${ARCH_PREFIX}${VERSION}"

  echo "Building and pushing image for $ARCH: $TAG"

  docker buildx build -t "$TAG" --build-arg TAG="${ARCH_PREFIX}${VERSION}" --push .
done

echo "All images have been built and pushed."
123 changes: 100 additions & 23 deletions text-embeddings-inference/README.md
@@ -1,32 +1,109 @@
# Text Embeddings Inference Truss (A100)
This is an example of a Truss model that uses the Text Embeddings Inference API.
# Text Embeddings Inference Truss

## How to Deploy
In the `config.yaml` file, you can specify the model to use, as well as other arguments per the [Text Embeddings Inference API](https://huggingface.co/docs/text-embeddings-inference) documentation.
Note that not all models are supported by TEI.
This is a [Truss](https://truss.baseten.co/) to deploy [text-embeddings-inference](https://github.com/huggingface/text-embeddings-inference), a high-performance embedding and reranking API.

michaelfeil marked this conversation as resolved.
To run the model, you can use the following command:
```bash
truss push
## Deployment

Before deployment:

1. Make sure you have a [Baseten account](https://app.baseten.co/signup) and [API key](https://app.baseten.co/settings/account/api_keys).
2. Install the latest version of Truss: `pip install --upgrade truss`
3. [Required for gated models] Retrieve your Hugging Face token from the [settings](https://huggingface.co/settings/tokens). Set your Hugging Face token as a Baseten secret [here](https://app.baseten.co/settings/secrets) with the key `hf_access_token`.

First, clone this repository:

```sh
git clone https://github.com/basetenlabs/truss-examples.git
cd truss-examples/text-embeddings-inference
```

With `text-embeddings-inference` as your working directory, you can deploy the model with the following command. Paste your Baseten API key if prompted.

```sh
truss push --publish
```

## Performance Optimization

The `config.yaml` contains a few variables that can be tuned, depending on:
- which GPU is used
- which model is deployed
- how many concurrent requests users are sending

The deployment example is for BERT-large on an NVIDIA L4. BERT-large has a maximum sequence length of 512 tokens per sentence.
For the BERT-large architecture on the L4, gains are marginal above a batch size of 16,000 tokens.
Contributor
Bert=>BERT


### Concurrent requests
```
--max-concurrent-requests 40
# and
runtime:
  predict_concurrency: 40
```
These two settings control the number of parallel `POST` requests.
In this case we allow 40 parallel requests per replica, which lets requests from multiple users be batched together to reach high token counts. Even 40 parallel requests with a single sequence each could fully utilize the GPU: `1*40*512 = 20480` tokens.
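
For reference, a sketch of where these two knobs live in `config.yaml` (values are illustrative and the `start_command` is shortened from the full config in this example):

```yaml
docker_server:
  # --max-concurrent-requests caps parallel requests inside the TEI server
  start_command: sh -c "text-embeddings-router --port 7997 --model-id /data/local-model --max-concurrent-requests 40"
runtime:
  # predict_concurrency caps parallel requests forwarded to each replica
  predict_concurrency: 40
```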


### Tokens per batch
```
--max-batch-tokens 32768
```

## How to Generate Embeddings
The truss expects:
- "texts" parameter with either a single string or an array of strings.
- "stream" parameter with a boolean value (default is false).
This sets the total number of tokens in a batch. For embedding models, this determines the VRAM usage.
Since most of TEI's models use a `nested` attention implementation, `32768` tokens could mean `64 sentences with 512 tokens` each or `512 sentences with 64 tokens` each. While the former takes slightly longer to compute, peak VRAM usage stays roughly the same. For `llama`- or `mistral`-based `7b` embedding models, we recommend a lower setting, e.g.
```
--max-batch-tokens 8192
```
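As a rough sanity check with the 512-token sequence length above: `32768 / 512 = 64` full-length sequences fit in one batch, while the lower `8192` setting caps it at `8192 / 512 = 16`.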
michaelfeil marked this conversation as resolved.

### Client batch size
```
--max-client-batch-size 256
```
This determines the maximum number of sentences / items allowed in a single request.
For optimal autoscaling, which Baseten's infrastructure regulates via metrics such as requests per second, you want to set this as low as possible. The OpenAI API historically capped it at the equivalent of `--max-client-batch-size 32`, which can enable more aggressive autoscaling and thus better latency. On the other hand, frameworks such as LlamaIndex, LangChain, or Haystack may prefer or even require higher batch sizes, especially if user code sends requests one by one in a for loop. This depends on your users and how you plan to use your deployment.
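
As an illustration of what this limits, a client-side sketch (the model URL and batch size are placeholders) that sends one request carrying a batch of texts; the list length must stay at or below `--max-client-batch-size`:

```python
import os

import requests

# A client batch of 32 items sent as a single request (32 <= --max-client-batch-size).
texts = [f"document number {i}" for i in range(32)]

resp = requests.post(
    "https://model-xxx.api.baseten.co/environments/production/predict",  # placeholder URL
    headers={"Authorization": f"Api-Key {os.environ['BASETEN_API_KEY']}"},
    json={"input": texts},
)
# Batches larger than --max-client-batch-size are rejected by the server.
resp.raise_for_status()
```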

Contributor
this is a little hard to follow

Contributor Author
updated it, and just set it 32.

### Endpoint and OpenAPI
Change `predict_endpoint` to `/rerank` or `/predict` if you want to use the rerank or classification endpoint.
Embedding model:
```yaml
predict_endpoint: /v1/embeddings
```
Rerank model:
```yaml
predict_endpoint: /rerank
```
Classification model:
```yaml
predict_endpoint: /predict
```
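
If you switch to the rerank endpoint, the request body changes as well. A sketch assuming TEI's documented rerank payload (a `query` plus a list of `texts`); the model URL is a placeholder:

```bash
curl -X POST https://model-xxx.api.baseten.co/environments/production/predict \
  -H "Authorization: Api-Key YOUR_API_KEY" \
  -d '{"query": "What is Deep Learning?", "texts": ["Deep Learning is ...", "Cooking recipes ..."]}'
```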

## Call your model

### curl

To generate embeddings, you can use the following command:
```bash
truss predict --d '{"texts": "This is a test"}'
curl -X POST https://model-xxx.api.baseten.co/development/predict \
-H "Authorization: Api-Key YOUR_API_KEY" \
-d '{"input": "text string"}'
```

# Notes
- The base image is created by installing python on one of the images provided here: https://github.com/huggingface/text-embeddings-inference?tab=readme-ov-file. The current example was built for Ampere 80 architecture, which includes the A100.
- Multi-GPU appears to have no impact on performance
- Be aware of the token limit for each embedding model. It is currently up to the caller to ensure that the texts do not exceed the token limit.

# Improvements
- It may be possible to create a universal base image using the `-all` dockerfile to support a GPU-agnostic implementation
- handle truncation / chunking with averaging (or other technique) when tokens > supported
- investigate impact of dtype on performance
- Add prompt support to embed with prompt
### Python `requests` library

```python
import os
import requests

resp = requests.post(
    "https://model-xxx.api.baseten.co/environments/production/predict",
    headers={"Authorization": f"Api-Key {os.environ['BASETEN_API_KEY']}"},
    json={"input": ["text string", "second string"]},
)

print(resp.json())
```
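
Since `/v1/embeddings` returns an OpenAI-style response, the vectors can be pulled out of the `data` field (a sketch, assuming that response shape):

```python
# Assuming `resp` from the request above.
vectors = [item["embedding"] for item in resp.json()["data"]]
print(len(vectors), len(vectors[0]))  # number of inputs, embedding dimension
```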


## Support

If you have any questions or need assistance, please open an issue in this repository or contact our support team.
49 changes: 27 additions & 22 deletions text-embeddings-inference/config.yaml
@@ -1,25 +1,30 @@
base_image:
  image: vshulman/ampere-truss-custom-text-embeddings-inference:1.0
  python_executable_path: /usr/bin/python
build:
  arguments:
    model_id: nomic-ai/nomic-embed-text-v1.5
  model_server: TrussServer
environment_variables: {}
external_package_dirs: []
model_cache:
- repo_id: nomic-ai/nomic-embed-text-v1.5
model_metadata: {}
model_name: TEI Experiment
python_version: py39
runtime:
  predict_concurrency: 512
requirements: []
  # select an image: L4
  # CPU baseten/text-embeddings-inference-mirror:cpu-1.6
  # Turing (T4, ...) baseten/text-embeddings-inference-mirror:turing-1.6
  # Ampere 80 (A100, A30) baseten/text-embeddings-inference-mirror:1.6
  # Ampere 86 (A10, A10G, A40, ...) baseten/text-embeddings-inference-mirror:86-1.6
  # Ada Lovelace (L4, ...) baseten/text-embeddings-inference-mirror:89-1.6
  # Hopper (H100/H100 40GB) baseten/text-embeddings-inference-mirror:hopper-1.6
  image: baseten/text-embeddings-inference-mirror:89-1.6
model_metadata:
  repo_id: BAAI/bge-base-en-v1.5
docker_server:
  start_command: sh -c "text-embeddings-router --port 7997 --model-id /data/local-model --max-client-batch-size 256 --max-concurrent-requests 40 --max-batch-tokens 32768"
  readiness_endpoint: /health
  liveness_endpoint: /health
  # change to /rerank or /predict if you want to use the rerank or predict endpoint
  # https://huggingface.github.io/text-embeddings-inference/
  predict_endpoint: /v1/embeddings
  server_port: 7997
resources:
  accelerator: A100
  cpu: '1'
  memory: 2Gi
  accelerator: L4
  use_gpu: true
secrets: {}
system_packages:
- python3.10-venv
model_name: text-embeddings-inference trussless
build_commands: # optional step to download the weights of the model into the image
- git clone https://huggingface.co/BAAI/bge-base-en-v1.5 /data/local-model
runtime:
  predict_concurrency: 40
environment_variables:
  VLLM_LOGGING_LEVEL: WARNING
  hf_access_token: null
Empty file.
74 changes: 0 additions & 74 deletions text-embeddings-inference/model/model.py

This file was deleted.
