From cee381fcec0a94a7583ea83b04313495740a3468 Mon Sep 17 00:00:00 2001
From: Michael Feil <63565275+michaelfeil@users.noreply.github.com>
Date: Thu, 19 Dec 2024 06:11:42 +0100
Subject: [PATCH] infinity - Update config.yaml (#383)

Pins the version of infinity.
---
 custom-server/infinity-embedding-server/config.yaml | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/custom-server/infinity-embedding-server/config.yaml b/custom-server/infinity-embedding-server/config.yaml
index 00b5f778..afe6c8f3 100644
--- a/custom-server/infinity-embedding-server/config.yaml
+++ b/custom-server/infinity-embedding-server/config.yaml
@@ -1,16 +1,25 @@
 base_image:
   image: python:3.11-slim
 docker_server:
-  start_command: sh -c "infinity_emb v2 --model-id BAAI/bge-small-en-v1.5"
+  start_command: sh -c "HF_TOKEN=$(cat /secrets/hf_access_token) infinity_emb v2 --batch-size 64 --model-id BAAI/bge-small-en-v1.5 --revision main"
   readiness_endpoint: /health
   liveness_endpoint: /health
   predict_endpoint: /embeddings
   server_port: 7997
+build_commands: # Optional step: download the model weights into the image at build time.
+- sh -c "HF_TOKEN=$(cat /secrets/hf_access_token) infinity_emb v2 --preload-only --no-model-warmup --model-id BAAI/bge-small-en-v1.5 --revision main"
 resources:
   accelerator: L4
   use_gpu: true
 model_name: infinity-embedding-server
 requirements:
-- infinity-emb[all]
+- infinity-emb[all]==0.0.72
+runtime:
+  predict_concurrency: 40
 environment_variables:
   hf_access_token: null
+  # Constrain the API to at most 256 sentences per request, for better load balancing.
+  INFINITY_MAX_CLIENT_BATCH_SIZE: 256
+  # Constrain the model to a max backpressure of INFINITY_MAX_CLIENT_BATCH_SIZE * predict_concurrency = 10240 queued sentences.
+  INFINITY_QUEUE_SIZE: 10240
+  DO_NOT_TRACK: 1
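
---

Note (not part of the patch): a minimal client sketch for exercising the server configured above, assuming it is reachable locally on server_port 7997; a deployed endpoint would use its own URL and auth header instead, and the example texts are placeholders. infinity_emb v2 exposes an OpenAI-compatible /embeddings route.

    import requests

    # The "input" list may hold at most 256 sentences per request,
    # per INFINITY_MAX_CLIENT_BATCH_SIZE in the config above.
    resp = requests.post(
        "http://localhost:7997/embeddings",
        json={
            "model": "BAAI/bge-small-en-v1.5",
            "input": ["Hello, world!", "A second sentence to embed."],
        },
        timeout=30,
    )
    resp.raise_for_status()
    embeddings = [item["embedding"] for item in resp.json()["data"]]
    print(len(embeddings), len(embeddings[0]))  # 2 vectors, 384 dims for bge-small-en-v1.5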