From cee381fcec0a94a7583ea83b04313495740a3468 Mon Sep 17 00:00:00 2001
From: Michael Feil <63565275+michaelfeil@users.noreply.github.com>
Date: Thu, 19 Dec 2024 06:11:42 +0100
Subject: [PATCH] infinity - Update config.yaml (#383)

Pins the version of infinity.
---
 custom-server/infinity-embedding-server/config.yaml | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/custom-server/infinity-embedding-server/config.yaml b/custom-server/infinity-embedding-server/config.yaml
index 00b5f778..afe6c8f3 100644
--- a/custom-server/infinity-embedding-server/config.yaml
+++ b/custom-server/infinity-embedding-server/config.yaml
@@ -1,16 +1,25 @@
 base_image:
   image: python:3.11-slim
 docker_server:
-  start_command: sh -c "infinity_emb v2 --model-id BAAI/bge-small-en-v1.5"
+  start_command: sh -c "HF_TOKEN=$(cat /secrets/hf_access_token) infinity_emb v2 --batch-size 64 --model-id BAAI/bge-small-en-v1.5 --revision main"
   readiness_endpoint: /health
   liveness_endpoint: /health
   predict_endpoint: /embeddings
   server_port: 7997
+build_commands: # Optional step: download the model weights into the image at build time.
+- sh -c "HF_TOKEN=$(cat /secrets/hf_access_token) infinity_emb v2 --preload-only --no-model-warmup --model-id BAAI/bge-small-en-v1.5 --revision main"
 resources:
   accelerator: L4
   use_gpu: true
 model_name: infinity-embedding-server
 requirements:
-- infinity-emb[all]
+- infinity-emb[all]==0.0.72
+runtime:
+  predict_concurrency: 40
 environment_variables:
   hf_access_token: null
+  # Constrain the API to at most 256 sentences per request, for better load balancing.
+  INFINITY_MAX_CLIENT_BATCH_SIZE: 256
+  # Constrain the model to a max backpressure of INFINITY_MAX_CLIENT_BATCH_SIZE * predict_concurrency = 10240 queued sentences.
+  INFINITY_QUEUE_SIZE: 10240
+  DO_NOT_TRACK: 1
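
---

Note (not part of the patch): a minimal client sketch for exercising the server configured above, assuming it is reachable locally on server_port 7997; a deployed endpoint would use its own URL and auth header instead, and the example texts are placeholders. infinity_emb v2 exposes an OpenAI-compatible /embeddings route.

    import requests

    # The "input" list may hold at most 256 sentences per request,
    # per INFINITY_MAX_CLIENT_BATCH_SIZE in the config above.
    resp = requests.post(
        "http://localhost:7997/embeddings",
        json={
            "model": "BAAI/bge-small-en-v1.5",
            "input": ["Hello, world!", "A second sentence to embed."],
        },
        timeout=30,
    )
    resp.raise_for_status()
    embeddings = [item["embedding"] for item in resp.json()["data"]]
    print(len(embeddings), len(embeddings[0]))  # 2 vectors, 384 dims for bge-small-en-v1.5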