Skip to content

Commit

Permalink
add max-concurrent-requests as option in TGI server (#331)
Browse files Browse the repository at this point in the history
  • Loading branch information
annapendleton authored Mar 11, 2024
1 parent 77e0d1a commit 0160a85
Show file tree
Hide file tree
Showing 3 changed files with 14 additions and 1 deletion.
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ resource "kubernetes_manifest" "default" {
namespace = var.namespace
model_id = var.model_id
gpu_count = var.gpu_count
max_concurrent_requests = var.max_concurrent_requests
ksa = var.ksa
hugging_face_token_secret_list = local.hugging_face_token_secret == null ? [] : [local.hugging_face_token_secret]
}))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ spec:
ports:
- containerPort: 80
image: "ghcr.io/huggingface/text-generation-inference:1.4.2"
args: ["--model-id", "${model_id}", "--num-shard", "${gpu_count}"] # , "{token}" tensor parallelism, should correspond to number of gpus below
args: ["--model-id", "${model_id}", "--num-shard", "${gpu_count}", "--max-concurrent-requests", "${max_concurrent_requests}"]
%{ for hugging_face_token_secret in hugging_face_token_secret_list ~}
env:
- name: HUGGING_FACE_HUB_TOKEN # Related token consumption
Expand Down
12 changes: 12 additions & 0 deletions benchmarks/inference-server/text-generation-inference/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,18 @@ variable "gpu_count" {
}
}

# Upper bound on the number of requests the TGI server processes concurrently.
# The value is interpolated verbatim into the container's
# `--max-concurrent-requests` argument, so it has to be a positive whole number.
variable "max_concurrent_requests" {
  description = "Maximum number of concurrent requests TGI will handle at once. Once this limit is reached, TGI rejects additional incoming requests instead of queueing them."
  type        = number
  nullable    = false
  # TODO: default is the same as TGI's own default for now; update with a
  # reasonable number once benchmarking data is available.
  default = 128
  validation {
    # `number` also admits fractional values (e.g. 1.5), which would only fail
    # later when the TGI launcher parses its arguments; reject them at plan time.
    condition     = var.max_concurrent_requests > 0 && floor(var.max_concurrent_requests) == var.max_concurrent_requests
    error_message = "Max concurrent requests must be a whole number greater than 0."
  }
}

variable "ksa" {
description = "Kubernetes Service Account used for workload."
type = string
Expand Down

0 comments on commit 0160a85

Please sign in to comment.