Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add max-concurrent-requests as option in TGI server #331

Merged
merged 1 commit into from
Mar 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ resource "kubernetes_manifest" "default" {
namespace = var.namespace
model_id = var.model_id
gpu_count = var.gpu_count
max_concurrent_requests = var.max_concurrent_requests
ksa = var.ksa
hugging_face_token_secret_list = local.hugging_face_token_secret == null ? [] : [local.hugging_face_token_secret]
}))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ spec:
ports:
- containerPort: 80
image: "ghcr.io/huggingface/text-generation-inference:1.4.2"
args: ["--model-id", "${model_id}", "--num-shard", "${gpu_count}"] # , "{token}" tensor parallelism, should correspond to number of gpus below
args: ["--model-id", "${model_id}", "--num-shard", "${gpu_count}", "--max-concurrent-requests", "${max_concurrent_requests}"]
%{ for hugging_face_token_secret in hugging_face_token_secret_list ~}
env:
- name: HUGGING_FACE_HUB_TOKEN # Related token consumption
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,18 @@ variable "gpu_count" {
}
}

# Upper bound on in-flight requests for the TGI server. The value is rendered
# into the container args as `--max-concurrent-requests`, so it must be a
# positive whole number; anything else is rejected here at plan time.
variable "max_concurrent_requests" {
  description = "Maximum number of requests TGI handles concurrently. Once this limit is reached, TGI refuses additional requests until in-flight ones complete."
  type        = number
  nullable    = false

  # TODO: default is same as tgi's default for now, update with reasonable number.
  default = 128

  validation {
    # `number` also admits fractional values (e.g. 1.5); the floor() comparison
    # rejects them alongside zero and negative values.
    condition     = var.max_concurrent_requests > 0 && floor(var.max_concurrent_requests) == var.max_concurrent_requests
    error_message = "Max concurrent requests must be a whole number greater than 0."
  }
}

variable "ksa" {
description = "Kubernetes Service Account used for workload."
type = string
Expand Down