-
Notifications
You must be signed in to change notification settings - Fork 384
[Misc] SLO-aware router with profile support #1192
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
96f4c12
3eba519
a6e81bc
a4a56da
5f533d9
dbfc1b9
1273614
f6975ef
9b3b338
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
apiVersion: kustomize.config.k8s.io/v1beta1 | ||
kind: Kustomization | ||
|
||
namespace: aibrix-system | ||
|
||
namePrefix: aibrix- | ||
|
||
resources: | ||
- ../../../gateway/gateway-plugin | ||
|
||
images: | ||
- name: gateway-plugins | ||
newName: aibrix/gateway-plugins | ||
newTag: nightly | ||
|
||
patches: | ||
- patch: |- # Use the '|' and '-' for inline patching | ||
apiVersion: apps/v1 | ||
kind: Deployment | ||
metadata: | ||
name: gateway-plugins | ||
spec: | ||
template: | ||
spec: | ||
containers: | ||
- name: gateway-plugin | ||
args: | ||
- -v=5 | ||
env: | ||
- name: AIBRIX_POD_METRIC_REFRESH_INTERVAL_MS | ||
value: "60000" | ||
- name: AIBRIX_GPU_OPTIMIZER_TRACING_FLAG | ||
value: "true" | ||
target: | ||
kind: Deployment | ||
name: gateway-plugins | ||
namespace: system | ||
version: v1 |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
apiVersion: apps/v1 | ||
kind: Deployment | ||
metadata: | ||
name: gateway-plugins | ||
namespace: aibrix-system | ||
spec: | ||
replicas: 1 | ||
template: | ||
spec: | ||
affinity: | ||
nodeAffinity: # prevent the gateway pod from being placed on a GPU node. | ||
preferredDuringSchedulingIgnoredDuringExecution: | ||
- weight: 100 | ||
preference: | ||
matchExpressions: | ||
- key: vke.node.gpu.schedule | ||
operator: NotIn | ||
values: | ||
- nvidia | ||
containers: | ||
- name: gateway-plugin | ||
resources: | ||
limits: | ||
cpu: "2" | ||
memory: 8Gi | ||
requests: | ||
cpu: "2" | ||
memory: 8Gi | ||
env: | ||
- name: AIBRIX_PREFIX_CACHE_TOKENIZER_TYPE | ||
value: "character" | ||
- name: AIBRIX_PREFIX_CACHE_BLOCK_SIZE | ||
value: "128" | ||
- name: AIBRIX_PREFIX_CACHE_BLOCK_NUMBER | ||
value: "200000" | ||
- name: AIBRIX_PREFIX_CACHE_POD_RUNNING_REQUEST_IMBALANCE_ABS_COUNT | ||
value: "16" | ||
- name: AIBRIX_PREFIX_CACHE_STANDARD_DEVIATION_FACTOR | ||
value: "2" |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,17 +1,16 @@ | ||
apiVersion: kustomize.config.k8s.io/v1beta1 | ||
kind: Kustomization | ||
|
||
namespace: aibrix-system | ||
|
||
namePrefix: aibrix- | ||
|
||
resources: | ||
- ../../../gateway/gateway-plugin | ||
- ../../dev/gateway-plugin | ||
|
||
patches: | ||
- path: gateway_plugins_patch.yaml | ||
|
||
images: | ||
- name: busybox | ||
newName: aibrix-container-registry-cn-beijing.cr.volces.com/aibrix/busybox | ||
newTag: stable | ||
- name: gateway-plugins | ||
- name: aibrix/gateway-plugins | ||
newName: aibrix-container-registry-cn-beijing.cr.volces.com/aibrix/gateway-plugins | ||
newTag: nightly |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -113,5 +113,17 @@ test-gateway2: | |
"max_tokens": 512 \ | ||
}' | ||
|
||
test-router: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can this make target rename to There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In fact, test-router is just for showcases. I can change the strategy to least-request. |
||
curl -v http://localhost:8888/v1/chat/completions \ | ||
-H "model: llama2-7b" \ | ||
-H "Content-Type: application/json" \ | ||
-H "Authorization: Bearer any_key" \ | ||
-H "routing-strategy: least-request" \ | ||
-d '{ \ | ||
"model": "llama2-7b", \ | ||
"messages": [{"role": "user", "content": "Say this is a test!"}], \ | ||
"temperature": 0.7 \ | ||
}' | ||
|
||
metrics: | ||
curl http://localhost:8000/metrics |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -28,6 +28,9 @@ type Cache interface { | |
ModelCache | ||
MetricCache | ||
RequestTracker | ||
ProfileCache | ||
types.OutputPredictorProvider | ||
types.RouterProvider | ||
} | ||
|
||
// PodCache defines operations for pod information caching | ||
|
@@ -106,7 +109,10 @@ type MetricCache interface { | |
|
||
// RequestTracker defines operations for track workload statistics | ||
type RequestTracker interface { | ||
// AddRequestCount starts tracking request count | ||
// AddRequestCount tracks the start of a request after routing. | ||
// To support realtime statistics update and access, AddRequestCount can be called multiple times for a request. | ||
// As a result, implementations should ensure thread-safe access to the counter and idempotency. | ||
// | ||
// Parameters: | ||
// ctx: Routing context | ||
// requestID: Unique request identifier | ||
|
@@ -115,14 +121,18 @@ type RequestTracker interface { | |
// int64: Trace term identifier | ||
AddRequestCount(ctx *types.RoutingContext, requestID string, modelName string) (traceTerm int64) | ||
|
||
// DoneRequestCount completes request count tracking, only one DoneRequestXXX should be called for a request | ||
// DoneRequestCount tracks the completion of a request without usage information like inputTokens and outputTokens. | ||
// Only one DoneRequestXXX should be called for a request. Idempotency is not required. | ||
// | ||
// Parameters: | ||
// requestID: Unique request identifier | ||
// modelName: Name of the model | ||
// traceTerm: Trace term identifier | ||
DoneRequestCount(ctx *types.RoutingContext, requestID string, modelName string, traceTerm int64) | ||
|
||
// DoneRequestTrace completes request tracing, only one DoneRequestXXX should be called for a request | ||
// DoneRequestTrace tracks the completion of a request with usage information like inputTokens and outputTokens. | ||
// Only one DoneRequestXXX should be called for a request. Idempotency is not required. | ||
// | ||
// Parameters: | ||
// ctx: Routing context | ||
// requestID: Unique request identifier | ||
|
@@ -132,3 +142,18 @@ type RequestTracker interface { | |
// traceTerm: Trace term identifier | ||
DoneRequestTrace(ctx *types.RoutingContext, requestID string, modelName string, inputTokens, outputTokens, traceTerm int64) | ||
} | ||
|
||
// ProfileCache defines operations for model profiles | ||
type ProfileCache interface { | ||
// GetModelProfileByPod gets model profile for a pod | ||
// Parameters: | ||
// pod: Pod object | ||
// modelName: Name of the model | ||
GetModelProfileByPod(pod *v1.Pod, modelName string) (*ModelGPUProfile, error) | ||
|
||
// GetModelProfileByDeploymentName gets model profile for a deployment | ||
// Parameters: | ||
// deploymentName: Name of the deployment | ||
// modelName: Name of the model | ||
GetModelProfileByDeploymentName(deploymentName string, modelName string) (*ModelGPUProfile, error) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. TODO: we may use other objects to orchestrate pods in future. in that case, deployment might be changed in future. This looks good at this moment. one more problem is, deployment without namespace can not be used to identify a deployment. we need to append the namespace field There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In the case of deployment using other objects, the GPU optimizer would have been changed as well (it monitors deployment only). For the support of ray clusters, let me keep a note, leave this comment open, and add an issue after merging. Can you explain the cases where "deployment without namespace can not be used to identify a deployment"? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I mean There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The key is in fact in the format aibrix:profile_[model_name]_[deployment_name], the name is unique across namespaces given:
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can't we deploy the same model name in different namespaces? |
||
} |
Uh oh!
There was an error while loading. Please reload this page.