1 change: 1 addition & 0 deletions .cd/README.md
@@ -78,6 +78,7 @@ cd vllm-gaudi/.cd/
- `VLLM_DECODE_BS_BUCKET_STEP` - Determines the batch size step for decode operations, influencing how batches are grouped and processed.
- `VLLM_PROMPT_BS_BUCKET_STEP` - Sets the batch size step for prompt processing, impacting how prompt batches are handled.
- `VLLM_PROMPT_SEQ_BUCKET_STEP` - Controls the step size for prompt sequence allocation, affecting how sequences are bucketed for processing.
+ - `VLLM_PROMPT_CTX_BUCKET_STEP` - Controls the step size for prompt context (ctx) allocation, affecting how context blocks are bucketed for processing.

**Example usage:**

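The README's own usage example is collapsed in this view. As a rough illustration of what the `*_BUCKET_STEP` knobs control, here is a minimal sketch of step-based bucket padding, assuming a plain round-up-to-step rule; the real warm-up schedule in vllm-gaudi also includes a ramp-up phase below the step, so treat this only as an illustration of the step parameter:

```python
def pad_to_bucket(value: int, cfg: tuple[int, int, int]) -> int:
    """Pad `value` to a bucket boundary given a (min, step, max) config.

    Illustration only: clamp to `min`, round up to the next multiple
    of `step`, and cap at `max`.
    """
    bmin, bstep, bmax = cfg
    bucket = max(bmin, -(-value // bstep) * bstep)  # ceil division, then rescale
    return min(bucket, bmax)

# A larger step means fewer distinct shapes to warm up, at the cost of
# more padding per request:
assert pad_to_bucket(17, (0, 1, 31)) == 17   # step=1: every value is its own bucket
assert pad_to_bucket(17, (0, 4, 31)) == 20   # step=4: padded up to 20
```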
3 changes: 3 additions & 0 deletions docs/configuration/env_vars.md
@@ -32,6 +32,9 @@
- sequence length min (`VLLM_PROMPT_SEQ_BUCKET_MIN`): `block_size`
- sequence length step (`VLLM_PROMPT_SEQ_BUCKET_STEP`): `block_size`
- sequence length max (`VLLM_PROMPT_SEQ_BUCKET_MAX`): `1024`
+ - sequence ctx min (`VLLM_PROMPT_CTX_BUCKET_MIN`): `0`
+ - sequence ctx step (`VLLM_PROMPT_CTX_BUCKET_STEP`): `1`
+ - sequence ctx max (`VLLM_PROMPT_CTX_BUCKET_MAX`): `(max_model_len - block_size) // block_size`
- Decode:
- batch size min (`VLLM_DECODE_BS_BUCKET_MIN`): `1`
- batch size step (`VLLM_DECODE_BS_BUCKET_STEP`): `min(max_num_seqs, 32)`
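To make the new ctx defaults concrete, a quick worked example with hypothetical values (`block_size` and `max_model_len` are deployment-specific here, not defaults taken from these docs):

```python
block_size = 128       # hypothetical
max_model_len = 4096   # hypothetical

ctx_min = 0
ctx_step = 1
ctx_max = (max_model_len - block_size) // block_size

print(ctx_min, ctx_step, ctx_max)  # 0 1 31 -> one bucket per possible ctx block count
```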
8 changes: 6 additions & 2 deletions vllm_gaudi/extension/bucketing/linear.py
@@ -22,7 +22,7 @@ def get_prompt_cfgs(self, max_num_prefill_seqs, block_size, max_num_batched_toke
step=block_size,
max=max_model_len)
max_ctx = math.ceil((max_model_len - prompt_query_bucket_cfg[0]) // block_size)
- prompt_ctx_bucket_cfg = [0, 1, max_ctx]
+ prompt_ctx_bucket_cfg = read_bucket_settings('prompt', 'ctx', min=0, step=1, max=max_ctx)

if use_merged_prefill:
prev_prompt_bs_bucket_cfg = tuple(prompt_bs_bucket_cfg)
@@ -32,7 +32,11 @@
prompt_bs_bucket_cfg = (1, 1, 1)
query_min, query_step, _ = prev_prompt_query_bucket_cfg
prompt_query_bucket_cfg = (query_min, query_step * 4, max_num_batched_tokens)
- prompt_ctx_bucket_cfg = (0, 4, max_ctx * max_num_prefill_seqs)
+ prompt_ctx_bucket_cfg = read_bucket_settings('prompt',
+                                              'ctx',
+                                              min=0,
+                                              step=4,
+                                              max=max_ctx * max_num_prefill_seqs)

msg = ('Merged prefill is enabled!\n'
'Overriding prompt bucketing settings!\n'
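The effect of this change is that both ctx configs now honor the `VLLM_PROMPT_CTX_BUCKET_*` environment variables instead of being hard-coded. `read_bucket_settings` itself is not shown in this diff; a plausible sketch of its behavior, assuming it simply prefers the environment over the caller's defaults:

```python
import os

def read_bucket_settings(phase: str, dim: str, **defaults: int) -> list[int]:
    """Hypothetical sketch of the helper used above (not part of this diff).

    For each of min/step/max, take VLLM_{PHASE}_{DIM}_BUCKET_{PARAM} from
    the environment if set, otherwise fall back to the caller's default.
    """
    return [
        int(os.environ.get(f'VLLM_{phase}_{dim}_BUCKET_{param}'.upper(), defaults[param]))
        for param in ('min', 'step', 'max')
    ]

# read_bucket_settings('prompt', 'ctx', min=0, step=1, max=31)
# -> [0, 1, 31] unless e.g. VLLM_PROMPT_CTX_BUCKET_STEP overrides the step
```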
3 changes: 3 additions & 0 deletions vllm_gaudi/extension/features.py
@@ -23,6 +23,9 @@ def get_user_flags():
Env('VLLM_PROMPT_SEQ_BUCKET_STEP', int),
Env('VLLM_PROMPT_SEQ_BUCKET_MAX', int),
Env('VLLM_PROMPT_SEQ_BUCKET_LIMIT', int),
+ Env('VLLM_PROMPT_CTX_BUCKET_MIN', int),
+ Env('VLLM_PROMPT_CTX_BUCKET_STEP', int),
+ Env('VLLM_PROMPT_CTX_BUCKET_MAX', int),
Env('VLLM_DECODE_BS_BUCKET_MIN', int),
Env('VLLM_DECODE_BS_BUCKET_STEP', int),
Env('VLLM_DECODE_BS_BUCKET_MAX', int),
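With the flags registered, the new knobs can be overridden like any other bucketing variable. A hypothetical tuning example (the values are illustrative, not recommendations, and must be set in the environment before vLLM starts so the flags are picked up):

```python
import os

# Coarsen prompt-ctx bucketing: fewer graph shapes to warm up, more padding.
os.environ['VLLM_PROMPT_CTX_BUCKET_MIN'] = '0'
os.environ['VLLM_PROMPT_CTX_BUCKET_STEP'] = '4'
os.environ['VLLM_PROMPT_CTX_BUCKET_MAX'] = '64'
```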