Skip to content

Commit

Permalink
Add cli arg --speculation-max-batch-size
Browse files Browse the repository at this point in the history
  • Loading branch information
tgaddair committed Nov 19, 2024
1 parent 80730b7 commit 7ca481b
Showing 1 changed file with 15 additions and 0 deletions.
15 changes: 15 additions & 0 deletions launcher/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -350,6 +350,10 @@ struct Args {
#[clap(long, env)]
speculative_tokens: Option<usize>,

/// The maximum batch size past which speculative decoding is disabled.
#[clap(long, env)]
speculation_max_batch_size: Option<usize>,

/// The list of adapter ids to preload during initialization (to avoid cold start times).
#[clap(long, env)]
preloaded_adapter_ids: Vec<String>,
Expand Down Expand Up @@ -638,6 +642,7 @@ fn shard_manager(
quantize: Option<Quantization>,
compile: bool,
speculative_tokens: Option<usize>,
speculation_max_batch_size: Option<usize>,
preloaded_adapter_ids: Vec<String>,
preloaded_adapter_source: Option<String>,
predibase_api_token: Option<String>,
Expand Down Expand Up @@ -802,6 +807,14 @@ fn shard_manager(
envs.push(("CHUNKED_PREFILL".into(), chunked_prefill.to_string().into()));
}

// Speculative decoding max batch size
if let Some(speculation_max_batch_size) = speculation_max_batch_size {
envs.push((
"LORAX_SPECULATION_MAX_BATCH_SIZE".into(),
speculation_max_batch_size.to_string().into(),
));
}

// Backend
if backend == Backend::FlashInfer {
envs.push(("FLASH_INFER".into(), "1".into()));
Expand Down Expand Up @@ -1244,6 +1257,7 @@ fn spawn_shards(
let quantize = args.quantize;
let compile = args.compile;
let speculative_tokens = args.speculative_tokens;
let speculation_max_batch_size = args.speculation_max_batch_size;
let preloaded_adapter_ids = args.preloaded_adapter_ids.clone();
let preloaded_adapter_source = args.preloaded_adapter_source.clone();
let predibase_api_token = args.predibase_api_token.clone();
Expand Down Expand Up @@ -1271,6 +1285,7 @@ fn spawn_shards(
quantize,
compile,
speculative_tokens,
speculation_max_batch_size,
preloaded_adapter_ids,
preloaded_adapter_source,
predibase_api_token,
Expand Down

0 comments on commit 7ca481b

Please sign in to comment.