@@ -542,8 +542,6 @@ def __init__(
         self.block_size: int = 64
         # Engine worker queue port
         self.engine_worker_queue_port: str = "9923"
-        # Max model len
-        self.max_model_len: int = 3072  # max_seq_len
         # cuda visible devices
         self.device_ids: str = "0"
         # Input dtype
@@ -1402,7 +1400,6 @@ def __init__(
         plas_attention_config: PlasAttentionConfig = None,
         speculative_config: SpeculativeConfig = None,
         tokenizer: str = None,
-        max_model_len: int = 8192,
         ips: str = None,
         use_warmup: bool = False,
         limit_mm_per_prompt: Optional[Dict[str, Any]] = None,
@@ -1470,7 +1467,6 @@ def __init__(
             if ip == self.host_ip:
                 self.node_rank = idx
 
-        self.max_model_len = max_model_len
         self.limit_mm_per_prompt = limit_mm_per_prompt
         self.mm_processor_kwargs = mm_processor_kwargs
         self.use_warmup = use_warmup
@@ -1534,20 +1530,20 @@ def postprocess(self):
         if self.scheduler_config.max_num_batched_tokens is None:
             if int(envs.ENABLE_V1_KVCACHE_SCHEDULER):
                 if paddle.is_compiled_with_xpu():
-                    self.scheduler_config.max_num_batched_tokens = self.max_model_len
+                    self.scheduler_config.max_num_batched_tokens = self.model_config.max_model_len
                 else:
                     self.scheduler_config.max_num_batched_tokens = 8192  # if set to max_model_len, it's easy to be OOM
             else:
                 if self.cache_config.enable_chunked_prefill:
                     self.scheduler_config.max_num_batched_tokens = 2048
                 else:
-                    self.scheduler_config.max_num_batched_tokens = self.max_model_len
+                    self.scheduler_config.max_num_batched_tokens = self.model_config.max_model_len
 
         if self.long_prefill_token_threshold == 0:
-            self.long_prefill_token_threshold = int(self.max_model_len * 0.04)
+            self.long_prefill_token_threshold = int(self.model_config.max_model_len * 0.04)
 
         self.cache_config.postprocess(self.scheduler_config.max_num_batched_tokens, self.scheduler_config.max_num_seqs)
-        self.cache_config.max_block_num_per_seq = int(self.max_model_len // self.cache_config.block_size)
+        self.cache_config.max_block_num_per_seq = int(self.model_config.max_model_len // self.cache_config.block_size)
         if self.model_config is not None and self.model_config.enable_mm:
             self.cache_config.enable_prefix_caching = False
@@ -1576,7 +1572,9 @@ def check(self):
15761572 f"but now it's { self .scheduler_config .max_num_seqs } ."
15771573 )
15781574 assert self .nnode >= 1 , f"nnode: { self .nnode } should no less than 1"
1579- assert self .max_model_len >= 16 , f"max_model_len: { self .max_model_len } should be larger than 16"
1575+ assert (
1576+ self .model_config .max_model_len >= 16
1577+ ), f"max_model_len: { self .model_config .max_model_len } should be larger than 16"
15801578 assert (
15811579 self .scheduler_config .max_num_seqs >= 1
15821580 ), f"max_num_seqs: { self .scheduler_config .max_num_seqs } should be larger than 1"
@@ -1585,10 +1583,11 @@ def check(self):
15851583 f"should be larger than or equal to max_num_seqs: { self .scheduler_config .max_num_seqs } "
15861584 )
15871585 assert (
1588- self .scheduler_config .max_num_batched_tokens <= self .max_model_len * self .scheduler_config .max_num_seqs
1586+ self .scheduler_config .max_num_batched_tokens
1587+ <= self .model_config .max_model_len * self .scheduler_config .max_num_seqs
15891588 ), (
15901589 f"max_num_batched_tokens: { self .scheduler_config .max_num_batched_tokens } should be larger"
1591- f"than or equal to max_num_seqs: { self .scheduler_config .max_num_seqs } * max_model_len: { self .max_model_len } "
1590+ f"than or equal to max_num_seqs: { self .scheduler_config .max_num_seqs } * max_model_len: { self .model_config . max_model_len } "
15921591 )
15931592 assert (
15941593 self .max_num_partial_prefills >= 1
@@ -1609,9 +1608,9 @@ def check(self):
 
         if not self.cache_config.enable_chunked_prefill:
             if not envs.ENABLE_V1_KVCACHE_SCHEDULER:
-                assert self.scheduler_config.max_num_batched_tokens >= self.max_model_len, (
+                assert self.scheduler_config.max_num_batched_tokens >= self.model_config.max_model_len, (
                     f"max_num_batched_tokens: {self.scheduler_config.max_num_batched_tokens}"
-                    f"should be larger than or equal to max_model_len: {self.max_model_len}"
+                    f"should be larger than or equal to max_model_len: {self.model_config.max_model_len}"
                 )
             else:
                 assert self.scheduler_config.max_num_batched_tokens >= self.cache_config.block_size, (
@@ -1623,9 +1622,9 @@ def check(self):
             assert (
                 self.cache_config.enable_chunked_prefill is True
             ), "Chunked prefill must be enabled to set max_num_partial_prefills > 1"
-            assert self.long_prefill_token_threshold < self.max_model_len, (
+            assert self.long_prefill_token_threshold < self.model_config.max_model_len, (
                 f"long_prefill_token_threshold: {self.long_prefill_token_threshold} should be less than"
-                f" max_model_len: {self.max_model_len}"
+                f" max_model_len: {self.model_config.max_model_len}"
             )
 
         if self.guided_decoding_backend is not None:
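
Taken together, these hunks drop the duplicated top-level max_model_len attribute and make model_config.max_model_len the single source of truth that postprocess() and check() read. Below is a minimal sketch of the resulting access pattern; ModelConfig and Config here are simplified stand-ins for illustration only, not the actual FastDeploy classes or their full signatures.

from dataclasses import dataclass


@dataclass
class ModelConfig:
    # Sole owner of the sequence-length limit after this change.
    max_model_len: int = 8192


@dataclass
class Config:
    model_config: ModelConfig

    def check(self):
        # Validation reads the value through model_config,
        # mirroring the assertions updated in this diff.
        assert self.model_config.max_model_len >= 16


Config(model_config=ModelConfig(max_model_len=3072)).check()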