
Commit dd691a4

Merge branch 'develop' into pre
2 parents ddb5276 + a21e16e commit dd691a4

37 files changed, +134 -124 lines changed
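Taken together, the hunks below remove the standalone max_model_len attribute from FDConfig (and the copies read from parallel_config) and route every consumer through model_config.max_model_len. A minimal sketch of the access pattern after this change; the stub classes below are illustrative, only the attribute names model_config and max_model_len come from the diff:

from dataclasses import dataclass, field

# Illustrative stand-ins for the FastDeploy config objects; only the attribute
# names touched by this commit (model_config.max_model_len) are taken from the diff.
@dataclass
class ModelConfigSketch:
    max_model_len: int = 8192  # the single remaining source of truth

@dataclass
class FDConfigSketch:
    model_config: ModelConfigSketch = field(default_factory=ModelConfigSketch)

cfg = FDConfigSketch()

# Before: cfg.max_model_len (attribute removed by this commit)
# After:  read the limit from the model config instead.
max_len = cfg.model_config.max_model_len
print(max_len)  # 8192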

fastdeploy/config.py

Lines changed: 14 additions & 15 deletions
@@ -542,8 +542,6 @@ def __init__(
         self.block_size: int = 64
         # Engine worker queue port
         self.engine_worker_queue_port: str = "9923"
-        # Max model len
-        self.max_model_len: int = 3072  # max_seq_len
         # cuda visible devices
         self.device_ids: str = "0"
         # Input dtype
@@ -1402,7 +1400,6 @@ def __init__(
         plas_attention_config: PlasAttentionConfig = None,
         speculative_config: SpeculativeConfig = None,
         tokenizer: str = None,
-        max_model_len: int = 8192,
         ips: str = None,
         use_warmup: bool = False,
         limit_mm_per_prompt: Optional[Dict[str, Any]] = None,
@@ -1470,7 +1467,6 @@ def __init__(
             if ip == self.host_ip:
                 self.node_rank = idx

-        self.max_model_len = max_model_len
         self.limit_mm_per_prompt = limit_mm_per_prompt
         self.mm_processor_kwargs = mm_processor_kwargs
         self.use_warmup = use_warmup
@@ -1534,20 +1530,20 @@ def postprocess(self):
         if self.scheduler_config.max_num_batched_tokens is None:
             if int(envs.ENABLE_V1_KVCACHE_SCHEDULER):
                 if paddle.is_compiled_with_xpu():
-                    self.scheduler_config.max_num_batched_tokens = self.max_model_len
+                    self.scheduler_config.max_num_batched_tokens = self.model_config.max_model_len
                 else:
                     self.scheduler_config.max_num_batched_tokens = 8192  # if set to max_model_len, it's easy to be OOM
             else:
                 if self.cache_config.enable_chunked_prefill:
                     self.scheduler_config.max_num_batched_tokens = 2048
                 else:
-                    self.scheduler_config.max_num_batched_tokens = self.max_model_len
+                    self.scheduler_config.max_num_batched_tokens = self.model_config.max_model_len

         if self.long_prefill_token_threshold == 0:
-            self.long_prefill_token_threshold = int(self.max_model_len * 0.04)
+            self.long_prefill_token_threshold = int(self.model_config.max_model_len * 0.04)

         self.cache_config.postprocess(self.scheduler_config.max_num_batched_tokens, self.scheduler_config.max_num_seqs)
-        self.cache_config.max_block_num_per_seq = int(self.max_model_len // self.cache_config.block_size)
+        self.cache_config.max_block_num_per_seq = int(self.model_config.max_model_len // self.cache_config.block_size)
         if self.model_config is not None and self.model_config.enable_mm:
             self.cache_config.enable_prefix_caching = False

@@ -1576,7 +1572,9 @@ def check(self):
             f"but now it's {self.scheduler_config.max_num_seqs}."
         )
         assert self.nnode >= 1, f"nnode: {self.nnode} should no less than 1"
-        assert self.max_model_len >= 16, f"max_model_len: {self.max_model_len} should be larger than 16"
+        assert (
+            self.model_config.max_model_len >= 16
+        ), f"max_model_len: {self.model_config.max_model_len} should be larger than 16"
         assert (
             self.scheduler_config.max_num_seqs >= 1
         ), f"max_num_seqs: {self.scheduler_config.max_num_seqs} should be larger than 1"
@@ -1585,10 +1583,11 @@ def check(self):
             f"should be larger than or equal to max_num_seqs: {self.scheduler_config.max_num_seqs}"
         )
         assert (
-            self.scheduler_config.max_num_batched_tokens <= self.max_model_len * self.scheduler_config.max_num_seqs
+            self.scheduler_config.max_num_batched_tokens
+            <= self.model_config.max_model_len * self.scheduler_config.max_num_seqs
         ), (
             f"max_num_batched_tokens: {self.scheduler_config.max_num_batched_tokens} should be larger"
-            f"than or equal to max_num_seqs: {self.scheduler_config.max_num_seqs} * max_model_len: {self.max_model_len}"
+            f"than or equal to max_num_seqs: {self.scheduler_config.max_num_seqs} * max_model_len: {self.model_config.max_model_len}"
         )
         assert (
             self.max_num_partial_prefills >= 1
@@ -1609,9 +1608,9 @@ def check(self):

         if not self.cache_config.enable_chunked_prefill:
             if not envs.ENABLE_V1_KVCACHE_SCHEDULER:
-                assert self.scheduler_config.max_num_batched_tokens >= self.max_model_len, (
+                assert self.scheduler_config.max_num_batched_tokens >= self.model_config.max_model_len, (
                     f"max_num_batched_tokens: {self.scheduler_config.max_num_batched_tokens} "
-                    f"should be larger than or equal to max_model_len: {self.max_model_len}"
+                    f"should be larger than or equal to max_model_len: {self.model_config.max_model_len}"
                 )
         else:
             assert self.scheduler_config.max_num_batched_tokens >= self.cache_config.block_size, (
@@ -1623,9 +1622,9 @@ def check(self):
             assert (
                 self.cache_config.enable_chunked_prefill is True
             ), "Chunked prefill must be enabled to set max_num_partial_prefills > 1"
-            assert self.long_prefill_token_threshold < self.max_model_len, (
+            assert self.long_prefill_token_threshold < self.model_config.max_model_len, (
                 f"long_prefill_token_threshold: {self.long_prefill_token_threshold} should be less than"
-                f" max_model_len: {self.max_model_len}"
+                f" max_model_len: {self.model_config.max_model_len}"
             )

         if self.guided_decoding_backend is not None:
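The postprocess() and check() hunks above tie several derived limits to model_config.max_model_len. The sketch below restates those relationships in isolation; the constants 8192, 2048, and 0.04 and the final invariant come from the diff, while the function name and signature are illustrative rather than FastDeploy API:

def derive_scheduler_limits(
    max_model_len: int,
    max_num_seqs: int,
    block_size: int,
    enable_chunked_prefill: bool,
    v1_kv_scheduler: bool,
    on_xpu: bool = False,
) -> dict:
    """Restate the defaults that FDConfig.postprocess() derives from max_model_len."""
    if v1_kv_scheduler:
        # XPU builds cap the batch at the full context window; other builds
        # default to 8192 because using max_model_len directly is easy to OOM.
        max_num_batched_tokens = max_model_len if on_xpu else 8192
    else:
        max_num_batched_tokens = 2048 if enable_chunked_prefill else max_model_len

    limits = {
        "max_num_batched_tokens": max_num_batched_tokens,
        "long_prefill_token_threshold": int(max_model_len * 0.04),
        "max_block_num_per_seq": max_model_len // block_size,
    }
    # check() enforces this invariant on whichever value ends up being used.
    assert limits["max_num_batched_tokens"] <= max_model_len * max_num_seqs
    return limits

print(derive_scheduler_limits(8192, 64, 64, enable_chunked_prefill=True, v1_kv_scheduler=True))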

fastdeploy/engine/args_utils.py

Lines changed: 0 additions & 1 deletion
@@ -1079,7 +1079,6 @@ def create_engine_config(self) -> FDConfig:
             cache_config=cache_cfg,
             load_config=load_cfg,
             parallel_config=parallel_cfg,
-            max_model_len=self.max_model_len,
             speculative_config=speculative_cfg,
             ips=self.ips,
             use_warmup=self.use_warmup,

fastdeploy/engine/common_engine.py

Lines changed: 1 addition & 1 deletion
@@ -630,7 +630,7 @@ def _fetch_request():
                 available_blocks=available_blocks,
                 block_size=self.cfg.cache_config.block_size,
                 reserved_output_blocks=self.cfg.cache_config.enc_dec_block_num,
-                max_num_batched_tokens=self.cfg.max_model_len,
+                max_num_batched_tokens=self.cfg.model_config.max_model_len,
                 batch=num_prefill_batch,
             )
             if self.cfg.scheduler_config.splitwise_role != "mixed":

fastdeploy/engine/engine.py

Lines changed: 9 additions & 11 deletions
@@ -187,15 +187,15 @@ def check_worker_initialize_status_func(res: dict):
             num_gpu_blocks = self.cfg.cache_config.num_gpu_blocks_override or self.cfg.cache_config.total_block_num
             num_cpu_blocks = self.cfg.cache_config.num_cpu_blocks
             max_running_requests = min(
-                (num_gpu_blocks + num_cpu_blocks) * block_size // self.cfg.max_model_len,
+                (num_gpu_blocks + num_cpu_blocks) * block_size // self.cfg.model_config.max_model_len,
                 self.cfg.scheduler_config.max_num_seqs,
             )
             console_logger.info(
                 f"Detected {num_gpu_blocks} gpu blocks and {num_cpu_blocks} cpu blocks in cache (block size: {block_size})."
             )
             console_logger.info(
                 f"FastDeploy will be serving {max_running_requests} running requests "
-                f"if each sequence reaches its maximum length: {self.cfg.max_model_len}"
+                f"if each sequence reaches its maximum length: {self.cfg.model_config.max_model_len}"
             )

             return True
@@ -248,30 +248,28 @@ def add_requests(self, task, sampling_params=None, **kwargs):
         chat_template_kwargs = kwargs.get("chat_template_kwargs") or {}
         chat_template_kwargs["chat_template"] = kwargs.get("chat_template")
         kwargs["chat_template_kwargs"] = chat_template_kwargs
-        request = self.data_processor.process_request(request, self.cfg.max_model_len, **kwargs)
+        request = self.data_processor.process_request(request, self.cfg.model_config.max_model_len, **kwargs)
         request.prompt_token_ids_len = len(request.prompt_token_ids)
         request.need_prefill_tokens = request.prompt_token_ids_len
         input_ids_len = request.prompt_token_ids_len
         request.set(
             "max_tokens",
             min(
-                self.cfg.max_model_len - input_ids_len,
+                self.cfg.model_config.max_model_len - input_ids_len,
                 request.get("max_tokens"),
             ),
         )
         min_tokens = request.get("min_tokens")
-        if input_ids_len + min_tokens >= self.cfg.max_model_len:
+        if input_ids_len + min_tokens >= self.cfg.model_config.max_model_len:
             error_msg = (
                 f"Input text is too long, length of prompt token({input_ids_len}) "
                 f"+ min_dec_len ({min_tokens}) >= max_model_len "
             )
             llm_logger.error(error_msg)
             raise EngineError(error_msg, error_code=400)

-        if input_ids_len > self.cfg.max_model_len:
-            error_msg = (
-                f"Length of input token({input_ids_len}) exceeds the limit max_model_len({self.cfg.max_model_len})."
-            )
+        if input_ids_len > self.cfg.model_config.max_model_len:
+            error_msg = f"Length of input token({input_ids_len}) exceeds the limit max_model_len({self.cfg.model_config.max_model_len})."
             llm_logger.error(error_msg)
             raise EngineError(error_msg, error_code=400)

@@ -506,7 +504,7 @@ def _start_worker_service(self):
         ips = ",".join(self.cfg.ips)
         arguments = (
             f" --devices {self.cfg.device_ids} {py_script}"
-            f" --max_num_seqs {self.cfg.scheduler_config.max_num_seqs} --max_model_len {self.cfg.max_model_len}"
+            f" --max_num_seqs {self.cfg.scheduler_config.max_num_seqs} --max_model_len {self.cfg.model_config.max_model_len}"
             f" --gpu_memory_utilization {self.cfg.cache_config.gpu_memory_utilization}"
             f" --model {self.cfg.model_config.model!s}"
             f" --device_ids {self.cfg.device_ids}"
@@ -587,7 +585,7 @@ def _format_and_add_data(self, prompts: dict):
             prompts["prompt"] = query_list

         if "max_tokens" not in prompts:
-            prompts["max_tokens"] = self.cfg.max_model_len
+            prompts["max_tokens"] = self.cfg.model_config.max_model_len

         self.add_requests(prompts)
         return prompts["request_id"]
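The add_requests() hunk clamps each request's max_tokens to the remaining context window and rejects prompts that cannot fit. Below is a hedged, standalone restatement of that arithmetic; Request, process_request, and EngineError are simplified away, and only the comparisons mirror the diff:

def clamp_and_validate(prompt_len: int, requested_max_tokens: int,
                       min_tokens: int, max_model_len: int) -> int:
    """Return the effective max_tokens for a request, or raise ValueError if it cannot fit."""
    if prompt_len > max_model_len:
        raise ValueError(
            f"Length of input token({prompt_len}) exceeds the limit max_model_len({max_model_len})."
        )
    if prompt_len + min_tokens >= max_model_len:
        raise ValueError(
            f"Input text is too long, length of prompt token({prompt_len}) "
            f"+ min_dec_len ({min_tokens}) >= max_model_len"
        )
    # Never allow generation to run past the context window.
    return min(max_model_len - prompt_len, requested_max_tokens)

print(clamp_and_validate(prompt_len=1000, requested_max_tokens=4096, min_tokens=1, max_model_len=3072))  # -> 2072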

fastdeploy/entrypoints/llm.py

Lines changed: 1 addition & 1 deletion
@@ -93,7 +93,7 @@ def __init__(
         # Create the Engine
         self.llm_engine = LLMEngine.from_engine_args(engine_args=engine_args)

-        self.default_sampling_params = SamplingParams(max_tokens=self.llm_engine.cfg.max_model_len)
+        self.default_sampling_params = SamplingParams(max_tokens=self.llm_engine.cfg.model_config.max_model_len)

         self.llm_engine.start()

fastdeploy/model_executor/layers/attention/append_attn_backend.py

Lines changed: 1 addition & 1 deletion
@@ -85,7 +85,7 @@ def __init__(
         super().__init__()
         self.attention_metadata: AppendAttentionMetadata = None
         self.block_size: int = fd_config.cache_config.block_size
-        self.max_seq_len: int = fd_config.parallel_config.max_model_len
+        self.max_seq_len: int = fd_config.model_config.max_model_len
         self.rope_theta: float = (
             10000.0 if fd_config.model_config.rope_theta is None else fd_config.model_config.rope_theta
         )

fastdeploy/model_executor/layers/attention/block_multihead_attn_backend.py

Lines changed: 1 addition & 1 deletion
@@ -81,7 +81,7 @@ def __init__(
         super().__init__()
         self.attention_metadata: BlockAttentionMetadata = None
         self.block_size = fd_config.cache_config.block_size
-        self.max_seq_len = fd_config.parallel_config.max_model_len
+        self.max_seq_len = fd_config.model_config.max_model_len
         self.rope_theta = 10000.0 if fd_config.model_config.rope_theta is None else fd_config.model_config.rope_theta
         self.rank = fd_config.parallel_config.tensor_parallel_rank

fastdeploy/model_executor/layers/attention/flash_attn_backend.py

Lines changed: 1 addition & 1 deletion
@@ -110,7 +110,7 @@ def __init__(
         """
         super().__init__()
         self.attention_metadata: FlashAttentionMetadata = None
-        self.max_seq_len = fd_config.parallel_config.max_model_len
+        self.max_seq_len = fd_config.model_config.max_model_len
         self.causal = getattr(fd_config.model_config, "causal", True)

         self.kv_num_heads = kv_num_heads

fastdeploy/model_executor/layers/attention/iluvatar_attn_backend.py

Lines changed: 1 addition & 1 deletion
@@ -73,7 +73,7 @@ def __init__(self, fd_config: FDConfig, kv_num_heads: int, num_heads: int, head_
         self.attention_metadata = IluvatarAttentionMetadata()
         self.block_size = fd_config.parallel_config.block_size
         assert self.block_size == 16, "Iluvatar paged attn requires block_size must be 16."
-        self.max_context_len = fd_config.parallel_config.max_model_len
+        self.max_context_len = fd_config.model_config.max_model_len
         self.causal = getattr(fd_config.model_config, "causal", True)
         self.speculate_method = getattr(fd_config.parallel_config, "speculate_method", None)
         self.use_speculate = self.speculate_method is not None

fastdeploy/model_executor/layers/attention/mla_attention_backend.py

Lines changed: 1 addition & 1 deletion
@@ -111,7 +111,7 @@ def __init__(

         # Basic configuration
         self.block_size: int = fd_config.cache_config.block_size
-        self.max_seq_len: int = fd_config.parallel_config.max_model_len
+        self.max_seq_len: int = fd_config.model_config.max_model_len
         self.rope_theta: float = (
             10000.0 if fd_config.model_config.rope_theta is None else fd_config.model_config.rope_theta
         )
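Each attention backend above receives the same one-line change: the sequence-length cap now comes from fd_config.model_config rather than fd_config.parallel_config. A minimal sketch of that shared __init__ pattern with stub config classes; the stubs and ToyAttentionBackend are hypothetical, only the attribute names mirror the diffs:

from dataclasses import dataclass
from typing import Optional

@dataclass
class _ModelConfigStub:
    max_model_len: int = 8192
    rope_theta: Optional[float] = None

@dataclass
class _CacheConfigStub:
    block_size: int = 64

@dataclass
class _FDConfigStub:
    model_config: _ModelConfigStub
    cache_config: _CacheConfigStub

class ToyAttentionBackend:
    """Hypothetical backend mirroring the __init__ lines touched above."""

    def __init__(self, fd_config: _FDConfigStub) -> None:
        self.block_size = fd_config.cache_config.block_size
        # After this commit, the per-sequence length cap is read from model_config.
        self.max_seq_len = fd_config.model_config.max_model_len
        self.rope_theta = (
            10000.0 if fd_config.model_config.rope_theta is None else fd_config.model_config.rope_theta
        )

backend = ToyAttentionBackend(_FDConfigStub(_ModelConfigStub(), _CacheConfigStub()))
print(backend.max_seq_len, backend.rope_theta)  # 8192 10000.0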
