
Commit a37c941

YuanRisheng and root authored
[FDConfig]Remove reasoning_parser/guided_decoding_backend/disable_any_whitespace/device_ids in FDConfig (#4362)
* remove devices id
* fix unittest
* fix ce

Co-authored-by: root <[email protected]>
1 parent d1637db commit a37c941
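
For code that consumes FDConfig, the practical effect is that four attributes moved: three onto a new StructuredOutputsConfig sub-config and device_ids onto ParallelConfig. A minimal sketch of the old and new access paths (`cfg` is assumed to be an already-constructed FDConfig):

    # Old attribute paths, removed by this commit:
    #   cfg.reasoning_parser
    #   cfg.guided_decoding_backend
    #   cfg.disable_any_whitespace
    #   cfg.device_ids

    # New paths after this commit:
    parser = cfg.structured_outputs_config.reasoning_parser
    backend = cfg.structured_outputs_config.guided_decoding_backend
    strict_ws = cfg.structured_outputs_config.disable_any_whitespace
    devices = cfg.parallel_config.device_ids.split(",")  # e.g. ["0", "1"]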

File tree

14 files changed: 73 additions & 59 deletions


fastdeploy/config.py

Lines changed: 40 additions & 29 deletions
@@ -187,7 +187,6 @@ def __init__(
         self.redundant_experts_num = 0
         self.seed = 0
         self.quantization = None
-        self.reasoning_parser = None
         self.pad_token_id: int = -1
         self.eos_tokens_lens: int = 2
         self.lm_head_fp32: bool = False
@@ -540,10 +539,6 @@ def __init__(
         # Do profile or not
         self.do_profile: bool = False

-        # guided decoding backend
-        self.guided_decoding_backend: str = None
-        # disable any whitespace for guided decoding
-        self.disable_any_whitespace: bool = True
         self.pod_ip: str = None
         # enable the custom all-reduce kernel and fall back to NCCL(dist.all_reduce).
         self.disable_custom_all_reduce: bool = False
@@ -1128,12 +1123,6 @@ class PoolerConfig:
     """


-class LoRAConfig:
-    """LoRA Config"""
-
-    pass
-
-
 class CacheConfig:
     """
     Configuration for the KV cache.
@@ -1364,6 +1353,25 @@ def print(self):
         logger.info("=============================================================")


+class StructuredOutputsConfig:
+    """
+    Configuration for structured outputs
+    """
+
+    def __init__(
+        self,
+        args,
+    ) -> None:
+        self.reasoning_parser: Optional[str] = None
+        self.guided_decoding_backend: Optional[str] = None
+        # disable any whitespace for guided decoding
+        self.disable_any_whitespace: bool = True
+
+        for key, value in args.items():
+            if hasattr(self, key) and value != "None":
+                setattr(self, key, value)
+
+
 class FDConfig:
     """
     The configuration class which contains all fastdeploy-related configuration. This
@@ -1384,6 +1392,7 @@ def __init__(
         graph_opt_config: GraphOptimizationConfig = None,
         plas_attention_config: PlasAttentionConfig = None,
         speculative_config: SpeculativeConfig = None,
+        structured_outputs_config: StructuredOutputsConfig = None,
         tokenizer: str = None,
         ips: str = None,
         use_warmup: bool = False,
@@ -1393,9 +1402,6 @@ def __init__(
         max_num_partial_prefills: int = 1,
         max_long_partial_prefills: int = 1,
         long_prefill_token_threshold: int = 0,
-        reasoning_parser: str = None,
-        guided_decoding_backend: Optional[str] = None,
-        disable_any_whitespace: bool = False,
         early_stop_config: Optional[Dict[str, Any]] = None,
         tool_parser: str = None,
         test_mode=False,
@@ -1413,6 +1419,7 @@ def __init__(
         self.decoding_config: DecodingConfig = decoding_config  # type: ignore
         self.cache_config: CacheConfig = cache_config  # type: ignore
         self.plas_attention_config: Optional[PlasAttentionConfig] = plas_attention_config
+        self.structured_outputs_config: StructuredOutputsConfig = structured_outputs_config
         # Initialize cuda graph capture list
         if self.graph_opt_config.cudagraph_capture_sizes is None:
             self.graph_opt_config._set_cudagraph_sizes(max_num_seqs=self.scheduler_config.max_num_seqs)
@@ -1459,9 +1466,7 @@ def __init__(
         self.max_num_partial_prefills = max_num_partial_prefills
         self.max_long_partial_prefills = max_long_partial_prefills
         self.long_prefill_token_threshold = long_prefill_token_threshold
-        self.reasoning_parser = reasoning_parser
-        self.guided_decoding_backend = guided_decoding_backend
-        self.disable_any_whitespace = disable_any_whitespace
+
         self._str_to_list("innode_prefill_ports", int)

         if envs.FD_FOR_TORCH_MODEL_FORMAT:
@@ -1483,12 +1488,12 @@ def __init__(
         else:
             self.worker_num_per_node = num_ranks

-        self.device_ids = ",".join([str(i) for i in range(self.worker_num_per_node)])
-        self.device_ids = os.getenv("CUDA_VISIBLE_DEVICES", self.device_ids)
+        self.parallel_config.device_ids = ",".join([str(i) for i in range(self.worker_num_per_node)])
+        self.parallel_config.device_ids = os.getenv("CUDA_VISIBLE_DEVICES", self.parallel_config.device_ids)
         if current_platform.is_xpu():
-            self.device_ids = os.getenv("XPU_VISIBLE_DEVICES", self.device_ids)
+            self.parallel_config.device_ids = os.getenv("XPU_VISIBLE_DEVICES", self.parallel_config.device_ids)
         if current_platform.is_intel_hpu():
-            self.device_ids = os.getenv("HPU_VISIBLE_DEVICES", self.device_ids)
+            self.parallel_config.device_ids = os.getenv("HPU_VISIBLE_DEVICES", self.parallel_config.device_ids)

         self.read_from_config()
         self.postprocess()
@@ -1501,7 +1506,7 @@ def postprocess(self):
         """
         calculate some parameters
         """
-        self.local_device_ids = self.device_ids.split(",")[: self.parallel_config.tensor_parallel_size]
+        self.local_device_ids = self.parallel_config.device_ids.split(",")[: self.parallel_config.tensor_parallel_size]

         if self.parallel_config.tensor_parallel_size <= self.worker_num_per_node or self.node_rank == 0:
             self.is_master = True
@@ -1532,12 +1537,15 @@ def postprocess(self):
         if self.model_config is not None and self.model_config.enable_mm:
             self.cache_config.enable_prefix_caching = False

-        if self.guided_decoding_backend == "auto":
+        if (
+            self.structured_outputs_config is not None
+            and self.structured_outputs_config.guided_decoding_backend == "auto"
+        ):
             if current_platform.is_xpu() or self.speculative_config.method is not None:
                 logger.warning("Speculative Decoding and XPU currently do not support Guided decoding, set off.")
-                self.guided_decoding_backend = "off"
+                self.structured_outputs_config.guided_decoding_backend = "off"
             else:
-                self.guided_decoding_backend = "xgrammar"
+                self.structured_outputs_config.guided_decoding_backend = "xgrammar"

         if self.scheduler_config.splitwise_role == "mixed":
             self.model_config.moe_phase = MoEPhase(phase="prefill")
@@ -1612,15 +1620,18 @@ def check(self):
             f" max_model_len: {self.model_config.max_model_len}"
         )

-        if self.guided_decoding_backend is not None:
-            assert self.guided_decoding_backend in [
+        if (
+            self.structured_outputs_config is not None
+            and self.structured_outputs_config.guided_decoding_backend is not None
+        ):
+            assert self.structured_outputs_config.guided_decoding_backend in [
                 "xgrammar",
                 "XGrammar",
                 "auto",
                 "off",
-            ], f"Only support xgrammar、auto guided decoding backend, but got {self.guided_decoding_backend}."
+            ], f"Only support xgrammar、auto guided decoding backend, but got {self.structured_outputs_config.guided_decoding_backend}."

-        if self.guided_decoding_backend != "off":
+        if self.structured_outputs_config.guided_decoding_backend != "off":
             # TODO: speculative decoding support guided_decoding
             assert (
                 self.speculative_config.method is None
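
The postprocess() and check() hunks above move the "auto" backend resolution onto the sub-config and add a None guard. A condensed restatement of that resolution logic, for orientation only (the verbatim code is in the hunks above; `cfg` stands for an FDConfig instance):

    # Simplified from FDConfig.postprocess(): resolve the "auto"
    # guided-decoding backend once platform and speculative settings are known.
    so_cfg = cfg.structured_outputs_config
    if so_cfg is not None and so_cfg.guided_decoding_backend == "auto":
        # XPU and speculative decoding do not support guided decoding yet.
        if current_platform.is_xpu() or cfg.speculative_config.method is not None:
            so_cfg.guided_decoding_backend = "off"
        else:
            so_cfg.guided_decoding_backend = "xgrammar"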

fastdeploy/engine/args_utils.py

Lines changed: 3 additions & 4 deletions
@@ -36,6 +36,7 @@
     PoolerConfig,
     RunnerOption,
     SpeculativeConfig,
+    StructuredOutputsConfig,
     TaskOption,
 )
 from fastdeploy.platforms import current_platform
@@ -1063,7 +1064,7 @@ def create_engine_config(self, port_availability_check=True) -> FDConfig:

         early_stop_cfg = self.create_early_stop_config()
         early_stop_cfg.update_enable_early_stop(self.enable_early_stop)
-
+        structured_outputs_config: StructuredOutputsConfig = StructuredOutputsConfig(args=all_dict)
         if port_availability_check:
             assert is_port_available(
                 "0.0.0.0", int(self.engine_worker_queue_port[parallel_cfg.local_data_parallel_id])
@@ -1077,19 +1078,17 @@ def create_engine_config(self, port_availability_check=True) -> FDConfig:
             load_config=load_cfg,
             parallel_config=parallel_cfg,
             speculative_config=speculative_cfg,
+            structured_outputs_config=structured_outputs_config,
             ips=self.ips,
             use_warmup=self.use_warmup,
             limit_mm_per_prompt=self.limit_mm_per_prompt,
             mm_processor_kwargs=self.mm_processor_kwargs,
-            reasoning_parser=self.reasoning_parser,
             tool_parser=self.tool_call_parser,
             innode_prefill_ports=self.innode_prefill_ports,
             max_num_partial_prefills=self.max_num_partial_prefills,
             max_long_partial_prefills=self.max_long_partial_prefills,
             long_prefill_token_threshold=self.long_prefill_token_threshold,
             graph_opt_config=graph_opt_cfg,
             plas_attention_config=plas_attention_config,
-            guided_decoding_backend=self.guided_decoding_backend,
-            disable_any_whitespace=self.guided_decoding_disable_any_whitespace,
             early_stop_config=early_stop_cfg,
         )
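
create_engine_config() now builds the sub-config straight from the flattened engine-args dict (all_dict) and hands it to FDConfig. A short sketch of the constructor semantics, per the class definition in fastdeploy/config.py above (the args values here are purely illustrative):

    from fastdeploy.config import StructuredOutputsConfig

    args = {
        "guided_decoding_backend": "auto",
        "reasoning_parser": "None",  # literal "None" strings are skipped
        "unrelated_key": 123,        # no matching attribute, silently ignored
    }
    so_cfg = StructuredOutputsConfig(args=args)

    assert so_cfg.guided_decoding_backend == "auto"
    assert so_cfg.reasoning_parser is None        # default preserved
    assert so_cfg.disable_any_whitespace is True  # default preserved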

fastdeploy/engine/common_engine.py

Lines changed: 3 additions & 3 deletions
@@ -128,10 +128,10 @@ def __init__(self, cfg, start_queue=True):
         )

         self.guided_decoding_checker = None
-        if self.cfg.guided_decoding_backend != "off":
+        if self.cfg.structured_outputs_config.guided_decoding_backend != "off":
             self.guided_decoding_checker = schema_checker(
-                self.cfg.guided_decoding_backend,
-                disable_any_whitespace=self.cfg.disable_any_whitespace,
+                self.cfg.structured_outputs_config.guided_decoding_backend,
+                disable_any_whitespace=self.cfg.structured_outputs_config.disable_any_whitespace,
             )
         self._init_worker_monitor_signals()

fastdeploy/engine/engine.py

Lines changed: 10 additions & 10 deletions
@@ -90,7 +90,7 @@ def __init__(self, cfg):

         self.input_processor = InputPreprocessor(
             cfg.tokenizer,
-            cfg.reasoning_parser,
+            cfg.structured_outputs_config.reasoning_parser,
             cfg.limit_mm_per_prompt,
             cfg.mm_processor_kwargs,
             cfg.model_config.enable_mm,
@@ -128,7 +128,7 @@ def start(self, api_server_pid=None):

         # If block numer is specified and model is deployed in mixed mode, start cache manager first
         if not self.do_profile and self.cfg.scheduler_config.splitwise_role != "mixed":
-            device_ids = self.cfg.device_ids.split(",")
+            device_ids = self.cfg.parallel_config.device_ids.split(",")
             self.cache_manager_processes = self.engine.start_cache_service(device_ids, self.ipc_signal_suffix, True)

         # Start workers
@@ -162,7 +162,7 @@ def check_worker_initialize_status_func(res: dict):
         if self.do_profile:
             self._stop_profile()
         elif self.cfg.cache_config.enable_prefix_caching:
-            device_ids = self.cfg.device_ids.split(",")
+            device_ids = self.cfg.parallel_config.device_ids.split(",")
             self.cache_manager_processes = self.engine.start_cache_service(device_ids, self.ipc_signal_suffix, False)

         # Launch components: scheduler, cache_manager, expert_service et.al.
@@ -426,7 +426,7 @@ def _setting_environ_variables(self):
         """
         variables = {
             "ENABLE_FASTDEPLOY_LOAD_MODEL_CONCURRENCY": 0,
-            "LOAD_STATE_DICT_THREAD_NUM": len(self.cfg.device_ids.split(",")),
+            "LOAD_STATE_DICT_THREAD_NUM": len(self.cfg.parallel_config.device_ids.split(",")),
             "PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION": "python",
             "FLAGS_use_append_attn": 1,
             "NCCL_ALGO": "Ring",
@@ -503,11 +503,11 @@ def _start_worker_service(self):
         if self.cfg.ips is not None:
             ips = ",".join(self.cfg.ips)
         arguments = (
-            f" --devices {self.cfg.device_ids} {py_script}"
+            f" --devices {self.cfg.parallel_config.device_ids} {py_script}"
             f" --max_num_seqs {self.cfg.scheduler_config.max_num_seqs} --max_model_len {self.cfg.model_config.max_model_len}"
             f" --gpu_memory_utilization {self.cfg.cache_config.gpu_memory_utilization}"
             f" --model {self.cfg.model_config.model!s}"
-            f" --device_ids {self.cfg.device_ids}"
+            f" --device_ids {self.cfg.parallel_config.device_ids}"
             f" --tensor_parallel_size {self.cfg.parallel_config.tensor_parallel_size}"
             f" --engine_worker_queue_port {ports}"
             f" --pod_ip {self.cfg.master_ip}"
@@ -527,10 +527,10 @@ def _start_worker_service(self):
             f" --think_end_id {self.cfg.model_config.think_end_id}"
             f" --speculative_config '{self.cfg.speculative_config.to_json_string()}'"
             f" --graph_optimization_config '{self.cfg.graph_opt_config.to_json_string()}'"
-            f" --guided_decoding_backend {self.cfg.guided_decoding_backend}"
+            f" --guided_decoding_backend {self.cfg.structured_outputs_config.guided_decoding_backend}"
             f" --load_strategy {self.cfg.load_config.load_strategy}"
             f" --early_stop_config '{self.cfg.early_stop_config.to_json_string()}'"
-            f" --reasoning_parser {self.cfg.reasoning_parser}"
+            f" --reasoning_parser {self.cfg.structured_outputs_config.reasoning_parser}"
             f" --load_choices {self.cfg.load_config.load_choices}"
             f" --plas_attention_config '{self.cfg.plas_attention_config.to_json_string()}'"
             f" --ips {ips}"
@@ -546,7 +546,7 @@ def _start_worker_service(self):
             "enable_chunked_prefill": self.cfg.cache_config.enable_chunked_prefill,
             "do_profile": self.do_profile,
             "dynamic_load_weight": self.cfg.load_config.dynamic_load_weight,
-            "disable_any_whitespace": self.cfg.disable_any_whitespace,
+            "disable_any_whitespace": self.cfg.structured_outputs_config.disable_any_whitespace,
             "disable_custom_all_reduce": self.cfg.parallel_config.disable_custom_all_reduce,
             "enable_logprob": self.cfg.model_config.enable_logprob,
             "lm_head_fp32": self.cfg.model_config.lm_head_fp32,
@@ -643,7 +643,7 @@ def _stop_profile(self):
         self.cfg.cache_config.reset(num_gpu_blocks)
         self.engine.resource_manager.reset_cache_config(self.cfg.cache_config)
         if self.cfg.cache_config.enable_prefix_caching or self.cfg.scheduler_config.splitwise_role != "mixed":
-            device_ids = self.cfg.device_ids.split(",")
+            device_ids = self.cfg.parallel_config.device_ids.split(",")
             self.cache_manager_processes = self.engine.start_cache_service(
                 device_ids, self.ipc_signal_suffix, self.cfg.scheduler_config.splitwise_role != "mixed"
             )

fastdeploy/engine/expert_service.py

Lines changed: 1 addition & 1 deletion
@@ -53,7 +53,7 @@ def __init__(self, cfg, local_data_parallel_id, start_queue=True):
         end_pos = start_pos + self.cfg.parallel_config.tensor_parallel_size
         if cfg.scheduler_config.splitwise_role != "mixed":
             self.cfg.cache_config.rdma_comm_ports = self.cfg.cache_config.rdma_comm_ports[start_pos:end_pos]
-        self.cfg.local_device_ids = self.cfg.device_ids.split(",")[start_pos:end_pos]
+        self.cfg.local_device_ids = self.cfg.parallel_config.device_ids.split(",")[start_pos:end_pos]
         llm_logger.info(f"local_data_parallel_id: {local_data_parallel_id}")
         self.cfg.disaggregate_info = None

fastdeploy/model_executor/guided_decoding/__init__.py

Lines changed: 2 additions & 2 deletions
@@ -41,7 +41,7 @@ def get_guided_backend(
     Raises:
         ValueError: If the specified backend is not supported
     """
-    if fd_config.parallel_config.guided_decoding_backend.lower() == "xgrammar":
+    if fd_config.structured_outputs_config.guided_decoding_backend.lower() == "xgrammar":
         from fastdeploy.model_executor.guided_decoding.xgrammar_backend import (
             XGrammarBackend,
         )
@@ -52,7 +52,7 @@ def get_guided_backend(
         )
     else:
         raise ValueError(
-            f"Get unsupported backend {fd_config.parallel_config.guided_decoding_backend},"
+            f"Get unsupported backend {fd_config.structured_outputs_config.guided_decoding_backend},"
             f" please check your configuration."
         )

fastdeploy/model_executor/guided_decoding/base_guided_decoding.py

Lines changed: 2 additions & 2 deletions
@@ -142,9 +142,9 @@ def __init__(self, fd_config: FDConfig):
         self.reasoning_parser = None

         self.hf_tokenizer = self._get_tokenizer_hf()
-        if self.fd_config.model_config.reasoning_parser:
+        if self.fd_config.structured_outputs_config.reasoning_parser:
             reasoning_parser_obj = ReasoningParserManager.get_reasoning_parser(
-                self.fd_config.model_config.reasoning_parser
+                self.fd_config.structured_outputs_config.reasoning_parser
             )
             self.reasoning_parser = reasoning_parser_obj(self.hf_tokenizer)

fastdeploy/model_executor/guided_decoding/xgrammar_backend.py

Lines changed: 1 addition & 1 deletion
@@ -212,7 +212,7 @@ def __init__(
         self.vocab_size = fd_config.model_config.vocab_size
         self.batch_size = fd_config.scheduler_config.max_num_seqs

-        self.any_whitespace = not fd_config.parallel_config.disable_any_whitespace
+        self.any_whitespace = not fd_config.structured_outputs_config.disable_any_whitespace

         try:
             tokenizer_info = TokenizerInfo.from_huggingface(self.hf_tokenizer, vocab_size=self.vocab_size)

fastdeploy/splitwise/splitwise_connector.py

Lines changed: 2 additions & 2 deletions
@@ -375,7 +375,7 @@ def send_cache_infos(self, tasks, current_id):
                 if tasks[i].disaggregate_info["transfer_protocol"] == "ipc":
                     cache_info = {
                         "request_id": tasks[i].request_id,
-                        "device_ids": self.cfg.device_ids.split(","),
+                        "device_ids": self.cfg.parallel_config.device_ids.split(","),
                         "transfer_protocol": "ipc",
                         "dest_block_ids": tasks[i].disaggregate_info["block_tables"],
                     }
@@ -395,7 +395,7 @@ def send_cache_infos(self, tasks, current_id):
                 else:
                     cache_info = {
                         "request_id": tasks[i].request_id,
-                        "device_ids": self.cfg.device_ids.split(","),
+                        "device_ids": self.cfg.parallel_config.device_ids.split(","),
                         "ip": self.cfg.host_ip,
                         "rdma_ports": self.cfg.disaggregate_info["cache_info"]["rdma"]["rdma_port"],
                         "transfer_protocol": "rdma",

fastdeploy/worker/gcu_model_runner.py

Lines changed: 1 addition & 1 deletion
@@ -73,7 +73,7 @@ def __init__(
         self.enable_logprob = fd_config.model_config.enable_logprob

         self.guided_backend = None
-        if self.fd_config.parallel_config.guided_decoding_backend != "off":
+        if self.fd_config.structured_outputs_config.guided_decoding_backend != "off":
             self.guided_backend = get_guided_backend(fd_config=self.fd_config)

         # Sampler
