@@ -187,7 +187,6 @@ def __init__(
         self.redundant_experts_num = 0
         self.seed = 0
         self.quantization = None
-        self.reasoning_parser = None
         self.pad_token_id: int = -1
         self.eos_tokens_lens: int = 2
         self.lm_head_fp32: bool = False
@@ -540,10 +539,6 @@ def __init__(
         # Do profile or not
         self.do_profile: bool = False

-        # guided decoding backend
-        self.guided_decoding_backend: str = None
-        # disable any whitespace for guided decoding
-        self.disable_any_whitespace: bool = True
         self.pod_ip: str = None
         # enable the custom all-reduce kernel and fall back to NCCL(dist.all_reduce).
         self.disable_custom_all_reduce: bool = False
@@ -1128,12 +1123,6 @@ class PoolerConfig:
     """


-class LoRAConfig:
-    """LoRA Config"""
-
-    pass
-
-
 class CacheConfig:
     """
     Configuration for the KV cache.
@@ -1364,6 +1353,25 @@ def print(self):
         logger.info("=============================================================")


+class StructuredOutputsConfig:
+    """
+    Configuration for structured outputs
+    """
+
+    def __init__(
+        self,
+        args,
+    ) -> None:
+        self.reasoning_parser: Optional[str] = None
+        self.guided_decoding_backend: Optional[str] = None
+        # disable any whitespace for guided decoding
+        self.disable_any_whitespace: bool = True
+
+        for key, value in args.items():
+            if hasattr(self, key) and value != "None":
+                setattr(self, key, value)
+
+
 class FDConfig:
     """
     The configuration class which contains all fastdeploy-related configuration. This
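The new StructuredOutputsConfig hydrates its three fields from an args mapping, ignoring keys that have no matching attribute and the literal string "None". A minimal sketch of that behavior, assuming the class above is in scope; the args values here are assumptions, not from this commit:

```python
args = {
    "guided_decoding_backend": "xgrammar",
    "reasoning_parser": "None",  # literal string "None" is skipped, default kept
    "unrelated_key": True,       # no matching attribute, silently ignored
}
cfg = StructuredOutputsConfig(args)
assert cfg.guided_decoding_backend == "xgrammar"
assert cfg.reasoning_parser is None
assert cfg.disable_any_whitespace is True
```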
@@ -1384,6 +1392,7 @@ def __init__(
         graph_opt_config: GraphOptimizationConfig = None,
         plas_attention_config: PlasAttentionConfig = None,
         speculative_config: SpeculativeConfig = None,
+        structured_outputs_config: StructuredOutputsConfig = None,
         tokenizer: str = None,
         ips: str = None,
         use_warmup: bool = False,
@@ -1393,9 +1402,6 @@ def __init__(
         max_num_partial_prefills: int = 1,
         max_long_partial_prefills: int = 1,
         long_prefill_token_threshold: int = 0,
-        reasoning_parser: str = None,
-        guided_decoding_backend: Optional[str] = None,
-        disable_any_whitespace: bool = False,
         early_stop_config: Optional[Dict[str, Any]] = None,
         tool_parser: str = None,
         test_mode=False,
@@ -1413,6 +1419,7 @@ def __init__(
         self.decoding_config: DecodingConfig = decoding_config  # type: ignore
         self.cache_config: CacheConfig = cache_config  # type: ignore
         self.plas_attention_config: Optional[PlasAttentionConfig] = plas_attention_config
+        self.structured_outputs_config: StructuredOutputsConfig = structured_outputs_config
         # Initialize cuda graph capture list
         if self.graph_opt_config.cudagraph_capture_sizes is None:
             self.graph_opt_config._set_cudagraph_sizes(max_num_seqs=self.scheduler_config.max_num_seqs)
@@ -1459,9 +1466,7 @@ def __init__(
         self.max_num_partial_prefills = max_num_partial_prefills
         self.max_long_partial_prefills = max_long_partial_prefills
         self.long_prefill_token_threshold = long_prefill_token_threshold
-        self.reasoning_parser = reasoning_parser
-        self.guided_decoding_backend = guided_decoding_backend
-        self.disable_any_whitespace = disable_any_whitespace
+
         self._str_to_list("innode_prefill_ports", int)

         if envs.FD_FOR_TORCH_MODEL_FORMAT:
@@ -1483,12 +1488,12 @@ def __init__(
         else:
             self.worker_num_per_node = num_ranks

-        self.device_ids = ",".join([str(i) for i in range(self.worker_num_per_node)])
-        self.device_ids = os.getenv("CUDA_VISIBLE_DEVICES", self.device_ids)
+        self.parallel_config.device_ids = ",".join([str(i) for i in range(self.worker_num_per_node)])
+        self.parallel_config.device_ids = os.getenv("CUDA_VISIBLE_DEVICES", self.parallel_config.device_ids)
         if current_platform.is_xpu():
-            self.device_ids = os.getenv("XPU_VISIBLE_DEVICES", self.device_ids)
+            self.parallel_config.device_ids = os.getenv("XPU_VISIBLE_DEVICES", self.parallel_config.device_ids)
         if current_platform.is_intel_hpu():
-            self.device_ids = os.getenv("HPU_VISIBLE_DEVICES", self.device_ids)
+            self.parallel_config.device_ids = os.getenv("HPU_VISIBLE_DEVICES", self.parallel_config.device_ids)

         self.read_from_config()
         self.postprocess()
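The hunk above only changes where the ids are stored (parallel_config.device_ids instead of a field on FDConfig); the precedence is unchanged: enumerate one id per worker on the node, then let the platform's visibility variable win when set. A standalone sketch, with worker_num_per_node assumed to be 4 for illustration:

```python
import os

# Default: one device id per worker on this node.
device_ids = ",".join(str(i) for i in range(4))             # "0,1,2,3"
# The visibility variable overrides the default, e.g. CUDA_VISIBLE_DEVICES=2,3.
device_ids = os.getenv("CUDA_VISIBLE_DEVICES", device_ids)
print(device_ids)
```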
@@ -1501,7 +1506,7 @@ def postprocess(self):
         """
         calculate some parameters
         """
-        self.local_device_ids = self.device_ids.split(",")[: self.parallel_config.tensor_parallel_size]
+        self.local_device_ids = self.parallel_config.device_ids.split(",")[: self.parallel_config.tensor_parallel_size]

         if self.parallel_config.tensor_parallel_size <= self.worker_num_per_node or self.node_rank == 0:
             self.is_master = True
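postprocess() now reads the ids back from parallel_config and keeps the first tensor_parallel_size entries for the local ranks. A worked example with assumed values:

```python
device_ids = "0,1,2,3"
tensor_parallel_size = 2
local_device_ids = device_ids.split(",")[:tensor_parallel_size]
assert local_device_ids == ["0", "1"]
```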
@@ -1532,12 +1537,15 @@ def postprocess(self):
         if self.model_config is not None and self.model_config.enable_mm:
             self.cache_config.enable_prefix_caching = False

-        if self.guided_decoding_backend == "auto":
+        if (
+            self.structured_outputs_config is not None
+            and self.structured_outputs_config.guided_decoding_backend == "auto"
+        ):
             if current_platform.is_xpu() or self.speculative_config.method is not None:
                 logger.warning("Speculative Decoding and XPU currently do not support Guided decoding, set off.")
-                self.guided_decoding_backend = "off"
+                self.structured_outputs_config.guided_decoding_backend = "off"
             else:
-                self.guided_decoding_backend = "xgrammar"
+                self.structured_outputs_config.guided_decoding_backend = "xgrammar"

         if self.scheduler_config.splitwise_role == "mixed":
             self.model_config.moe_phase = MoEPhase(phase="prefill")
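The "auto" resolution is unchanged apart from the None guard and the new attribute path: "auto" collapses to "off" on XPU or when a speculative method is configured, and to "xgrammar" otherwise. Distilled as a free function; the function and its parameters are illustrative, not part of the codebase:

```python
def resolve_backend(backend, is_xpu, speculative_method):
    # Mirrors the postprocess() branch above; a sketch, not the real API.
    if backend != "auto":
        return backend
    if is_xpu or speculative_method is not None:
        return "off"  # guided decoding unsupported on XPU / with speculation
    return "xgrammar"

assert resolve_backend("auto", is_xpu=False, speculative_method=None) == "xgrammar"
assert resolve_backend("auto", is_xpu=True, speculative_method=None) == "off"
assert resolve_backend("off", is_xpu=False, speculative_method=None) == "off"
```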
@@ -1612,15 +1620,18 @@ def check(self):
             f" max_model_len: {self.model_config.max_model_len}"
         )

-        if self.guided_decoding_backend is not None:
-            assert self.guided_decoding_backend in [
+        if (
+            self.structured_outputs_config is not None
+            and self.structured_outputs_config.guided_decoding_backend is not None
+        ):
+            assert self.structured_outputs_config.guided_decoding_backend in [
                 "xgrammar",
                 "XGrammar",
                 "auto",
                 "off",
-            ], f"Only support xgrammar、auto guided decoding backend, but got {self.guided_decoding_backend}."
+            ], f"Only support xgrammar、auto guided decoding backend, but got {self.structured_outputs_config.guided_decoding_backend}."

-            if self.guided_decoding_backend != "off":
+            if self.structured_outputs_config.guided_decoding_backend != "off":
                 # TODO: speculative decoding support guided_decoding
                 assert (
                     self.speculative_config.method is None
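check() keeps the same validation shape behind the new attribute path: the backend must come from a small allow-list, and anything other than "off" still excludes speculative decoding. Condensed into a standalone sketch; the function name and signature are assumptions for illustration:

```python
ALLOWED_BACKENDS = {"xgrammar", "XGrammar", "auto", "off"}

def check_guided_decoding(backend, speculative_method):
    # Condensed from FDConfig.check(); names here are illustrative.
    if backend is None:
        return
    assert backend in ALLOWED_BACKENDS, f"unsupported backend: {backend}"
    if backend != "off":
        # The source carries a TODO for speculative decoding support.
        assert speculative_method is None

check_guided_decoding("xgrammar", None)  # passes
check_guided_decoding(None, "mtp")       # passes: backend unset
```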