fastdeploy/model_executor/layers/attention: 1 file changed, 9 additions, 0 deletions

 from paddleformers.utils.log import logger
 
 from fastdeploy.config import FDConfig
+from fastdeploy.model_executor.layers.quantization.kv_cache import (
+    KvCacheQuantzationTypes,
+)
 from fastdeploy.model_executor.layers.quantization.quant_base import QuantMethodBase
 
 if TYPE_CHECKING:
@@ -102,6 +105,12 @@ def __init__(
 
         if fd_config.quant_config and hasattr(fd_config.quant_config, "kv_cache_quant_type"):
             self.kvcache_quant_method: QuantMethodBase = fd_config.quant_config.get_quant_method(self)
+
+            # Set for the RL model, as RL does not need to load the state dict
+            if fd_config.quant_config.kv_cache_quant_type == KvCacheQuantzationTypes.BLOCK_WISE_FP8:
+                self.cache_quant_type_str = "block_wise_fp8"
+                self.quant_max_bound = 448.0
+                self.quant_min_bound = -448.0
         else:
             self.kvcache_quant_method = None
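The bounds of 448.0 and -448.0 match the representable range of FP8 E4M3, which is the format block-wise FP8 KV-cache quantization targets; setting `cache_quant_type_str`, `quant_max_bound`, and `quant_min_bound` directly covers the RL path, which skips the state-dict loading step where these attributes would otherwise be populated. Below is a minimal NumPy sketch of the per-block scaling idea, not FastDeploy's actual kernel; the helper name `block_wise_fp8_quant`, the block shape, and the epsilon guard are illustrative assumptions.

```python
import numpy as np

# Illustrative constants mirroring the values set in the diff.
QUANT_MAX_BOUND = 448.0   # largest magnitude representable in FP8 E4M3
QUANT_MIN_BOUND = -448.0


def block_wise_fp8_quant(block: np.ndarray):
    """Sketch of per-block FP8 quantization: scale the block into the
    FP8 E4M3 range and return the scaled values plus the scale needed
    for dequantization. A real kernel would also cast to an FP8 dtype."""
    scale = np.abs(block).max() / QUANT_MAX_BOUND
    scale = max(scale, 1e-12)  # guard against all-zero blocks (assumption)
    scaled = np.clip(block / scale, QUANT_MIN_BOUND, QUANT_MAX_BOUND)
    return scaled, scale


# Usage: quantize one hypothetical KV-cache block and recover an approximation.
block = np.random.randn(64, 128).astype(np.float32)
q, s = block_wise_fp8_quant(block)
recovered = q * s  # exact here only because the FP8 cast is omitted
```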