
Commit 45fef68

fix Cfp8 for RL load
1 parent 63e90e8 commit 45fef68

File tree

1 file changed: +7 −0 lines changed

fastdeploy/model_executor/layers/attention/attention.py

Lines changed: 7 additions & 0 deletions
@@ -25,6 +25,7 @@
 
 from fastdeploy.config import FDConfig
 from fastdeploy.model_executor.layers.quantization.quant_base import QuantMethodBase
+from fastdeploy.model_executor.layers.quantization.kv_cache import KvCacheQuantzationTypes
 
 if TYPE_CHECKING:
     from fastdeploy.model_executor.forward_meta import ForwardMeta
@@ -102,6 +103,12 @@ def __init__(
 
         if fd_config.quant_config and hasattr(fd_config.quant_config, "kv_cache_quant_type"):
             self.kvcache_quant_method: QuantMethodBase = fd_config.quant_config.get_quant_method(self)
+
+            # Set explicitly for the RL model, as RL does not load the state dict.
+            if fd_config.quant_config.kv_cache_quant_type == KvCacheQuantzationTypes.BLOCK_WISE_FP8:
+                self.cache_quant_type_str = "block_wise_fp8"
+                self.quant_max_bound = 448.0
+                self.quant_min_bound = -448.0
         else:
             self.kvcache_quant_method = None
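
For context, 448.0 is the largest finite value representable in the FP8 E4M3 format, which block-wise FP8 KV-cache quantization typically targets, so the bounds above define the representable range values are clamped into. Below is a minimal, illustrative sketch of per-block scaling against those bounds; the function names and the use of NumPy are assumptions for illustration, not FastDeploy's actual kernels.

import numpy as np

QUANT_MAX_BOUND = 448.0   # largest finite FP8 E4M3 value
QUANT_MIN_BOUND = -448.0

def quantize_block_fp8(block: np.ndarray):
    """Hypothetical block-wise quantization: one scale per block, with
    values clamped into the FP8 E4M3 range [-448, 448]."""
    scale = max(float(np.abs(block).max()) / QUANT_MAX_BOUND, 1e-12)
    scaled = np.clip(block / scale, QUANT_MIN_BOUND, QUANT_MAX_BOUND)
    return scaled, scale  # `scaled` would then be cast to an FP8 dtype

def dequantize_block_fp8(scaled: np.ndarray, scale: float) -> np.ndarray:
    """Inverse of the sketch above: rescale back to the original range."""
    return scaled * scale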
