
Commit 75d5454

fix Cfp8 for RL load
1 parent 63e90e8 commit 75d5454

File tree

1 file changed (+9 −0)

fastdeploy/model_executor/layers/attention/attention.py

Lines changed: 9 additions & 0 deletions
@@ -24,6 +24,9 @@
 from paddleformers.utils.log import logger
 
 from fastdeploy.config import FDConfig
+from fastdeploy.model_executor.layers.quantization.kv_cache import (
+    KvCacheQuantzationTypes,
+)
 from fastdeploy.model_executor.layers.quantization.quant_base import QuantMethodBase
 
 if TYPE_CHECKING:
@@ -102,6 +105,12 @@ def __init__(
 
         if fd_config.quant_config and hasattr(fd_config.quant_config, "kv_cache_quant_type"):
            self.kvcache_quant_method: QuantMethodBase = fd_config.quant_config.get_quant_method(self)
+
+            # Set for RL models, since the RL path does not load the state dict.
+            if fd_config.quant_config.kv_cache_quant_type == KvCacheQuantzationTypes.BLOCK_WISE_FP8:
+                self.cache_quant_type_str = "block_wise_fp8"
+                self.quant_max_bound = 448.0
+                self.quant_min_bound = -448.0
         else:
             self.kvcache_quant_method = None

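The bounds assigned in the second hunk, ±448.0, are the largest finite magnitude representable in FP8 E4M3, the element format targeted by block-wise FP8 KV-cache quantization. The sketch below is an illustration of what those bounds do, not FastDeploy's actual kernel; quantize_block_wise_fp8 and its block size are hypothetical names. Each block gets its own scale so that its absolute maximum lands at 448.0, and values are clamped to the representable range before the cast to FP8:

import numpy as np

FP8_E4M3_MAX = 448.0   # matches quant_max_bound in the diff
FP8_E4M3_MIN = -448.0  # matches quant_min_bound in the diff

def quantize_block_wise_fp8(x: np.ndarray, block_size: int = 128):
    """Per-block quantization: each block gets its own scale so that its
    largest magnitude maps onto the FP8 E4M3 maximum (448.0)."""
    n_blocks = (x.size + block_size - 1) // block_size
    padded = np.zeros(n_blocks * block_size, dtype=np.float32)
    padded[: x.size] = x
    blocks = padded.reshape(n_blocks, block_size)

    # One scale per block; guard against all-zero blocks.
    amax = np.abs(blocks).max(axis=1, keepdims=True)
    scales = np.where(amax > 0.0, amax / FP8_E4M3_MAX, 1.0)

    # Scale, then clamp to the representable range before the fp8 cast
    # (the cast itself is omitted here; NumPy has no fp8 dtype).
    quantized = np.clip(blocks / scales, FP8_E4M3_MIN, FP8_E4M3_MAX)
    return quantized, scales

x = np.random.randn(300).astype(np.float32) * 10.0
q, scales = quantize_block_wise_fp8(x)
dequantized = (q * scales).reshape(-1)[: x.size]  # round-trip sanity check
print(float(np.abs(dequantized - x).max()))

The commit assigns cache_quant_type_str and the two bounds directly in __init__ because, per the code comment, the RL path never loads a state dict, so whatever would normally set these attributes during weight loading does not run.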
0 commit comments