Skip to content

Commit

Permalink
[CPU]set group size via hint
Browse files Browse the repository at this point in the history
Signed-off-by: Zhang Yi3 <[email protected]>
  • Loading branch information
zhangYiIntel committed Dec 10, 2024
1 parent 92e6cb3 commit 91ebc09
Show file tree
Hide file tree
Showing 11 changed files with 59 additions and 12 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -23,4 +23,6 @@
from openvino._pyopenvino.properties.hint import allow_auto_batching
from openvino._pyopenvino.properties.hint import dynamic_quantization_group_size
from openvino._pyopenvino.properties.hint import kv_cache_precision
from openvino._pyopenvino.properties.hint import key_cache_group_size
from openvino._pyopenvino.properties.hint import value_cache_group_size
from openvino._pyopenvino.properties.hint import activations_scale_factor
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,8 @@ void regmodule_properties(py::module m) {
wrap_property_RW(m_hint, ov::hint::allow_auto_batching, "allow_auto_batching");
wrap_property_RW(m_hint, ov::hint::dynamic_quantization_group_size, "dynamic_quantization_group_size");
wrap_property_RW(m_hint, ov::hint::kv_cache_precision, "kv_cache_precision");
wrap_property_RW(m_hint, ov::hint::key_cache_group_size, "key_cache_group_size");
wrap_property_RW(m_hint, ov::hint::value_cache_group_size, "value_cache_group_size");
wrap_property_RW(m_hint, ov::hint::activations_scale_factor, "activations_scale_factor");

// Submodule intel_cpu
Expand Down
10 changes: 10 additions & 0 deletions src/bindings/python/tests/test_runtime/test_properties.py
Original file line number Diff line number Diff line change
Expand Up @@ -334,6 +334,16 @@ def test_properties_ro(ov_property_ro, expected_value):
"DYNAMIC_QUANTIZATION_GROUP_SIZE",
((64, 64),),
),
(
hints.key_cache_group_size,
"KEY_CACHE_GROUP_SIZE",
((64, 64),),
),
(
hints.value_cache_group_size,
"VALUE_CACHE_GROUP_SIZE",
((64, 64),),
),
(hints.kv_cache_precision, "KV_CACHE_PRECISION", ((Type.f32, Type.f32),)),
(
hints.activations_scale_factor,
Expand Down
12 changes: 12 additions & 0 deletions src/inference/include/openvino/runtime/properties.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -580,6 +580,18 @@ static constexpr Property<uint64_t, PropertyMutability::RW> dynamic_quantization
*/
static constexpr Property<element::Type, PropertyMutability::RW> kv_cache_precision{"KV_CACHE_PRECISION"};

/**
* @brief Hint for the device to use the given group size when compressing the key cache.
* Read-write property of type uint64_t, addressed by the string key "KEY_CACHE_GROUP_SIZE".
* NOTE(review): the set of valid values and the meaning of 0 are device-specific and are not
* specified by this declaration - confirm against the target plugin.
* @ingroup ov_runtime_cpp_prop_api
*/
static constexpr Property<uint64_t, PropertyMutability::RW> key_cache_group_size{"KEY_CACHE_GROUP_SIZE"};

/**
* @brief Hint for the device to use the given group size when compressing the value cache.
* Read-write property of type uint64_t, addressed by the string key "VALUE_CACHE_GROUP_SIZE".
* NOTE(review): the set of valid values and the meaning of 0 are device-specific and are not
* specified by this declaration - confirm against the target plugin.
* @ingroup ov_runtime_cpp_prop_api
*/
static constexpr Property<uint64_t, PropertyMutability::RW> value_cache_group_size{"VALUE_CACHE_GROUP_SIZE"};

/**
* @brief This property scales down activations to prevent overflows when inference precision is f16.
* @ingroup ov_runtime_cpp_prop_api
Expand Down
6 changes: 6 additions & 0 deletions src/plugins/intel_cpu/src/compiled_model.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -256,6 +256,8 @@ ov::Any CompiledModel::get_property(const std::string& name) const {
RO_property(ov::intel_cpu::sparse_weights_decompression_rate.name()),
RO_property(ov::hint::dynamic_quantization_group_size.name()),
RO_property(ov::hint::kv_cache_precision.name()),
RO_property(ov::hint::key_cache_group_size.name()),
RO_property(ov::hint::value_cache_group_size.name()),
};

OPENVINO_SUPPRESS_DEPRECATED_START
Expand Down Expand Up @@ -333,6 +335,10 @@ ov::Any CompiledModel::get_property(const std::string& name) const {
config.fcDynamicQuantizationGroupSize);
} else if (name == ov::hint::kv_cache_precision) {
return decltype(ov::hint::kv_cache_precision)::value_type(config.kvCachePrecision);
} else if (name == ov::hint::key_cache_group_size) {
return decltype(ov::hint::key_cache_group_size)::value_type(config.keyCacheGroupSize);
} else if (name == ov::hint::value_cache_group_size) {
return decltype(ov::hint::value_cache_group_size)::value_type(config.valueCacheGroupSize);
}
OPENVINO_THROW("Unsupported property: ", name);
}
Expand Down
15 changes: 15 additions & 0 deletions src/plugins/intel_cpu/src/config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -375,6 +375,21 @@ void Config::readProperties(const ov::AnyMap& prop, const ModelType modelType) {
ov::hint::kv_cache_precision.name(),
". Supported values: u8, bf16, f16, f32");
}
} else if (key == ov::hint::key_cache_group_size.name() || key == ov::hint::value_cache_group_size.name()) {
try {
auto const groupSize = val.as<uint64_t>();
if (key == ov::hint::key_cache_group_size.name()) {
keyCacheGroupSize = groupSize;
} else {
valueCacheGroupSize = groupSize;
}
} catch (ov::Exception&) {
OPENVINO_THROW("Wrong value ",
val.as<std::string>(),
" for property key ",
key,
". Expected only unsinged integer numbers");
}
} else if (key == ov::cache_encryption_callbacks.name()) {
try {
auto encryption_callbacks = val.as<EncryptionCallbacks>();
Expand Down
2 changes: 2 additions & 0 deletions src/plugins/intel_cpu/src/config.h
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,8 @@ struct Config {
// TODO: Executor cache may lead to incorrect behavior on oneDNN ACL primitives
size_t rtCacheCapacity = 0ul;
#endif
size_t keyCacheGroupSize = 0ul;
size_t valueCacheGroupSize = 0ul;
ov::threading::IStreamsExecutor::Config streamExecutorConfig;
int streams = 1;
bool streamsChanged = false;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2137,8 +2137,8 @@ struct AttentionExecutor : public PagedAttentionExecutor {
// u4 needs scale + zp. s4 needs scale.
const size_t param_size = one_of(v_cache.get_precision(), ov::element::u4, ov::element::u8) ? sizeof(float) * 2 : sizeof(float);
const size_t value_params_size = param_size * value_sub_byte_multiplyer;
size_t key_group_num = _key_group_size ? k_cache.size(3) / (_key_group_size + key_params_size) : _key_group_size;
size_t value_group_num = _value_group_size ? v_cache.size(3) / (_value_group_size + value_params_size) : _value_group_size;
size_t key_group_num = _key_group_size ? k_cache.size(3) / (_key_group_size + key_params_size) : 1;
size_t value_group_num = _value_group_size ? v_cache.size(3) / (_value_group_size + value_params_size) : 1;
auto S = k_cache.size(3) - (k_cache.get_precision().is_real() ? 0 : key_params_size * key_group_num);
auto SV = v_cache.size(3) - (v_cache.get_precision().is_real() ? 0 : value_params_size * value_group_num);
auto block_size = k_cache.size(2);
Expand Down
14 changes: 4 additions & 10 deletions src/plugins/intel_cpu/src/nodes/paged_attn.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -132,16 +132,10 @@ void PagedAttention::createPrimitive() {
// Since we are quantize only last dim it's safe to use the last dim of KV.
auto kCachePrecision = getOriginalInputPrecisionAtPort(PagedAttentionExecutor::ID_KCACHE);
auto vCachePrecision = getOriginalInputPrecisionAtPort(PagedAttentionExecutor::ID_VCACHE);
const auto keyDims = getInputShapeAtPort(PagedAttentionExecutor::ID_KCACHE).getDims();
const auto valueDims = getInputShapeAtPort(PagedAttentionExecutor::ID_VCACHE).getDims();
const auto keyS = *(keyDims.end() - 1);
const auto valueS = *(valueDims.end() - 1);

size_t group_size = keyS;
if (getenv("GROUP_SIZE"))
group_size = std::stoi(std::string(getenv("GROUP_SIZE")));
size_t key_group_size = group_size;
size_t value_group_size = group_size;
const auto cpuConfig = context->getConfig();

size_t key_group_size = cpuConfig.keyCacheGroupSize;
size_t value_group_size = cpuConfig.valueCacheGroupSize;
std::cout << "PagedAttn|Kcache|" << kCachePrecision << "|Vcache|" << vCachePrecision << "|key_group_size|" << key_group_size << "|value_group_size|" << value_group_size << std::endl;
return make_pa_executor(rtPrecision, kCachePrecision, vCachePrecision, key_group_size, value_group_size);
#else
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@ TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkSupportedPropertiesAreAvailable
RO_property(ov::intel_cpu::sparse_weights_decompression_rate.name()),
RO_property(ov::hint::dynamic_quantization_group_size.name()),
RO_property(ov::hint::kv_cache_precision.name()),
RO_property(ov::hint::key_cache_group_size.name()),
RO_property(ov::hint::value_cache_group_size.name()),
};

ov::Core ie;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,8 @@ TEST_F(OVClassConfigTestCPU, smoke_PluginAllSupportedPropertiesAreAvailable) {
RW_property(ov::intel_cpu::sparse_weights_decompression_rate.name()),
RW_property(ov::hint::dynamic_quantization_group_size.name()),
RW_property(ov::hint::kv_cache_precision.name()),
RW_property(ov::hint::key_cache_group_size.name()),
RW_property(ov::hint::value_cache_group_size.name()),
};

ov::Core ie;
Expand Down

0 comments on commit 91ebc09

Please sign in to comment.