From 91ebc0999e86da1ca3e0a82d242fa31165f38946 Mon Sep 17 00:00:00 2001 From: Zhang Yi3 Date: Mon, 9 Dec 2024 19:23:55 -0800 Subject: [PATCH] [CPU]set group size via hint Signed-off-by: Zhang Yi3 --- .../openvino/runtime/properties/hint/__init__.py | 2 ++ .../src/pyopenvino/core/properties/properties.cpp | 2 ++ .../python/tests/test_runtime/test_properties.py | 10 ++++++++++ .../include/openvino/runtime/properties.hpp | 12 ++++++++++++ src/plugins/intel_cpu/src/compiled_model.cpp | 6 ++++++ src/plugins/intel_cpu/src/config.cpp | 15 +++++++++++++++ src/plugins/intel_cpu/src/config.h | 2 ++ .../src/nodes/kernels/scaled_attn/executor_pa.cpp | 4 ++-- src/plugins/intel_cpu/src/nodes/paged_attn.cpp | 14 ++++---------- .../behavior/ov_executable_network/properties.cpp | 2 ++ .../custom/behavior/ov_plugin/properties.cpp | 2 ++ 11 files changed, 59 insertions(+), 12 deletions(-) diff --git a/src/bindings/python/src/openvino/runtime/properties/hint/__init__.py b/src/bindings/python/src/openvino/runtime/properties/hint/__init__.py index d1dce289d09941..53eb5a76effdb4 100644 --- a/src/bindings/python/src/openvino/runtime/properties/hint/__init__.py +++ b/src/bindings/python/src/openvino/runtime/properties/hint/__init__.py @@ -23,4 +23,6 @@ from openvino._pyopenvino.properties.hint import allow_auto_batching from openvino._pyopenvino.properties.hint import dynamic_quantization_group_size from openvino._pyopenvino.properties.hint import kv_cache_precision +from openvino._pyopenvino.properties.hint import key_cache_group_size +from openvino._pyopenvino.properties.hint import value_cache_group_size from openvino._pyopenvino.properties.hint import activations_scale_factor diff --git a/src/bindings/python/src/pyopenvino/core/properties/properties.cpp b/src/bindings/python/src/pyopenvino/core/properties/properties.cpp index 564e5f69f5ee14..cec0aae9b07a21 100644 --- a/src/bindings/python/src/pyopenvino/core/properties/properties.cpp +++ 
b/src/bindings/python/src/pyopenvino/core/properties/properties.cpp @@ -101,6 +101,8 @@ void regmodule_properties(py::module m) { wrap_property_RW(m_hint, ov::hint::allow_auto_batching, "allow_auto_batching"); wrap_property_RW(m_hint, ov::hint::dynamic_quantization_group_size, "dynamic_quantization_group_size"); wrap_property_RW(m_hint, ov::hint::kv_cache_precision, "kv_cache_precision"); + wrap_property_RW(m_hint, ov::hint::key_cache_group_size, "key_cache_group_size"); + wrap_property_RW(m_hint, ov::hint::value_cache_group_size, "value_cache_group_size"); wrap_property_RW(m_hint, ov::hint::activations_scale_factor, "activations_scale_factor"); // Submodule intel_cpu diff --git a/src/bindings/python/tests/test_runtime/test_properties.py b/src/bindings/python/tests/test_runtime/test_properties.py index 6065d72196b44b..d2d95c32079bea 100644 --- a/src/bindings/python/tests/test_runtime/test_properties.py +++ b/src/bindings/python/tests/test_runtime/test_properties.py @@ -334,6 +334,16 @@ def test_properties_ro(ov_property_ro, expected_value): "DYNAMIC_QUANTIZATION_GROUP_SIZE", ((64, 64),), ), + ( + hints.key_cache_group_size, + "KEY_CACHE_GROUP_SIZE", + ((64, 64),), + ), + ( + hints.value_cache_group_size, + "VALUE_CACHE_GROUP_SIZE", + ((64, 64),), + ), (hints.kv_cache_precision, "KV_CACHE_PRECISION", ((Type.f32, Type.f32),)), ( hints.activations_scale_factor, diff --git a/src/inference/include/openvino/runtime/properties.hpp b/src/inference/include/openvino/runtime/properties.hpp index 5674c75dd546d7..e539b7e209fcb3 100644 --- a/src/inference/include/openvino/runtime/properties.hpp +++ b/src/inference/include/openvino/runtime/properties.hpp @@ -580,6 +580,18 @@ static constexpr Property dynamic_quantization */ static constexpr Property kv_cache_precision{"KV_CACHE_PRECISION"}; +/** + * @brief Hint for device to use group_size for key cache compression + * @ingroup ov_runtime_cpp_prop_api + */ +static constexpr Property key_cache_group_size{"KEY_CACHE_GROUP_SIZE"}; + 
+/** + * @brief Hint for device to use group_size for value cache compression + * @ingroup ov_runtime_cpp_prop_api + */ +static constexpr Property value_cache_group_size{"VALUE_CACHE_GROUP_SIZE"}; + /** * @brief This property scales down activations to prevent overflows when inference precision is f16. * @ingroup ov_runtime_cpp_prop_api diff --git a/src/plugins/intel_cpu/src/compiled_model.cpp b/src/plugins/intel_cpu/src/compiled_model.cpp index bbee5d937be5d5..2fd048cc3a05e0 100644 --- a/src/plugins/intel_cpu/src/compiled_model.cpp +++ b/src/plugins/intel_cpu/src/compiled_model.cpp @@ -256,6 +256,8 @@ ov::Any CompiledModel::get_property(const std::string& name) const { RO_property(ov::intel_cpu::sparse_weights_decompression_rate.name()), RO_property(ov::hint::dynamic_quantization_group_size.name()), RO_property(ov::hint::kv_cache_precision.name()), + RO_property(ov::hint::key_cache_group_size.name()), + RO_property(ov::hint::value_cache_group_size.name()), }; OPENVINO_SUPPRESS_DEPRECATED_START @@ -333,6 +335,10 @@ ov::Any CompiledModel::get_property(const std::string& name) const { config.fcDynamicQuantizationGroupSize); } else if (name == ov::hint::kv_cache_precision) { return decltype(ov::hint::kv_cache_precision)::value_type(config.kvCachePrecision); + } else if (name == ov::hint::key_cache_group_size) { + return decltype(ov::hint::key_cache_group_size)::value_type(config.keyCacheGroupSize); + } else if (name == ov::hint::value_cache_group_size) { + return decltype(ov::hint::value_cache_group_size)::value_type(config.valueCacheGroupSize); } OPENVINO_THROW("Unsupported property: ", name); } diff --git a/src/plugins/intel_cpu/src/config.cpp b/src/plugins/intel_cpu/src/config.cpp index 83e4ed1c99ea3d..a25401f12566fc 100644 --- a/src/plugins/intel_cpu/src/config.cpp +++ b/src/plugins/intel_cpu/src/config.cpp @@ -375,6 +375,21 @@ void Config::readProperties(const ov::AnyMap& prop, const ModelType modelType) { ov::hint::kv_cache_precision.name(), ". 
Supported values: u8, bf16, f16, f32"); } + } else if (key == ov::hint::key_cache_group_size.name() || key == ov::hint::value_cache_group_size.name()) { + try { + auto const groupSize = val.as(); + if (key == ov::hint::key_cache_group_size.name()) { + keyCacheGroupSize = groupSize; + } else { + valueCacheGroupSize = groupSize; + } + } catch (ov::Exception&) { + OPENVINO_THROW("Wrong value ", + val.as(), + " for property key ", + key, + ". Expected only unsigned integer numbers"); + } } else if (key == ov::cache_encryption_callbacks.name()) { try { auto encryption_callbacks = val.as(); diff --git a/src/plugins/intel_cpu/src/config.h b/src/plugins/intel_cpu/src/config.h index a8439d87803fd4..b6aeeaca38e0ee 100644 --- a/src/plugins/intel_cpu/src/config.h +++ b/src/plugins/intel_cpu/src/config.h @@ -64,6 +64,8 @@ struct Config { // TODO: Executor cache may leads to incorrect behavior on oneDNN ACL primitives size_t rtCacheCapacity = 0ul; #endif + size_t keyCacheGroupSize = 0ul; + size_t valueCacheGroupSize = 0ul; ov::threading::IStreamsExecutor::Config streamExecutorConfig; int streams = 1; bool streamsChanged = false; diff --git a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/executor_pa.cpp b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/executor_pa.cpp index 51b750a0ec11a0..f545cba7dd5097 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/executor_pa.cpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/executor_pa.cpp @@ -2137,8 +2137,8 @@ struct AttentionExecutor : public PagedAttentionExecutor { // u4 needs scale + zp. s4 needs scale. const size_t param_size = one_of(v_cache.get_precision(), ov::element::u4, ov::element::u8) ? sizeof(float) * 2 : sizeof(float); const size_t value_params_size = param_size * value_sub_byte_multiplyer; - size_t key_group_num = _key_group_size ? k_cache.size(3) / (_key_group_size + key_params_size) : _key_group_size; - size_t value_group_num = _value_group_size ? 
v_cache.size(3) / (_value_group_size + value_params_size) : _value_group_size; + size_t key_group_num = _key_group_size ? k_cache.size(3) / (_key_group_size + key_params_size) : 1; + size_t value_group_num = _value_group_size ? v_cache.size(3) / (_value_group_size + value_params_size) : 1; auto S = k_cache.size(3) - (k_cache.get_precision().is_real() ? 0 : key_params_size * key_group_num); auto SV = v_cache.size(3) - (v_cache.get_precision().is_real() ? 0 : value_params_size * value_group_num); auto block_size = k_cache.size(2); diff --git a/src/plugins/intel_cpu/src/nodes/paged_attn.cpp b/src/plugins/intel_cpu/src/nodes/paged_attn.cpp index 46cfb7ceee9ed2..41e7274953f9e6 100644 --- a/src/plugins/intel_cpu/src/nodes/paged_attn.cpp +++ b/src/plugins/intel_cpu/src/nodes/paged_attn.cpp @@ -132,16 +132,10 @@ void PagedAttention::createPrimitive() { // Since we are quantize only last dim it's safe to use the last dim of KV. auto kCachePrecision = getOriginalInputPrecisionAtPort(PagedAttentionExecutor::ID_KCACHE); auto vCachePrecision = getOriginalInputPrecisionAtPort(PagedAttentionExecutor::ID_VCACHE); - const auto keyDims = getInputShapeAtPort(PagedAttentionExecutor::ID_KCACHE).getDims(); - const auto valueDims = getInputShapeAtPort(PagedAttentionExecutor::ID_VCACHE).getDims(); - const auto keyS = *(keyDims.end() - 1); - const auto valueS = *(valueDims.end() - 1); - - size_t group_size = keyS; - if (getenv("GROUP_SIZE")) - group_size = std::stoi(std::string(getenv("GROUP_SIZE"))); - size_t key_group_size = group_size; - size_t value_group_size = group_size; + const auto& cpuConfig = context->getConfig(); + + size_t key_group_size = cpuConfig.keyCacheGroupSize; + size_t value_group_size = cpuConfig.valueCacheGroupSize; std::cout << "PagedAttn|Kcache|" << kCachePrecision << "|Vcache|" << vCachePrecision << "|key_group_size|" << key_group_size << "|value_group_size|" << value_group_size << std::endl; return make_pa_executor(rtPrecision, kCachePrecision, vCachePrecision, 
key_group_size, value_group_size); #else diff --git a/src/plugins/intel_cpu/tests/functional/custom/behavior/ov_executable_network/properties.cpp b/src/plugins/intel_cpu/tests/functional/custom/behavior/ov_executable_network/properties.cpp index 73086b78a0de95..29e5fbbe982542 100644 --- a/src/plugins/intel_cpu/tests/functional/custom/behavior/ov_executable_network/properties.cpp +++ b/src/plugins/intel_cpu/tests/functional/custom/behavior/ov_executable_network/properties.cpp @@ -41,6 +41,8 @@ TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkSupportedPropertiesAreAvailable RO_property(ov::intel_cpu::sparse_weights_decompression_rate.name()), RO_property(ov::hint::dynamic_quantization_group_size.name()), RO_property(ov::hint::kv_cache_precision.name()), + RO_property(ov::hint::key_cache_group_size.name()), + RO_property(ov::hint::value_cache_group_size.name()), }; ov::Core ie; diff --git a/src/plugins/intel_cpu/tests/functional/custom/behavior/ov_plugin/properties.cpp b/src/plugins/intel_cpu/tests/functional/custom/behavior/ov_plugin/properties.cpp index 904d2b81dc05b6..696f73f27e1142 100644 --- a/src/plugins/intel_cpu/tests/functional/custom/behavior/ov_plugin/properties.cpp +++ b/src/plugins/intel_cpu/tests/functional/custom/behavior/ov_plugin/properties.cpp @@ -56,6 +56,8 @@ TEST_F(OVClassConfigTestCPU, smoke_PluginAllSupportedPropertiesAreAvailable) { RW_property(ov::intel_cpu::sparse_weights_decompression_rate.name()), RW_property(ov::hint::dynamic_quantization_group_size.name()), RW_property(ov::hint::kv_cache_precision.name()), + RW_property(ov::hint::key_cache_group_size.name()), + RW_property(ov::hint::value_cache_group_size.name()), }; ov::Core ie;