[CPU] Set key/value cache group size via hint

Signed-off-by: Zhang Yi3 <[email protected]>
openvinotoolkit · Dec 10, 2024 · 91ebc09 · 91ebc09
1 parent 92e6cb3
commit 91ebc09
Show file tree

Hide file tree

Showing 11 changed files with 59 additions and 12 deletions.
diff --git a/src/bindings/python/src/openvino/runtime/properties/hint/__init__.py b/src/bindings/python/src/openvino/runtime/properties/hint/__init__.py
@@ -23,4 +23,6 @@
 from openvino._pyopenvino.properties.hint import allow_auto_batching
 from openvino._pyopenvino.properties.hint import dynamic_quantization_group_size
 from openvino._pyopenvino.properties.hint import kv_cache_precision
+from openvino._pyopenvino.properties.hint import key_cache_group_size
+from openvino._pyopenvino.properties.hint import value_cache_group_size
 from openvino._pyopenvino.properties.hint import activations_scale_factor
diff --git a/src/bindings/python/src/pyopenvino/core/properties/properties.cpp b/src/bindings/python/src/pyopenvino/core/properties/properties.cpp
@@ -101,6 +101,8 @@ void regmodule_properties(py::module m) {
     wrap_property_RW(m_hint, ov::hint::allow_auto_batching, "allow_auto_batching");
     wrap_property_RW(m_hint, ov::hint::dynamic_quantization_group_size, "dynamic_quantization_group_size");
     wrap_property_RW(m_hint, ov::hint::kv_cache_precision, "kv_cache_precision");
+    wrap_property_RW(m_hint, ov::hint::key_cache_group_size, "key_cache_group_size");
+    wrap_property_RW(m_hint, ov::hint::value_cache_group_size, "value_cache_group_size");
     wrap_property_RW(m_hint, ov::hint::activations_scale_factor, "activations_scale_factor");
 
     // Submodule intel_cpu

diff --git a/src/bindings/python/tests/test_runtime/test_properties.py b/src/bindings/python/tests/test_runtime/test_properties.py
@@ -334,6 +334,16 @@ def test_properties_ro(ov_property_ro, expected_value):
             "DYNAMIC_QUANTIZATION_GROUP_SIZE",
             ((64, 64),),
         ),
+        (
+            hints.key_cache_group_size,
+            "KEY_CACHE_GROUP_SIZE",
+            ((64, 64),),
+        ),
+        (
+            hints.value_cache_group_size,
+            "VALUE_CACHE_GROUP_SIZE",
+            ((64, 64),),
+        ),
         (hints.kv_cache_precision, "KV_CACHE_PRECISION", ((Type.f32, Type.f32),)),
         (
             hints.activations_scale_factor,

diff --git a/src/inference/include/openvino/runtime/properties.hpp b/src/inference/include/openvino/runtime/properties.hpp
@@ -580,6 +580,18 @@ static constexpr Property<uint64_t, PropertyMutability::RW> dynamic_quantization
  */
 static constexpr Property<element::Type, PropertyMutability::RW> kv_cache_precision{"KV_CACHE_PRECISION"};
 
+/**
+ * @brief Hint for the device to use the given group size (uint64_t) for key cache compression.
+ * @ingroup ov_runtime_cpp_prop_api
+ */
+static constexpr Property<uint64_t, PropertyMutability::RW> key_cache_group_size{"KEY_CACHE_GROUP_SIZE"};
+
+/**
+ * @brief Hint for the device to use the given group size (uint64_t) for value cache compression.
+ * @ingroup ov_runtime_cpp_prop_api
+ */
+static constexpr Property<uint64_t, PropertyMutability::RW> value_cache_group_size{"VALUE_CACHE_GROUP_SIZE"};
+
 /**
  * @brief This property scales down activations to prevent overflows when inference precision is f16.
  * @ingroup ov_runtime_cpp_prop_api

diff --git a/src/plugins/intel_cpu/src/compiled_model.cpp b/src/plugins/intel_cpu/src/compiled_model.cpp
@@ -256,6 +256,8 @@ ov::Any CompiledModel::get_property(const std::string& name) const {
             RO_property(ov::intel_cpu::sparse_weights_decompression_rate.name()),
             RO_property(ov::hint::dynamic_quantization_group_size.name()),
             RO_property(ov::hint::kv_cache_precision.name()),
+            RO_property(ov::hint::key_cache_group_size.name()),
+            RO_property(ov::hint::value_cache_group_size.name()),
         };
 
         OPENVINO_SUPPRESS_DEPRECATED_START
@@ -333,6 +335,10 @@ ov::Any CompiledModel::get_property(const std::string& name) const {
             config.fcDynamicQuantizationGroupSize);
     } else if (name == ov::hint::kv_cache_precision) {
         return decltype(ov::hint::kv_cache_precision)::value_type(config.kvCachePrecision);
+    } else if (name == ov::hint::key_cache_group_size) {
+        return decltype(ov::hint::key_cache_group_size)::value_type(config.keyCacheGroupSize);
+    } else if (name == ov::hint::value_cache_group_size) {
+        return decltype(ov::hint::value_cache_group_size)::value_type(config.valueCacheGroupSize);
     }
     OPENVINO_THROW("Unsupported property: ", name);
 }

diff --git a/src/plugins/intel_cpu/src/config.cpp b/src/plugins/intel_cpu/src/config.cpp
@@ -375,6 +375,21 @@ void Config::readProperties(const ov::AnyMap& prop, const ModelType modelType) {
                                ov::hint::kv_cache_precision.name(),
                                ". Supported values: u8, bf16, f16, f32");
             }
+        } else if (key == ov::hint::key_cache_group_size.name() || key == ov::hint::value_cache_group_size.name()) {
+            try {  // both hints are parsed identically; only the destination field differs
+                const auto groupSize = val.as<uint64_t>();
+                if (key == ov::hint::key_cache_group_size.name()) {
+                    keyCacheGroupSize = groupSize;
+                } else {
+                    valueCacheGroupSize = groupSize;
+                }
+            } catch (const ov::Exception&) {  // conversion failed: report the offending key and value
+                OPENVINO_THROW("Wrong value ",
+                               val.as<std::string>(),
+                               " for property key ",
+                               key,
+                               ". Expected only unsigned integer numbers");
+            }
         } else if (key == ov::cache_encryption_callbacks.name()) {
             try {
                 auto encryption_callbacks = val.as<EncryptionCallbacks>();

diff --git a/src/plugins/intel_cpu/src/config.h b/src/plugins/intel_cpu/src/config.h
@@ -64,6 +64,8 @@ struct Config {
     // TODO: Executor cache may leads to incorrect behavior on oneDNN ACL primitives
     size_t rtCacheCapacity = 0ul;
 #endif
+    size_t keyCacheGroupSize = 0ul;
+    size_t valueCacheGroupSize = 0ul;
     ov::threading::IStreamsExecutor::Config streamExecutorConfig;
     int streams = 1;
     bool streamsChanged = false;

diff --git a/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/executor_pa.cpp b/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/executor_pa.cpp
@@ -2137,8 +2137,8 @@ struct AttentionExecutor : public PagedAttentionExecutor {
         // u4 needs scale + zp. s4 needs scale.
         const size_t param_size = one_of(v_cache.get_precision(), ov::element::u4, ov::element::u8) ? sizeof(float) * 2 : sizeof(float);
         const size_t value_params_size = param_size * value_sub_byte_multiplyer;
-        size_t key_group_num = _key_group_size ? k_cache.size(3) / (_key_group_size + key_params_size) : _key_group_size;
-        size_t value_group_num = _value_group_size ? v_cache.size(3) / (_value_group_size + value_params_size) : _value_group_size;
+        size_t key_group_num = _key_group_size ? k_cache.size(3) / (_key_group_size + key_params_size) : 1;
+        size_t value_group_num = _value_group_size ? v_cache.size(3) / (_value_group_size + value_params_size) : 1;
         auto S = k_cache.size(3) - (k_cache.get_precision().is_real() ? 0 : key_params_size * key_group_num);
         auto SV = v_cache.size(3) - (v_cache.get_precision().is_real() ? 0 : value_params_size * value_group_num);
         auto block_size = k_cache.size(2);

diff --git a/src/plugins/intel_cpu/src/nodes/paged_attn.cpp b/src/plugins/intel_cpu/src/nodes/paged_attn.cpp
@@ -132,16 +132,10 @@ void PagedAttention::createPrimitive() {
         // Since we are quantize only last dim it's safe to use the last dim of KV.
         auto kCachePrecision = getOriginalInputPrecisionAtPort(PagedAttentionExecutor::ID_KCACHE);
         auto vCachePrecision = getOriginalInputPrecisionAtPort(PagedAttentionExecutor::ID_VCACHE);
-        const auto keyDims = getInputShapeAtPort(PagedAttentionExecutor::ID_KCACHE).getDims();
-        const auto valueDims = getInputShapeAtPort(PagedAttentionExecutor::ID_VCACHE).getDims();
-        const auto keyS = *(keyDims.end() - 1);
-        const auto valueS = *(valueDims.end() - 1);
-
-        size_t group_size = keyS;
-        if (getenv("GROUP_SIZE"))
-            group_size = std::stoi(std::string(getenv("GROUP_SIZE")));
-        size_t key_group_size = group_size;
-        size_t value_group_size = group_size; 
+        const auto cpuConfig = context->getConfig();
+
+        size_t key_group_size = cpuConfig.keyCacheGroupSize;
+        size_t value_group_size = cpuConfig.valueCacheGroupSize; 
         std::cout << "PagedAttn|Kcache|" << kCachePrecision << "|Vcache|" << vCachePrecision << "|key_group_size|" << key_group_size << "|value_group_size|" << value_group_size << std::endl;
         return make_pa_executor(rtPrecision, kCachePrecision, vCachePrecision, key_group_size, value_group_size);
 #else

diff --git a/src/plugins/intel_cpu/tests/functional/custom/behavior/ov_executable_network/properties.cpp b/src/plugins/intel_cpu/tests/functional/custom/behavior/ov_executable_network/properties.cpp
@@ -41,6 +41,8 @@ TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkSupportedPropertiesAreAvailable
         RO_property(ov::intel_cpu::sparse_weights_decompression_rate.name()),
         RO_property(ov::hint::dynamic_quantization_group_size.name()),
         RO_property(ov::hint::kv_cache_precision.name()),
+        RO_property(ov::hint::key_cache_group_size.name()),
+        RO_property(ov::hint::value_cache_group_size.name()),
     };
 
     ov::Core ie;

diff --git a/src/plugins/intel_cpu/tests/functional/custom/behavior/ov_plugin/properties.cpp b/src/plugins/intel_cpu/tests/functional/custom/behavior/ov_plugin/properties.cpp
@@ -56,6 +56,8 @@ TEST_F(OVClassConfigTestCPU, smoke_PluginAllSupportedPropertiesAreAvailable) {
         RW_property(ov::intel_cpu::sparse_weights_decompression_rate.name()),
         RW_property(ov::hint::dynamic_quantization_group_size.name()),
         RW_property(ov::hint::kv_cache_precision.name()),
+        RW_property(ov::hint::key_cache_group_size.name()),
+        RW_property(ov::hint::value_cache_group_size.name()),
     };
 
     ov::Core ie;