diff --git a/src/bindings/python/src/openvino/runtime/properties/hint/__init__.py b/src/bindings/python/src/openvino/runtime/properties/hint/__init__.py index 53eb5a76effdb4..d5c5d5595e5e0b 100644 --- a/src/bindings/python/src/openvino/runtime/properties/hint/__init__.py +++ b/src/bindings/python/src/openvino/runtime/properties/hint/__init__.py @@ -23,6 +23,8 @@ from openvino._pyopenvino.properties.hint import allow_auto_batching from openvino._pyopenvino.properties.hint import dynamic_quantization_group_size from openvino._pyopenvino.properties.hint import kv_cache_precision +from openvino._pyopenvino.properties.hint import key_cache_precision +from openvino._pyopenvino.properties.hint import value_cache_precision from openvino._pyopenvino.properties.hint import key_cache_group_size from openvino._pyopenvino.properties.hint import value_cache_group_size from openvino._pyopenvino.properties.hint import activations_scale_factor diff --git a/src/bindings/python/src/pyopenvino/core/properties/properties.cpp b/src/bindings/python/src/pyopenvino/core/properties/properties.cpp index cec0aae9b07a21..2b997c6664cee0 100644 --- a/src/bindings/python/src/pyopenvino/core/properties/properties.cpp +++ b/src/bindings/python/src/pyopenvino/core/properties/properties.cpp @@ -101,6 +101,8 @@ void regmodule_properties(py::module m) { wrap_property_RW(m_hint, ov::hint::allow_auto_batching, "allow_auto_batching"); wrap_property_RW(m_hint, ov::hint::dynamic_quantization_group_size, "dynamic_quantization_group_size"); wrap_property_RW(m_hint, ov::hint::kv_cache_precision, "kv_cache_precision"); + wrap_property_RW(m_hint, ov::hint::key_cache_precision, "key_cache_precision"); + wrap_property_RW(m_hint, ov::hint::value_cache_precision, "value_cache_precision"); wrap_property_RW(m_hint, ov::hint::key_cache_group_size, "key_cache_group_size"); wrap_property_RW(m_hint, ov::hint::value_cache_group_size, "value_cache_group_size"); wrap_property_RW(m_hint, ov::hint::activations_scale_factor, "activations_scale_factor"); diff --git a/src/bindings/python/tests/test_runtime/test_properties.py b/src/bindings/python/tests/test_runtime/test_properties.py index d2d95c32079bea..d0745f84361310 100644 --- a/src/bindings/python/tests/test_runtime/test_properties.py +++ b/src/bindings/python/tests/test_runtime/test_properties.py @@ -345,6 +345,8 @@ def test_properties_ro(ov_property_ro, expected_value): ((64, 64),), ), (hints.kv_cache_precision, "KV_CACHE_PRECISION", ((Type.f32, Type.f32),)), + (hints.key_cache_precision, "KEY_CACHE_PRECISION", ((Type.f32, Type.f32),)), + (hints.value_cache_precision, "VALUE_CACHE_PRECISION", ((Type.f32, Type.f32),)), ( hints.activations_scale_factor, "ACTIVATIONS_SCALE_FACTOR", diff --git a/src/inference/include/openvino/runtime/properties.hpp b/src/inference/include/openvino/runtime/properties.hpp index e539b7e209fcb3..caff66750029fc 100644 --- a/src/inference/include/openvino/runtime/properties.hpp +++ b/src/inference/include/openvino/runtime/properties.hpp @@ -580,6 +580,18 @@ static constexpr Property dynamic_quantization */ static constexpr Property kv_cache_precision{"KV_CACHE_PRECISION"}; +/** + * @brief Hint for device to use specified precision for key cache compression + * @ingroup ov_runtime_cpp_prop_api + */ +static constexpr Property key_cache_precision{"KEY_CACHE_PRECISION"}; + +/** + * @brief Hint for device to use specified precision for value cache compression + * @ingroup ov_runtime_cpp_prop_api + */ +static constexpr Property value_cache_precision{"VALUE_CACHE_PRECISION"}; + /** * @brief Hint for device to use group_size for key cache compression * @ingroup ov_runtime_cpp_prop_api diff --git a/src/plugins/intel_cpu/src/compiled_model.cpp b/src/plugins/intel_cpu/src/compiled_model.cpp index 2fd048cc3a05e0..275fd0dbfff755 100644 --- a/src/plugins/intel_cpu/src/compiled_model.cpp +++ b/src/plugins/intel_cpu/src/compiled_model.cpp @@ -256,6 +256,8 @@ ov::Any CompiledModel::get_property(const std::string& name) const { RO_property(ov::intel_cpu::sparse_weights_decompression_rate.name()), RO_property(ov::hint::dynamic_quantization_group_size.name()), RO_property(ov::hint::kv_cache_precision.name()), + RO_property(ov::hint::key_cache_precision.name()), + RO_property(ov::hint::value_cache_precision.name()), RO_property(ov::hint::key_cache_group_size.name()), RO_property(ov::hint::value_cache_group_size.name()), }; @@ -335,6 +337,10 @@ ov::Any CompiledModel::get_property(const std::string& name) const { config.fcDynamicQuantizationGroupSize); } else if (name == ov::hint::kv_cache_precision) { return decltype(ov::hint::kv_cache_precision)::value_type(config.kvCachePrecision); + } else if (name == ov::hint::key_cache_precision) { + return decltype(ov::hint::key_cache_precision)::value_type(config.keyCachePrecision); + } else if (name == ov::hint::value_cache_precision) { + return decltype(ov::hint::value_cache_precision)::value_type(config.valueCachePrecision); } else if (name == ov::hint::key_cache_group_size) { return decltype(ov::hint::key_cache_group_size)::value_type(config.keyCacheGroupSize); } else if (name == ov::hint::value_cache_group_size) { diff --git a/src/plugins/intel_cpu/src/config.cpp b/src/plugins/intel_cpu/src/config.cpp index 257dee95546e34..32653626daa981 100644 --- a/src/plugins/intel_cpu/src/config.cpp +++ b/src/plugins/intel_cpu/src/config.cpp @@ -375,6 +375,26 @@ void Config::readProperties(const ov::AnyMap& prop, const ModelType modelType) { ov::hint::kv_cache_precision.name(), ". Supported values: u8, bf16, f16, f32"); } + } else if (key == ov::hint::key_cache_precision.name() || key == ov::hint::value_cache_precision.name()) { + try { + kvCachePrecisionSetExplicitly = true; + auto const prec = val.as(); + if (key == ov::hint::key_cache_precision.name()) { + if (one_of(prec, ov::element::f32, ov::element::f16, ov::element::bf16, ov::element::u8)) { + keyCachePrecision = prec; + } else { + OPENVINO_THROW("keyCachePrecision doesn't support value ", prec); + } + } else { + if (one_of(prec, ov::element::f32, ov::element::f16, ov::element::bf16, ov::element::u8, ov::element::u4, ov::element::i4)) { + valueCachePrecision = prec; + } else { + OPENVINO_THROW("valueCachePrecision doesn't support value ", prec); + } + } + } catch (ov::Exception&) { + + } } else if (key == ov::hint::key_cache_group_size.name() || key == ov::hint::value_cache_group_size.name()) { try { auto const groupSize = val.as(); @@ -432,6 +452,8 @@ void Config::readProperties(const ov::AnyMap& prop, const ModelType modelType) { } if (!kvCachePrecisionSetExplicitly) { kvCachePrecision = ov::element::f32; + valueCachePrecision = ov::element::f32; + keyCachePrecision = ov::element::f32; } } diff --git a/src/plugins/intel_cpu/src/config.h b/src/plugins/intel_cpu/src/config.h index b6aeeaca38e0ee..bcde841814d09c 100644 --- a/src/plugins/intel_cpu/src/config.h +++ b/src/plugins/intel_cpu/src/config.h @@ -58,9 +58,13 @@ struct Config { #endif #if defined(OPENVINO_ARCH_X86_64) ov::element::Type kvCachePrecision = ov::element::u8; + ov::element::Type keyCachePrecision = ov::element::u8; + ov::element::Type valueCachePrecision = ov::element::u8; size_t rtCacheCapacity = 5000ul; #else ov::element::Type kvCachePrecision = ov::element::f16; + ov::element::Type keyCachePrecision = ov::element::f16; + ov::element::Type valueCachePrecision = ov::element::f16; // TODO: Executor cache may leads to incorrect behavior on oneDNN ACL primitives size_t rtCacheCapacity = 0ul; #endif diff --git a/src/plugins/intel_cpu/src/plugin.cpp b/src/plugins/intel_cpu/src/plugin.cpp index 6fdbf7a4ea4dee..f16f504ee2f5a0 100644 --- a/src/plugins/intel_cpu/src/plugin.cpp +++ b/src/plugins/intel_cpu/src/plugin.cpp @@ -390,6 +390,14 @@ ov::Any Plugin::get_property(const std::string& name, const ov::AnyMap& options) engConfig.fcDynamicQuantizationGroupSize); } else if (name == ov::hint::kv_cache_precision) { return decltype(ov::hint::kv_cache_precision)::value_type(engConfig.kvCachePrecision); + } else if (name == ov::hint::key_cache_precision) { + return decltype(ov::hint::key_cache_precision)::value_type(engConfig.keyCachePrecision); + } else if (name == ov::hint::value_cache_precision) { + return decltype(ov::hint::value_cache_precision)::value_type(engConfig.valueCachePrecision); + } else if (name == ov::hint::key_cache_group_size) { + return decltype(ov::hint::key_cache_group_size)::value_type(engConfig.keyCacheGroupSize); + } else if (name == ov::hint::value_cache_group_size) { + return decltype(ov::hint::value_cache_group_size)::value_type(engConfig.valueCacheGroupSize); } return get_ro_property(name, options); } @@ -433,6 +441,10 @@ ov::Any Plugin::get_ro_property(const std::string& name, const ov::AnyMap& optio RW_property(ov::intel_cpu::sparse_weights_decompression_rate.name()), RW_property(ov::hint::dynamic_quantization_group_size.name()), RW_property(ov::hint::kv_cache_precision.name()), + RW_property(ov::hint::key_cache_precision.name()), + RW_property(ov::hint::value_cache_precision.name()), + RW_property(ov::hint::key_cache_group_size.name()), + RW_property(ov::hint::value_cache_group_size.name()), }; OPENVINO_SUPPRESS_DEPRECATED_START diff --git a/src/plugins/intel_cpu/tests/functional/custom/behavior/ov_executable_network/properties.cpp b/src/plugins/intel_cpu/tests/functional/custom/behavior/ov_executable_network/properties.cpp index 29e5fbbe982542..59fd31cdb34303 100644 --- a/src/plugins/intel_cpu/tests/functional/custom/behavior/ov_executable_network/properties.cpp +++ b/src/plugins/intel_cpu/tests/functional/custom/behavior/ov_executable_network/properties.cpp @@ -41,6 +41,8 @@ TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkSupportedPropertiesAreAvailable RO_property(ov::intel_cpu::sparse_weights_decompression_rate.name()), RO_property(ov::hint::dynamic_quantization_group_size.name()), RO_property(ov::hint::kv_cache_precision.name()), + RO_property(ov::hint::key_cache_precision.name()), + RO_property(ov::hint::value_cache_precision.name()), RO_property(ov::hint::key_cache_group_size.name()), RO_property(ov::hint::value_cache_group_size.name()), }; diff --git a/src/plugins/intel_cpu/tests/functional/custom/behavior/ov_plugin/properties.cpp b/src/plugins/intel_cpu/tests/functional/custom/behavior/ov_plugin/properties.cpp index 696f73f27e1142..589f0641eae0e8 100644 --- a/src/plugins/intel_cpu/tests/functional/custom/behavior/ov_plugin/properties.cpp +++ b/src/plugins/intel_cpu/tests/functional/custom/behavior/ov_plugin/properties.cpp @@ -56,6 +56,8 @@ TEST_F(OVClassConfigTestCPU, smoke_PluginAllSupportedPropertiesAreAvailable) { RW_property(ov::intel_cpu::sparse_weights_decompression_rate.name()), RW_property(ov::hint::dynamic_quantization_group_size.name()), RW_property(ov::hint::kv_cache_precision.name()), + RW_property(ov::hint::key_cache_precision.name()), + RW_property(ov::hint::value_cache_precision.name()), RW_property(ov::hint::key_cache_group_size.name()), RW_property(ov::hint::value_cache_group_size.name()), };