From 9b70b92fbc21a799780c64295c8b098ff42fffad Mon Sep 17 00:00:00 2001
From: Aleksandr Voron
Date: Fri, 31 Oct 2025 13:18:03 +0100
Subject: [PATCH 1/6] init

---
 .../src/nodes/executors/acl/acl_conv.cpp      |  1 +
 .../executors/convolution_implementations.cpp | 32 ++++++++++---------
 .../src/nodes/executors/debug_messages.hpp    |  1 +
 3 files changed, 19 insertions(+), 15 deletions(-)

diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_conv.cpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_conv.cpp
index fd5f5ca4a238a3..b6941f5ca9f526 100644
--- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_conv.cpp
+++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_conv.cpp
@@ -94,6 +94,7 @@ bool ACLConvolutionExecutor::supports(const ConvConfig& config) {
                              config.descs.at(ARG_WEI)->getPrecision() == ov::element::i8;
 
     VERIFY(isQuantized, UNSUPPORTED_SRC_PRECISIONS);
+    VERIFY(config.descs.at(ARG_BIAS)->getPrecision() == ov::element::i32, UNSUPPORTED_BIAS_PRECISIONS);
     VERIFY(config.attrs.postOps.size() <= 1U, UNSUPPORTED_BY_EXECUTOR);
 
     return true;
diff --git a/src/plugins/intel_cpu/src/nodes/executors/convolution_implementations.cpp b/src/plugins/intel_cpu/src/nodes/executors/convolution_implementations.cpp
index 78a45bd10bb76f..60d993fac68133 100644
--- a/src/plugins/intel_cpu/src/nodes/executors/convolution_implementations.cpp
+++ b/src/plugins/intel_cpu/src/nodes/executors/convolution_implementations.cpp
@@ -59,11 +59,14 @@ static const TypeMapping dnnlConvTypeMapping {
     {{_f32, _half_float | _i8, _any, _any}, {bypass(), bypass(), use<0>(), use<0>()}},
     {{_bf16, _f16, _any, _any}, {bypass(), bypass(), use<0>(), use<0>()}},
     {{_f16, _bf16, _any, _any}, {bypass(), bypass(), use<0>(), use<0>()}},
-    // quantization configuration
+    // quantization configuration is not applicable for ARM
+    // because there is a dedicated low-precision implementation for ARM
+#if !defined(OPENVINO_ARCH_ARM64) && !defined(OPENVINO_ARCH_ARM)
     // int8 conv does not support f16 output and bias
     {{_u8 | _i8, _i8, _quant | _bf16 | _f32 | _i32 | _dynamic, _quant | _bf16 | _f32 | _i32 | _dynamic}, {bypass(), bypass(), bypass(), bypass()}},
     {{_u8 | _i8, _i8, _f16, _u8 | _i8 | _i32 | _bf16 | _f32}, {bypass(), bypass(), just<f32>(), bypass()}},
     {{_u8 | _i8, _i8, _any, _any}, {bypass(), bypass(), just<f32>(), just<f32>()}},
+#endif
     // @todo should we fallback to FPXX instead of _f32?
     {{_any, _any, _any, _any}, {just<f32>(), just<f32>(), just<f32>(), just<f32>()}},
     // @todo explicitly cover configuration limitations for oneDNN on ARM
@@ -70,8 +73,8 @@ static const TypeMapping dnnlConvTypeMapping {
 };
 static const TypeMapping aclLowpConvTypeMapping {
     // {src, wei, bia, dst}                       pt
-    {{_u8, _u8 | _i8, _any, _u8}, {bypass(), bypass(), just<i32>(), bypass()}},
-    {{_i8, _i8, _any, _i8}, {bypass(), bypass(), just<i32>(), bypass()}},
+    {{_u8, _u8 | _i8, _i32 | _dynamic, _u8}, {bypass(), bypass(), bypass(), bypass()}},
+    {{_i8, _i8, _i32 | _dynamic, _i8}, {bypass(), bypass(), bypass(), bypass()}},
 };
 // clang-format on
 struct CreateOptimalConfigDefault {
@@ -245,13 +248,23 @@ const std::vector<ExecutorImplementation<ConvAttrs>>& getImplementations() {
             AcceptsAnyShape,
             CreateDnnlDefault{}
         )
+        OV_CPU_INSTANCE_ACL(
+            "convolution_acl_lowp", ExecutorType::Acl, OperationType::Convolution,
+            // supports
+            [](const ConvConfig& config, [[maybe_unused]] const MemoryFormatFilter& memoryFormatFilter) -> bool {
+                VERIFY(ACLConvolutionExecutor::supports(config), UNSUPPORTED_BY_EXECUTOR);
+                return true;
+            },
+            CreateOptimalConfigAclLowp{{LayoutType::ncsp, LayoutType::ncsp, LayoutType::ncsp, LayoutType::ncsp}},
+            AcceptsAnyShape,
+            CreateDefault{}
+        )
         OV_CPU_INSTANCE_ACL(
             "convolution_dnnl_nspc_nspc_unconditional_acl", ExecutorType::Dnnl, OperationType::Convolution,
             // supports
             [](const ConvConfig& config, const MemoryFormatFilter& memoryFormatFilter) -> bool {
                 VERIFY(MatchesMemoryFormatFilter(config.descs, LayoutConfig{LayoutType::nspc, LayoutType::ncsp, LayoutType::nspc, LayoutType::nspc},
                                                  memoryFormatFilter, dnnlConvolutionMappingNotation), MEMORY_FORMAT_MISMATCH);
-                VERIFY(!isQuantized(config), UNSUPPORTED_SRC_PRECISIONS);
                 return true;
             },
             CreateOptimalConfigDefault{{LayoutType::nspc, LayoutType::ncsp, LayoutType::nspc, LayoutType::nspc}},
@@ -274,17 +287,6 @@ const std::vector<ExecutorImplementation<ConvAttrs>>& getImplementations() {
             AcceptsAnyShape,
             CreateDnnlDefault{}
         )
-        OV_CPU_INSTANCE_ACL(
-            "convolution_acl_lowp", ExecutorType::Acl, OperationType::Convolution,
-            // supports
-            [](const ConvConfig& config, [[maybe_unused]] const MemoryFormatFilter& memoryFormatFilter) -> bool {
-                VERIFY(ACLConvolutionExecutor::supports(config), UNSUPPORTED_BY_EXECUTOR);
-                return true;
-            },
-            CreateOptimalConfigAclLowp{{LayoutType::ncsp, LayoutType::ncsp, LayoutType::ncsp, LayoutType::ncsp}},
-            AcceptsAnyShape,
-            CreateDefault{}
-        )
     };
 
     return convolutionImplementations;
diff --git a/src/plugins/intel_cpu/src/nodes/executors/debug_messages.hpp b/src/plugins/intel_cpu/src/nodes/executors/debug_messages.hpp
index 578c176a60dd8a..c813dc318ba26f 100644
--- a/src/plugins/intel_cpu/src/nodes/executors/debug_messages.hpp
+++ b/src/plugins/intel_cpu/src/nodes/executors/debug_messages.hpp
@@ -11,6 +11,7 @@
 #define UNSUPPORTED_TYPE_OF_POSTOPS   " the type of post ops is not supported"
 #define UNSUPPORTED_SRC_PRECISIONS    " unsupported src precisions"
 #define UNSUPPORTED_WEI_PRECISIONS    " unsupported wei precisions"
+#define UNSUPPORTED_BIAS_PRECISIONS   " unsupported bias precisions"
 #define UNSUPPORTED_DST_PRECISIONS    " unsupported dst precisions"
 #define UNSUPPORTED_ISA               " unsupported isa"
 #define UNSUPPORTED_SRC_RANK          " unsupported src rank"
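Note on patch 1: besides the new bias check, it moves the "convolution_acl_lowp" entry above "convolution_dnnl_nspc_nspc_unconditional_acl" in getImplementations(). Below is a minimal sketch of why that ordering matters, assuming (as the move implies) that the first implementation whose supports() predicate passes is the one selected; Impl, Config, and the precision checks are simplified stand-ins, not the real OpenVINO types:

    #include <functional>
    #include <iostream>
    #include <string>
    #include <vector>

    enum class Precision { u8, i8, i32, f32 };

    struct Config {
        Precision src, wei, bia;
        bool withBias;
    };

    struct Impl {
        std::string name;
        std::function<bool(const Config&)> supports;
    };

    int main() {
        // Priority-ordered, mirroring the registration order after patch 1:
        // the ACL low-precision entry comes before the generic oneDNN-based one.
        const std::vector<Impl> impls = {
            {"convolution_acl_lowp",
             [](const Config& c) {
                 const bool quantized =
                     (c.src == Precision::u8 || c.src == Precision::i8) && c.wei == Precision::i8;
                 return quantized && (!c.withBias || c.bia == Precision::i32);  // i32 bias only
             }},
            {"convolution_dnnl_nspc_nspc_unconditional_acl",
             [](const Config&) { return true; }},  // permissive fallback
        };

        const Config int8Conv{Precision::u8, Precision::i8, Precision::i32, true};
        for (const auto& impl : impls) {
            if (impl.supports(int8Conv)) {
                std::cout << "selected: " << impl.name << '\n';  // prints convolution_acl_lowp
                break;
            }
        }
    }

With the entry at the bottom of the list, the permissive oneDNN implementation would match quantized configs first; after the move, the !isQuantized() guard on the oneDNN entry becomes redundant for this path, which is why patch 1 also drops it.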
From ed9480d82692cbe9d33ccd044ed234bf11901564 Mon Sep 17 00:00:00 2001
From: Aleksandr Voron
Date: Mon, 3 Nov 2025 18:49:50 +0100
Subject: [PATCH 2/6] disable FuseConvolutionAndZeroPoints on ARM

---
 src/plugins/intel_cpu/src/graph_optimizer.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/plugins/intel_cpu/src/graph_optimizer.cpp b/src/plugins/intel_cpu/src/graph_optimizer.cpp
index 9b7d4a573ddd32..cc68492eb26f43 100644
--- a/src/plugins/intel_cpu/src/graph_optimizer.cpp
+++ b/src/plugins/intel_cpu/src/graph_optimizer.cpp
@@ -919,6 +919,10 @@ void GraphOptimizer::FuseFCAndTransposeOnWeights(Graph& graph) {
 void GraphOptimizer::FuseConvolutionAndZeroPoints(Graph& graph) {
     const auto& graphNodes = graph.GetNodes();
 
+// zero points fusing is skipped on ARM platforms because oneDNN is not involved in int8 convolution inference
+#if defined(OPENVINO_ARCH_ARM) || defined(OPENVINO_ARCH_ARM64)
+    return;
+#endif
     auto isSuitableConvNode = [](const NodePtr& node) {
         bool retVal = false;

From 7e91ddc94c10c8db344f1f15ca432d66097ba55f Mon Sep 17 00:00:00 2001
From: Aleksandr Voron
Date: Mon, 3 Nov 2025 19:09:43 +0100
Subject: [PATCH 3/6] fix clang

---
 src/plugins/intel_cpu/src/graph_optimizer.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/plugins/intel_cpu/src/graph_optimizer.cpp b/src/plugins/intel_cpu/src/graph_optimizer.cpp
index cc68492eb26f43..85f5c41ebf5701 100644
--- a/src/plugins/intel_cpu/src/graph_optimizer.cpp
+++ b/src/plugins/intel_cpu/src/graph_optimizer.cpp
@@ -919,7 +919,7 @@ void GraphOptimizer::FuseFCAndTransposeOnWeights(Graph& graph) {
 void GraphOptimizer::FuseConvolutionAndZeroPoints(Graph& graph) {
     const auto& graphNodes = graph.GetNodes();
 
-// zero points fusing is skipped on ARM platforms because oneDNN is not involved in int8 convolution inference 
+// zero points fusing is skipped on ARM platforms because oneDNN is not involved in int8 convolution inference
 #if defined(OPENVINO_ARCH_ARM) || defined(OPENVINO_ARCH_ARM64)
     return;
 #endif
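Note on patches 2 and 3: the pass is disabled with an unconditional early return under the architecture guard rather than by compiling out the whole body. A standalone sketch of the pattern, with Graph as a stand-in for the CPU plugin's class:

    #include <iostream>

    struct Graph {};  // stand-in for the CPU plugin's Graph

    void FuseConvolutionAndZeroPoints(Graph& /*graph*/) {
    // zero points fusing is skipped on ARM platforms because oneDNN is not involved in int8 convolution inference
    #if defined(OPENVINO_ARCH_ARM) || defined(OPENVINO_ARCH_ARM64)
        return;
    #endif
        std::cout << "x86 zero-point fusing would run here\n";
    }

    int main() {
        Graph g;
        FuseConvolutionAndZeroPoints(g);  // no-op on ARM builds, active elsewhere
    }

One property of the early-return form: the rest of the function body is still parsed and compiled on ARM, so ARM builds keep type-checking the x86-only fusing code instead of hiding it behind an #else.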
From 6382808584056cc4ebe47118e534a1f72ed33285 Mon Sep 17 00:00:00 2001
From: Aleksandr Voron
Date: Tue, 4 Nov 2025 13:26:01 +0100
Subject: [PATCH 4/6] address Egor's comment

---
 .../src/nodes/executors/convolution_implementations.cpp | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/src/plugins/intel_cpu/src/nodes/executors/convolution_implementations.cpp b/src/plugins/intel_cpu/src/nodes/executors/convolution_implementations.cpp
index 60d993fac68133..40ba07eb06e765 100644
--- a/src/plugins/intel_cpu/src/nodes/executors/convolution_implementations.cpp
+++ b/src/plugins/intel_cpu/src/nodes/executors/convolution_implementations.cpp
@@ -59,14 +59,11 @@ static const TypeMapping dnnlConvTypeMapping {
     {{_f32, _half_float | _i8, _any, _any}, {bypass(), bypass(), use<0>(), use<0>()}},
     {{_bf16, _f16, _any, _any}, {bypass(), bypass(), use<0>(), use<0>()}},
     {{_f16, _bf16, _any, _any}, {bypass(), bypass(), use<0>(), use<0>()}},
-    // quantization configuration is not applicable for ARM
-    // because there is a dedicated low-precision implementation for ARM
-#if !defined(OPENVINO_ARCH_ARM64) && !defined(OPENVINO_ARCH_ARM)
+    // quantization configuration
     // int8 conv does not support f16 output and bias
     {{_u8 | _i8, _i8, _quant | _bf16 | _f32 | _i32 | _dynamic, _quant | _bf16 | _f32 | _i32 | _dynamic}, {bypass(), bypass(), bypass(), bypass()}},
     {{_u8 | _i8, _i8, _f16, _u8 | _i8 | _i32 | _bf16 | _f32}, {bypass(), bypass(), just<f32>(), bypass()}},
     {{_u8 | _i8, _i8, _any, _any}, {bypass(), bypass(), just<f32>(), just<f32>()}},
-#endif
     // @todo should we fallback to FPXX instead of _f32?
     {{_any, _any, _any, _any}, {just<f32>(), just<f32>(), just<f32>(), just<f32>()}},
     // @todo explicitly cover configuration limitations for oneDNN on ARM
@@ -262,6 +265,7 @@ const std::vector<ExecutorImplementation<ConvAttrs>>& getImplementations() {
             [](const ConvConfig& config, const MemoryFormatFilter& memoryFormatFilter) -> bool {
                 VERIFY(MatchesMemoryFormatFilter(config.descs, LayoutConfig{LayoutType::nspc, LayoutType::ncsp, LayoutType::nspc, LayoutType::nspc},
                                                  memoryFormatFilter, dnnlConvolutionMappingNotation), MEMORY_FORMAT_MISMATCH);
+                VERIFY(!isQuantized(config), UNSUPPORTED_SRC_PRECISIONS);
                 return true;
             },
             CreateOptimalConfigDefault{{LayoutType::nspc, LayoutType::ncsp, LayoutType::nspc, LayoutType::nspc}},

From 4b6e5b6ff1d209896b488c981a6cb727ed13dec4 Mon Sep 17 00:00:00 2001
From: Aleksandr Voron
Date: Tue, 4 Nov 2025 13:56:01 +0100
Subject: [PATCH 5/6] fix bias check

---
 src/plugins/intel_cpu/src/nodes/executors/acl/acl_conv.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_conv.cpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_conv.cpp
index b6941f5ca9f526..28be0a66da1159 100644
--- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_conv.cpp
+++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_conv.cpp
@@ -94,7 +94,9 @@ bool ACLConvolutionExecutor::supports(const ConvConfig& config) {
                              config.descs.at(ARG_WEI)->getPrecision() == ov::element::i8;
 
     VERIFY(isQuantized, UNSUPPORTED_SRC_PRECISIONS);
-    VERIFY(config.descs.at(ARG_BIAS)->getPrecision() == ov::element::i32, UNSUPPORTED_BIAS_PRECISIONS);
+    if (config.attrs.withBias) {
+        VERIFY(config.descs.at(ARG_BIAS)->getPrecision() == ov::element::i32, UNSUPPORTED_BIAS_PRECISIONS);
+    }
     VERIFY(config.attrs.postOps.size() <= 1U, UNSUPPORTED_BY_EXECUTOR);
 
     return true;
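Note on patch 5: patch 1's bias check dereferenced config.descs.at(ARG_BIAS) unconditionally, but a convolution without a bias input has no ARG_BIAS descriptor to look up. A sketch of the failure mode and the guarded form, assuming for illustration that descs behaves like an associative container keyed by argument id (the real descriptor map type differs):

    #include <iostream>
    #include <stdexcept>
    #include <unordered_map>

    enum Arg { ARG_SRC, ARG_WEI, ARG_BIAS };
    enum class Precision { i8, i32 };

    int main() {
        // A bias-free int8 convolution: only src and wei descriptors exist.
        const std::unordered_map<int, Precision> descs = {{ARG_SRC, Precision::i8},
                                                          {ARG_WEI, Precision::i8}};
        const bool withBias = descs.count(ARG_BIAS) != 0;

        try {
            // Unguarded check in the spirit of patch 1: looks up a missing key.
            const bool ok = descs.at(ARG_BIAS) == Precision::i32;
            (void)ok;
        } catch (const std::out_of_range&) {
            std::cout << "unguarded lookup failed: no ARG_BIAS descriptor\n";
        }

        // Guarded check in the spirit of patch 5: validate the bias only when present.
        const bool supported = !withBias || descs.at(ARG_BIAS) == Precision::i32;
        std::cout << "supported: " << std::boolalpha << supported << '\n';
    }

How the real descs reacts to a missing ARG_BIAS (throw, assert, or a dummy descriptor) depends on its implementation, but either way the unguarded lookup asks for a descriptor that a bias-free convolution never registered.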
From b78dde84802408285b1d4fb8fa441a52c96d52a0 Mon Sep 17 00:00:00 2001
From: Aleksandr Voron
Date: Tue, 4 Nov 2025 15:39:29 +0100
Subject: [PATCH 6/6] Revert "address Egor's comment"

This reverts commit 6382808584056cc4ebe47118e534a1f72ed33285.
---
 .../src/nodes/executors/convolution_implementations.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/plugins/intel_cpu/src/nodes/executors/convolution_implementations.cpp b/src/plugins/intel_cpu/src/nodes/executors/convolution_implementations.cpp
index 40ba07eb06e765..60d993fac68133 100644
--- a/src/plugins/intel_cpu/src/nodes/executors/convolution_implementations.cpp
+++ b/src/plugins/intel_cpu/src/nodes/executors/convolution_implementations.cpp
@@ -59,11 +59,14 @@ static const TypeMapping dnnlConvTypeMapping {
     {{_f32, _half_float | _i8, _any, _any}, {bypass(), bypass(), use<0>(), use<0>()}},
     {{_bf16, _f16, _any, _any}, {bypass(), bypass(), use<0>(), use<0>()}},
     {{_f16, _bf16, _any, _any}, {bypass(), bypass(), use<0>(), use<0>()}},
-    // quantization configuration
+    // quantization configuration is not applicable for ARM
+    // because there is a dedicated low-precision implementation for ARM
+#if !defined(OPENVINO_ARCH_ARM64) && !defined(OPENVINO_ARCH_ARM)
     // int8 conv does not support f16 output and bias
     {{_u8 | _i8, _i8, _quant | _bf16 | _f32 | _i32 | _dynamic, _quant | _bf16 | _f32 | _i32 | _dynamic}, {bypass(), bypass(), bypass(), bypass()}},
     {{_u8 | _i8, _i8, _f16, _u8 | _i8 | _i32 | _bf16 | _f32}, {bypass(), bypass(), just<f32>(), bypass()}},
     {{_u8 | _i8, _i8, _any, _any}, {bypass(), bypass(), just<f32>(), just<f32>()}},
+#endif
     // @todo should we fallback to FPXX instead of _f32?
     {{_any, _any, _any, _any}, {just<f32>(), just<f32>(), just<f32>(), just<f32>()}},
     // @todo explicitly cover configuration limitations for oneDNN on ARM
@@ -262,7 +265,6 @@ const std::vector<ExecutorImplementation<ConvAttrs>>& getImplementations() {
             [](const ConvConfig& config, const MemoryFormatFilter& memoryFormatFilter) -> bool {
                 VERIFY(MatchesMemoryFormatFilter(config.descs, LayoutConfig{LayoutType::nspc, LayoutType::ncsp, LayoutType::nspc, LayoutType::nspc},
                                                  memoryFormatFilter, dnnlConvolutionMappingNotation), MEMORY_FORMAT_MISMATCH);
-                VERIFY(!isQuantized(config), UNSUPPORTED_SRC_PRECISIONS);
                 return true;
             },
             CreateOptimalConfigDefault{{LayoutType::nspc, LayoutType::ncsp, LayoutType::nspc, LayoutType::nspc}},
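Note on patch 6: the revert settles the question patch 4 raised. Quantized configs can be kept away from the oneDNN path either at runtime, with VERIFY(!isQuantized(config), ...) in the implementation's supports() predicate (patch 4), or at compile time, by excluding the int8 rows from dnnlConvTypeMapping on ARM (patches 1 and 6). A sketch of the compile-time form with a stand-in table:

    #include <iostream>
    #include <vector>

    struct Row {
        bool int8Row;
        const char* note;
    };

    // Stand-in for dnnlConvTypeMapping: on ARM builds the quantized rows are
    // compiled out, mirroring the guard restored by patch 6.
    static const std::vector<Row> dnnlConvTypeMapping = {
        {false, "floating-point rows"},
    #if !defined(OPENVINO_ARCH_ARM64) && !defined(OPENVINO_ARCH_ARM)
        {true, "int8 rows"},
    #endif
        {false, "catch-all fallback row"},
    };

    int main() {
        // 3 rows on x86 builds, 2 on ARM builds: a quantized config can never
        // match an int8 row on ARM, so no per-config runtime check is needed.
        std::cout << dnnlConvTypeMapping.size() << " rows\n";
    }

The compile-time form keeps the type-mapping table itself authoritative about what the oneDNN path can see, at the cost of an #if in shared code; the runtime form keeps the table portable but spreads the ARM special case into the predicates.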