[GPU] activations scaling to resolve accuracy issues for infer precision of f16 (#27265)

### Details:
- When a model runs at an inference precision of f16, it may produce
incorrect results because of the limited numeric range of f16.
- This PR avoids overflow during computation by scaling down
activations, so that correct results are obtained when the inference
precision is f16.
- A new config property "ACTIVATIONS_SCALE_FACTOR" is introduced, which
holds a single floating-point value. For example, if it is set to 64,
activations are divided by 64 before Convolution and MatMul. If it is
smaller than 0, the feature is disabled.
   - This property can also be set via the rt_info of a model, as shown below.
```xml
    <rt_info>
        <runtime_options>
            <ACTIVATIONS_SCALE_FACTOR value="8.0" />
        </runtime_options>
    </rt_info>
``` 
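For reference, the same factor can also be passed through the runtime API. Below is a minimal sketch (the model path, device name, and factor value are placeholders; it assumes a build that includes this PR):

```cpp
#include "openvino/openvino.hpp"

int main() {
    ov::Core core;
    // Placeholder IR path; any model affected by f16 overflow applies.
    auto model = core.read_model("model.xml");

    // Request f16 inference precision and scale activations down by 64
    // before Convolution/MatMul (they are scaled back up afterwards).
    auto compiled = core.compile_model(model,
                                       "GPU",
                                       ov::hint::inference_precision(ov::element::f16),
                                       ov::hint::activations_scale_factor(64.0f));
    return 0;
}
```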

### Tickets:
 - 147052

---------

Co-authored-by: Andrew Park <[email protected]>
e-ddykim and andrew-k-park authored Jan 14, 2025
1 parent d63a9df commit cc67ad1
Showing 16 changed files with 1,035 additions and 52 deletions.
```diff
@@ -252,11 +252,13 @@ class LP_TRANSFORMATIONS_API LayerTransformation : public ov::pass::MatcherPass
             element::Type deqPrecision = element::f32,
             const std::vector<ov::element::Type> defaultPrecisions =
                 { ov::element::u8, ov::element::i8 },
-            const bool reshapeIgnorePerTensorQuantizationCheck = false) :
+            const bool reshapeIgnorePerTensorQuantizationCheck = false,
+            const bool scalingMode = false) :
             updatePrecisions(updatePrecisions),
             deqPrecision(deqPrecision),
             defaultPrecisions(defaultPrecisions),
-            reshapeIgnorePerTensorQuantizationCheck(reshapeIgnorePerTensorQuantizationCheck) {}
+            reshapeIgnorePerTensorQuantizationCheck(reshapeIgnorePerTensorQuantizationCheck),
+            scalingMode(scalingMode) {}

         Params& setUpdatePrecisions(const bool updatePrecisions) {
             this->updatePrecisions = updatePrecisions;
@@ -281,6 +283,8 @@ class LP_TRANSFORMATIONS_API LayerTransformation : public ov::pass::MatcherPass
         std::vector<ov::element::Type> defaultPrecisions;
         // to support GPU workarround to keep Reshape and MatMul in FP32
         bool reshapeIgnorePerTensorQuantizationCheck;
+        // to support Activations Scaling
+        bool scalingMode;
     };

     class PrecisionDetails {
@@ -352,6 +356,7 @@ class LP_TRANSFORMATIONS_API LayerTransformation : public ov::pass::MatcherPass
     element::Type deqPrecision;
     std::vector<ov::element::Type> defaultPrecisions;
     bool reshapeIgnorePerTensorQuantizationCheck;
+    bool scalingMode;

     static constexpr char originalLayerPostfix[] = "_original";
     TransformationContext* context;
```
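As a quick illustration of the new flag (a hypothetical sketch, not code from this PR, assuming the usual `ov::pass::low_precision` namespace and include path), `scalingMode` is simply the new trailing constructor argument of `Params`:

```cpp
#include "low_precision/layer_transformation.hpp"  // assumed include path

using Params = ov::pass::low_precision::LayerTransformation::Params;

// All arguments keep their defaults from the header above; the trailing
// `true` opts the transformation pipeline into the new scaling mode.
Params params(/*updatePrecisions=*/true,
              /*deqPrecision=*/ov::element::f32,
              /*defaultPrecisions=*/{ov::element::u8, ov::element::i8},
              /*reshapeIgnorePerTensorQuantizationCheck=*/false,
              /*scalingMode=*/true);
```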
13 changes: 7 additions & 6 deletions src/common/low_precision_transformations/src/add.cpp
```diff
@@ -214,14 +214,15 @@ bool AddTransformation::transform(TransformationContext& context, ov::pass::patt
                 newSubtractFullPathValues),
             newMultiplyFullPathValues);

+        auto output_type = scalingMode ? add->get_output_element_type(0) : element::f32;
         newAddOrSubtract = std::make_shared<ov::op::TypeRelaxed<ov::opset1::Add>>(
-            std::vector<element::Type>{element::f32, element::f32}, std::vector<element::Type>{ element::f32 },
-            ov::op::TemporaryReplaceOutputType(inputs[0], element::f32).get(),
-            ov::op::TemporaryReplaceOutputType(inputs[1], element::f32).get());
+            std::vector<element::Type>{output_type, output_type}, std::vector<element::Type>{output_type},
+            ov::op::TemporaryReplaceOutputType(inputs[0], output_type).get(),
+            ov::op::TemporaryReplaceOutputType(inputs[1], output_type).get());
         newMultiply = std::make_shared<ov::op::TypeRelaxed<ov::opset1::Multiply>>(
-            std::vector<element::Type>{element::f32, element::f32}, std::vector<element::Type>{ add->get_output_element_type(0) },
-            ov::op::TemporaryReplaceOutputType(newAddOrSubtract, element::f32).get(),
-            ov::op::TemporaryReplaceOutputType(multiplyEmptyPathValues, element::f32).get());
+            std::vector<element::Type>{output_type, output_type}, std::vector<element::Type>{add->get_output_element_type(0)},
+            ov::op::TemporaryReplaceOutputType(newAddOrSubtract, output_type).get(),
+            ov::op::TemporaryReplaceOutputType(multiplyEmptyPathValues, output_type).get());

         NetworkHelper::insertDequantizationAfter(add, newMultiply, newAddOrSubtract);
         NetworkHelper::copyInfo(add, newAddOrSubtract);
```
```diff
@@ -45,6 +45,7 @@ LayerTransformation::LayerTransformation(const Params& params) :
     deqPrecision(params.deqPrecision),
     defaultPrecisions(params.defaultPrecisions),
     reshapeIgnorePerTensorQuantizationCheck(params.reshapeIgnorePerTensorQuantizationCheck),
+    scalingMode(params.scalingMode),
     context(nullptr) {}

 void LayerTransformation::setContext(TransformationContext* context) noexcept {
```
29 changes: 18 additions & 11 deletions src/common/low_precision_transformations/src/multiply_partial.cpp
```diff
@@ -79,16 +79,17 @@ bool MultiplyPartialTransformation::transform(TransformationContext& context, ov
         auto constParent = multiply->input_value(multiplyBranch.first == 0 ? 1 : 0);
         auto multiplyParentParent = multiplyParent.get_node_shared_ptr()->input_value(multiplyBranch.second);
         auto multiplyParentConst = multiplyParent.get_node_shared_ptr()->input_value(multiplyBranch.second == 0 ? 1 : 0);
+        auto inputDataType = scalingMode ? multiply->get_output_element_type(0) : element::f32;

         newMultiply = std::make_shared<ov::op::TypeRelaxed<ov::opset1::Multiply>>(
-            std::vector<ov::element::Type>{ element::f32, element::f32 },
+            std::vector<ov::element::Type>{ inputDataType, inputDataType },
             std::vector<ov::element::Type>{ multiply->get_output_element_type(0) },
-            ov::op::TemporaryReplaceOutputType(multiplyParentParent, element::f32).get(),
+            ov::op::TemporaryReplaceOutputType(multiplyParentParent, inputDataType).get(),
             ov::op::TemporaryReplaceOutputType(
                 fold<ov::opset1::Multiply>(
-                    foldConvert(multiplyParentConst, element::f32),
-                    foldConvert(constParent, element::f32)),
-                element::f32).get());
+                    foldConvert(multiplyParentConst, inputDataType),
+                    foldConvert(constParent, inputDataType)),
+                inputDataType).get());

         NetworkHelper::copyInfo(multiplyParent.get_node_shared_ptr(), newMultiply);
         NetworkHelper::copyInfo(multiply, newMultiply);
@@ -133,24 +134,30 @@ bool MultiplyPartialTransformation::transform(TransformationContext& context, ov


         // before: Y = (SC1 * (X1 - SH1)) * (SC2 * X2)
-        // after : Y = (SC1' * (X1 - SH1)) * (X2) , where :
-        //         SC1' = SC1 * SC2
+        // if scalingMode == false
+        //     after : Y = (SC1' * (X1 - SH1)) * (X2) , where :
+        //             SC1' = SC1 * SC2
+        // else
+        //     after : Y = ((X1 - SH1) * X2) * SC1' , where :
+        //             SC1' = SC1 * SC2
         auto newMultiplyValuesFullPath = fold<ov::opset1::Multiply>(multiplyValuesEmptyPath, multiplyValuesFullPath);
         OutputVector inputs{ {}, {} };
-        inputs[emptyPathIndex] = dequantizationEmptyPath.data;
+        inputs[emptyPathIndex] = scalingMode ? newMultiplyValuesFullPath : dequantizationEmptyPath.data;
+        auto input_for_fullPath = scalingMode ? dequantizationEmptyPath.data.get_node_shared_ptr() :
+                                                newMultiplyValuesFullPath;

         ov::Output<ov::Node> parent0 = dequantizationFullPath.subtract == nullptr ?
             (dequantizationFullPath.convert == nullptr ? dequantizationFullPath.data : dequantizationFullPath.convert) :
             dequantizationFullPath.subtract;

         inputs[fullPathIndex] =
-            parent0.get_node()->get_output_element_type(0) == newMultiplyValuesFullPath->get_output_element_type(0) ?
-                std::make_shared<ov::opset1::Multiply>(parent0, newMultiplyValuesFullPath) :
+            parent0.get_node()->get_output_element_type(0) == input_for_fullPath->get_output_element_type(0) ?
+                std::make_shared<ov::opset1::Multiply>(parent0, input_for_fullPath) :
                 std::make_shared<ov::op::TypeRelaxed<ov::opset1::Multiply>>(
                     std::vector<element::Type>{element::f32, element::f32},
                     std::vector<element::Type>{element::f32},
                     ov::op::TemporaryReplaceOutputType(parent0, element::f32).get(),
-                    ov::op::TemporaryReplaceOutputType(newMultiplyValuesFullPath, element::f32).get());
+                    ov::op::TemporaryReplaceOutputType(input_for_fullPath, element::f32).get());

         newMultiply = std::make_shared<ov::op::TypeRelaxed<ov::opset1::Multiply>>(
             std::vector<element::Type>{element::f32, element::f32},
```
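The scalingMode branch of the rewrite is value-preserving because multiplication is associative and commutative: (SC1 * (X1 - SH1)) * (SC2 * X2) = ((X1 - SH1) * X2) * (SC1 * SC2), so the folded constant SC1' can simply be moved to the end of the chain. A tiny standalone check (plain C++, not part of the PR):

```cpp
#include <cassert>
#include <cmath>

int main() {
    const float SC1 = 0.5f, SC2 = 0.25f, X1 = 12.0f, SH1 = 2.0f, X2 = 8.0f;

    // before: Y = (SC1 * (X1 - SH1)) * (SC2 * X2)
    const float before = (SC1 * (X1 - SH1)) * (SC2 * X2);
    // scalingMode rewrite: Y = ((X1 - SH1) * X2) * SC1', where SC1' = SC1 * SC2
    const float after = ((X1 - SH1) * X2) * (SC1 * SC2);

    assert(std::fabs(before - after) < 1e-6f);
    return 0;
}
```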
15 changes: 7 additions & 8 deletions src/common/low_precision_transformations/src/network_helper.cpp
```diff
@@ -218,7 +218,6 @@ std::shared_ptr<Node> NetworkHelper::swapMultiplyAndAdd(std::shared_ptr<ov::opse
     if (multiplyConst == nullptr)
         return addAfterMultiply;

-    const auto x = multiply->input_value(multiplyInputBranch);
     auto a = as_type_ptr<ov::opset1::Constant>(multiply->get_input_node_shared_ptr(multiplyInputBranch == 0 ? 1 : 0));
     auto b = as_type_ptr<ov::opset1::Constant>(addAfterMultiply->get_input_node_shared_ptr(multiplyBranch == 0 ? 1 : 0));
     std::shared_ptr<ov::opset1::Constant> bDivA;
@@ -263,15 +262,15 @@ std::shared_ptr<Node> NetworkHelper::swapMultiplyAndAdd(std::shared_ptr<ov::opse
         bDivA = as_type_ptr<ov::opset1::Constant>(foldConvert(bDivA->output(0), a->get_element_type()));
     }

-    OutputVector inputs{ {}, {} };
-    inputs[0] = x;
-    inputs[1] = bDivA->output(0);
-
+    const auto& add_input = multiply->input_value(multiplyInputBranch);
+    // Note: precision is copied to a separate variable intentionally,
+    // since TemporaryReplaceOutputType replaces add_input's precision, whereas we need to set the original precision on newAdd's output
+    const auto add_output_precision = add_input.get_element_type();
     std::shared_ptr<ov::opset1::Add> newAdd = std::make_shared<ov::op::TypeRelaxed<ov::opset1::Add>>(
         std::vector<element::Type>{element::f32, element::f32},
-        std::vector<element::Type>{ x.get_element_type() },
-        ov::op::TemporaryReplaceOutputType(inputs[0], element::f32).get(),
-        ov::op::TemporaryReplaceOutputType(inputs[1], element::f32).get());
+        std::vector<element::Type>{ add_output_precision },
+        ov::op::TemporaryReplaceOutputType(add_input, element::f32).get(),
+        ov::op::TemporaryReplaceOutputType(bDivA, element::f32).get());
     copyInfo(addAfterMultiply, newAdd);

     auto newMultiply = std::make_shared<ov::op::TypeRelaxed<ov::opset1::Multiply>>(
```
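The refactor above does not change the identity that swapMultiplyAndAdd relies on, (x * a) + b == (x + b/a) * a (hence the folded constant bDivA); it only routes the multiply input and its original precision through add_input and add_output_precision instead of the removed x/inputs pair. A tiny standalone check of the identity (plain C++, not part of the PR):

```cpp
#include <cassert>
#include <cmath>

int main() {
    const float x = 3.0f, a = 0.5f, b = 4.0f;
    // Multiply-then-Add vs. the swapped Add-then-Multiply using b/a.
    const float mul_add = x * a + b;
    const float add_mul = (x + b / a) * a;
    assert(std::fabs(mul_add - add_mul) < 1e-6f);
    return 0;
}
```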
New file (@@ -0,0 +1,104 @@):

```cpp
// Copyright (C) 2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include <memory>

#include "openvino/pass/matcher_pass.hpp"
#include "transformations_visibility.hpp"

namespace ov {
namespace pass {

class TRANSFORMATIONS_API ActivationsScaling;

namespace activations_scaling {

class TRANSFORMATIONS_API ScaleDownSingleLayer;
class TRANSFORMATIONS_API EliminateScalarMul;
class TRANSFORMATIONS_API MulConcatTransformation;
class TRANSFORMATIONS_API MulShareTransformation;
class TRANSFORMATIONS_API MoveDownScalarMul;

}  // namespace activations_scaling
}  // namespace pass
}  // namespace ov

// ActivationsScaling makes activation values smaller to prevent overflow due to the limited range of FP16.
// This feature is controlled by ov::hint::activations_scale_factor.
// For example, when this property is set to 16, activations are divided by 16.
// If ov::hint::activations_scale_factor is less than or equal to zero, it is disabled.

// Add scale_down and scale_up layers around Convolution and MatMul nodes
// Conv/MatMul
// ==>
// Multiply(scale_down by scale_factor) --> Conv/MatMul --> Multiply(scale_up by scale_factor)
class ov::pass::activations_scaling::ScaleDownSingleLayer : public ov::pass::MatcherPass {
public:
    OPENVINO_MATCHER_PASS_RTTI("ScaleDownSingleLayer", "0");
    ScaleDownSingleLayer(float scale_factor, ov::element::Type scaled_prec);
};

// Normalization and ShapeOf have the following property.
//
// Norm(input * const_a) = Norm(input)
//
// So, we can skip Multiply that is connected to Normalization and ShapeOf.
//
// input --> Multiply --> Normalization/ShapeOf
// ==>
// input --> Normalization/ShapeOf
class ov::pass::activations_scaling::EliminateScalarMul : public ov::pass::MatcherPass {
public:
    OPENVINO_MATCHER_PASS_RTTI("EliminateScalarMul", "0");
    EliminateScalarMul();
};

//   input_a   const_a   input_b   const_b   input_c   const_c
//      \       /           \       /           \       /
//     Multiply_a          Multiply_b          Multiply_c
//          \                  |                  /
//           \                 |                 /
//            ---------- Concat ----------------
// ==>
//             (const_a             (const_b             (const_c
//   input_a   /const_c)  input_b   /const_c)  input_c   /const_c)
//      \       /            \       /            \       /
//     Multiply_a           Multiply_b           Multiply_c
//          \                   |                   /
//           \                  |                  /
//            ---------- Concat ----------------
//                          |       const_c
//                          |      /
//                        Multiply
class ov::pass::activations_scaling::MulConcatTransformation : public ov::pass::MatcherPass {
public:
    OPENVINO_MATCHER_PASS_RTTI("MulConcatTransformation", "0");
    MulConcatTransformation();
};

//         input                input
//        /     \                 |
//     Norm     Mul    ==>       Mul (expect to be fused into the input layer)
//      |        |              /   \
//    op_a     op_b          Norm   op_b
//                             |
//                           op_a
class ov::pass::activations_scaling::MulShareTransformation : public ov::pass::MatcherPass {
public:
    OPENVINO_MATCHER_PASS_RTTI("MulShareTransformation", "0");
    MulShareTransformation();
};

//           input_b   scalar            input_a   input_b
//              \       /                    \       /
//   input_a    Mul_b          ==>            Mul_a'    scalar
//       \       /                               \       /
//        Mul_a                                   Mul_b' (expect to be merged with Mul_a')
class ov::pass::activations_scaling::MoveDownScalarMul : public ov::pass::MatcherPass {
public:
    OPENVINO_MATCHER_PASS_RTTI("MoveDownScalarMul", "0");
    MoveDownScalarMul();
};
```
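The header only declares the passes. As a rough, hypothetical sketch of how a plugin might wire them up (pass order and the include path are assumptions; only the constructor signatures come from the header above):

```cpp
#include "openvino/pass/manager.hpp"
// Hypothetical include path for the header shown above.
#include "transformations/activations_scaling.hpp"

// `model` is an ov::Model already loaded by the plugin; scale_factor comes
// from ov::hint::activations_scale_factor.
void apply_activations_scaling(const std::shared_ptr<ov::Model>& model, float scale_factor) {
    if (scale_factor <= 0.f)
        return;  // feature disabled, per the comment block in the header

    ov::pass::Manager manager;
    // Constructor signature from the header: (float scale_factor, ov::element::Type scaled_prec).
    manager.register_pass<ov::pass::activations_scaling::ScaleDownSingleLayer>(scale_factor, ov::element::f16);
    manager.register_pass<ov::pass::activations_scaling::EliminateScalarMul>();
    manager.register_pass<ov::pass::activations_scaling::MulConcatTransformation>();
    manager.register_pass<ov::pass::activations_scaling::MulShareTransformation>();
    manager.register_pass<ov::pass::activations_scaling::MoveDownScalarMul>();
    manager.run_passes(model);
}
```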
```diff
@@ -12,7 +12,7 @@ namespace ov {

 TRANSFORMATIONS_API void mark_as_dequantization_node(const std::shared_ptr<Node>& node);

-TRANSFORMATIONS_API bool is_dequantization_node(const std::shared_ptr<Node>& node);
+TRANSFORMATIONS_API bool is_dequantization_node(const std::shared_ptr<const Node>& node);

 /**
  * @ingroup ov_runtime_attr_api
```