[GPU] activations scaling to resolve accuracy issues for infer precision of f16 (#27265)

### Details:
- When a model runs at an inference precision of f16, it may produce
incorrect results because of the limited numeric range of f16.
- This PR avoids overflow during computation by scaling down
activations, so that correct results are obtained when the inference
precision is f16.
- A new config property "ACTIVATIONS_SCALE_FACTOR" is introduced, which
holds a single floating-point value. For example, if it is set to 64,
activations are divided by 64 before Convolution and MatMul. If it is
smaller than 0, the feature is disabled.
   - This property can also be set via the rt_info of a model, as shown below.
```xml
    <rt_info>
        <runtime_options>
            <ACTIVATIONS_SCALE_FACTOR value="8.0" />
        </runtime_options>
    </rt_info>
``` 
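For reference, the same factor can also be passed through the runtime API. Below is a minimal sketch (the model path, device name, and factor value are placeholders; it assumes a build that includes this PR):

```cpp
#include "openvino/openvino.hpp"

int main() {
    ov::Core core;
    // Placeholder IR path; any model affected by f16 overflow applies.
    auto model = core.read_model("model.xml");

    // Request f16 inference precision and scale activations down by 64
    // before Convolution/MatMul (they are scaled back up afterwards).
    auto compiled = core.compile_model(model,
                                       "GPU",
                                       ov::hint::inference_precision(ov::element::f16),
                                       ov::hint::activations_scale_factor(64.0f));
    return 0;
}
```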

### Tickets:
 - 147052

---------

Co-authored-by: Andrew Park <[email protected]>
e-ddykim and andrew-k-park authored Jan 14, 2025
1 parent d63a9df commit cc67ad1
Showing 16 changed files with 1,035 additions and 52 deletions.
```diff
@@ -252,11 +252,13 @@ class LP_TRANSFORMATIONS_API LayerTransformation : public ov::pass::MatcherPass
             element::Type deqPrecision = element::f32,
             const std::vector<ov::element::Type> defaultPrecisions =
                 { ov::element::u8, ov::element::i8 },
-            const bool reshapeIgnorePerTensorQuantizationCheck = false) :
+            const bool reshapeIgnorePerTensorQuantizationCheck = false,
+            const bool scalingMode = false) :
             updatePrecisions(updatePrecisions),
             deqPrecision(deqPrecision),
             defaultPrecisions(defaultPrecisions),
-            reshapeIgnorePerTensorQuantizationCheck(reshapeIgnorePerTensorQuantizationCheck) {}
+            reshapeIgnorePerTensorQuantizationCheck(reshapeIgnorePerTensorQuantizationCheck),
+            scalingMode(scalingMode) {}

         Params& setUpdatePrecisions(const bool updatePrecisions) {
             this->updatePrecisions = updatePrecisions;
@@ -281,6 +283,8 @@ class LP_TRANSFORMATIONS_API LayerTransformation : public ov::pass::MatcherPass
         std::vector<ov::element::Type> defaultPrecisions;
         // to support GPU workarround to keep Reshape and MatMul in FP32
         bool reshapeIgnorePerTensorQuantizationCheck;
+        // to support Activations Scaling
+        bool scalingMode;
     };

     class PrecisionDetails {
@@ -352,6 +356,7 @@ class LP_TRANSFORMATIONS_API LayerTransformation : public ov::pass::MatcherPass
     element::Type deqPrecision;
     std::vector<ov::element::Type> defaultPrecisions;
     bool reshapeIgnorePerTensorQuantizationCheck;
+    bool scalingMode;

     static constexpr char originalLayerPostfix[] = "_original";
     TransformationContext* context;
```
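As a quick illustration of the new flag (a hypothetical sketch, not code from this PR, assuming the usual `ov::pass::low_precision` namespace and include path), `scalingMode` is simply the new trailing constructor argument of `Params`:

```cpp
#include "low_precision/layer_transformation.hpp"  // assumed include path

using Params = ov::pass::low_precision::LayerTransformation::Params;

// All arguments keep their defaults from the header above; the trailing
// `true` opts the transformation pipeline into the new scaling mode.
Params params(/*updatePrecisions=*/true,
              /*deqPrecision=*/ov::element::f32,
              /*defaultPrecisions=*/{ov::element::u8, ov::element::i8},
              /*reshapeIgnorePerTensorQuantizationCheck=*/false,
              /*scalingMode=*/true);
```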
13 changes: 7 additions & 6 deletions src/common/low_precision_transformations/src/add.cpp
```diff
@@ -214,14 +214,15 @@ bool AddTransformation::transform(TransformationContext& context, ov::pass::patt
                 newSubtractFullPathValues),
             newMultiplyFullPathValues);

+        auto output_type = scalingMode ? add->get_output_element_type(0) : element::f32;
         newAddOrSubtract = std::make_shared<ov::op::TypeRelaxed<ov::opset1::Add>>(
-            std::vector<element::Type>{element::f32, element::f32}, std::vector<element::Type>{ element::f32 },
-            ov::op::TemporaryReplaceOutputType(inputs[0], element::f32).get(),
-            ov::op::TemporaryReplaceOutputType(inputs[1], element::f32).get());
+            std::vector<element::Type>{output_type, output_type}, std::vector<element::Type>{output_type},
+            ov::op::TemporaryReplaceOutputType(inputs[0], output_type).get(),
+            ov::op::TemporaryReplaceOutputType(inputs[1], output_type).get());
         newMultiply = std::make_shared<ov::op::TypeRelaxed<ov::opset1::Multiply>>(
-            std::vector<element::Type>{element::f32, element::f32}, std::vector<element::Type>{ add->get_output_element_type(0) },
-            ov::op::TemporaryReplaceOutputType(newAddOrSubtract, element::f32).get(),
-            ov::op::TemporaryReplaceOutputType(multiplyEmptyPathValues, element::f32).get());
+            std::vector<element::Type>{output_type, output_type}, std::vector<element::Type>{add->get_output_element_type(0)},
+            ov::op::TemporaryReplaceOutputType(newAddOrSubtract, output_type).get(),
+            ov::op::TemporaryReplaceOutputType(multiplyEmptyPathValues, output_type).get());

         NetworkHelper::insertDequantizationAfter(add, newMultiply, newAddOrSubtract);
         NetworkHelper::copyInfo(add, newAddOrSubtract);
```
```diff
@@ -45,6 +45,7 @@ LayerTransformation::LayerTransformation(const Params& params) :
     deqPrecision(params.deqPrecision),
     defaultPrecisions(params.defaultPrecisions),
     reshapeIgnorePerTensorQuantizationCheck(params.reshapeIgnorePerTensorQuantizationCheck),
+    scalingMode(params.scalingMode),
     context(nullptr) {}

 void LayerTransformation::setContext(TransformationContext* context) noexcept {
```
29 changes: 18 additions & 11 deletions src/common/low_precision_transformations/src/multiply_partial.cpp
```diff
@@ -79,16 +79,17 @@ bool MultiplyPartialTransformation::transform(TransformationContext& context, ov
         auto constParent = multiply->input_value(multiplyBranch.first == 0 ? 1 : 0);
         auto multiplyParentParent = multiplyParent.get_node_shared_ptr()->input_value(multiplyBranch.second);
         auto multiplyParentConst = multiplyParent.get_node_shared_ptr()->input_value(multiplyBranch.second == 0 ? 1 : 0);
+        auto inputDataType = scalingMode ? multiply->get_output_element_type(0) : element::f32;

         newMultiply = std::make_shared<ov::op::TypeRelaxed<ov::opset1::Multiply>>(
-            std::vector<ov::element::Type>{ element::f32, element::f32 },
+            std::vector<ov::element::Type>{ inputDataType, inputDataType },
             std::vector<ov::element::Type>{ multiply->get_output_element_type(0) },
-            ov::op::TemporaryReplaceOutputType(multiplyParentParent, element::f32).get(),
+            ov::op::TemporaryReplaceOutputType(multiplyParentParent, inputDataType).get(),
             ov::op::TemporaryReplaceOutputType(
                 fold<ov::opset1::Multiply>(
-                    foldConvert(multiplyParentConst, element::f32),
-                    foldConvert(constParent, element::f32)),
-                element::f32).get());
+                    foldConvert(multiplyParentConst, inputDataType),
+                    foldConvert(constParent, inputDataType)),
+                inputDataType).get());

         NetworkHelper::copyInfo(multiplyParent.get_node_shared_ptr(), newMultiply);
         NetworkHelper::copyInfo(multiply, newMultiply);
@@ -133,24 +134,30 @@ bool MultiplyPartialTransformation::transform(TransformationContext& context, ov


         // before: Y = (SC1 * (X1 - SH1)) * (SC2 * X2)
-        // after : Y = (SC1' * (X1 - SH1)) * (X2) , where :
-        //         SC1' = SC1 * SC2
+        // if scalingMode == false
+        //     after : Y = (SC1' * (X1 - SH1)) * (X2) , where :
+        //             SC1' = SC1 * SC2
+        // else
+        //     after : Y = ((X1 - SH1) * X2) * SC1' , where :
+        //             SC1' = SC1 * SC2
         auto newMultiplyValuesFullPath = fold<ov::opset1::Multiply>(multiplyValuesEmptyPath, multiplyValuesFullPath);
         OutputVector inputs{ {}, {} };
-        inputs[emptyPathIndex] = dequantizationEmptyPath.data;
+        inputs[emptyPathIndex] = scalingMode ? newMultiplyValuesFullPath : dequantizationEmptyPath.data;
+        auto input_for_fullPath = scalingMode ? dequantizationEmptyPath.data.get_node_shared_ptr() :
+                                                newMultiplyValuesFullPath;

         ov::Output<ov::Node> parent0 = dequantizationFullPath.subtract == nullptr ?
             (dequantizationFullPath.convert == nullptr ? dequantizationFullPath.data : dequantizationFullPath.convert) :
             dequantizationFullPath.subtract;

         inputs[fullPathIndex] =
-            parent0.get_node()->get_output_element_type(0) == newMultiplyValuesFullPath->get_output_element_type(0) ?
-                std::make_shared<ov::opset1::Multiply>(parent0, newMultiplyValuesFullPath) :
+            parent0.get_node()->get_output_element_type(0) == input_for_fullPath->get_output_element_type(0) ?
+                std::make_shared<ov::opset1::Multiply>(parent0, input_for_fullPath) :
                 std::make_shared<ov::op::TypeRelaxed<ov::opset1::Multiply>>(
                     std::vector<element::Type>{element::f32, element::f32},
                     std::vector<element::Type>{element::f32},
                     ov::op::TemporaryReplaceOutputType(parent0, element::f32).get(),
-                    ov::op::TemporaryReplaceOutputType(newMultiplyValuesFullPath, element::f32).get());
+                    ov::op::TemporaryReplaceOutputType(input_for_fullPath, element::f32).get());

         newMultiply = std::make_shared<ov::op::TypeRelaxed<ov::opset1::Multiply>>(
             std::vector<element::Type>{element::f32, element::f32},
```
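The scalingMode branch of the rewrite is value-preserving because multiplication is associative and commutative: (SC1 * (X1 - SH1)) * (SC2 * X2) = ((X1 - SH1) * X2) * (SC1 * SC2), so the folded constant SC1' can simply be moved to the end of the chain. A tiny standalone check (plain C++, not part of the PR):

```cpp
#include <cassert>
#include <cmath>

int main() {
    const float SC1 = 0.5f, SC2 = 0.25f, X1 = 12.0f, SH1 = 2.0f, X2 = 8.0f;

    // before: Y = (SC1 * (X1 - SH1)) * (SC2 * X2)
    const float before = (SC1 * (X1 - SH1)) * (SC2 * X2);
    // scalingMode rewrite: Y = ((X1 - SH1) * X2) * SC1', where SC1' = SC1 * SC2
    const float after = ((X1 - SH1) * X2) * (SC1 * SC2);

    assert(std::fabs(before - after) < 1e-6f);
    return 0;
}
```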
15 changes: 7 additions & 8 deletions src/common/low_precision_transformations/src/network_helper.cpp
```diff
@@ -218,7 +218,6 @@ std::shared_ptr<Node> NetworkHelper::swapMultiplyAndAdd(std::shared_ptr<ov::opse
     if (multiplyConst == nullptr)
         return addAfterMultiply;

-    const auto x = multiply->input_value(multiplyInputBranch);
     auto a = as_type_ptr<ov::opset1::Constant>(multiply->get_input_node_shared_ptr(multiplyInputBranch == 0 ? 1 : 0));
     auto b = as_type_ptr<ov::opset1::Constant>(addAfterMultiply->get_input_node_shared_ptr(multiplyBranch == 0 ? 1 : 0));
     std::shared_ptr<ov::opset1::Constant> bDivA;
@@ -263,15 +262,15 @@ std::shared_ptr<Node> NetworkHelper::swapMultiplyAndAdd(std::shared_ptr<ov::opse
         bDivA = as_type_ptr<ov::opset1::Constant>(foldConvert(bDivA->output(0), a->get_element_type()));
     }

-    OutputVector inputs{ {}, {} };
-    inputs[0] = x;
-    inputs[1] = bDivA->output(0);
-
+    const auto& add_input = multiply->input_value(multiplyInputBranch);
+    // Note: precision is copied to a separate variable intentionally,
+    // since TemporaryReplaceOutputType replaces add_input's precision, whereas we need to set the original precision on newAdd's output
+    const auto add_output_precision = add_input.get_element_type();
     std::shared_ptr<ov::opset1::Add> newAdd = std::make_shared<ov::op::TypeRelaxed<ov::opset1::Add>>(
         std::vector<element::Type>{element::f32, element::f32},
-        std::vector<element::Type>{ x.get_element_type() },
-        ov::op::TemporaryReplaceOutputType(inputs[0], element::f32).get(),
-        ov::op::TemporaryReplaceOutputType(inputs[1], element::f32).get());
+        std::vector<element::Type>{ add_output_precision },
+        ov::op::TemporaryReplaceOutputType(add_input, element::f32).get(),
+        ov::op::TemporaryReplaceOutputType(bDivA, element::f32).get());
     copyInfo(addAfterMultiply, newAdd);

     auto newMultiply = std::make_shared<ov::op::TypeRelaxed<ov::opset1::Multiply>>(
```
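The refactor above does not change the identity that swapMultiplyAndAdd relies on, (x * a) + b == (x + b/a) * a (hence the folded constant bDivA); it only routes the multiply input and its original precision through add_input and add_output_precision instead of the removed x/inputs pair. A tiny standalone check of the identity (plain C++, not part of the PR):

```cpp
#include <cassert>
#include <cmath>

int main() {
    const float x = 3.0f, a = 0.5f, b = 4.0f;
    // Multiply-then-Add vs. the swapped Add-then-Multiply using b/a.
    const float mul_add = x * a + b;
    const float add_mul = (x + b / a) * a;
    assert(std::fabs(mul_add - add_mul) < 1e-6f);
    return 0;
}
```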
New file (@@ -0,0 +1,104 @@):

```cpp
// Copyright (C) 2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include <memory>

#include "openvino/pass/matcher_pass.hpp"
#include "transformations_visibility.hpp"

namespace ov {
namespace pass {

class TRANSFORMATIONS_API ActivationsScaling;

namespace activations_scaling {

class TRANSFORMATIONS_API ScaleDownSingleLayer;
class TRANSFORMATIONS_API EliminateScalarMul;
class TRANSFORMATIONS_API MulConcatTransformation;
class TRANSFORMATIONS_API MulShareTransformation;
class TRANSFORMATIONS_API MoveDownScalarMul;

}  // namespace activations_scaling
}  // namespace pass
}  // namespace ov

// ActivationsScaling makes activation values smaller to prevent overflow due to the limited range of FP16.
// This feature is controlled by ov::hint::activations_scale_factor.
// For example, when this property is set to 16, activations are divided by 16.
// If ov::hint::activations_scale_factor is less than or equal to zero, it is disabled.

// Add scale_down and scale_up layers around Convolution and MatMul nodes
// Conv/MatMul
// ==>
// Multiply(scale_down by scale_factor) --> Conv/MatMul --> Multiply(scale_up by scale_factor)
class ov::pass::activations_scaling::ScaleDownSingleLayer : public ov::pass::MatcherPass {
public:
    OPENVINO_MATCHER_PASS_RTTI("ScaleDownSingleLayer", "0");
    ScaleDownSingleLayer(float scale_factor, ov::element::Type scaled_prec);
};

// Normalization and ShapeOf have the following property.
//
// Norm(input * const_a) = Norm(input)
//
// So, we can skip Multiply that is connected to Normalization and ShapeOf.
//
// input --> Multiply --> Normalization/ShapeOf
// ==>
// input --> Normalization/ShapeOf
class ov::pass::activations_scaling::EliminateScalarMul : public ov::pass::MatcherPass {
public:
    OPENVINO_MATCHER_PASS_RTTI("EliminateScalarMul", "0");
    EliminateScalarMul();
};

//   input_a   const_a   input_b   const_b   input_c   const_c
//      \       /           \       /           \       /
//     Multiply_a          Multiply_b          Multiply_c
//          \                  |                  /
//           \                 |                 /
//            ---------- Concat ----------------
// ==>
//             (const_a             (const_b             (const_c
//   input_a   /const_c)  input_b   /const_c)  input_c   /const_c)
//      \       /            \       /            \       /
//     Multiply_a           Multiply_b           Multiply_c
//          \                   |                   /
//           \                  |                  /
//            ---------- Concat ----------------
//                          |       const_c
//                          |      /
//                        Multiply
class ov::pass::activations_scaling::MulConcatTransformation : public ov::pass::MatcherPass {
public:
    OPENVINO_MATCHER_PASS_RTTI("MulConcatTransformation", "0");
    MulConcatTransformation();
};

//         input                input
//        /     \                 |
//     Norm     Mul    ==>       Mul (expect to be fused into the input layer)
//      |        |              /   \
//    op_a     op_b          Norm   op_b
//                             |
//                           op_a
class ov::pass::activations_scaling::MulShareTransformation : public ov::pass::MatcherPass {
public:
    OPENVINO_MATCHER_PASS_RTTI("MulShareTransformation", "0");
    MulShareTransformation();
};

//           input_b   scalar            input_a   input_b
//              \       /                    \       /
//   input_a    Mul_b          ==>            Mul_a'    scalar
//       \       /                               \       /
//        Mul_a                                   Mul_b' (expect to be merged with Mul_a')
class ov::pass::activations_scaling::MoveDownScalarMul : public ov::pass::MatcherPass {
public:
    OPENVINO_MATCHER_PASS_RTTI("MoveDownScalarMul", "0");
    MoveDownScalarMul();
};
```
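The header only declares the passes. As a rough, hypothetical sketch of how a plugin might wire them up (pass order and the include path are assumptions; only the constructor signatures come from the header above):

```cpp
#include "openvino/pass/manager.hpp"
// Hypothetical include path for the header shown above.
#include "transformations/activations_scaling.hpp"

// `model` is an ov::Model already loaded by the plugin; scale_factor comes
// from ov::hint::activations_scale_factor.
void apply_activations_scaling(const std::shared_ptr<ov::Model>& model, float scale_factor) {
    if (scale_factor <= 0.f)
        return;  // feature disabled, per the comment block in the header

    ov::pass::Manager manager;
    // Constructor signature from the header: (float scale_factor, ov::element::Type scaled_prec).
    manager.register_pass<ov::pass::activations_scaling::ScaleDownSingleLayer>(scale_factor, ov::element::f16);
    manager.register_pass<ov::pass::activations_scaling::EliminateScalarMul>();
    manager.register_pass<ov::pass::activations_scaling::MulConcatTransformation>();
    manager.register_pass<ov::pass::activations_scaling::MulShareTransformation>();
    manager.register_pass<ov::pass::activations_scaling::MoveDownScalarMul>();
    manager.run_passes(model);
}
```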
```diff
@@ -12,7 +12,7 @@ namespace ov {

 TRANSFORMATIONS_API void mark_as_dequantization_node(const std::shared_ptr<Node>& node);

-TRANSFORMATIONS_API bool is_dequantization_node(const std::shared_ptr<Node>& node);
+TRANSFORMATIONS_API bool is_dequantization_node(const std::shared_ptr<const Node>& node);

 /**
  * @ingroup ov_runtime_attr_api
```