Skip to content

Commit acac26c

Browse files
authored
Perform f16 compression to postponed constant input (#32631)
### Details:
- Perform f16 compression to postponed constant input.
- *...*

### Tickets:
- *ticket-id*
1 parent 25038a9 commit acac26c

File tree

5 files changed

+149
-25
lines changed

5 files changed

+149
-25
lines changed

src/common/transformations/src/transformations/common_optimizations/compress_float_constants.cpp

Lines changed: 50 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
#include "itt.hpp"
88
#include "openvino/core/graph_util.hpp"
99
#include "openvino/core/rt_info.hpp"
10+
#include "openvino/core/type.hpp"
1011
#include "openvino/op/constant.hpp"
1112
#include "openvino/op/convert.hpp"
1213
#include "openvino/op/fake_convert.hpp"
@@ -196,21 +197,57 @@ ov::pass::CompressFloatConstantsImpl::CompressFloatConstantsImpl(bool postponed)
196197
return false;
197198
}
198199
auto constant_target_inputs = const_node->get_output_target_inputs(0);
199-
auto convert = std::make_shared<ov::op::v0::Convert>(new_const, const_node->get_element_type());
200-
201-
convert->set_friendly_name(const_node->get_friendly_name());
202-
new_const->set_friendly_name(const_node->get_friendly_name() + "_compressed");
203-
ov::copy_runtime_info(const_node, convert);
204-
ov::mark_as_decompression(convert);
205-
if (postponed) {
206-
postpone_fp16_compression(new_const->get_rt_info());
207-
postpone_fp16_compression(new_const->get_output_tensor(0).get_rt_info());
208-
209-
for (const auto& target_input : constant_target_inputs) {
210-
target_input.replace_source_output(convert);
200+
201+
// Check if the next node is a postponed constant. It will be constant_folded later during serialization.
202+
auto postponed_constant_node = [&]() -> std::shared_ptr<ov::Node> {
203+
if (constant_target_inputs.size() == 1 &&
204+
constant_target_inputs.begin()->get_node()->get_rt_info().count("postponed_constant")) {
205+
return constant_target_inputs.begin()->get_node()->shared_from_this();
206+
}
207+
return nullptr;
208+
}();
209+
210+
if (postponed_constant_node && postponed) {
211+
// If f16 conversion is also postponed, we need to insert Convert after the postponed_constant_node
212+
if (is_fp16_compression_postponed(postponed_constant_node->get_rt_info())) {
213+
// Convert was already added after postponed_constant_node. Get it and just update rt info
214+
auto next_node = postponed_constant_node->get_output_target_inputs(0).begin()->get_node();
215+
OPENVINO_ASSERT(ov::as_type<ov::op::v0::Convert>(next_node));
216+
ov::copy_runtime_info(const_node, next_node->shared_from_this());
217+
} else {
218+
auto postponed_constant_target_inputs = postponed_constant_node->get_output_target_inputs(0);
219+
auto convert =
220+
std::make_shared<ov::op::v0::Convert>(postponed_constant_node, const_node->get_element_type());
221+
222+
convert->set_friendly_name(postponed_constant_node->get_friendly_name());
223+
ov::mark_as_decompression(convert);
224+
ov::copy_runtime_info(const_node, convert);
225+
postponed_constant_node->set_friendly_name(postponed_constant_node->get_friendly_name() +
226+
"_compressed");
227+
postpone_fp16_compression(postponed_constant_node->get_rt_info());
228+
postpone_fp16_compression(postponed_constant_node->get_output_tensor(0).get_rt_info());
229+
230+
for (const auto& target_input : postponed_constant_target_inputs) {
231+
target_input.replace_source_output(convert);
232+
}
211233
}
212234
} else {
213-
ov::replace_node(const_node, convert);
235+
auto convert = std::make_shared<ov::op::v0::Convert>(new_const, const_node->get_element_type());
236+
237+
convert->set_friendly_name(const_node->get_friendly_name());
238+
new_const->set_friendly_name(const_node->get_friendly_name() + "_compressed");
239+
ov::copy_runtime_info(const_node, convert);
240+
ov::mark_as_decompression(convert);
241+
if (postponed) {
242+
postpone_fp16_compression(new_const->get_rt_info());
243+
postpone_fp16_compression(new_const->get_output_tensor(0).get_rt_info());
244+
245+
for (const auto& target_input : constant_target_inputs) {
246+
target_input.replace_source_output(convert);
247+
}
248+
} else {
249+
ov::replace_node(const_node, convert);
250+
}
214251
}
215252
return true;
216253
};

src/common/transformations/src/transformations/common_optimizations/moe_transpose_weights.cpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -109,7 +109,6 @@ ov::pass::VectorizedMOE2GEMMTransposeWeights::VectorizedMOE2GEMMTransposeWeights
109109
if (ov::is_type<ov::op::v0::Constant>(transpose_input.get_node_shared_ptr())) {
110110
transpose->get_rt_info()["postponed_constant"] = true;
111111
ov::pass::disable_constant_folding(transpose);
112-
ov::disable_fp16_compression(transpose_input.get_node_shared_ptr());
113112
}
114113

115114
ov::NodeVector rt_sources{transpose_input.get_node_shared_ptr()};

src/common/transformations/tests/common_optimizations/moe_transpose_weights_test.cpp

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,6 @@
3030
#include "openvino/op/unsqueeze.hpp"
3131
#include "openvino/pass/constant_folding.hpp"
3232
#include "transformations/rt_info/decompression.hpp"
33-
#include "transformations/rt_info/disable_fp16_compression.hpp"
3433

3534
using namespace ov;
3635

@@ -85,7 +84,6 @@ std::shared_ptr<ov::Model> build_moe_2gemm_model(bool use_decompression, bool wi
8584
auto gate_transpose = std::make_shared<op::v1::Transpose>(convert_input, order);
8685
gate_transpose->get_rt_info()["postponed_constant"] = true;
8786
ov::pass::disable_constant_folding(gate_transpose);
88-
ov::disable_fp16_compression(gate_const);
8987
convert_input = gate_transpose;
9088
}
9189

@@ -99,7 +97,6 @@ std::shared_ptr<ov::Model> build_moe_2gemm_model(bool use_decompression, bool wi
9997
auto gate_transpose = std::make_shared<op::v1::Transpose>(gate_weight_output, order);
10098
gate_transpose->get_rt_info()["postponed_constant"] = true;
10199
ov::pass::disable_constant_folding(gate_transpose);
102-
ov::disable_fp16_compression(gate_weights);
103100
gate_weight_output = gate_transpose;
104101
}
105102
}
@@ -156,7 +153,6 @@ std::shared_ptr<ov::Model> build_moe_2gemm_model(bool use_decompression, bool wi
156153
auto down_transpose = std::make_shared<op::v1::Transpose>(convert_input, order);
157154
down_transpose->get_rt_info()["postponed_constant"] = true;
158155
ov::pass::disable_constant_folding(down_transpose);
159-
ov::disable_fp16_compression(down_const);
160156
convert_input = down_transpose;
161157
}
162158

@@ -177,7 +173,6 @@ std::shared_ptr<ov::Model> build_moe_2gemm_model(bool use_decompression, bool wi
177173
auto down_transpose = std::make_shared<op::v1::Transpose>(down_weight_output, order);
178174
down_transpose->get_rt_info()["postponed_constant"] = true;
179175
ov::pass::disable_constant_folding(down_transpose);
180-
ov::disable_fp16_compression(down_const);
181176
down_weight_output = down_transpose;
182177
}
183178
}

src/core/src/xml_util/xml_serialize_util.cpp

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
#include "openvino/core/except.hpp"
1212
#include "openvino/core/meta_data.hpp"
1313
#include "openvino/core/model.hpp"
14+
#include "openvino/core/rt_info.hpp"
1415
#include "openvino/core/runtime_attribute.hpp"
1516
#include "openvino/op/binary_convolution.hpp"
1617
#include "openvino/op/constant.hpp"
@@ -58,14 +59,16 @@ class PostponedConstantReplacer {
5859
// clone to keep original node unchanged
5960
node_clone = node->clone_with_new_inputs(node->input_values());
6061
node_clone->get_rt_info().erase(ov::pass::DisableConstantFolding::get_type_info_static());
61-
node = node_clone.get();
6262
}
63+
auto node_to_fold = node_clone ? node_clone : node->shared_from_this();
6364
OPENVINO_ASSERT(
64-
node->constant_fold(outputs, node->input_values()),
65+
node_to_fold->constant_fold(outputs, node_to_fold->input_values()),
6566
"Node with set `postponed_constant` attribute cannot be fold to constant when saving model to IR file");
6667
m_constant = outputs[0].get_node_shared_ptr();
6768
m_node = m_constant.get();
6869
m_node->set_friendly_name(node->get_friendly_name());
70+
ov::copy_runtime_info(node->shared_from_this(), m_constant);
71+
ov::copy_output_runtime_info(node->outputs(), m_constant->outputs());
6972
}
7073
}
7174
};

src/core/tests/pass/serialization/custom_ops.cpp

Lines changed: 94 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,11 +11,13 @@
1111
#include "openvino/op/add.hpp"
1212
#include "openvino/op/concat.hpp"
1313
#include "openvino/op/constant.hpp"
14+
#include "openvino/op/convert.hpp"
1415
#include "openvino/op/multiply.hpp"
1516
#include "openvino/pass/constant_folding.hpp"
1617
#include "openvino/pass/manager.hpp"
1718
#include "openvino/pass/serialize.hpp"
1819
#include "openvino/runtime/core.hpp"
20+
#include "transformations/common_optimizations/compress_float_constants.hpp"
1921

2022
class CustomOpsSerializationTest : public ::testing::Test {
2123
protected:
@@ -186,7 +188,7 @@ TEST(PostponedConstantTest, ConcatWithPostponedConstant) {
186188

187189
auto model = std::make_shared<ov::Model>(add->outputs(), ov::ParameterVector{param}, "ConcatAddModel");
188190

189-
ov::pass::Serialize(serialized_xml, serialized_bin).run_on_model(model);
191+
ASSERT_NO_THROW(ov::pass::Serialize(serialized_xml, serialized_bin).run_on_model(model));
190192
}
191193
ov::Core core;
192194

@@ -230,7 +232,7 @@ TEST(PostponedConstantTest, SubgraphExclusion) {
230232
auto model =
231233
std::make_shared<ov::Model>(final_add->outputs(), ov::ParameterVector{param}, "SubgraphExclusionModel");
232234

233-
ov::pass::Serialize(serialized_xml, serialized_bin).run_on_model(model);
235+
ASSERT_NO_THROW(ov::pass::Serialize(serialized_xml, serialized_bin).run_on_model(model));
234236
}
235237
ov::Core core;
236238

@@ -274,7 +276,7 @@ TEST(PostponedConstantTest, NodeWithMultipleConsumers) {
274276

275277
concat->get_rt_info()["postponed_constant"] = true;
276278

277-
ov::pass::Serialize(serialized_xml, serialized_bin).run_on_model(model);
279+
ASSERT_NO_THROW(ov::pass::Serialize(serialized_xml, serialized_bin).run_on_model(model));
278280
}
279281
ov::Core core;
280282

@@ -330,7 +332,7 @@ TEST(PostponedConstantTest, ModelIsUnchangedAfterSerialization) {
330332
ov::pass::disable_constant_folding(concat);
331333

332334
auto model_copy = model->clone();
333-
ov::pass::Serialize(serialized_xml, serialized_bin).run_on_model(model);
335+
ASSERT_NO_THROW(ov::pass::Serialize(serialized_xml, serialized_bin).run_on_model(model));
334336

335337
const auto& [success, message] = compare_functions(model_copy, model, true, true, true, true, true);
336338
ASSERT_TRUE(success) << message;
@@ -358,3 +360,91 @@ TEST(PostponedConstantTest, ModelIsUnchangedAfterSerialization) {
358360
ASSERT_TRUE(success) << message;
359361
}
360362
}
363+
364+
TEST(PostponedConstantTest, F16Compression2Inputs) {
365+
std::stringstream serialized_xml, serialized_bin;
366+
{
367+
auto const1 =
368+
std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{2, 2}, std::vector<float>{1, 2, 3, 4});
369+
auto const2 =
370+
std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{2, 2}, std::vector<float>{5, 6, 7, 8});
371+
auto concat = std::make_shared<ov::op::v0::Concat>(ov::OutputVector{const1, const2}, 0);
372+
concat->get_rt_info()["postponed_constant"] = true;
373+
374+
auto param = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::Shape{4, 2});
375+
auto add = std::make_shared<ov::op::v1::Add>(concat, param);
376+
377+
auto model = std::make_shared<ov::Model>(add->outputs(), ov::ParameterVector{param}, "ConcatAddModel");
378+
379+
// in case of postponed_constant + postponed f16 compression, f16 -> f32 convert should be added after postponed
380+
// constant
381+
bool postponed = true;
382+
ov::pass::compress_model_to_f16(model, postponed);
383+
384+
ASSERT_NO_THROW(ov::pass::Serialize(serialized_xml, serialized_bin).run_on_model(model));
385+
}
386+
ov::Core core;
387+
388+
auto weights = serialized_bin.str();
389+
ov::Tensor weights_tensor(ov::element::u8, ov::Shape{weights.size()}, weights.data());
390+
391+
auto deserialized_model = core.read_model(serialized_xml.str(), weights_tensor);
392+
393+
{
394+
auto constant = std::make_shared<ov::op::v0::Constant>(ov::element::f16,
395+
ov::Shape{4, 2},
396+
std::vector<float>{1, 2, 3, 4, 5, 6, 7, 8});
397+
auto convert = std::make_shared<ov::op::v0::Convert>(constant, ov::element::f32);
398+
auto param = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::Shape{4, 2});
399+
auto add = std::make_shared<ov::op::v1::Add>(convert, param);
400+
401+
auto expected = std::make_shared<ov::Model>(add->outputs(), ov::ParameterVector{param}, "ConcatAddModel");
402+
403+
const auto& [success, message] =
404+
compare_functions(deserialized_model, expected, true, false, false, true, true);
405+
ASSERT_TRUE(success) << message;
406+
}
407+
}
408+
409+
TEST(PostponedConstantTest, F16CompressionNotPostponned) {
410+
std::stringstream serialized_xml, serialized_bin;
411+
auto check_model = [](const std::shared_ptr<ov::Model>& model) {
412+
auto const1 =
413+
std::make_shared<ov::op::v0::Constant>(ov::element::f16, ov::Shape{2, 2}, std::vector<float>{1, 2, 3, 4});
414+
auto convert1 = std::make_shared<ov::op::v0::Convert>(const1, ov::element::f32);
415+
auto const2 =
416+
std::make_shared<ov::op::v0::Constant>(ov::element::f16, ov::Shape{2, 2}, std::vector<float>{5, 6, 7, 8});
417+
auto convert2 = std::make_shared<ov::op::v0::Convert>(const2, ov::element::f32);
418+
auto concat = std::make_shared<ov::op::v0::Concat>(ov::OutputVector{convert1, convert2}, 0);
419+
concat->get_rt_info()["postponed_constant"] = true;
420+
421+
auto param = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::Shape{4, 2});
422+
auto add = std::make_shared<ov::op::v1::Add>(concat, param);
423+
424+
auto expected = std::make_shared<ov::Model>(add->outputs(), ov::ParameterVector{param}, "ConcatAddModel");
425+
426+
const auto& [success, message] = compare_functions(model, expected, true, false, false, true, true);
427+
ASSERT_TRUE(success) << message;
428+
};
429+
430+
{
431+
auto const1 =
432+
std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{2, 2}, std::vector<float>{1, 2, 3, 4});
433+
auto const2 =
434+
std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{2, 2}, std::vector<float>{5, 6, 7, 8});
435+
auto concat = std::make_shared<ov::op::v0::Concat>(ov::OutputVector{const1, const2}, 0);
436+
concat->get_rt_info()["postponed_constant"] = true;
437+
438+
auto param = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::Shape{4, 2});
439+
auto add = std::make_shared<ov::op::v1::Add>(concat, param);
440+
441+
auto model = std::make_shared<ov::Model>(add->outputs(), ov::ParameterVector{param}, "ConcatAddModel");
442+
443+
bool postponed = false;
444+
ov::pass::compress_model_to_f16(model, postponed);
445+
446+
check_model(model);
447+
448+
ASSERT_THROW(ov::pass::Serialize(serialized_xml, serialized_bin).run_on_model(model), ov::Exception);
449+
}
450+
}

0 commit comments

Comments
 (0)