Perform f16 compression to postponed constant input (#32631)

olpipi · web-flow · commit acac26c6b0b3 · 2025-11-07T16:26:26.000Z
### Details:
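 - Apply f16 compression to constants that feed a node marked `postponed_constant`; when the compression itself is postponed, the decompression Convert is inserted after that node instead of directly after the constant.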
 - *...*

### Tickets:
 - *ticket-id*
diff --git a/src/common/transformations/src/transformations/common_optimizations/compress_float_constants.cpp b/src/common/transformations/src/transformations/common_optimizations/compress_float_constants.cpp
@@ -7,6 +7,7 @@
 #include "itt.hpp"
 #include "openvino/core/graph_util.hpp"
 #include "openvino/core/rt_info.hpp"
+#include "openvino/core/type.hpp"
 #include "openvino/op/constant.hpp"
 #include "openvino/op/convert.hpp"
 #include "openvino/op/fake_convert.hpp"
@@ -196,21 +197,57 @@ ov::pass::CompressFloatConstantsImpl::CompressFloatConstantsImpl(bool postponed)
             return false;
         }
         auto constant_target_inputs = const_node->get_output_target_inputs(0);
-        auto convert = std::make_shared<ov::op::v0::Convert>(new_const, const_node->get_element_type());
-
-        convert->set_friendly_name(const_node->get_friendly_name());
-        new_const->set_friendly_name(const_node->get_friendly_name() + "_compressed");
-        ov::copy_runtime_info(const_node, convert);
-        ov::mark_as_decompression(convert);
-        if (postponed) {
-            postpone_fp16_compression(new_const->get_rt_info());
-            postpone_fp16_compression(new_const->get_output_tensor(0).get_rt_info());
-
-            for (const auto& target_input : constant_target_inputs) {
-                target_input.replace_source_output(convert);
+
+        // Check whether the sole consumer of this constant is marked "postponed_constant"; such a node is constant-folded later, during serialization.
+        auto postponed_constant_node = [&]() -> std::shared_ptr<ov::Node> {
+            if (constant_target_inputs.size() == 1 &&
+                constant_target_inputs.begin()->get_node()->get_rt_info().count("postponed_constant")) {
+                return constant_target_inputs.begin()->get_node()->shared_from_this();
+            }
+            return nullptr;
+        }();
+
+        if (postponed_constant_node && postponed) {
+            // If f16 conversion is also postponed, we need to insert Convert after the postponed_constant_node
+            if (is_fp16_compression_postponed(postponed_constant_node->get_rt_info())) {
+                // Convert was already added after postponed_constant_node. Get it and just update rt info
+                auto next_node = postponed_constant_node->get_output_target_inputs(0).begin()->get_node();
+                OPENVINO_ASSERT(ov::as_type<ov::op::v0::Convert>(next_node));
+                ov::copy_runtime_info(const_node, next_node->shared_from_this());
+            } else {
+                auto postponed_constant_target_inputs = postponed_constant_node->get_output_target_inputs(0);
+                auto convert =
+                    std::make_shared<ov::op::v0::Convert>(postponed_constant_node, const_node->get_element_type());
+
+                convert->set_friendly_name(postponed_constant_node->get_friendly_name());
+                ov::mark_as_decompression(convert);
+                ov::copy_runtime_info(const_node, convert);
+                postponed_constant_node->set_friendly_name(postponed_constant_node->get_friendly_name() +
+                                                           "_compressed");
+                postpone_fp16_compression(postponed_constant_node->get_rt_info());
+                postpone_fp16_compression(postponed_constant_node->get_output_tensor(0).get_rt_info());
+
+                for (const auto& target_input : postponed_constant_target_inputs) {
+                    target_input.replace_source_output(convert);
+                }
             }
         } else {
-            ov::replace_node(const_node, convert);
+            auto convert = std::make_shared<ov::op::v0::Convert>(new_const, const_node->get_element_type());
+
+            convert->set_friendly_name(const_node->get_friendly_name());
+            new_const->set_friendly_name(const_node->get_friendly_name() + "_compressed");
+            ov::copy_runtime_info(const_node, convert);
+            ov::mark_as_decompression(convert);
+            if (postponed) {
+                postpone_fp16_compression(new_const->get_rt_info());
+                postpone_fp16_compression(new_const->get_output_tensor(0).get_rt_info());
+
+                for (const auto& target_input : constant_target_inputs) {
+                    target_input.replace_source_output(convert);
+                }
+            } else {
+                ov::replace_node(const_node, convert);
+            }
         }
         return true;
     };
diff --git a/src/common/transformations/src/transformations/common_optimizations/moe_transpose_weights.cpp b/src/common/transformations/src/transformations/common_optimizations/moe_transpose_weights.cpp
@@ -109,7 +109,6 @@ ov::pass::VectorizedMOE2GEMMTransposeWeights::VectorizedMOE2GEMMTransposeWeights
             if (ov::is_type<ov::op::v0::Constant>(transpose_input.get_node_shared_ptr())) {
                 transpose->get_rt_info()["postponed_constant"] = true;
                 ov::pass::disable_constant_folding(transpose);
-                ov::disable_fp16_compression(transpose_input.get_node_shared_ptr());
             }
 
             ov::NodeVector rt_sources{transpose_input.get_node_shared_ptr()};
diff --git a/src/common/transformations/tests/common_optimizations/moe_transpose_weights_test.cpp b/src/common/transformations/tests/common_optimizations/moe_transpose_weights_test.cpp
@@ -30,7 +30,6 @@
 #include "openvino/op/unsqueeze.hpp"
 #include "openvino/pass/constant_folding.hpp"
 #include "transformations/rt_info/decompression.hpp"
-#include "transformations/rt_info/disable_fp16_compression.hpp"
 
 using namespace ov;
 
@@ -85,7 +84,6 @@ std::shared_ptr<ov::Model> build_moe_2gemm_model(bool use_decompression, bool wi
             auto gate_transpose = std::make_shared<op::v1::Transpose>(convert_input, order);
             gate_transpose->get_rt_info()["postponed_constant"] = true;
             ov::pass::disable_constant_folding(gate_transpose);
-            ov::disable_fp16_compression(gate_const);
             convert_input = gate_transpose;
         }
 
@@ -99,7 +97,6 @@ std::shared_ptr<ov::Model> build_moe_2gemm_model(bool use_decompression, bool wi
             auto gate_transpose = std::make_shared<op::v1::Transpose>(gate_weight_output, order);
             gate_transpose->get_rt_info()["postponed_constant"] = true;
             ov::pass::disable_constant_folding(gate_transpose);
-            ov::disable_fp16_compression(gate_weights);
             gate_weight_output = gate_transpose;
         }
     }
@@ -156,7 +153,6 @@ std::shared_ptr<ov::Model> build_moe_2gemm_model(bool use_decompression, bool wi
             auto down_transpose = std::make_shared<op::v1::Transpose>(convert_input, order);
             down_transpose->get_rt_info()["postponed_constant"] = true;
             ov::pass::disable_constant_folding(down_transpose);
-            ov::disable_fp16_compression(down_const);
             convert_input = down_transpose;
         }
 
@@ -177,7 +173,6 @@ std::shared_ptr<ov::Model> build_moe_2gemm_model(bool use_decompression, bool wi
             auto down_transpose = std::make_shared<op::v1::Transpose>(down_weight_output, order);
             down_transpose->get_rt_info()["postponed_constant"] = true;
             ov::pass::disable_constant_folding(down_transpose);
-            ov::disable_fp16_compression(down_const);
             down_weight_output = down_transpose;
         }
     }
diff --git a/src/core/src/xml_util/xml_serialize_util.cpp b/src/core/src/xml_util/xml_serialize_util.cpp
@@ -11,6 +11,7 @@
 #include "openvino/core/except.hpp"
 #include "openvino/core/meta_data.hpp"
 #include "openvino/core/model.hpp"
+#include "openvino/core/rt_info.hpp"
 #include "openvino/core/runtime_attribute.hpp"
 #include "openvino/op/binary_convolution.hpp"
 #include "openvino/op/constant.hpp"
@@ -58,14 +59,16 @@ class PostponedConstantReplacer {
                 // clone to keep original node unchanged
                 node_clone = node->clone_with_new_inputs(node->input_values());
                 node_clone->get_rt_info().erase(ov::pass::DisableConstantFolding::get_type_info_static());
-                node = node_clone.get();
             }
+            auto node_to_fold = node_clone ? node_clone : node->shared_from_this();
             OPENVINO_ASSERT(
-                node->constant_fold(outputs, node->input_values()),
+                node_to_fold->constant_fold(outputs, node_to_fold->input_values()),
                 "Node with set `postponed_constant` attribute cannot be fold to constant when saving model to IR file");
             m_constant = outputs[0].get_node_shared_ptr();
             m_node = m_constant.get();
             m_node->set_friendly_name(node->get_friendly_name());
+            ov::copy_runtime_info(node->shared_from_this(), m_constant);
+            ov::copy_output_runtime_info(node->outputs(), m_constant->outputs());
         }
     }
 };
diff --git a/src/core/tests/pass/serialization/custom_ops.cpp b/src/core/tests/pass/serialization/custom_ops.cpp
@@ -11,11 +11,13 @@
 #include "openvino/op/add.hpp"
 #include "openvino/op/concat.hpp"
 #include "openvino/op/constant.hpp"
+#include "openvino/op/convert.hpp"
 #include "openvino/op/multiply.hpp"
 #include "openvino/pass/constant_folding.hpp"
 #include "openvino/pass/manager.hpp"
 #include "openvino/pass/serialize.hpp"
 #include "openvino/runtime/core.hpp"
+#include "transformations/common_optimizations/compress_float_constants.hpp"
 
 class CustomOpsSerializationTest : public ::testing::Test {
 protected:
@@ -186,7 +188,7 @@ TEST(PostponedConstantTest, ConcatWithPostponedConstant) {
 
         auto model = std::make_shared<ov::Model>(add->outputs(), ov::ParameterVector{param}, "ConcatAddModel");
 
-        ov::pass::Serialize(serialized_xml, serialized_bin).run_on_model(model);
+        ASSERT_NO_THROW(ov::pass::Serialize(serialized_xml, serialized_bin).run_on_model(model));
     }
     ov::Core core;
 
@@ -230,7 +232,7 @@ TEST(PostponedConstantTest, SubgraphExclusion) {
         auto model =
             std::make_shared<ov::Model>(final_add->outputs(), ov::ParameterVector{param}, "SubgraphExclusionModel");
 
-        ov::pass::Serialize(serialized_xml, serialized_bin).run_on_model(model);
+        ASSERT_NO_THROW(ov::pass::Serialize(serialized_xml, serialized_bin).run_on_model(model));
     }
     ov::Core core;
 
@@ -274,7 +276,7 @@ TEST(PostponedConstantTest, NodeWithMultipleConsumers) {
 
         concat->get_rt_info()["postponed_constant"] = true;
 
-        ov::pass::Serialize(serialized_xml, serialized_bin).run_on_model(model);
+        ASSERT_NO_THROW(ov::pass::Serialize(serialized_xml, serialized_bin).run_on_model(model));
     }
     ov::Core core;
 
@@ -330,7 +332,7 @@ TEST(PostponedConstantTest, ModelIsUnchangedAfterSerialization) {
         ov::pass::disable_constant_folding(concat);
 
         auto model_copy = model->clone();
-        ov::pass::Serialize(serialized_xml, serialized_bin).run_on_model(model);
+        ASSERT_NO_THROW(ov::pass::Serialize(serialized_xml, serialized_bin).run_on_model(model));
 
         const auto& [success, message] = compare_functions(model_copy, model, true, true, true, true, true);
         ASSERT_TRUE(success) << message;
@@ -358,3 +360,91 @@ TEST(PostponedConstantTest, ModelIsUnchangedAfterSerialization) {
         ASSERT_TRUE(success) << message;
     }
 }
+
+TEST(PostponedConstantTest, F16Compression2Inputs) {
+    std::stringstream serialized_xml, serialized_bin;
+    {
+        auto const1 =
+            std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{2, 2}, std::vector<float>{1, 2, 3, 4});
+        auto const2 =
+            std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{2, 2}, std::vector<float>{5, 6, 7, 8});
+        auto concat = std::make_shared<ov::op::v0::Concat>(ov::OutputVector{const1, const2}, 0);
+        concat->get_rt_info()["postponed_constant"] = true;
+
+        auto param = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::Shape{4, 2});
+        auto add = std::make_shared<ov::op::v1::Add>(concat, param);
+
+        auto model = std::make_shared<ov::Model>(add->outputs(), ov::ParameterVector{param}, "ConcatAddModel");
+
+        // With postponed_constant combined with postponed f16 compression, the f16 -> f32 Convert must be inserted
+        // after the postponed constant node.
+        bool postponed = true;
+        ov::pass::compress_model_to_f16(model, postponed);
+
+        ASSERT_NO_THROW(ov::pass::Serialize(serialized_xml, serialized_bin).run_on_model(model));
+    }
+    ov::Core core;
+
+    auto weights = serialized_bin.str();
+    ov::Tensor weights_tensor(ov::element::u8, ov::Shape{weights.size()}, weights.data());
+
+    auto deserialized_model = core.read_model(serialized_xml.str(), weights_tensor);
+
+    {
+        auto constant = std::make_shared<ov::op::v0::Constant>(ov::element::f16,
+                                                               ov::Shape{4, 2},
+                                                               std::vector<float>{1, 2, 3, 4, 5, 6, 7, 8});
+        auto convert = std::make_shared<ov::op::v0::Convert>(constant, ov::element::f32);
+        auto param = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::Shape{4, 2});
+        auto add = std::make_shared<ov::op::v1::Add>(convert, param);
+
+        auto expected = std::make_shared<ov::Model>(add->outputs(), ov::ParameterVector{param}, "ConcatAddModel");
+
+        const auto& [success, message] =
+            compare_functions(deserialized_model, expected, true, false, false, true, true);
+        ASSERT_TRUE(success) << message;
+    }
+}
+
+TEST(PostponedConstantTest, F16CompressionNotPostponned) {
+    std::stringstream serialized_xml, serialized_bin;
+    auto check_model = [](const std::shared_ptr<ov::Model>& model) {
+        auto const1 =
+            std::make_shared<ov::op::v0::Constant>(ov::element::f16, ov::Shape{2, 2}, std::vector<float>{1, 2, 3, 4});
+        auto convert1 = std::make_shared<ov::op::v0::Convert>(const1, ov::element::f32);
+        auto const2 =
+            std::make_shared<ov::op::v0::Constant>(ov::element::f16, ov::Shape{2, 2}, std::vector<float>{5, 6, 7, 8});
+        auto convert2 = std::make_shared<ov::op::v0::Convert>(const2, ov::element::f32);
+        auto concat = std::make_shared<ov::op::v0::Concat>(ov::OutputVector{convert1, convert2}, 0);
+        concat->get_rt_info()["postponed_constant"] = true;
+
+        auto param = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::Shape{4, 2});
+        auto add = std::make_shared<ov::op::v1::Add>(concat, param);
+
+        auto expected = std::make_shared<ov::Model>(add->outputs(), ov::ParameterVector{param}, "ConcatAddModel");
+
+        const auto& [success, message] = compare_functions(model, expected, true, false, false, true, true);
+        ASSERT_TRUE(success) << message;
+    };
+
+    {
+        auto const1 =
+            std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{2, 2}, std::vector<float>{1, 2, 3, 4});
+        auto const2 =
+            std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{2, 2}, std::vector<float>{5, 6, 7, 8});
+        auto concat = std::make_shared<ov::op::v0::Concat>(ov::OutputVector{const1, const2}, 0);
+        concat->get_rt_info()["postponed_constant"] = true;
+
+        auto param = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::Shape{4, 2});
+        auto add = std::make_shared<ov::op::v1::Add>(concat, param);
+
+        auto model = std::make_shared<ov::Model>(add->outputs(), ov::ParameterVector{param}, "ConcatAddModel");
+
+        bool postponed = false;
+        ov::pass::compress_model_to_f16(model, postponed);
+
+        check_model(model);
+
+        ASSERT_THROW(ov::pass::Serialize(serialized_xml, serialized_bin).run_on_model(model), ov::Exception);
+    }
+}

Original file line number	Diff line number	Diff line change
`@@ -109,7 +109,6 @@ ov::pass::VectorizedMOE2GEMMTransposeWeights::VectorizedMOE2GEMMTransposeWeights`
`109`	`109`	`if (ov::is_type<ov::op::v0::Constant>(transpose_input.get_node_shared_ptr())) {`
`110`	`110`	`transpose->get_rt_info()["postponed_constant"] = true;`
`111`	`111`	`ov::pass::disable_constant_folding(transpose);`
`112`		`- ov::disable_fp16_compression(transpose_input.get_node_shared_ptr());`
`113`	`112`	`}`
`114`	`113`
`115`	`114`	`ov::NodeVector rt_sources{transpose_input.get_node_shared_ptr()};`
Original file line number	Diff line number	Diff line change
`@@ -30,7 +30,6 @@`
`30`	`30`	`#include "openvino/op/unsqueeze.hpp"`
`31`	`31`	`#include "openvino/pass/constant_folding.hpp"`
`32`	`32`	`#include "transformations/rt_info/decompression.hpp"`
`33`		`-#include "transformations/rt_info/disable_fp16_compression.hpp"`
`34`	`33`
`35`	`34`	`using namespace ov;`
`36`	`35`
`@@ -85,7 +84,6 @@ std::shared_ptr<ov::Model> build_moe_2gemm_model(bool use_decompression, bool wi`
`85`	`84`	`auto gate_transpose = std::make_shared<op::v1::Transpose>(convert_input, order);`
`86`	`85`	`gate_transpose->get_rt_info()["postponed_constant"] = true;`
`87`	`86`	`ov::pass::disable_constant_folding(gate_transpose);`
`88`		`- ov::disable_fp16_compression(gate_const);`
`89`	`87`	`convert_input = gate_transpose;`
`90`	`88`	`}`
`91`	`89`
`@@ -99,7 +97,6 @@ std::shared_ptr<ov::Model> build_moe_2gemm_model(bool use_decompression, bool wi`
`99`	`97`	`auto gate_transpose = std::make_shared<op::v1::Transpose>(gate_weight_output, order);`
`100`	`98`	`gate_transpose->get_rt_info()["postponed_constant"] = true;`
`101`	`99`	`ov::pass::disable_constant_folding(gate_transpose);`
`102`		`- ov::disable_fp16_compression(gate_weights);`
`103`	`100`	`gate_weight_output = gate_transpose;`
`104`	`101`	`}`
`105`	`102`	`}`
`@@ -156,7 +153,6 @@ std::shared_ptr<ov::Model> build_moe_2gemm_model(bool use_decompression, bool wi`
`156`	`153`	`auto down_transpose = std::make_shared<op::v1::Transpose>(convert_input, order);`
`157`	`154`	`down_transpose->get_rt_info()["postponed_constant"] = true;`
`158`	`155`	`ov::pass::disable_constant_folding(down_transpose);`
`159`		`- ov::disable_fp16_compression(down_const);`
`160`	`156`	`convert_input = down_transpose;`
`161`	`157`	`}`
`162`	`158`
`@@ -177,7 +173,6 @@ std::shared_ptr<ov::Model> build_moe_2gemm_model(bool use_decompression, bool wi`
`177`	`173`	`auto down_transpose = std::make_shared<op::v1::Transpose>(down_weight_output, order);`
`178`	`174`	`down_transpose->get_rt_info()["postponed_constant"] = true;`
`179`	`175`	`ov::pass::disable_constant_folding(down_transpose);`
`180`		`- ov::disable_fp16_compression(down_const);`
`181`	`176`	`down_weight_output = down_transpose;`
`182`	`177`	`}`
`183`	`178`	`}`