[onert] Support dconv2d hybrid (Samsung#11479)
It supports hybrid dconv2d - weights: int8 symmetric, in/out: float.

ONE-DCO-1.0-Signed-off-by: Sanggyu Lee <[email protected]>
glistening authored Sep 5, 2023
1 parent a836fa5 commit 82de9c2
Showing 4 changed files with 159 additions and 2 deletions.
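
Reviewer note: "hybrid" here means the weights stay int8 (symmetric, per-channel scales) while activations stay float. At run time each batch entry of the input is quantized to int8 with its own asymmetric scale/offset, products are accumulated in int32, and the accumulator is rescaled to float by input_scale * per_channel_weight_scale. The sketch below is a minimal illustration of that arithmetic, not the cker implementation; AsymmetricQuantize and HybridAccumulate are made-up names (the real entry points are nnfw::cker::PortableAsymmetricQuantizeFloats and nnfw::cker::reference_integer_ops::DepthwiseConvHybridPerChannel).

#include <algorithm>
#include <cmath>
#include <cstdint>

// Quantize a float span to int8 with one asymmetric scale/offset
// (done once per batch entry before the kernel runs).
void AsymmetricQuantize(const float *values, int size, int8_t *quantized,
                        float *scaling_factor, int32_t *offset)
{
  const auto [min_it, max_it] = std::minmax_element(values, values + size);
  const float rmin = std::min(0.0f, *min_it);
  const float rmax = std::max(0.0f, *max_it);
  *scaling_factor = rmax > rmin ? (rmax - rmin) / 255.0f : 1.0f;
  *offset = static_cast<int32_t>(std::round(-128.0f - rmin / *scaling_factor));
  for (int i = 0; i < size; ++i)
  {
    const int32_t q = static_cast<int32_t>(std::round(values[i] / *scaling_factor)) + *offset;
    quantized[i] = static_cast<int8_t>(std::clamp<int32_t>(q, -128, 127));
  }
}

// One output element of the hybrid convolution: int32 accumulation of
// symmetric int8 weights against the offset-corrected int8 input, then a
// single float rescale by input_scale * per-channel weight scale.
float HybridAccumulate(const int8_t *input, const int8_t *weights, int len,
                       float input_scale, int32_t input_offset,
                       float weight_scale, float bias)
{
  int32_t acc = 0;
  for (int i = 0; i < len; ++i)
    acc += (static_cast<int32_t>(input[i]) - input_offset) * weights[i];
  return static_cast<float>(acc) * input_scale * weight_scale + bias;
}

The per-batch scale/offset is why prepareQ8iHybridPerChannel below sizes _input_scaling_factors and _input_offsets with batch_size.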
1 change: 1 addition & 0 deletions compute/cker/include/cker/operation/DepthwiseConv.h
@@ -26,6 +26,7 @@
#include "cker/operation/optimized/DepthwiseConvUint8.h"
#include "cker/operation/optimized/integer_ops/DepthwiseConvInt8.h"
#include "cker/operation/reference/integer_ops/DepthwiseConvUInt8.h"
#include "cker/operation/reference/integer_ops/DepthwiseConvHybrid.h"
#include "cker/CpuBackendThreadpool.h"

namespace nnfw
86 changes: 84 additions & 2 deletions runtime/onert/backend/cpu/ops/DepthwiseConvolutionLayer.cc
@@ -16,6 +16,7 @@

#include "DepthwiseConvolutionLayer.h"

#include "cker/PortableTensorUtils.h"
#include <cker/operation/DepthwiseConv.h>

namespace onert
@@ -147,6 +148,50 @@ void DepthwiseConvolutionLayer::convQ8i()
_external_context->ruy_context());
}

void DepthwiseConvolutionLayer::convQ8iHybridPerChannel()
{
if (!_prepared)
{
prepareQ8iHybridPerChannel();
_prepared = true;
}

float output_activation_min = 0, output_activation_max = 0;
CalculateActivationRange(_activation, &output_activation_min, &output_activation_max);

auto input_shape = getShape(_input);
const int batch_size = input_shape.Dims(0);
const int input_size = input_shape.FlatSize() / batch_size;

auto scaling_factors_ptr = _input_scaling_factors.data();
auto input_offsets_ptr = _input_offsets.data();

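// Quantize the float activations one batch entry at a time; the per-batch
// scale and offset recorded here are what the hybrid kernel later uses to
// rescale the int32 accumulator back to float.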
for (int b = 0; b < batch_size; ++b)
{
const int offset = b * input_size;
nnfw::cker::PortableAsymmetricQuantizeFloats(getBuffer<float>(_input) + offset, input_size,
_input_quantized.data() + offset,
&scaling_factors_ptr[b], &input_offsets_ptr[b]);
}

nnfw::cker::DepthwiseConvParams op_params;
op_params.padding_values.width = _paddingLeft;
op_params.padding_values.height = _paddingTop;
op_params.depth_multiplier = _multiplier;
op_params.stride_width = _strideWidth;
op_params.stride_height = _strideHeight;
op_params.dilation_width_factor = _dilationWidth;
op_params.dilation_height_factor = _dilationHeight;
op_params.float_activation_min = output_activation_min;
op_params.float_activation_max = output_activation_max;

nnfw::cker::reference_integer_ops::DepthwiseConvHybridPerChannel(
op_params, _input_scaling_factors.data(), getShape(_input), _input_quantized.data(),
getShape(_kernel), getBuffer<int8_t>(_kernel), getShape(_bias), getBuffer<float>(_bias),
getShape(_output), getBuffer<float>(_output), _kernel->data_scales().data(),
_input_offsets.data());
}

void DepthwiseConvolutionLayer::prepareQ8i()
{
GetQuantizedConvolutionMultipliersAndShifts(
@@ -163,6 +208,31 @@ void DepthwiseConvolutionLayer::prepareQ8uPerChannel()
_per_channel_output_shift);
}

void DepthwiseConvolutionLayer::prepareQ8iHybridPerChannel()
{
// Allocate memory for activation quantization:
// - quantized values (int8_t, same shape as the original input)
// - quantization params (one scale/zero-point per batch entry)
auto input_shape = getShape(_input);
const int batch_size = input_shape.Dims(0);
const int input_size = input_shape.FlatSize() / batch_size;
_input_quantized.resize(batch_size * input_size); // whole input, not just one batch entry
// TODO: Optimize the case of batch_size = 1
_input_scaling_factors.resize(batch_size);
_input_offsets.resize(batch_size);
}

void DepthwiseConvolutionLayer::ensureQ8iHybridPerChannel()
{
// Ensure the weight is per-channel quantized.
int32_t kernel_channel_cnt = getShape(_kernel).Dims(3);
// Per-channel scales come from a flatbuffer vector, so their count is within uint32_t range.
size_t kernel_scale_cnt = _kernel->data_scales().size();
// Promote both to int64_t to compare the signed and unsigned counts safely.
if ((int64_t)kernel_channel_cnt != (int64_t)kernel_scale_cnt)
  throw std::runtime_error{"DConv2D hybrid supports only per-channel quantized weight."};
}
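// A per-tensor (single-scale) weight is rejected above because the hybrid
// kernel reads one weight scale per output channel, so a single-element
// scale vector would be indexed out of range (exercised by the
// neg_OneOp_DepthwiseConv2D_I8_Hybrid_PerTensor test below).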

void DepthwiseConvolutionLayer::configure(
const IPortableTensor *input, const IPortableTensor *kernel, const IPortableTensor *bias,
const uint32_t paddingLeft, const uint32_t paddingRight, const uint32_t paddingTop,
@@ -186,8 +256,16 @@ void DepthwiseConvolutionLayer::configure(
_activation = activation;
_output = output;
_external_context = external_context;
_is_hybrid = _input->data_type() == OperandType::FLOAT32 &&
_kernel->data_type() == OperandType::QUANT_INT8_SYMM;

if (_input->data_type() == OperandType::QUANT_INT8_ASYMM)
if (_is_hybrid)
{
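// Hybrid preparation is cheap and shape-driven: check the per-channel
// weight layout once and size the scratch buffers; the activations
// themselves are quantized on every run().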
ensureQ8iHybridPerChannel();
prepareQ8iHybridPerChannel();
_prepared = true;
}
else if (_input->data_type() == OperandType::QUANT_INT8_ASYMM)
{
if (_kernel->is_constant() && !_input->is_dynamic() && !_output->is_dynamic())
{
@@ -209,7 +287,11 @@

void DepthwiseConvolutionLayer::run()
{
if (_input->data_type() == OperandType::FLOAT32)
if (_is_hybrid)
{
convQ8iHybridPerChannel();
}
else if (_input->data_type() == OperandType::FLOAT32)
{
convFloat32();
}
9 changes: 9 additions & 0 deletions runtime/onert/backend/cpu/ops/DepthwiseConvolutionLayer.h
@@ -44,6 +44,7 @@ class DepthwiseConvolutionLayer : public ::onert::exec::IFunction
void convQ8uPerChannel();

void convQ8i();
void convQ8iHybridPerChannel();

void configure(const IPortableTensor *input, const IPortableTensor *kernel,
const IPortableTensor *bias, const uint32_t paddingLeft,
@@ -58,6 +59,8 @@
private:
void prepareQ8i();
void prepareQ8uPerChannel();
void prepareQ8iHybridPerChannel();
void ensureQ8iHybridPerChannel();

private:
const IPortableTensor *_input{nullptr};
@@ -87,6 +90,12 @@ class DepthwiseConvolutionLayer : public ::onert::exec::IFunction
// Per channel output multiplier and shift.
std::vector<int32_t> _per_channel_output_multiplier;
std::vector<int> _per_channel_output_shift;

// For hybrid
bool _is_hybrid{false};
std::vector<int8_t> _input_quantized;
std::vector<float> _input_scaling_factors;
std::vector<int32_t> _input_offsets;
};

} // namespace ops
65 changes: 65 additions & 0 deletions tests/nnfw_api/src/one_op_tests/DepthwiseConv2D.test.cc
@@ -203,6 +203,45 @@ TEST_F(GenModelTest, OneOp_DepthwiseConv2D_U8_PerChannel)
SUCCEED();
}

TEST_F(GenModelTest, OneOp_DepthwiseConv2D_I8_Hybrid_PerChannel)
{
CircleGen cgen;
// weight
// clang-format off
std::vector<int8_t> weight_data{1, 2, 1, 2, -9, 10, -9, 10,
5, 6, 5, 6, 13, -14, 13, -14};
// clang-format on
uint32_t weight_buf = cgen.addBuffer(weight_data);
std::vector<float> weight_scales = {1, 1, 1, 1};
std::vector<int64_t> weight_zeropoints = {0, 0, 0, 0};
int weight = cgen.addTensor({{1, 2, 2, 4}, circle::TensorType::TensorType_INT8, weight_buf},
weight_scales, weight_zeropoints);
// bias
std::vector<float> bias_data{0, 1, 2, 3};
uint32_t bias_buf = cgen.addBuffer(bias_data);
int bias = cgen.addTensor({{1, 1, 1, 4}, circle::TensorType::TensorType_FLOAT32, bias_buf});

// in and out
int in = cgen.addTensor({{1, 3, 2, 2}, circle::TensorType::TensorType_FLOAT32});
int out = cgen.addTensor({{1, 2, 1, 4}, circle::TensorType::TensorType_FLOAT32});

cgen.addOperatorDepthwiseConv2D({{in, weight, bias}, {out}}, circle::Padding_VALID, 1, 1, 2,
circle::ActivationFunctionType_NONE);
cgen.setInputsAndOutputs({in}, {out});

_context = std::make_unique<GenModelTestContext>(cgen.finish());
// clang-format off
_context->addTestCase(uniformTCD<float>({{0, 1, 2, 3,
0, 1, 2, 3,
0, 1, 2, 3}},
{{8, -7, 20, -1,
8, -7, 20, -1}}));
// clang-format on
_context->setBackends({"cpu"});

SUCCEED();
}
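// Sanity check for the expected values above (weight scales are all 1, so the
// weights dequantize exactly and only the on-the-fly input quantization adds
// a small error): output channel 0 at the first output position accumulates
// 0*1 + 2*(-9) + 0*5 + 2*13 = 8 with bias 0, and channel 1 gives
// 0*2 + 2*10 + 0*6 + 2*(-14) + 1 = -7, matching {8, -7, 20, -1}.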

TEST_F(GenModelTest, neg_OneOp_DepthwiseConv2D_Stride)
{
CircleGen cgen;
@@ -500,3 +539,29 @@ TEST_F(GenModelTest, neg_OneOp_DepthwiseConv2D_I8_NonZero_ZeroPoints)

SUCCEED();
}

TEST_F(GenModelTest, neg_OneOp_DepthwiseConv2D_I8_Hybrid_PerTensor)
{
// PerTensor Quantized Weight is not supported
CircleGen cgen;
std::vector<int8_t> weight_data{1, 2, 3};
uint32_t weight_buf = cgen.addBuffer(weight_data);
std::vector<float> bias_data{0, 2, 4};
uint32_t bias_buf = cgen.addBuffer(bias_data);
int in = cgen.addTensor({{1, 1, 1, 3}, circle::TensorType::TensorType_FLOAT32});
// Hybrid does not support per-tensor.
std::vector<float> weight_scales = {0.5};
std::vector<int64_t> weight_zeropoints = {0};
int weight = cgen.addTensor({{1, 1, 1, 3}, circle::TensorType::TensorType_INT8, weight_buf},
weight_scales, weight_zeropoints);
int bias = cgen.addTensor({{1, 1, 1, 3}, circle::TensorType::TensorType_FLOAT32, bias_buf});
int out = cgen.addTensor({{1, 1, 1, 3}, circle::TensorType::TensorType_FLOAT32});
cgen.addOperatorDepthwiseConv2D({{in, weight, bias}, {out}}, circle::Padding_VALID, 1, 1,
/* depth_multiplier */ 1, circle::ActivationFunctionType_NONE);
cgen.setInputsAndOutputs({in}, {out});

_context = std::make_unique<GenModelTestContext>(cgen.finish());
_context->expectFailCompile();
_context->setBackends({"cpu"});
SUCCEED();
}
