Skip to content

Commit

Permalink
[infra/onert] Bump up XNNPack version (Samsung#13022)
Browse files Browse the repository at this point in the history
This commit updates XNNPack and dependent packages version.

ONE-DCO-1.0-Signed-off-by: Hyeongseok Oh <[email protected]>
  • Loading branch information
hseok-oh authored May 22, 2024
1 parent c3bd1e2 commit eeffa46
Show file tree
Hide file tree
Showing 17 changed files with 125 additions and 29 deletions.
4 changes: 2 additions & 2 deletions infra/cmake/packages/CpuInfoSourceConfig.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@ function(_CpuInfoSource_import)
nnas_include(OptionTools)

envoption(EXTERNAL_DOWNLOAD_SERVER "https://github.com")
# CPUINFO commit from tflite v2.8
envoption(CPUINFO_URL ${EXTERNAL_DOWNLOAD_SERVER}/pytorch/cpuinfo/archive/5916273f79a21551890fd3d56fc5375a78d1598d.tar.gz)
# CPUINFO commit from tflite v2.16.1
envoption(CPUINFO_URL ${EXTERNAL_DOWNLOAD_SERVER}/pytorch/cpuinfo/archive/ef634603954d88d2643d5809011288b890ac126e.tar.gz)
ExternalSource_Download(CPUINFO
DIRNAME CPUINFO
URL ${CPUINFO_URL})
Expand Down
4 changes: 2 additions & 2 deletions infra/cmake/packages/Fp16SourceConfig.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@ function(_Fp16Source_import)
nnas_include(OptionTools)

envoption(EXTERNAL_DOWNLOAD_SERVER "https://github.com")
# fp16 commit in xnnpack 8b283aa30a31
envoption(FP16_URL ${EXTERNAL_DOWNLOAD_SERVER}/Maratyszcza/FP16/archive/4dfe081cf6bcd15db339cf2680b9281b8451eeb3.tar.gz)
# fp16 commit in xnnpack (tflite v2.16.1)
envoption(FP16_URL ${EXTERNAL_DOWNLOAD_SERVER}/Maratyszcza/FP16/archive/0a92994d729ff76a58f692d3028ca1b64b145d91.tar.gz)
ExternalSource_Download(FP16
DIRNAME FP16
URL ${FP16_URL})
Expand Down
4 changes: 2 additions & 2 deletions infra/cmake/packages/FxdivSourceConfig.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@ function(_FxdivSource_import)
nnas_include(OptionTools)

envoption(EXTERNAL_DOWNLOAD_SERVER "https://github.com")
# fxdiv commit in xnnpack 8b283aa30a31
envoption(FXDIV_URL ${EXTERNAL_DOWNLOAD_SERVER}/Maratyszcza/FXdiv/archive/f8c5354679ec2597792bc70a9e06eff50c508b9a.tar.gz)
# fxdiv commit in tflite v2.16.1
envoption(FXDIV_URL ${EXTERNAL_DOWNLOAD_SERVER}/Maratyszcza/FXdiv/archive/63058eff77e11aa15bf531df5dd34395ec3017c8.tar.gz)
ExternalSource_Download(FXDIV
DIRNAME FXDIV
URL ${FXDIV_URL})
Expand Down
4 changes: 2 additions & 2 deletions infra/cmake/packages/PthreadpoolSourceConfig.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@ function(_PthreadpoolSource_import)
nnas_include(OptionTools)

envoption(EXTERNAL_DOWNLOAD_SERVER "https://github.com")
# pthreadpool commit in xnnpack 8b283aa30a31
envoption(PTHREADPOOL_URL ${EXTERNAL_DOWNLOAD_SERVER}/Maratyszcza/pthreadpool/archive/029c88620802e1361ccf41d1970bd5b07fd6b7bb.tar.gz)
# pthreadpool commit in xnnpack (tflite v2.16.1)
envoption(PTHREADPOOL_URL ${EXTERNAL_DOWNLOAD_SERVER}/Maratyszcza/pthreadpool/archive/4fe0e1e183925bf8cfa6aae24237e724a96479b8.tar.gz)
ExternalSource_Download(PTHREADPOOL
DIRNAME PTHREADPOOL
URL ${PTHREADPOOL_URL})
Expand Down
52 changes: 52 additions & 0 deletions infra/cmake/packages/XnnpackSource.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
--- a/src/configs/dwconv-config.c
+++ b/src/configs/dwconv-config.c
@@ -688,6 +688,7 @@
const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
assert(hardware_config != NULL);
if (hardware_config->use_arm_neon) {
+#if defined(XNN_ENABLE_ASSEMBLY) && XNN_ENABLE_ASSEMBLY
if (hardware_config->use_arm_neon_v8) {
qs8_qc8w_dwconv_config[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p16c__asm_aarch32_neonv8_mla8_cortex_a35;
qs8_qc8w_dwconv_config[0].init.qs8_qc8w = xnn_init_qs8_qc8w_conv_minmax_fp32_neonv8_params;
@@ -708,6 +709,9 @@
qs8_qc8w_dwconv_config[2].channel_round = 1;
qs8_qc8w_dwconv_config[2].primary_tile = 25;
} else {
+#else
+ {
+#endif
qs8_qc8w_dwconv_config[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p16c__neon_mla8_ld128;
qs8_qc8w_dwconv_config[0].init.qs8_qc8w = xnn_init_qs8_qc8w_conv_minmax_fp32_neon_params;
qs8_qc8w_dwconv_config[0].channel_tile = 16;
--- a/src/configs/hardware-config.c
+++ b/src/configs/hardware-config.c
@@ -99,7 +99,11 @@
hardware_config.use_arm_neon = cpuinfo_has_arm_neon();
hardware_config.use_arm_neon_fp16 = cpuinfo_has_arm_neon_fp16();
hardware_config.use_arm_neon_fma = cpuinfo_has_arm_neon_fma();
+#if defined(XNN_ENABLE_ASSEMBLY) && XNN_ENABLE_ASSEMBLY
hardware_config.use_arm_neon_v8 = cpuinfo_has_arm_neon_v8();
+#else
+ hardware_config.use_arm_neon_v8 = false;
+#endif
#endif

#if XNN_ARCH_ARM64
--- a/src/configs/unary-elementwise-config.c
+++ b/src/configs/unary-elementwise-config.c
@@ -1806,11 +1806,15 @@
#if XNN_ARCH_ARM
const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
assert(hardware_config != NULL);
+#if defined(XNN_ENABLE_ASSEMBLY) && XNN_ENABLE_ASSEMBLY
if (hardware_config->use_arm_neon) {
qs16_to_qs8_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qs16_qs8_vcvt_ukernel__asm_aarch32_neon_u16;
qs16_to_qs8_cvt_config.init.qs16_qs8_cvt = xnn_init_qs16_qs8_cvt_neon_params;
qs16_to_qs8_cvt_config.element_tile = 16;
} else if (!XNN_PLATFORM_MOBILE) {
+#else
+ if (!XNN_PLATFORM_MOBILE) {
+#endif
qs16_to_qs8_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qs16_qs8_vcvt_ukernel__scalar_u4;
qs16_to_qs8_cvt_config.init.qs16_qs8_cvt = xnn_init_qs16_qs8_cvt_scalar_params;
qs16_to_qs8_cvt_config.element_tile = 4;
9 changes: 6 additions & 3 deletions infra/cmake/packages/XnnpackSourceConfig.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,14 @@ function(_XnnpackSource_import)
nnas_include(OptionTools)

envoption(EXTERNAL_DOWNLOAD_SERVER "https://github.com")
# xnnpack commit in tflite v2.3
envoption(XNNPACK_URL ${EXTERNAL_DOWNLOAD_SERVER}/google/XNNPACK/archive/8b283aa30a3186c6e640aed520543e9c067132d.tar.gz)
# xnnpack latest commit (2024.05.20)
# xnnpack in tflite v2.16.1 is not stable on armv7l gbs and linux cross build process (assembly microkernel build issue)
# Patch: workaround to resolve build fail by forcing disable using armv8 feature on gbs build and arm linux cross build under gcc 10
envoption(XNNPACK_URL ${EXTERNAL_DOWNLOAD_SERVER}/google/XNNPACK/archive/fcb36699c67201ceff7358df42730809e8f2c9cc.tar.gz)
ExternalSource_Download(XNNPACK
DIRNAME XNNPACK
URL ${XNNPACK_URL})
URL ${XNNPACK_URL}
PATCH ${CMAKE_CURRENT_LIST_DIR}/XnnpackSource.patch)

set(XnnpackSource_DIR ${XNNPACK_SOURCE_DIR} PARENT_SCOPE)
set(XnnpackSource_FOUND TRUE PARENT_SCOPE)
Expand Down
3 changes: 3 additions & 0 deletions infra/nnfw/cmake/options/options_armv7l-linux.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,6 @@ option(BUILD_GPU_CL "Build gpu_cl backend" ON)
option(BUILD_TENSORFLOW_LITE_GPU "Build TensorFlow Lite GPU delegate from the downloaded source" ON)
option(DOWNLOAD_PYBIND11 "Download Pybind11 source" ON)
option(BUILD_PYTHON_BINDING "Build python binding" ON)

# Under linux gcc 10.0, required header for xnnpack arm build is not supported
cmake_dependent_option(BUILD_XNNPACK "Build xnnpack library from the downloaded source" OFF "CXX_COMPILER_VERSION VERSION_LESS 10.0" ON)
4 changes: 2 additions & 2 deletions infra/nnfw/cmake/packages/TensorFlowGpuConfig.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,8 @@ function(_Build_TfliteGpuDelagate_)
nnas_find_package(Farmhash REQUIRED)
return_unless(Farmhash_FOUND)

nnas_find_package(Fp16Source REQUIRED)
return_unless(Fp16Source_FOUND)
nnfw_find_package(Fp16 REQUIRED)
return_unless(Fp16_FOUND)

nnas_find_package(VulkanSource QUIET)
return_unless(VulkanSource_FOUND)
Expand Down
15 changes: 13 additions & 2 deletions infra/nnfw/cmake/packages/XnnpackConfig.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@ function(_Xnnpack_Build)
nnfw_find_package(Fxdiv QUIET)
nnfw_find_package(CpuInfo QUIET)
nnfw_find_package(Pthreadpool QUIET)
nnfw_find_package(Psimd QUIET)
nnfw_find_package(Fp16 QUIET)

# NOTE This line prevents multiple definitions of cpuinfo target
Expand All @@ -19,12 +18,24 @@ function(_Xnnpack_Build)
return()
endif(NOT XnnpackSource_FOUND)

set(XNNPACK_LIBRARY_TYPE "static")
set(XNNPACK_BUILD_TESTS OFF CACHE BOOL "Build XNNPACK unit tests")
set(XNNPACK_BUILD_BENCHMARKS OFF CACHE BOOL "Build XNNPACK benchmarks")
set(XNNPACK_USE_SYSTEM_LIBS ON CACHE BOOL "Use system-provided dependency libraries")

# microkernel build is not supported under gcc 9.x and clang
# TODO Enable this
if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 10.0 OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
set(XNNPACK_ENABLE_ASSEMBLY OFF CACHE BOOL "Build XNNPACK with assembly micro-kernels")
set(XNNPACK_ENABLE_ARM_I8MM OFF CACHE BOOL "Build XNNPACK with ARM I8MM (8-bit integer matrix multiply accumulate) micro-kernels")
# Set definition: used on patched code
add_compile_definitions("XNN_ENABLE_ASSEMBLY=$<BOOL:${XNNPACK_ENABLE_ASSEMBLY}>")
endif()

# Set -fPIC property to XNNPack and linked libraries
set(CMAKE_POSITION_INDEPENDENT_CODE ON)

add_extdirectory("${XnnpackSource_DIR}" XNNPACK EXCLUDE_FROM_ALL)
set_target_properties(XNNPACK PROPERTIES POSITION_INDEPENDENT_CODE ON)
# Suppress warnings generated by xnnpack
set_target_properties(XNNPACK PROPERTIES COMPILE_FLAGS "-Wno-deprecated-declarations")
set(XnnpackSource_DIR ${XnnpackSource_DIR} PARENT_SCOPE)
Expand Down
Binary file modified packaging/CPUINFO.tar.gz
Binary file not shown.
Binary file modified packaging/FP16.tar.gz
Binary file not shown.
Binary file modified packaging/FXDIV.tar.gz
Binary file not shown.
Binary file modified packaging/PTHREADPOOL.tar.gz
Binary file not shown.
Binary file modified packaging/XNNPACK.tar.gz
Binary file not shown.
20 changes: 15 additions & 5 deletions runtime/onert/backend/xnnpack/ops/ConvolutionLayer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ bool ConvolutionLayer::create()
input_channels /* input_channel_stride */, output_channels /* output_channel_stride */,
reinterpret_cast<const float *>(_kernel->buffer()),
reinterpret_cast<const float *>(_bias->buffer()), output_activation_min, output_activation_max,
0, &_kernel_op);
0, nullptr, nullptr, &_kernel_op);
if (status != xnn_status_success)
{
throw std::runtime_error{"failed to create FP32 Convolution operator"};
Expand All @@ -131,10 +131,20 @@ bool ConvolutionLayer::setup()
uint32_t input_width = _input->getShape().dim(2);
uint32_t input_height = _input->getShape().dim(1);
uint32_t batch_size = _input->getShape().dim(0);
enum xnn_status status = xnn_setup_convolution2d_nhwc_f32(
_kernel_op, batch_size, input_height, input_width,
reinterpret_cast<const float *>(_input->buffer()), reinterpret_cast<float *>(_output->buffer()),
_external_context->getThreadPool());
size_t workspace_size = 0;
size_t workspace_alignment = 0;
enum xnn_status status = xnn_reshape_convolution2d_nhwc_f32(
_kernel_op, batch_size, input_height, input_width, &workspace_size, &workspace_alignment,
nullptr, nullptr, _external_context->getThreadPool());
if (status != xnn_status_success)
{
throw std::runtime_error{"failed to reshape FP32 Convolution operator"};
}

std::vector<uint8_t> workspace(workspace_size);
status = xnn_setup_convolution2d_nhwc_f32(_kernel_op, workspace.data(),
reinterpret_cast<const float *>(_input->buffer()),
reinterpret_cast<float *>(_output->buffer()));
if (status != xnn_status_success)
{
throw std::runtime_error{"failed to create FP32 Convolution operator"};
Expand Down
20 changes: 15 additions & 5 deletions runtime/onert/backend/xnnpack/ops/DepthwiseConvolutionLayer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ bool DepthwiseConvolutionLayer::create()
_multiplier /* group_output_channels */, input_channels /* input_channel_stride */,
output_channels /* output_channel_stride */, reinterpret_cast<const float *>(_kernel->buffer()),
reinterpret_cast<const float *>(_bias->buffer()), output_activation_min, output_activation_max,
XNN_FLAG_DEPTHWISE_CONVOLUTION, &_kernel_op);
XNN_FLAG_DEPTHWISE_CONVOLUTION, nullptr, nullptr, &_kernel_op);
if (status != xnn_status_success)
{
throw std::runtime_error{"failed to create FP32 DepthwiseConvolution operator"};
Expand All @@ -132,10 +132,20 @@ bool DepthwiseConvolutionLayer::setup()
uint32_t input_width = _input->getShape().dim(2);
uint32_t input_height = _input->getShape().dim(1);
uint32_t batch_size = _input->getShape().dim(0);
enum xnn_status status = xnn_setup_convolution2d_nhwc_f32(
_kernel_op, batch_size, input_height, input_width,
reinterpret_cast<const float *>(_input->buffer()), reinterpret_cast<float *>(_output->buffer()),
_external_context->getThreadPool());
size_t workspace_size = 0;
size_t workspace_alignment = 0;
enum xnn_status status = xnn_reshape_convolution2d_nhwc_f32(
_kernel_op, batch_size, input_height, input_width, &workspace_size, &workspace_alignment,
nullptr, nullptr, _external_context->getThreadPool());
if (status != xnn_status_success)
{
throw std::runtime_error{"failed to reshape FP32 DepthwiseConvolution operator"};
}

std::vector<uint8_t> workspace(workspace_size);
status = xnn_setup_convolution2d_nhwc_f32(_kernel_op, workspace.data(),
reinterpret_cast<const float *>(_input->buffer()),
reinterpret_cast<float *>(_output->buffer()));
if (status != xnn_status_success)
{
throw std::runtime_error{"failed to create FP32 DepthwiseConvolution operator"};
Expand Down
15 changes: 11 additions & 4 deletions runtime/onert/backend/xnnpack/ops/FullyConnectedLayer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ bool FullyConnectedLayer::create()
enum xnn_status status = xnn_create_fully_connected_nc_f32(
input_channels, output_channels, input_channels /* input stride */,
output_channels /* output stride */, kernel_buffer, bias_buffer, output_activation_min,
output_activation_max, flag, &_kernel_op);
output_activation_max, flag, nullptr, nullptr, &_kernel_op);
if (status != xnn_status_success)
{
throw std::runtime_error{"failed to create FP32 FullyConnected operator"};
Expand All @@ -122,9 +122,16 @@ bool FullyConnectedLayer::setup()
}

uint32_t batch_size = _input->getShape().num_elements() / _kernel->getShape().dim(1);
enum xnn_status status = xnn_setup_fully_connected_nc_f32(
_kernel_op, batch_size, reinterpret_cast<const float *>(_input->buffer()),
reinterpret_cast<float *>(_output->buffer()), _external_context->getThreadPool());
enum xnn_status status =
xnn_reshape_fully_connected_nc_f32(_kernel_op, batch_size, _external_context->getThreadPool());
if (status != xnn_status_success)
{
throw std::runtime_error{"failed to reshape FP32 FullyConnected operator"};
}

status =
xnn_setup_fully_connected_nc_f32(_kernel_op, reinterpret_cast<const float *>(_input->buffer()),
reinterpret_cast<float *>(_output->buffer()));
if (status != xnn_status_success)
{
throw std::runtime_error{"failed to create FP32 FullyConnected operator"};
Expand Down

0 comments on commit eeffa46

Please sign in to comment.