[onert/odc] Auto-compilation. Add nnfw_run_with_auto_compilation method. (#14412)

This PR for `odc:auto-compilation` introduces the `nnfw_run_with_auto_compilation`, `nnfw_odc_delete_minmax_file`, and `nnfw_set_odc_param_minmax_records_count` methods to the NNFW API.

ONE-DCO-1.0-Signed-off-by: Evgenii Maltsev [email protected]
Torrero authored Dec 6, 2024
1 parent 8f0db8a commit 112f55c
Showing 4 changed files with 384 additions and 0 deletions.
52 changes: 52 additions & 0 deletions runtime/onert/api/nnfw/include/nnfw_experimental.h
@@ -558,6 +558,58 @@ NNFW_STATUS nnfw_set_codegen_model_path(nnfw_session *session, const char *path)
*/
NNFW_STATUS nnfw_codegen(nnfw_session *session, const char *target, NNFW_CODEGEN_PREF pref);

/**
* @brief Set the MinMax records count for auto compilation mode with the on-device compiler
*
* This function sets the MinMax records count used for quantization in auto compilation mode.
* To enable auto compilation mode, use {@link nnfw_run_with_auto_compilation}.
*
* @param[in] session nnfw_session
* @param[in] minmax_records_count MinMax records count threshold
* @return @c NNFW_STATUS_NO_ERROR if successful, otherwise return @c NNFW_STATUS_ERROR
*/
NNFW_STATUS nnfw_set_odc_param_minmax_records_count(nnfw_session *session,
int minmax_records_count);

/**
* @brief Delete the MinMax file used by the on-device compiler
*
* @param[in] session nnfw_session
* @return @c NNFW_STATUS_NO_ERROR if successful, otherwise return @c NNFW_STATUS_ERROR
*/
NNFW_STATUS nnfw_odc_delete_minmax_file(nnfw_session *session);

/**
* @brief Run inference with auto compilation
*
* This function runs inference while transparently replacing the original model
* with a quantized or compiled model.
* During inference, MinMax statistics are collected; once enough records have been
* gathered, quantization is performed.
* If quantization succeeds, code generation for the target backend is attempted;
* otherwise the original float model keeps running.
* If compilation succeeds, the compiled model is run; otherwise the quantized model is run.
* The on-device compiler (ODC) provides the quantization and compilation functionality.
* This function should be called after the model is loaded by {@link nnfw_load_model_from_file},
* the session is prepared for inference by {@link nnfw_prepare}, and the input and output
* buffers are set by {@link nnfw_set_input} and {@link nnfw_set_output}.
*
* Additionally, the following parameters should be set up:
* 1. Quantization type {@link nnfw_set_quantization_type}
* 2. Quantized model path {@link nnfw_set_quantized_model_path}
* 3. MinMax records threshold for quantization {@link nnfw_set_odc_param_minmax_records_count}
* 4. Compiled model path {@link nnfw_set_codegen_model_path}
* The file with collected MinMax statistics can be removed by {@link nnfw_odc_delete_minmax_file}.
*
* @param[in] session nnfw_session
* @param[in] target Target backend to generate code for, as in {@link nnfw_codegen}
* @param[in] pref @c NNFW_CODEGEN_PREF
* @return @c NNFW_STATUS_NO_ERROR if successful, otherwise return @c NNFW_STATUS_ERROR
*/
NNFW_STATUS nnfw_run_with_auto_compilation(nnfw_session *session, const char *target,
NNFW_CODEGEN_PREF pref);

//////////////////////////////////////////////
// APIs for configuration
//////////////////////////////////////////////
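For orientation, here is a minimal sketch of the call sequence this header describes. The model paths, buffer sizes, record count, and the "npu-gen" target name are illustrative assumptions, as are the NNFW_QUANTIZE_TYPE_U8_ASYM and NNFW_CODEGEN_PREF_DEFAULT enum values; error handling is omitted for brevity.

#include <nnfw.h>
#include <nnfw_experimental.h>

int main(void)
{
  nnfw_session *session = NULL;
  nnfw_create_session(&session);
  nnfw_load_model_from_file(session, "model.circle"); /* hypothetical path */
  nnfw_prepare(session);

  /* Parameters required before nnfw_run_with_auto_compilation */
  nnfw_set_quantization_type(session, NNFW_QUANTIZE_TYPE_U8_ASYM); /* assumed value */
  nnfw_set_quantized_model_path(session, "model.q8.circle");
  nnfw_set_odc_param_minmax_records_count(session, 10);
  nnfw_set_codegen_model_path(session, "model_codegen.bin");

  /* I/O buffers, exactly as for a plain nnfw_run() */
  static float input[1 * 224 * 224 * 3];
  static float output[1000];
  nnfw_set_input(session, 0, NNFW_TYPE_TENSOR_FLOAT32, input, sizeof(input));
  nnfw_set_output(session, 0, NNFW_TYPE_TENSOR_FLOAT32, output, sizeof(output));

  /* The target string must end with "-gen"; see the parameter check in
     nnfw_api_internal.cc below */
  nnfw_run_with_auto_compilation(session, "npu-gen", NNFW_CODEGEN_PREF_DEFAULT);

  nnfw_close_session(session);
  return 0;
}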
19 changes: 19 additions & 0 deletions runtime/onert/api/nnfw/src/nnfw_api.cc
@@ -508,6 +508,25 @@ NNFW_STATUS nnfw_codegen(nnfw_session *session, const char *target, NNFW_CODEGEN
return session->codegen(target, pref);
}

NNFW_STATUS nnfw_set_odc_param_minmax_records_count(nnfw_session *session, int minmax_records_count)
{
NNFW_RETURN_ERROR_IF_NULL(session);
return session->set_odc_param_minmax_records_count(minmax_records_count);
}

NNFW_STATUS nnfw_odc_delete_minmax_file(nnfw_session *session)
{
NNFW_RETURN_ERROR_IF_NULL(session);
return session->delete_odc_minmax_file();
}

NNFW_STATUS nnfw_run_with_auto_compilation(nnfw_session *session, const char *target,
NNFW_CODEGEN_PREF pref)
{
NNFW_RETURN_ERROR_IF_NULL(session);
return session->run_with_auto_compilation(target, pref);
}

// Configuration

NNFW_STATUS nnfw_set_prepare_config(nnfw_session *session, const NNFW_PREPARE_CONFIG key,
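All three wrappers follow the existing pattern in nnfw_api.cc: null-check the session, then delegate to the corresponding nnfw_session member function. A plausible expansion of the NNFW_RETURN_ERROR_IF_NULL guard, shown here as an assumption for illustration (the real macro is defined elsewhere in the runtime), would be:

/* Return early with the dedicated status code when the session pointer is null */
#define NNFW_RETURN_ERROR_IF_NULL(p)      \
  do                                      \
  {                                       \
    if ((p) == NULL)                      \
      return NNFW_STATUS_UNEXPECTED_NULL; \
  } while (0)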
297 changes: 297 additions & 0 deletions runtime/onert/api/nnfw/src/nnfw_api_internal.cc
@@ -2028,3 +2028,300 @@ NNFW_STATUS nnfw_session::reset_execute_config()

return NNFW_STATUS_NO_ERROR;
}

NNFW_STATUS nnfw_session::set_odc_param_minmax_records_count(int minmax_records_count)
{
if (isStateInitialized() || isStateRunning())
{
std::cerr << "invalid state" << std::endl;
return NNFW_STATUS_INVALID_STATE;
}

if (_quant_manager->setMinMaxRecordsThreshold(minmax_records_count))
return NNFW_STATUS_NO_ERROR;
else
return NNFW_STATUS_ERROR;
}

NNFW_STATUS nnfw_session::delete_odc_minmax_file()
{
if (isStateRunning())
{
std::cerr << "invalid state" << std::endl;
return NNFW_STATUS_INVALID_STATE;
}

if (_quant_manager->deleteMinMaxFile())
return NNFW_STATUS_NO_ERROR;
else
return NNFW_STATUS_ERROR;
}

// run with auto compilation
NNFW_STATUS nnfw_session::run_with_auto_compilation(const char *target, NNFW_CODEGEN_PREF pref)
{

if (!isStatePreparedOrFinishedRun())
{
std::cerr << "Error during nnfw_session::run_with_auto_compilation : "
<< "run should be after preparation" << std::endl;
return NNFW_STATUS_INVALID_STATE;
}

// Check quantization and code-generation parameters
// (target must be a non-null string ending with "-gen")
std::string target_str = target != nullptr ? target : "";
if (_quant_manager->exportModelPath().empty() || _codegen_manager->exportModelPath().empty() ||
target_str.size() < 4 || target_str.substr(target_str.size() - 4) != "-gen")
{
std::cerr << "Error during nnfw_session::run_with_auto_compilation : "
<< "quantization and code generation parameters should be set" << std::endl;
return NNFW_STATUS_INVALID_STATE;
}

// ODC: auto compilation with a hidden switching mechanism
// Check whether the model is already quantized or compiled
std::ifstream file_quantized_model(_quant_manager->exportModelPath());
std::ifstream file_compiled_model(_codegen_manager->exportModelPath());

if (!file_quantized_model.good() && !file_compiled_model.good())
{
// Run float model and try to quantize it
{
// Save execution options
auto saved_options = _execution->executionOptions();
// turn on minmax recording
_execution->executionOptions().dump_minmax = true;

try
{
_execution->execute();
}
catch (const onert::InsufficientBufferSizeException &e)
{
// Currently insufficient buffer always means output buffer.
std::cerr << "Error during nnfw_session::run_with_auto_compilation : " << e.what()
<< std::endl;
return NNFW_STATUS_INSUFFICIENT_OUTPUT_SIZE;
}
catch (const std::exception &e)
{
std::cerr << "Error during nnfw_session::run_with_auto_compilation : " << e.what()
<< std::endl;
return NNFW_STATUS_ERROR;
}

_state = State::FINISHED_RUN;

// restore min_max option to user defined state
_execution->executionOptions().dump_minmax = saved_options.dump_minmax;

// if enough statistics are collected, then run the quantization
if (_quant_manager->readyForQuantize())
{
try
{
if (isStateInitialized() || isStateRunning())
{
std::cerr << "invalid state" << std::endl;
return NNFW_STATUS_INVALID_STATE;
}

auto result = _quant_manager->quantize(_model_path);
if (!result)
return NNFW_STATUS_INVALID_STATE;

// remove minmax file
result = _quant_manager->deleteMinMaxFile();
if (!result)
return NNFW_STATUS_INVALID_STATE;
}
catch (const std::exception &e)
{
std::cerr
<< "Error during nnfw_session::run_with_auto_compilation in quantize operation: "
<< e.what() << std::endl;
return NNFW_STATUS_ERROR;
}
}
}
}
else
{
// run compiled or quantized model
NNFW_STATUS status;

// turn off minmax recording
_execution->executionOptions().dump_minmax = false;

// save initial buffers if quantized model or compiled model is not loaded
if (_autoCompilationState == nnfw_session::AutoCompilationState::INITIAL_STATE)
{
auto dotidx = _codegen_manager->exportModelPath().rfind('.');
if (dotidx == std::string::npos)
{
std::cerr << "Error during nnfw_session::run_with_auto_compilation : Invalid compiled "
"model path. Please use a "
"path that includes the extension."
<< std::endl;
return NNFW_STATUS_ERROR;
}

std::string compiled_model_type =
_codegen_manager->exportModelPath().substr(dotidx + 1); // + 1 to exclude dot

dotidx = _quant_manager->exportModelPath().rfind('.');
if (dotidx == std::string::npos)
{
std::cerr << "Error during nnfw_session::run_with_auto_compilation : Invalid quantized "
"model path. Please use a "
"path that includes the extension."
<< std::endl;
return NNFW_STATUS_ERROR;
}
std::string quantized_model_type =
_quant_manager->exportModelPath().substr(dotidx + 1); // + 1 to exclude dot

// Save initial (float) input and output buffers
auto input_size = _compiler_artifact->_executors->inputSize();
auto output_size = _compiler_artifact->_executors->outputSize();

std::vector<const void *> _input_buffers;
std::vector<void *> _output_buffers;

// Save Inputs buffers
for (size_t input_index = 0; input_index < input_size; input_index++)
{
auto io_input_index = onert::ir::IOIndex(input_index);
auto input_buffer = _execution->getInputBuffer(io_input_index);

_input_buffers.push_back(input_buffer);
}

// Save Outputs buffers
for (size_t output_index = 0; output_index < output_size; output_index++)
{
auto io_output_index = onert::ir::IOIndex(output_index);
auto output_buffer = _execution->getOutputBuffer(io_output_index);

_output_buffers.push_back(output_buffer);
}

// Save execution options
auto saved_options = _execution->executionOptions();

// if there is compiled model - try to load it
if (file_compiled_model.good())
{
// load compiled model
status = loadModelFile(_codegen_manager->exportModelPath(), compiled_model_type);
if (status == NNFW_STATUS_NO_ERROR)
{
_autoCompilationState = nnfw_session::AutoCompilationState::COMPILED_MODEL_LOADED;
}
}
else // there is no compiled model - try to compile and load it
{
// To avoid code duplication, reuse the existing codegen() function.
// Set up _model_path for it.
// TODO: change this if the codegen function is generalized
_model_path = _quant_manager->exportModelPath();

// try to compile and load compiled model
status = codegen(target, pref);
if (status == NNFW_STATUS_NO_ERROR)
{
_autoCompilationState = nnfw_session::AutoCompilationState::COMPILED_MODEL_LOADED;
// TODO delete quantized model
}
}

// if loading the compiled model failed, try to load the quantized model
if (_autoCompilationState != nnfw_session::AutoCompilationState::COMPILED_MODEL_LOADED)
{
// load quantized model
status = loadModelFile(_quant_manager->exportModelPath(), quantized_model_type);
if (status != NNFW_STATUS_NO_ERROR)
return status;
else
_autoCompilationState = nnfw_session::AutoCompilationState::QUANTIZED_MODEL_LOADED;
}

status = prepare();
if (status != NNFW_STATUS_NO_ERROR)
return status;

// Restore execution options
_execution->executionOptions() = saved_options;

// Restore inputs to the quantized or compiled model
for (uint32_t input_index = 0; input_index < _input_buffers.size(); input_index++)
{
nnfw_tensorinfo ti;
status = input_tensorinfo(input_index, &ti);
if (status != NNFW_STATUS_NO_ERROR)
return status;

ti.dtype = NNFW_TYPE_TENSOR_FLOAT32;
auto input_size_in_bytes = getBufSize(&ti);

status = set_input(input_index, ti.dtype, _input_buffers[input_index], input_size_in_bytes);

if (status != NNFW_STATUS_NO_ERROR)
return status;
}

// Restore outputs to the quantized or compiled model
for (uint32_t output_index = 0; output_index < _output_buffers.size(); output_index++)
{

nnfw_tensorinfo ti;
status = output_tensorinfo(output_index, &ti);
if (status != NNFW_STATUS_NO_ERROR)
return status;

ti.dtype = NNFW_TYPE_TENSOR_FLOAT32;

uint64_t output_size_in_bytes = getBufSize(&ti);

status =
set_output(output_index, ti.dtype, _output_buffers[output_index], output_size_in_bytes);
if (status != NNFW_STATUS_NO_ERROR)
return status;
}
}

// Run the quantized or compiled model
if (!isStatePreparedOrFinishedRun())
{
std::cerr << "Error during nnfw_session::run_with_auto_compilation : "
<< "run should be run after prepare" << std::endl;
return NNFW_STATUS_INVALID_STATE;
}

try
{
_execution->execute();
}
catch (const onert::InsufficientBufferSizeException &e)
{
// Currently insufficient buffer always means output buffer.
std::cerr << "Error during nnfw_session::run_with_auto_compilation : " << e.what()
<< std::endl;
return NNFW_STATUS_INSUFFICIENT_OUTPUT_SIZE;
}
catch (const std::exception &e)
{
std::cerr << "Error during nnfw_session::run_with_auto_compilation : " << e.what()
<< std::endl;
return NNFW_STATUS_ERROR;
}

_state = State::FINISHED_RUN;
}

return NNFW_STATUS_NO_ERROR;
}
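To make the hidden switching concrete, here is a sketch of the calibration loop this implementation implies, continuing the setup example shown after the header diff (the loop bound and the fill_next_calibration_sample helper are hypothetical). The first calls run the float model with MinMax dumping enabled; once the configured records count is reached, the model is quantized, then compiled and reloaded, and subsequent calls transparently run the compiled (or quantized) model against the original float buffers.

/* Same API call every iteration; the session decides internally
   which model actually runs. */
for (int i = 0; i < 20; ++i)
{
  fill_next_calibration_sample(input); /* hypothetical helper */
  NNFW_STATUS status =
    nnfw_run_with_auto_compilation(session, "npu-gen", NNFW_CODEGEN_PREF_DEFAULT);
  if (status != NNFW_STATUS_NO_ERROR)
    break; /* e.g. NNFW_STATUS_INVALID_STATE when a required parameter is missing */
}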
