feat(yolov8_obb): Implement obb task, add yolov8_obb sample.

Change-Id: I1104b1ee00a69786f45c1f04ba4117d34f836dcf
sophgo · Nov 7, 2024 · 0aa5339 · 0aa5339
1 parent 1bf4446
commit 0aa5339
Show file tree

Hide file tree

Showing 16 changed files with 722 additions and 4 deletions.
diff --git a/element/algorithm/yolov8/README.md b/element/algorithm/yolov8/README.md
@@ -48,7 +48,7 @@ sophon-stream yolov8插件具有一些可配置的参数，可以根据需求进
 |  model_path  |   字符串   | "../data/models/BM1684X/yolov8s_int8_1b.bmodel" | yolov8模型路径 |
 |  threshold_conf   |   浮点数或map   | 0.5 | 目标检测物体置信度阈值，设置为浮点数时，所有类别共用同一个阈值；设置为map时，不同类别可以使用不同阈值，此时还需要正确设置class_names_file |
 |  threshold_nms  |   浮点数   | 0.5 | 目标检测NMS IOU阈值 |
-|  task_type   | 字符串 | "Detect" | yolov8算法类型，支持了 "Detect", "Cls", "Pose" 和 "Seg" |
+|  task_type   | 字符串 | "Detect" | yolov8算法类型，支持了 "Detect", "Cls", "Pose", "Seg" 和 "Obb" |
 |  bgr2rgb  |   bool   | true | 解码器解出来的图像默认是bgr格式，是否需要将图像转换成rgb格式 |
 |  mean  |   浮点数组   | 无 | 图像前处理均值，长度为3；计算方式为: y=(x-mean)/std；若bgr2rgb=true，数组中数组顺序需为r、g、b，否则需为b、g、r |
 |  std  |   浮点数组   | 无 | 图像前处理方差，长度为3；计算方式同上；若bgr2rgb=true数组中数组顺序需为r、g、b，否则需为b、g、r |

diff --git a/element/algorithm/yolov8/README_EN.md b/element/algorithm/yolov8/README_EN.md
@@ -49,7 +49,7 @@ The Sophon-Stream YOLOv8 plugin has several configurable parameters that can be
 |  threshold_conf   |   float/map   | 0.5 | Object detection confidence threshold. When set as a float number, all categories share the same threshold. When set as a map, different categories can have different thresholds. In second case, it's necessary to correctly set the class_names_file. |
 |  threshold_nms  |   float   | 0.5 | NMS Threshold |
 |  bgr2rgb  |   bool   | true | The images decoded by the decoder are in the default BGR format. whether a need to convert the images to the RGB format |
-|  task_type   | string | "Detect" | yolov8 alg type, supports "Detect", "Cls", "Pose" and "Seg" |
+|  task_type   | string | "Detect" | yolov8 alg type, supports "Detect", "Cls", "Pose", "Seg" and "Obb" |
 |  mean  |   float[]   | \ | The image preprocessing requires mean values in an array of length 3. The formula used for calculation is `y=(x-mean)/std` . When bgr2rgb is set to true, the array should be in RGB order; otherwise, it should be in BGR order. |
 |  std  |   float[]   | \ | The image preprocessing involves variance values in an array of length 3. The calculation method remains the same. When bgr2rgb is set to true, the array should be in RGB order; otherwise, it should be in BGR order. |
 |  stage    |   queue   | ["pre"]  | The three stages include preprocessing, inference, and postprocessing. |

diff --git a/element/algorithm/yolov8/include/yolov8_context.h b/element/algorithm/yolov8/include/yolov8_context.h
@@ -18,7 +18,7 @@ namespace yolov8 {
 
 #define USE_ASPECT_RATIO 1
 
-enum class TaskType { Detect = 0, Pose, Cls, Seg };
+// Algorithm variants selectable for the yolov8 element. The enumerator order
+// fixes the underlying values (Detect = 0 ... Obb = 4).
+enum class TaskType {
+  Detect = 0,
+  Pose,
+  Cls,
+  Seg,
+  Obb,  // oriented bounding boxes
+};
 
 class Yolov8Context : public ::sophon_stream::element::Context {
  public:

diff --git a/element/algorithm/yolov8/include/yolov8_post_process.h b/element/algorithm/yolov8/include/yolov8_post_process.h
@@ -44,6 +44,12 @@ struct Paras {
 
 using YoloV8BoxVec = std::vector<YoloV8Box>;
 
+// One oriented-bounding-box candidate in centre/size/angle form.
+struct obbBox {
+  float x;       // box centre, x
+  float y;       // box centre, y
+  float w;       // box width
+  float h;       // box height
+  float angle;   // rotation, used directly as the argument of std::cos/std::sin
+  float score;   // class confidence of the candidate
+  int class_id;  // index of the class the score belongs to
+};
+
+// Convenience alias for a list of candidates.
+using obbBoxVec = std::vector<obbBox>;
+
 class Yolov8PostProcess : public ::sophon_stream::element::PostProcess {
  public:
   void init(std::shared_ptr<Yolov8Context> context);
@@ -71,6 +77,8 @@ class Yolov8PostProcess : public ::sophon_stream::element::PostProcess {
                       common::ObjectMetadatas& objectMetadatas);
   void postProcessSeg(std::shared_ptr<Yolov8Context> context,
                       common::ObjectMetadatas& objectMetadatas);
+  // Post-processing for TaskType::Obb: decodes oriented boxes for every frame
+  // in objectMetadatas and appends them to each frame's mObbObjectMetadatas.
+  void postProcessObb(std::shared_ptr<Yolov8Context> context,
+                      common::ObjectMetadatas& objectMetadatas);
   void clip_boxes(YoloV8BoxVec& yolobox_vec, int src_w, int src_h);
 
   // yolov8 seg
@@ -81,6 +89,13 @@ class Yolov8PostProcess : public ::sophon_stream::element::PostProcess {
                    YoloV8BoxVec& yolov8box_input, int start,
                    const bm_tensor_t& segmentation_tensor, Paras& paras,
                    YoloV8BoxVec& yolov8box_output, float confThreshold);
+
+  //obb utils.
+  // Greedy NMS on rotated boxes: sorts dets by ascending score and removes, in
+  // place, every box whose probiou with a higher-scoring box is >=
+  // nmsConfidence (i.e. the value acts as an overlap threshold).
+  void nms_rotated(obbBoxVec& dets, float nmsConfidence = 0.5);
+  // Returns the three terms (a, b, c) computed from the box's w, h and angle
+  // that probiou consumes.
+  std::tuple<float, float, float> convariance_matrix(const obbBox& obb);
+  // Probabilistic IoU of two rotated boxes; eps guards divisions, log and sqrt.
+  float probiou(const obbBox& obb1, const obbBox& obb2, float eps = 1e-7);
+  // Swaps w/h (adding pi/2 to the angle) where h > w, then folds every angle
+  // into a non-negative value via fmod(angle, pi).
+  void regularize_rbox(obbBoxVec& obb);
+  // Converts centre/size/angle form to four rounded corner points, carrying
+  // over score and class_id.
+  common::ObbObjectMetadata xywhr2xyxyxyxy(const obbBox& obb);
 };
 
 }  // namespace yolov8

diff --git a/element/algorithm/yolov8/src/yolov8.cc b/element/algorithm/yolov8/src/yolov8.cc
@@ -32,7 +32,8 @@ const std::string Yolov8::elementName = "yolov8";
 std::unordered_map<std::string, TaskType> taskMap{{"Detect", TaskType::Detect},
                                                   {"Pose", TaskType::Pose},
                                                   {"Cls", TaskType::Cls},
-                                                  {"Seg", TaskType::Seg}};
+                                                  {"Seg", TaskType::Seg},
+                                                  {"Obb", TaskType::Obb}};
 
 common::ErrorCode Yolov8::initContext(const std::string& json) {
   common::ErrorCode errorCode = common::ErrorCode::SUCCESS;
@@ -210,6 +211,17 @@ common::ErrorCode Yolov8::initContext(const std::string& json) {
         mContext->class_num =
             mContext->bmNetwork->outputTensor(0)->get_shape()->dims[1] -
             mContext->mask_len - 4;
+      else if (mContext->taskType == TaskType::Obb){
+        int ndim1 = mContext->bmNetwork->outputTensor(0)->get_shape()->dims[1];
+        int ndim2 = mContext->bmNetwork->outputTensor(0)->get_shape()->dims[2];
+        if (ndim1 < ndim2){
+          IVS_CRITICAL(
+            "We only support bmodel's output_shape like [N, box_num, nout], usually box_num > nout. "
+            "But your bmodel's shape is [{0:d}, {1:d}, {2:d}].", mContext->max_batch, ndim1, ndim2);
+          abort();
+        }
+        mContext->class_num = mContext->bmNetwork->outputTensor(0)->get_shape()->dims[2] - 5;
+      }
     }
 
     if (mContext->class_thresh_valid) {

diff --git a/element/algorithm/yolov8/src/yolov8_post_process.cc b/element/algorithm/yolov8/src/yolov8_post_process.cc
@@ -85,6 +85,8 @@ void Yolov8PostProcess::postProcess(std::shared_ptr<Yolov8Context> context,
     postProcessCls(context, objectMetadatas);
   else if (context->taskType == TaskType::Seg)
     postProcessSeg(context, objectMetadatas);
+  else if (context->taskType == TaskType::Obb)
+    postProcessObb(context, objectMetadatas);
 }
 
 void Yolov8PostProcess::clip_boxes(YoloV8BoxVec& yolobox_vec, int src_w,
@@ -1039,6 +1041,241 @@ void Yolov8PostProcess::get_mask(std::shared_ptr<Yolov8Context> context,
   mask_out = mask(bound) > context->thresh_nms;
 }
 
+/**
+ * @brief Post-process the network output of the Obb (oriented bounding box)
+ *        task.
+ *
+ * For each frame in objectMetadatas, up to the first end-of-stream frame:
+ *   1. wrap the frame's output tensors and pick the one used for decoding;
+ *   2. keep every (box, class) pair whose class confidence is above its
+ *      threshold;
+ *   3. run rotated NMS and cap the result at max_det boxes;
+ *   4. normalise the boxes, map them from network-input coordinates back to
+ *      source-frame (or ROI) coordinates, convert them to four corner points
+ *      and append them to the frame's mObbObjectMetadatas.
+ *
+ * @param context          Algorithm context (network info, thresholds, ROI).
+ * @param objectMetadatas  Frames to process; detections are appended in place.
+ */
+void Yolov8PostProcess::postProcessObb(
+    std::shared_ptr<Yolov8Context> context,
+    common::ObjectMetadatas& objectMetadatas) {
+  const int max_wh = 7680;      // (pixels) maximum box width and height
+  const bool agnostic = false;  // false: boxes only compete within a class
+
+  for (auto obj : objectMetadatas) {
+    if (obj->mFrame->mEndOfStream) break;
+
+    // Candidates of THIS frame only. The vector must not outlive one loop
+    // iteration: if it were shared across frames, every later frame of a batch
+    // would run NMS on, and re-emit, the already rescaled boxes of the frames
+    // before it.
+    obbBoxVec yolobox_vec;
+
+    std::vector<std::shared_ptr<BMNNTensor>> outputTensors(context->output_num);
+    for (int i = 0; i < context->output_num; i++) {
+      outputTensors[i] = std::make_shared<BMNNTensor>(
+          obj->mOutputBMtensors->handle,
+          context->bmNetwork->m_netinfo->output_names[i],
+          context->bmNetwork->m_netinfo->output_scales[i],
+          obj->mOutputBMtensors->tensors[i].get(), context->bmNetwork->is_soc);
+    }
+
+    // Letterbox parameters: scale ratio plus the left/top padding that was
+    // added when the frame (or ROI) was resized to the network input.
+    int frame_width = obj->mFrame->mSpData->width;
+    int frame_height = obj->mFrame->mSpData->height;
+    int tx1 = 0, ty1 = 0;
+    float ratio = 1.0;
+#ifdef USE_ASPECT_RATIO
+    bool isAlignWidth = false;
+    ratio =
+        context->roi_predefined
+            ? get_aspect_scaled_ratio(context->roi.crop_w, context->roi.crop_h,
+                                      context->net_w, context->net_h,
+                                      &isAlignWidth)
+            : get_aspect_scaled_ratio(frame_width, frame_height, context->net_w,
+                                      context->net_h, &isAlignWidth);
+    if (isAlignWidth) {
+      ty1 = (int)((context->net_h -
+                   (int)((context->roi_predefined ? context->roi.crop_h
+                                                  : frame_height) *
+                         ratio)) /
+                  2);
+    } else {
+      tx1 = (int)((context->net_w -
+                   (int)((context->roi_predefined ? context->roi.crop_w
+                                                  : frame_width) *
+                         ratio)) /
+                  2);
+    }
+#endif
+
+    // Select the output tensor with the fewest dimensions.
+    // NOTE(review): context->min_dim persists between calls, so after the
+    // first frame the comparison below can no longer succeed and min_idx stays
+    // 0. That is only correct when the 3-dim output is output 0 - confirm.
+    int min_idx = 0;
+    int box_num = 0;
+    for (int i = 0; i < context->output_num; ++i) {
+      auto output_shape = context->bmNetwork->outputTensor(i)->get_shape();
+      auto output_dims = output_shape->num_dims;
+      assert(output_dims == 3 || output_dims == 5);
+      if (output_dims == 5) {
+        box_num += output_shape->dims[1] * output_shape->dims[2] *
+                   output_shape->dims[3];
+      }
+
+      if (context->min_dim > output_dims) {
+        min_idx = i;
+        context->min_dim = output_dims;
+      }
+    }
+
+    auto out_tensor = outputTensors[min_idx];
+    // Row layout of the last dim:
+    // [x, y, w, h, cls_conf0, ..., cls_confK-1, rotate_angle]
+    const int nout = out_tensor->get_shape()->dims[2];
+    const int m_class_num = nout - 5;
+
+    if (context->min_dim == 3 && context->output_num != 1) {
+      std::cout << "--> WARNING: the current bmodel has redundant outputs"
+                << std::endl;
+      std::cout << "             you can remove the redundant outputs to "
+                   "improve performance"
+                << std::endl;
+      std::cout << std::endl;
+    }
+
+    assert(box_num == 0 || box_num == out_tensor->get_shape()->dims[1]);
+    box_num = out_tensor->get_shape()->dims[1];
+    // Original note (translated): no change needed when there is only one
+    // image.
+    const float* output_data = (float*)out_tensor->get_cpu_data();
+
+    // Resolve the per-class thresholds once per frame instead of doing a
+    // name-keyed lookup for every (box, class) pair.
+    std::vector<float> class_thresh(m_class_num > 0 ? m_class_num : 0,
+                                    context->thresh_conf_min);
+    if (context->class_thresh_valid) {
+      for (int j = 0; j < m_class_num; j++) {
+        class_thresh[j] = context->thresh_conf[context->class_names[j]];
+      }
+    }
+
+    // Candidates (multilabel: one box may yield several class candidates).
+    for (int i = 0; i < box_num; i++) {
+      const float* row = output_data + i * nout;
+      for (int j = 0; j < m_class_num; j++) {
+        const float cur_conf = row[4 + j];
+        if (cur_conf > class_thresh[j]) {
+          obbBox box;
+          box.score = cur_conf;
+          box.class_id = j;
+          // Shift each class into its own coordinate band so that boxes of
+          // different classes never overlap during NMS.
+          const int c = agnostic ? 0 : box.class_id * max_wh;
+          box.x = row[0] + c;
+          box.y = row[1] + c;
+          box.w = row[2];
+          box.h = row[3];
+          box.angle = row[nout - 1];
+          yolobox_vec.push_back(box);
+        }
+      }
+    }
+
+    nms_rotated(yolobox_vec, context->thresh_nms);
+    // nms_rotated leaves the boxes sorted by ascending score, so trimming from
+    // the front drops the lowest-scoring ones.
+    if (yolobox_vec.size() > max_det) {
+      yolobox_vec.erase(yolobox_vec.begin(),
+                        yolobox_vec.begin() + (yolobox_vec.size() - max_det));
+    }
+
+    // Undo the per-class shift applied before NMS.
+    if (!agnostic) {
+      for (auto& box : yolobox_vec) {
+        const int c = box.class_id * max_wh;
+        box.x = box.x - c;
+        box.y = box.y - c;
+      }
+    }
+
+    regularize_rbox(yolobox_vec);
+
+    // Network-input coordinates -> source coordinates: remove the letterbox
+    // padding, then undo the scale.
+    float inv_ratio = 1.0 / ratio;
+    for (auto& box : yolobox_vec) {
+      box.x = std::round((box.x - tx1) * inv_ratio);
+      box.y = std::round((box.y - ty1) * inv_ratio);
+      box.w = std::round(box.w * inv_ratio);
+      box.h = std::round(box.h * inv_ratio);
+    }
+
+    for (const auto& bbox : yolobox_vec) {
+      std::shared_ptr<common::ObbObjectMetadata> obbData =
+          std::make_shared<common::ObbObjectMetadata>(xywhr2xyxyxyxy(bbox));
+
+      if (context->roi_predefined) {
+        obbData->add_offset(context->roi.start_x, context->roi.start_y);
+      }
+      obj->mObbObjectMetadatas.push_back(obbData);
+    }
+  }
+}
+
+
+// Normalises every box in place: the longer side becomes w (the angle is
+// advanced by pi/2 when the sides are swapped) and the angle is then reduced
+// with fmod(angle, pi), adding pi to any negative remainder.
+void Yolov8PostProcess::regularize_rbox(obbBoxVec& obbVec){
+  for (auto it = obbVec.begin(); it != obbVec.end(); ++it) {
+    float theta = it->angle;
+    if (it->h > it->w) {
+      std::swap(it->w, it->h);
+      theta = theta + M_PI / 2;
+    }
+    theta = std::fmod(theta, M_PI);
+    if (theta < 0) {
+      theta += M_PI;
+    }
+    it->angle = theta;
+  }
+}
+
+// Computes the three terms probiou needs for one box.
+// With p = w*w/12, q = h*h/12 and r = angle, the result is
+//   (p*cos^2 r + q*sin^2 r,  p*sin^2 r + q*cos^2 r,  (p - q)*cos r*sin r).
+std::tuple<float, float, float> Yolov8PostProcess::convariance_matrix(const obbBox& obb){
+  const float var_w = obb.w * obb.w / 12.0;
+  const float var_h = obb.h * obb.h / 12.0;
+  const float c = std::cos(obb.angle);
+  const float s = std::sin(obb.angle);
+
+  const float term_a = var_w * c * c + var_h * s * s;
+  const float term_b = var_w * s * s + var_h * c * c;
+  const float term_c = (var_w - var_h) * c * s;
+  return std::tuple<float, float, float>(term_a, term_b, term_c);
+}
+
+// Probabilistic IoU between two oriented bounding boxes, following
+// https://arxiv.org/pdf/2106.06072v1.pdf.
+//
+// obb1/obb2: boxes in centre/size/angle form.
+// eps:       small constant added to denominators and to the log/sqrt
+//            arguments.
+// Returns 1 - hd, where hd = sqrt(1 - exp(-bd) + eps) and bd is the weighted
+// sum of the three terms below, clamped to [eps, 100].
+float Yolov8PostProcess::probiou(const obbBox& obb1, const obbBox& obb2, float eps){
+  float a1, b1, c1;
+  float a2, b2, c2;
+  std::tie(a1, b1, c1) = convariance_matrix(obb1);
+  std::tie(a2, b2, c2) = convariance_matrix(obb2);
+
+  const float sum_a = a1 + a2;
+  const float sum_b = b1 + b2;
+  const float sum_c = c1 + c2;
+  const float dx = obb1.x - obb2.x;
+  const float dy = obb1.y - obb2.y;
+
+  // std::pow(float, int) is evaluated in double, so the determinant and the
+  // quotients that use it are double until they are stored in a float.
+  const double det = sum_a * sum_b - std::pow(sum_c, 2);
+  const double det_eps = det + eps;
+
+  const float t1 = (sum_a * std::pow(dy, 2) + sum_b * std::pow(dx, 2)) / det_eps;
+  const float t2 = (sum_c * (obb2.x - obb1.x) * dy) / det_eps;
+
+  const float det1 = std::max(a1 * b1 - c1 * c1, 0.0f);
+  const float det2 = std::max(a2 * b2 - c2 * c2, 0.0f);
+  const float t3 = std::log(det / (4 * std::sqrt(det1) * std::sqrt(det2) + eps) + eps);
+
+  float bd = 0.25 * t1 + 0.5 * t2 + 0.5 * t3;
+  bd = std::max(std::min(bd, 100.0f), eps);
+  const float hd = std::sqrt(1.0 - std::exp(-bd) + eps);
+  return 1 - hd;
+}
+
+
+// Greedy non-maximum suppression for rotated boxes.
+//
+// dets is sorted by ascending score; then, walking from the highest-scoring
+// box downwards, every still-alive lower-scoring box whose probiou with the
+// current box is >= nmsConfidence is suppressed. On return dets holds only the
+// survivors, still in ascending score order.
+//
+// nmsConfidence: despite its name it is compared against probiou, i.e. it is
+//                an overlap threshold.
+//
+// Suppressed boxes are only marked during the scan and removed in a single
+// compaction pass afterwards, so a suppression no longer shifts the tail of
+// the vector.
+void Yolov8PostProcess::nms_rotated(obbBoxVec& dets, float nmsConfidence) {
+    std::sort(dets.begin(), dets.end(), [](const obbBox& a, const obbBox& b) { return a.score < b.score; });
+
+    const int count = static_cast<int>(dets.size());
+    if (count < 2) {
+        return;  // nothing can be suppressed
+    }
+
+    std::vector<char> alive(count, 1);
+    bool any_suppressed = false;
+    for (int ref = count - 1; ref > 0; --ref) {
+        if (!alive[ref]) {
+            continue;  // already suppressed by a higher-scoring box
+        }
+        for (int i = 0; i < ref; ++i) {
+            if (alive[i] && probiou(dets[ref], dets[i]) >= nmsConfidence) {
+                alive[i] = 0;
+                any_suppressed = true;
+            }
+        }
+    }
+    if (!any_suppressed) {
+        return;
+    }
+
+    obbBoxVec kept;
+    kept.reserve(count);
+    for (int i = 0; i < count; ++i) {
+        if (alive[i]) {
+            kept.push_back(dets[i]);
+        }
+    }
+    dets.swap(kept);
+}
+
+// Converts a box from centre/size/angle form to its four corner points.
+// Each coordinate is rounded with std::round; score and class_id are copied
+// through unchanged.
+common::ObbObjectMetadata Yolov8PostProcess::xywhr2xyxyxyxy(const obbBox& obb){
+  const float cos_value = std::cos(obb.angle);
+  const float sin_value = std::sin(obb.angle);
+
+  // Half-extent of the box along its width (wx, wy) and height (hx, hy)
+  // directions, expressed in image axes.
+  const float wx = (obb.w / 2) * cos_value;
+  const float wy = (obb.w / 2) * sin_value;
+  const float hx = (obb.h / 2) * sin_value;
+  const float hy = (obb.h / 2) * cos_value;
+
+  common::ObbObjectMetadata corners;
+  corners.class_id = obb.class_id;
+  corners.score = obb.score;
+
+  corners.x1 = std::round(obb.x + wx + hx);
+  corners.y1 = std::round(obb.y + wy - hy);
+
+  corners.x2 = std::round(obb.x + wx - hx);
+  corners.y2 = std::round(obb.y + wy + hy);
+
+  corners.x3 = std::round(obb.x - wx - hx);
+  corners.y3 = std::round(obb.y - wy + hy);
+
+  corners.x4 = std::round(obb.x - wx + hx);
+  corners.y4 = std::round(obb.y - wy - hy);
+  return corners;
+}
+
 }  // namespace yolov8
 }  // namespace element
 }  // namespace sophon_stream
diff --git a/framework/common/obb_object_metadata.h b/framework/common/obb_object_metadata.h
@@ -0,0 +1,38 @@
+//===----------------------------------------------------------------------===//
+//
+// Copyright (C) 2022 Sophgo Technologies Inc.  All rights reserved.
+//
+// SOPHON-STREAM is licensed under the 2-Clause BSD License except for the
+// third-party components.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SOPHON_STREAM_COMMON_OBB_OBJECT_METADATA_H_
+#define SOPHON_STREAM_COMMON_OBB_OBJECT_METADATA_H_
+
+#include <memory>
+#include <string>
+#include <vector>
+
+
+namespace sophon_stream {
+namespace common {
+// One oriented-bounding-box detection stored as four corner points plus its
+// score and class index.
+struct ObbObjectMetadata {
+    float x1, y1;  // corner 1
+    float x2, y2;  // corner 2
+    float x3, y3;  // corner 3
+    float x4, y4;  // corner 4
+    float score;
+    int class_id;
+
+    // Translates all four corners by (x, y).
+    void add_offset(int x, int y) {
+        x1 += x;
+        y1 += y;
+        x2 += x;
+        y2 += y;
+        x3 += x;
+        y3 += y;
+        x4 += x;
+        y4 += y;
+    }
+};
+
+}  // namespace common
+}  // namespace sophon_stream
+
+#endif  // SOPHON_STREAM_COMMON_OBB_OBJECT_METADATA_H_