src/operator/contrib/multi_proposal.cc

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

/*!
 * Copyright (c) 2017 Microsoft
 * Licensed under The Apache-2.0 License [see LICENSE for details]
 * \file multi_proposal.cc
 * \brief
 * \author Xizhou Zhu, Kan Wu
*/

#include "./multi_proposal-inl.h"

//============================
// Bounding Box Transform Utils
//============================
namespace mxnet {
namespace op {
namespace utils {

// bbox prediction and clip to the image borders
inline void BBoxTransformInv(const mshadow::Tensor<cpu, 2>& boxes,
                             const mshadow::Tensor<cpu, 3>& deltas,
                             const float im_height,
                             const float im_width,
                             const int real_height,
                             const int real_width,
                             mshadow::Tensor<cpu, 2> *out_pred_boxes) {
  CHECK_GE(boxes.size(1), 4);
  CHECK_GE(out_pred_boxes->size(1), 4);
  int anchors = deltas.size(0) / 4;
  int heights = deltas.size(1);
  int widths = deltas.size(2);

  #pragma omp parallel for num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount())
  for (int index = 0; index < anchors * heights * widths; ++index) {
    // index_t index = h * (widths * anchors) + w * (anchors) + a;
    int a = index % anchors;
    int w = (index / anchors) % widths;
    int h = index / (widths * anchors);

    float width = boxes[index][2] - boxes[index][0] + 1.0;
    float height = boxes[index][3] - boxes[index][1] + 1.0;
    float ctr_x = boxes[index][0] + 0.5 * (width - 1.0);
    float ctr_y = boxes[index][1] + 0.5 * (height - 1.0);

    float dx = deltas[a*4 + 0][h][w];
    float dy = deltas[a*4 + 1][h][w];
    float dw = deltas[a*4 + 2][h][w];
    float dh = deltas[a*4 + 3][h][w];

    float pred_ctr_x = dx * width + ctr_x;
    float pred_ctr_y = dy * height + ctr_y;
    float pred_w = std::exp(dw) * width;
    float pred_h = std::exp(dh) * height;

    float pred_x1 = pred_ctr_x - 0.5 * (pred_w - 1.0);
    float pred_y1 = pred_ctr_y - 0.5 * (pred_h - 1.0);
    float pred_x2 = pred_ctr_x + 0.5 * (pred_w - 1.0);
    float pred_y2 = pred_ctr_y + 0.5 * (pred_h - 1.0);

    pred_x1 = std::max(std::min(pred_x1, im_width - 1.0f), 0.0f);
    pred_y1 = std::max(std::min(pred_y1, im_height - 1.0f), 0.0f);
    pred_x2 = std::max(std::min(pred_x2, im_width - 1.0f), 0.0f);
    pred_y2 = std::max(std::min(pred_y2, im_height - 1.0f), 0.0f);

    (*out_pred_boxes)[index][0] = pred_x1;
    (*out_pred_boxes)[index][1] = pred_y1;
    (*out_pred_boxes)[index][2] = pred_x2;
    (*out_pred_boxes)[index][3] = pred_y2;

    if (h >= real_height || w >= real_width) {
      (*out_pred_boxes)[index][4] = -1.0;
    }
  }
}

// iou prediction and clip to the image border
inline void IoUTransformInv(const mshadow::Tensor<cpu, 2>& boxes,
                            const mshadow::Tensor<cpu, 3>& deltas,
                            const float im_height,
                            const float im_width,
                            const int real_height,
                            const int real_width,
                            mshadow::Tensor<cpu, 2> *out_pred_boxes) {
  CHECK_GE(boxes.size(1), 4);
  CHECK_GE(out_pred_boxes->size(1), 4);
  int anchors = deltas.size(0) / 4;
  int heights = deltas.size(1);
  int widths = deltas.size(2);

  #pragma omp parallel for num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount())
  for (int index = 0; index < anchors * heights * widths; ++index) {
    // index_t index = h * (widths * anchors) + w * (anchors) + a;
    int a = index % anchors;
    int w = (index / anchors) % widths;
    int h = index / (widths * anchors);

    float x1 = boxes[index][0];
    float y1 = boxes[index][1];
    float x2 = boxes[index][2];
    float y2 = boxes[index][3];

    float dx1 = deltas[a * 4 + 0][h][w];
    float dy1 = deltas[a * 4 + 1][h][w];
    float dx2 = deltas[a * 4 + 2][h][w];
    float dy2 = deltas[a * 4 + 3][h][w];

    float pred_x1 = x1 + dx1;
    float pred_y1 = y1 + dy1;
    float pred_x2 = x2 + dx2;
    float pred_y2 = y2 + dy2;

    pred_x1 = std::max(std::min(pred_x1, im_width - 1.0f), 0.0f);
    pred_y1 = std::max(std::min(pred_y1, im_height - 1.0f), 0.0f);
    pred_x2 = std::max(std::min(pred_x2, im_width - 1.0f), 0.0f);
    pred_y2 = std::max(std::min(pred_y2, im_height - 1.0f), 0.0f);

    (*out_pred_boxes)[index][0] = pred_x1;
    (*out_pred_boxes)[index][1] = pred_y1;
    (*out_pred_boxes)[index][2] = pred_x2;
    (*out_pred_boxes)[index][3] = pred_y2;

    if (h >= real_height || w >= real_width) {
      (*out_pred_boxes)[index][4] = -1.0f;
    }
  }
}

// filter box by set confidence to zero
// * height or width < rpn_min_size
inline void FilterBox(mshadow::Tensor<cpu, 2> *dets,
                      const float min_size) {
  #pragma omp parallel for num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount())
  for (int i = 0; i < static_cast<int>(dets->size(0)); ++i) {
    float iw = (*dets)[i][2] - (*dets)[i][0] + 1.0f;
    float ih = (*dets)[i][3] - (*dets)[i][1] + 1.0f;
    if (iw < min_size || ih < min_size) {
      (*dets)[i][0] -= min_size / 2;
      (*dets)[i][1] -= min_size / 2;
      (*dets)[i][2] += min_size / 2;
      (*dets)[i][3] += min_size / 2;
      (*dets)[i][4] = -1.0f;
    }
  }
}

}  // namespace utils
}  // namespace op
}  // namespace mxnet

//=====================
// NMS Utils
//=====================
namespace mxnet {
namespace op {
namespace utils {

struct ReverseArgsortCompl {
  const float *val_;
  explicit ReverseArgsortCompl(float *val)
    : val_(val) {}
  bool operator() (float i, float j) {
    return (val_[static_cast<index_t>(i)] >
            val_[static_cast<index_t>(j)]);
  }
};

// copy score and init order
inline void CopyScore(const mshadow::Tensor<cpu, 2>& dets,
                      mshadow::Tensor<cpu, 1> *score,
                      mshadow::Tensor<cpu, 1> *order) {
  #pragma omp parallel for num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount())
  for (int i = 0; i < static_cast<int>(dets.size(0)); ++i) {
    (*score)[i] = dets[i][4];
    (*order)[i] = i;
  }
}

// sort order array according to score
inline void ReverseArgsort(const mshadow::Tensor<cpu, 1>& score,
                           mshadow::Tensor<cpu, 1> *order) {
  ReverseArgsortCompl cmpl(score.dptr_);
  std::stable_sort(order->dptr_, order->dptr_ + score.size(0), cmpl);
}

// reorder proposals according to order and keep the pre_nms_top_n proposals
// dets.size(0) == pre_nms_top_n
inline void ReorderProposals(const mshadow::Tensor<cpu, 2>& prev_dets,
                             const mshadow::Tensor<cpu, 1>& order,
                             const index_t pre_nms_top_n,
                             mshadow::Tensor<cpu, 2> *dets) {
  CHECK_EQ(dets->size(0), pre_nms_top_n);
  const int dets_size0 = static_cast<int>(dets->size(0));
  const int dets_size1 = static_cast<int>(dets->size(1));
  #pragma omp parallel for num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount())
  for (int k = 0; k < dets_size0 * dets_size1; ++k) {
    int i = k / dets_size1;
    int j = k % dets_size1;
    const index_t index = order[i];
    (*dets)[i][j] = prev_dets[index][j];
  }
}

// greedily keep the max detections (already sorted)
inline void NonMaximumSuppression(const mshadow::Tensor<cpu, 2>& dets,
                                  const float thresh,
                                  const index_t post_nms_top_n,
                                  mshadow::Tensor<cpu, 1> *area,
                                  mshadow::Tensor<cpu, 1> *suppressed,
                                  mshadow::Tensor<cpu, 1> *keep,
                                  int *out_size) {
  CHECK_EQ(dets.shape_[1], 5) << "dets: [x1, y1, x2, y2, score]";
  CHECK_GT(dets.shape_[0], 0);
  CHECK_EQ(dets.CheckContiguous(), true);
  CHECK_EQ(area->CheckContiguous(), true);
  CHECK_EQ(suppressed->CheckContiguous(), true);
  CHECK_EQ(keep->CheckContiguous(), true);
  // calculate area
  #pragma omp parallel for num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount())
  for (int i = 0; i < static_cast<int>(dets.size(0)); ++i) {
    (*area)[i] = (dets[i][2] - dets[i][0] + 1) *
                 (dets[i][3] - dets[i][1] + 1);
  }

  // calculate nms
  *out_size = 0;
  for (index_t i = 0; i < dets.size(0) && (*out_size) < static_cast<int>(post_nms_top_n); ++i) {
    float ix1 = dets[i][0];
    float iy1 = dets[i][1];
    float ix2 = dets[i][2];
    float iy2 = dets[i][3];
    float iarea = (*area)[i];

    if ((*suppressed)[i] > 0.0f) {
      continue;
    }

    (*keep)[(*out_size)++] = i;
    #pragma omp parallel for num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount())
    for (int j = i + 1; j < static_cast<int>(dets.size(0)); ++j) {
      if ((*suppressed)[j] > 0.0f) {
        continue;
      }
      float xx1 = std::max(ix1, dets[j][0]);
      float yy1 = std::max(iy1, dets[j][1]);
      float xx2 = std::min(ix2, dets[j][2]);
      float yy2 = std::min(iy2, dets[j][3]);
      float w = std::max(0.0f, xx2 - xx1 + 1.0f);
      float h = std::max(0.0f, yy2 - yy1 + 1.0f);
      float inter = w * h;
      float ovr = inter / (iarea + (*area)[j] - inter);
      if (ovr > thresh) {
        (*suppressed)[j] = 1.0f;
      }
    }
  }
}

}  // namespace utils
}  // namespace op
}  // namespace mxnet


namespace mxnet {
namespace op {

template<typename xpu>
class MultiProposalOp : public Operator{
 public:
  explicit MultiProposalOp(MultiProposalParam param) {
    this->param_ = param;
  }

  virtual void Forward(const OpContext &ctx,
                       const std::vector<TBlob> &in_data,
                       const std::vector<OpReqType> &req,
                       const std::vector<TBlob> &out_data,
                       const std::vector<TBlob> &aux_states) {
    using namespace mshadow;
    using namespace mshadow::expr;
    CHECK_EQ(in_data.size(), 3);
    CHECK_EQ(out_data.size(), 2);
    CHECK_GT(req.size(), 1);
    CHECK_EQ(req[proposal::kOut], kWriteTo);

    Stream<xpu> *s = ctx.get_stream<xpu>();

    Tensor<cpu, 4> scores = in_data[proposal::kClsProb].get<cpu, 4, real_t>(s);
    Tensor<cpu, 4> bbox_deltas = in_data[proposal::kBBoxPred].get<cpu, 4, real_t>(s);
    Tensor<cpu, 2> im_info = in_data[proposal::kImInfo].get<cpu, 2, real_t>(s);

    Tensor<cpu, 2> out = out_data[proposal::kOut].get<cpu, 2, real_t>(s);
    Tensor<cpu, 2> out_score = out_data[proposal::kScore].get<cpu, 2, real_t>(s);

    int num_images = scores.size(0);
    int num_anchors = scores.size(1) / 2;
    int height = scores.size(2);
    int width = scores.size(3);
    int count_anchors = num_anchors * height * width;
    int rpn_pre_nms_top_n =
        (param_.rpn_pre_nms_top_n > 0) ? param_.rpn_pre_nms_top_n : count_anchors;
    rpn_pre_nms_top_n = std::min(rpn_pre_nms_top_n, count_anchors);
    int rpn_post_nms_top_n = std::min(param_.rpn_post_nms_top_n, rpn_pre_nms_top_n);

    int workspace_size =
        num_images * (count_anchors * 5 + 2 * count_anchors +
        rpn_pre_nms_top_n * 5 + 3 * rpn_pre_nms_top_n);

    Tensor<cpu, 1> workspace = ctx.requested[proposal::kTempResource].get_space<cpu>(
      Shape1(workspace_size), s);
    int start = 0;
    Tensor<cpu, 3> workspace_proposals(workspace.dptr_ +
            start, Shape3(num_images, count_anchors, 5));
    start += num_images * count_anchors * 5;
    Tensor<cpu, 3> workspace_pre_nms(workspace.dptr_ + start, Shape3(num_images, 2, count_anchors));
    start += num_images * 2 * count_anchors;
    Tensor<cpu, 3> workspace_ordered_proposals(workspace.dptr_ + start,
                                               Shape3(num_images, rpn_pre_nms_top_n, 5));
    start += num_images * rpn_pre_nms_top_n * 5;
    Tensor<cpu, 3> workspace_nms(workspace.dptr_ + start, Shape3(num_images, 3, rpn_pre_nms_top_n));
    start += num_images * 3 * rpn_pre_nms_top_n;
    CHECK_EQ(workspace_size, start) << workspace_size << " " << start << std::endl;

    // Generate anchors
    std::vector<float> base_anchor(4);
    base_anchor[0] = 0.0;
    base_anchor[1] = 0.0;
    base_anchor[2] = param_.feature_stride - 1.0;
    base_anchor[3] = param_.feature_stride - 1.0;
    CHECK_EQ(num_anchors, param_.ratios.ndim() * param_.scales.ndim());
    std::vector<float> anchors;
    utils::GenerateAnchors(base_anchor,
                           param_.ratios,
                           param_.scales,
                           &anchors);
    std::memcpy(workspace_proposals.dptr_, &anchors[0], sizeof(float) * anchors.size());

    Tensor<cpu, 2> workspace_proposals0 = workspace_proposals[0];
    // Enumerate all shifted anchors
    #pragma omp parallel for num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount())
    for (int index = 0; index < num_anchors * height * width; ++index) {
      // index_t index = j * (width * num_anchors) + k * (num_anchors) + i;
      int i = index % num_anchors;
      int k = (index / num_anchors) % width;
      int j = index / (width * num_anchors);
      workspace_proposals0[index][0] =
          workspace_proposals0[i][0] + k * param_.feature_stride;
      workspace_proposals0[index][1] =
          workspace_proposals0[i][1] + j * param_.feature_stride;
      workspace_proposals0[index][2] =
          workspace_proposals0[i][2] + k * param_.feature_stride;
      workspace_proposals0[index][3] =
          workspace_proposals0[i][3] + j * param_.feature_stride;
      workspace_proposals0[index][4] = scores[0][i + num_anchors][j][k];
    }

    // Copy shifted anchors to other images
    #pragma omp parallel for num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount())
    for (int t = count_anchors; t < num_images * count_anchors; ++t) {
        int b = t / count_anchors;
        int index = t % count_anchors;
        int i = index % num_anchors;
        int k = (index / num_anchors) % width;
        int j = index / (width * num_anchors);
        for (int w = 0; w < 4; ++w) {
            workspace_proposals[b][index][w] = workspace_proposals[0][index][w];
        }
        workspace_proposals[b][index][4] = scores[b][i + num_anchors][j][k];
    }

    // Assign Foreground Scores for each anchor
    #pragma omp parallel for num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount())
    for (int b = 0; b < num_images; ++b) {
      // prevent padded predictions
      int real_height = static_cast<int>(im_info[b][0] / param_.feature_stride);
      int real_width = static_cast<int>(im_info[b][1] / param_.feature_stride);
      CHECK_GE(height, real_height) << height << " " << real_height << std::endl;
      CHECK_GE(width, real_width) << width << " " << real_width << std::endl;

      Tensor<cpu, 2> workspace_proposals_i = workspace_proposals[b];
      Tensor<cpu, 2> workspace_pre_nms_i = workspace_pre_nms[b];
      Tensor<cpu, 2> workspace_ordered_proposals_i =
                       workspace_ordered_proposals[b];
      Tensor<cpu, 2> workspace_nms_i = workspace_nms[b];

      if (param_.iou_loss) {
        utils::IoUTransformInv(workspace_proposals_i, bbox_deltas[b], im_info[b][0], im_info[b][1],
                               real_height, real_width, &(workspace_proposals_i));
      } else {
        utils::BBoxTransformInv(workspace_proposals_i, bbox_deltas[b], im_info[b][0], im_info[b][1],
                                real_height, real_width, &(workspace_proposals_i));
      }
      utils::FilterBox(&workspace_proposals_i, param_.rpn_min_size * im_info[b][2]);

      Tensor<cpu, 1> score = workspace_pre_nms_i[0];
      Tensor<cpu, 1> order = workspace_pre_nms_i[1];

      utils::CopyScore(workspace_proposals_i,
                       &score,
                       &order);
      utils::ReverseArgsort(score,
                            &order);
      utils::ReorderProposals(workspace_proposals_i,
                              order,
                              rpn_pre_nms_top_n,
                              &workspace_ordered_proposals_i);
      int out_size = 0;
      Tensor<cpu, 1> area = workspace_nms_i[0];
      Tensor<cpu, 1> suppressed = workspace_nms_i[1];
      Tensor<cpu, 1> keep = workspace_nms_i[2];
      suppressed = 0;  // surprised!

      utils::NonMaximumSuppression(workspace_ordered_proposals_i,
                                   param_.threshold,
                                   rpn_post_nms_top_n,
                                   &area,
                                   &suppressed,
                                   &keep,
                                   &out_size);

      // fill in output rois and output scores
      #pragma omp parallel for num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount())
      for (int i = 0; i < param_.rpn_post_nms_top_n; ++i) {
        int out_index = b * param_.rpn_post_nms_top_n + i;
        out[out_index][0] = b;
        if (i < out_size) {
          index_t index = keep[i];
          for (index_t j = 0; j < 4; ++j) {
            out[out_index][j + 1] =  workspace_ordered_proposals_i[index][j];
          }
          out_score[out_index][0] = workspace_ordered_proposals_i[index][4];
        } else {
          index_t index = keep[i % out_size];
          for (index_t j = 0; j < 4; ++j) {
            out[out_index][j + 1] = workspace_ordered_proposals_i[index][j];
          }
          out_score[out_index][0] = workspace_ordered_proposals_i[index][4];
        }
      }
    }
  }

  virtual void Backward(const OpContext &ctx,
                        const std::vector<TBlob> &out_grad,
                        const std::vector<TBlob> &in_data,
                        const std::vector<TBlob> &out_data,
                        const std::vector<OpReqType> &req,
                        const std::vector<TBlob> &in_grad,
                        const std::vector<TBlob> &aux_states) {
    using namespace mshadow;
    using namespace mshadow::expr;
    CHECK_EQ(in_grad.size(), 3);

    Stream<xpu> *s = ctx.get_stream<xpu>();
    Tensor<xpu, 4> gscores = in_grad[proposal::kClsProb].get<xpu, 4, real_t>(s);
    Tensor<xpu, 4> gbbox = in_grad[proposal::kBBoxPred].get<xpu, 4, real_t>(s);
    Tensor<xpu, 2> ginfo = in_grad[proposal::kImInfo].get<xpu, 2, real_t>(s);

    // can not assume the grad would be zero
    Assign(gscores, req[proposal::kClsProb], 0);
    Assign(gbbox, req[proposal::kBBoxPred], 0);
    Assign(ginfo, req[proposal::kImInfo], 0);
  }

 private:
  MultiProposalParam param_;
};  // class MultiProposalOp

template<>
Operator *CreateOp<cpu>(MultiProposalParam param) {
  return new MultiProposalOp<cpu>(param);
}

Operator* MultiProposalProp::CreateOperator(Context ctx) const {
  DO_BIND_DISPATCH(CreateOp, param_);
}

DMLC_REGISTER_PARAMETER(MultiProposalParam);

MXNET_REGISTER_OP_PROPERTY(_contrib_MultiProposal, MultiProposalProp)
.describe("Generate region proposals via RPN")
.add_argument("cls_prob", "NDArray-or-Symbol", "Score of how likely proposal is object.")
.add_argument("bbox_pred", "NDArray-or-Symbol", "BBox Predicted deltas from anchors for proposals")
.add_argument("im_info", "NDArray-or-Symbol", "Image size and scale.")
.add_arguments(MultiProposalParam::__FIELDS__());

}  // namespace op
}  // namespace mxnet