From 927aff695f507dd0588e0315be64f784fe13ab48 Mon Sep 17 00:00:00 2001
From: huafengchun <huafengchun@gmail.com>
Date: Thu, 21 Dec 2023 15:26:29 +0800
Subject: [PATCH 1/2] Enable AscendC kernel operator

AscendC is an extended syntax for the C/C++ language that can be used to
write operators that run on Ascend NPU.

This commit introduce an operator(threshold) written in AscendC. Others
can refer to this to implement other operators.

AscendC can implement efficient fusion operators according to needs, in
this case, threshold execution speed increased by nearly 4 times.

Co-authored-by: CaoMengqing <cmq0113@163.com>
---
 modules/cannops/CMakeLists.txt                |   6 +
 .../cannops/ascendc_kernels/CMakeLists.txt    |  17 +
 .../ascendc_kernels/kernel_tiling_types.h     |  22 +
 .../threshold_opencv_kernel.cpp               | 387 ++++++++++++++++++
 .../cannops/ascendc_kernels/vector_tiling.h   |  77 ++++
 .../include/opencv2/ascendc_kernels.hpp       |   7 +
 modules/cannops/include/opencv2/cann_call.hpp |  28 +-
 .../cannops/perf/perf_element_operations.cpp  |  20 +
 modules/cannops/src/ascend_mat.cpp            |   6 +-
 modules/cannops/src/cann_call.cpp             |  12 +-
 modules/cannops/src/element_operations.cpp    | 128 +++---
 modules/cannops/src/precomp.hpp               |   1 +
 .../cannops/test/test_element_operations.cpp  |  33 +-
 modules/cannops/test/test_kernel.cpp          |  51 +++
 modules/cannops/test/test_precomp.hpp         |   1 +
 15 files changed, 704 insertions(+), 92 deletions(-)
 create mode 100644 modules/cannops/ascendc_kernels/CMakeLists.txt
 create mode 100644 modules/cannops/ascendc_kernels/kernel_tiling_types.h
 create mode 100644 modules/cannops/ascendc_kernels/threshold_opencv_kernel.cpp
 create mode 100644 modules/cannops/ascendc_kernels/vector_tiling.h
 create mode 100644 modules/cannops/include/opencv2/ascendc_kernels.hpp
 create mode 100644 modules/cannops/test/test_kernel.cpp

diff --git a/modules/cannops/CMakeLists.txt b/modules/cannops/CMakeLists.txt
index 0c16c5eb143..557fbe7f492 100644
--- a/modules/cannops/CMakeLists.txt
+++ b/modules/cannops/CMakeLists.txt
@@ -15,3 +15,9 @@ ocv_include_directories(${CMAKE_SOURCE_DIR}/modules/ts/include)
 ocv_add_accuracy_tests(DEPENDS_ON opencv_cannops)
 ocv_add_perf_tests(DEPENDS_ON opencv_cannops)
 ocv_add_samples(opencv_cannops)
+
+# compile ascnedc kernels.
+add_subdirectory(ascendc_kernels)
+ocv_include_directories(${CMAKE_BINARY_DIR}/include/ascendc_kernels)
+ocv_target_link_libraries(opencv_cannops PRIVATE ascendc_kernels)
+ocv_target_link_libraries(opencv_test_cannops PRIVATE ascendc_kernels)
diff --git a/modules/cannops/ascendc_kernels/CMakeLists.txt b/modules/cannops/ascendc_kernels/CMakeLists.txt
new file mode 100644
index 00000000000..c4198e8b8e6
--- /dev/null
+++ b/modules/cannops/ascendc_kernels/CMakeLists.txt
@@ -0,0 +1,17 @@
+set(SOC_VERSION "ascend310p3" CACHE STRING "system on chip type")
+set(ASCEND_CANN_PACKAGE_PATH "/usr/local/Ascend/ascend-toolkit/latest" CACHE PATH "ASCEND CANN package installation directory")
+set(RUN_MODE "npu" CACHE STRING "run mode: npu/sim/cpu")
+
+if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake)
+    set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake)
+elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/ascendc_devkit/tikcpp/samples/cmake)
+    set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/ascendc_devkit/tikcpp/samples/cmake)
+else()
+    message(FATAL_ERROR "ascendc_kernel_cmake does not exist, please check whether the compiler package is installed.")
+endif()
+
+include(${ASCENDC_CMAKE_DIR}/ascendc.cmake)
+
+ascendc_library(ascendc_kernels STATIC
+    threshold_opencv_kernel.cpp
+)
diff --git a/modules/cannops/ascendc_kernels/kernel_tiling_types.h b/modules/cannops/ascendc_kernels/kernel_tiling_types.h
new file mode 100644
index 00000000000..3fbbdd06a63
--- /dev/null
+++ b/modules/cannops/ascendc_kernels/kernel_tiling_types.h
@@ -0,0 +1,22 @@
+#ifndef KERNEL_TILING_H
+#define KERNEL_TILING_H
+
+/*
+ * threshType:
+ *   THRESH_BINARY     = 0,
+ *   THRESH_BINARY_INV = 1,
+ *   THRESH_TRUNC      = 2,
+ *   THRESH_TOZERO     = 3,
+ *   THRESH_TOZERO_INV = 4,
+*/
+#pragma pack(push, 8)
+struct ThresholdOpencvTilingData
+{
+    float maxVal;
+    float thresh;
+    uint32_t totalLength;
+    uint8_t threshType;
+    uint8_t dtype;
+};
+#pragma pack(pop)
+#endif // KERNEL_TILING_H
diff --git a/modules/cannops/ascendc_kernels/threshold_opencv_kernel.cpp b/modules/cannops/ascendc_kernels/threshold_opencv_kernel.cpp
new file mode 100644
index 00000000000..ffab30ebd54
--- /dev/null
+++ b/modules/cannops/ascendc_kernels/threshold_opencv_kernel.cpp
@@ -0,0 +1,387 @@
+#include "kernel_operator.h"
+#include "vector_tiling.h"
+#include "kernel_tiling_types.h"
+
+using namespace AscendC;
+
+// Make compiler happy. These two function will never be called.
+__aicore__ static inline void Cast(const LocalTensor<half>& dstLocal,
+                                   const LocalTensor<half>& srcLocal, const RoundMode& round_mode,
+                                   const uint32_t calCount){};
+__aicore__ static inline void Cast(const LocalTensor<float>& dstLocal,
+                                   const LocalTensor<float>& srcLocal, const RoundMode& round_mode,
+                                   const uint32_t calCount){};
+
+/**
+ * T: input data type.
+ * C: data type for calculate.
+ * if T != C, data should cast from T to C.
+ */
+template <typename T, typename C>
+class KernelThreshold
+{
+public:
+    __aicore__ inline KernelThreshold() {}
+    __aicore__ inline void Init(ThresholdOpencvTilingData* tiling, GM_ADDR x, GM_ADDR y)
+    {
+        tilingData = tiling;
+
+        /**
+         * Calculate memory use per element.
+         * 1. InputQueue: sizeof(T) * BUFFER_NUM
+         * 2. OutputQueue: sizeof(T) * BUFFER_NUM
+         * 3. maskBuffer: 1 byte at most.
+         */
+        uint64_t bytesPerElem = sizeof(T) * BUFFER_NUM * 2 + sizeof(uint8_t) * 1;
+
+        /**
+         * If need cast, should init two more cast buffers.
+         * Memory use per element:
+         * 1. InputCastBuffer: sizeof(C)
+         * 2. OutputCastBuffer: sizeof(C)
+         */
+        if (!std::is_same<T, C>::value)
+        {
+            bytesPerElem += sizeof(C) * 2;
+        }
+
+        // Most of AscendC APIs need align to 32 Bytes, but Compare and Select need
+        // align to 256 Bytes, 256/sizeof(C) means how many element can be process
+        // in one loop.
+        vecTiling.calculate(tilingData->totalLength, GetBlockNum(), GetBlockIdx(), bytesPerElem,
+                            256 / sizeof(C));
+
+        xGM.SetGlobalBuffer((__gm__ T*)x + vecTiling.blockOffset, vecTiling.blockLength);
+        yGM.SetGlobalBuffer((__gm__ T*)y + vecTiling.blockOffset, vecTiling.blockLength);
+
+        // Cast buffer.
+        if (!std::is_same<T, C>::value)
+        {
+            pipe.InitBuffer(InputCastBuffer, vecTiling.loopLength * sizeof(C));
+            pipe.InitBuffer(outputCastBuffer, vecTiling.loopLength * sizeof(C));
+        }
+
+        pipe.InitBuffer(inputQueue, BUFFER_NUM, vecTiling.loopLength * sizeof(T));
+        pipe.InitBuffer(outputQueue, BUFFER_NUM, vecTiling.loopLength * sizeof(T));
+        pipe.InitBuffer(maskBuffer, vecTiling.loopLength * sizeof(uint8_t));
+    }
+
+    __aicore__ inline void Run()
+    {
+        for (uint32_t loop = 0; loop < vecTiling.loopCount; loop++)
+        {
+            uint32_t offset = loop * vecTiling.loopLength;
+            Compute(offset, vecTiling.loopLength);
+        }
+
+        if (vecTiling.loopTailLength != 0)
+        {
+            uint32_t offset = vecTiling.loopCount * vecTiling.loopLength;
+            Compute(offset, vecTiling.loopTailLength);
+        }
+    }
+
+private:
+    __aicore__ inline void Compute(uint32_t offset, uint32_t len)
+    {
+        CopyIn(offset, len);
+
+        // Get local Tensor, if case is need, local tensors come from
+        // cast buffer. otherwise, local tensors come from input/output queue.
+        LocalTensor<C> xLocal = CastInput(inputQueue, InputCastBuffer, len);
+        LocalTensor<C> yLocal = GetOutput(outputQueue, outputCastBuffer);
+
+        Threshold(xLocal, yLocal, len);
+
+        // Free local input tensor if tensor is not from cast buffer.
+        FreeInput(inputQueue, xLocal);
+        // Cast output tensor to output queue if output tensor is from cast buffer.
+        CastOutput(outputQueue, yLocal, len);
+
+        CopyOut(offset, len);
+    }
+
+    /**
+     * If need cast:
+     * 1. Get data from input queue, this data can't be calculate directly.
+     * 2. Get buffer with type C, which satisfied AscendC APIs.
+     * 3. Cast data from T to C.
+     *
+     * If not need cast:
+     * 1. Only need get data from queue.
+     */
+    __aicore__ inline LocalTensor<C> CastInput(TQue<QuePosition::VECIN, BUFFER_NUM>& queue,
+                                               TBuf<TPosition::VECCALC>& buffer, uint32_t len)
+    {
+        LocalTensor<C> xLocal;
+        if (std::is_same<T, C>::value)
+        {
+            xLocal = queue.DeQue<C>();
+        }
+        else
+        {
+            xLocal = buffer.Get<C>();
+            LocalTensor<T> xCast = queue.DeQue<T>();
+            Cast(xLocal, xCast, RoundMode::CAST_NONE, len);
+            queue.FreeTensor(xCast);
+        }
+        return xLocal;
+    }
+
+    /**
+     * If need cast:
+     * 1. Get local tensor from cast buffer.
+     *
+     * If not need cast:
+     * 1. Alloc local tensor from output queue.
+     */
+    __aicore__ inline LocalTensor<C> GetOutput(TQue<QuePosition::VECOUT, BUFFER_NUM>& queue,
+                                               TBuf<TPosition::VECCALC>& buffer)
+    {
+        if (std::is_same<T, C>::value)
+        {
+            return queue.AllocTensor<C>();
+        }
+        else
+        {
+            return buffer.Get<C>();
+        }
+    }
+
+    /**
+     * If need cast:
+     * 1. Input local tensor are get from cast buffer, which do not need free.
+     *
+     * If not need cast:
+     * 1. Input local tensor are alloced from input queue, which need free.
+     */
+    __aicore__ inline void FreeInput(TQue<QuePosition::VECIN, BUFFER_NUM>& queue,
+                                     LocalTensor<C>& xLocal)
+    {
+        if (std::is_same<T, C>::value)
+        {
+            queue.FreeTensor(xLocal);
+        }
+    }
+
+    /**
+     * If need cast:
+     * 1. Alloc local tensor from output queue.
+     * 2. Cast from C to T.
+     * 3. Put casted local tensor in queue.
+     *
+     * If not need cast:
+     * 1. Only put local tensor in queue.
+     *
+     */
+    __aicore__ inline void CastOutput(TQue<QuePosition::VECOUT, BUFFER_NUM>& queue,
+                                      LocalTensor<C>& yLocal, uint32_t len)
+    {
+        if (std::is_same<T, C>::value)
+        {
+            queue.EnQue(yLocal);
+        }
+        else
+        {
+            LocalTensor<T> yCast = queue.AllocTensor<T>();
+            RoundMode roundMode = RoundMode::CAST_NONE;
+            // Ref to AscendC cast API.
+            if (std::is_same<T, int16_t>::value)
+            {
+                roundMode = RoundMode::CAST_RINT;
+            }
+            else if (std::is_same<T, int32_t>::value)
+            {
+                roundMode = RoundMode::CAST_ROUND;
+            }
+            Cast(yCast, yLocal, roundMode, len);
+            queue.EnQue(yCast);
+        }
+    }
+
+    __aicore__ inline void CopyIn(uint32_t offset, uint32_t len)
+    {
+        LocalTensor<T> xLocal = inputQueue.AllocTensor<T>();
+        DataCopy(xLocal, xGM[offset], len);
+        inputQueue.EnQue(xLocal);
+    }
+
+    __aicore__ inline void CopyOut(uint32_t offset, uint32_t len)
+    {
+        LocalTensor<T> yLocal = outputQueue.DeQue<T>();
+        DataCopy(yGM[offset], yLocal, len);
+        outputQueue.FreeTensor(yLocal);
+    }
+
+    /**
+     * AscendC API Compare Warpper.
+     * AscendC Compare level2 API need input length align to 256, process
+     * tail data by level0 API.
+     */
+    __aicore__ inline void CompareWrap(const LocalTensor<uint8_t>& dstLocal,
+                                       const LocalTensor<C>& src0Local,
+                                       const LocalTensor<C>& src1Local, CMPMODE cmpMode,
+                                       uint32_t calCount)
+    {
+        // Elements total count for on loop inside Compare.
+        uint32_t batchCount = 256 / sizeof(C);
+
+        // Tail elements count.
+        uint32_t tailCount = calCount % batchCount;
+
+        // Level2 API, calCount should align to 256.
+        Compare(dstLocal, src0Local, src1Local, cmpMode, calCount - tailCount);
+
+        // Data blocks are already cut align to 256, tail count will be 0 for
+        // all process loops except last one.
+        if (tailCount != 0)
+        {
+            BinaryRepeatParams repeatParams = {1, 1, 1, 8, 8, 8};
+            uint32_t tailIdx = calCount - tailCount;
+            uint32_t maskIdx = tailIdx / sizeof(uint8_t);
+            Compare(dstLocal[maskIdx], src0Local[tailIdx], src1Local[tailIdx], cmpMode, tailCount,
+                    1, repeatParams);
+        }
+    }
+
+    /**
+     * AscendC API Select Warpper.
+     * AscendC Select level2 API need input length align to 256, process
+     * tail data by level0 API.
+     */
+    __aicore__ inline void SelectWrap(const LocalTensor<C>& dstLocal,
+                                      const LocalTensor<uint8_t>& selMask,
+                                      const LocalTensor<C>& src0Local, C src1Local, SELMODE selMode,
+                                      uint32_t calCount)
+    {
+        uint32_t batchCount = 256 / sizeof(C);
+        uint32_t tailCount = calCount % batchCount;
+
+        Select(dstLocal, selMask, src0Local, src1Local, selMode, calCount - tailCount);
+        if (tailCount != 0)
+        {
+            BinaryRepeatParams repeatParams = {1, 1, 1, 8, 8, 8};
+            uint32_t tailIdx = calCount - tailCount;
+            uint32_t maskIdx = tailIdx / sizeof(uint8_t);
+            Select(dstLocal[tailIdx], selMask[maskIdx], src0Local[tailIdx], src1Local, selMode,
+                   tailCount, 1, repeatParams);
+        }
+    }
+
+    __aicore__ inline void Threshold(LocalTensor<C>& xLocal, LocalTensor<C>& yLocal, uint32_t len)
+    {
+        LocalTensor<uint8_t> mask = maskBuffer.Get<uint8_t>();
+        Duplicate(yLocal, static_cast<C>(tilingData->thresh), len);
+        switch (tilingData->threshType)
+        {
+            case 0:
+                CompareWrap(mask, xLocal, yLocal, CMPMODE::LE, len);
+                Duplicate(yLocal, static_cast<C>(0), len);
+                SelectWrap(yLocal, mask, yLocal, static_cast<C>(tilingData->maxVal),
+                           SELMODE::VSEL_TENSOR_SCALAR_MODE, len);
+                break;
+            case 1:
+                CompareWrap(mask, xLocal, yLocal, CMPMODE::GT, len);
+                Duplicate(yLocal, static_cast<C>(0), len);
+                SelectWrap(yLocal, mask, yLocal, static_cast<C>(tilingData->maxVal),
+                           SELMODE::VSEL_TENSOR_SCALAR_MODE, len);
+                break;
+            case 2:
+                CompareWrap(mask, xLocal, yLocal, CMPMODE::LE, len);
+                SelectWrap(yLocal, mask, xLocal, static_cast<C>(tilingData->thresh),
+                           SELMODE::VSEL_TENSOR_SCALAR_MODE, len);
+                break;
+            case 3:
+                CompareWrap(mask, xLocal, yLocal, CMPMODE::GT, len);
+                SelectWrap(yLocal, mask, xLocal, static_cast<C>(0),
+                           SELMODE::VSEL_TENSOR_SCALAR_MODE, len);
+                break;
+            case 4:
+                CompareWrap(mask, xLocal, yLocal, CMPMODE::LE, len);
+                SelectWrap(yLocal, mask, xLocal, static_cast<C>(0),
+                           SELMODE::VSEL_TENSOR_SCALAR_MODE, len);
+                break;
+            default:
+                break;
+        }
+    }
+
+    TPipe pipe;
+    TQue<QuePosition::VECIN, BUFFER_NUM> inputQueue;
+    TQue<QuePosition::VECOUT, BUFFER_NUM> outputQueue;
+    TBuf<TPosition::VECCALC> InputCastBuffer, outputCastBuffer, maskBuffer;
+
+    GlobalTensor<T> xGM, yGM;
+    VectorTiling vecTiling;
+    ThresholdOpencvTilingData* tilingData;
+};
+
+#define LAUNCH_THRESHOLD_KERNEL(NAME, T, C)                                                      \
+    __aicore__ inline void launch_threshold_kernel_##NAME(ThresholdOpencvTilingData* tilingData, \
+                                                          GM_ADDR x, GM_ADDR y)                  \
+    {                                                                                            \
+        KernelThreshold<T, C> op;                                                                \
+        op.Init(tilingData, x, y);                                                               \
+        op.Run();                                                                                \
+    }
+
+LAUNCH_THRESHOLD_KERNEL(CV_8U, uint8_t, half)   // CV_8U
+LAUNCH_THRESHOLD_KERNEL(CV_8S, int8_t, half)    // CV_8S
+                                                // CV_16U
+LAUNCH_THRESHOLD_KERNEL(CV_16S, int16_t, half)  // CV_16S
+LAUNCH_THRESHOLD_KERNEL(CV_32S, int32_t, float) // CV_32S
+LAUNCH_THRESHOLD_KERNEL(CV_32F, float, float)   // CV_32F
+                                                // CV_64F
+LAUNCH_THRESHOLD_KERNEL(CV_16F, half, half)     // CV_16F
+
+#undef LAUNCH_THRESHOLD_KERNEL
+
+#define CALL_THRESHOLD_KERNEL(NAME) launch_threshold_kernel_##NAME
+
+extern "C" __global__ __aicore__ void threshold_opencv(GM_ADDR tilingGM, GM_ADDR x, GM_ADDR y)
+{
+    ThresholdOpencvTilingData tilingData;
+    auto tempTilingGM = (__gm__ uint8_t*)tilingGM;
+    auto tempTiling = (uint8_t*)&tilingData;
+    for (int32_t i = 0; i < sizeof(ThresholdOpencvTilingData) / sizeof(uint8_t);
+         ++i, ++tempTilingGM, ++tempTiling)
+    {
+        *tempTiling = *tempTilingGM;
+    }
+
+    // AscendC can only call inline functions, function pointer can't be used here.
+    // Use Macro and switch case instead.
+    switch (tilingData.dtype)
+    {
+        case 0:
+            CALL_THRESHOLD_KERNEL(CV_8U)(&tilingData, x, y);
+            break;
+        case 1:
+            CALL_THRESHOLD_KERNEL(CV_8S)(&tilingData, x, y);
+            break;
+        case 3:
+            CALL_THRESHOLD_KERNEL(CV_16S)(&tilingData, x, y);
+            break;
+        case 4:
+            CALL_THRESHOLD_KERNEL(CV_32S)(&tilingData, x, y);
+            break;
+        case 5:
+            CALL_THRESHOLD_KERNEL(CV_32F)(&tilingData, x, y);
+            break;
+        case 7:
+            CALL_THRESHOLD_KERNEL(CV_16F)(&tilingData, x, y);
+            break;
+        case 2: case 6: default: // CV_16U, CV_64F
+            break;
+    }
+    // Clear tiling GM cache manually. (cce compiler bug)
+    dcci(tilingGM, 1);
+}
+
+#ifndef __CCE_KT_TEST__
+void threshold_opencv_kernel(uint32_t blockDim, void* l2ctrl, void* stream, uint8_t* tiling,
+                             uint8_t* x, uint8_t* y)
+{
+    threshold_opencv<<<blockDim, l2ctrl, stream>>>(tiling, x, y);
+}
+#endif
diff --git a/modules/cannops/ascendc_kernels/vector_tiling.h b/modules/cannops/ascendc_kernels/vector_tiling.h
new file mode 100644
index 00000000000..e00dd423c38
--- /dev/null
+++ b/modules/cannops/ascendc_kernels/vector_tiling.h
@@ -0,0 +1,77 @@
+#ifndef TILING_KERNEL_H
+#define TILING_KERNEL_H
+
+#ifdef __CCE_KT_TEST__
+#define __aicore__
+#else
+#define __aicore__ [aicore]
+#endif
+
+inline __aicore__ int32_t AlignNCeil(int32_t n, int32_t align) { return ((n + align) & ~(align-1)); }
+
+inline __aicore__ int32_t AlignNFloor(int32_t n, int32_t align) { return (n & ~(align-1)); }
+
+constexpr int32_t BUFFER_NUM = 2;
+constexpr int32_t UB_BUF_LEN = 248 * 1024;
+
+struct VectorTiling {
+  __aicore__ inline void calculate(uint64_t _totalLength, uint64_t _blockNum,
+                                   uint64_t _blockIdx, uint64_t _variableBytesPerElem, uint32_t _align) {
+    totalLength = _totalLength;
+    blockNum = _blockNum;
+    blockIdx = _blockIdx;
+    variableBytesPerElem = _variableBytesPerElem;
+    blockLength = 0;
+    blockOffset = 0;
+    align = _align;
+    GetBlockLengthAndOffset();
+    GetLoopLengthAndCount();
+#ifdef __CCE_KT_TEST__
+    std::cout << "Block(" << blockIdx << "): BlockLength = " << blockLength
+              << ", BlockOffset = " << blockOffset
+              << ", LoopLength = " << loopLength
+              << ", LoopCount = " << loopCount
+              << ", LoopTailLength = " << loopTailLength << std::endl;
+#endif
+  }
+
+  __aicore__ inline void GetBlockLengthAndOffset() {
+    // Data should Align by 32B.
+    uint32_t fullBlockLength = AlignNCeil(totalLength / blockNum, 32);
+    // Some core may get no data after Align32 Ceil.
+    uint32_t fullBlockNum = totalLength / fullBlockLength;
+    uint32_t blockTailLength = totalLength % fullBlockLength;
+
+    if (blockIdx < fullBlockNum) {
+      blockLength = fullBlockLength;
+      blockOffset = blockIdx * blockLength;
+      // Last block must less than full block num.
+    } else if (blockTailLength != 0 && blockIdx == fullBlockNum) {
+      blockLength = blockTailLength;
+      blockOffset = blockIdx * fullBlockLength;
+    }
+  }
+
+  /**
+   * @brief Get length for one loop and loop count.
+   * Use as much UB buf as possible.
+   */
+  __aicore__ inline void GetLoopLengthAndCount() {
+    loopLength = AlignNFloor(UB_BUF_LEN / variableBytesPerElem, align);
+    loopCount = blockLength / loopLength;
+    loopTailLength = blockLength - (loopLength * loopCount);
+  }
+
+  uint64_t totalLength;
+  uint64_t blockNum;
+  uint64_t blockIdx;
+  uint64_t variableBytesPerElem;
+  uint32_t blockLength;
+  uint32_t blockOffset;
+  uint32_t loopLength;
+  uint32_t loopCount;
+  uint32_t loopTailLength;
+  uint32_t align;
+};
+
+#endif  // TILING_KERNEL_H
diff --git a/modules/cannops/include/opencv2/ascendc_kernels.hpp b/modules/cannops/include/opencv2/ascendc_kernels.hpp
new file mode 100644
index 00000000000..714b6460fd7
--- /dev/null
+++ b/modules/cannops/include/opencv2/ascendc_kernels.hpp
@@ -0,0 +1,7 @@
+#ifndef ASCENDC_KERNELS_H
+#define KERNEL_TILINASCENDC_KERNELS_HG_H
+
+#include "../../ascendc_kernels/kernel_tiling_types.h"
+#include "aclrtlaunch_threshold_opencv.h"
+
+#endif //ASCENDC_KERNELS_H
diff --git a/modules/cannops/include/opencv2/cann_call.hpp b/modules/cannops/include/opencv2/cann_call.hpp
index 651bff8bba0..e21f339db96 100644
--- a/modules/cannops/include/opencv2/cann_call.hpp
+++ b/modules/cannops/include/opencv2/cann_call.hpp
@@ -9,7 +9,9 @@
 #include <set>
 #include <string>
 #include <acl/acl_base.h>
-#include "opencv2/cann.hpp"
+#include "cann.hpp"
+#include "stream_accessor.hpp"
+#include "ascendc_kernels.hpp"
 
 class aclopAttr;
 
@@ -17,6 +19,15 @@ namespace cv
 {
 namespace cann
 {
+CV_EXPORTS void checkAclError(aclError err, const char* file, const int line, const char* func);
+void checkAclPtr(void* ptr, const char* file, const int line, const char* func);
+#define CV_ACL_SAFE_CALL(expr) checkAclError((expr), __FILE__, __LINE__, CV_Func)
+#define CV_ACL_SAFE_CALL_PTR(expr)                     \
+    ({                                                 \
+        auto ptr = (expr);                             \
+        checkAclPtr(ptr, __FILE__, __LINE__, CV_Func); \
+        ptr;                                           \
+    })
 // Warpper for functions in CANN, callers should not call CANN's api directly, but should call the
 // function provided in cann_call.
 void aclrtMallocWarpper(void** data, size_t size);
@@ -39,7 +50,7 @@ void aclrtMemsetWarpper(std::shared_ptr<uchar>& ptr, int32_t value, size_t count
 //! Type mapping between opencv and cann.
 aclDataType getACLType(int opencvdepth);
 //! Malloc and upload raw data to devices.
-std::shared_ptr<uchar> mallocAndUpload(const void* data, size_t size, AscendStream& stream,
+CV_EXPORTS std::shared_ptr<uchar> mallocAndUpload(const void* data, size_t size, AscendStream& stream,
                                        AscendMat::Allocator* allocator);
 /**
  * @brief Warpper of CANN streams.
@@ -151,6 +162,19 @@ class OperatorRunner
     OperatorRunner& run(AscendStream& stream);
 };
 
+template <typename KERNEL_TYPE, typename TILING_TYPE, typename... ARGS>
+void kernel_launch(KERNEL_TYPE kernel, AscendStream& stream, TILING_TYPE& tiling, ARGS... args)
+{
+    std::shared_ptr<uchar> tilingDevice =
+        mallocAndUpload(&tiling, sizeof(TILING_TYPE), stream, AscendMat::defaultAllocator());
+    aclrtStream rawStream = AscendStreamAccessor::getStream(stream);
+    CV_ACL_SAFE_CALL(kernel(1, rawStream, tilingDevice.get(), args...));
+    if (rawStream == nullptr)
+    {
+        stream.waitForCompletion();
+    }
+}
+
 } // namespace cann
 } // namespace cv
 
diff --git a/modules/cannops/perf/perf_element_operations.cpp b/modules/cannops/perf/perf_element_operations.cpp
index 0612abe6085..4527346e190 100644
--- a/modules/cannops/perf/perf_element_operations.cpp
+++ b/modules/cannops/perf/perf_element_operations.cpp
@@ -207,5 +207,25 @@ PERF_TEST_P(CPU, MAT_BITWISE_NOT_MAT, testing::Combine(TYPICAL_ASCEND_MAT_SIZES,
     SANITY_CHECK_NOTHING();
 }
 
+PERF_TEST_P(NPU, THRESHOLD_ASCENDC, testing::Combine(TYPICAL_ASCEND_MAT_SIZES,  Values(CV_8U, CV_16S, CV_32F)))
+{
+    Mat mat(GET_PARAM(0), GET_PARAM(1));
+    AscendMat dst;
+    AscendMat src;
+    src.upload(mat);
+    declare.in(mat, WARMUP_RNG);
+    TEST_CYCLE_N(10) { cv::cann::threshold(src, dst, 100.0, 255.0, cv::THRESH_BINARY); }
+    SANITY_CHECK_NOTHING();
+}
+
+PERF_TEST_P(CPU, THRESHOLD, testing::Combine(TYPICAL_ASCEND_MAT_SIZES, Values(CV_8U, CV_16S, CV_32F)))
+{
+    Mat mat(GET_PARAM(0), GET_PARAM(1));
+    Mat dst;
+    declare.in(mat, WARMUP_RNG);
+    TEST_CYCLE_N(10) { cv::threshold(mat, dst, 100.0, 255.0, cv::THRESH_BINARY); }
+    SANITY_CHECK_NOTHING();
+}
+
 } // namespace
 } // namespace opencv_test
diff --git a/modules/cannops/src/ascend_mat.cpp b/modules/cannops/src/ascend_mat.cpp
index ba17a545bb7..dde838c8d37 100644
--- a/modules/cannops/src/ascend_mat.cpp
+++ b/modules/cannops/src/ascend_mat.cpp
@@ -23,7 +23,11 @@ std::shared_ptr<uchar> DefaultAllocator::allocate(size_t size)
 
 bool DefaultAllocator::allocate(cv::cann::AscendMat* mat, int rows, int cols, size_t elemSize)
 {
-    mat->data = allocate(elemSize * cols * rows);
+    size_t totalBytes = elemSize * cols * rows;
+
+    // align by 32B.
+    totalBytes = ((totalBytes + 32) & ~31);
+    mat->data = allocate(totalBytes);
     mat->step = cols * elemSize;
 
     return true;
diff --git a/modules/cannops/src/cann_call.cpp b/modules/cannops/src/cann_call.cpp
index 3b83052ccbe..97d49d66fd1 100644
--- a/modules/cannops/src/cann_call.cpp
+++ b/modules/cannops/src/cann_call.cpp
@@ -11,7 +11,7 @@ namespace cv
 namespace cann
 {
 /*******************************Acl Error Checker*****************************/
-static inline void checkAclError(aclError err, const char* file, const int line, const char* func)
+void checkAclError(aclError err, const char* file, const int line, const char* func)
 {
     if (ACL_SUCCESS != err)
     {
@@ -20,7 +20,7 @@ static inline void checkAclError(aclError err, const char* file, const int line,
     }
 }
 
-static inline void checkAclPtr(void* ptr, const char* file, const int line, const char* func)
+void checkAclPtr(void* ptr, const char* file, const int line, const char* func)
 {
     if (nullptr == ptr)
     {
@@ -29,14 +29,6 @@ static inline void checkAclPtr(void* ptr, const char* file, const int line, cons
     }
 }
 
-#define CV_ACL_SAFE_CALL(expr) checkAclError((expr), __FILE__, __LINE__, CV_Func)
-#define CV_ACL_SAFE_CALL_PTR(expr)                     \
-    ({                                                 \
-        auto ptr = (expr);                             \
-        checkAclPtr(ptr, __FILE__, __LINE__, CV_Func); \
-        ptr;                                           \
-    })
-
 /******************************Acl Runtime Warpper****************************/
 void aclrtMallocWarpper(void** data, size_t size)
 {
diff --git a/modules/cannops/src/element_operations.cpp b/modules/cannops/src/element_operations.cpp
index 402658369b5..48d9edb596b 100644
--- a/modules/cannops/src/element_operations.cpp
+++ b/modules/cannops/src/element_operations.cpp
@@ -3,6 +3,7 @@
 // of this distribution and at http://opencv.org/license.html.
 
 #include "precomp.hpp"
+
 namespace cv
 {
 namespace cann
@@ -110,8 +111,8 @@ static void convert(const Scalar& src, Scalar& dst, AscendStream& stream)
 }
 
 template <typename T1, typename T2>
-static void arithm_op(const T1& src1, const T2& src2, AscendMat& dst, const AscendMat& mask, float scale,
-                      int dtype, const char* op, AscendStream& stream)
+static void arithm_op(const T1& src1, const T2& src2, AscendMat& dst, const AscendMat& mask,
+                      float scale, int dtype, const char* op, AscendStream& stream)
 {
     T1 castedSrc1;
     T2 castedSrc2;
@@ -170,8 +171,9 @@ static void arithm_op(const T1& src1, const T2& src2, AscendMat& dst, const Asce
     }
 }
 
-static void arithm_op(const InputArray _src1, const InputArray _src2, OutputArray _dst, const InputArray _mask,
-                      float scale, int dtype, const char* op, AscendStream& stream)
+static void arithm_op(const InputArray _src1, const InputArray _src2, OutputArray _dst,
+                      const InputArray _mask, float scale, int dtype, const char* op,
+                      AscendStream& stream)
 {
     const bool isScalar1 = (_src1.kind() == _InputArray::MATX);
     const bool isScalar2 = (_src2.kind() == _InputArray::MATX);
@@ -213,56 +215,54 @@ static void arithm_op(const InputArray _src1, const InputArray _src2, OutputArra
 }
 
 // In order to supply more interfaces, differnet function declaration shoule be done.
-void add(const InputArray src1, const InputArray src2, OutputArray dst, const InputArray mask, int dtype,
-         AscendStream& stream)
+void add(const InputArray src1, const InputArray src2, OutputArray dst, const InputArray mask,
+         int dtype, AscendStream& stream)
 {
     arithm_op(src1, src2, dst, mask, 1, dtype, "Add", stream);
 }
 
-void add(const AscendMat& src1, const AscendMat& src2, AscendMat& dst, const AscendMat& mask, int dtype,
-         AscendStream& stream)
+void add(const AscendMat& src1, const AscendMat& src2, AscendMat& dst, const AscendMat& mask,
+         int dtype, AscendStream& stream)
 {
     arithm_op(src1, src2, dst, mask, 1, dtype, "Add", stream);
 }
 
-void add(const AscendMat& src1, const Scalar& src2, AscendMat& dst, const AscendMat& mask, int dtype,
-         AscendStream& stream)
+void add(const AscendMat& src1, const Scalar& src2, AscendMat& dst, const AscendMat& mask,
+         int dtype, AscendStream& stream)
 {
     arithm_op(src1, src2, dst, mask, 1, dtype, "Add", stream);
 }
 
-void add(const Scalar& src1, const AscendMat& src2, AscendMat& dst, const AscendMat& mask, int dtype,
-         AscendStream& stream)
+void add(const Scalar& src1, const AscendMat& src2, AscendMat& dst, const AscendMat& mask,
+         int dtype, AscendStream& stream)
 {
     arithm_op(src1, src2, dst, mask, 1, dtype, "Add", stream);
 }
 
-
-void subtract(const InputArray src1, const InputArray src2, OutputArray dst, const InputArray mask, int dtype,
-              AscendStream& stream)
+void subtract(const InputArray src1, const InputArray src2, OutputArray dst, const InputArray mask,
+              int dtype, AscendStream& stream)
 {
     arithm_op(src1, src2, dst, mask, 1, dtype, "Sub", stream);
 }
 
-void subtract(const AscendMat& src1, const AscendMat& src2, AscendMat& dst, const AscendMat& mask, int dtype,
-              AscendStream& stream)
+void subtract(const AscendMat& src1, const AscendMat& src2, AscendMat& dst, const AscendMat& mask,
+              int dtype, AscendStream& stream)
 {
     arithm_op(src1, src2, dst, mask, 1, dtype, "Sub", stream);
 }
 
-void subtract(const AscendMat& src1, const Scalar& src2, AscendMat& dst, const AscendMat& mask, int dtype,
-              AscendStream& stream)
+void subtract(const AscendMat& src1, const Scalar& src2, AscendMat& dst, const AscendMat& mask,
+              int dtype, AscendStream& stream)
 {
     arithm_op(src1, src2, dst, mask, 1, dtype, "Sub", stream);
 }
 
-void subtract(const Scalar& src1, const AscendMat& src2, AscendMat& dst, const AscendMat& mask, int dtype,
-              AscendStream& stream)
+void subtract(const Scalar& src1, const AscendMat& src2, AscendMat& dst, const AscendMat& mask,
+              int dtype, AscendStream& stream)
 {
     arithm_op(src1, src2, dst, mask, 1, dtype, "Sub", stream);
 }
 
-
 void multiply(const InputArray src1, const InputArray src2, OutputArray dst, float scale, int dtype,
               AscendStream& stream)
 {
@@ -287,7 +287,6 @@ void multiply(const Scalar& src1, const AscendMat& src2, AscendMat& dst, float s
     arithm_op(src1, src2, dst, AscendMat(), scale, dtype, "Mul", stream);
 }
 
-
 void divide(const InputArray src1, const InputArray src2, OutputArray dst, float scale, int dtype,
             AscendStream& stream)
 {
@@ -312,15 +311,14 @@ void divide(const Scalar& src1, const AscendMat& src2, AscendMat& dst, float sca
     arithm_op(src1, src2, dst, AscendMat(), scale, dtype, "RealDiv", stream);
 }
 
-
-void bitwise_and(const InputArray src1, const InputArray src2, OutputArray dst, const InputArray mask,
-                 AscendStream& stream)
+void bitwise_and(const InputArray src1, const InputArray src2, OutputArray dst,
+                 const InputArray mask, AscendStream& stream)
 {
     arithm_op(src1, src2, dst, mask, 1, -1, "BitwiseAnd", stream);
 }
 
-void bitwise_and(const AscendMat& src1, const AscendMat& src2, AscendMat& dst, const AscendMat& mask,
-                 AscendStream& stream)
+void bitwise_and(const AscendMat& src1, const AscendMat& src2, AscendMat& dst,
+                 const AscendMat& mask, AscendStream& stream)
 {
     arithm_op(src1, src2, dst, mask, 1, -1, "BitwiseAnd", stream);
 }
@@ -337,9 +335,8 @@ void bitwise_and(const Scalar& src1, const AscendMat& src2, AscendMat& dst, cons
     arithm_op(src1, src2, dst, mask, 1, -1, "BitwiseAnd", stream);
 }
 
-
-void bitwise_or(const InputArray src1, const InputArray src2, OutputArray dst, const InputArray mask,
-                AscendStream& stream)
+void bitwise_or(const InputArray src1, const InputArray src2, OutputArray dst,
+                const InputArray mask, AscendStream& stream)
 {
     arithm_op(src1, src2, dst, mask, 1, -1, "BitwiseOr", stream);
 }
@@ -362,15 +359,14 @@ void bitwise_or(const Scalar& src1, const AscendMat& src2, AscendMat& dst, const
     arithm_op(src1, src2, dst, mask, 1, -1, "BitwiseOr", stream);
 }
 
-
-void bitwise_xor(const InputArray src1, const InputArray src2, OutputArray dst, const InputArray mask,
-                 AscendStream& stream)
+void bitwise_xor(const InputArray src1, const InputArray src2, OutputArray dst,
+                 const InputArray mask, AscendStream& stream)
 {
     arithm_op(src1, src2, dst, mask, 1, -1, "BitwiseXor", stream);
 }
 
-void bitwise_xor(const AscendMat& src1, const AscendMat& src2, AscendMat& dst, const AscendMat& mask,
-                 AscendStream& stream)
+void bitwise_xor(const AscendMat& src1, const AscendMat& src2, AscendMat& dst,
+                 const AscendMat& mask, AscendStream& stream)
 {
     arithm_op(src1, src2, dst, mask, 1, -1, "BitwiseXor", stream);
 }
@@ -387,7 +383,6 @@ void bitwise_xor(const Scalar& src1, const AscendMat& src2, AscendMat& dst, cons
     arithm_op(src1, src2, dst, mask, 1, -1, "BitwiseXor", stream);
 }
 
-
 void bitwise_not(const InputArray src, OutputArray dst, const InputArray mask, AscendStream& stream)
 {
     arithm_op(src, noArray(), dst, mask, 1, -1, "Invert", stream);
@@ -398,9 +393,8 @@ void bitwise_not(const AscendMat& src, AscendMat& dst, const AscendMat& mask, As
     arithm_op(src, AscendMat(), dst, mask, 1, -1, "Invert", stream);
 }
 
-
-void addWeighted(const AscendMat& src1, double alpha, const AscendMat& src2, double beta, double gamma,
-                 AscendMat& dst, int dtype, AscendStream& stream)
+void addWeighted(const AscendMat& src1, double alpha, const AscendMat& src2, double beta,
+                 double gamma, AscendMat& dst, int dtype, AscendStream& stream)
 {
     if (dtype < 0)
         dtype = src1.depth();
@@ -421,8 +415,8 @@ void addWeighted(const AscendMat& src1, double alpha, const AscendMat& src2, dou
     arithm_op(srcWeightedSumRet, (float)gamma, dst, "Adds", stream);
 }
 
-void addWeighted(const InputArray _src1, double alpha, const InputArray _src2, double beta, double gamma,
-                 OutputArray _dst, int dtype, AscendStream& stream)
+void addWeighted(const InputArray _src1, double alpha, const InputArray _src2, double beta,
+                 double gamma, OutputArray _dst, int dtype, AscendStream& stream)
 {
     AscendMat src1, src2, dst;
     src1.upload(_src1, stream);
@@ -442,45 +436,23 @@ double threshold(const AscendMat& src, AscendMat& dst, double thresh, double max
 
     dst.create(src.rows, src.cols, src.type());
 
-    OperatorRunner runner;
-    runner.setOp("Threshold")
-        .addInput(src, "x")
-        .addOutput(threshMat, "y")
-        .addAttr((float)thresh, "threshold")
-        .run(stream);
-
-    // THRESH_*_INV, THRESH_TRUNC need a inverse threshMat.
-    // THRESH_BINARY_INV = 1, THRESH_TRUNC = 2, THRESH_TOZERO_INV = 4,
-    if (type == 1 || type == 2 || type == 4)
+    if (src.depth() == CV_8U || src.depth() == CV_8S || src.depth() == CV_16S ||
+        src.depth() == CV_32S || src.depth() == CV_32F || src.depth() == CV_16F)
     {
-        AscendMat threshInvMat(src.size(), src.type());
-        AscendMat ones(src.size(), src.type());
-        Scalar s(1, 1, 1, 1);
-        ones.setTo(s, stream);
-        arithm_op(ones, threshMat, threshInvMat, "Sub", stream);
-
-        if (type == 1)
-            arithm_op(threshInvMat, (float)maxval, dst, "Muls", stream);
-        else if (type == 2)
-        {
-            AscendMat ToZeroInvMat(src.size(), src.type());
-            AscendMat TruncMat(src.size(), src.type());
-            arithm_op(threshInvMat, src, ToZeroInvMat, "Mul", stream);
-            arithm_op(threshMat, (float)thresh, TruncMat, "Muls", stream);
-            arithm_op(ToZeroInvMat, TruncMat, dst, "Add", stream);
-        }
-        else
-            arithm_op(threshInvMat, src, dst, "Mul", stream);
+        ThresholdOpencvTilingData tiling;
+        tiling.maxVal = maxval;
+        tiling.thresh = thresh;
+        // AscendMat memory will be align to 32B, it's safe to set totalLengh a little bigger.
+        size_t totalBytes = src.rows * src.cols * src.channels();
+        tiling.totalLength = ((totalBytes + 32) & ~31);
+        tiling.threshType = type;
+        tiling.dtype = src.depth();
+
+        kernel_launch(aclrtlaunch_threshold_opencv, stream, tiling, src.data.get(), dst.data.get());
     }
     else
-    {
-        if (type == 0) /* THRESH_BINARY = 0 */
-            arithm_op(threshMat, (float)maxval, dst, "Muls", stream);
-        else if (type == 3) /* THRESH_TOZERO = 3 */
-            arithm_op(threshMat, src, dst, "Mul", stream);
-        else
-            CV_Error(Error::StsError, "Unknown/unsupported threshold type");
-    }
+        CV_Error(Error::StsUnsupportedFormat, "");
+
     return thresh;
 }
 
diff --git a/modules/cannops/src/precomp.hpp b/modules/cannops/src/precomp.hpp
index 8411cc40407..53ed398fde3 100644
--- a/modules/cannops/src/precomp.hpp
+++ b/modules/cannops/src/precomp.hpp
@@ -10,5 +10,6 @@
 #include "opencv2/cann_call.hpp"
 #include "opencv2/cann_interface.hpp"
 #include "opencv2/cann_private.hpp"
+#include "opencv2/ascendc_kernels.hpp"
 
 #endif /* __OPENCV_PRECOMP_H__ */
diff --git a/modules/cannops/test/test_element_operations.cpp b/modules/cannops/test/test_element_operations.cpp
index 76c103a65f4..730d2912d95 100644
--- a/modules/cannops/test/test_element_operations.cpp
+++ b/modules/cannops/test/test_element_operations.cpp
@@ -678,7 +678,6 @@ TEST(ELEMENTWISE_OP, MAT_THRESHOLD)
     for (int i = 0; i <= 4; i++)
     {
         cv::threshold(cpuMat, cpuOpRet, 128, 250, i);
-        // TODO find the reason empty AscendMat is not continuous.
         cv::cann::threshold(ascendMat16F, aclOpRet, 128, 250, i);
         aclOpRet.convertTo(aclOpRet16S, CV_16S);
         aclOpRet16S.download(checker);
@@ -693,5 +692,37 @@ TEST(ELEMENTWISE_OP, MAT_THRESHOLD)
     cv::cann::resetDevice();
 }
 
+TEST(ELEMENTWISE_OP, MAT_THRESHOLD_ASCENDC)
+{
+    cv::cann::setDevice(DEVICE_ID);
+    Mat cpuRet, npuRet;
+    AscendMat npuImg, npuTmpMat;
+
+    // opencv do not support CV_8S, CV_32S, CV_16F
+    // ascend do not support CV_16U, CV_64F
+    uint8_t dtypes[] = {CV_8U, CV_16S, CV_32F};
+
+    for (uint i = 0; i <= 4; i++)
+    {
+        for (uint j = 0; j < sizeof(dtypes) / sizeof(dtypes[0]); j++)
+        {
+            double thresh = 90.5;
+            double maxVal = 85.2;
+
+            Mat img = randomMat(10, 10, CV_MAKETYPE(dtypes[j], 3), 0.0f, 128.0f);
+            npuImg.upload(img);
+            npuTmpMat.create(npuImg.rows, npuImg.cols, npuImg.type());
+
+            cv::threshold(img, cpuRet, thresh, maxVal, i);
+            cv::cann::threshold(npuImg, npuTmpMat, thresh, maxVal, i);
+
+            npuTmpMat.download(npuRet);
+            EXPECT_MAT_NEAR(cpuRet, npuRet, 10.0f);
+        }
+    }
+
+    cv::cann::resetDevice();
+}
+
 } // namespace
 } // namespace opencv_test
diff --git a/modules/cannops/test/test_kernel.cpp b/modules/cannops/test/test_kernel.cpp
new file mode 100644
index 00000000000..ac0996a27b6
--- /dev/null
+++ b/modules/cannops/test/test_kernel.cpp
@@ -0,0 +1,51 @@
+#include "test_precomp.hpp"
+#include "opencv2/cann_call.hpp"
+
+namespace opencv_test
+{
+namespace
+{
+
+TEST(ASCENDC_KERNEL, THRESHOLD)
+{
+    cv::cann::setDevice(DEVICE_ID);
+    Mat cpuRet, npuRet;
+    AscendMat npuImg, npuTmpMat;
+
+    // opencv do not support CV_8S, CV_32S, CV_16F
+    // ascend do not support CV_16U, CV_64F
+    uint8_t dtypes[] = {CV_8U, CV_16S, CV_32F};
+
+    for (uint i = 0; i <= 4; i++)
+    {
+        for (uint j = 0; j < sizeof(dtypes) / sizeof(dtypes[0]); j++)
+        {
+            double thresh = 90.5;
+            double maxVal = 85.2;
+
+            Mat img = randomMat(10, 10, CV_MAKETYPE(dtypes[j], 3), 0.0f, 128.0f);
+            npuImg.upload(img);
+            npuTmpMat.create(npuImg.rows, npuImg.cols, npuImg.type());
+
+            cv::threshold(img, cpuRet, thresh, maxVal, i);
+            ThresholdOpencvTilingData tiling;
+            tiling.maxVal = maxVal;
+            tiling.thresh = thresh;
+            size_t totalBytes = img.rows * img.cols * img.channels();
+            // AscendMat memory will be align to 32B, it's safe to set totalLengh a little bigger.
+            tiling.totalLength = ((totalBytes + 32) & ~31);
+            tiling.threshType = i;
+            tiling.dtype = dtypes[j];
+            kernel_launch(aclrtlaunch_threshold_opencv, AscendStream::Null(), tiling,
+                          npuImg.data.get(), npuTmpMat.data.get());
+
+            npuTmpMat.download(npuRet);
+            EXPECT_MAT_NEAR(cpuRet, npuRet, 10.0f);
+        }
+    }
+
+    cv::cann::resetDevice();
+}
+
+} // namespace
+} // namespace opencv_test
diff --git a/modules/cannops/test/test_precomp.hpp b/modules/cannops/test/test_precomp.hpp
index f7bdbea0b08..74cfcb11ee9 100644
--- a/modules/cannops/test/test_precomp.hpp
+++ b/modules/cannops/test/test_precomp.hpp
@@ -9,6 +9,7 @@
 #include "opencv2/cann.hpp"
 #include "opencv2/ts/cuda_test.hpp"
 #include "opencv2/cann_interface.hpp"
+#include "opencv2/ascendc_kernels.hpp"
 
 using namespace cv;
 using namespace cv::cann;

From 09ed18d2802d892755ff168d9b6677c462ae4ec2 Mon Sep 17 00:00:00 2001
From: MengqingCao <cmq0113@163.com>
Date: Wed, 21 Feb 2024 10:33:09 +0800
Subject: [PATCH 2/2] Remove redundant code

1. remove threshold_opencv_kernel
2. typo ASCENDC_KERNELS_H
3. add ALIGN_UP macro
---
 .../cannops/ascendc_kernels/threshold_opencv_kernel.cpp   | 8 --------
 modules/cannops/include/opencv2/ascendc_kernels.hpp       | 2 +-
 modules/cannops/src/element_operations.cpp                | 2 +-
 modules/cannops/src/precomp.hpp                           | 1 +
 4 files changed, 3 insertions(+), 10 deletions(-)

diff --git a/modules/cannops/ascendc_kernels/threshold_opencv_kernel.cpp b/modules/cannops/ascendc_kernels/threshold_opencv_kernel.cpp
index ffab30ebd54..7fa1867c8b1 100644
--- a/modules/cannops/ascendc_kernels/threshold_opencv_kernel.cpp
+++ b/modules/cannops/ascendc_kernels/threshold_opencv_kernel.cpp
@@ -377,11 +377,3 @@ extern "C" __global__ __aicore__ void threshold_opencv(GM_ADDR tilingGM, GM_ADDR
     // Clear tiling GM cache manually. (cce compiler bug)
     dcci(tilingGM, 1);
 }
-
-#ifndef __CCE_KT_TEST__
-void threshold_opencv_kernel(uint32_t blockDim, void* l2ctrl, void* stream, uint8_t* tiling,
-                             uint8_t* x, uint8_t* y)
-{
-    threshold_opencv<<<blockDim, l2ctrl, stream>>>(tiling, x, y);
-}
-#endif
diff --git a/modules/cannops/include/opencv2/ascendc_kernels.hpp b/modules/cannops/include/opencv2/ascendc_kernels.hpp
index 714b6460fd7..b030920a62f 100644
--- a/modules/cannops/include/opencv2/ascendc_kernels.hpp
+++ b/modules/cannops/include/opencv2/ascendc_kernels.hpp
@@ -1,5 +1,5 @@
 #ifndef ASCENDC_KERNELS_H
-#define KERNEL_TILINASCENDC_KERNELS_HG_H
+#define ASCENDC_KERNELS_H
 
 #include "../../ascendc_kernels/kernel_tiling_types.h"
 #include "aclrtlaunch_threshold_opencv.h"
diff --git a/modules/cannops/src/element_operations.cpp b/modules/cannops/src/element_operations.cpp
index 48d9edb596b..cacf6e6cff1 100644
--- a/modules/cannops/src/element_operations.cpp
+++ b/modules/cannops/src/element_operations.cpp
@@ -444,7 +444,7 @@ double threshold(const AscendMat& src, AscendMat& dst, double thresh, double max
         tiling.thresh = thresh;
         // AscendMat memory will be align to 32B, it's safe to set totalLengh a little bigger.
         size_t totalBytes = src.rows * src.cols * src.channels();
-        tiling.totalLength = ((totalBytes + 32) & ~31);
+        tiling.totalLength = ALIGN_UP(totalBytes, 32);
         tiling.threshType = type;
         tiling.dtype = src.depth();
 
diff --git a/modules/cannops/src/precomp.hpp b/modules/cannops/src/precomp.hpp
index 53ed398fde3..8aadaf4d8de 100644
--- a/modules/cannops/src/precomp.hpp
+++ b/modules/cannops/src/precomp.hpp
@@ -11,5 +11,6 @@
 #include "opencv2/cann_interface.hpp"
 #include "opencv2/cann_private.hpp"
 #include "opencv2/ascendc_kernels.hpp"
+#define ALIGN_UP(num, align) (((num) + (align) - 1) & ~((align) - 1))
 
 #endif /* __OPENCV_PRECOMP_H__ */