From 927aff695f507dd0588e0315be64f784fe13ab48 Mon Sep 17 00:00:00 2001
From: huafengchun <huafengchun@gmail.com>
Date: Thu, 21 Dec 2023 15:26:29 +0800
Subject: [PATCH 1/4] Enable AscendC kernel operator

AscendC is an extended syntax for the C/C++ language that can be used to
write operators that run on Ascend NPU.

This commit introduces an operator (threshold) written in AscendC. Others
can refer to it when implementing other operators.

AscendC can implement efficient fusion operators as needed; in
this case, threshold execution speed increased by nearly 4 times.

Co-authored-by: CaoMengqing <cmq0113@163.com>
---
 modules/cannops/CMakeLists.txt                |   6 +
 .../cannops/ascendc_kernels/CMakeLists.txt    |  17 +
 .../ascendc_kernels/kernel_tiling_types.h     |  22 +
 .../threshold_opencv_kernel.cpp               | 387 ++++++++++++++++++
 .../cannops/ascendc_kernels/vector_tiling.h   |  77 ++++
 .../include/opencv2/ascendc_kernels.hpp       |   7 +
 modules/cannops/include/opencv2/cann_call.hpp |  28 +-
 .../cannops/perf/perf_element_operations.cpp  |  20 +
 modules/cannops/src/ascend_mat.cpp            |   6 +-
 modules/cannops/src/cann_call.cpp             |  12 +-
 modules/cannops/src/element_operations.cpp    | 128 +++---
 modules/cannops/src/precomp.hpp               |   1 +
 .../cannops/test/test_element_operations.cpp  |  33 +-
 modules/cannops/test/test_kernel.cpp          |  51 +++
 modules/cannops/test/test_precomp.hpp         |   1 +
 15 files changed, 704 insertions(+), 92 deletions(-)
 create mode 100644 modules/cannops/ascendc_kernels/CMakeLists.txt
 create mode 100644 modules/cannops/ascendc_kernels/kernel_tiling_types.h
 create mode 100644 modules/cannops/ascendc_kernels/threshold_opencv_kernel.cpp
 create mode 100644 modules/cannops/ascendc_kernels/vector_tiling.h
 create mode 100644 modules/cannops/include/opencv2/ascendc_kernels.hpp
 create mode 100644 modules/cannops/test/test_kernel.cpp

diff --git a/modules/cannops/CMakeLists.txt b/modules/cannops/CMakeLists.txt
index 0c16c5eb143..557fbe7f492 100644
--- a/modules/cannops/CMakeLists.txt
+++ b/modules/cannops/CMakeLists.txt
@@ -15,3 +15,9 @@ ocv_include_directories(${CMAKE_SOURCE_DIR}/modules/ts/include)
 ocv_add_accuracy_tests(DEPENDS_ON opencv_cannops)
 ocv_add_perf_tests(DEPENDS_ON opencv_cannops)
 ocv_add_samples(opencv_cannops)
+
+# Compile AscendC kernels.
+add_subdirectory(ascendc_kernels)
+ocv_include_directories(${CMAKE_BINARY_DIR}/include/ascendc_kernels)
+ocv_target_link_libraries(opencv_cannops PRIVATE ascendc_kernels)
+ocv_target_link_libraries(opencv_test_cannops PRIVATE ascendc_kernels)
diff --git a/modules/cannops/ascendc_kernels/CMakeLists.txt b/modules/cannops/ascendc_kernels/CMakeLists.txt
new file mode 100644
index 00000000000..c4198e8b8e6
--- /dev/null
+++ b/modules/cannops/ascendc_kernels/CMakeLists.txt
@@ -0,0 +1,17 @@
+set(SOC_VERSION "ascend310p3" CACHE STRING "system on chip type")
+set(ASCEND_CANN_PACKAGE_PATH "/usr/local/Ascend/ascend-toolkit/latest" CACHE PATH "ASCEND CANN package installation directory")
+set(RUN_MODE "npu" CACHE STRING "run mode: npu/sim/cpu")
+
+if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake)
+    set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake)
+elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/ascendc_devkit/tikcpp/samples/cmake)
+    set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/ascendc_devkit/tikcpp/samples/cmake)
+else()
+    message(FATAL_ERROR "ascendc_kernel_cmake does not exist, please check whether the compiler package is installed.")
+endif()
+
+include(${ASCENDC_CMAKE_DIR}/ascendc.cmake)
+
+ascendc_library(ascendc_kernels STATIC
+    threshold_opencv_kernel.cpp
+)
diff --git a/modules/cannops/ascendc_kernels/kernel_tiling_types.h b/modules/cannops/ascendc_kernels/kernel_tiling_types.h
new file mode 100644
index 00000000000..3fbbdd06a63
--- /dev/null
+++ b/modules/cannops/ascendc_kernels/kernel_tiling_types.h
@@ -0,0 +1,22 @@
+#ifndef KERNEL_TILING_H
+#define KERNEL_TILING_H
+
+/*
+ * threshType:
+ *   THRESH_BINARY     = 0,
+ *   THRESH_BINARY_INV = 1,
+ *   THRESH_TRUNC      = 2,
+ *   THRESH_TOZERO     = 3,
+ *   THRESH_TOZERO_INV = 4,
+*/
+#pragma pack(push, 8)
+struct ThresholdOpencvTilingData
+{
+    float maxVal;
+    float thresh;
+    uint32_t totalLength;
+    uint8_t threshType;
+    uint8_t dtype;
+};
+#pragma pack(pop)
+#endif // KERNEL_TILING_H
diff --git a/modules/cannops/ascendc_kernels/threshold_opencv_kernel.cpp b/modules/cannops/ascendc_kernels/threshold_opencv_kernel.cpp
new file mode 100644
index 00000000000..ffab30ebd54
--- /dev/null
+++ b/modules/cannops/ascendc_kernels/threshold_opencv_kernel.cpp
@@ -0,0 +1,387 @@
+#include "kernel_operator.h"
+#include "vector_tiling.h"
+#include "kernel_tiling_types.h"
+
+using namespace AscendC;
+
+// Make the compiler happy. These two functions will never be called.
+__aicore__ static inline void Cast(const LocalTensor<half>& dstLocal,
+                                   const LocalTensor<half>& srcLocal, const RoundMode& round_mode,
+                                   const uint32_t calCount){};
+__aicore__ static inline void Cast(const LocalTensor<float>& dstLocal,
+                                   const LocalTensor<float>& srcLocal, const RoundMode& round_mode,
+                                   const uint32_t calCount){};
+
+/**
+ * T: element type of the input/output data.
+ * C: element type used for the computation.
+ * If T != C, data is cast from T to C before computing and back to T afterwards.
+ */
+template <typename T, typename C>
+class KernelThreshold
+{
+public:
+    __aicore__ inline KernelThreshold() {}
+    __aicore__ inline void Init(ThresholdOpencvTilingData* tiling, GM_ADDR x, GM_ADDR y)
+    {
+        tilingData = tiling;
+
+        /**
+         * Calculate the memory used per element.
+         * 1. InputQueue: sizeof(T) * BUFFER_NUM
+         * 2. OutputQueue: sizeof(T) * BUFFER_NUM
+         * 3. maskBuffer: 1 byte at most.
+         */
+        uint64_t bytesPerElem = sizeof(T) * BUFFER_NUM * 2 + sizeof(uint8_t) * 1;
+
+        /**
+         * If a cast is needed, two more cast buffers are initialized.
+         * Memory used per element:
+         * 1. InputCastBuffer: sizeof(C)
+         * 2. OutputCastBuffer: sizeof(C)
+         */
+        if (!std::is_same<T, C>::value)
+        {
+            bytesPerElem += sizeof(C) * 2;
+        }
+
+        // Most AscendC APIs need 32-byte alignment, but Compare and Select need
+        // 256-byte alignment; 256 / sizeof(C) is how many elements can be processed
+        // in one loop.
+        vecTiling.calculate(tilingData->totalLength, GetBlockNum(), GetBlockIdx(), bytesPerElem,
+                            256 / sizeof(C));
+
+        xGM.SetGlobalBuffer((__gm__ T*)x + vecTiling.blockOffset, vecTiling.blockLength);
+        yGM.SetGlobalBuffer((__gm__ T*)y + vecTiling.blockOffset, vecTiling.blockLength);
+
+        // Cast buffers (only needed when T != C).
+        if (!std::is_same<T, C>::value)
+        {
+            pipe.InitBuffer(InputCastBuffer, vecTiling.loopLength * sizeof(C));
+            pipe.InitBuffer(outputCastBuffer, vecTiling.loopLength * sizeof(C));
+        }
+
+        pipe.InitBuffer(inputQueue, BUFFER_NUM, vecTiling.loopLength * sizeof(T));
+        pipe.InitBuffer(outputQueue, BUFFER_NUM, vecTiling.loopLength * sizeof(T));
+        pipe.InitBuffer(maskBuffer, vecTiling.loopLength * sizeof(uint8_t));
+    }
+
+    __aicore__ inline void Run()
+    {
+        for (uint32_t loop = 0; loop < vecTiling.loopCount; loop++)
+        {
+            uint32_t offset = loop * vecTiling.loopLength;
+            Compute(offset, vecTiling.loopLength);
+        }
+
+        if (vecTiling.loopTailLength != 0)
+        {
+            uint32_t offset = vecTiling.loopCount * vecTiling.loopLength;
+            Compute(offset, vecTiling.loopTailLength);
+        }
+    }
+
+private:
+    __aicore__ inline void Compute(uint32_t offset, uint32_t len)
+    {
+        CopyIn(offset, len);
+
+        // Get local tensors: if a cast is needed, they come from the cast
+        // buffers; otherwise, they come from the input/output queues.
+        LocalTensor<C> xLocal = CastInput(inputQueue, InputCastBuffer, len);
+        LocalTensor<C> yLocal = GetOutput(outputQueue, outputCastBuffer);
+
+        Threshold(xLocal, yLocal, len);
+
+        // Free the local input tensor if it did not come from the cast buffer.
+        FreeInput(inputQueue, xLocal);
+        // Cast the output tensor into the output queue if it came from the cast buffer.
+        CastOutput(outputQueue, yLocal, len);
+
+        CopyOut(offset, len);
+    }
+
+    /**
+     * If a cast is needed:
+     * 1. Dequeue the data from the input queue; it can't be computed on directly.
+     * 2. Get a buffer of type C, which the AscendC APIs can operate on.
+     * 3. Cast the data from T to C.
+     *
+     * If no cast is needed:
+     * 1. Only dequeue the data from the queue.
+     */
+    __aicore__ inline LocalTensor<C> CastInput(TQue<QuePosition::VECIN, BUFFER_NUM>& queue,
+                                               TBuf<TPosition::VECCALC>& buffer, uint32_t len)
+    {
+        LocalTensor<C> xLocal;
+        if (std::is_same<T, C>::value)
+        {
+            xLocal = queue.DeQue<C>();
+        }
+        else
+        {
+            xLocal = buffer.Get<C>();
+            LocalTensor<T> xCast = queue.DeQue<T>();
+            Cast(xLocal, xCast, RoundMode::CAST_NONE, len);
+            queue.FreeTensor(xCast);
+        }
+        return xLocal;
+    }
+
+    /**
+     * If a cast is needed:
+     * 1. Get the local tensor from the cast buffer.
+     *
+     * If no cast is needed:
+     * 1. Allocate the local tensor from the output queue.
+     */
+    __aicore__ inline LocalTensor<C> GetOutput(TQue<QuePosition::VECOUT, BUFFER_NUM>& queue,
+                                               TBuf<TPosition::VECCALC>& buffer)
+    {
+        if (std::is_same<T, C>::value)
+        {
+            return queue.AllocTensor<C>();
+        }
+        else
+        {
+            return buffer.Get<C>();
+        }
+    }
+
+    /**
+     * If a cast is needed:
+     * 1. The input local tensor comes from the cast buffer and need not be freed.
+     *
+     * If no cast is needed:
+     * 1. The input local tensor comes from the input queue and must be freed.
+     */
+    __aicore__ inline void FreeInput(TQue<QuePosition::VECIN, BUFFER_NUM>& queue,
+                                     LocalTensor<C>& xLocal)
+    {
+        if (std::is_same<T, C>::value)
+        {
+            queue.FreeTensor(xLocal);
+        }
+    }
+
+    /**
+     * If a cast is needed:
+     * 1. Allocate a local tensor from the output queue.
+     * 2. Cast from C to T.
+     * 3. Put the cast local tensor into the queue.
+     *
+     * If no cast is needed:
+     * 1. Only put the local tensor into the queue.
+     *
+     */
+    __aicore__ inline void CastOutput(TQue<QuePosition::VECOUT, BUFFER_NUM>& queue,
+                                      LocalTensor<C>& yLocal, uint32_t len)
+    {
+        if (std::is_same<T, C>::value)
+        {
+            queue.EnQue(yLocal);
+        }
+        else
+        {
+            LocalTensor<T> yCast = queue.AllocTensor<T>();
+            RoundMode roundMode = RoundMode::CAST_NONE;
+            // The round mode depends on the target type; refer to the AscendC Cast API.
+            if (std::is_same<T, int16_t>::value)
+            {
+                roundMode = RoundMode::CAST_RINT;
+            }
+            else if (std::is_same<T, int32_t>::value)
+            {
+                roundMode = RoundMode::CAST_ROUND;
+            }
+            Cast(yCast, yLocal, roundMode, len);
+            queue.EnQue(yCast);
+        }
+    }
+
+    __aicore__ inline void CopyIn(uint32_t offset, uint32_t len)
+    {
+        LocalTensor<T> xLocal = inputQueue.AllocTensor<T>();
+        DataCopy(xLocal, xGM[offset], len);
+        inputQueue.EnQue(xLocal);
+    }
+
+    __aicore__ inline void CopyOut(uint32_t offset, uint32_t len)
+    {
+        LocalTensor<T> yLocal = outputQueue.DeQue<T>();
+        DataCopy(yGM[offset], yLocal, len);
+        outputQueue.FreeTensor(yLocal);
+    }
+
+    /**
+     * Wrapper for the AscendC Compare API.
+     * The AscendC Compare level2 API needs the input length aligned to 256;
+     * the tail data is processed by the level0 API.
+     */
+    __aicore__ inline void CompareWrap(const LocalTensor<uint8_t>& dstLocal,
+                                       const LocalTensor<C>& src0Local,
+                                       const LocalTensor<C>& src1Local, CMPMODE cmpMode,
+                                       uint32_t calCount)
+    {
+        // Total element count for one loop inside Compare.
+        uint32_t batchCount = 256 / sizeof(C);
+
+        // Tail element count.
+        uint32_t tailCount = calCount % batchCount;
+
+        // Level2 API: calCount should be aligned to 256.
+        Compare(dstLocal, src0Local, src1Local, cmpMode, calCount - tailCount);
+
+        // Data blocks are already cut aligned to 256, so the tail count will be 0
+        // for all processing loops except the last one.
+        if (tailCount != 0)
+        {
+            BinaryRepeatParams repeatParams = {1, 1, 1, 8, 8, 8};
+            uint32_t tailIdx = calCount - tailCount;
+            uint32_t maskIdx = tailIdx / sizeof(uint8_t);
+            Compare(dstLocal[maskIdx], src0Local[tailIdx], src1Local[tailIdx], cmpMode, tailCount,
+                    1, repeatParams);
+        }
+    }
+
+    /**
+     * Wrapper for the AscendC Select API.
+     * The AscendC Select level2 API needs the input length aligned to 256;
+     * the tail data is processed by the level0 API.
+     */
+    __aicore__ inline void SelectWrap(const LocalTensor<C>& dstLocal,
+                                      const LocalTensor<uint8_t>& selMask,
+                                      const LocalTensor<C>& src0Local, C src1Local, SELMODE selMode,
+                                      uint32_t calCount)
+    {
+        uint32_t batchCount = 256 / sizeof(C);
+        uint32_t tailCount = calCount % batchCount;
+
+        Select(dstLocal, selMask, src0Local, src1Local, selMode, calCount - tailCount);
+        if (tailCount != 0)
+        {
+            BinaryRepeatParams repeatParams = {1, 1, 1, 8, 8, 8};
+            uint32_t tailIdx = calCount - tailCount;
+            uint32_t maskIdx = tailIdx / sizeof(uint8_t);
+            Select(dstLocal[tailIdx], selMask[maskIdx], src0Local[tailIdx], src1Local, selMode,
+                   tailCount, 1, repeatParams);
+        }
+    }
+
+    __aicore__ inline void Threshold(LocalTensor<C>& xLocal, LocalTensor<C>& yLocal, uint32_t len)
+    {
+        LocalTensor<uint8_t> mask = maskBuffer.Get<uint8_t>();
+        Duplicate(yLocal, static_cast<C>(tilingData->thresh), len);
+        switch (tilingData->threshType)
+        {
+            case 0: // THRESH_BINARY
+                CompareWrap(mask, xLocal, yLocal, CMPMODE::LE, len);
+                Duplicate(yLocal, static_cast<C>(0), len);
+                SelectWrap(yLocal, mask, yLocal, static_cast<C>(tilingData->maxVal),
+                           SELMODE::VSEL_TENSOR_SCALAR_MODE, len);
+                break;
+            case 1: // THRESH_BINARY_INV
+                CompareWrap(mask, xLocal, yLocal, CMPMODE::GT, len);
+                Duplicate(yLocal, static_cast<C>(0), len);
+                SelectWrap(yLocal, mask, yLocal, static_cast<C>(tilingData->maxVal),
+                           SELMODE::VSEL_TENSOR_SCALAR_MODE, len);
+                break;
+            case 2: // THRESH_TRUNC
+                CompareWrap(mask, xLocal, yLocal, CMPMODE::LE, len);
+                SelectWrap(yLocal, mask, xLocal, static_cast<C>(tilingData->thresh),
+                           SELMODE::VSEL_TENSOR_SCALAR_MODE, len);
+                break;
+            case 3: // THRESH_TOZERO
+                CompareWrap(mask, xLocal, yLocal, CMPMODE::GT, len);
+                SelectWrap(yLocal, mask, xLocal, static_cast<C>(0),
+                           SELMODE::VSEL_TENSOR_SCALAR_MODE, len);
+                break;
+            case 4: // THRESH_TOZERO_INV
+                CompareWrap(mask, xLocal, yLocal, CMPMODE::LE, len);
+                SelectWrap(yLocal, mask, xLocal, static_cast<C>(0),
+                           SELMODE::VSEL_TENSOR_SCALAR_MODE, len);
+                break;
+            default:
+                break;
+        }
+    }
+
+    TPipe pipe;
+    TQue<QuePosition::VECIN, BUFFER_NUM> inputQueue;
+    TQue<QuePosition::VECOUT, BUFFER_NUM> outputQueue;
+    TBuf<TPosition::VECCALC> InputCastBuffer, outputCastBuffer, maskBuffer;
+
+    GlobalTensor<T> xGM, yGM;
+    VectorTiling vecTiling;
+    ThresholdOpencvTilingData* tilingData;
+};
+
+#define LAUNCH_THRESHOLD_KERNEL(NAME, T, C)                                                      \
+    __aicore__ inline void launch_threshold_kernel_##NAME(ThresholdOpencvTilingData* tilingData, \
+                                                          GM_ADDR x, GM_ADDR y)                  \
+    {                                                                                            \
+        KernelThreshold<T, C> op;                                                                \
+        op.Init(tilingData, x, y);                                                               \
+        op.Run();                                                                                \
+    }
+
+LAUNCH_THRESHOLD_KERNEL(CV_8U, uint8_t, half)   // CV_8U
+LAUNCH_THRESHOLD_KERNEL(CV_8S, int8_t, half)    // CV_8S
+                                                // CV_16U
+LAUNCH_THRESHOLD_KERNEL(CV_16S, int16_t, half)  // CV_16S
+LAUNCH_THRESHOLD_KERNEL(CV_32S, int32_t, float) // CV_32S
+LAUNCH_THRESHOLD_KERNEL(CV_32F, float, float)   // CV_32F
+                                                // CV_64F
+LAUNCH_THRESHOLD_KERNEL(CV_16F, half, half)     // CV_16F
+
+#undef LAUNCH_THRESHOLD_KERNEL
+
+#define CALL_THRESHOLD_KERNEL(NAME) launch_threshold_kernel_##NAME
+
+extern "C" __global__ __aicore__ void threshold_opencv(GM_ADDR tilingGM, GM_ADDR x, GM_ADDR y)
+{
+    ThresholdOpencvTilingData tilingData; // Local copy of the tiling parameters, filled below.
+    auto tempTilingGM = (__gm__ uint8_t*)tilingGM;
+    auto tempTiling = (uint8_t*)&tilingData;
+    for (uint32_t i = 0; i < sizeof(ThresholdOpencvTilingData) / sizeof(uint8_t);
+         ++i, ++tempTilingGM, ++tempTiling)
+    {
+        *tempTiling = *tempTilingGM; // Byte-wise copy from tilingGM into the local struct.
+    }
+
+    // AscendC can only call inline functions; function pointers can't be used here.
+    // Use a macro and a switch-case instead.
+    switch (tilingData.dtype)
+    {
+        case 0: // CV_8U
+            CALL_THRESHOLD_KERNEL(CV_8U)(&tilingData, x, y);
+            break;
+        case 1: // CV_8S
+            CALL_THRESHOLD_KERNEL(CV_8S)(&tilingData, x, y);
+            break;
+        case 3: // CV_16S
+            CALL_THRESHOLD_KERNEL(CV_16S)(&tilingData, x, y);
+            break;
+        case 4: // CV_32S
+            CALL_THRESHOLD_KERNEL(CV_32S)(&tilingData, x, y);
+            break;
+        case 5: // CV_32F
+            CALL_THRESHOLD_KERNEL(CV_32F)(&tilingData, x, y);
+            break;
+        case 7: // CV_16F
+            CALL_THRESHOLD_KERNEL(CV_16F)(&tilingData, x, y);
+            break;
+        case 2: case 6: default: // CV_16U, CV_64F and unknown types: no kernel is launched.
+            break;
+    }
+    // Clear the tiling GM cache manually. (cce compiler bug)
+    dcci(tilingGM, 1);
+}
+
+#ifndef __CCE_KT_TEST__
+void threshold_opencv_kernel(uint32_t blockDim, void* l2ctrl, void* stream, uint8_t* tiling,
+                             uint8_t* x, uint8_t* y)
+{
+    threshold_opencv<<<blockDim, l2ctrl, stream>>>(tiling, x, y);
+}
+#endif
diff --git a/modules/cannops/ascendc_kernels/vector_tiling.h b/modules/cannops/ascendc_kernels/vector_tiling.h
new file mode 100644
index 00000000000..e00dd423c38
--- /dev/null
+++ b/modules/cannops/ascendc_kernels/vector_tiling.h
@@ -0,0 +1,77 @@
+#ifndef TILING_KERNEL_H
+#define TILING_KERNEL_H
+
+#ifdef __CCE_KT_TEST__
+#define __aicore__
+#else
+#define __aicore__ [aicore]
+#endif
+
+inline __aicore__ int32_t AlignNCeil(int32_t n, int32_t align) { return ((n + align) & ~(align-1)); } // NOTE: adds align (not align-1), so an already-aligned n yields n + align; assumes align is a power of two.
+
+inline __aicore__ int32_t AlignNFloor(int32_t n, int32_t align) { return (n & ~(align-1)); } // Clears the low bits of n, i.e. rounds down to a multiple of align; assumes align is a power of two.
+
+constexpr int32_t BUFFER_NUM = 2;
+constexpr int32_t UB_BUF_LEN = 248 * 1024;
+
+struct VectorTiling {
+  __aicore__ inline void calculate(uint64_t _totalLength, uint64_t _blockNum,
+                                   uint64_t _blockIdx, uint64_t _variableBytesPerElem, uint32_t _align) {
+    totalLength = _totalLength;
+    blockNum = _blockNum;
+    blockIdx = _blockIdx;
+    variableBytesPerElem = _variableBytesPerElem;
+    blockLength = 0;
+    blockOffset = 0;
+    align = _align;
+    GetBlockLengthAndOffset();
+    GetLoopLengthAndCount();
+#ifdef __CCE_KT_TEST__
+    std::cout << "Block(" << blockIdx << "): BlockLength = " << blockLength
+              << ", BlockOffset = " << blockOffset
+              << ", LoopLength = " << loopLength
+              << ", LoopCount = " << loopCount
+              << ", LoopTailLength = " << loopTailLength << std::endl;
+#endif
+  }
+
+  __aicore__ inline void GetBlockLengthAndOffset() {
+    // Data should be aligned to 32 (note: applied to the element count here).
+    uint32_t fullBlockLength = AlignNCeil(totalLength / blockNum, 32);
+    // Some cores may get no data after the Align32 ceil.
+    uint32_t fullBlockNum = totalLength / fullBlockLength;
+    uint32_t blockTailLength = totalLength % fullBlockLength;
+
+    if (blockIdx < fullBlockNum) {
+      blockLength = fullBlockLength;
+      blockOffset = blockIdx * blockLength;
+      // The tail block index equals fullBlockNum (presumably always < blockNum; verify).
+    } else if (blockTailLength != 0 && blockIdx == fullBlockNum) {
+      blockLength = blockTailLength;
+      blockOffset = blockIdx * fullBlockLength;
+    }
+  }
+
+  /**
+   * @brief Get the length of one loop and the loop count.
+   * Uses as much of the UB buffer as possible.
+   */
+  __aicore__ inline void GetLoopLengthAndCount() {
+    loopLength = AlignNFloor(UB_BUF_LEN / variableBytesPerElem, align);
+    loopCount = blockLength / loopLength;
+    loopTailLength = blockLength - (loopLength * loopCount);
+  }
+
+  uint64_t totalLength;           // Total element count to split across blocks.
+  uint64_t blockNum;              // Number of blocks the work is split over.
+  uint64_t blockIdx;              // Index of the block these values are computed for.
+  uint64_t variableBytesPerElem;  // Buffer bytes needed per element.
+  uint32_t blockLength;           // Elements handled by this block (0 if it gets none).
+  uint32_t blockOffset;           // Element offset of this block's data.
+  uint32_t loopLength;            // Elements processed per full loop.
+  uint32_t loopCount;             // Number of full loops.
+  uint32_t loopTailLength;        // Elements left over after the full loops.
+  uint32_t align;                 // Element alignment applied to loopLength.
+};
+
+#endif  // TILING_KERNEL_H
diff --git a/modules/cannops/include/opencv2/ascendc_kernels.hpp b/modules/cannops/include/opencv2/ascendc_kernels.hpp
new file mode 100644
index 00000000000..714b6460fd7
--- /dev/null
+++ b/modules/cannops/include/opencv2/ascendc_kernels.hpp
@@ -0,0 +1,7 @@
+#ifndef ASCENDC_KERNELS_H
+#define ASCENDC_KERNELS_H
+
+#include "../../ascendc_kernels/kernel_tiling_types.h"
+#include "aclrtlaunch_threshold_opencv.h"
+
+#endif //ASCENDC_KERNELS_H
diff --git a/modules/cannops/include/opencv2/cann_call.hpp b/modules/cannops/include/opencv2/cann_call.hpp
index 651bff8bba0..e21f339db96 100644
--- a/modules/cannops/include/opencv2/cann_call.hpp
+++ b/modules/cannops/include/opencv2/cann_call.hpp
@@ -9,7 +9,9 @@
 #include <set>
 #include <string>
 #include <acl/acl_base.h>
-#include "opencv2/cann.hpp"
+#include "cann.hpp"
+#include "stream_accessor.hpp"
+#include "ascendc_kernels.hpp"
 
 class aclopAttr;
 
@@ -17,6 +19,15 @@ namespace cv
 {
 namespace cann
 {
+CV_EXPORTS void checkAclError(aclError err, const char* file, const int line, const char* func);
+void checkAclPtr(void* ptr, const char* file, const int line, const char* func);
+#define CV_ACL_SAFE_CALL(expr) checkAclError((expr), __FILE__, __LINE__, CV_Func)
+#define CV_ACL_SAFE_CALL_PTR(expr)                     \
+    ({                                                 \
+        auto ptr = (expr);                             \
+        checkAclPtr(ptr, __FILE__, __LINE__, CV_Func); \
+        ptr;                                           \
+    })
 // Warpper for functions in CANN, callers should not call CANN's api directly, but should call the
 // function provided in cann_call.
 void aclrtMallocWarpper(void** data, size_t size);
@@ -39,7 +50,7 @@ void aclrtMemsetWarpper(std::shared_ptr<uchar>& ptr, int32_t value, size_t count
 //! Type mapping between opencv and cann.
 aclDataType getACLType(int opencvdepth);
 //! Malloc and upload raw data to devices.
-std::shared_ptr<uchar> mallocAndUpload(const void* data, size_t size, AscendStream& stream,
+CV_EXPORTS std::shared_ptr<uchar> mallocAndUpload(const void* data, size_t size, AscendStream& stream,
                                        AscendMat::Allocator* allocator);
 /**
  * @brief Warpper of CANN streams.
@@ -151,6 +162,19 @@ class OperatorRunner
     OperatorRunner& run(AscendStream& stream);
 };
 
+template <typename KERNEL_TYPE, typename TILING_TYPE, typename... ARGS>
+void kernel_launch(KERNEL_TYPE kernel, AscendStream& stream, TILING_TYPE& tiling, ARGS... args)
+{
+    std::shared_ptr<uchar> tilingDevice = // Device-side copy of the tiling struct.
+        mallocAndUpload(&tiling, sizeof(TILING_TYPE), stream, AscendMat::defaultAllocator());
+    aclrtStream rawStream = AscendStreamAccessor::getStream(stream);
+    CV_ACL_SAFE_CALL(kernel(1, rawStream, tilingDevice.get(), args...)); // First argument is fixed to 1 (presumably the block dimension; confirm).
+    if (rawStream == nullptr) // NOTE(review): only a null raw stream is waited on; confirm tilingDevice outlives the launch on other streams.
+    {
+        stream.waitForCompletion();
+    }
+}
+
 } // namespace cann
 } // namespace cv
 
diff --git a/modules/cannops/perf/perf_element_operations.cpp b/modules/cannops/perf/perf_element_operations.cpp
index 0612abe6085..4527346e190 100644
--- a/modules/cannops/perf/perf_element_operations.cpp
+++ b/modules/cannops/perf/perf_element_operations.cpp
@@ -207,5 +207,25 @@ PERF_TEST_P(CPU, MAT_BITWISE_NOT_MAT, testing::Combine(TYPICAL_ASCEND_MAT_SIZES,
     SANITY_CHECK_NOTHING();
 }
 
+PERF_TEST_P(NPU, THRESHOLD_ASCENDC, testing::Combine(TYPICAL_ASCEND_MAT_SIZES, Values(CV_8U, CV_16S, CV_32F)))
+{
+    Mat mat(GET_PARAM(0), GET_PARAM(1));
+    AscendMat dst;
+    AscendMat src;
+    declare.in(mat, WARMUP_RNG); // Declare/fill the host matrix first so the upload below carries that data.
+    src.upload(mat);
+    TEST_CYCLE_N(10) { cv::cann::threshold(src, dst, 100.0, 255.0, cv::THRESH_BINARY); }
+    SANITY_CHECK_NOTHING();
+}
+
+PERF_TEST_P(CPU, THRESHOLD, testing::Combine(TYPICAL_ASCEND_MAT_SIZES, Values(CV_8U, CV_16S, CV_32F)))
+{
+    Mat mat(GET_PARAM(0), GET_PARAM(1));
+    Mat dst;
+    declare.in(mat, WARMUP_RNG);
+    TEST_CYCLE_N(10) { cv::threshold(mat, dst, 100.0, 255.0, cv::THRESH_BINARY); }
+    SANITY_CHECK_NOTHING();
+}
+
 } // namespace
 } // namespace opencv_test
diff --git a/modules/cannops/src/ascend_mat.cpp b/modules/cannops/src/ascend_mat.cpp
index ba17a545bb7..dde838c8d37 100644
--- a/modules/cannops/src/ascend_mat.cpp
+++ b/modules/cannops/src/ascend_mat.cpp
@@ -23,7 +23,11 @@ std::shared_ptr<uchar> DefaultAllocator::allocate(size_t size)
 
 bool DefaultAllocator::allocate(cv::cann::AscendMat* mat, int rows, int cols, size_t elemSize)
 {
-    mat->data = allocate(elemSize * cols * rows);
+    size_t totalBytes = elemSize * cols * rows;
+
+    // align by 32B.
+    totalBytes = ((totalBytes + 32) & ~31);
+    mat->data = allocate(totalBytes);
     mat->step = cols * elemSize;
 
     return true;
diff --git a/modules/cannops/src/cann_call.cpp b/modules/cannops/src/cann_call.cpp
index 3b83052ccbe..97d49d66fd1 100644
--- a/modules/cannops/src/cann_call.cpp
+++ b/modules/cannops/src/cann_call.cpp
@@ -11,7 +11,7 @@ namespace cv
 namespace cann
 {
 /*******************************Acl Error Checker*****************************/
-static inline void checkAclError(aclError err, const char* file, const int line, const char* func)
+void checkAclError(aclError err, const char* file, const int line, const char* func)
 {
     if (ACL_SUCCESS != err)
     {
@@ -20,7 +20,7 @@ static inline void checkAclError(aclError err, const char* file, const int line,
     }
 }
 
-static inline void checkAclPtr(void* ptr, const char* file, const int line, const char* func)
+void checkAclPtr(void* ptr, const char* file, const int line, const char* func)
 {
     if (nullptr == ptr)
     {
@@ -29,14 +29,6 @@ static inline void checkAclPtr(void* ptr, const char* file, const int line, cons
     }
 }
 
-#define CV_ACL_SAFE_CALL(expr) checkAclError((expr), __FILE__, __LINE__, CV_Func)
-#define CV_ACL_SAFE_CALL_PTR(expr)                     \
-    ({                                                 \
-        auto ptr = (expr);                             \
-        checkAclPtr(ptr, __FILE__, __LINE__, CV_Func); \
-        ptr;                                           \
-    })
-
 /******************************Acl Runtime Warpper****************************/
 void aclrtMallocWarpper(void** data, size_t size)
 {
diff --git a/modules/cannops/src/element_operations.cpp b/modules/cannops/src/element_operations.cpp
index 402658369b5..48d9edb596b 100644
--- a/modules/cannops/src/element_operations.cpp
+++ b/modules/cannops/src/element_operations.cpp
@@ -3,6 +3,7 @@
 // of this distribution and at http://opencv.org/license.html.
 
 #include "precomp.hpp"
+
 namespace cv
 {
 namespace cann
@@ -110,8 +111,8 @@ static void convert(const Scalar& src, Scalar& dst, AscendStream& stream)
 }
 
 template <typename T1, typename T2>
-static void arithm_op(const T1& src1, const T2& src2, AscendMat& dst, const AscendMat& mask, float scale,
-                      int dtype, const char* op, AscendStream& stream)
+static void arithm_op(const T1& src1, const T2& src2, AscendMat& dst, const AscendMat& mask,
+                      float scale, int dtype, const char* op, AscendStream& stream)
 {
     T1 castedSrc1;
     T2 castedSrc2;
@@ -170,8 +171,9 @@ static void arithm_op(const T1& src1, const T2& src2, AscendMat& dst, const Asce
     }
 }
 
-static void arithm_op(const InputArray _src1, const InputArray _src2, OutputArray _dst, const InputArray _mask,
-                      float scale, int dtype, const char* op, AscendStream& stream)
+static void arithm_op(const InputArray _src1, const InputArray _src2, OutputArray _dst,
+                      const InputArray _mask, float scale, int dtype, const char* op,
+                      AscendStream& stream)
 {
     const bool isScalar1 = (_src1.kind() == _InputArray::MATX);
     const bool isScalar2 = (_src2.kind() == _InputArray::MATX);
@@ -213,56 +215,54 @@ static void arithm_op(const InputArray _src1, const InputArray _src2, OutputArra
 }
 
 // In order to supply more interfaces, differnet function declaration shoule be done.
-void add(const InputArray src1, const InputArray src2, OutputArray dst, const InputArray mask, int dtype,
-         AscendStream& stream)
+void add(const InputArray src1, const InputArray src2, OutputArray dst, const InputArray mask,
+         int dtype, AscendStream& stream)
 {
     arithm_op(src1, src2, dst, mask, 1, dtype, "Add", stream);
 }
 
-void add(const AscendMat& src1, const AscendMat& src2, AscendMat& dst, const AscendMat& mask, int dtype,
-         AscendStream& stream)
+void add(const AscendMat& src1, const AscendMat& src2, AscendMat& dst, const AscendMat& mask,
+         int dtype, AscendStream& stream)
 {
     arithm_op(src1, src2, dst, mask, 1, dtype, "Add", stream);
 }
 
-void add(const AscendMat& src1, const Scalar& src2, AscendMat& dst, const AscendMat& mask, int dtype,
-         AscendStream& stream)
+void add(const AscendMat& src1, const Scalar& src2, AscendMat& dst, const AscendMat& mask,
+         int dtype, AscendStream& stream)
 {
     arithm_op(src1, src2, dst, mask, 1, dtype, "Add", stream);
 }
 
-void add(const Scalar& src1, const AscendMat& src2, AscendMat& dst, const AscendMat& mask, int dtype,
-         AscendStream& stream)
+void add(const Scalar& src1, const AscendMat& src2, AscendMat& dst, const AscendMat& mask,
+         int dtype, AscendStream& stream)
 {
     arithm_op(src1, src2, dst, mask, 1, dtype, "Add", stream);
 }
 
-
-void subtract(const InputArray src1, const InputArray src2, OutputArray dst, const InputArray mask, int dtype,
-              AscendStream& stream)
+void subtract(const InputArray src1, const InputArray src2, OutputArray dst, const InputArray mask,
+              int dtype, AscendStream& stream)
 {
     arithm_op(src1, src2, dst, mask, 1, dtype, "Sub", stream);
 }
 
-void subtract(const AscendMat& src1, const AscendMat& src2, AscendMat& dst, const AscendMat& mask, int dtype,
-              AscendStream& stream)
+void subtract(const AscendMat& src1, const AscendMat& src2, AscendMat& dst, const AscendMat& mask,
+              int dtype, AscendStream& stream)
 {
     arithm_op(src1, src2, dst, mask, 1, dtype, "Sub", stream);
 }
 
-void subtract(const AscendMat& src1, const Scalar& src2, AscendMat& dst, const AscendMat& mask, int dtype,
-              AscendStream& stream)
+void subtract(const AscendMat& src1, const Scalar& src2, AscendMat& dst, const AscendMat& mask,
+              int dtype, AscendStream& stream)
 {
     arithm_op(src1, src2, dst, mask, 1, dtype, "Sub", stream);
 }
 
-void subtract(const Scalar& src1, const AscendMat& src2, AscendMat& dst, const AscendMat& mask, int dtype,
-              AscendStream& stream)
+void subtract(const Scalar& src1, const AscendMat& src2, AscendMat& dst, const AscendMat& mask,
+              int dtype, AscendStream& stream)
 {
     arithm_op(src1, src2, dst, mask, 1, dtype, "Sub", stream);
 }
 
-
 void multiply(const InputArray src1, const InputArray src2, OutputArray dst, float scale, int dtype,
               AscendStream& stream)
 {
@@ -287,7 +287,6 @@ void multiply(const Scalar& src1, const AscendMat& src2, AscendMat& dst, float s
     arithm_op(src1, src2, dst, AscendMat(), scale, dtype, "Mul", stream);
 }
 
-
 void divide(const InputArray src1, const InputArray src2, OutputArray dst, float scale, int dtype,
             AscendStream& stream)
 {
@@ -312,15 +311,14 @@ void divide(const Scalar& src1, const AscendMat& src2, AscendMat& dst, float sca
     arithm_op(src1, src2, dst, AscendMat(), scale, dtype, "RealDiv", stream);
 }
 
-
-void bitwise_and(const InputArray src1, const InputArray src2, OutputArray dst, const InputArray mask,
-                 AscendStream& stream)
+void bitwise_and(const InputArray src1, const InputArray src2, OutputArray dst,
+                 const InputArray mask, AscendStream& stream)
 {
     arithm_op(src1, src2, dst, mask, 1, -1, "BitwiseAnd", stream);
 }
 
-void bitwise_and(const AscendMat& src1, const AscendMat& src2, AscendMat& dst, const AscendMat& mask,
-                 AscendStream& stream)
+void bitwise_and(const AscendMat& src1, const AscendMat& src2, AscendMat& dst,
+                 const AscendMat& mask, AscendStream& stream)
 {
     arithm_op(src1, src2, dst, mask, 1, -1, "BitwiseAnd", stream);
 }
@@ -337,9 +335,8 @@ void bitwise_and(const Scalar& src1, const AscendMat& src2, AscendMat& dst, cons
     arithm_op(src1, src2, dst, mask, 1, -1, "BitwiseAnd", stream);
 }
 
-
-void bitwise_or(const InputArray src1, const InputArray src2, OutputArray dst, const InputArray mask,
-                AscendStream& stream)
+void bitwise_or(const InputArray src1, const InputArray src2, OutputArray dst,
+                const InputArray mask, AscendStream& stream)
 {
     arithm_op(src1, src2, dst, mask, 1, -1, "BitwiseOr", stream);
 }
@@ -362,15 +359,14 @@ void bitwise_or(const Scalar& src1, const AscendMat& src2, AscendMat& dst, const
     arithm_op(src1, src2, dst, mask, 1, -1, "BitwiseOr", stream);
 }
 
-
-void bitwise_xor(const InputArray src1, const InputArray src2, OutputArray dst, const InputArray mask,
-                 AscendStream& stream)
+void bitwise_xor(const InputArray src1, const InputArray src2, OutputArray dst,
+                 const InputArray mask, AscendStream& stream)
 {
     arithm_op(src1, src2, dst, mask, 1, -1, "BitwiseXor", stream);
 }
 
-void bitwise_xor(const AscendMat& src1, const AscendMat& src2, AscendMat& dst, const AscendMat& mask,
-                 AscendStream& stream)
+void bitwise_xor(const AscendMat& src1, const AscendMat& src2, AscendMat& dst,
+                 const AscendMat& mask, AscendStream& stream)
 {
     arithm_op(src1, src2, dst, mask, 1, -1, "BitwiseXor", stream);
 }
@@ -387,7 +383,6 @@ void bitwise_xor(const Scalar& src1, const AscendMat& src2, AscendMat& dst, cons
     arithm_op(src1, src2, dst, mask, 1, -1, "BitwiseXor", stream);
 }
 
-
 void bitwise_not(const InputArray src, OutputArray dst, const InputArray mask, AscendStream& stream)
 {
     arithm_op(src, noArray(), dst, mask, 1, -1, "Invert", stream);
@@ -398,9 +393,8 @@ void bitwise_not(const AscendMat& src, AscendMat& dst, const AscendMat& mask, As
     arithm_op(src, AscendMat(), dst, mask, 1, -1, "Invert", stream);
 }
 
-
-void addWeighted(const AscendMat& src1, double alpha, const AscendMat& src2, double beta, double gamma,
-                 AscendMat& dst, int dtype, AscendStream& stream)
+void addWeighted(const AscendMat& src1, double alpha, const AscendMat& src2, double beta,
+                 double gamma, AscendMat& dst, int dtype, AscendStream& stream)
 {
     if (dtype < 0)
         dtype = src1.depth();
@@ -421,8 +415,8 @@ void addWeighted(const AscendMat& src1, double alpha, const AscendMat& src2, dou
     arithm_op(srcWeightedSumRet, (float)gamma, dst, "Adds", stream);
 }
 
-void addWeighted(const InputArray _src1, double alpha, const InputArray _src2, double beta, double gamma,
-                 OutputArray _dst, int dtype, AscendStream& stream)
+void addWeighted(const InputArray _src1, double alpha, const InputArray _src2, double beta,
+                 double gamma, OutputArray _dst, int dtype, AscendStream& stream)
 {
     AscendMat src1, src2, dst;
     src1.upload(_src1, stream);
@@ -442,45 +436,23 @@ double threshold(const AscendMat& src, AscendMat& dst, double thresh, double max
 
     dst.create(src.rows, src.cols, src.type());
 
-    OperatorRunner runner;
-    runner.setOp("Threshold")
-        .addInput(src, "x")
-        .addOutput(threshMat, "y")
-        .addAttr((float)thresh, "threshold")
-        .run(stream);
-
-    // THRESH_*_INV, THRESH_TRUNC need a inverse threshMat.
-    // THRESH_BINARY_INV = 1, THRESH_TRUNC = 2, THRESH_TOZERO_INV = 4,
-    if (type == 1 || type == 2 || type == 4)
+    if (src.depth() == CV_8U || src.depth() == CV_8S || src.depth() == CV_16S ||
+        src.depth() == CV_32S || src.depth() == CV_32F || src.depth() == CV_16F)
     {
-        AscendMat threshInvMat(src.size(), src.type());
-        AscendMat ones(src.size(), src.type());
-        Scalar s(1, 1, 1, 1);
-        ones.setTo(s, stream);
-        arithm_op(ones, threshMat, threshInvMat, "Sub", stream);
-
-        if (type == 1)
-            arithm_op(threshInvMat, (float)maxval, dst, "Muls", stream);
-        else if (type == 2)
-        {
-            AscendMat ToZeroInvMat(src.size(), src.type());
-            AscendMat TruncMat(src.size(), src.type());
-            arithm_op(threshInvMat, src, ToZeroInvMat, "Mul", stream);
-            arithm_op(threshMat, (float)thresh, TruncMat, "Muls", stream);
-            arithm_op(ToZeroInvMat, TruncMat, dst, "Add", stream);
-        }
-        else
-            arithm_op(threshInvMat, src, dst, "Mul", stream);
+        ThresholdOpencvTilingData tiling;
+        tiling.maxVal = maxval;
+        tiling.thresh = thresh;
+        // AscendMat memory will be align to 32B, it's safe to set totalLengh a little bigger.
+        size_t totalBytes = src.rows * src.cols * src.channels();
+        tiling.totalLength = ((totalBytes + 32) & ~31);
+        tiling.threshType = type;
+        tiling.dtype = src.depth();
+
+        kernel_launch(aclrtlaunch_threshold_opencv, stream, tiling, src.data.get(), dst.data.get());
     }
     else
-    {
-        if (type == 0) /* THRESH_BINARY = 0 */
-            arithm_op(threshMat, (float)maxval, dst, "Muls", stream);
-        else if (type == 3) /* THRESH_TOZERO = 3 */
-            arithm_op(threshMat, src, dst, "Mul", stream);
-        else
-            CV_Error(Error::StsError, "Unknown/unsupported threshold type");
-    }
+        CV_Error(Error::StsUnsupportedFormat, "");
+
     return thresh;
 }
 
diff --git a/modules/cannops/src/precomp.hpp b/modules/cannops/src/precomp.hpp
index 8411cc40407..53ed398fde3 100644
--- a/modules/cannops/src/precomp.hpp
+++ b/modules/cannops/src/precomp.hpp
@@ -10,5 +10,6 @@
 #include "opencv2/cann_call.hpp"
 #include "opencv2/cann_interface.hpp"
 #include "opencv2/cann_private.hpp"
+#include "opencv2/ascendc_kernels.hpp"
 
 #endif /* __OPENCV_PRECOMP_H__ */
diff --git a/modules/cannops/test/test_element_operations.cpp b/modules/cannops/test/test_element_operations.cpp
index 76c103a65f4..730d2912d95 100644
--- a/modules/cannops/test/test_element_operations.cpp
+++ b/modules/cannops/test/test_element_operations.cpp
@@ -678,7 +678,6 @@ TEST(ELEMENTWISE_OP, MAT_THRESHOLD)
     for (int i = 0; i <= 4; i++)
     {
         cv::threshold(cpuMat, cpuOpRet, 128, 250, i);
-        // TODO find the reason empty AscendMat is not continuous.
         cv::cann::threshold(ascendMat16F, aclOpRet, 128, 250, i);
         aclOpRet.convertTo(aclOpRet16S, CV_16S);
         aclOpRet16S.download(checker);
@@ -693,5 +692,37 @@ TEST(ELEMENTWISE_OP, MAT_THRESHOLD)
     cv::cann::resetDevice();
 }
 
+TEST(ELEMENTWISE_OP, MAT_THRESHOLD_ASCENDC)
+{
+    cv::cann::setDevice(DEVICE_ID);
+    Mat cpuRet, npuRet;
+    AscendMat npuImg, npuTmpMat;
+
+    // OpenCV does not support CV_8S, CV_32S, CV_16F
+    // Ascend does not support CV_16U, CV_64F
+    uint8_t dtypes[] = {CV_8U, CV_16S, CV_32F};
+
+    for (uint i = 0; i <= 4; i++)
+    {
+        for (uint j = 0; j < sizeof(dtypes) / sizeof(dtypes[0]); j++)
+        {
+            double thresh = 90.5;
+            double maxVal = 85.2;
+
+            Mat img = randomMat(10, 10, CV_MAKETYPE(dtypes[j], 3), 0.0f, 128.0f);
+            npuImg.upload(img);
+            npuTmpMat.create(npuImg.rows, npuImg.cols, npuImg.type());
+
+            cv::threshold(img, cpuRet, thresh, maxVal, i);
+            cv::cann::threshold(npuImg, npuTmpMat, thresh, maxVal, i);
+
+            npuTmpMat.download(npuRet);
+            EXPECT_MAT_NEAR(cpuRet, npuRet, 10.0f);
+        }
+    }
+
+    cv::cann::resetDevice();
+}
+
 } // namespace
 } // namespace opencv_test
diff --git a/modules/cannops/test/test_kernel.cpp b/modules/cannops/test/test_kernel.cpp
new file mode 100644
index 00000000000..ac0996a27b6
--- /dev/null
+++ b/modules/cannops/test/test_kernel.cpp
@@ -0,0 +1,51 @@
+#include "test_precomp.hpp"
+#include "opencv2/cann_call.hpp"
+
+namespace opencv_test
+{
+namespace
+{
+
+TEST(ASCENDC_KERNEL, THRESHOLD)
+{
+    cv::cann::setDevice(DEVICE_ID);
+    Mat cpuRet, npuRet;
+    AscendMat npuImg, npuTmpMat;
+
+    // OpenCV does not support CV_8S, CV_32S, CV_16F
+    // Ascend does not support CV_16U, CV_64F
+    uint8_t dtypes[] = {CV_8U, CV_16S, CV_32F};
+
+    for (uint i = 0; i <= 4; i++)
+    {
+        for (uint j = 0; j < sizeof(dtypes) / sizeof(dtypes[0]); j++)
+        {
+            double thresh = 90.5;
+            double maxVal = 85.2;
+
+            Mat img = randomMat(10, 10, CV_MAKETYPE(dtypes[j], 3), 0.0f, 128.0f);
+            npuImg.upload(img);
+            npuTmpMat.create(npuImg.rows, npuImg.cols, npuImg.type());
+
+            cv::threshold(img, cpuRet, thresh, maxVal, i);
+            ThresholdOpencvTilingData tiling;
+            tiling.maxVal = maxVal;
+            tiling.thresh = thresh;
+            size_t totalBytes = img.rows * img.cols * img.channels();
+            // AscendMat memory is aligned to 32B, so it's safe to set totalLength a little bigger.
+            tiling.totalLength = ((totalBytes + 32) & ~31);
+            tiling.threshType = i;
+            tiling.dtype = dtypes[j];
+            kernel_launch(aclrtlaunch_threshold_opencv, AscendStream::Null(), tiling,
+                          npuImg.data.get(), npuTmpMat.data.get());
+
+            npuTmpMat.download(npuRet);
+            EXPECT_MAT_NEAR(cpuRet, npuRet, 10.0f);
+        }
+    }
+
+    cv::cann::resetDevice();
+}
+
+} // namespace
+} // namespace opencv_test
diff --git a/modules/cannops/test/test_precomp.hpp b/modules/cannops/test/test_precomp.hpp
index f7bdbea0b08..74cfcb11ee9 100644
--- a/modules/cannops/test/test_precomp.hpp
+++ b/modules/cannops/test/test_precomp.hpp
@@ -9,6 +9,7 @@
 #include "opencv2/cann.hpp"
 #include "opencv2/ts/cuda_test.hpp"
 #include "opencv2/cann_interface.hpp"
+#include "opencv2/ascendc_kernels.hpp"
 
 using namespace cv;
 using namespace cv::cann;

From 09ed18d2802d892755ff168d9b6677c462ae4ec2 Mon Sep 17 00:00:00 2001
From: MengqingCao <cmq0113@163.com>
Date: Wed, 21 Feb 2024 10:33:09 +0800
Subject: [PATCH 2/4] Remove redundant code

1. remove threshold_opencv_kernel
2. fix typo in the ASCENDC_KERNELS_H include guard
3. add ALIGN_UP macro
---
 .../cannops/ascendc_kernels/threshold_opencv_kernel.cpp   | 8 --------
 modules/cannops/include/opencv2/ascendc_kernels.hpp       | 2 +-
 modules/cannops/src/element_operations.cpp                | 2 +-
 modules/cannops/src/precomp.hpp                           | 1 +
 4 files changed, 3 insertions(+), 10 deletions(-)

diff --git a/modules/cannops/ascendc_kernels/threshold_opencv_kernel.cpp b/modules/cannops/ascendc_kernels/threshold_opencv_kernel.cpp
index ffab30ebd54..7fa1867c8b1 100644
--- a/modules/cannops/ascendc_kernels/threshold_opencv_kernel.cpp
+++ b/modules/cannops/ascendc_kernels/threshold_opencv_kernel.cpp
@@ -377,11 +377,3 @@ extern "C" __global__ __aicore__ void threshold_opencv(GM_ADDR tilingGM, GM_ADDR
     // Clear tiling GM cache manually. (cce compiler bug)
     dcci(tilingGM, 1);
 }
-
-#ifndef __CCE_KT_TEST__
-void threshold_opencv_kernel(uint32_t blockDim, void* l2ctrl, void* stream, uint8_t* tiling,
-                             uint8_t* x, uint8_t* y)
-{
-    threshold_opencv<<<blockDim, l2ctrl, stream>>>(tiling, x, y);
-}
-#endif
diff --git a/modules/cannops/include/opencv2/ascendc_kernels.hpp b/modules/cannops/include/opencv2/ascendc_kernels.hpp
index 714b6460fd7..b030920a62f 100644
--- a/modules/cannops/include/opencv2/ascendc_kernels.hpp
+++ b/modules/cannops/include/opencv2/ascendc_kernels.hpp
@@ -1,5 +1,5 @@
 #ifndef ASCENDC_KERNELS_H
-#define KERNEL_TILINASCENDC_KERNELS_HG_H
+#define ASCENDC_KERNELS_H
 
 #include "../../ascendc_kernels/kernel_tiling_types.h"
 #include "aclrtlaunch_threshold_opencv.h"
diff --git a/modules/cannops/src/element_operations.cpp b/modules/cannops/src/element_operations.cpp
index 48d9edb596b..cacf6e6cff1 100644
--- a/modules/cannops/src/element_operations.cpp
+++ b/modules/cannops/src/element_operations.cpp
@@ -444,7 +444,7 @@ double threshold(const AscendMat& src, AscendMat& dst, double thresh, double max
         tiling.thresh = thresh;
         // AscendMat memory will be align to 32B, it's safe to set totalLengh a little bigger.
         size_t totalBytes = src.rows * src.cols * src.channels();
-        tiling.totalLength = ((totalBytes + 32) & ~31);
+        tiling.totalLength = ALIGN_UP(totalBytes, 32);
         tiling.threshType = type;
         tiling.dtype = src.depth();
 
diff --git a/modules/cannops/src/precomp.hpp b/modules/cannops/src/precomp.hpp
index 53ed398fde3..8aadaf4d8de 100644
--- a/modules/cannops/src/precomp.hpp
+++ b/modules/cannops/src/precomp.hpp
@@ -11,5 +11,6 @@
 #include "opencv2/cann_interface.hpp"
 #include "opencv2/cann_private.hpp"
 #include "opencv2/ascendc_kernels.hpp"
+#define ALIGN_UP(num, align) (((num) + (align) - 1) & ~((align) - 1))
 
 #endif /* __OPENCV_PRECOMP_H__ */

From 1ad3ad5f856aac752403bf7725e75b491aa4d975 Mon Sep 17 00:00:00 2001
From: Dhanwanth1803 <147172285+Dhanwanth1803@users.noreply.github.com>
Date: Tue, 5 Mar 2024 15:17:19 +0530
Subject: [PATCH 3/4] Merge pull request #3646 from Dhanwanth1803:latch

Fixes #25081:Latch input corruption fix #3646

Fixes https://github.com/opencv/opencv/issues/25081

As mentioned by WennPaper, changing it to `grayImage = image.clone();` makes a deep copy, which prevents the `InputArray` from being modified.

- [X] I agree to contribute to the project under Apache 2 License.
- [X] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [X] The PR is proposed to the proper branch
- [X] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake

---
Co-authored-by: Dhanwanth1803 <dhanwanthvarala@gmail.com>
Co-authored-by: Dmitry Kurtaev <dmitry.kurtaev@gmail.com>
---
 modules/xfeatures2d/src/latch.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/xfeatures2d/src/latch.cpp b/modules/xfeatures2d/src/latch.cpp
index da5041fa0ae..49cd8f58947 100644
--- a/modules/xfeatures2d/src/latch.cpp
+++ b/modules/xfeatures2d/src/latch.cpp
@@ -519,7 +519,7 @@ namespace cv
             switch (image.type())
             {
             case CV_8UC1:
-                grayImage = image;
+                grayImage = sigma_ ? image.clone() : image;
                 break;
             case CV_8UC3:
                 cvtColor(image, grayImage, COLOR_BGR2GRAY);

From 1aaf6e1c8ba7472741adb0f90b6d11d2aeeeae1d Mon Sep 17 00:00:00 2001
From: Maksim Shabunin <maksim.shabunin@gmail.com>
Date: Tue, 5 Mar 2024 16:20:40 +0300
Subject: [PATCH 4/4] Merge pull request #3638 from mshabunin:doc-upgrade

Documentation transition to fresh Doxygen #3638

Merge with https://github.com/opencv/opencv/pull/25042
---
 .../include/opencv2/bioinspired/retina.hpp    | 51 +----------
 .../samples/default_retina_config.xml         | 24 ++++++
 .../samples/realistic_retina_config.xml       | 24 ++++++
 .../retina_model/retina_model.markdown        | 12 +--
 modules/cannops/include/opencv2/cann.hpp      |  4 +-
 .../include/opencv2/cann_interface.hpp        |  4 +-
 .../include/opencv2/cudaimgproc.hpp           |  1 -
 .../tutorials/benchmark/sr_benchmark.markdown | 29 ++-----
 .../face/include/opencv2/face/facemark.hpp    |  9 +-
 .../include/opencv2/face/facemark_train.hpp   |  6 --
 .../face_landmark_trainer.markdown            |  6 +-
 modules/fuzzy/include/opencv2/fuzzy.hpp       | 10 +--
 modules/hdf/include/opencv2/hdf.hpp           | 14 ++-
 .../mcc/include/opencv2/mcc/checker_model.hpp |  1 -
 modules/rgbd/include/opencv2/rgbd/dynafu.hpp  |  7 +-
 modules/sfm/include/opencv2/sfm.hpp           | 17 ++--
 .../opencv2/stereo/quasi_dense_stereo.hpp     |  6 +-
 modules/text/include/opencv2/text.hpp         | 86 +++++++++----------
 modules/text/include/opencv2/text/ocr.hpp     |  1 -
 .../videostab/include/opencv2/videostab.hpp   | 32 ++++---
 modules/viz/include/opencv2/viz.hpp           | 29 +++----
 .../include/opencv2/xfeatures2d.hpp           | 13 ++-
 .../include/opencv2/xfeatures2d/nonfree.hpp   |  3 +
 modules/ximgproc/include/opencv2/ximgproc.hpp | 36 ++++----
 .../include/opencv2/ximgproc/color_match.hpp  |  2 +
 .../opencv2/ximgproc/deriche_filter.hpp       |  2 +
 .../ximgproc/edgepreserving_filter.hpp        |  4 +-
 .../opencv2/ximgproc/fast_hough_transform.hpp |  3 +-
 .../opencv2/ximgproc/paillou_filter.hpp       |  2 +
 .../include/opencv2/ximgproc/peilin.hpp       |  2 +
 .../ximgproc/run_length_morphology.hpp        |  2 +
 31 files changed, 214 insertions(+), 228 deletions(-)
 create mode 100644 modules/bioinspired/samples/default_retina_config.xml
 create mode 100644 modules/bioinspired/samples/realistic_retina_config.xml

diff --git a/modules/bioinspired/include/opencv2/bioinspired/retina.hpp b/modules/bioinspired/include/opencv2/bioinspired/retina.hpp
index 8e6eda93cae..478b6a0f75c 100644
--- a/modules/bioinspired/include/opencv2/bioinspired/retina.hpp
+++ b/modules/bioinspired/include/opencv2/bioinspired/retina.hpp
@@ -94,57 +94,12 @@ enum {
     Here is the default configuration file of the retina module. It gives results such as the first
     retina output shown on the top of this page.
 
-    @code{xml}
-    <?xml version="1.0"?>
-    <opencv_storage>
-    <OPLandIPLparvo>
-        <colorMode>1</colorMode>
-        <normaliseOutput>1</normaliseOutput>
-        <photoreceptorsLocalAdaptationSensitivity>7.5e-01</photoreceptorsLocalAdaptationSensitivity>
-        <photoreceptorsTemporalConstant>9.0e-01</photoreceptorsTemporalConstant>
-        <photoreceptorsSpatialConstant>5.3e-01</photoreceptorsSpatialConstant>
-        <horizontalCellsGain>0.01</horizontalCellsGain>
-        <hcellsTemporalConstant>0.5</hcellsTemporalConstant>
-        <hcellsSpatialConstant>7.</hcellsSpatialConstant>
-        <ganglionCellsSensitivity>7.5e-01</ganglionCellsSensitivity></OPLandIPLparvo>
-    <IPLmagno>
-        <normaliseOutput>1</normaliseOutput>
-        <parasolCells_beta>0.</parasolCells_beta>
-        <parasolCells_tau>0.</parasolCells_tau>
-        <parasolCells_k>7.</parasolCells_k>
-        <amacrinCellsTemporalCutFrequency>2.0e+00</amacrinCellsTemporalCutFrequency>
-        <V0CompressionParameter>9.5e-01</V0CompressionParameter>
-        <localAdaptintegration_tau>0.</localAdaptintegration_tau>
-        <localAdaptintegration_k>7.</localAdaptintegration_k></IPLmagno>
-    </opencv_storage>
-    @endcode
+    @include default_retina_config.xml
 
     Here is the 'realistic" setup used to obtain the second retina output shown on the top of this page.
 
-    @code{xml}
-    <?xml version="1.0"?>
-    <opencv_storage>
-    <OPLandIPLparvo>
-      <colorMode>1</colorMode>
-      <normaliseOutput>1</normaliseOutput>
-      <photoreceptorsLocalAdaptationSensitivity>8.9e-01</photoreceptorsLocalAdaptationSensitivity>
-      <photoreceptorsTemporalConstant>9.0e-01</photoreceptorsTemporalConstant>
-      <photoreceptorsSpatialConstant>5.3e-01</photoreceptorsSpatialConstant>
-      <horizontalCellsGain>0.3</horizontalCellsGain>
-      <hcellsTemporalConstant>0.5</hcellsTemporalConstant>
-      <hcellsSpatialConstant>7.</hcellsSpatialConstant>
-      <ganglionCellsSensitivity>8.9e-01</ganglionCellsSensitivity></OPLandIPLparvo>
-    <IPLmagno>
-      <normaliseOutput>1</normaliseOutput>
-      <parasolCells_beta>0.</parasolCells_beta>
-      <parasolCells_tau>0.</parasolCells_tau>
-      <parasolCells_k>7.</parasolCells_k>
-      <amacrinCellsTemporalCutFrequency>2.0e+00</amacrinCellsTemporalCutFrequency>
-      <V0CompressionParameter>9.5e-01</V0CompressionParameter>
-      <localAdaptintegration_tau>0.</localAdaptintegration_tau>
-      <localAdaptintegration_k>7.</localAdaptintegration_k></IPLmagno>
-    </opencv_storage>
-    @endcode
+    @include realistic_retina_config.xml
+
       */
     struct RetinaParameters{
         //! Outer Plexiform Layer (OPL) and Inner Plexiform Layer Parvocellular (IplParvo) parameters
diff --git a/modules/bioinspired/samples/default_retina_config.xml b/modules/bioinspired/samples/default_retina_config.xml
new file mode 100644
index 00000000000..469b5d58f10
--- /dev/null
+++ b/modules/bioinspired/samples/default_retina_config.xml
@@ -0,0 +1,24 @@
+<?xml version="1.0"?>
+<opencv_storage>
+<OPLandIPLparvo>
+    <colorMode>1</colorMode>
+    <normaliseOutput>1</normaliseOutput>
+    <photoreceptorsLocalAdaptationSensitivity>7.5e-01</photoreceptorsLocalAdaptationSensitivity>
+    <photoreceptorsTemporalConstant>9.0e-01</photoreceptorsTemporalConstant>
+    <photoreceptorsSpatialConstant>5.3e-01</photoreceptorsSpatialConstant>
+    <horizontalCellsGain>0.01</horizontalCellsGain>
+    <hcellsTemporalConstant>0.5</hcellsTemporalConstant>
+    <hcellsSpatialConstant>7.</hcellsSpatialConstant>
+    <ganglionCellsSensitivity>7.5e-01</ganglionCellsSensitivity>
+</OPLandIPLparvo>
+<IPLmagno>
+    <normaliseOutput>1</normaliseOutput>
+    <parasolCells_beta>0.</parasolCells_beta>
+    <parasolCells_tau>0.</parasolCells_tau>
+    <parasolCells_k>7.</parasolCells_k>
+    <amacrinCellsTemporalCutFrequency>2.0e+00</amacrinCellsTemporalCutFrequency>
+    <V0CompressionParameter>9.5e-01</V0CompressionParameter>
+    <localAdaptintegration_tau>0.</localAdaptintegration_tau>
+    <localAdaptintegration_k>7.</localAdaptintegration_k>
+</IPLmagno>
+</opencv_storage>
diff --git a/modules/bioinspired/samples/realistic_retina_config.xml b/modules/bioinspired/samples/realistic_retina_config.xml
new file mode 100644
index 00000000000..c02e79b3c6d
--- /dev/null
+++ b/modules/bioinspired/samples/realistic_retina_config.xml
@@ -0,0 +1,24 @@
+<?xml version="1.0"?>
+<opencv_storage>
+<OPLandIPLparvo>
+    <colorMode>1</colorMode>
+    <normaliseOutput>1</normaliseOutput>
+    <photoreceptorsLocalAdaptationSensitivity>8.9e-01</photoreceptorsLocalAdaptationSensitivity>
+    <photoreceptorsTemporalConstant>9.0e-01</photoreceptorsTemporalConstant>
+    <photoreceptorsSpatialConstant>5.3e-01</photoreceptorsSpatialConstant>
+    <horizontalCellsGain>0.3</horizontalCellsGain>
+    <hcellsTemporalConstant>0.5</hcellsTemporalConstant>
+    <hcellsSpatialConstant>7.</hcellsSpatialConstant>
+    <ganglionCellsSensitivity>8.9e-01</ganglionCellsSensitivity>
+</OPLandIPLparvo>
+<IPLmagno>
+    <normaliseOutput>1</normaliseOutput>
+    <parasolCells_beta>0.</parasolCells_beta>
+    <parasolCells_tau>0.</parasolCells_tau>
+    <parasolCells_k>7.</parasolCells_k>
+    <amacrinCellsTemporalCutFrequency>2.0e+00</amacrinCellsTemporalCutFrequency>
+    <V0CompressionParameter>9.5e-01</V0CompressionParameter>
+    <localAdaptintegration_tau>0.</localAdaptintegration_tau>
+    <localAdaptintegration_k>7.</localAdaptintegration_k>
+</IPLmagno>
+</opencv_storage>
diff --git a/modules/bioinspired/tutorials/retina_model/retina_model.markdown b/modules/bioinspired/tutorials/retina_model/retina_model.markdown
index 37285bfa1c7..d71fe797bec 100644
--- a/modules/bioinspired/tutorials/retina_model/retina_model.markdown
+++ b/modules/bioinspired/tutorials/retina_model/retina_model.markdown
@@ -1,6 +1,8 @@
 Retina and real-world vision {#tutorial_bioinspired_retina_model}
 =============================================================
 
+@tableofcontents
+
 Goal
 ----
 
@@ -382,7 +384,7 @@ need to know if mean luminance information is required or not. If not, the the r
 significantly reduce its energy thus giving more visibility to higher spatial frequency details.
 
 
-#### Basic parameters
+## Basic parameters
 
 The simplest parameters are as follows :
 
@@ -397,7 +399,7 @@ processing. You can expect much faster processing using gray levels : it would r
 product per pixel for all of the retina processes and it has recently been parallelized for multicore
 architectures.
 
-#### Photo-receptors parameters
+## Photo-receptors parameters
 
 The following parameters act on the entry point of the retina - photo-receptors - and has impact on all
  of the following processes. These sensors are low pass spatio-temporal filters that smooth temporal and
@@ -421,7 +423,7 @@ and high frequency noise canceling.
     A good compromise for color images is a 0.53 value since such choice won't affect too much the color spectrum.
     Higher values would lead to gray and blurred output images.
 
-#### Horizontal cells parameters
+## Horizontal cells parameters
 
 This parameter set tunes the neural network connected to the photo-receptors, the horizontal cells.
 It modulates photo-receptors sensitivity and completes the processing for final spectral whitening
@@ -446,7 +448,7 @@ It modulates photo-receptors sensitivity and completes the processing for final
 and luminance is already partly enhanced. The following parameters act on the last processing stages
 of the two outing retina signals.
 
-#### Parvo (details channel) dedicated parameter
+## Parvo (details channel) dedicated parameter
 
 -   **ganglionCellsSensitivity** specifies the strength of the final local adaptation occurring at
     the output of this details' dedicated channel. Parameter values remain between 0 and 1. Low value
@@ -455,7 +457,7 @@ of the two outing retina signals.
 **Note :** this parameter can correct eventual burned images by favoring low energetic details of
 the visual scene, even in bright areas.
 
-#### IPL Magno (motion/transient channel) parameters
+## IPL Magno (motion/transient channel) parameters
 
 Once image's information are cleaned, this channel acts as a high pass temporal filter that
 selects only the signals related to transient signals (events, motion, etc.). A low pass spatial filter
diff --git a/modules/cannops/include/opencv2/cann.hpp b/modules/cannops/include/opencv2/cann.hpp
index 30555dd8257..bd351481624 100644
--- a/modules/cannops/include/opencv2/cann.hpp
+++ b/modules/cannops/include/opencv2/cann.hpp
@@ -8,12 +8,12 @@
 #include "opencv2/core.hpp"
 
 /**
-  @defgroup cann Ascend-accelerated Computer Vision
+  @defgroup cannops Ascend-accelerated Computer Vision
   @{
     @defgroup canncore Core part
     @{
       @defgroup cann_struct Data Structures
-      @defgroup cann_init Initializeation and Information
+      @defgroup cann_init Initialization and Information
     @}
   @}
  */
diff --git a/modules/cannops/include/opencv2/cann_interface.hpp b/modules/cannops/include/opencv2/cann_interface.hpp
index 6667eb58519..6b13090f4f1 100644
--- a/modules/cannops/include/opencv2/cann_interface.hpp
+++ b/modules/cannops/include/opencv2/cann_interface.hpp
@@ -13,9 +13,9 @@ namespace cann
 {
 
 /**
-  @addtogroup cann
+  @addtogroup cannops
   @{
-    @defgroup cannops Operations for Ascend Backend.
+    @defgroup cannops_ops Operations for Ascend Backend.
     @{
         @defgroup cannops_elem Per-element Operations
         @defgroup cannops_core Core Operations on Matrices
diff --git a/modules/cudaimgproc/include/opencv2/cudaimgproc.hpp b/modules/cudaimgproc/include/opencv2/cudaimgproc.hpp
index d72700168cd..01e7c41ca9a 100644
--- a/modules/cudaimgproc/include/opencv2/cudaimgproc.hpp
+++ b/modules/cudaimgproc/include/opencv2/cudaimgproc.hpp
@@ -844,7 +844,6 @@ cv::Moments cvMoments = convertSpatialMoments<float>(spatialMoments, order);
 ```
 
 see the \a CUDA_TEST_P(Moments, Async) test inside opencv_contrib_source_code/modules/cudaimgproc/test/test_moments.cpp for an example.
-@returns cv::Moments.
 @sa cuda::moments, cuda::convertSpatialMoments, cuda::numMoments, cuda::MomentsOrder
 */
 CV_EXPORTS_W void spatialMoments(InputArray src, OutputArray moments, const bool binaryImage = false, const MomentsOrder order = MomentsOrder::THIRD_ORDER_MOMENTS, const int momentsType = CV_64F, Stream& stream = Stream::Null());
diff --git a/modules/dnn_superres/tutorials/benchmark/sr_benchmark.markdown b/modules/dnn_superres/tutorials/benchmark/sr_benchmark.markdown
index 26244c9f8ae..3a4b88ef81b 100644
--- a/modules/dnn_superres/tutorials/benchmark/sr_benchmark.markdown
+++ b/modules/dnn_superres/tutorials/benchmark/sr_benchmark.markdown
@@ -50,14 +50,9 @@ Explanation
 Benchmarking results
 -----------
 
-Dataset benchmarking
-----
-
-###General100 dataset
-
-<center>
+## General100 dataset
 
-#####2x scaling factor
+### 2x scaling factor
 
 
 |               | Avg inference time in sec (CPU)| Avg PSNR | Avg SSIM |
@@ -70,7 +65,7 @@ Dataset benchmarking
 | Nearest neighbor | 0.000114 | 29.1665 | 0.9049 |
 | Lanczos          | 0.001094 | 32.4687 | 0.9327 |
 
-#####3x scaling factor
+### 3x scaling factor
 
 |               | Avg inference time in sec (CPU)| Avg PSNR | Avg SSIM |
 | ------------- |:-------------------:| ---------:|--------:|
@@ -83,7 +78,7 @@ Dataset benchmarking
 | Lanczos          | 0.001012  |25.9115  |0.8706  |
 
 
-#####4x scaling factor
+### 4x scaling factor
 
 |               | Avg inference time in sec (CPU)| Avg PSNR | Avg SSIM |
 | ------------- |:-------------------:| ---------:|--------:|
@@ -96,14 +91,10 @@ Dataset benchmarking
 | Lanczos          | 0.001012  |25.9115  |0.8706  |
 
 
-</center>
 
-Images
-----
-
-<center>
+## Images
 
-####2x scaling factor
+### 2x scaling factor
 
 |Set5: butterfly.png | size: 256x256 | ||
 |:-------------:|:-------------------:|:-------------:|:----:|
@@ -112,7 +103,7 @@ Images
 ![ESPCN](images/espcn_butterfly.jpg)| ![FSRCNN](images/fsrcnn_butterfly.jpg) | ![LapSRN](images/lapsrn_butterfly.jpg) | ![EDSR](images/edsr_butterfly.jpg)
 |29.0341 / 0.9354 / **0.004157**| 29.0077 / 0.9345 / 0.006325 | 27.8212 / 0.9230 / 0.037937 | **30.0347** / **0.9453** / 2.077280 |
 
-####3x scaling factor
+### 3x scaling factor
 
 |Urban100: img_001.png | size: 1024x644 | ||
 |:-------------:|:-------------------:|:-------------:|:----:|
@@ -122,7 +113,7 @@ Images
 |28.0118 / 0.8588 / **0.030748**| 28.0184 / 0.8597 / 0.094173 |  | **30.5671** / **0.9019** / 9.517580 |
 
 
-####4x scaling factor
+### 4x scaling factor
 
 |Set14: comic.png | size: 250x361 | ||
 |:-------------:|:-------------------:|:-------------:|:----:|
@@ -131,7 +122,7 @@ Images
 |![ESPCN](images/espcn_comic.jpg)| ![FSRCNN](images/fsrcnn_comic.jpg) | ![LapSRN](images/lapsrn_comic.jpg) | ![EDSR](images/edsr_comic.jpg)
 |20.0417 / 0.6302 / **0.001894**| 20.0885 / 0.6384 / 0.002103 | 20.0676 / 0.6339 / 0.061640 | **20.5233** / **0.6901** / 0.665876 |
 
-####8x scaling factor
+### 8x scaling factor
 
 |Div2K: 0006.png | size: 1356x2040 | |
 |:-------------:|:-------------------:|:-------------:|
@@ -139,5 +130,3 @@ Images
 |PSRN / SSIM / Speed (CPU)| 26.3139 / **0.8033** / 0.001107| 23.8291 / 0.7340 / **0.000611** |
 |![Lanczos interpolation](images/lanczos_div2k.jpg)| ![LapSRN](images/lapsrn_div2k.jpg) | |
 |26.1565 / 0.7962 / 0.004782| **26.7046** / 0.7987 / 2.274290 | |
-
-</center>
\ No newline at end of file
diff --git a/modules/face/include/opencv2/face/facemark.hpp b/modules/face/include/opencv2/face/facemark.hpp
index 86e9384342e..4e66727fe46 100644
--- a/modules/face/include/opencv2/face/facemark.hpp
+++ b/modules/face/include/opencv2/face/facemark.hpp
@@ -12,12 +12,6 @@ Mentor: Delia Passalacqua
 #ifndef __OPENCV_FACELANDMARK_HPP__
 #define __OPENCV_FACELANDMARK_HPP__
 
-/**
-@defgroup face Face Analysis
-- @ref tutorial_table_of_content_facemark
-- The Facemark API
-*/
-
 #include "opencv2/core.hpp"
 #include <vector>
 
@@ -25,6 +19,8 @@ Mentor: Delia Passalacqua
 namespace cv {
 namespace face {
 
+//! @addtogroup face
+//! @{
 
 /** @brief Abstract base class for all facemark models
 
@@ -88,6 +84,7 @@ CV_EXPORTS_W Ptr<Facemark> createFacemarkLBF();
 //! construct a Kazemi facemark detector
 CV_EXPORTS_W Ptr<Facemark> createFacemarkKazemi();
 
+//! @}
 
 } // face
 } // cv
diff --git a/modules/face/include/opencv2/face/facemark_train.hpp b/modules/face/include/opencv2/face/facemark_train.hpp
index d6e27e9face..591c079a0d6 100644
--- a/modules/face/include/opencv2/face/facemark_train.hpp
+++ b/modules/face/include/opencv2/face/facemark_train.hpp
@@ -12,12 +12,6 @@ Mentor: Delia Passalacqua
 #ifndef __OPENCV_FACELANDMARKTRAIN_HPP__
 #define __OPENCV_FACELANDMARKTRAIN_HPP__
 
-/**
-@defgroup face Face Analysis
-- @ref tutorial_table_of_content_facemark
-- The Facemark API
-*/
-
 #include "opencv2/face/facemark.hpp"
 #include "opencv2/objdetect.hpp"
 #include <vector>
diff --git a/modules/face/tutorials/face_landmark/face_landmark_trainer.markdown b/modules/face/tutorials/face_landmark/face_landmark_trainer.markdown
index 601a6b4c428..8fdeaa611d5 100644
--- a/modules/face/tutorials/face_landmark/face_landmark_trainer.markdown
+++ b/modules/face/tutorials/face_landmark/face_landmark_trainer.markdown
@@ -21,7 +21,7 @@ The above format is similar to HELEN dataset which is used for training the mode
 ./sample_train_landmark_detector -annotations=/home/sukhad/Downloads/code/trainset/ -config=config.xml -face_cascade=lbpcascadefrontalface.xml -model=trained_model.dat -width=460 -height=460
 ```
 
-### Description of command parameters
+## Description of command parameters
 
 > * **annotations** a : (REQUIRED) Path to annotations txt file [example - /data/annotations.txt]
 > * **config** c : (REQUIRED) Path to configuration xml file containing parameters for training.[ example - /data/config.xml]
@@ -30,7 +30,7 @@ The above format is similar to HELEN dataset which is used for training the mode
 > * **height** h : (OPTIONAL) The height which you want all images to get to scale the annotations. Large images are slow to process [default = 460]
 > * **face_cascade** f (REQUIRED) Path to the face cascade xml file which you want to use as a detector.
 
-### Description of training parameters
+## Description of training parameters
 
 
 The configuration file described above which is used while training contains the training parameters which are required for training.
@@ -49,7 +49,7 @@ The configuration file described above which is used while training contains the
 
 To get more detailed description about the training parameters you can refer to the [Research paper](https://pdfs.semanticscholar.org/d78b/6a5b0dcaa81b1faea5fb0000045a62513567.pdf).
 
-### Understanding code
+## Understanding code
 
 
 ![](images/3.jpg)
diff --git a/modules/fuzzy/include/opencv2/fuzzy.hpp b/modules/fuzzy/include/opencv2/fuzzy.hpp
index d660cc3615c..59f2a3f2a1f 100644
--- a/modules/fuzzy/include/opencv2/fuzzy.hpp
+++ b/modules/fuzzy/include/opencv2/fuzzy.hpp
@@ -52,19 +52,19 @@
 
 Namespace for all functions is `ft`. The module brings implementation of the last image processing algorithms based on fuzzy mathematics. Method are named based on the pattern `FT`_degree_dimension`_`method.
 
-  @{
+@{
     @defgroup f0_math Math with F0-transform support
 
-Fuzzy transform (\f$F^0\f$-transform) of the 0th degree transforms whole image to a matrix of its components. These components are used in latter computation where each of them represents average color of certain subarea.
+    Fuzzy transform (\f$F^0\f$-transform) of the 0th degree transforms the whole image into a matrix of its components. These components are used in later computations, where each of them represents the average color of a certain subarea.
 
     @defgroup f1_math Math with F1-transform support
 
-Fuzzy transform (\f$F^1\f$-transform) of the 1th degree transforms whole image to a matrix of its components. Each component is polynomial of the 1th degree carrying information about average color and average gradient of certain subarea.
+    Fuzzy transform (\f$F^1\f$-transform) of the 1st degree transforms the whole image into a matrix of its components. Each component is a polynomial of the 1st degree carrying information about the average color and average gradient of a certain subarea.
 
     @defgroup f_image Fuzzy image processing
 
-Image proceesing based on fuzzy mathematics namely F-transform.
-   @}
+    Image processing based on fuzzy mathematics, namely the F-transform.
+@}
 
 */
 
diff --git a/modules/hdf/include/opencv2/hdf.hpp b/modules/hdf/include/opencv2/hdf.hpp
index ff40426ff65..ac48e4b9ac8 100644
--- a/modules/hdf/include/opencv2/hdf.hpp
+++ b/modules/hdf/include/opencv2/hdf.hpp
@@ -41,17 +41,15 @@
 
 This module provides storage routines for Hierarchical Data Format objects.
 
-  @{
+@{
     @defgroup hdf5 Hierarchical Data Format version 5
 
-Hierarchical Data Format version 5
---------------------------------------------------------
+    Hierarchical Data Format version 5
+    --------------------------------------------------------
 
-In order to use it, the hdf5 library has to be installed, which
-means cmake should find it using `find_package(HDF5)` .
-
-
-  @}
+    In order to use it, the hdf5 library has to be installed, which
+    means cmake should find it using `find_package(HDF5)`.
+@}
 */
 
 #endif
diff --git a/modules/mcc/include/opencv2/mcc/checker_model.hpp b/modules/mcc/include/opencv2/mcc/checker_model.hpp
index c13d5afc585..0768c691e05 100644
--- a/modules/mcc/include/opencv2/mcc/checker_model.hpp
+++ b/modules/mcc/include/opencv2/mcc/checker_model.hpp
@@ -116,7 +116,6 @@ class CV_EXPORTS_W CCheckerDraw
     virtual ~CCheckerDraw() {}
     /** \brief Draws the checker to the given image.
     * \param img image in color space BGR
-    * \return void
     */
     CV_WRAP virtual void draw(InputOutputArray img) = 0;
     /** \brief Create a new CCheckerDraw object.
diff --git a/modules/rgbd/include/opencv2/rgbd/dynafu.hpp b/modules/rgbd/include/opencv2/rgbd/dynafu.hpp
index 32875ad5ac7..e5ad3447778 100644
--- a/modules/rgbd/include/opencv2/rgbd/dynafu.hpp
+++ b/modules/rgbd/include/opencv2/rgbd/dynafu.hpp
@@ -114,7 +114,6 @@ class CV_EXPORTS_W DynaFu
     virtual void renderSurface(OutputArray depthImage, OutputArray vertImage, OutputArray normImage, bool warp=true) = 0;
 };
 
-//! @}
-}
-}
-#endif
+} // dynafu::
+} // cv::
+#endif // __OPENCV_RGBD_DYNAFU_HPP__
diff --git a/modules/sfm/include/opencv2/sfm.hpp b/modules/sfm/include/opencv2/sfm.hpp
index 25a3b10da5d..52c1af07e8e 100644
--- a/modules/sfm/include/opencv2/sfm.hpp
+++ b/modules/sfm/include/opencv2/sfm.hpp
@@ -75,7 +75,7 @@ This module has been originally developed as a project for Google Summer of Code
   - Notice that it is compiled only when Eigen, GLog and GFlags are correctly installed.\n
     Check installation instructions in the following tutorial: @ref tutorial_sfm_installation
 
-  @{
+@{
     @defgroup conditioning Conditioning
     @defgroup fundamental Fundamental
     @defgroup io Input/Output
@@ -85,18 +85,17 @@ This module has been originally developed as a project for Google Summer of Code
     @defgroup triangulation Triangulation
 
     @defgroup reconstruction Reconstruction
-      @note
-        - Notice that it is compiled only when Ceres Solver is correctly installed.\n
-          Check installation instructions in the following tutorial: @ref tutorial_sfm_installation
 
+    @note
+    - Notice that it is compiled only when Ceres Solver is correctly installed.\n
+        Check installation instructions in the following tutorial: @ref tutorial_sfm_installation
 
     @defgroup simple_pipeline Simple Pipeline
-      @note
-          - Notice that it is compiled only when Ceres Solver is correctly installed.\n
-            Check installation instructions in the following tutorial: @ref tutorial_sfm_installation
-
-  @}
 
+    @note
+        - Notice that it is compiled only when Ceres Solver is correctly installed.\n
+        Check installation instructions in the following tutorial: @ref tutorial_sfm_installation
+@}
 */
 
 #endif
diff --git a/modules/stereo/include/opencv2/stereo/quasi_dense_stereo.hpp b/modules/stereo/include/opencv2/stereo/quasi_dense_stereo.hpp
index b2290e3768c..469c46f72ea 100644
--- a/modules/stereo/include/opencv2/stereo/quasi_dense_stereo.hpp
+++ b/modules/stereo/include/opencv2/stereo/quasi_dense_stereo.hpp
@@ -18,6 +18,7 @@ namespace cv
 {
 namespace stereo
 {
+
 /** \addtogroup stereo
  *  @{
  */
@@ -190,9 +191,8 @@ class  CV_EXPORTS_W QuasiDenseStereo
     CV_PROP_RW PropagationParameters Param;
 };
 
-} //namespace cv
-} //namespace stereo
-
 /** @}*/
 
+} //namespace stereo
+} //namespace cv
 #endif // __OPENCV_QUASI_DENSE_STEREO_H__
diff --git a/modules/text/include/opencv2/text.hpp b/modules/text/include/opencv2/text.hpp
index 86ce3ec6e80..2b84451c23f 100644
--- a/modules/text/include/opencv2/text.hpp
+++ b/modules/text/include/opencv2/text.hpp
@@ -52,49 +52,49 @@ scene images.
   @{
     @defgroup text_detect Scene Text Detection
 
-Class-specific Extremal Regions for Scene Text Detection
---------------------------------------------------------
-
-The scene text detection algorithm described below has been initially proposed by Lukás Neumann &
-Jiri Matas @cite Neumann11. The main idea behind Class-specific Extremal Regions is similar to the MSER
-in that suitable Extremal Regions (ERs) are selected from the whole component tree of the image.
-However, this technique differs from MSER in that selection of suitable ERs is done by a sequential
-classifier trained for character detection, i.e. dropping the stability requirement of MSERs and
-selecting class-specific (not necessarily stable) regions.
-
-The component tree of an image is constructed by thresholding by an increasing value step-by-step
-from 0 to 255 and then linking the obtained connected components from successive levels in a
-hierarchy by their inclusion relation:
-
-![image](pics/component_tree.png)
-
-The component tree may contain a huge number of regions even for a very simple image as shown in
-the previous image. This number can easily reach the order of 1 x 10\^6 regions for an average 1
-Megapixel image. In order to efficiently select suitable regions among all the ERs the algorithm
-make use of a sequential classifier with two differentiated stages.
-
-In the first stage incrementally computable descriptors (area, perimeter, bounding box, and Euler's
-number) are computed (in O(1)) for each region r and used as features for a classifier which
-estimates the class-conditional probability p(r|character). Only the ERs which correspond to local
-maximum of the probability p(r|character) are selected (if their probability is above a global limit
-p_min and the difference between local maximum and local minimum is greater than a delta_min
-value).
-
-In the second stage, the ERs that passed the first stage are classified into character and
-non-character classes using more informative but also more computationally expensive features. (Hole
-area ratio, convex hull ratio, and the number of outer boundary inflexion points).
-
-This ER filtering process is done in different single-channel projections of the input image in
-order to increase the character localization recall.
-
-After the ER filtering is done on each input channel, character candidates must be grouped in
-high-level text blocks (i.e. words, text lines, paragraphs, ...). The opencv_text module implements
-two different grouping algorithms: the Exhaustive Search algorithm proposed in @cite Neumann12 for
-grouping horizontally aligned text, and the method proposed by Lluis Gomez and Dimosthenis Karatzas
-in @cite Gomez13 @cite Gomez14 for grouping arbitrary oriented text (see erGrouping).
-
-To see the text detector at work, have a look at the textdetection demo:
-<https://github.com/opencv/opencv_contrib/blob/master/modules/text/samples/textdetection.cpp>
+    Class-specific Extremal Regions for Scene Text Detection
+    --------------------------------------------------------
+
+    The scene text detection algorithm described below has been initially proposed by Lukás Neumann &
+    Jiri Matas @cite Neumann11. The main idea behind Class-specific Extremal Regions is similar to the MSER
+    in that suitable Extremal Regions (ERs) are selected from the whole component tree of the image.
+    However, this technique differs from MSER in that selection of suitable ERs is done by a sequential
+    classifier trained for character detection, i.e. dropping the stability requirement of MSERs and
+    selecting class-specific (not necessarily stable) regions.
+
+    The component tree of an image is constructed by thresholding by an increasing value step-by-step
+    from 0 to 255 and then linking the obtained connected components from successive levels in a
+    hierarchy by their inclusion relation:
+
+    ![image](pics/component_tree.png)
+
+    The component tree may contain a huge number of regions even for a very simple image as shown in
+    the previous image. This number can easily reach the order of 1 x 10\^6 regions for an average 1
+    Megapixel image. In order to efficiently select suitable regions among all the ERs the algorithm
+    makes use of a sequential classifier with two differentiated stages.
+
+    In the first stage incrementally computable descriptors (area, perimeter, bounding box, and Euler's
+    number) are computed (in O(1)) for each region r and used as features for a classifier which
+    estimates the class-conditional probability p(r|character). Only the ERs which correspond to local
+    maximum of the probability p(r|character) are selected (if their probability is above a global limit
+    p_min and the difference between local maximum and local minimum is greater than a delta_min
+    value).
+
+    In the second stage, the ERs that passed the first stage are classified into character and
+    non-character classes using more informative but also more computationally expensive features. (Hole
+    area ratio, convex hull ratio, and the number of outer boundary inflexion points).
+
+    This ER filtering process is done in different single-channel projections of the input image in
+    order to increase the character localization recall.
+
+    After the ER filtering is done on each input channel, character candidates must be grouped in
+    high-level text blocks (i.e. words, text lines, paragraphs, ...). The opencv_text module implements
+    two different grouping algorithms: the Exhaustive Search algorithm proposed in @cite Neumann12 for
+    grouping horizontally aligned text, and the method proposed by Lluis Gomez and Dimosthenis Karatzas
+    in @cite Gomez13 @cite Gomez14 for grouping arbitrary oriented text (see erGrouping).
+
+    To see the text detector at work, have a look at the textdetection demo:
+    <https://github.com/opencv/opencv_contrib/blob/master/modules/text/samples/textdetection.cpp>
 
     @defgroup text_recognize Scene Text Recognition
   @}
diff --git a/modules/text/include/opencv2/text/ocr.hpp b/modules/text/include/opencv2/text/ocr.hpp
index a0c967e87bd..083fc7a5aba 100644
--- a/modules/text/include/opencv2/text/ocr.hpp
+++ b/modules/text/include/opencv2/text/ocr.hpp
@@ -363,7 +363,6 @@ CV_EXPORTS_W Ptr<OCRHMMDecoder::ClassifierCallback> loadOCRHMMClassifierCNN(cons
 
  */
 CV_EXPORTS_W Ptr<OCRHMMDecoder::ClassifierCallback> loadOCRHMMClassifier(const String& filename, int classifier);
-//! @}
 
 /** @brief Utility function to create a tailored language model transitions table from a given list of words (lexicon).
  *
diff --git a/modules/videostab/include/opencv2/videostab.hpp b/modules/videostab/include/opencv2/videostab.hpp
index ca3f5adef2b..14c52ebaf1b 100644
--- a/modules/videostab/include/opencv2/videostab.hpp
+++ b/modules/videostab/include/opencv2/videostab.hpp
@@ -44,7 +44,7 @@
 #define OPENCV_VIDEOSTAB_HPP
 
 /**
-  @defgroup videostab Video Stabilization
+@defgroup videostab Video Stabilization
 
 The video stabilization module contains a set of functions and classes that can be used to solve the
 problem of video stabilization. There are a few methods implemented, most of them are described in
@@ -53,26 +53,24 @@ paper methods.
 
 ### References
 
- 1. "Full-Frame Video Stabilization with Motion Inpainting"
-     Yasuyuki Matsushita, Eyal Ofek, Weina Ge, Xiaoou Tang, Senior Member, and Heung-Yeung Shum
- 2. "Auto-Directed Video Stabilization with Robust L1 Optimal Camera Paths"
-     Matthias Grundmann, Vivek Kwatra, Irfan Essa
+1. "Full-Frame Video Stabilization with Motion Inpainting"
+    Yasuyuki Matsushita, Eyal Ofek, Weina Ge, Xiaoou Tang, Senior Member, and Heung-Yeung Shum
+2. "Auto-Directed Video Stabilization with Robust L1 Optimal Camera Paths"
+    Matthias Grundmann, Vivek Kwatra, Irfan Essa
 
-     @{
-         @defgroup videostab_motion Global Motion Estimation
+@{
+    @defgroup videostab_motion Global Motion Estimation
 
-The video stabilization module contains a set of functions and classes for global motion estimation
-between point clouds or between images. In the last case features are extracted and matched
-internally. For the sake of convenience the motion estimation functions are wrapped into classes.
-Both the functions and the classes are available.
+    The video stabilization module contains a set of functions and classes for global motion estimation
+    between point clouds or between images. In the last case features are extracted and matched
+    internally. For the sake of convenience the motion estimation functions are wrapped into classes.
+    Both the functions and the classes are available.
 
-         @defgroup videostab_marching Fast Marching Method
-
-The Fast Marching Method @cite Telea04 is used in of the video stabilization routines to do motion and
-color inpainting. The method is implemented is a flexible way and it's made public for other users.
-
-     @}
+    @defgroup videostab_marching Fast Marching Method
 
+    The Fast Marching Method @cite Telea04 is used in the video stabilization routines to do motion and
+    color inpainting. The method is implemented in a flexible way and it's made public for other users.
+@}
 */
 
 #include "opencv2/videostab/stabilizer.hpp"
diff --git a/modules/viz/include/opencv2/viz.hpp b/modules/viz/include/opencv2/viz.hpp
index fc79b8b60e7..c31ed342ab1 100644
--- a/modules/viz/include/opencv2/viz.hpp
+++ b/modules/viz/include/opencv2/viz.hpp
@@ -60,25 +60,24 @@ interact with it.
 3D visualization window (see Viz3d) is used to display widgets (see Widget), and it provides several
 methods to interact with scene and widgets.
 
-  @{
+@{
     @defgroup viz_widget Widget
 
-In this section, the widget framework is explained. Widgets represent 2D or 3D objects, varying from
-simple ones such as lines to complex ones such as point clouds and meshes.
+    In this section, the widget framework is explained. Widgets represent 2D or 3D objects, varying from
+    simple ones such as lines to complex ones such as point clouds and meshes.
 
-Widgets are **implicitly shared**. Therefore, one can add a widget to the scene, and modify the
-widget without re-adding the widget.
+    Widgets are **implicitly shared**. Therefore, one can add a widget to the scene, and modify the
+    widget without re-adding the widget.
 
-@code
-// Create a cloud widget
-viz::WCloud cw(cloud, viz::Color::red());
-// Display it in a window
-myWindow.showWidget("CloudWidget1", cw);
-// Modify it, and it will be modified in the window.
-cw.setColor(viz::Color::yellow());
-@endcode
-
-  @}
+    @code
+    // Create a cloud widget
+    viz::WCloud cw(cloud, viz::Color::red());
+    // Display it in a window
+    myWindow.showWidget("CloudWidget1", cw);
+    // Modify it, and it will be modified in the window.
+    cw.setColor(viz::Color::yellow());
+    @endcode
+@}
 */
 
 #endif /* OPENCV_VIZ_HPP */
diff --git a/modules/xfeatures2d/include/opencv2/xfeatures2d.hpp b/modules/xfeatures2d/include/opencv2/xfeatures2d.hpp
index 3313a38348a..3793541c238 100644
--- a/modules/xfeatures2d/include/opencv2/xfeatures2d.hpp
+++ b/modules/xfeatures2d/include/opencv2/xfeatures2d.hpp
@@ -46,19 +46,18 @@ the use of this software, even if advised of the possibility of such damage.
 @{
     @defgroup xfeatures2d_experiment Experimental 2D Features Algorithms
 
-This section describes experimental algorithms for 2d feature detection.
+    This section describes experimental algorithms for 2d feature detection.
 
     @defgroup xfeatures2d_nonfree Non-free 2D Features Algorithms
 
-This section describes two popular algorithms for 2d feature detection, SIFT and SURF, that are
-known to be patented. You need to set the OPENCV_ENABLE_NONFREE option in cmake to use those. Use them at your own risk.
+    This section describes two popular algorithms for 2d feature detection, SIFT and SURF, that are
+    known to be patented. You need to set the OPENCV_ENABLE_NONFREE option in cmake to use those. Use them at your own risk.
 
     @defgroup xfeatures2d_match Experimental 2D Features Matching Algorithm
 
-This section describes the following matching strategies:
-    - GMS: Grid-based Motion Statistics, @cite Bian2017gms
-    - LOGOS: Local geometric support for high-outlier spatial verification, @cite Lowry2018LOGOSLG
-
+    This section describes the following matching strategies:
+        - GMS: Grid-based Motion Statistics, @cite Bian2017gms
+        - LOGOS: Local geometric support for high-outlier spatial verification, @cite Lowry2018LOGOSLG
 @}
 */
 
diff --git a/modules/xfeatures2d/include/opencv2/xfeatures2d/nonfree.hpp b/modules/xfeatures2d/include/opencv2/xfeatures2d/nonfree.hpp
index 8eb11aa6653..5fb299f20f4 100644
--- a/modules/xfeatures2d/include/opencv2/xfeatures2d/nonfree.hpp
+++ b/modules/xfeatures2d/include/opencv2/xfeatures2d/nonfree.hpp
@@ -50,6 +50,9 @@ namespace cv
 namespace xfeatures2d
 {
 
+//! @addtogroup xfeatures2d_nonfree
+//! @{
+
 /** @brief Class for extracting Speeded Up Robust Features from an image @cite Bay06 .
 
 The algorithm parameters:
diff --git a/modules/ximgproc/include/opencv2/ximgproc.hpp b/modules/ximgproc/include/opencv2/ximgproc.hpp
index dca0443c0ad..099205126cb 100644
--- a/modules/ximgproc/include/opencv2/ximgproc.hpp
+++ b/modules/ximgproc/include/opencv2/ximgproc.hpp
@@ -65,12 +65,13 @@
 #include "ximgproc/find_ellipses.hpp"
 
 
-/** @defgroup ximgproc Extended Image Processing
-  @{
+/**
+@defgroup ximgproc Extended Image Processing
+@{
     @defgroup ximgproc_edge Structured forests for fast edge detection
 
-This module contains implementations of modern structured edge detection algorithms,
-i.e. algorithms which somehow takes into account pixel affinities in natural images.
+    This module contains implementations of modern structured edge detection algorithms,
+    i.e. algorithms which somehow take into account pixel affinities in natural images.
 
     @defgroup ximgproc_edgeboxes EdgeBoxes
 
@@ -84,16 +85,16 @@ i.e. algorithms which somehow takes into account pixel affinities in natural ima
 
     @defgroup ximgproc_edge_drawing EdgeDrawing
 
-EDGE DRAWING LIBRARY FOR GEOMETRIC FEATURE EXTRACTION AND VALIDATION
+    EDGE DRAWING LIBRARY FOR GEOMETRIC FEATURE EXTRACTION AND VALIDATION
 
-Edge Drawing (ED) algorithm is an proactive approach on edge detection problem. In contrast to many other existing edge detection algorithms which follow a subtractive
-approach (i.e. after applying gradient filters onto an image eliminating pixels w.r.t. several rules, e.g. non-maximal suppression and hysteresis in Canny), ED algorithm
-works via an additive strategy, i.e. it picks edge pixels one by one, hence the name Edge Drawing. Then we process those random shaped edge segments to extract higher level
-edge features, i.e. lines, circles, ellipses, etc. The popular method of extraction edge pixels from the thresholded gradient magnitudes is non-maximal supression that tests
-every pixel whether it has the maximum gradient response along its gradient direction and eliminates if it does not. However, this method does not check status of the
-neighboring pixels, and therefore might result low quality (in terms of edge continuity, smoothness, thinness, localization) edge segments. Instead of non-maximal supression,
-ED points a set of edge pixels and join them by maximizing the total gradient response of edge segments. Therefore it can extract high quality edge segments without need for
-an additional hysteresis step.
+    The Edge Drawing (ED) algorithm is a proactive approach to the edge detection problem. In contrast to many other existing edge detection algorithms which follow a subtractive
+    approach (i.e. after applying gradient filters onto an image eliminating pixels w.r.t. several rules, e.g. non-maximal suppression and hysteresis in Canny), the ED algorithm
+    works via an additive strategy, i.e. it picks edge pixels one by one, hence the name Edge Drawing. Then we process those random shaped edge segments to extract higher level
+    edge features, i.e. lines, circles, ellipses, etc. The popular method of extracting edge pixels from the thresholded gradient magnitudes is non-maximal suppression that tests
+    every pixel whether it has the maximum gradient response along its gradient direction and eliminates it if it does not. However, this method does not check the status of the
+    neighboring pixels, and therefore might result in low quality (in terms of edge continuity, smoothness, thinness, localization) edge segments. Instead of non-maximal suppression,
+    ED points a set of edge pixels and joins them by maximizing the total gradient response of edge segments. Therefore it can extract high quality edge segments without the need for
+    an additional hysteresis step.
 
     @defgroup ximgproc_fourier Fourier descriptors
 
@@ -115,8 +116,7 @@ an additional hysteresis step.
 
     The size of the original image is required for compatibility with the imgproc functions when the boundary handling requires that pixel outside the image boundary are
     "on".
-
-    @}
+@}
 */
 
 namespace cv
@@ -124,6 +124,9 @@ namespace cv
 namespace ximgproc
 {
 
+//! @addtogroup ximgproc
+//! @{
+
 enum ThinningTypes{
     THINNING_ZHANGSUEN    = 0, // Thinning technique of Zhang-Suen
     THINNING_GUOHALL      = 1  // Thinning technique of Guo-Hall
@@ -139,9 +142,6 @@ enum LocalBinarizationMethods{
 	BINARIZATION_NICK = 3     //!< NICK technique. See @cite Khurshid2009 .
 };
 
-//! @addtogroup ximgproc
-//! @{
-
 /** @brief Performs thresholding on input images using Niblack's technique or some of the
 popular variations it inspired.
 
diff --git a/modules/ximgproc/include/opencv2/ximgproc/color_match.hpp b/modules/ximgproc/include/opencv2/ximgproc/color_match.hpp
index c18390d4ac6..8408b5b2331 100644
--- a/modules/ximgproc/include/opencv2/ximgproc/color_match.hpp
+++ b/modules/ximgproc/include/opencv2/ximgproc/color_match.hpp
@@ -61,6 +61,8 @@ CV_EXPORTS_W void qdft(InputArray img, OutputArray qimg, int  	flags, bool sideL
 */
 CV_EXPORTS_W void colorMatchTemplate(InputArray img, InputArray templ, OutputArray result);
 
+//! @}
+
 }
 }
 #endif
diff --git a/modules/ximgproc/include/opencv2/ximgproc/deriche_filter.hpp b/modules/ximgproc/include/opencv2/ximgproc/deriche_filter.hpp
index 26d3b6759da..18adade6f90 100644
--- a/modules/ximgproc/include/opencv2/ximgproc/deriche_filter.hpp
+++ b/modules/ximgproc/include/opencv2/ximgproc/deriche_filter.hpp
@@ -71,6 +71,8 @@ CV_EXPORTS_W void GradientDericheY(InputArray op, OutputArray dst, double alpha,
 */
 CV_EXPORTS_W void GradientDericheX(InputArray op, OutputArray dst, double alpha,double omega);
 
+//! @}
+
 }
 }
 #endif
diff --git a/modules/ximgproc/include/opencv2/ximgproc/edgepreserving_filter.hpp b/modules/ximgproc/include/opencv2/ximgproc/edgepreserving_filter.hpp
index f5685ce39bb..758b61b4349 100644
--- a/modules/ximgproc/include/opencv2/ximgproc/edgepreserving_filter.hpp
+++ b/modules/ximgproc/include/opencv2/ximgproc/edgepreserving_filter.hpp
@@ -26,8 +26,8 @@ namespace cv { namespace ximgproc {
     */
     CV_EXPORTS_W void edgePreservingFilter( InputArray src, OutputArray dst, int d, double threshold );
 
-}} // namespace
-
 //! @}
 
+}} // namespace
+
 #endif
diff --git a/modules/ximgproc/include/opencv2/ximgproc/fast_hough_transform.hpp b/modules/ximgproc/include/opencv2/ximgproc/fast_hough_transform.hpp
index adfbf543b57..94668b06520 100644
--- a/modules/ximgproc/include/opencv2/ximgproc/fast_hough_transform.hpp
+++ b/modules/ximgproc/include/opencv2/ximgproc/fast_hough_transform.hpp
@@ -82,8 +82,7 @@ enum AngleRangeOption
  *          two operands. Formally, a binary operation @f$ f @f$ on a set @f$ S @f$
  *          is a binary relation that maps elements of the Cartesian product
  *          @f$ S \times S @f$ to @f$ S @f$:
-*           @f[ f: S \times S \to S @f]
- * @ingroup MinUtils_MathOper
+ *          @f[ f: S \times S \to S @f]
  */
 enum HoughOp
 {
diff --git a/modules/ximgproc/include/opencv2/ximgproc/paillou_filter.hpp b/modules/ximgproc/include/opencv2/ximgproc/paillou_filter.hpp
index 03754a1119d..56fcd3c9618 100644
--- a/modules/ximgproc/include/opencv2/ximgproc/paillou_filter.hpp
+++ b/modules/ximgproc/include/opencv2/ximgproc/paillou_filter.hpp
@@ -61,6 +61,8 @@ namespace ximgproc {
 CV_EXPORTS void GradientPaillouY(InputArray op, OutputArray _dst, double alpha, double omega);
 CV_EXPORTS void GradientPaillouX(InputArray op, OutputArray _dst, double alpha, double omega);
 
+//! @}
+
 }
 }
 #endif
diff --git a/modules/ximgproc/include/opencv2/ximgproc/peilin.hpp b/modules/ximgproc/include/opencv2/ximgproc/peilin.hpp
index 1b224aaf88b..194f12e1196 100644
--- a/modules/ximgproc/include/opencv2/ximgproc/peilin.hpp
+++ b/modules/ximgproc/include/opencv2/ximgproc/peilin.hpp
@@ -27,6 +27,8 @@ namespace cv { namespace ximgproc {
     /** @overload */
     CV_EXPORTS_W void PeiLinNormalization ( InputArray I, OutputArray T );
 
+    //! @}
+
 }} // namespace
 
 #endif
diff --git a/modules/ximgproc/include/opencv2/ximgproc/run_length_morphology.hpp b/modules/ximgproc/include/opencv2/ximgproc/run_length_morphology.hpp
index c19e2d858db..6cf2eb663c1 100644
--- a/modules/ximgproc/include/opencv2/ximgproc/run_length_morphology.hpp
+++ b/modules/ximgproc/include/opencv2/ximgproc/run_length_morphology.hpp
@@ -113,6 +113,8 @@ CV_EXPORTS void createRLEImage(const std::vector<cv::Point3i>& runs, OutputArray
 CV_EXPORTS void morphologyEx(InputArray rlSrc, OutputArray rlDest, int op, InputArray rlKernel,
     bool bBoundaryOnForErosion = true, Point anchor = Point(0,0));
 
+//! @}
+
 }
 }
 }