Merge pull request opencv#3516 from cudawarped:cuda_moments

`cuda`: add `moments`
Chester-zZz · Dec 5, 2023 · 0bcbc73 · 0bcbc73
2 parents faa5468 + eca7f1c
commit 0bcbc73
Show file tree

Hide file tree

Showing 8 changed files with 548 additions and 5 deletions.
diff --git a/modules/cudaimgproc/include/opencv2/cudaimgproc.hpp b/modules/cudaimgproc/include/opencv2/cudaimgproc.hpp
@@ -57,6 +57,7 @@
     @{
       @defgroup cudaimgproc_color Color space processing
       @defgroup cudaimgproc_hist Histogram Calculation
+      @defgroup cudaimgproc_shape Structural Analysis and Shape Descriptors
       @defgroup cudaimgproc_hough Hough Transform
       @defgroup cudaimgproc_feature Feature Detection
     @}
@@ -779,9 +780,84 @@ CV_EXPORTS_AS(connectedComponentsWithAlgorithm) void connectedComponents(InputAr
 CV_EXPORTS_W void connectedComponents(InputArray image, OutputArray labels,
     int connectivity = 8, int ltype = CV_32S);
 
-
 //! @}
 
+//! @addtogroup cudaimgproc_shape
+//! @{
+
+ /** @brief Order of image moments.
+ * @param FIRST_ORDER_MOMENTS First order moments
+ * @param SECOND_ORDER_MOMENTS Second order moments.
+ * @param THIRD_ORDER_MOMENTS Third order moments.
+ * */
+enum MomentsOrder {
+    FIRST_ORDER_MOMENTS = 1,
+    SECOND_ORDER_MOMENTS = 2,
+    THIRD_ORDER_MOMENTS = 3
+};
+
+/** @brief Returns the number of image moments less than or equal to the largest image moments \a order.
+@param order Order of largest moments to calculate with lower order moments requiring less computation.
+@returns number of image moments.
+
+@sa cuda::moments, cuda::spatialMoments, cuda::MomentsOrder
+ */
+CV_EXPORTS_W int numMoments(const MomentsOrder order);
+
+/** @brief Calculates all of the spatial moments up to the 3rd order of a rasterized shape.
+
+Asynchronous version of cuda::moments() which only calculates the spatial (not centralized or normalized) moments, up to the 3rd order, of a rasterized shape.
+Each moment is returned as a column entry in the 1D \a moments array.
+
+@param src Raster image (single-channel 2D array).
+@param [out] moments 1D array with each column entry containing a spatial image moment.
+@param binaryImage If it is true, all non-zero image pixels are treated as 1's.
+@param order Order of largest moments to calculate with lower order moments requiring less computation.
+@param momentsType Precision to use when calculating moments. Available types are `CV_32F` and `CV_64F` with the performance of `CV_32F` an order of magnitude greater than `CV_64F`. If the image is small the accuracy from `CV_32F` can be equal or very close to `CV_64F`.
+@param stream Stream for the asynchronous version.
+
+@note For maximum performance pre-allocate a 1D GpuMat for \a moments of the correct type and size large enough to store the all the image moments of up to the desired \a order. e.g. With \a order === MomentsOrder::SECOND_ORDER_MOMENTS and \a momentsType == `CV_32F` \a moments can be allocated as
+```
+GpuMat momentsDevice(1,numMoments(MomentsOrder::SECOND_ORDER_MOMENTS),CV_32F)
+```
+The central and normalized moments can easily be calculated on the host by downloading the \a moments array and using the cv::Moments constructor. e.g.
+```
+HostMem momentsHostMem(1, numMoments(MomentsOrder::SECOND_ORDER_MOMENTS), CV_32F);
+momentsDevice.download(momentsHostMem, stream);
+stream.waitForCompletion();
+Mat momentsMat = momentsHostMem.createMatHeader();
+cv::Moments cvMoments(momentsMat.at<float>(0), momentsMat.at<float>(1), momentsMat.at<float>(2), momentsMat.at<float>(3), momentsMat.at<float>(4), momentsMat.at<float>(5), momentsMat.at<float>(6), momentsMat.at<float>(7), momentsMat.at<float>(8), momentsMat.at<float>(9));
+```
+see the \a CUDA_TEST_P(Moments, Async) test inside opencv_contrib_source_code/modules/cudaimgproc/test/test_moments.cpp for an example.
+@returns cv::Moments.
+@sa cuda::moments
+*/
+CV_EXPORTS_W void spatialMoments(InputArray src, OutputArray moments, const bool binaryImage = false, const MomentsOrder order = MomentsOrder::THIRD_ORDER_MOMENTS, const int momentsType = CV_64F, Stream& stream = Stream::Null());
+
+/** @brief Calculates all of the moments up to the 3rd order of a rasterized shape.
+
+The function computes moments, up to the 3rd order, of a rasterized shape. The
+results are returned in the structure cv::Moments.
+
+@param src Raster image (single-channel 2D array).
+@param binaryImage If it is true, all non-zero image pixels are treated as 1's.
+@param order Order of largest moments to calculate with lower order moments requiring less computation.
+ @param momentsType Precision to use when calculating moments. Available types are `CV_32F` and `CV_64F` with the performance of `CV_32F` an order of magnitude greater than `CV_64F`. If the image is small the accuracy from `CV_32F` can be equal or very close to `CV_64F`.
+
+@note For maximum performance use the asynchronous version cuda::spatialMoments() as this version interally allocates and deallocates both GpuMat and HostMem to respectively perform the calculation on the device and download the result to the host.
+The costly HostMem allocation cannot be avoided however the GpuMat device allocation can be by using BufferPool, e.g.
+```
+    setBufferPoolUsage(true);
+    setBufferPoolConfig(getDevice(), numMoments(order) * ((momentsType == CV_64F) ? sizeof(double) : sizeof(float)), 1);
+```
+see the \a CUDA_TEST_P(Moments, Accuracy) test inside opencv_contrib_source_code/modules/cudaimgproc/test/test_moments.cpp for an example.
+@returns cv::Moments.
+@sa cuda::spatialMoments
+ */
+CV_EXPORTS_W Moments moments(InputArray src, const bool binaryImage = false, const MomentsOrder order = MomentsOrder::THIRD_ORDER_MOMENTS, const int momentsType = CV_64F);
+
+//! @} cudaimgproc_shape
+
 }} // namespace cv { namespace cuda {
 
 #endif /* OPENCV_CUDAIMGPROC_HPP */
diff --git a/modules/cudaimgproc/misc/python/test/test_cudaimgproc.py b/modules/cudaimgproc/misc/python/test/test_cudaimgproc.py
@@ -89,5 +89,30 @@ def test_cvtColor(self):
         self.assertTrue(np.allclose(cv.cuda.cvtColor(cuMat, cv.COLOR_BGR2HSV).download(),
                                          cv.cvtColor(npMat, cv.COLOR_BGR2HSV)))
 
+    def test_moments(self):
+        # setup
+        src_host = (np.ones([10,10])).astype(np.uint8)*255
+        cpu_moments = cv.moments(src_host, True)
+        moments_order = cv.cuda.THIRD_ORDER_MOMENTS
+        n_moments = cv.cuda.numMoments(cv.cuda.THIRD_ORDER_MOMENTS)
+        src_device = cv.cuda.GpuMat(src_host)
+
+        # synchronous
+        cv.cuda.setBufferPoolUsage(True)
+        cv.cuda.setBufferPoolConfig(cv.cuda.getDevice(), n_moments * np.dtype(float).itemsize, 1);
+        gpu_moments = cv.cuda.moments(src_device, True, moments_order, cv.CV_64F)
+        self.assertTrue(len([1 for moment_type in cpu_moments if moment_type in gpu_moments and cpu_moments[moment_type] == gpu_moments[moment_type]]) == 24)
+
+        # asynchronous
+        stream = cv.cuda.Stream()
+        moments_array_host = np.empty([1, n_moments], np.float64)
+        cv.cuda.registerPageLocked(moments_array_host)
+        moments_array_device = cv.cuda.GpuMat(1, n_moments, cv.CV_64F)
+        cv.cuda.spatialMoments(src_device, moments_array_device, True, moments_order, cv.CV_64F, stream)
+        moments_array_device.download(stream, moments_array_host);
+        stream.waitForCompletion()
+        cv.cuda.unregisterPageLocked(moments_array_host)
+        self.assertTrue(len([ 1 for moment_type,gpu_moment in zip(cpu_moments,moments_array_host[0]) if cpu_moments[moment_type] == gpu_moment]) == 10)
+
 if __name__ == '__main__':
     NewOpenCVTests.bootstrap()
diff --git a/modules/cudaimgproc/perf/perf_moments.cpp b/modules/cudaimgproc/perf/perf_moments.cpp
@@ -0,0 +1,61 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include "perf_precomp.hpp"
+
+namespace opencv_test { namespace {
+static void drawCircle(cv::Mat& dst, const cv::Vec3i& circle, bool fill)
+{
+    dst.setTo(Scalar::all(0));
+    cv::circle(dst, Point2i(circle[0], circle[1]), circle[2], Scalar::all(255), fill ? -1 : 1, cv::LINE_AA);
+}
+
+DEF_PARAM_TEST(Sz_Depth, Size, MatDepth);
+PERF_TEST_P(Sz_Depth, SpatialMoments, Combine(CUDA_TYPICAL_MAT_SIZES, Values(MatDepth(CV_32F), MatDepth((CV_64F)))))
+{
+    const cv::Size size = GET_PARAM(0);
+    const int momentsType = GET_PARAM(1);
+    Mat imgHost(size, CV_8U);
+    const Vec3i circle(size.width / 2, size.height / 2, static_cast<int>(static_cast<float>(size.width / 2) * 0.9));
+    drawCircle(imgHost, circle, true);
+    if (PERF_RUN_CUDA()) {
+        const MomentsOrder order = MomentsOrder::THIRD_ORDER_MOMENTS;
+        const int nMoments = numMoments(order);
+        GpuMat momentsDevice(1, nMoments, momentsType);
+        const GpuMat imgDevice(imgHost);
+        TEST_CYCLE() cuda::spatialMoments(imgDevice, momentsDevice, false, order, momentsType);
+        SANITY_CHECK_NOTHING();
+    }
+    else {
+        cv::Moments momentsHost;
+        TEST_CYCLE() momentsHost = cv::moments(imgHost, false);
+        SANITY_CHECK_NOTHING();
+    }
+}
+
+PERF_TEST_P(Sz_Depth, Moments, Combine(CUDA_TYPICAL_MAT_SIZES, Values(MatDepth(CV_32F), MatDepth(CV_64F))))
+{
+    const cv::Size size = GET_PARAM(0);
+    const int momentsType = GET_PARAM(1);
+    Mat imgHost(size, CV_8U);
+    const Vec3i circle(size.width / 2, size.height / 2, static_cast<int>(static_cast<float>(size.width / 2) * 0.9));
+    drawCircle(imgHost, circle, true);
+    if (PERF_RUN_CUDA()) {
+        const MomentsOrder order = MomentsOrder::THIRD_ORDER_MOMENTS;
+        const int nMoments = numMoments(order);
+        setBufferPoolUsage(true);
+        setBufferPoolConfig(getDevice(), nMoments * ((momentsType == CV_64F) ? sizeof(double) : sizeof(float)), 1);
+        const GpuMat imgDevice(imgHost);
+        cv::Moments momentsHost;
+        TEST_CYCLE() momentsHost = cuda::moments(imgDevice, false, order, momentsType);
+        SANITY_CHECK_NOTHING();
+    }
+    else {
+        cv::Moments momentsHost;
+        TEST_CYCLE() momentsHost = cv::moments(imgHost, false);
+        SANITY_CHECK_NOTHING();
+    }
+}
+
+}}
diff --git a/modules/cudaimgproc/src/cuda/moments.cu b/modules/cudaimgproc/src/cuda/moments.cu
@@ -0,0 +1,186 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#if !defined CUDA_DISABLER
+
+#include <opencv2/core/cuda/common.hpp>
+#include <opencv2/cudev/util/atomic.hpp>
+#include "moments.cuh"
+
+namespace cv { namespace cuda { namespace device { namespace imgproc {
+
+constexpr int blockSizeX = 32;
+constexpr int blockSizeY = 16;
+
+template <typename T>
+__device__ T butterflyWarpReduction(T value) {
+    for (int i = 16; i >= 1; i /= 2)
+        value += __shfl_xor_sync(0xffffffff, value, i, 32);
+    return value;
+}
+
+template <typename T>
+__device__ T butterflyHalfWarpReduction(T value) {
+    for (int i = 8; i >= 1; i /= 2)
+        value += __shfl_xor_sync(0xffff, value, i, 32);
+    return value;
+}
+
+template<typename T, int nMoments>
+__device__ void updateSums(const T val, const unsigned int x, T r[4]) {
+    const T x2 = x * x;
+    const T x3 = static_cast<T>(x) * x2;
+    r[0] += val;
+    r[1] += val * x;
+    if (nMoments >= n12) r[2] += val * x2;
+    if (nMoments >= n123) r[3] += val * x3;
+}
+
+template<typename TSrc, typename TMoments, int nMoments>
+__device__ void rowReductions(const PtrStepSz<TSrc> img, const bool binary, const unsigned int y, TMoments r[4], TMoments smem[][nMoments + 1]) {
+    for (int x = threadIdx.x; x < img.cols; x += blockDim.x) {
+        const TMoments val = (!binary || img(y, x) == 0) ? img(y, x) : 1;
+        updateSums<TMoments,nMoments>(val, x, r);
+    }
+}
+
+template<typename TSrc, typename TMoments, bool fourByteAligned, int nMoments>
+__device__ void rowReductionsCoalesced(const PtrStepSz<TSrc> img, const bool binary, const unsigned int y, TMoments r[4], const int offsetX, TMoments smem[][nMoments + 1]) {
+    const int alignedOffset = fourByteAligned ? 0 : 4 - offsetX;
+    // load uncoalesced head
+    if (!fourByteAligned && threadIdx.x == 0) {
+        for (int x = 0; x < ::min(alignedOffset, static_cast<int>(img.cols)); x++) {
+            const TMoments val = (!binary || img(y, x) == 0) ? img(y, x) : 1;
+            updateSums<TMoments, nMoments>(val, x, r);
+        }
+    }
+
+    // coalesced loads
+    const unsigned int* rowPtrIntAligned = (const unsigned int*)(fourByteAligned ? img.ptr(y) : img.ptr(y) + alignedOffset);
+    const int cols4 = fourByteAligned ? img.cols / 4 : (img.cols - alignedOffset) / 4;
+    for (int x = threadIdx.x; x < cols4; x += blockDim.x) {
+        const unsigned int data = rowPtrIntAligned[x];
+#pragma unroll 4
+        for (int i = 0; i < 4; i++) {
+            const int iX = alignedOffset + 4 * x + i;
+            const uchar ucharVal = ((data >> i * 8) & 0xFFU);
+            const TMoments val = (!binary || ucharVal == 0) ? ucharVal : 1;
+            updateSums<TMoments, nMoments>(val, iX, r);
+        }
+    }
+
+    // load uncoalesced tail
+    if (threadIdx.x == 0) {
+        const int iTailStart = fourByteAligned ? cols4 * 4 : cols4 * 4 + alignedOffset;
+        for (int x = iTailStart; x < img.cols; x++) {
+            const TMoments val = (!binary || img(y, x) == 0) ? img(y, x) : 1;
+            updateSums<TMoments, nMoments>(val, x, r);
+        }
+    }
+}
+
+template <typename TSrc, typename TMoments, bool coalesced = false, bool fourByteAligned = false, int nMoments>
+__global__ void spatialMoments(const PtrStepSz<TSrc> img, const bool binary, TMoments* moments, const int offsetX = 0) {
+    const unsigned int y = blockIdx.x * blockDim.y + threadIdx.y;
+    __shared__ TMoments smem[blockSizeY][nMoments + 1];
+    if (threadIdx.y < nMoments && threadIdx.x < blockSizeY)
+        smem[threadIdx.x][threadIdx.y] = 0;
+    __syncthreads();
+
+    TMoments r[4] = { 0 };
+    if (y < img.rows) {
+        if (coalesced)
+            rowReductionsCoalesced<TSrc, TMoments, fourByteAligned, nMoments>(img, binary, y, r, offsetX, smem);
+        else
+            rowReductions<TSrc, TMoments, nMoments>(img, binary, y, r, smem);
+    }
+
+    const unsigned long y2 = y * y;
+    const TMoments y3 = static_cast<TMoments>(y2) * y;
+    const TMoments res = butterflyWarpReduction<float>(r[0]);
+    if (res) {
+        smem[threadIdx.y][0] = res; //0th
+        smem[threadIdx.y][1] = butterflyWarpReduction(r[1]); //1st
+        smem[threadIdx.y][2] = y * res; //1st
+        if (nMoments >= n12) {
+            smem[threadIdx.y][3] = butterflyWarpReduction(r[2]); //2nd
+            smem[threadIdx.y][4] = smem[threadIdx.y][1] * y; //2nd
+            smem[threadIdx.y][5] = y2 * res; //2nd
+        }
+        if (nMoments >= n123) {
+            smem[threadIdx.y][6] = butterflyWarpReduction(r[3]); //3rd
+            smem[threadIdx.y][7] = smem[threadIdx.y][3] * y; //3rd
+            smem[threadIdx.y][8] = smem[threadIdx.y][1] * y2; //3rd
+            smem[threadIdx.y][9] = y3 * res; //3rd
+        }
+    }
+    __syncthreads();
+
+    if (threadIdx.x < blockSizeY && threadIdx.y < nMoments)
+        smem[threadIdx.y][nMoments] = butterflyHalfWarpReduction(smem[threadIdx.x][threadIdx.y]);
+    __syncthreads();
+
+    if (threadIdx.y == 0 && threadIdx.x < nMoments) {
+        if (smem[threadIdx.x][nMoments])
+            cudev::atomicAdd(&moments[threadIdx.x], smem[threadIdx.x][nMoments]);
+    }
+}
+
+template <typename TSrc, typename TMoments, int nMoments> struct momentsDispatcherNonChar {
+    static void call(const PtrStepSz<TSrc> src, PtrStepSz<TMoments> moments, const bool binary, const int offsetX, const cudaStream_t stream) {
+        dim3 blockSize(blockSizeX, blockSizeY);
+        dim3 gridSize = dim3(divUp(src.rows, blockSizeY));
+        spatialMoments<TSrc, TMoments, false, false, nMoments> << <gridSize, blockSize, 0, stream >> > (src, binary, moments.ptr());
+        if (stream == 0)
+            cudaSafeCall(cudaStreamSynchronize(stream));
+    };
+};
+
+template <typename TSrc, int nMoments> struct momentsDispatcherChar {
+    static void call(const PtrStepSz<TSrc> src, PtrStepSz<float> moments, const bool binary, const int offsetX, const cudaStream_t stream) {
+        dim3 blockSize(blockSizeX, blockSizeY);
+        dim3 gridSize = dim3(divUp(src.rows, blockSizeY));
+        if (offsetX)
+            spatialMoments<TSrc, float, true, false, nMoments> << <gridSize, blockSize, 0, stream >> > (src, binary, moments.ptr(), offsetX);
+        else
+            spatialMoments<TSrc, float, true, true, nMoments> << <gridSize, blockSize, 0, stream >> > (src, binary, moments.ptr());
+
+        if (stream == 0)
+            cudaSafeCall(cudaStreamSynchronize(stream));
+    };
+};
+
+template <typename TSrc, typename TMoments, int nMoments> struct momentsDispatcher : momentsDispatcherNonChar<TSrc, TMoments, nMoments> {};
+template <int nMoments> struct momentsDispatcher<uchar, float, nMoments> : momentsDispatcherChar<uchar, nMoments> {};
+template <int nMoments> struct momentsDispatcher<schar, float, nMoments> : momentsDispatcherChar<schar, nMoments> {};
+
+template <typename TSrc, typename TMoments>
+void moments(const PtrStepSzb src, PtrStepSzb moments, const bool binary, const int order, const int offsetX, const cudaStream_t stream) {
+    if (order == 1)
+        momentsDispatcher<TSrc, TMoments, n1>::call(static_cast<PtrStepSz<TSrc>>(src), static_cast<PtrStepSz<TMoments>>(moments), binary, offsetX, stream);
+    else if (order == 2)
+        momentsDispatcher<TSrc, TMoments, n12>::call(static_cast<PtrStepSz<TSrc>>(src), static_cast<PtrStepSz<TMoments>>(moments), binary, offsetX, stream);
+    else if (order == 3)
+        momentsDispatcher<TSrc, TMoments, n123>::call(static_cast<PtrStepSz<TSrc>>(src), static_cast<PtrStepSz<TMoments>>(moments), binary, offsetX, stream);
+};
+
+template void moments<uchar, float>(const PtrStepSzb src, PtrStepSzb moments, const bool binary, const int order, const int offsetX, const cudaStream_t stream);
+template void moments<schar, float>(const PtrStepSzb src, PtrStepSzb moments, const bool binary, const int order, const int offsetX, const cudaStream_t stream);
+template void moments<ushort, float>(const PtrStepSzb src, PtrStepSzb moments, const bool binary, const int order, const int offsetX, const cudaStream_t stream);
+template void moments<short, float>(const PtrStepSzb src, PtrStepSzb moments, const bool binary, const int order, const int offsetX, const cudaStream_t stream);
+template void moments<int, float>(const PtrStepSzb src, PtrStepSzb moments, const bool binary, const int order, const int offsetX, const cudaStream_t stream);
+template void moments<float, float>(const PtrStepSzb src, PtrStepSzb moments, const bool binary, const int order, const int offsetX, const cudaStream_t stream);
+template void moments<double, float>(const PtrStepSzb src, PtrStepSzb moments, const bool binary, const int order, const int offsetX, const cudaStream_t stream);
+
+template void moments<uchar, double>(const PtrStepSzb src, PtrStepSzb moments, const bool binary, const int order, const int offsetX, const cudaStream_t stream);
+template void moments<schar, double>(const PtrStepSzb src, PtrStepSzb moments, const bool binary, const int order, const int offsetX, const cudaStream_t stream);
+template void moments<ushort, double>(const PtrStepSzb src, PtrStepSzb moments, const bool binary, const int order, const int offsetX, const cudaStream_t stream);
+template void moments<short, double>(const PtrStepSzb src, PtrStepSzb moments, const bool binary, const int order, const int offsetX, const cudaStream_t stream);
+template void moments<int, double>(const PtrStepSzb src, PtrStepSzb moments, const bool binary, const int order, const int offsetX, const cudaStream_t stream);
+template void moments<float, double>(const PtrStepSzb src, PtrStepSzb moments, const bool binary, const int order, const int offsetX, const cudaStream_t stream);
+template void moments<double, double>(const PtrStepSzb src, PtrStepSzb moments, const bool binary, const int order, const int offsetX, const cudaStream_t stream);
+
+}}}}
+
+#endif /* CUDA_DISABLER */
diff --git a/modules/cudaimgproc/src/cuda/moments.cuh b/modules/cudaimgproc/src/cuda/moments.cuh
@@ -0,0 +1,6 @@
+#pragma once
+namespace cv { namespace cuda { namespace device { namespace imgproc {
+    constexpr int n1 = 3;
+    constexpr int n12 = 6;
+    constexpr int n123 = 10;
+}}}}