Skip to content

Commit

Permalink
cuda: add moments
Browse files Browse the repository at this point in the history
  • Loading branch information
cudawarped committed Dec 5, 2023
1 parent faa5468 commit eca7f1c
Show file tree
Hide file tree
Showing 8 changed files with 548 additions and 5 deletions.
78 changes: 77 additions & 1 deletion modules/cudaimgproc/include/opencv2/cudaimgproc.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@
@{
@defgroup cudaimgproc_color Color space processing
@defgroup cudaimgproc_hist Histogram Calculation
@defgroup cudaimgproc_shape Structural Analysis and Shape Descriptors
@defgroup cudaimgproc_hough Hough Transform
@defgroup cudaimgproc_feature Feature Detection
@}
Expand Down Expand Up @@ -779,9 +780,84 @@ CV_EXPORTS_AS(connectedComponentsWithAlgorithm) void connectedComponents(InputAr
CV_EXPORTS_W void connectedComponents(InputArray image, OutputArray labels,
int connectivity = 8, int ltype = CV_32S);


//! @}

//! @addtogroup cudaimgproc_shape
//! @{

/** @brief Order of image moments.
* @param FIRST_ORDER_MOMENTS First order moments
* @param SECOND_ORDER_MOMENTS Second order moments.
* @param THIRD_ORDER_MOMENTS Third order moments.
* */
enum MomentsOrder {
FIRST_ORDER_MOMENTS = 1,
SECOND_ORDER_MOMENTS = 2,
THIRD_ORDER_MOMENTS = 3
};

/** @brief Returns the number of image moments less than or equal to the largest image moments \a order.
@param order Order of largest moments to calculate with lower order moments requiring less computation.
@returns number of image moments.
@sa cuda::moments, cuda::spatialMoments, cuda::MomentsOrder
*/
CV_EXPORTS_W int numMoments(const MomentsOrder order);

/** @brief Calculates all of the spatial moments up to the 3rd order of a rasterized shape.
Asynchronous version of cuda::moments() which only calculates the spatial (not centralized or normalized) moments, up to the 3rd order, of a rasterized shape.
Each moment is returned as a column entry in the 1D \a moments array.
@param src Raster image (single-channel 2D array).
@param [out] moments 1D array with each column entry containing a spatial image moment.
@param binaryImage If it is true, all non-zero image pixels are treated as 1's.
@param order Order of largest moments to calculate with lower order moments requiring less computation.
@param momentsType Precision to use when calculating moments. Available types are `CV_32F` and `CV_64F` with the performance of `CV_32F` an order of magnitude greater than `CV_64F`. If the image is small the accuracy from `CV_32F` can be equal or very close to `CV_64F`.
@param stream Stream for the asynchronous version.
@note For maximum performance pre-allocate a 1D GpuMat for \a moments of the correct type and size large enough to store the all the image moments of up to the desired \a order. e.g. With \a order === MomentsOrder::SECOND_ORDER_MOMENTS and \a momentsType == `CV_32F` \a moments can be allocated as
```
GpuMat momentsDevice(1,numMoments(MomentsOrder::SECOND_ORDER_MOMENTS),CV_32F)
```
The central and normalized moments can easily be calculated on the host by downloading the \a moments array and using the cv::Moments constructor. e.g.
```
HostMem momentsHostMem(1, numMoments(MomentsOrder::SECOND_ORDER_MOMENTS), CV_32F);
momentsDevice.download(momentsHostMem, stream);
stream.waitForCompletion();
Mat momentsMat = momentsHostMem.createMatHeader();
cv::Moments cvMoments(momentsMat.at<float>(0), momentsMat.at<float>(1), momentsMat.at<float>(2), momentsMat.at<float>(3), momentsMat.at<float>(4), momentsMat.at<float>(5), momentsMat.at<float>(6), momentsMat.at<float>(7), momentsMat.at<float>(8), momentsMat.at<float>(9));
```
see the \a CUDA_TEST_P(Moments, Async) test inside opencv_contrib_source_code/modules/cudaimgproc/test/test_moments.cpp for an example.
@returns cv::Moments.
@sa cuda::moments
*/
CV_EXPORTS_W void spatialMoments(InputArray src, OutputArray moments, const bool binaryImage = false, const MomentsOrder order = MomentsOrder::THIRD_ORDER_MOMENTS, const int momentsType = CV_64F, Stream& stream = Stream::Null());

/** @brief Calculates all of the moments up to the 3rd order of a rasterized shape.
The function computes moments, up to the 3rd order, of a rasterized shape. The
results are returned in the structure cv::Moments.
@param src Raster image (single-channel 2D array).
@param binaryImage If it is true, all non-zero image pixels are treated as 1's.
@param order Order of largest moments to calculate with lower order moments requiring less computation.
@param momentsType Precision to use when calculating moments. Available types are `CV_32F` and `CV_64F` with the performance of `CV_32F` an order of magnitude greater than `CV_64F`. If the image is small the accuracy from `CV_32F` can be equal or very close to `CV_64F`.
@note For maximum performance use the asynchronous version cuda::spatialMoments() as this version interally allocates and deallocates both GpuMat and HostMem to respectively perform the calculation on the device and download the result to the host.
The costly HostMem allocation cannot be avoided however the GpuMat device allocation can be by using BufferPool, e.g.
```
setBufferPoolUsage(true);
setBufferPoolConfig(getDevice(), numMoments(order) * ((momentsType == CV_64F) ? sizeof(double) : sizeof(float)), 1);
```
see the \a CUDA_TEST_P(Moments, Accuracy) test inside opencv_contrib_source_code/modules/cudaimgproc/test/test_moments.cpp for an example.
@returns cv::Moments.
@sa cuda::spatialMoments
*/
CV_EXPORTS_W Moments moments(InputArray src, const bool binaryImage = false, const MomentsOrder order = MomentsOrder::THIRD_ORDER_MOMENTS, const int momentsType = CV_64F);

//! @} cudaimgproc_shape

}} // namespace cv { namespace cuda {

#endif /* OPENCV_CUDAIMGPROC_HPP */
25 changes: 25 additions & 0 deletions modules/cudaimgproc/misc/python/test/test_cudaimgproc.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,5 +89,30 @@ def test_cvtColor(self):
self.assertTrue(np.allclose(cv.cuda.cvtColor(cuMat, cv.COLOR_BGR2HSV).download(),
cv.cvtColor(npMat, cv.COLOR_BGR2HSV)))

def test_moments(self):
# setup
src_host = (np.ones([10,10])).astype(np.uint8)*255
cpu_moments = cv.moments(src_host, True)
moments_order = cv.cuda.THIRD_ORDER_MOMENTS
n_moments = cv.cuda.numMoments(cv.cuda.THIRD_ORDER_MOMENTS)
src_device = cv.cuda.GpuMat(src_host)

# synchronous
cv.cuda.setBufferPoolUsage(True)
cv.cuda.setBufferPoolConfig(cv.cuda.getDevice(), n_moments * np.dtype(float).itemsize, 1);
gpu_moments = cv.cuda.moments(src_device, True, moments_order, cv.CV_64F)
self.assertTrue(len([1 for moment_type in cpu_moments if moment_type in gpu_moments and cpu_moments[moment_type] == gpu_moments[moment_type]]) == 24)

# asynchronous
stream = cv.cuda.Stream()
moments_array_host = np.empty([1, n_moments], np.float64)
cv.cuda.registerPageLocked(moments_array_host)
moments_array_device = cv.cuda.GpuMat(1, n_moments, cv.CV_64F)
cv.cuda.spatialMoments(src_device, moments_array_device, True, moments_order, cv.CV_64F, stream)
moments_array_device.download(stream, moments_array_host);
stream.waitForCompletion()
cv.cuda.unregisterPageLocked(moments_array_host)
self.assertTrue(len([ 1 for moment_type,gpu_moment in zip(cpu_moments,moments_array_host[0]) if cpu_moments[moment_type] == gpu_moment]) == 10)

if __name__ == '__main__':
NewOpenCVTests.bootstrap()
61 changes: 61 additions & 0 deletions modules/cudaimgproc/perf/perf_moments.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#include "perf_precomp.hpp"

namespace opencv_test { namespace {
static void drawCircle(cv::Mat& dst, const cv::Vec3i& circle, bool fill)
{
dst.setTo(Scalar::all(0));
cv::circle(dst, Point2i(circle[0], circle[1]), circle[2], Scalar::all(255), fill ? -1 : 1, cv::LINE_AA);
}

DEF_PARAM_TEST(Sz_Depth, Size, MatDepth);
PERF_TEST_P(Sz_Depth, SpatialMoments, Combine(CUDA_TYPICAL_MAT_SIZES, Values(MatDepth(CV_32F), MatDepth((CV_64F)))))
{
const cv::Size size = GET_PARAM(0);
const int momentsType = GET_PARAM(1);
Mat imgHost(size, CV_8U);
const Vec3i circle(size.width / 2, size.height / 2, static_cast<int>(static_cast<float>(size.width / 2) * 0.9));
drawCircle(imgHost, circle, true);
if (PERF_RUN_CUDA()) {
const MomentsOrder order = MomentsOrder::THIRD_ORDER_MOMENTS;
const int nMoments = numMoments(order);
GpuMat momentsDevice(1, nMoments, momentsType);
const GpuMat imgDevice(imgHost);
TEST_CYCLE() cuda::spatialMoments(imgDevice, momentsDevice, false, order, momentsType);
SANITY_CHECK_NOTHING();
}
else {
cv::Moments momentsHost;
TEST_CYCLE() momentsHost = cv::moments(imgHost, false);
SANITY_CHECK_NOTHING();
}
}

PERF_TEST_P(Sz_Depth, Moments, Combine(CUDA_TYPICAL_MAT_SIZES, Values(MatDepth(CV_32F), MatDepth(CV_64F))))
{
const cv::Size size = GET_PARAM(0);
const int momentsType = GET_PARAM(1);
Mat imgHost(size, CV_8U);
const Vec3i circle(size.width / 2, size.height / 2, static_cast<int>(static_cast<float>(size.width / 2) * 0.9));
drawCircle(imgHost, circle, true);
if (PERF_RUN_CUDA()) {
const MomentsOrder order = MomentsOrder::THIRD_ORDER_MOMENTS;
const int nMoments = numMoments(order);
setBufferPoolUsage(true);
setBufferPoolConfig(getDevice(), nMoments * ((momentsType == CV_64F) ? sizeof(double) : sizeof(float)), 1);
const GpuMat imgDevice(imgHost);
cv::Moments momentsHost;
TEST_CYCLE() momentsHost = cuda::moments(imgDevice, false, order, momentsType);
SANITY_CHECK_NOTHING();
}
else {
cv::Moments momentsHost;
TEST_CYCLE() momentsHost = cv::moments(imgHost, false);
SANITY_CHECK_NOTHING();
}
}

}}
186 changes: 186 additions & 0 deletions modules/cudaimgproc/src/cuda/moments.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,186 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#if !defined CUDA_DISABLER

#include <opencv2/core/cuda/common.hpp>
#include <opencv2/cudev/util/atomic.hpp>
#include "moments.cuh"

namespace cv { namespace cuda { namespace device { namespace imgproc {

constexpr int blockSizeX = 32;
constexpr int blockSizeY = 16;

template <typename T>
__device__ T butterflyWarpReduction(T value) {
for (int i = 16; i >= 1; i /= 2)
value += __shfl_xor_sync(0xffffffff, value, i, 32);
return value;
}

template <typename T>
__device__ T butterflyHalfWarpReduction(T value) {
for (int i = 8; i >= 1; i /= 2)
value += __shfl_xor_sync(0xffff, value, i, 32);
return value;
}

template<typename T, int nMoments>
__device__ void updateSums(const T val, const unsigned int x, T r[4]) {
const T x2 = x * x;
const T x3 = static_cast<T>(x) * x2;
r[0] += val;
r[1] += val * x;
if (nMoments >= n12) r[2] += val * x2;
if (nMoments >= n123) r[3] += val * x3;
}

template<typename TSrc, typename TMoments, int nMoments>
__device__ void rowReductions(const PtrStepSz<TSrc> img, const bool binary, const unsigned int y, TMoments r[4], TMoments smem[][nMoments + 1]) {
for (int x = threadIdx.x; x < img.cols; x += blockDim.x) {
const TMoments val = (!binary || img(y, x) == 0) ? img(y, x) : 1;
updateSums<TMoments,nMoments>(val, x, r);
}
}

template<typename TSrc, typename TMoments, bool fourByteAligned, int nMoments>
__device__ void rowReductionsCoalesced(const PtrStepSz<TSrc> img, const bool binary, const unsigned int y, TMoments r[4], const int offsetX, TMoments smem[][nMoments + 1]) {
const int alignedOffset = fourByteAligned ? 0 : 4 - offsetX;
// load uncoalesced head
if (!fourByteAligned && threadIdx.x == 0) {
for (int x = 0; x < ::min(alignedOffset, static_cast<int>(img.cols)); x++) {
const TMoments val = (!binary || img(y, x) == 0) ? img(y, x) : 1;
updateSums<TMoments, nMoments>(val, x, r);
}
}

// coalesced loads
const unsigned int* rowPtrIntAligned = (const unsigned int*)(fourByteAligned ? img.ptr(y) : img.ptr(y) + alignedOffset);
const int cols4 = fourByteAligned ? img.cols / 4 : (img.cols - alignedOffset) / 4;
for (int x = threadIdx.x; x < cols4; x += blockDim.x) {
const unsigned int data = rowPtrIntAligned[x];
#pragma unroll 4
for (int i = 0; i < 4; i++) {
const int iX = alignedOffset + 4 * x + i;
const uchar ucharVal = ((data >> i * 8) & 0xFFU);
const TMoments val = (!binary || ucharVal == 0) ? ucharVal : 1;
updateSums<TMoments, nMoments>(val, iX, r);
}
}

// load uncoalesced tail
if (threadIdx.x == 0) {
const int iTailStart = fourByteAligned ? cols4 * 4 : cols4 * 4 + alignedOffset;
for (int x = iTailStart; x < img.cols; x++) {
const TMoments val = (!binary || img(y, x) == 0) ? img(y, x) : 1;
updateSums<TMoments, nMoments>(val, x, r);
}
}
}

template <typename TSrc, typename TMoments, bool coalesced = false, bool fourByteAligned = false, int nMoments>
__global__ void spatialMoments(const PtrStepSz<TSrc> img, const bool binary, TMoments* moments, const int offsetX = 0) {
const unsigned int y = blockIdx.x * blockDim.y + threadIdx.y;
__shared__ TMoments smem[blockSizeY][nMoments + 1];
if (threadIdx.y < nMoments && threadIdx.x < blockSizeY)
smem[threadIdx.x][threadIdx.y] = 0;
__syncthreads();

TMoments r[4] = { 0 };
if (y < img.rows) {
if (coalesced)
rowReductionsCoalesced<TSrc, TMoments, fourByteAligned, nMoments>(img, binary, y, r, offsetX, smem);
else
rowReductions<TSrc, TMoments, nMoments>(img, binary, y, r, smem);
}

const unsigned long y2 = y * y;
const TMoments y3 = static_cast<TMoments>(y2) * y;
const TMoments res = butterflyWarpReduction<float>(r[0]);
if (res) {
smem[threadIdx.y][0] = res; //0th
smem[threadIdx.y][1] = butterflyWarpReduction(r[1]); //1st
smem[threadIdx.y][2] = y * res; //1st
if (nMoments >= n12) {
smem[threadIdx.y][3] = butterflyWarpReduction(r[2]); //2nd
smem[threadIdx.y][4] = smem[threadIdx.y][1] * y; //2nd
smem[threadIdx.y][5] = y2 * res; //2nd
}
if (nMoments >= n123) {
smem[threadIdx.y][6] = butterflyWarpReduction(r[3]); //3rd
smem[threadIdx.y][7] = smem[threadIdx.y][3] * y; //3rd
smem[threadIdx.y][8] = smem[threadIdx.y][1] * y2; //3rd
smem[threadIdx.y][9] = y3 * res; //3rd
}
}
__syncthreads();

if (threadIdx.x < blockSizeY && threadIdx.y < nMoments)
smem[threadIdx.y][nMoments] = butterflyHalfWarpReduction(smem[threadIdx.x][threadIdx.y]);
__syncthreads();

if (threadIdx.y == 0 && threadIdx.x < nMoments) {
if (smem[threadIdx.x][nMoments])
cudev::atomicAdd(&moments[threadIdx.x], smem[threadIdx.x][nMoments]);
}
}

template <typename TSrc, typename TMoments, int nMoments> struct momentsDispatcherNonChar {
static void call(const PtrStepSz<TSrc> src, PtrStepSz<TMoments> moments, const bool binary, const int offsetX, const cudaStream_t stream) {
dim3 blockSize(blockSizeX, blockSizeY);
dim3 gridSize = dim3(divUp(src.rows, blockSizeY));
spatialMoments<TSrc, TMoments, false, false, nMoments> << <gridSize, blockSize, 0, stream >> > (src, binary, moments.ptr());
if (stream == 0)
cudaSafeCall(cudaStreamSynchronize(stream));
};
};

template <typename TSrc, int nMoments> struct momentsDispatcherChar {
static void call(const PtrStepSz<TSrc> src, PtrStepSz<float> moments, const bool binary, const int offsetX, const cudaStream_t stream) {
dim3 blockSize(blockSizeX, blockSizeY);
dim3 gridSize = dim3(divUp(src.rows, blockSizeY));
if (offsetX)
spatialMoments<TSrc, float, true, false, nMoments> << <gridSize, blockSize, 0, stream >> > (src, binary, moments.ptr(), offsetX);
else
spatialMoments<TSrc, float, true, true, nMoments> << <gridSize, blockSize, 0, stream >> > (src, binary, moments.ptr());

if (stream == 0)
cudaSafeCall(cudaStreamSynchronize(stream));
};
};

template <typename TSrc, typename TMoments, int nMoments> struct momentsDispatcher : momentsDispatcherNonChar<TSrc, TMoments, nMoments> {};
template <int nMoments> struct momentsDispatcher<uchar, float, nMoments> : momentsDispatcherChar<uchar, nMoments> {};
template <int nMoments> struct momentsDispatcher<schar, float, nMoments> : momentsDispatcherChar<schar, nMoments> {};

template <typename TSrc, typename TMoments>
void moments(const PtrStepSzb src, PtrStepSzb moments, const bool binary, const int order, const int offsetX, const cudaStream_t stream) {
if (order == 1)
momentsDispatcher<TSrc, TMoments, n1>::call(static_cast<PtrStepSz<TSrc>>(src), static_cast<PtrStepSz<TMoments>>(moments), binary, offsetX, stream);
else if (order == 2)
momentsDispatcher<TSrc, TMoments, n12>::call(static_cast<PtrStepSz<TSrc>>(src), static_cast<PtrStepSz<TMoments>>(moments), binary, offsetX, stream);
else if (order == 3)
momentsDispatcher<TSrc, TMoments, n123>::call(static_cast<PtrStepSz<TSrc>>(src), static_cast<PtrStepSz<TMoments>>(moments), binary, offsetX, stream);
};

template void moments<uchar, float>(const PtrStepSzb src, PtrStepSzb moments, const bool binary, const int order, const int offsetX, const cudaStream_t stream);
template void moments<schar, float>(const PtrStepSzb src, PtrStepSzb moments, const bool binary, const int order, const int offsetX, const cudaStream_t stream);
template void moments<ushort, float>(const PtrStepSzb src, PtrStepSzb moments, const bool binary, const int order, const int offsetX, const cudaStream_t stream);
template void moments<short, float>(const PtrStepSzb src, PtrStepSzb moments, const bool binary, const int order, const int offsetX, const cudaStream_t stream);
template void moments<int, float>(const PtrStepSzb src, PtrStepSzb moments, const bool binary, const int order, const int offsetX, const cudaStream_t stream);
template void moments<float, float>(const PtrStepSzb src, PtrStepSzb moments, const bool binary, const int order, const int offsetX, const cudaStream_t stream);
template void moments<double, float>(const PtrStepSzb src, PtrStepSzb moments, const bool binary, const int order, const int offsetX, const cudaStream_t stream);

template void moments<uchar, double>(const PtrStepSzb src, PtrStepSzb moments, const bool binary, const int order, const int offsetX, const cudaStream_t stream);
template void moments<schar, double>(const PtrStepSzb src, PtrStepSzb moments, const bool binary, const int order, const int offsetX, const cudaStream_t stream);
template void moments<ushort, double>(const PtrStepSzb src, PtrStepSzb moments, const bool binary, const int order, const int offsetX, const cudaStream_t stream);
template void moments<short, double>(const PtrStepSzb src, PtrStepSzb moments, const bool binary, const int order, const int offsetX, const cudaStream_t stream);
template void moments<int, double>(const PtrStepSzb src, PtrStepSzb moments, const bool binary, const int order, const int offsetX, const cudaStream_t stream);
template void moments<float, double>(const PtrStepSzb src, PtrStepSzb moments, const bool binary, const int order, const int offsetX, const cudaStream_t stream);
template void moments<double, double>(const PtrStepSzb src, PtrStepSzb moments, const bool binary, const int order, const int offsetX, const cudaStream_t stream);

}}}}

#endif /* CUDA_DISABLER */
6 changes: 6 additions & 0 deletions modules/cudaimgproc/src/cuda/moments.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
#pragma once
namespace cv { namespace cuda { namespace device { namespace imgproc {
constexpr int n1 = 3;
constexpr int n12 = 6;
constexpr int n123 = 10;
}}}}
Loading

0 comments on commit eca7f1c

Please sign in to comment.