diff --git a/niftyreg_build_version.txt b/niftyreg_build_version.txt index 446dfcc5..5b0cffbc 100644 --- a/niftyreg_build_version.txt +++ b/niftyreg_build_version.txt @@ -1 +1 @@ -369 +370 diff --git a/reg-lib/cuda/BlockSize.hpp b/reg-lib/cuda/BlockSize.hpp index 5483ae59..fe411adb 100644 --- a/reg-lib/cuda/BlockSize.hpp +++ b/reg-lib/cuda/BlockSize.hpp @@ -32,10 +32,6 @@ struct BlockSize { unsigned reg_defField_compose2D; unsigned reg_defField_compose3D; unsigned reg_defField_getJacobianMatrix; - unsigned reg_initialiseConjugateGradient; - unsigned reg_getConjugateGradient1; - unsigned reg_getConjugateGradient2; - unsigned reg_updateControlPointPosition; unsigned reg_voxelCentricToNodeCentric; unsigned reg_convertNmiGradientFromVoxelToRealSpace; unsigned reg_ApplyConvolutionWindowAlongX; @@ -68,10 +64,6 @@ struct BlockSize100: public BlockSize { reg_defField_compose2D = 512; // 15 reg - 24 smem - 08 cmem - 16 lmem reg_defField_compose3D = 384; // 21 reg - 24 smem - 08 cmem - 24 lmem reg_defField_getJacobianMatrix = 512; // 16 reg - 24 smem - 04 cmem - reg_initialiseConjugateGradient = 384; // 09 reg - 24 smem - reg_getConjugateGradient1 = 320; // 12 reg - 24 smem - reg_getConjugateGradient2 = 384; // 10 reg - 40 smem - reg_updateControlPointPosition = 384; // 08 reg - 24 smem reg_voxelCentricToNodeCentric = 320; // 11 reg - 24 smem - 16 cmem reg_convertNmiGradientFromVoxelToRealSpace = 512; // 16 reg - 24 smem reg_ApplyConvolutionWindowAlongX = 512; // 14 reg - 28 smem - 08 cmem @@ -106,10 +98,6 @@ struct BlockSize300: public BlockSize { reg_defField_compose2D = 1024; // 23 reg reg_defField_compose3D = 1024; // 24 reg reg_defField_getJacobianMatrix = 768; // 34 reg - reg_initialiseConjugateGradient = 1024; // 20 reg - reg_getConjugateGradient1 = 1024; // 22 reg - reg_getConjugateGradient2 = 1024; // 25 reg - reg_updateControlPointPosition = 1024; // 22 reg reg_voxelCentricToNodeCentric = 1024; // 23 reg reg_convertNmiGradientFromVoxelToRealSpace = 1024; // 23 reg reg_ApplyConvolutionWindowAlongX = 1024; // 25 reg diff --git a/reg-lib/cuda/CudaCompute.cu b/reg-lib/cuda/CudaCompute.cu index 1b8f140d..9dfae7b0 100644 --- a/reg-lib/cuda/CudaCompute.cu +++ b/reg-lib/cuda/CudaCompute.cu @@ -109,6 +109,51 @@ void CudaCompute::GetDeformationField(bool composition, bool bspline) { bspline); } /* *************************************************************** */ +template +inline void UpdateControlPointPosition(float4 *currentDofCuda, + cudaTextureObject_t bestDofTexture, + cudaTextureObject_t gradientTexture, + const size_t nVoxels, + const float scale) { + thrust::for_each_n(thrust::device, thrust::make_counting_iterator(0), nVoxels, [=]__device__(const int index) { + float4 dofValue = currentDofCuda[index]; scale; // To capture scale + const float4 bestValue = tex1Dfetch(bestDofTexture, index); + const float4 gradValue = tex1Dfetch(gradientTexture, index); + if constexpr (optimiseX) + dofValue.x = bestValue.x + scale * gradValue.x; + if constexpr (optimiseY) + dofValue.y = bestValue.y + scale * gradValue.y; + if constexpr (optimiseZ) + dofValue.z = bestValue.z + scale * gradValue.z; + currentDofCuda[index] = dofValue; + }); +} +/* *************************************************************** */ +template +static inline void UpdateControlPointPosition(float4 *currentDofCuda, + cudaTextureObject_t bestDofTexture, + cudaTextureObject_t gradientTexture, + const size_t nVoxels, + const float scale, + const bool optimiseZ) { + auto updateControlPointPosition = UpdateControlPointPosition; + if (!optimiseZ) updateControlPointPosition = UpdateControlPointPosition; + updateControlPointPosition(currentDofCuda, bestDofTexture, gradientTexture, nVoxels, scale); +} +/* *************************************************************** */ +template +static inline void UpdateControlPointPosition(float4 *currentDofCuda, + cudaTextureObject_t bestDofTexture, + cudaTextureObject_t gradientTexture, + const size_t nVoxels, + const float scale, + const bool optimiseY, + const bool optimiseZ) { + auto updateControlPointPosition = UpdateControlPointPosition; + if (!optimiseY) updateControlPointPosition = UpdateControlPointPosition; + updateControlPointPosition(currentDofCuda, bestDofTexture, gradientTexture, nVoxels, scale, optimiseZ); +} +/* *************************************************************** */ void CudaCompute::UpdateControlPointPosition(float *currentDof, const float *bestDof, const float *gradient, @@ -116,14 +161,16 @@ void CudaCompute::UpdateControlPointPosition(float *currentDof, const bool optimiseX, const bool optimiseY, const bool optimiseZ) { - Cuda::UpdateControlPointPosition(NiftiImage::calcVoxelNumber(dynamic_cast(con).F3dContent::GetControlPointGrid(), 3), - reinterpret_cast(currentDof), - reinterpret_cast(bestDof), - reinterpret_cast(gradient), - scale, - optimiseX, - optimiseY, - optimiseZ); + const nifti_image *controlPointGrid = dynamic_cast(con).F3dContent::GetControlPointGrid(); + const bool is3d = controlPointGrid->nz > 1; + const size_t nVoxels = NiftiImage::calcVoxelNumber(controlPointGrid, 3); + auto bestDofTexturePtr = Cuda::CreateTextureObject(reinterpret_cast(bestDof), nVoxels, cudaChannelFormatKindFloat, 4); + auto gradientTexturePtr = Cuda::CreateTextureObject(reinterpret_cast(gradient), nVoxels, cudaChannelFormatKindFloat, 4); + + auto updateControlPointPosition = ::UpdateControlPointPosition; + if (!optimiseX) updateControlPointPosition = ::UpdateControlPointPosition; + updateControlPointPosition(reinterpret_cast(currentDof), *bestDofTexturePtr, *gradientTexturePtr, + nVoxels, scale, optimiseY, is3d ? optimiseZ : false); } /* *************************************************************** */ void CudaCompute::GetImageGradient(int interpolation, float paddingValue, int activeTimePoint) { diff --git a/reg-lib/cuda/CudaOptimiser.cu b/reg-lib/cuda/CudaOptimiser.cu index 1a094805..587b4f7d 100644 --- a/reg-lib/cuda/CudaOptimiser.cu +++ b/reg-lib/cuda/CudaOptimiser.cu @@ -1,5 +1,4 @@ #include "CudaOptimiser.hpp" -#include "CudaOptimiserKernels.cu" #include "_reg_common_cuda_kernels.cu" #include @@ -178,20 +177,6 @@ void CudaConjugateGradient::Initialise(size_t nvox, NR_FUNC_CALLED(); } /* *************************************************************** */ -void CudaConjugateGradient::UpdateGradientValues() { - if (this->firstCall) { - NR_DEBUG("Conjugate gradient initialisation"); - InitialiseConjugateGradient(this->gradientCuda, this->array1, this->array2, this->GetVoxNumber()); - if (this->isSymmetric) - InitialiseConjugateGradient(this->gradientBwCuda, this->array1Bw, this->array2Bw, this->GetVoxNumberBw()); - this->firstCall = false; - } else { - NR_DEBUG("Conjugate gradient update"); - GetConjugateGradient(this->gradientCuda, this->array1, this->array2, this->GetVoxNumber(), - this->isSymmetric, this->gradientBwCuda, this->array1Bw, this->array2Bw, this->GetVoxNumberBw()); - } -} -/* *************************************************************** */ void CudaConjugateGradient::Optimise(float maxLength, float smallLength, float& startLength) { @@ -204,108 +189,107 @@ void CudaConjugateGradient::Perturbation(float length) { this->firstCall = true; } /* *************************************************************** */ -void CudaConjugateGradient::InitialiseConjugateGradient(float4 *gradientImageCuda, - float4 *conjugateGCuda, - float4 *conjugateHCuda, - const size_t nVoxels) { - auto gradientImageTexture = Cuda::CreateTextureObject(gradientImageCuda, nVoxels, cudaChannelFormatKindFloat, 4); - - const unsigned blocks = CudaContext::GetBlockSize()->reg_initialiseConjugateGradient; - const unsigned grids = (unsigned)Ceil(sqrtf((float)nVoxels / (float)blocks)); - const dim3 gridDims(grids, grids, 1); - const dim3 blockDims(blocks, 1, 1); - - Cuda::InitialiseConjugateGradientKernel<<>>(conjugateGCuda, *gradientImageTexture, (unsigned)nVoxels); - NR_CUDA_CHECK_KERNEL(gridDims, blockDims); - NR_CUDA_SAFE_CALL(cudaMemcpy(conjugateHCuda, conjugateGCuda, nVoxels * sizeof(float4), cudaMemcpyDeviceToDevice)); +void InitialiseConjugateGradient(float4 *gradientCuda, float4 *conjugateGCuda, float4 *conjugateHCuda, const size_t nVoxels) { + auto gradientTexturePtr = Cuda::CreateTextureObject(gradientCuda, nVoxels, cudaChannelFormatKindFloat, 4); + auto gradientTexture = *gradientTexturePtr; + thrust::for_each_n(thrust::device, thrust::make_counting_iterator(0), nVoxels, [=]__device__(const int index) { + const float4 gradValue = tex1Dfetch(gradientTexture, index); + conjugateGCuda[index] = conjugateHCuda[index] = make_float4(-gradValue.x, -gradValue.y, -gradValue.z, 0); + }); } /* *************************************************************** */ -struct Float2Sum { - __host__ __device__ double2 operator()(const float2& a, const float2& b) const { - return make_double2((double)a.x + (double)b.x, (double)a.y + (double)b.y); - } -}; -/* *************************************************************** */ -void CudaConjugateGradient::GetConjugateGradient(float4 *gradientImageCuda, - float4 *conjugateGCuda, - float4 *conjugateHCuda, - const size_t nVoxels, - const bool isSymmetric, - float4 *gradientImageBwCuda, - float4 *conjugateGBwCuda, - float4 *conjugateHBwCuda, - const size_t nVoxelsBw) { - auto gradientImageTexture = Cuda::CreateTextureObject(gradientImageCuda, nVoxels, cudaChannelFormatKindFloat, 4); - auto conjugateGTexture = Cuda::CreateTextureObject(conjugateGCuda, nVoxels, cudaChannelFormatKindFloat, 4); - auto conjugateHTexture = Cuda::CreateTextureObject(conjugateHCuda, nVoxels, cudaChannelFormatKindFloat, 4); - Cuda::UniqueTextureObjectPtr gradientImageBwTexture, conjugateGBwTexture, conjugateHBwTexture; +void GetConjugateGradient(float4 *gradientCuda, + float4 *conjugateGCuda, + float4 *conjugateHCuda, + const size_t nVoxels, + const bool isSymmetric, + float4 *gradientBwCuda, + float4 *conjugateGBwCuda, + float4 *conjugateHBwCuda, + const size_t nVoxelsBw) { + auto gradientTexturePtr = Cuda::CreateTextureObject(gradientCuda, nVoxels, cudaChannelFormatKindFloat, 4); + auto conjugateGTexturePtr = Cuda::CreateTextureObject(conjugateGCuda, nVoxels, cudaChannelFormatKindFloat, 4); + auto conjugateHTexturePtr = Cuda::CreateTextureObject(conjugateHCuda, nVoxels, cudaChannelFormatKindFloat, 4); + auto gradientTexture = *gradientTexturePtr; + auto conjugateGTexture = *conjugateGTexturePtr; + auto conjugateHTexture = *conjugateHTexturePtr; + Cuda::UniqueTextureObjectPtr gradientBwTexturePtr, conjugateGBwTexturePtr, conjugateHBwTexturePtr; + cudaTextureObject_t gradientBwTexture = 0, conjugateGBwTexture = 0, conjugateHBwTexture = 0; if (isSymmetric) { - gradientImageBwTexture = Cuda::CreateTextureObject(gradientImageBwCuda, nVoxelsBw, cudaChannelFormatKindFloat, 4); - conjugateGBwTexture = Cuda::CreateTextureObject(conjugateGBwCuda, nVoxelsBw, cudaChannelFormatKindFloat, 4); - conjugateHBwTexture = Cuda::CreateTextureObject(conjugateHBwCuda, nVoxelsBw, cudaChannelFormatKindFloat, 4); + gradientBwTexturePtr = Cuda::CreateTextureObject(gradientBwCuda, nVoxelsBw, cudaChannelFormatKindFloat, 4); + conjugateGBwTexturePtr = Cuda::CreateTextureObject(conjugateGBwCuda, nVoxelsBw, cudaChannelFormatKindFloat, 4); + conjugateHBwTexturePtr = Cuda::CreateTextureObject(conjugateHBwCuda, nVoxelsBw, cudaChannelFormatKindFloat, 4); + gradientBwTexture = *gradientBwTexturePtr; + conjugateGBwTexture = *conjugateGBwTexturePtr; + conjugateHBwTexture = *conjugateHBwTexturePtr; } // gam = sum((grad+g)*grad)/sum(HxG); - unsigned blocks = CudaContext::GetBlockSize()->reg_getConjugateGradient1; - unsigned grids = (unsigned)Ceil(sqrtf((float)nVoxels / (float)blocks)); - dim3 blockDims(blocks, 1, 1); - dim3 gridDims(grids, grids, 1); + auto calcGam = []__device__(cudaTextureObject_t gradientTexture, cudaTextureObject_t conjugateGTexture, + cudaTextureObject_t conjugateHTexture, const int index) { + const float4 hValue = tex1Dfetch(conjugateHTexture, index); + const float4 gValue = tex1Dfetch(conjugateGTexture, index); + const float gg = gValue.x * hValue.x + gValue.y * hValue.y + gValue.z * hValue.z; + + const float4 grad = tex1Dfetch(gradientTexture, index); + const float dgg = (grad.x + gValue.x) * grad.x + (grad.y + gValue.y) * grad.y + (grad.z + gValue.z) * grad.z; + + return make_double2(dgg, gg); + }; - thrust::device_vector sumsCuda(nVoxels + nVoxels % 2); // Make it even for thrust::inner_product - Cuda::GetConjugateGradientKernel1<<>>(sumsCuda.data().get(), *gradientImageTexture, - *conjugateGTexture, *conjugateHTexture, (unsigned)nVoxels); - NR_CUDA_CHECK_KERNEL(gridDims, blockDims); - const size_t sumsSizeHalf = sumsCuda.size() / 2; - const double2 gg = thrust::inner_product(sumsCuda.begin(), sumsCuda.begin() + sumsSizeHalf, sumsCuda.begin() + sumsSizeHalf, - make_double2(0, 0), thrust::plus(), Float2Sum()); - float gam = static_cast(gg.x / gg.y); + double gam; + thrust::counting_iterator it(0); + const double2 gg = thrust::transform_reduce(thrust::device, it, it + nVoxels, [=]__device__(const int index) { + return calcGam(gradientTexture, conjugateGTexture, conjugateHTexture, index); + }, make_double2(0, 0), thrust::plus()); if (isSymmetric) { - grids = (unsigned)Ceil(sqrtf((float)nVoxelsBw / (float)blocks)); - gridDims = dim3(blocks, 1, 1); - blockDims = dim3(grids, grids, 1); - thrust::device_vector sumsBwCuda(nVoxelsBw + nVoxelsBw % 2); // Make it even for thrust::inner_product - Cuda::GetConjugateGradientKernel1<<>>(sumsBwCuda.data().get(), *gradientImageBwTexture, - *conjugateGBwTexture, *conjugateHBwTexture, (unsigned)nVoxelsBw); - NR_CUDA_CHECK_KERNEL(gridDims, blockDims); - const size_t sumsBwSizeHalf = sumsBwCuda.size() / 2; - const double2 ggBw = thrust::inner_product(sumsBwCuda.begin(), sumsBwCuda.begin() + sumsBwSizeHalf, sumsBwCuda.begin() + sumsBwSizeHalf, - make_double2(0, 0), thrust::plus(), Float2Sum()); - gam = static_cast((gg.x + ggBw.x) / (gg.y + ggBw.y)); - } + it = thrust::counting_iterator(0); + const double2 ggBw = thrust::transform_reduce(thrust::device, it, it + nVoxelsBw, [=]__device__(const int index) { + return calcGam(gradientBwTexture, conjugateGBwTexture, conjugateHBwTexture, index); + }, make_double2(0, 0), thrust::plus()); + gam = (gg.x + ggBw.x) / (gg.y + ggBw.y); + } else gam = gg.x / gg.y; + + // Conjugate gradient + auto conjugate = [gam]__device__(float4 *gradientCuda, float4 *conjugateGCuda, float4 *conjugateHCuda, + cudaTextureObject_t gradientTexture, cudaTextureObject_t conjugateHTexture, const int index) { + // G = -grad + float4 gradGValue = tex1Dfetch(gradientTexture, index); + gradGValue = make_float4(-gradGValue.x, -gradGValue.y, -gradGValue.z, 0); + conjugateGCuda[index] = gradGValue; + + // H = G + gam * H + float4 gradHValue = tex1Dfetch(conjugateHTexture, index); + gradHValue = make_float4(gradGValue.x + gam * gradHValue.x, + gradGValue.y + gam * gradHValue.y, + gradGValue.z + gam * gradHValue.z, 0); + conjugateHCuda[index] = gradHValue; + + gradientCuda[index] = make_float4(-gradHValue.x, -gradHValue.y, -gradHValue.z, 0); + }; - blocks = (unsigned)CudaContext::GetBlockSize()->reg_getConjugateGradient2; - grids = (unsigned)Ceil(sqrtf((float)nVoxels / (float)blocks)); - gridDims = dim3(blocks, 1, 1); - blockDims = dim3(grids, grids, 1); - Cuda::GetConjugateGradientKernel2<<>>(gradientImageCuda, conjugateGCuda, conjugateHCuda, (unsigned)nVoxels, gam); - NR_CUDA_CHECK_KERNEL(gridDims, blockDims); + thrust::for_each_n(thrust::device, thrust::make_counting_iterator(0), nVoxels, [=]__device__(const int index) { + conjugate(gradientCuda, conjugateGCuda, conjugateHCuda, gradientTexture, conjugateHTexture, index); + }); if (isSymmetric) { - grids = (unsigned)Ceil(sqrtf((float)nVoxelsBw / (float)blocks)); - gridDims = dim3(blocks, 1, 1); - blockDims = dim3(grids, grids, 1); - Cuda::GetConjugateGradientKernel2<<>>(gradientImageBwCuda, conjugateGBwCuda, conjugateHBwCuda, (unsigned)nVoxelsBw, gam); - NR_CUDA_CHECK_KERNEL(gridDims, blockDims); + thrust::for_each_n(thrust::device, thrust::make_counting_iterator(0), nVoxelsBw, [=]__device__(const int index) { + conjugate(gradientBwCuda, conjugateGBwCuda, conjugateHBwCuda, gradientBwTexture, conjugateHBwTexture, index); + }); } } /* *************************************************************** */ -void Cuda::UpdateControlPointPosition(const size_t nVoxels, - float4 *controlPointImageCuda, - const float4 *bestControlPointCuda, - const float4 *gradientImageCuda, - const float scale, - const bool optimiseX, - const bool optimiseY, - const bool optimiseZ) { - auto bestControlPointTexture = Cuda::CreateTextureObject(bestControlPointCuda, nVoxels, cudaChannelFormatKindFloat, 4); - auto gradientImageTexture = Cuda::CreateTextureObject(gradientImageCuda, nVoxels, cudaChannelFormatKindFloat, 4); - - const unsigned blocks = (unsigned)CudaContext::GetBlockSize()->reg_updateControlPointPosition; - const unsigned grids = (unsigned)Ceil(sqrtf((float)nVoxels / (float)blocks)); - const dim3 blockDims(blocks, 1, 1); - const dim3 gridDims(grids, grids, 1); - UpdateControlPointPositionKernel<<>>(controlPointImageCuda, *bestControlPointTexture, *gradientImageTexture, - (unsigned)nVoxels, scale, optimiseX, optimiseY, optimiseZ); - NR_CUDA_CHECK_KERNEL(gridDims, blockDims); +void CudaConjugateGradient::UpdateGradientValues() { + if (this->firstCall) { + NR_DEBUG("Conjugate gradient initialisation"); + InitialiseConjugateGradient(this->gradientCuda, this->array1, this->array2, this->GetVoxNumber()); + if (this->isSymmetric) + InitialiseConjugateGradient(this->gradientBwCuda, this->array1Bw, this->array2Bw, this->GetVoxNumberBw()); + this->firstCall = false; + } else { + NR_DEBUG("Conjugate gradient update"); + GetConjugateGradient(this->gradientCuda, this->array1, this->array2, this->GetVoxNumber(), + this->isSymmetric, this->gradientBwCuda, this->array1Bw, this->array2Bw, this->GetVoxNumberBw()); + } } /* *************************************************************** */ } // namespace NiftyReg diff --git a/reg-lib/cuda/CudaOptimiser.hpp b/reg-lib/cuda/CudaOptimiser.hpp index fa9fec4d..56a1aceb 100644 --- a/reg-lib/cuda/CudaOptimiser.hpp +++ b/reg-lib/cuda/CudaOptimiser.hpp @@ -67,19 +67,6 @@ class CudaConjugateGradient: public CudaOptimiser { float4 *array2, *array2Bw; bool firstCall; - void InitialiseConjugateGradient(float4 *gradientImageCuda, - float4 *conjugateGCuda, - float4 *conjugateHCuda, - const size_t nVoxels); - void GetConjugateGradient(float4 *gradientImageCuda, - float4 *conjugateGCuda, - float4 *conjugateHCuda, - const size_t nVoxels, - const bool isSymmetric, - float4 *gradientImageBwCuda, - float4 *conjugateGBwCuda, - float4 *conjugateHBwCuda, - const size_t nVoxelsBw); #ifdef NR_TESTING public: #endif @@ -108,18 +95,5 @@ class CudaConjugateGradient: public CudaOptimiser { virtual void Perturbation(float length) override; }; /* *************************************************************** */ -namespace Cuda { -/* *************************************************************** */ -void UpdateControlPointPosition(const size_t nVoxels, - float4 *controlPointImageCuda, - const float4 *bestControlPointCuda, - const float4 *gradientImageCuda, - const float scale, - const bool optimiseX, - const bool optimiseY, - const bool optimiseZ); -/* *************************************************************** */ -} // namespace Cuda -/* *************************************************************** */ } // namespace NiftyReg /* *************************************************************** */ diff --git a/reg-lib/cuda/CudaOptimiserKernels.cu b/reg-lib/cuda/CudaOptimiserKernels.cu deleted file mode 100644 index 22a56c00..00000000 --- a/reg-lib/cuda/CudaOptimiserKernels.cu +++ /dev/null @@ -1,80 +0,0 @@ -/* *************************************************************** */ -namespace NiftyReg::Cuda { -/* *************************************************************** */ -__global__ void InitialiseConjugateGradientKernel(float4 *conjugateGCuda, - cudaTextureObject_t gradientImageTexture, - const unsigned nVoxels) { - const unsigned tid = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x; - if (tid < nVoxels) { - const float4 gradValue = tex1Dfetch(gradientImageTexture, tid); - conjugateGCuda[tid] = make_float4(-gradValue.x, -gradValue.y, -gradValue.z, 0); - } -} -/* *************************************************************** */ -__global__ void GetConjugateGradientKernel1(float2 *sums, - cudaTextureObject_t gradientImageTexture, - cudaTextureObject_t conjugateGTexture, - cudaTextureObject_t conjugateHTexture, - const unsigned nVoxels) { - const unsigned tid = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x; - if (tid < nVoxels) { - const float4 valueH = tex1Dfetch(conjugateHTexture, tid); - const float4 valueG = tex1Dfetch(conjugateGTexture, tid); - const float gg = valueG.x * valueH.x + valueG.y * valueH.y + valueG.z * valueH.z; - - const float4 grad = tex1Dfetch(gradientImageTexture, tid); - const float dgg = (grad.x + valueG.x) * grad.x + (grad.y + valueG.y) * grad.y + (grad.z + valueG.z) * grad.z; - - sums[tid] = make_float2(dgg, gg); - } -} -/* *************************************************************** */ -__global__ void GetConjugateGradientKernel2(float4 *gradientImageCuda, - float4 *conjugateGCuda, - float4 *conjugateHCuda, - const unsigned nVoxels, - const float scale) { - const unsigned tid = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x; - if (tid < nVoxels) { - // G = - grad - float4 gradGValue = gradientImageCuda[tid]; - gradGValue = make_float4(-gradGValue.x, -gradGValue.y, -gradGValue.z, 0); - conjugateGCuda[tid] = gradGValue; - - // H = G + gam * H - float4 gradHValue = conjugateHCuda[tid]; - gradHValue = make_float4(gradGValue.x + scale * gradHValue.x, - gradGValue.y + scale * gradHValue.y, - gradGValue.z + scale * gradHValue.z, - 0); - conjugateHCuda[tid] = gradHValue; - - gradientImageCuda[tid] = make_float4(-gradHValue.x, -gradHValue.y, -gradHValue.z, 0); - } -} -/* *************************************************************** */ -__global__ void UpdateControlPointPositionKernel(float4 *controlPointImageCuda, - cudaTextureObject_t bestControlPointTexture, - cudaTextureObject_t gradientImageTexture, - const unsigned nVoxels, - const float scale, - const bool optimiseX, - const bool optimiseY, - const bool optimiseZ) { - const unsigned tid = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x; - if (tid < nVoxels) { - float4 value = controlPointImageCuda[tid]; - const float4 bestValue = tex1Dfetch(bestControlPointTexture, tid); - const float4 gradValue = tex1Dfetch(gradientImageTexture, tid); - if (optimiseX) - value.x = bestValue.x + scale * gradValue.x; - if (optimiseY) - value.y = bestValue.y + scale * gradValue.y; - if (optimiseZ) - value.z = bestValue.z + scale * gradValue.z; - controlPointImageCuda[tid] = value; - } -} -/* *************************************************************** */ -} // namespace NiftyReg::Cuda -/* *************************************************************** */