From bc4c672772b44e22ba32a82d00ca521881229dd6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Onur=20=C3=9Clgen?= Date: Wed, 15 Nov 2023 15:25:22 +0000 Subject: [PATCH] Make reg_getVoxelBasedNmiGradient_gpu() on a par with CPU #92 - Optimise reg_getVoxelBasedNmiGradient_gpu() - Get the function ready for multi-timepoint support --- niftyreg_build_version.txt | 2 +- reg-lib/cuda/BlockSize.hpp | 6 - reg-lib/cuda/_reg_nmi_gpu.cu | 184 +++++++---- reg-lib/cuda/_reg_nmi_kernels.cu | 519 ------------------------------- 4 files changed, 116 insertions(+), 595 deletions(-) delete mode 100755 reg-lib/cuda/_reg_nmi_kernels.cu diff --git a/niftyreg_build_version.txt b/niftyreg_build_version.txt index 35329ed8..e5db9a27 100644 --- a/niftyreg_build_version.txt +++ b/niftyreg_build_version.txt @@ -1 +1 @@ -361 +362 diff --git a/reg-lib/cuda/BlockSize.hpp b/reg-lib/cuda/BlockSize.hpp index 06beca8a..a86430ec 100644 --- a/reg-lib/cuda/BlockSize.hpp +++ b/reg-lib/cuda/BlockSize.hpp @@ -14,8 +14,6 @@ namespace NiftyReg { /* *************************************************************** */ struct BlockSize { - unsigned reg_getVoxelBasedNmiGradientUsingPw2D; - unsigned reg_getVoxelBasedNmiGradientUsingPw3D; unsigned reg_affine_getDeformationField; unsigned reg_spline_getDeformationField2D; unsigned reg_spline_getDeformationField3D; @@ -54,8 +52,6 @@ struct BlockSize { /* *************************************************************** */ struct BlockSize100: public BlockSize { BlockSize100() { - reg_getVoxelBasedNmiGradientUsingPw2D = 384; // 21 reg - 24 smem - 32 cmem - reg_getVoxelBasedNmiGradientUsingPw3D = 320; // 25 reg - 24 smem - 32 cmem reg_affine_getDeformationField = 512; // 16 reg - 24 smem reg_spline_getDeformationField2D = 384; // 20 reg - 6168 smem - 28 cmem reg_spline_getDeformationField3D = 192; // 37 reg - 6168 smem - 28 cmem @@ -96,8 +92,6 @@ struct BlockSize100: public BlockSize { /* *************************************************************** */ struct BlockSize300: public BlockSize { BlockSize300() { - reg_getVoxelBasedNmiGradientUsingPw2D = 768; // 38 reg - reg_getVoxelBasedNmiGradientUsingPw3D = 640; // 45 reg reg_affine_getDeformationField = 1024; // 23 reg reg_spline_getDeformationField2D = 1024; // 34 reg reg_spline_getDeformationField3D = 1024; // 34 reg diff --git a/reg-lib/cuda/_reg_nmi_gpu.cu b/reg-lib/cuda/_reg_nmi_gpu.cu index f48fff8f..d0c3056d 100755 --- a/reg-lib/cuda/_reg_nmi_gpu.cu +++ b/reg-lib/cuda/_reg_nmi_gpu.cu @@ -11,7 +11,7 @@ */ #include "_reg_nmi_gpu.h" -#include "_reg_nmi_kernels.cu" +#include "_reg_common_cuda_kernels.cu" /* *************************************************************** */ reg_nmi_gpu::reg_nmi_gpu(): reg_nmi::reg_nmi() { @@ -298,95 +298,141 @@ double reg_nmi_gpu::GetSimilarityMeasureValueBw() { this->approximatePw); } /* *************************************************************** */ +template struct Derivative { using Type = double3; }; +template<> struct Derivative { using Type = double2; }; +/* *************************************************************** */ /// Called when we only have one target and one source image +template void reg_getVoxelBasedNmiGradient_gpu(const nifti_image *referenceImage, const cudaArray *referenceImageCuda, const float *warpedImageCuda, const float4 *warpedGradientCuda, - const float *logJointHistogramCuda, + const double *jointHistogramLogCuda, float4 *voxelBasedGradientCuda, const int *maskCuda, const size_t activeVoxelNumber, const double *entropies, - const int refBinning, - const int floBinning) { - auto blockSize = CudaContext::GetBlockSize(); + const int refBinNumber, + const int floBinNumber, + const int totalBinNumber, + const double timePointWeight, + const int currentTimePoint) { const size_t voxelNumber = NiftiImage::calcVoxelNumber(referenceImage, 3); const int3 imageSize = make_int3(referenceImage->nx, referenceImage->ny, referenceImage->nz); - const int binNumber = refBinning * floBinning + refBinning + floBinning; - const float normalisedJE = (float)(entropies[2] * entropies[3]); - const float nmi = (float)((entropies[0] + entropies[1]) / entropies[2]); + const double normalisedJE = entropies[2] * entropies[3]; + const double nmi = (entropies[0] + entropies[1]) / entropies[2]; + const int referenceOffset = refBinNumber * floBinNumber; + const int floatingOffset = referenceOffset + refBinNumber; - auto referenceImageTexture = Cuda::CreateTextureObject(referenceImageCuda, cudaResourceTypeArray, 0, - cudaChannelFormatKindNone, 1, cudaFilterModePoint, true); - auto warpedImageTexture = Cuda::CreateTextureObject(warpedImageCuda, cudaResourceTypeLinear, voxelNumber * sizeof(float), - cudaChannelFormatKindFloat, 1); - auto warpedGradientTexture = Cuda::CreateTextureObject(warpedGradientCuda, cudaResourceTypeLinear, voxelNumber * sizeof(float4), - cudaChannelFormatKindFloat, 4); - auto histogramTexture = Cuda::CreateTextureObject(logJointHistogramCuda, cudaResourceTypeLinear, binNumber * sizeof(float), - cudaChannelFormatKindFloat, 1); - auto maskTexture = Cuda::CreateTextureObject(maskCuda, cudaResourceTypeLinear, activeVoxelNumber * sizeof(int), - cudaChannelFormatKindSigned, 1); + auto referenceImageTexturePtr = Cuda::CreateTextureObject(referenceImageCuda, cudaResourceTypeArray, 0, + cudaChannelFormatKindNone, 1, cudaFilterModePoint, true); + auto warpedImageTexturePtr = Cuda::CreateTextureObject(warpedImageCuda + currentTimePoint * voxelNumber, cudaResourceTypeLinear, + voxelNumber * sizeof(float), cudaChannelFormatKindFloat, 1); + auto warpedGradientTexturePtr = Cuda::CreateTextureObject(warpedGradientCuda, cudaResourceTypeLinear, voxelNumber * sizeof(float4), + cudaChannelFormatKindFloat, 4); + auto maskTexturePtr = Cuda::CreateTextureObject(maskCuda, cudaResourceTypeLinear, activeVoxelNumber * sizeof(int), + cudaChannelFormatKindSigned, 1); + auto referenceImageTexture = *referenceImageTexturePtr; + auto warpedImageTexture = *warpedImageTexturePtr; + auto warpedGradientTexture = *warpedGradientTexturePtr; + auto maskTexture = *maskTexturePtr; - if (referenceImage->nz > 1) { - const unsigned blocks = blockSize->reg_getVoxelBasedNmiGradientUsingPw3D; - const unsigned grids = (unsigned)Ceil(sqrtf((float)activeVoxelNumber / (float)blocks)); - const dim3 gridDims(grids, grids, 1); - const dim3 blockDims(blocks, 1, 1); - reg_getVoxelBasedNmiGradientUsingPw3D_kernel<<>>(voxelBasedGradientCuda, *referenceImageTexture, *warpedImageTexture, - *warpedGradientTexture, *histogramTexture, *maskTexture, - imageSize, refBinning, floBinning, normalisedJE, nmi, - (unsigned)activeVoxelNumber); - NR_CUDA_CHECK_KERNEL(gridDims, blockDims); - } else { - const unsigned blocks = blockSize->reg_getVoxelBasedNmiGradientUsingPw2D; - const unsigned grids = (unsigned)Ceil(sqrtf((float)activeVoxelNumber / (float)blocks)); - const dim3 gridDims(grids, grids, 1); - const dim3 blockDims(blocks, 1, 1); - reg_getVoxelBasedNmiGradientUsingPw2D_kernel<<>>(voxelBasedGradientCuda, *referenceImageTexture, *warpedImageTexture, - *warpedGradientTexture, *histogramTexture, *maskTexture, - imageSize, refBinning, floBinning, normalisedJE, nmi, - (unsigned)activeVoxelNumber); - NR_CUDA_CHECK_KERNEL(gridDims, blockDims); - } + thrust::for_each_n(thrust::device, thrust::make_counting_iterator(0), activeVoxelNumber, [=]__device__(const unsigned index) { + const int targetIndex = tex1Dfetch(maskTexture, index); + const float warpedImageValue = tex1Dfetch(warpedImageTexture, targetIndex); + if (warpedImageValue != warpedImageValue) return; + const auto&& [x, y, z] = reg_indexToDims_cuda(targetIndex, imageSize); + const float referenceImageValue = tex3D(referenceImageTexture, + (float(x) + 0.5f) / float(imageSize.x), + (float(y) + 0.5f) / float(imageSize.y), + is3d ? (float(z) + 0.5f) / float(imageSize.z) : 0.5f); + if (referenceImageValue != referenceImageValue) return; + const float4& warpedGradValue = tex1Dfetch(warpedGradientTexture, index); + float4 gradValue = voxelBasedGradientCuda[targetIndex]; + + // No computation is performed if any of the point is part of the background + // The two is added because the image is resample between 2 and bin+2 + // if 64 bins are used the histogram will have 68 bins et the image will be between 2 and 65 + typename Derivative::Type jointDeriv{}, refDeriv{}, warDeriv{}; + for (int r = (int)referenceImageValue - 1; r < (int)referenceImageValue + 3; ++r) { + if (-1 < r && r < refBinNumber) { + for (int w = (int)warpedImageValue - 1; w < (int)warpedImageValue + 3; ++w) { + if (-1 < w && w < floBinNumber) { + const double commonValue = (GetBasisSplineValue(referenceImageValue - r) * + GetBasisSplineDerivativeValue(warpedImageValue - w)); + const double jointLog = jointHistogramLogCuda[r + w * refBinNumber]; + const double refLog = jointHistogramLogCuda[r + referenceOffset]; + const double warLog = jointHistogramLogCuda[w + floatingOffset]; + if (warpedGradValue.x == warpedGradValue.x) { + const double commonMultGrad = commonValue * warpedGradValue.x; + jointDeriv.x += commonMultGrad * jointLog; + refDeriv.x += commonMultGrad * refLog; + warDeriv.x += commonMultGrad * warLog; + } + if (warpedGradValue.y == warpedGradValue.y) { + const double commonMultGrad = commonValue * warpedGradValue.y; + jointDeriv.y += commonMultGrad * jointLog; + refDeriv.y += commonMultGrad * refLog; + warDeriv.y += commonMultGrad * warLog; + } + if constexpr (is3d) { + if (warpedGradValue.z == warpedGradValue.z) { + const double commonMultGrad = commonValue * warpedGradValue.z; + jointDeriv.z += commonMultGrad * jointLog; + refDeriv.z += commonMultGrad * refLog; + warDeriv.z += commonMultGrad * warLog; + } + } + } + } + } + } + + // (Marc) I removed the normalisation by the voxel number as each gradient has to be normalised in the same way + gradValue.x += static_cast(timePointWeight * (refDeriv.x + warDeriv.x - nmi * jointDeriv.x) / normalisedJE); + gradValue.y += static_cast(timePointWeight * (refDeriv.y + warDeriv.y - nmi * jointDeriv.y) / normalisedJE); + if constexpr (is3d) + gradValue.z += static_cast(timePointWeight * (refDeriv.z + warDeriv.z - nmi * jointDeriv.z) / normalisedJE); + voxelBasedGradientCuda[targetIndex] = gradValue; + }); } /* *************************************************************** */ void reg_nmi_gpu::GetVoxelBasedSimilarityMeasureGradientFw(int currentTimePoint) { // Call compute similarity measure to calculate joint histogram this->GetSimilarityMeasureValue(); - // The latest joint histogram is transferred onto the GPU - thrust::device_vector jointHistogramLogCuda(this->jointHistogramLog[0], this->jointHistogramLog[0] + this->totalBinNumber[0]); - - // The gradient of the NMI is computed on the GPU - reg_getVoxelBasedNmiGradient_gpu(this->referenceImage, - this->referenceImageCuda, - this->warpedImageCuda, - this->warpedGradientCuda, - jointHistogramLogCuda.data().get(), - this->voxelBasedGradientCuda, - this->referenceMaskCuda, - this->activeVoxelNumber, - this->entropyValues[0], - this->referenceBinNumber[0], - this->floatingBinNumber[0]); + auto getVoxelBasedNmiGradient = this->referenceImage->nz > 1 ? reg_getVoxelBasedNmiGradient_gpu : reg_getVoxelBasedNmiGradient_gpu; + getVoxelBasedNmiGradient(this->referenceImage, + this->referenceImageCuda, + this->warpedImageCuda, + this->warpedGradientCuda, + this->jointHistogramLogCudaVecs[currentTimePoint].data().get(), + this->voxelBasedGradientCuda, + this->referenceMaskCuda, + this->activeVoxelNumber, + this->entropyValues[currentTimePoint], + this->referenceBinNumber[currentTimePoint], + this->floatingBinNumber[currentTimePoint], + this->totalBinNumber[currentTimePoint], + this->timePointWeights[currentTimePoint], + currentTimePoint); } /* *************************************************************** */ void reg_nmi_gpu::GetVoxelBasedSimilarityMeasureGradientBw(int currentTimePoint) { - // The latest joint histogram is transferred onto the GPU - thrust::device_vector jointHistogramLogCudaBw(this->jointHistogramLogBw[0], this->jointHistogramLogBw[0] + this->totalBinNumber[0]); - - // The gradient of the NMI is computed on the GPU - reg_getVoxelBasedNmiGradient_gpu(this->floatingImage, - this->floatingImageCuda, - this->warpedImageBwCuda, - this->warpedGradientBwCuda, - jointHistogramLogCudaBw.data().get(), - this->voxelBasedGradientBwCuda, - this->floatingMaskCuda, - this->activeVoxelNumber, - this->entropyValuesBw[0], - this->floatingBinNumber[0], - this->referenceBinNumber[0]); + auto getVoxelBasedNmiGradient = this->floatingImage->nz > 1 ? reg_getVoxelBasedNmiGradient_gpu : reg_getVoxelBasedNmiGradient_gpu; + getVoxelBasedNmiGradient(this->floatingImage, + this->floatingImageCuda, + this->warpedImageBwCuda, + this->warpedGradientBwCuda, + this->jointHistogramLogBwCudaVecs[currentTimePoint].data().get(), + this->voxelBasedGradientBwCuda, + this->floatingMaskCuda, + this->activeVoxelNumber, + this->entropyValuesBw[currentTimePoint], + this->floatingBinNumber[currentTimePoint], + this->referenceBinNumber[currentTimePoint], + this->totalBinNumber[currentTimePoint], + this->timePointWeights[currentTimePoint], + currentTimePoint); } /* *************************************************************** */ diff --git a/reg-lib/cuda/_reg_nmi_kernels.cu b/reg-lib/cuda/_reg_nmi_kernels.cu deleted file mode 100755 index 0da6c415..00000000 --- a/reg-lib/cuda/_reg_nmi_kernels.cu +++ /dev/null @@ -1,519 +0,0 @@ -/* - * _reg_mutualinformation_kernels.cu - * - * - * Created by Marc Modat on 24/03/2009. - * Copyright (c) 2009-2018, University College London - * Copyright (c) 2018, NiftyReg Developers. - * All rights reserved. - * See the LICENSE.txt file in the nifty_reg root folder - * - */ - -#include "_reg_common_cuda_kernels.cu" - -#define COEFF_L 0.16666666f -#define COEFF_C 0.66666666f -#define COEFF_B 0.83333333f - -/* *************************************************************** */ -__device__ float GetBasisSplineValue(float x) { - x = fabsf(x); - float value = 0.0f; - if (x < 2.0f) - if (x < 1.0f) - value = 2.0f / 3.0f + (0.5f * x - 1.0f) * x * x; - else { - x -= 2.0f; - value = -x * x * x / 6.0f; - } - return value; -} -/* *************************************************************** */ -__device__ float GetBasisSplineDerivativeValue(const float& ori) { - float x = fabsf(ori); - float value = 0.0f; - if (x < 2.0f) - if (x < 1.0f) - value = (1.5f * x - 2.0f) * ori; - else { - x -= 2.0f; - value = -0.5f * x * x; - if (ori < 0.0f) value = -value; - } - return value; -} -/* *************************************************************** */ -__global__ void reg_getVoxelBasedNmiGradientUsingPw2D_kernel(float4 *voxelBasedGradient, - cudaTextureObject_t referenceImageTexture, - cudaTextureObject_t warpedImageTexture, - cudaTextureObject_t warpedGradientTexture, - cudaTextureObject_t histogramTexture, - cudaTextureObject_t maskTexture, - const int3 imageSize, - const int refBinning, - const int floBinning, - const float normalisedJE, - const float nmi, - const unsigned activeVoxelNumber) { - const unsigned tid = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x; - if (tid < activeVoxelNumber) { - const int targetIndex = tex1Dfetch(maskTexture, tid); - int quot, rem; - reg_div_cuda(targetIndex, imageSize.x, quot, rem); - const int y = quot, x = rem; - - const float referenceImageValue = tex3D(referenceImageTexture, - ((float)x + 0.5f) / (float)imageSize.x, - ((float)y + 0.5f) / (float)imageSize.y, - 0.5f); - const float warpedImageValue = tex1Dfetch(warpedImageTexture, targetIndex); - const float4 warpedImageGradient = tex1Dfetch(warpedGradientTexture, tid); - - float4 gradValue{}; - - // No computation is performed if any of the point is part of the background - // The two is added because the image is resample between 2 and bin +2 - // if 64 bins are used the histogram will have 68 bins et the image will be between 2 and 65 - if (0.f < referenceImageValue && referenceImageValue < refBinning && - 0.f < warpedImageValue && warpedImageValue < floBinning && - referenceImageValue == referenceImageValue && warpedImageValue == warpedImageValue) { - const float2 resDeriv = make_float2(warpedImageGradient.x, warpedImageGradient.y); - if (resDeriv.x == resDeriv.x && resDeriv.y == resDeriv.y) { - float jointEntropyDerivative_X = 0.0f; - float warpedEntropyDerivative_X = 0.0f; - float referenceEntropyDerivative_X = 0.0f; - float jointEntropyDerivative_Y = 0.0f; - float warpedEntropyDerivative_Y = 0.0f; - float referenceEntropyDerivative_Y = 0.0f; - for (int r = (int)referenceImageValue - 1; r < (int)referenceImageValue + 3; ++r) { - if (-1 < r && r < refBinning) { - for (int w = (int)warpedImageValue - 1; w < (int)warpedImageValue + 3; ++w) { - if (-1 < w && w < floBinning) { - const float commonValue = (GetBasisSplineValue(referenceImageValue - (float)r) * - GetBasisSplineDerivativeValue(warpedImageValue - (float)w)); - - const float jointLog = tex1Dfetch(histogramTexture, w * floBinning + r); - const float targetLog = tex1Dfetch(histogramTexture, refBinning * floBinning + r); - const float resultLog = tex1Dfetch(histogramTexture, refBinning * floBinning + refBinning + w); - - float temp = commonValue * resDeriv.x; - jointEntropyDerivative_X += temp * jointLog; - referenceEntropyDerivative_X += temp * targetLog; - warpedEntropyDerivative_X += temp * resultLog; - - temp = commonValue * resDeriv.y; - jointEntropyDerivative_Y += temp * jointLog; - referenceEntropyDerivative_Y += temp * targetLog; - warpedEntropyDerivative_Y += temp * resultLog; - } // O(maskTexture, tid); - int quot, rem; - reg_div_cuda(targetIndex, imageSize.x * imageSize.y, quot, rem); - const int z = quot; - reg_div_cuda(rem, imageSize.x, quot, rem); - const int y = quot, x = rem; - - const float referenceImageValue = tex3D(referenceImageTexture, - ((float)x + 0.5f) / (float)imageSize.x, - ((float)y + 0.5f) / (float)imageSize.y, - ((float)z + 0.5f) / (float)imageSize.z); - const float warpedImageValue = tex1Dfetch(warpedImageTexture, targetIndex); - const float4 warpedImageGradient = tex1Dfetch(warpedGradientTexture, tid); - - float4 gradValue{}; - - // No computation is performed if any of the point is part of the background - // The two is added because the image is resample between 2 and bin +2 - // if 64 bins are used the histogram will have 68 bins et the image will be between 2 and 65 - if (0.f < referenceImageValue && referenceImageValue < refBinning && - 0.f < warpedImageValue && warpedImageValue < floBinning && - referenceImageValue == referenceImageValue && warpedImageValue == warpedImageValue) { - const float3 resDeriv = make_float3(warpedImageGradient.x, warpedImageGradient.y, warpedImageGradient.z); - if (resDeriv.x == resDeriv.x && resDeriv.y == resDeriv.y && resDeriv.z == resDeriv.z) { - float jointEntropyDerivative_X = 0.0f; - float warpedEntropyDerivative_X = 0.0f; - float referenceEntropyDerivative_X = 0.0f; - float jointEntropyDerivative_Y = 0.0f; - float warpedEntropyDerivative_Y = 0.0f; - float referenceEntropyDerivative_Y = 0.0f; - float jointEntropyDerivative_Z = 0.0f; - float warpedEntropyDerivative_Z = 0.0f; - float referenceEntropyDerivative_Z = 0.0f; - for (int r = (int)referenceImageValue - 1; r < (int)referenceImageValue + 3; ++r) { - if (-1 < r && r < refBinning) { - for (int w = (int)warpedImageValue - 1; w < (int)warpedImageValue + 3; ++w) { - if (-1 < w && w < floBinning) { - const float commonValue = (GetBasisSplineValue(referenceImageValue - (float)r) * - GetBasisSplineDerivativeValue(warpedImageValue - (float)w)); - - const float jointLog = tex1Dfetch(histogramTexture, w * floBinning + r); - const float targetLog = tex1Dfetch(histogramTexture, refBinning * floBinning + r); - const float resultLog = tex1Dfetch(histogramTexture, refBinning * floBinning + refBinning + w); - - float temp = commonValue * resDeriv.x; - jointEntropyDerivative_X += temp * jointLog; - referenceEntropyDerivative_X += temp * targetLog; - warpedEntropyDerivative_X += temp * resultLog; - - temp = commonValue * resDeriv.y; - jointEntropyDerivative_Y += temp * jointLog; - referenceEntropyDerivative_Y += temp * targetLog; - warpedEntropyDerivative_Y += temp * resultLog; - - temp = commonValue * resDeriv.z; - jointEntropyDerivative_Z += temp * jointLog; - referenceEntropyDerivative_Z += temp * targetLog; - warpedEntropyDerivative_Z += temp * resultLog; - } // O= 0.0f && - voxelValues.y >= 0.0f && - voxelValues.z >= 0.0f && - voxelValues.w >= 0.0f && - voxelValues.x < c_firstTargetBin && - voxelValues.y < c_secondTargetBin && - voxelValues.z < c_firstResultBin && - voxelValues.w < c_secondResultBin) { - voxelValues.x = (float)((int)voxelValues.x); - voxelValues.y = (float)((int)voxelValues.y); - voxelValues.z = (float)((int)voxelValues.z); - voxelValues.w = (float)((int)voxelValues.w); - - if (firstwarpedImageGradient.x == firstwarpedImageGradient.x && - firstwarpedImageGradient.y == firstwarpedImageGradient.y && - firstwarpedImageGradient.z == firstwarpedImageGradient.z && - secondwarpedImageGradient.x == secondwarpedImageGradient.x && - secondwarpedImageGradient.y == secondwarpedImageGradient.y && - secondwarpedImageGradient.z == secondwarpedImageGradient.z) { - float jointEntropyDerivative_X = 0.0f; - float warpedEntropyDerivative_X = 0.0f; - float referenceEntropyDerivative_X = 0.0f; - - float jointEntropyDerivative_Y = 0.0f; - float warpedEntropyDerivative_Y = 0.0f; - float referenceEntropyDerivative_Y = 0.0f; - - float jointEntropyDerivative_Z = 0.0f; - float warpedEntropyDerivative_Z = 0.0f; - float referenceEntropyDerivative_Z = 0.0f; - - float jointLog, targetLog, resultLog, temp; - float4 relative_pos = make_float4(0.0f, 0.0f, 0.0f, 0.0f); - - float s_x, s_y, s_z, s_w; - float common_target_value = 0.0f; - int target_flat_index, result_flat_index, total_target_entries, num_probabilities; - for (int i = -1; i < 2; ++i) { - relative_pos.x = (int)(voxelValues.x + i); - - if (-1 < relative_pos.x && relative_pos.x < c_firstTargetBin) { - for (int j = -1; j < 2; ++j) { - relative_pos.y = (int)(voxelValues.y + j); - - if (-1 < relative_pos.y && relative_pos.y < c_secondTargetBin) { - s_x = GetBasisSplineValue(relative_pos.x - voxelValues.x); - s_y = GetBasisSplineValue(relative_pos.y - voxelValues.y); - common_target_value = s_x * s_y; - - for (int k = -1; k < 2; ++k) { - relative_pos.z = (int)(voxelValues.z + k); - if (-1 < relative_pos.z && relative_pos.z < c_firstResultBin) { - s_x = GetBasisSplineDerivativeValue(relative_pos.z - voxelValues.z); - s_w = GetBasisSplineValue(relative_pos.z - voxelValues.z); - for (int l = -1; l < 2; ++l) { - relative_pos.w = (int)(voxelValues.w + l); - if (-1 < relative_pos.w && relative_pos.w < c_secondResultBin) { - target_flat_index = relative_pos.x + relative_pos.y * c_firstTargetBin; - result_flat_index = relative_pos.z + relative_pos.w * c_firstResultBin; - total_target_entries = c_firstTargetBin * c_secondTargetBin; - num_probabilities = total_target_entries * c_firstResultBin * c_secondResultBin; - - jointLog = tex1Dfetch(histogramTexture, target_flat_index + (result_flat_index * total_target_entries)); - targetLog = tex1Dfetch(histogramTexture, num_probabilities + target_flat_index); - resultLog = tex1Dfetch(histogramTexture, num_probabilities + total_target_entries + result_flat_index); - - // Contribution from floating images. These arithmetic operations use - // a lot of registers. Need to look into whether this can be reduced somehow. - s_y = GetBasisSplineValue(relative_pos.w - voxelValues.w); - s_z = GetBasisSplineDerivativeValue(relative_pos.w - voxelValues.w); - temp = (s_x * firstwarpedImageGradient.x * s_y) + - (s_z * secondwarpedImageGradient.x * s_w); - temp *= common_target_value; - - jointEntropyDerivative_X -= temp * jointLog; - referenceEntropyDerivative_X -= temp * targetLog; - warpedEntropyDerivative_X -= temp * resultLog; - - temp = (s_x * firstwarpedImageGradient.y * s_y) + - (s_z * secondwarpedImageGradient.y * s_w); - temp *= common_target_value; - jointEntropyDerivative_Y -= temp * jointLog; - referenceEntropyDerivative_Y -= temp * targetLog; - warpedEntropyDerivative_Y -= temp * resultLog; - - temp = (s_x * firstwarpedImageGradient.z * s_y) + - (s_z * secondwarpedImageGradient.z * s_w); - temp *= common_target_value; - jointEntropyDerivative_Z -= temp * jointLog; - referenceEntropyDerivative_Z -= temp * targetLog; - warpedEntropyDerivative_Z -= temp * resultLog; - } - } - } - } - } - } - } - } - - gradValue.x = (referenceEntropyDerivative_X + warpedEntropyDerivative_X - c_NMI * jointEntropyDerivative_X) / c_NormalisedJE; - gradValue.y = (referenceEntropyDerivative_Y + warpedEntropyDerivative_Y - c_NMI * jointEntropyDerivative_Y) / c_NormalisedJE; - gradValue.z = (referenceEntropyDerivative_Z + warpedEntropyDerivative_Z - c_NMI * jointEntropyDerivative_Z) / c_NormalisedJE; - } - } - voxelBasedGradient[targetIndex] = gradValue; - } -} */ -/* *************************************************************** */ -/* __global__ void reg_smoothJointHistogramX_kernel(float *tempHistogram) { - const int tid = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x; - if (tid < c_secondTargetBin * c_firstResultBin * c_secondResultBin) { - // The starting index is computed - unsigned startingPoint = tid * c_firstTargetBin; - unsigned finishPoint = startingPoint + c_firstTargetBin; - - // The first point is computed - tempHistogram[startingPoint] = (tex1Dfetch(histogramTexture, startingPoint) * COEFF_C + - tex1Dfetch(histogramTexture, startingPoint + 1) * COEFF_L) / COEFF_B; - // The middle points are computed - for (unsigned i = startingPoint + 1; i < finishPoint - 1; ++i) { - tempHistogram[i] = tex1Dfetch(histogramTexture, i - 1) * COEFF_L + - tex1Dfetch(histogramTexture, i) * COEFF_C + - tex1Dfetch(histogramTexture, i + 1) * COEFF_L; - } - // The last point is computed - tempHistogram[finishPoint - 1] = (tex1Dfetch(histogramTexture, finishPoint - 2) * COEFF_L + - tex1Dfetch(histogramTexture, finishPoint - 1) * COEFF_C) / COEFF_B; - } -} */ -/* *************************************************************** */ -/* __global__ void reg_smoothJointHistogramY_kernel(float *tempHistogram) { - const int tid = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x; - if (tid < c_firstTargetBin * c_firstResultBin * c_secondResultBin) { - // The starting index is computed - unsigned startingPoint = tid + c_firstTargetBin * (c_secondTargetBin - 1) * (c_firstResultBin * (int)(tid / (c_firstTargetBin * c_firstResultBin)) + - (int)(tid / c_firstTargetBin - c_firstResultBin * (int)(tid / (c_firstTargetBin * c_firstResultBin)))); - unsigned increment = c_firstTargetBin; - unsigned finishPoint = startingPoint + increment * c_secondTargetBin; - - // The first point is computed - tempHistogram[startingPoint] = (tex1Dfetch(histogramTexture, startingPoint) * COEFF_C + - tex1Dfetch(histogramTexture, startingPoint + increment) * COEFF_L) / COEFF_B; - // The middle points are computed - for (unsigned i = startingPoint + increment; i < finishPoint - increment; i += increment) { - tempHistogram[i] = tex1Dfetch(histogramTexture, i - increment) * COEFF_L + - tex1Dfetch(histogramTexture, i) * COEFF_C + - tex1Dfetch(histogramTexture, i + increment) * COEFF_L; - } - // The last point is computed - tempHistogram[finishPoint - increment] = (tex1Dfetch(histogramTexture, finishPoint - 2 * increment) * COEFF_L + - tex1Dfetch(histogramTexture, finishPoint - increment) * COEFF_C) / COEFF_B; - } -} */ -/* *************************************************************** */ -/* __global__ void reg_smoothJointHistogramZ_kernel(float *tempHistogram) { - const int tid = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x; - if (tid < c_firstTargetBin * c_secondTargetBin * c_secondResultBin) { - // The starting index is computed - unsigned startingPoint = tid + c_firstTargetBin * c_secondTargetBin * (c_firstResultBin - 1) * (int)(tid / (c_firstTargetBin * c_secondTargetBin)); - unsigned increment = c_firstTargetBin * c_secondTargetBin; - unsigned finishPoint = startingPoint + increment * c_firstResultBin; - - // The first point is computed - tempHistogram[startingPoint] = (tex1Dfetch(histogramTexture, startingPoint) * COEFF_C + - tex1Dfetch(histogramTexture, startingPoint + increment) * COEFF_L) / COEFF_B; - // The middle points are computed - for (unsigned i = startingPoint + increment; i < finishPoint - increment; i += increment) { - tempHistogram[i] = tex1Dfetch(histogramTexture, i - increment) * COEFF_L + - tex1Dfetch(histogramTexture, i) * COEFF_C + - tex1Dfetch(histogramTexture, i + increment) * COEFF_L; - } - // The last point is computed - tempHistogram[finishPoint - increment] = (tex1Dfetch(histogramTexture, finishPoint - 2 * increment) * COEFF_L + - tex1Dfetch(histogramTexture, finishPoint - increment) * COEFF_C) / COEFF_B; - } -} */ -/* *************************************************************** */ -/* __global__ void reg_smoothJointHistogramW_kernel(float *tempHistogram) { - const int tid = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x; - if (tid < c_firstTargetBin * c_secondTargetBin * c_firstResultBin) { - // The starting index is computed - unsigned startingPoint = tid; - unsigned increment = c_firstTargetBin * c_secondTargetBin * c_firstResultBin; - unsigned finishPoint = increment * c_secondResultBin; - - // The first point is computed - tempHistogram[startingPoint] = (tex1Dfetch(histogramTexture, startingPoint) * COEFF_C + - tex1Dfetch(histogramTexture, startingPoint + increment) * COEFF_L) / COEFF_B; - // The middle points are computed - for (unsigned i = startingPoint + increment; i < finishPoint - increment; i += increment) { - tempHistogram[i] = tex1Dfetch(histogramTexture, i - increment) * COEFF_L + - tex1Dfetch(histogramTexture, i) * COEFF_C + - tex1Dfetch(histogramTexture, i + increment) * COEFF_L; - } - // The last point is computed - tempHistogram[finishPoint - increment] = (tex1Dfetch(histogramTexture, finishPoint - 2 * increment) * COEFF_L + - tex1Dfetch(histogramTexture, finishPoint - increment) * COEFF_C) / COEFF_B; - } -} */ -/* *************************************************************** */ -// Kernels for marginalisation along the different axes -/* __global__ void reg_marginaliseTargetX_kernel(float *babyHisto) { - const int tid = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x; - if (tid < c_secondTargetBin * c_firstResultBin * c_secondResultBin) { - unsigned startingPoint = tid * c_firstTargetBin; - unsigned finishPoint = startingPoint + c_firstTargetBin; - - float sum = tex1Dfetch(histogramTexture, startingPoint); - float c = 0.f, Y, t; - for (unsigned i = startingPoint + 1; i < finishPoint; ++i) { - Y = tex1Dfetch(histogramTexture, i) - c; - t = sum + Y; - c = (t - sum) - Y; - sum = t; - } - babyHisto[tid] = sum; - } -} */ -/* *************************************************************** */ -/* __global__ void reg_marginaliseTargetXY_kernel(float *babyHisto) { - const int tid = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x; - if (tid < c_firstResultBin * c_secondResultBin) { - unsigned startingPoint = tid * c_secondTargetBin; - unsigned finishPoint = startingPoint + c_secondTargetBin; - - float sum = tex1Dfetch(histogramTexture, startingPoint); - float c = 0.f, Y, t; - for (unsigned i = startingPoint + 1; i < finishPoint; ++i) { - Y = tex1Dfetch(histogramTexture, i) - c; - t = sum + Y; - c = (t - sum) - Y; - sum = t; - } - babyHisto[tid] = sum; - } -} */ -/* *************************************************************** */ -/* __global__ void reg_marginaliseResultX_kernel(float *babyHisto) { - const int tid = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x; - if (tid < c_firstTargetBin * c_secondTargetBin * c_firstResultBin) { - unsigned startingPoint = tid; - float sum = tex1Dfetch(histogramTexture, startingPoint); - // increment by a the cube - unsigned increment = c_firstTargetBin * c_secondTargetBin * c_firstResultBin; - float c = 0.f, Y, t; - - for (unsigned i = 1; i < c_secondResultBin; ++i) { - Y = tex1Dfetch(histogramTexture, startingPoint + i * increment) - c; - t = sum + Y; - c = (t - sum) - Y; - sum = t; - } - babyHisto[tid] = sum; - } -} */ -/* *************************************************************** */ -/* __global__ void reg_marginaliseResultXY_kernel(float *babyHisto) { - const int tid = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x; - if (tid < c_firstTargetBin * c_secondTargetBin) { - unsigned startingPoint = tid; - float sum = tex1Dfetch(histogramTexture, startingPoint); - // increment by the plane. - unsigned increment = c_firstTargetBin * c_secondTargetBin; - float c = 0.f, Y, t; - for (unsigned i = 1; i < c_firstResultBin; ++i) { - Y = tex1Dfetch(histogramTexture, startingPoint + i * increment) - c; - t = sum + Y; - c = (t - sum) - Y; - sum = t; - } - babyHisto[tid] = sum; - } -} */ -/* *************************************************************** */