diff --git a/niftyreg_build_version.txt b/niftyreg_build_version.txt
index 446dfcc5..5b0cffbc 100644
--- a/niftyreg_build_version.txt
+++ b/niftyreg_build_version.txt
@@ -1 +1 @@
-369
+370
diff --git a/reg-lib/cuda/BlockSize.hpp b/reg-lib/cuda/BlockSize.hpp
index 5483ae59..fe411adb 100644
--- a/reg-lib/cuda/BlockSize.hpp
+++ b/reg-lib/cuda/BlockSize.hpp
@@ -32,10 +32,6 @@ struct BlockSize {
     unsigned reg_defField_compose2D;
     unsigned reg_defField_compose3D;
     unsigned reg_defField_getJacobianMatrix;
-    unsigned reg_initialiseConjugateGradient;
-    unsigned reg_getConjugateGradient1;
-    unsigned reg_getConjugateGradient2;
-    unsigned reg_updateControlPointPosition;
     unsigned reg_voxelCentricToNodeCentric;
     unsigned reg_convertNmiGradientFromVoxelToRealSpace;
     unsigned reg_ApplyConvolutionWindowAlongX;
@@ -68,10 +64,6 @@ struct BlockSize100: public BlockSize {
         reg_defField_compose2D = 512; // 15 reg - 24 smem - 08 cmem - 16 lmem
         reg_defField_compose3D = 384; // 21 reg - 24 smem - 08 cmem - 24 lmem
         reg_defField_getJacobianMatrix = 512; // 16 reg - 24 smem - 04 cmem
-        reg_initialiseConjugateGradient = 384; // 09 reg - 24 smem
-        reg_getConjugateGradient1 = 320; // 12 reg - 24 smem
-        reg_getConjugateGradient2 = 384; // 10 reg - 40 smem
-        reg_updateControlPointPosition = 384; // 08 reg - 24 smem
         reg_voxelCentricToNodeCentric = 320; // 11 reg - 24 smem - 16 cmem
         reg_convertNmiGradientFromVoxelToRealSpace = 512; // 16 reg - 24 smem
         reg_ApplyConvolutionWindowAlongX = 512; // 14 reg - 28 smem - 08 cmem
@@ -106,10 +98,6 @@ struct BlockSize300: public BlockSize {
         reg_defField_compose2D = 1024; // 23 reg
         reg_defField_compose3D = 1024; // 24 reg
         reg_defField_getJacobianMatrix = 768; // 34 reg
-        reg_initialiseConjugateGradient = 1024; // 20 reg
-        reg_getConjugateGradient1 = 1024; // 22 reg
-        reg_getConjugateGradient2 = 1024; // 25 reg
-        reg_updateControlPointPosition = 1024; // 22 reg
         reg_voxelCentricToNodeCentric = 1024; // 23 reg
         reg_convertNmiGradientFromVoxelToRealSpace = 1024; // 23 reg
         reg_ApplyConvolutionWindowAlongX = 1024; // 25 reg
diff --git a/reg-lib/cuda/CudaCompute.cu b/reg-lib/cuda/CudaCompute.cu
index 1b8f140d..9dfae7b0 100644
--- a/reg-lib/cuda/CudaCompute.cu
+++ b/reg-lib/cuda/CudaCompute.cu
@@ -109,6 +109,51 @@ void CudaCompute::GetDeformationField(bool composition, bool bspline) {
                                        bspline);
 }
 /* *************************************************************** */
+template<bool optimiseX, bool optimiseY, bool optimiseZ>
+inline void UpdateControlPointPosition(float4 *currentDofCuda,
+                                       cudaTextureObject_t bestDofTexture,
+                                       cudaTextureObject_t gradientTexture,
+                                       const size_t nVoxels,
+                                       const float scale) {
+    thrust::for_each_n(thrust::device, thrust::make_counting_iterator(0), nVoxels, [=]__device__(const int index) {
+        float4 dofValue = currentDofCuda[index]; scale; // To capture scale
+        const float4 bestValue = tex1Dfetch<float4>(bestDofTexture, index);
+        const float4 gradValue = tex1Dfetch<float4>(gradientTexture, index);
+        if constexpr (optimiseX)
+            dofValue.x = bestValue.x + scale * gradValue.x;
+        if constexpr (optimiseY)
+            dofValue.y = bestValue.y + scale * gradValue.y;
+        if constexpr (optimiseZ)
+            dofValue.z = bestValue.z + scale * gradValue.z;
+        currentDofCuda[index] = dofValue;
+    });
+}
+/* *************************************************************** */
+template<bool optimiseX, bool optimiseY>
+static inline void UpdateControlPointPosition(float4 *currentDofCuda,
+                                              cudaTextureObject_t bestDofTexture,
+                                              cudaTextureObject_t gradientTexture,
+                                              const size_t nVoxels,
+                                              const float scale,
+                                              const bool optimiseZ) {
+    auto updateControlPointPosition = UpdateControlPointPosition<optimiseX, optimiseY, true>;
+    if (!optimiseZ) updateControlPointPosition = UpdateControlPointPosition<optimiseX, optimiseY, false>;
+    updateControlPointPosition(currentDofCuda, bestDofTexture, gradientTexture, nVoxels, scale);
+}
+/* *************************************************************** */
+template<bool optimiseX>
+static inline void UpdateControlPointPosition(float4 *currentDofCuda,
+                                              cudaTextureObject_t bestDofTexture,
+                                              cudaTextureObject_t gradientTexture,
+                                              const size_t nVoxels,
+                                              const float scale,
+                                              const bool optimiseY,
+                                              const bool optimiseZ) {
+    auto updateControlPointPosition = UpdateControlPointPosition<optimiseX, true>;
+    if (!optimiseY) updateControlPointPosition = UpdateControlPointPosition<optimiseX, false>;
+    updateControlPointPosition(currentDofCuda, bestDofTexture, gradientTexture, nVoxels, scale, optimiseZ);
+}
+/* *************************************************************** */
 void CudaCompute::UpdateControlPointPosition(float *currentDof,
                                              const float *bestDof,
                                              const float *gradient,
@@ -116,14 +161,16 @@ void CudaCompute::UpdateControlPointPosition(float *currentDof,
                                              const bool optimiseX,
                                              const bool optimiseY,
                                              const bool optimiseZ) {
-    Cuda::UpdateControlPointPosition(NiftiImage::calcVoxelNumber(dynamic_cast<CudaF3dContent&>(con).F3dContent::GetControlPointGrid(), 3),
-                                     reinterpret_cast<float4*>(currentDof),
-                                     reinterpret_cast<const float4*>(bestDof),
-                                     reinterpret_cast<const float4*>(gradient),
-                                     scale,
-                                     optimiseX,
-                                     optimiseY,
-                                     optimiseZ);
+    const nifti_image *controlPointGrid = dynamic_cast<CudaF3dContent&>(con).F3dContent::GetControlPointGrid();
+    const bool is3d = controlPointGrid->nz > 1;
+    const size_t nVoxels = NiftiImage::calcVoxelNumber(controlPointGrid, 3);
+    auto bestDofTexturePtr = Cuda::CreateTextureObject(reinterpret_cast<const float4*>(bestDof), nVoxels, cudaChannelFormatKindFloat, 4);
+    auto gradientTexturePtr = Cuda::CreateTextureObject(reinterpret_cast<const float4*>(gradient), nVoxels, cudaChannelFormatKindFloat, 4);
+
+    auto updateControlPointPosition = ::UpdateControlPointPosition<true>;
+    if (!optimiseX) updateControlPointPosition = ::UpdateControlPointPosition<false>;
+    updateControlPointPosition(reinterpret_cast<float4*>(currentDof), *bestDofTexturePtr, *gradientTexturePtr,
+                               nVoxels, scale, optimiseY, is3d ? optimiseZ : false);
 }
 /* *************************************************************** */
 void CudaCompute::GetImageGradient(int interpolation, float paddingValue, int activeTimePoint) {
diff --git a/reg-lib/cuda/CudaOptimiser.cu b/reg-lib/cuda/CudaOptimiser.cu
index 1a094805..587b4f7d 100644
--- a/reg-lib/cuda/CudaOptimiser.cu
+++ b/reg-lib/cuda/CudaOptimiser.cu
@@ -1,5 +1,4 @@
 #include "CudaOptimiser.hpp"
-#include "CudaOptimiserKernels.cu"
 #include "_reg_common_cuda_kernels.cu"
 #include <curand_kernel.h>
 
@@ -178,20 +177,6 @@ void CudaConjugateGradient::Initialise(size_t nvox,
     NR_FUNC_CALLED();
 }
 /* *************************************************************** */
-void CudaConjugateGradient::UpdateGradientValues() {
-    if (this->firstCall) {
-        NR_DEBUG("Conjugate gradient initialisation");
-        InitialiseConjugateGradient(this->gradientCuda, this->array1, this->array2, this->GetVoxNumber());
-        if (this->isSymmetric)
-            InitialiseConjugateGradient(this->gradientBwCuda, this->array1Bw, this->array2Bw, this->GetVoxNumberBw());
-        this->firstCall = false;
-    } else {
-        NR_DEBUG("Conjugate gradient update");
-        GetConjugateGradient(this->gradientCuda, this->array1, this->array2, this->GetVoxNumber(),
-                             this->isSymmetric, this->gradientBwCuda, this->array1Bw, this->array2Bw, this->GetVoxNumberBw());
-    }
-}
-/* *************************************************************** */
 void CudaConjugateGradient::Optimise(float maxLength,
                                          float smallLength,
                                          float& startLength) {
@@ -204,108 +189,107 @@ void CudaConjugateGradient::Perturbation(float length) {
     this->firstCall = true;
 }
 /* *************************************************************** */
-void CudaConjugateGradient::InitialiseConjugateGradient(float4 *gradientImageCuda,
-                                                        float4 *conjugateGCuda,
-                                                        float4 *conjugateHCuda,
-                                                        const size_t nVoxels) {
-    auto gradientImageTexture = Cuda::CreateTextureObject(gradientImageCuda, nVoxels, cudaChannelFormatKindFloat, 4);
-
-    const unsigned blocks = CudaContext::GetBlockSize()->reg_initialiseConjugateGradient;
-    const unsigned grids = (unsigned)Ceil(sqrtf((float)nVoxels / (float)blocks));
-    const dim3 gridDims(grids, grids, 1);
-    const dim3 blockDims(blocks, 1, 1);
-
-    Cuda::InitialiseConjugateGradientKernel<<<gridDims, blockDims>>>(conjugateGCuda, *gradientImageTexture, (unsigned)nVoxels);
-    NR_CUDA_CHECK_KERNEL(gridDims, blockDims);
-    NR_CUDA_SAFE_CALL(cudaMemcpy(conjugateHCuda, conjugateGCuda, nVoxels * sizeof(float4), cudaMemcpyDeviceToDevice));
+void InitialiseConjugateGradient(float4 *gradientCuda, float4 *conjugateGCuda, float4 *conjugateHCuda, const size_t nVoxels) {
+    auto gradientTexturePtr = Cuda::CreateTextureObject(gradientCuda, nVoxels, cudaChannelFormatKindFloat, 4);
+    auto gradientTexture = *gradientTexturePtr;
+    thrust::for_each_n(thrust::device, thrust::make_counting_iterator(0), nVoxels, [=]__device__(const int index) {
+        const float4 gradValue = tex1Dfetch<float4>(gradientTexture, index);
+        conjugateGCuda[index] = conjugateHCuda[index] = make_float4(-gradValue.x, -gradValue.y, -gradValue.z, 0);
+    });
 }
 /* *************************************************************** */
-struct Float2Sum {
-    __host__ __device__ double2 operator()(const float2& a, const float2& b) const {
-        return make_double2((double)a.x + (double)b.x, (double)a.y + (double)b.y);
-    }
-};
-/* *************************************************************** */
-void CudaConjugateGradient::GetConjugateGradient(float4 *gradientImageCuda,
-                                                 float4 *conjugateGCuda,
-                                                 float4 *conjugateHCuda,
-                                                 const size_t nVoxels,
-                                                 const bool isSymmetric,
-                                                 float4 *gradientImageBwCuda,
-                                                 float4 *conjugateGBwCuda,
-                                                 float4 *conjugateHBwCuda,
-                                                 const size_t nVoxelsBw) {
-    auto gradientImageTexture = Cuda::CreateTextureObject(gradientImageCuda, nVoxels, cudaChannelFormatKindFloat, 4);
-    auto conjugateGTexture = Cuda::CreateTextureObject(conjugateGCuda, nVoxels, cudaChannelFormatKindFloat, 4);
-    auto conjugateHTexture = Cuda::CreateTextureObject(conjugateHCuda, nVoxels, cudaChannelFormatKindFloat, 4);
-    Cuda::UniqueTextureObjectPtr gradientImageBwTexture, conjugateGBwTexture, conjugateHBwTexture;
+void GetConjugateGradient(float4 *gradientCuda,
+                          float4 *conjugateGCuda,
+                          float4 *conjugateHCuda,
+                          const size_t nVoxels,
+                          const bool isSymmetric,
+                          float4 *gradientBwCuda,
+                          float4 *conjugateGBwCuda,
+                          float4 *conjugateHBwCuda,
+                          const size_t nVoxelsBw) {
+    auto gradientTexturePtr = Cuda::CreateTextureObject(gradientCuda, nVoxels, cudaChannelFormatKindFloat, 4);
+    auto conjugateGTexturePtr = Cuda::CreateTextureObject(conjugateGCuda, nVoxels, cudaChannelFormatKindFloat, 4);
+    auto conjugateHTexturePtr = Cuda::CreateTextureObject(conjugateHCuda, nVoxels, cudaChannelFormatKindFloat, 4);
+    auto gradientTexture = *gradientTexturePtr;
+    auto conjugateGTexture = *conjugateGTexturePtr;
+    auto conjugateHTexture = *conjugateHTexturePtr;
+    Cuda::UniqueTextureObjectPtr gradientBwTexturePtr, conjugateGBwTexturePtr, conjugateHBwTexturePtr;
+    cudaTextureObject_t gradientBwTexture = 0, conjugateGBwTexture = 0, conjugateHBwTexture = 0;
     if (isSymmetric) {
-        gradientImageBwTexture = Cuda::CreateTextureObject(gradientImageBwCuda, nVoxelsBw, cudaChannelFormatKindFloat, 4);
-        conjugateGBwTexture = Cuda::CreateTextureObject(conjugateGBwCuda, nVoxelsBw, cudaChannelFormatKindFloat, 4);
-        conjugateHBwTexture = Cuda::CreateTextureObject(conjugateHBwCuda, nVoxelsBw, cudaChannelFormatKindFloat, 4);
+        gradientBwTexturePtr = Cuda::CreateTextureObject(gradientBwCuda, nVoxelsBw, cudaChannelFormatKindFloat, 4);
+        conjugateGBwTexturePtr = Cuda::CreateTextureObject(conjugateGBwCuda, nVoxelsBw, cudaChannelFormatKindFloat, 4);
+        conjugateHBwTexturePtr = Cuda::CreateTextureObject(conjugateHBwCuda, nVoxelsBw, cudaChannelFormatKindFloat, 4);
+        gradientBwTexture = *gradientBwTexturePtr;
+        conjugateGBwTexture = *conjugateGBwTexturePtr;
+        conjugateHBwTexture = *conjugateHBwTexturePtr;
     }
 
     // gam = sum((grad+g)*grad)/sum(HxG);
-    unsigned blocks = CudaContext::GetBlockSize()->reg_getConjugateGradient1;
-    unsigned grids = (unsigned)Ceil(sqrtf((float)nVoxels / (float)blocks));
-    dim3 blockDims(blocks, 1, 1);
-    dim3 gridDims(grids, grids, 1);
+    auto calcGam = []__device__(cudaTextureObject_t gradientTexture, cudaTextureObject_t conjugateGTexture,
+                                cudaTextureObject_t conjugateHTexture, const int index) {
+        const float4 hValue = tex1Dfetch<float4>(conjugateHTexture, index);
+        const float4 gValue = tex1Dfetch<float4>(conjugateGTexture, index);
+        const float gg = gValue.x * hValue.x + gValue.y * hValue.y + gValue.z * hValue.z;
+
+        const float4 grad = tex1Dfetch<float4>(gradientTexture, index);
+        const float dgg = (grad.x + gValue.x) * grad.x + (grad.y + gValue.y) * grad.y + (grad.z + gValue.z) * grad.z;
+
+        return make_double2(dgg, gg);
+    };
 
-    thrust::device_vector<float2> sumsCuda(nVoxels + nVoxels % 2);  // Make it even for thrust::inner_product
-    Cuda::GetConjugateGradientKernel1<<<gridDims, blockDims>>>(sumsCuda.data().get(), *gradientImageTexture,
-                                                         *conjugateGTexture, *conjugateHTexture, (unsigned)nVoxels);
-    NR_CUDA_CHECK_KERNEL(gridDims, blockDims);
-    const size_t sumsSizeHalf = sumsCuda.size() / 2;
-    const double2 gg = thrust::inner_product(sumsCuda.begin(), sumsCuda.begin() + sumsSizeHalf, sumsCuda.begin() + sumsSizeHalf,
-                                             make_double2(0, 0), thrust::plus<double2>(), Float2Sum());
-    float gam = static_cast<float>(gg.x / gg.y);
+    double gam;
+    thrust::counting_iterator<int> it(0);
+    const double2 gg = thrust::transform_reduce(thrust::device, it, it + nVoxels, [=]__device__(const int index) {
+        return calcGam(gradientTexture, conjugateGTexture, conjugateHTexture, index);
+    }, make_double2(0, 0), thrust::plus<double2>());
     if (isSymmetric) {
-        grids = (unsigned)Ceil(sqrtf((float)nVoxelsBw / (float)blocks));
-        gridDims = dim3(blocks, 1, 1);
-        blockDims = dim3(grids, grids, 1);
-        thrust::device_vector<float2> sumsBwCuda(nVoxelsBw + nVoxelsBw % 2);  // Make it even for thrust::inner_product
-        Cuda::GetConjugateGradientKernel1<<<gridDims, blockDims>>>(sumsBwCuda.data().get(), *gradientImageBwTexture,
-                                                             *conjugateGBwTexture, *conjugateHBwTexture, (unsigned)nVoxelsBw);
-        NR_CUDA_CHECK_KERNEL(gridDims, blockDims);
-        const size_t sumsBwSizeHalf = sumsBwCuda.size() / 2;
-        const double2 ggBw = thrust::inner_product(sumsBwCuda.begin(), sumsBwCuda.begin() + sumsBwSizeHalf, sumsBwCuda.begin() + sumsBwSizeHalf,
-                                                   make_double2(0, 0), thrust::plus<double2>(), Float2Sum());
-        gam = static_cast<float>((gg.x + ggBw.x) / (gg.y + ggBw.y));
-    }
+        it = thrust::counting_iterator<int>(0);
+        const double2 ggBw = thrust::transform_reduce(thrust::device, it, it + nVoxelsBw, [=]__device__(const int index) {
+            return calcGam(gradientBwTexture, conjugateGBwTexture, conjugateHBwTexture, index);
+        }, make_double2(0, 0), thrust::plus<double2>());
+        gam = (gg.x + ggBw.x) / (gg.y + ggBw.y);
+    } else gam = gg.x / gg.y;
+
+    // Conjugate gradient
+    auto conjugate = [gam]__device__(float4 *gradientCuda, float4 *conjugateGCuda, float4 *conjugateHCuda,
+                                     cudaTextureObject_t gradientTexture, cudaTextureObject_t conjugateHTexture, const int index) {
+        // G = -grad
+        float4 gradGValue = tex1Dfetch<float4>(gradientTexture, index);
+        gradGValue = make_float4(-gradGValue.x, -gradGValue.y, -gradGValue.z, 0);
+        conjugateGCuda[index] = gradGValue;
+
+        // H = G + gam * H
+        float4 gradHValue = tex1Dfetch<float4>(conjugateHTexture, index);
+        gradHValue = make_float4(gradGValue.x + gam * gradHValue.x,
+                                 gradGValue.y + gam * gradHValue.y,
+                                 gradGValue.z + gam * gradHValue.z, 0);
+        conjugateHCuda[index] = gradHValue;
+
+        gradientCuda[index] = make_float4(-gradHValue.x, -gradHValue.y, -gradHValue.z, 0);
+    };
 
-    blocks = (unsigned)CudaContext::GetBlockSize()->reg_getConjugateGradient2;
-    grids = (unsigned)Ceil(sqrtf((float)nVoxels / (float)blocks));
-    gridDims = dim3(blocks, 1, 1);
-    blockDims = dim3(grids, grids, 1);
-    Cuda::GetConjugateGradientKernel2<<<blockDims, gridDims>>>(gradientImageCuda, conjugateGCuda, conjugateHCuda, (unsigned)nVoxels, gam);
-    NR_CUDA_CHECK_KERNEL(gridDims, blockDims);
+    thrust::for_each_n(thrust::device, thrust::make_counting_iterator(0), nVoxels, [=]__device__(const int index) {
+        conjugate(gradientCuda, conjugateGCuda, conjugateHCuda, gradientTexture, conjugateHTexture, index);
+    });
     if (isSymmetric) {
-        grids = (unsigned)Ceil(sqrtf((float)nVoxelsBw / (float)blocks));
-        gridDims = dim3(blocks, 1, 1);
-        blockDims = dim3(grids, grids, 1);
-        Cuda::GetConjugateGradientKernel2<<<blockDims, gridDims>>>(gradientImageBwCuda, conjugateGBwCuda, conjugateHBwCuda, (unsigned)nVoxelsBw, gam);
-        NR_CUDA_CHECK_KERNEL(gridDims, blockDims);
+        thrust::for_each_n(thrust::device, thrust::make_counting_iterator(0), nVoxelsBw, [=]__device__(const int index) {
+            conjugate(gradientBwCuda, conjugateGBwCuda, conjugateHBwCuda, gradientBwTexture, conjugateHBwTexture, index);
+        });
     }
 }
 /* *************************************************************** */
-void Cuda::UpdateControlPointPosition(const size_t nVoxels,
-                                      float4 *controlPointImageCuda,
-                                      const float4 *bestControlPointCuda,
-                                      const float4 *gradientImageCuda,
-                                      const float scale,
-                                      const bool optimiseX,
-                                      const bool optimiseY,
-                                      const bool optimiseZ) {
-    auto bestControlPointTexture = Cuda::CreateTextureObject(bestControlPointCuda, nVoxels, cudaChannelFormatKindFloat, 4);
-    auto gradientImageTexture = Cuda::CreateTextureObject(gradientImageCuda, nVoxels, cudaChannelFormatKindFloat, 4);
-
-    const unsigned blocks = (unsigned)CudaContext::GetBlockSize()->reg_updateControlPointPosition;
-    const unsigned grids = (unsigned)Ceil(sqrtf((float)nVoxels / (float)blocks));
-    const dim3 blockDims(blocks, 1, 1);
-    const dim3 gridDims(grids, grids, 1);
-    UpdateControlPointPositionKernel<<<gridDims, blockDims>>>(controlPointImageCuda, *bestControlPointTexture, *gradientImageTexture,
-                                                              (unsigned)nVoxels, scale, optimiseX, optimiseY, optimiseZ);
-    NR_CUDA_CHECK_KERNEL(gridDims, blockDims);
+void CudaConjugateGradient::UpdateGradientValues() {
+    if (this->firstCall) {
+        NR_DEBUG("Conjugate gradient initialisation");
+        InitialiseConjugateGradient(this->gradientCuda, this->array1, this->array2, this->GetVoxNumber());
+        if (this->isSymmetric)
+            InitialiseConjugateGradient(this->gradientBwCuda, this->array1Bw, this->array2Bw, this->GetVoxNumberBw());
+        this->firstCall = false;
+    } else {
+        NR_DEBUG("Conjugate gradient update");
+        GetConjugateGradient(this->gradientCuda, this->array1, this->array2, this->GetVoxNumber(),
+                             this->isSymmetric, this->gradientBwCuda, this->array1Bw, this->array2Bw, this->GetVoxNumberBw());
+    }
 }
 /* *************************************************************** */
 } // namespace NiftyReg
diff --git a/reg-lib/cuda/CudaOptimiser.hpp b/reg-lib/cuda/CudaOptimiser.hpp
index fa9fec4d..56a1aceb 100644
--- a/reg-lib/cuda/CudaOptimiser.hpp
+++ b/reg-lib/cuda/CudaOptimiser.hpp
@@ -67,19 +67,6 @@ class CudaConjugateGradient: public CudaOptimiser {
     float4 *array2, *array2Bw;
     bool firstCall;
 
-    void InitialiseConjugateGradient(float4 *gradientImageCuda,
-                                     float4 *conjugateGCuda,
-                                     float4 *conjugateHCuda,
-                                     const size_t nVoxels);
-    void GetConjugateGradient(float4 *gradientImageCuda,
-                              float4 *conjugateGCuda,
-                              float4 *conjugateHCuda,
-                              const size_t nVoxels,
-                              const bool isSymmetric,
-                              float4 *gradientImageBwCuda,
-                              float4 *conjugateGBwCuda,
-                              float4 *conjugateHBwCuda,
-                              const size_t nVoxelsBw);
 #ifdef NR_TESTING
 public:
 #endif
@@ -108,18 +95,5 @@ class CudaConjugateGradient: public CudaOptimiser {
     virtual void Perturbation(float length) override;
 };
 /* *************************************************************** */
-namespace Cuda {
-/* *************************************************************** */
-void UpdateControlPointPosition(const size_t nVoxels,
-                                float4 *controlPointImageCuda,
-                                const float4 *bestControlPointCuda,
-                                const float4 *gradientImageCuda,
-                                const float scale,
-                                const bool optimiseX,
-                                const bool optimiseY,
-                                const bool optimiseZ);
-/* *************************************************************** */
-} // namespace Cuda
-/* *************************************************************** */
 } // namespace NiftyReg
 /* *************************************************************** */
diff --git a/reg-lib/cuda/CudaOptimiserKernels.cu b/reg-lib/cuda/CudaOptimiserKernels.cu
deleted file mode 100644
index 22a56c00..00000000
--- a/reg-lib/cuda/CudaOptimiserKernels.cu
+++ /dev/null
@@ -1,80 +0,0 @@
-/* *************************************************************** */
-namespace NiftyReg::Cuda {
-/* *************************************************************** */
-__global__ void InitialiseConjugateGradientKernel(float4 *conjugateGCuda,
-                                                  cudaTextureObject_t gradientImageTexture,
-                                                  const unsigned nVoxels) {
-    const unsigned tid = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;
-    if (tid < nVoxels) {
-        const float4 gradValue = tex1Dfetch<float4>(gradientImageTexture, tid);
-        conjugateGCuda[tid] = make_float4(-gradValue.x, -gradValue.y, -gradValue.z, 0);
-    }
-}
-/* *************************************************************** */
-__global__ void GetConjugateGradientKernel1(float2 *sums,
-                                            cudaTextureObject_t gradientImageTexture,
-                                            cudaTextureObject_t conjugateGTexture,
-                                            cudaTextureObject_t conjugateHTexture,
-                                            const unsigned nVoxels) {
-    const unsigned tid = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;
-    if (tid < nVoxels) {
-        const float4 valueH = tex1Dfetch<float4>(conjugateHTexture, tid);
-        const float4 valueG = tex1Dfetch<float4>(conjugateGTexture, tid);
-        const float gg = valueG.x * valueH.x + valueG.y * valueH.y + valueG.z * valueH.z;
-
-        const float4 grad = tex1Dfetch<float4>(gradientImageTexture, tid);
-        const float dgg = (grad.x + valueG.x) * grad.x + (grad.y + valueG.y) * grad.y + (grad.z + valueG.z) * grad.z;
-
-        sums[tid] = make_float2(dgg, gg);
-    }
-}
-/* *************************************************************** */
-__global__ void GetConjugateGradientKernel2(float4 *gradientImageCuda,
-                                            float4 *conjugateGCuda,
-                                            float4 *conjugateHCuda,
-                                            const unsigned nVoxels,
-                                            const float scale) {
-    const unsigned tid = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;
-    if (tid < nVoxels) {
-        // G = - grad
-        float4 gradGValue = gradientImageCuda[tid];
-        gradGValue = make_float4(-gradGValue.x, -gradGValue.y, -gradGValue.z, 0);
-        conjugateGCuda[tid] = gradGValue;
-
-        // H = G + gam * H
-        float4 gradHValue = conjugateHCuda[tid];
-        gradHValue = make_float4(gradGValue.x + scale * gradHValue.x,
-                                 gradGValue.y + scale * gradHValue.y,
-                                 gradGValue.z + scale * gradHValue.z,
-                                 0);
-        conjugateHCuda[tid] = gradHValue;
-
-        gradientImageCuda[tid] = make_float4(-gradHValue.x, -gradHValue.y, -gradHValue.z, 0);
-    }
-}
-/* *************************************************************** */
-__global__ void UpdateControlPointPositionKernel(float4 *controlPointImageCuda,
-                                                 cudaTextureObject_t bestControlPointTexture,
-                                                 cudaTextureObject_t gradientImageTexture,
-                                                 const unsigned nVoxels,
-                                                 const float scale,
-                                                 const bool optimiseX,
-                                                 const bool optimiseY,
-                                                 const bool optimiseZ) {
-    const unsigned tid = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;
-    if (tid < nVoxels) {
-        float4 value = controlPointImageCuda[tid];
-        const float4 bestValue = tex1Dfetch<float4>(bestControlPointTexture, tid);
-        const float4 gradValue = tex1Dfetch<float4>(gradientImageTexture, tid);
-        if (optimiseX)
-            value.x = bestValue.x + scale * gradValue.x;
-        if (optimiseY)
-            value.y = bestValue.y + scale * gradValue.y;
-        if (optimiseZ)
-            value.z = bestValue.z + scale * gradValue.z;
-        controlPointImageCuda[tid] = value;
-    }
-}
-/* *************************************************************** */
-} // namespace NiftyReg::Cuda
-/* *************************************************************** */