From 8d4db978f4bd6ab189749c0cbcb87afbe5b9700c Mon Sep 17 00:00:00 2001
From: dkazanc <dkazanc@hotmail.com>
Date: Tue, 30 Apr 2024 13:18:50 +0100
Subject: [PATCH 01/36] hiding cupy in try condition

---
 httomolibgpu/cuda_kernels/__init__.py |  10 ++-
 httomolibgpu/misc/corr.py             |   2 +-
 httomolibgpu/misc/morph.py            |   9 ++-
 httomolibgpu/misc/rescale.py          | 101 ++++++++++++++------------
 httomolibgpu/prep/alignment.py        |  10 ++-
 httomolibgpu/prep/normalize.py        |  13 +++-
 httomolibgpu/prep/phase.py            |  26 +++++--
 httomolibgpu/prep/stripe.py           |  14 ++--
 httomolibgpu/recon/algorithm.py       |  14 ++--
 httomolibgpu/recon/rotation.py        |  16 ++--
 10 files changed, 130 insertions(+), 85 deletions(-)

diff --git a/httomolibgpu/cuda_kernels/__init__.py b/httomolibgpu/cuda_kernels/__init__.py
index 0f945def..72d3415b 100644
--- a/httomolibgpu/cuda_kernels/__init__.py
+++ b/httomolibgpu/cuda_kernels/__init__.py
@@ -1,10 +1,16 @@
 import os
 from typing import List, Optional, Tuple
-import cupy as cp
+
+try:
+    import cupy as cp
+except ImportError:
+    print("Cupy library is a required dependency for HTTomolibgpu, please install")
 
 
 def load_cuda_module(
-    file: str, name_expressions: Optional[List[str]] = None, options: Tuple[str, ...] = tuple()
+    file: str,
+    name_expressions: Optional[List[str]] = None,
+    options: Tuple[str, ...] = tuple(),
 ) -> cp.RawModule:
     """Load a CUDA module file, i.e. a .cu file, from the file system,
     compile it, and return is as a CuPy RawModule for further
diff --git a/httomolibgpu/misc/corr.py b/httomolibgpu/misc/corr.py
index d82aa3f6..d30bde4c 100644
--- a/httomolibgpu/misc/corr.py
+++ b/httomolibgpu/misc/corr.py
@@ -27,7 +27,7 @@
 from typing import Tuple
 import numpy as np
 import nvtx
-from cupy import float32
+from numpy import float32
 
 from httomolibgpu.cuda_kernels import load_cuda_module
 
diff --git a/httomolibgpu/misc/morph.py b/httomolibgpu/misc/morph.py
index 93608dbe..dd125187 100644
--- a/httomolibgpu/misc/morph.py
+++ b/httomolibgpu/misc/morph.py
@@ -20,14 +20,14 @@
 # ---------------------------------------------------------------------------
 """Module for data type morphing functions"""
 
-import cupy as cp
+try:
+    import cupy as cp
+except ImportError:
+    print("Cupy library is a required dependency for HTTomolibgpu, please install")
 import numpy as np
 import nvtx
 from typing import Literal, Tuple
 
-from cupyx.scipy.interpolate import interpn
-
-
 __all__ = [
     "sino_360_to_180",
     "data_resampler",
@@ -115,6 +115,7 @@ def data_resampler(
     Returns:
         cp.ndarray: Up/Down-scaled 3D cupy array
     """
+    from cupyx.scipy.interpolate import interpn
 
     if data.ndim != 3:
         raise ValueError("only 3D data is supported")
diff --git a/httomolibgpu/misc/rescale.py b/httomolibgpu/misc/rescale.py
index daddc4a5..2038e319 100644
--- a/httomolibgpu/misc/rescale.py
+++ b/httomolibgpu/misc/rescale.py
@@ -1,4 +1,8 @@
-import cupy as cp
+try:
+    import cupy as cp
+except ImportError:
+    print("Cupy library is a required dependency for HTTomolibgpu, please install")
+
 import numpy as np
 from typing import Literal, Optional, Tuple, Union
 import nvtx
@@ -9,79 +13,82 @@
 
 
 rescale_kernel = cp.ElementwiseKernel(
-    'T x, raw T input_min, raw T input_max, raw T factor',
-    'O out',
-    '''
+    "T x, raw T input_min, raw T input_max, raw T factor",
+    "O out",
+    """
       T x_clean = isnan(x) || isinf(x) ? T(0) : x;
       T x_clipped = x_clean < input_min ? input_min : (x_clean > input_max ? input_max : x_clean);
       T x_rebased = x_clipped - input_min;
       out = O(x_rebased * factor);
-    ''',
-    'rescale_to_int'
+    """,
+    "rescale_to_int",
 )
 
+
 @nvtx.annotate()
-def rescale_to_int(data: cp.ndarray, 
-            perc_range_min: float = 0.0,
-            perc_range_max: float = 100.0,
-            bits: Literal[8, 16, 32] = 8, 
-            glob_stats: Optional[Tuple[float, float, float, int]] = None):
+def rescale_to_int(
+    data: cp.ndarray,
+    perc_range_min: float = 0.0,
+    perc_range_max: float = 100.0,
+    bits: Literal[8, 16, 32] = 8,
+    glob_stats: Optional[Tuple[float, float, float, int]] = None,
+):
     """
-        Rescales the data and converts it fit into the range of an unsigned integer type
-        with the given number of bits.
-        
-        Parameters
-        ----------
-        data : cp.ndarray
-            Required input data array, on GPU
-        perc_range_min: float, optional
-            The lower cutoff point in the input data, in percent of the data range (defaults to 0).
-            The lower bound is computed as min + perc_range_min/100*(max-min)
-        perc_range_max: float, optional
-            The upper cutoff point in the input data, in percent of the data range (defaults to 100).
-            The upper bound is computed as min + perc_range_max/100*(max-min)
-        bits: Literal[8, 16, 32], optional
-            The number of bits in the output integer range (defaults to 8). 
-            Allowed values are:
-            - 8 -> uint8
-            - 16 -> uint16
-            - 32 -> uint32
-        glob_stats: tuple, optional
-            Global statistics of the full dataset (beyond the data passed into this call).
-            It's a tuple with (min, max, sum, num_items). If not given, the min/max is 
-            computed from the given data.
-            
-        Returns
-        -------
-        cp.ndarray
-            The original data, clipped to the range specified with the perc_range_min and 
-            perc_range_max, and scaled to the full range of the output integer type
+    Rescales the data and converts it fit into the range of an unsigned integer type
+    with the given number of bits.
+
+    Parameters
+    ----------
+    data : cp.ndarray
+        Required input data array, on GPU
+    perc_range_min: float, optional
+        The lower cutoff point in the input data, in percent of the data range (defaults to 0).
+        The lower bound is computed as min + perc_range_min/100*(max-min)
+    perc_range_max: float, optional
+        The upper cutoff point in the input data, in percent of the data range (defaults to 100).
+        The upper bound is computed as min + perc_range_max/100*(max-min)
+    bits: Literal[8, 16, 32], optional
+        The number of bits in the output integer range (defaults to 8).
+        Allowed values are:
+        - 8 -> uint8
+        - 16 -> uint16
+        - 32 -> uint32
+    glob_stats: tuple, optional
+        Global statistics of the full dataset (beyond the data passed into this call).
+        It's a tuple with (min, max, sum, num_items). If not given, the min/max is
+        computed from the given data.
+
+    Returns
+    -------
+    cp.ndarray
+        The original data, clipped to the range specified with the perc_range_min and
+        perc_range_max, and scaled to the full range of the output integer type
     """
-    
+
     if bits == 8:
         output_dtype: Union[type[np.uint8], type[np.uint16], type[np.uint32]] = np.uint8
     elif bits == 16:
         output_dtype = np.uint16
     else:
         output_dtype = np.uint32
-    
+
     # get the min and max integer values of the output type
     output_min = np.iinfo(output_dtype).min
     output_max = np.iinfo(output_dtype).max
-    
+
     if not isinstance(glob_stats, tuple):
         min_value = float(cp.min(data))
         max_value = float(cp.max(data))
     else:
         min_value = glob_stats[0]
         max_value = glob_stats[1]
-    
+
     range_intensity = max_value - min_value
     input_min = (perc_range_min * (range_intensity) / 100) + min_value
     input_max = (perc_range_max * (range_intensity) / 100) + min_value
-    
+
     factor = (output_max - output_min) / (input_max - input_min)
-    
+
     res = cp.empty(data.shape, dtype=output_dtype)
     rescale_kernel(data, input_min, input_max, factor, res)
-    return res
\ No newline at end of file
+    return res
diff --git a/httomolibgpu/prep/alignment.py b/httomolibgpu/prep/alignment.py
index b7181365..82397f23 100644
--- a/httomolibgpu/prep/alignment.py
+++ b/httomolibgpu/prep/alignment.py
@@ -23,8 +23,11 @@
 import os
 from typing import Dict, List, Optional, Tuple
 
-import cupy as cp
-from cupyx.scipy.ndimage import map_coordinates
+try:
+    import cupy as cp
+except ImportError:
+    print("Cupy library is a required dependency for HTTomolibgpu, please install")
+
 import nvtx
 import numpy as np
 
@@ -32,6 +35,7 @@
     "distortion_correction_proj_discorpy",
 ]
 
+
 # CuPy implementation of distortion correction from Discorpy
 # https://github.com/DiamondLightSource/discorpy/blob/67743842b60bf5dd45b21b8460e369d4a5e94d67/discorpy/post/postprocessing.py#L111-L148
 # (which is the same as the TomoPy version
@@ -74,6 +78,8 @@ def distortion_correction_proj_discorpy(
     cp.ndarray
         3D array. Distortion-corrected image(s).
     """
+    from cupyx.scipy.ndimage import map_coordinates
+
     # Check if it's a stack of 2D images, or only a single 2D image
     if len(data.shape) == 2:
         data = cp.expand_dims(data, axis=0)
diff --git a/httomolibgpu/prep/normalize.py b/httomolibgpu/prep/normalize.py
index 54d09010..ed04b654 100644
--- a/httomolibgpu/prep/normalize.py
+++ b/httomolibgpu/prep/normalize.py
@@ -21,10 +21,15 @@
 """Modules for raw projection data normalization"""
 
 from typing import Tuple
-import cupy as cp
+
+try:
+    import cupy as cp
+    import nvtx
+except ImportError:
+    print("Cupy library is a required dependency for HTTomolibgpu, please install")
+
 import numpy as np
-import nvtx
-from cupy import uint16, float32, mean
+from numpy import uint16, float32
 
 __all__ = ["normalize"]
 
@@ -65,6 +70,8 @@ def normalize(
     cp.ndarray
         Normalised 3D tomographic data as a CuPy array.
     """
+    from cupy import mean
+
     _check_valid_input(data, flats, darks)
 
     dark0 = cp.empty(darks.shape[1:], dtype=float32)
diff --git a/httomolibgpu/prep/phase.py b/httomolibgpu/prep/phase.py
index 79433c0f..1ab3daa9 100644
--- a/httomolibgpu/prep/phase.py
+++ b/httomolibgpu/prep/phase.py
@@ -22,9 +22,13 @@
 
 import math
 from typing import Tuple
-import cupy as cp
-from cupy import float32
-import cupyx
+
+try:
+    import cupy as cp
+except ImportError:
+    print("Cupy library is a required dependency for HTTomolibgpu, please install")
+
+from numpy import float32
 import numpy as np
 import nvtx
 
@@ -41,6 +45,7 @@
 PI = 3.14159265359
 PLANCK_CONSTANT = 6.58211928e-19  # [keV*s]
 
+
 ## %%%%%%%%%%%%%%%%%%%%%%% paganin_filter %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%  ##
 #: CuPy implementation of Paganin filter from Savu
 @nvtx.annotate()
@@ -93,6 +98,8 @@ def paganin_filter_savu(
     cp.ndarray
         The stack of filtered projections.
     """
+    import cupyx
+
     # Check the input data is valid
     if data.ndim != 3:
         raise ValueError(
@@ -267,9 +274,11 @@ def _reciprocal_coord(pixel_size: float, num_grid: int) -> cp.ndarray:
     rc *= 2 * PI / (n * pixel_size)
     return rc
 
+
 ##-------------------------------------------------------------##
 ##-------------------------------------------------------------##
 
+
 # Adaptation with some corrections of retrieve_phase (Paganin filter)
 # from TomoPy
 @nvtx.annotate()
@@ -302,6 +311,7 @@ def paganin_filter_tomopy(
     cp.ndarray
         The 3D array of Paganin phase-filtered projection images.
     """
+    import cupyx
 
     # Check the input data is valid
     if tomo.ndim != 3:
@@ -312,7 +322,7 @@ def paganin_filter_tomopy(
 
     dz_orig, dy_orig, dx_orig = cp.shape(tomo)
 
-    # Perform padding to the power of 2 as FFT is O(n*log(n)) complexity    
+    # Perform padding to the power of 2 as FFT is O(n*log(n)) complexity
     # TODO: adding other options of padding?
     padded_tomo, pad_tup = _pad_projections_to_second_power(tomo)
 
@@ -326,7 +336,9 @@ def paganin_filter_tomopy(
     w2 = _reciprocal_grid(pixel_size, (dy, dx))
 
     # Build filter in the Fourier space.
-    phase_filter = cupyx.scipy.fft.fftshift(_paganin_filter_factor2(energy, dist, alpha, w2))
+    phase_filter = cupyx.scipy.fft.fftshift(
+        _paganin_filter_factor2(energy, dist, alpha, w2)
+    )
     phase_filter = phase_filter / phase_filter.max()  # normalisation
 
     # Apply filter and take inverse FFT
@@ -344,7 +356,7 @@ def paganin_filter_tomopy(
     # crop the padded filtered data:
     tomo = ifft_filtered_tomo[slc_indices].astype(cp.float32)
 
-    # taking the negative log    
+    # taking the negative log
     _log_kernel = cp.ElementwiseKernel(
         "C tomo",
         "C out",
@@ -399,5 +411,5 @@ def _pad_projections_to_second_power(tomo: cp.ndarray) -> tuple[cp.ndarray, tupl
 
 
 def _paganin_filter_factor2(energy, dist, alpha, w2):
-    # Alpha represents the ratio of delta/beta.    
+    # Alpha represents the ratio of delta/beta.
     return 1 / (_wavelength(energy) * dist * w2 / (4 * PI) + alpha)
diff --git a/httomolibgpu/prep/stripe.py b/httomolibgpu/prep/stripe.py
index d4a29908..02d15c0f 100644
--- a/httomolibgpu/prep/stripe.py
+++ b/httomolibgpu/prep/stripe.py
@@ -21,13 +21,15 @@
 """Modules for stripes removal"""
 from typing import Tuple, Union
 
-import cupy as cp
+try:
+    import cupy as cp
+    import nvtx
+    from cupyx.scipy.ndimage import median_filter
+    from cupyx.scipy.ndimage import binary_dilation
+    from cupyx.scipy.ndimage import uniform_filter1d
+except ImportError:
+    print("Cupy library is a required dependency for HTTomolibgpu, please install")
 import numpy as np
-import nvtx
-from cupyx.scipy.ndimage import median_filter
-from cupyx.scipy import signal
-from cupyx.scipy.ndimage import binary_dilation
-from cupyx.scipy.ndimage import uniform_filter1d
 
 __all__ = [
     "remove_stripe_based_sorting",
diff --git a/httomolibgpu/recon/algorithm.py b/httomolibgpu/recon/algorithm.py
index ad2110bc..59f88851 100644
--- a/httomolibgpu/recon/algorithm.py
+++ b/httomolibgpu/recon/algorithm.py
@@ -21,14 +21,16 @@
 """Module for tomographic reconstruction"""
 
 from typing import Optional, Tuple, Union
-
 from typing import Type
 
-import cupy as cp
-from cupy import float32, complex64
-import cupyx
+try:
+    import cupy as cp
+    import nvtx
+except ImportError:
+    print("Cupy library is a required dependency for HTTomolibgpu, please install")
+
+from numpy import float32, complex64
 import numpy as np
-import nvtx
 
 from tomobar.methodsDIR_CuPy import RecToolsDIRCuPy
 from tomobar.methodsIR_CuPy import RecToolsIRCuPy
@@ -89,7 +91,7 @@ def FBP(
     reconstruction = RecToolsCP.FBP(
         data,
         cutoff_freq=filter_freq_cutoff,
-        recon_mask_radius=recon_mask_radius,        
+        recon_mask_radius=recon_mask_radius,
         data_axes_labels_order=input_data_axis_labels,
     )
     cp._default_memory_pool.free_all_blocks()
diff --git a/httomolibgpu/recon/rotation.py b/httomolibgpu/recon/rotation.py
index 338782df..32b2b742 100644
--- a/httomolibgpu/recon/rotation.py
+++ b/httomolibgpu/recon/rotation.py
@@ -23,14 +23,16 @@
 import math
 from typing import List, Literal, Optional, Tuple, Union
 
-import cupy as cp
-import numpy as np
-import cupyx
-import nvtx
-import cupyx.scipy.ndimage as cpndi
-from cupy import ndarray
-from cupyx.scipy.ndimage import gaussian_filter, shift
+try:
+    import cupy as cp
+    import cupyx
+    import nvtx
+    import cupyx.scipy.ndimage as cpndi
+    from cupyx.scipy.ndimage import gaussian_filter, shift
+except ImportError:
+    print("Cupy library is a required dependency for HTTomolibgpu, please install")
 
+import numpy as np
 from httomolibgpu.cuda_kernels import load_cuda_module
 
 __all__ = [

From 20db114abe7747d0ea62bab9665564834b2fe7fa Mon Sep 17 00:00:00 2001
From: dkazanc <dkazanc@hotmail.com>
Date: Tue, 30 Apr 2024 13:38:21 +0100
Subject: [PATCH 02/36] try1

---
 httomolibgpu/__init__.py        |  14 +-
 httomolibgpu/misc/corr.py       |  12 +-
 httomolibgpu/misc/morph.py      | 214 ---------
 httomolibgpu/misc/rescale.py    |  94 ----
 httomolibgpu/prep/__init__.py   |   0
 httomolibgpu/prep/alignment.py  | 168 --------
 httomolibgpu/prep/normalize.py  | 133 ------
 httomolibgpu/prep/phase.py      | 415 ------------------
 httomolibgpu/prep/stripe.py     | 388 -----------------
 httomolibgpu/recon/__init__.py  |   0
 httomolibgpu/recon/algorithm.py | 282 ------------
 httomolibgpu/recon/rotation.py  | 741 --------------------------------
 12 files changed, 15 insertions(+), 2446 deletions(-)
 delete mode 100644 httomolibgpu/misc/morph.py
 delete mode 100644 httomolibgpu/misc/rescale.py
 delete mode 100644 httomolibgpu/prep/__init__.py
 delete mode 100644 httomolibgpu/prep/alignment.py
 delete mode 100644 httomolibgpu/prep/normalize.py
 delete mode 100644 httomolibgpu/prep/phase.py
 delete mode 100644 httomolibgpu/prep/stripe.py
 delete mode 100644 httomolibgpu/recon/__init__.py
 delete mode 100644 httomolibgpu/recon/algorithm.py
 delete mode 100644 httomolibgpu/recon/rotation.py

diff --git a/httomolibgpu/__init__.py b/httomolibgpu/__init__.py
index 9643e39b..08a5227e 100644
--- a/httomolibgpu/__init__.py
+++ b/httomolibgpu/__init__.py
@@ -1,8 +1,8 @@
 from httomolibgpu.misc.corr import *
-from httomolibgpu.misc.morph import *
-from httomolibgpu.prep.alignment import *
-from httomolibgpu.prep.normalize import *
-from httomolibgpu.prep.phase import *
-from httomolibgpu.prep.stripe import *
-from httomolibgpu.recon.algorithm import *
-from httomolibgpu.recon.rotation import *
+#from httomolibgpu.misc.morph import *
+#from httomolibgpu.prep.alignment import *
+#from httomolibgpu.prep.normalize import *
+#from httomolibgpu.prep.phase import *
+#from httomolibgpu.prep.stripe import *
+#from httomolibgpu.recon.algorithm import *
+#from httomolibgpu.recon.rotation import *
diff --git a/httomolibgpu/misc/corr.py b/httomolibgpu/misc/corr.py
index d30bde4c..277a27a5 100644
--- a/httomolibgpu/misc/corr.py
+++ b/httomolibgpu/misc/corr.py
@@ -22,11 +22,18 @@
 
 try:
     import cupy as cp
+    import nvtx
 except ImportError:
     print("Cupy library is a required dependency for HTTomolibgpu, please install")
+
+try:
+    from cucim.skimage.filters import median
+    from cucim.skimage.morphology import disk    
+except ImportError:
+    print("Cucim library of RapidsAI is a required dependency for HTTomolibgpu, please install")
+
 from typing import Tuple
 import numpy as np
-import nvtx
 from numpy import float32
 
 from httomolibgpu.cuda_kernels import load_cuda_module
@@ -69,9 +76,6 @@ def median_filter(
     ValueError
         If the input array is not three dimensional.
     """
-    from cucim.skimage.filters import median
-    from cucim.skimage.morphology import disk
-
     input_type = data.dtype
 
     if input_type not in ["float32", "uint16"]:
diff --git a/httomolibgpu/misc/morph.py b/httomolibgpu/misc/morph.py
deleted file mode 100644
index dd125187..00000000
--- a/httomolibgpu/misc/morph.py
+++ /dev/null
@@ -1,214 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-# ---------------------------------------------------------------------------
-# Copyright 2023 Diamond Light Source Ltd.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ---------------------------------------------------------------------------
-# Created By  : Tomography Team at DLS <scientificsoftware@diamond.ac.uk>
-# Created Date: 23 March 2023
-# ---------------------------------------------------------------------------
-"""Module for data type morphing functions"""
-
-try:
-    import cupy as cp
-except ImportError:
-    print("Cupy library is a required dependency for HTTomolibgpu, please install")
-import numpy as np
-import nvtx
-from typing import Literal, Tuple
-
-__all__ = [
-    "sino_360_to_180",
-    "data_resampler",
-]
-
-
-@nvtx.annotate()
-def sino_360_to_180(
-    data: cp.ndarray, overlap: int = 0, rotation: Literal["left", "right"] = "left"
-) -> cp.ndarray:
-    """
-    Converts 0-360 degrees sinogram to a 0-180 sinogram.
-    If the number of projections in the input data is odd, the last projection
-    will be discarded.
-
-    Parameters
-    ----------
-    data : cp.ndarray
-        Input 3D data.
-    overlap : scalar, optional
-        Overlapping number of pixels.
-    rotation : string, optional
-        'left' if rotation center is close to the left of the
-        field-of-view, 'right' otherwise.
-    Returns
-    -------
-    cp.ndarray
-        Output 3D data.
-    """
-    if data.ndim != 3:
-        raise ValueError("only 3D data is supported")
-
-    dx, dy, dz = data.shape
-
-    overlap = int(np.round(overlap))
-    if overlap >= dz:
-        raise ValueError("overlap must be less than data.shape[2]")
-    if overlap < 0:
-        raise ValueError("only positive overlaps are allowed.")
-
-    n = dx // 2
-
-    out = cp.empty((n, dy, 2 * dz - overlap), dtype=data.dtype)
-
-    if rotation == "left":
-        weights = cp.linspace(0, 1.0, overlap)
-        out[:, :, -dz + overlap :] = data[:n, :, overlap:]
-        out[:, :, : dz - overlap] = data[n : 2 * n, :, overlap:][:, :, ::-1]
-        out[:, :, dz - overlap : dz] = (
-            weights * data[:n, :, :overlap]
-            + (weights * data[n : 2 * n, :, :overlap])[:, :, ::-1]
-        )
-    elif rotation == "right":
-        weights = cp.linspace(1.0, 0, overlap)
-        out[:, :, : dz - overlap] = data[:n, :, :-overlap]
-        out[:, :, -dz + overlap :] = data[n : 2 * n, :, :-overlap][:, :, ::-1]
-        out[:, :, dz - overlap : dz] = (
-            weights * data[:n, :, -overlap:]
-            + (weights * data[n : 2 * n, :, -overlap:])[:, :, ::-1]
-        )
-    else:
-        raise ValueError('rotation parameter must be either "left" or "right"')
-
-    return out
-
-
-@nvtx.annotate()
-def data_resampler(
-    data: cp.ndarray, newshape: list, axis: int = 1, interpolation: str = "linear"
-) -> cp.ndarray:
-    """Down/Up-resampler of the input data implemented through interpn function.
-       Please note that the method will leave the specified axis
-       dimension unchanged, e.g. (128,128,128) -> (128,256,256) for axis = 0 and
-       newshape = [256,256].
-
-    Args:
-        data (cp.ndarray): 3d cupy array.
-        newshape (list): 2d list that defines the 2D slice shape of new shape data.
-        axis (int, optional): Axis along which the scaling is applied. Defaults to 1.
-        interpolation (str, optional): Selection of interpolation method. Defaults to 'linear'.
-
-    Raises:
-        ValueError: When data is not 3D
-
-    Returns:
-        cp.ndarray: Up/Down-scaled 3D cupy array
-    """
-    from cupyx.scipy.interpolate import interpn
-
-    if data.ndim != 3:
-        raise ValueError("only 3D data is supported")
-
-    N, M, Z = cp.shape(data)
-
-    if axis == 0:
-        xaxis = cp.arange(M) - M / 2
-        yaxis = cp.arange(Z) - Z / 2
-        step_x = M / newshape[0]
-        step_y = Z / newshape[1]
-        scaled_data = cp.empty((N, newshape[0], newshape[1]), dtype=cp.float32)
-    elif axis == 1:
-        xaxis = cp.arange(N) - N / 2
-        yaxis = cp.arange(Z) - Z / 2
-        step_x = N / newshape[0]
-        step_y = Z / newshape[1]
-        scaled_data = cp.empty((newshape[0], M, newshape[1]), dtype=cp.float32)
-    elif axis == 2:
-        xaxis = cp.arange(N) - N / 2
-        yaxis = cp.arange(M) - M / 2
-        step_x = N / newshape[0]
-        step_y = M / newshape[1]
-        scaled_data = cp.empty((newshape[0], newshape[1], Z), dtype=cp.float32)
-    else:
-        raise ValueError("Only 0,1,2 values for axes are supported")
-
-    points = (xaxis, yaxis)
-
-    scale_x = 2 / step_x
-    scale_y = 2 / step_y
-
-    y1 = np.linspace(
-        -newshape[0] / scale_x,
-        newshape[0] / scale_x - step_x,
-        num=newshape[0],
-        endpoint=False,
-    ).astype(np.float32)
-    x1 = np.linspace(
-        -newshape[1] / scale_y,
-        newshape[1] / scale_y - step_y,
-        num=newshape[1],
-        endpoint=False,
-    ).astype(np.float32)
-
-    xi_mesh = np.meshgrid(x1, y1)
-    xi = np.empty((2, newshape[0], newshape[1]), dtype=np.float32)
-    xi[0, :, :] = xi_mesh[1]
-    xi[1, :, :] = xi_mesh[0]
-    xi_size = xi.size
-    xi = np.rollaxis(xi, 0, 3)
-    xi = np.reshape(xi, [xi_size // 2, 2])
-    xi = cp.asarray(xi, dtype=cp.float32, order="C")
-
-    if axis == 0:
-        for j in range(N):
-            res = interpn(
-                points,
-                data[j, :, :],
-                xi,
-                method=interpolation,
-                bounds_error=False,
-                fill_value=0.0,
-            )
-            scaled_data[j, :, :] = cp.reshape(
-                res, [newshape[0], newshape[1]], order="C"
-            )
-    elif axis == 1:
-
-        for j in range(M):
-            res = interpn(
-                points,
-                data[:, j, :],
-                xi,
-                method=interpolation,
-                bounds_error=False,
-                fill_value=0.0,
-            )
-            scaled_data[:, j, :] = cp.reshape(
-                res, [newshape[0], newshape[1]], order="C"
-            )
-    else:
-        for j in range(Z):
-            res = interpn(
-                points,
-                data[:, :, j],
-                xi,
-                method=interpolation,
-                bounds_error=False,
-                fill_value=0.0,
-            )
-            scaled_data[:, :, j] = cp.reshape(
-                res, [newshape[0], newshape[1]], order="C"
-            )
-
-    return scaled_data
diff --git a/httomolibgpu/misc/rescale.py b/httomolibgpu/misc/rescale.py
deleted file mode 100644
index 2038e319..00000000
--- a/httomolibgpu/misc/rescale.py
+++ /dev/null
@@ -1,94 +0,0 @@
-try:
-    import cupy as cp
-except ImportError:
-    print("Cupy library is a required dependency for HTTomolibgpu, please install")
-
-import numpy as np
-from typing import Literal, Optional, Tuple, Union
-import nvtx
-
-__all__ = [
-    "rescale_to_int",
-]
-
-
-rescale_kernel = cp.ElementwiseKernel(
-    "T x, raw T input_min, raw T input_max, raw T factor",
-    "O out",
-    """
-      T x_clean = isnan(x) || isinf(x) ? T(0) : x;
-      T x_clipped = x_clean < input_min ? input_min : (x_clean > input_max ? input_max : x_clean);
-      T x_rebased = x_clipped - input_min;
-      out = O(x_rebased * factor);
-    """,
-    "rescale_to_int",
-)
-
-
-@nvtx.annotate()
-def rescale_to_int(
-    data: cp.ndarray,
-    perc_range_min: float = 0.0,
-    perc_range_max: float = 100.0,
-    bits: Literal[8, 16, 32] = 8,
-    glob_stats: Optional[Tuple[float, float, float, int]] = None,
-):
-    """
-    Rescales the data and converts it fit into the range of an unsigned integer type
-    with the given number of bits.
-
-    Parameters
-    ----------
-    data : cp.ndarray
-        Required input data array, on GPU
-    perc_range_min: float, optional
-        The lower cutoff point in the input data, in percent of the data range (defaults to 0).
-        The lower bound is computed as min + perc_range_min/100*(max-min)
-    perc_range_max: float, optional
-        The upper cutoff point in the input data, in percent of the data range (defaults to 100).
-        The upper bound is computed as min + perc_range_max/100*(max-min)
-    bits: Literal[8, 16, 32], optional
-        The number of bits in the output integer range (defaults to 8).
-        Allowed values are:
-        - 8 -> uint8
-        - 16 -> uint16
-        - 32 -> uint32
-    glob_stats: tuple, optional
-        Global statistics of the full dataset (beyond the data passed into this call).
-        It's a tuple with (min, max, sum, num_items). If not given, the min/max is
-        computed from the given data.
-
-    Returns
-    -------
-    cp.ndarray
-        The original data, clipped to the range specified with the perc_range_min and
-        perc_range_max, and scaled to the full range of the output integer type
-    """
-
-    if bits == 8:
-        output_dtype: Union[type[np.uint8], type[np.uint16], type[np.uint32]] = np.uint8
-    elif bits == 16:
-        output_dtype = np.uint16
-    else:
-        output_dtype = np.uint32
-
-    # get the min and max integer values of the output type
-    output_min = np.iinfo(output_dtype).min
-    output_max = np.iinfo(output_dtype).max
-
-    if not isinstance(glob_stats, tuple):
-        min_value = float(cp.min(data))
-        max_value = float(cp.max(data))
-    else:
-        min_value = glob_stats[0]
-        max_value = glob_stats[1]
-
-    range_intensity = max_value - min_value
-    input_min = (perc_range_min * (range_intensity) / 100) + min_value
-    input_max = (perc_range_max * (range_intensity) / 100) + min_value
-
-    factor = (output_max - output_min) / (input_max - input_min)
-
-    res = cp.empty(data.shape, dtype=output_dtype)
-    rescale_kernel(data, input_min, input_max, factor, res)
-    return res
diff --git a/httomolibgpu/prep/__init__.py b/httomolibgpu/prep/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/httomolibgpu/prep/alignment.py b/httomolibgpu/prep/alignment.py
deleted file mode 100644
index 82397f23..00000000
--- a/httomolibgpu/prep/alignment.py
+++ /dev/null
@@ -1,168 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-# ---------------------------------------------------------------------------
-# Copyright 2022 Diamond Light Source Ltd.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ---------------------------------------------------------------------------
-# Created By  : Tomography Team at DLS <scientificsoftware@diamond.ac.uk>
-# Created Date: 01 November 2022
-# ---------------------------------------------------------------------------
-"""Modules for data correction"""
-
-import os
-from typing import Dict, List, Optional, Tuple
-
-try:
-    import cupy as cp
-except ImportError:
-    print("Cupy library is a required dependency for HTTomolibgpu, please install")
-
-import nvtx
-import numpy as np
-
-__all__ = [
-    "distortion_correction_proj_discorpy",
-]
-
-
-# CuPy implementation of distortion correction from Discorpy
-# https://github.com/DiamondLightSource/discorpy/blob/67743842b60bf5dd45b21b8460e369d4a5e94d67/discorpy/post/postprocessing.py#L111-L148
-# (which is the same as the TomoPy version
-# https://github.com/tomopy/tomopy/blob/c236a2969074f5fc70189fb5545f0a165924f916/source/tomopy/prep/alignment.py#L950-L981
-# but with the additional params `order` and `mode`).
-@nvtx.annotate()
-def distortion_correction_proj_discorpy(
-    data: cp.ndarray,
-    metadata_path: str,
-    preview: Dict[str, List[int]],
-    order: int = 1,
-    mode: str = "reflect",
-):
-    """Unwarp a stack of images using a backward model.
-
-    Parameters
-    ----------
-    data : cp.ndarray
-        3D array.
-
-    metadata_path : str
-        The path to the file containing the distortion coefficients for the
-        data.
-
-    preview : Dict[str, List[int]]
-        A dict containing three key-value pairs:
-        - a list containing the `start` value of each dimension
-        - a list containing the `stop` value of each dimension
-        - a list containing the `step` value of each dimension
-
-    order : int, optional.
-        The order of the spline interpolation.
-
-    mode : {'reflect', 'grid-mirror', 'constant', 'grid-constant', 'nearest',
-           'mirror', 'grid-wrap', 'wrap'}, optional
-        To determine how to handle image boundaries.
-
-    Returns
-    -------
-    cp.ndarray
-        3D array. Distortion-corrected image(s).
-    """
-    from cupyx.scipy.ndimage import map_coordinates
-
-    # Check if it's a stack of 2D images, or only a single 2D image
-    if len(data.shape) == 2:
-        data = cp.expand_dims(data, axis=0)
-
-    # Get info from metadata txt file
-    xcenter, ycenter, list_fact = _load_metadata_txt(metadata_path)
-
-    # Use preview information to offset the x and y coords of the center of
-    # distortion
-    shift = preview["starts"]
-    step = preview["steps"]
-    x_dim = 1
-    y_dim = 0
-    step_check = max([step[i] for i in [x_dim, y_dim]]) > 1
-    if step_check:
-        msg = (
-            "\n***********************************************\n"
-            "!!! ERROR !!! -> Method doesn't work with the step in"
-            " the preview larger than 1 \n"
-            "***********************************************\n"
-        )
-        raise ValueError(msg)
-
-    x_offset = shift[x_dim]
-    y_offset = shift[y_dim]
-    xcenter = xcenter - x_offset
-    ycenter = ycenter - y_offset
-
-    height, width = data.shape[y_dim + 1], data.shape[x_dim + 1]
-    xu_list = cp.arange(width) - xcenter
-    yu_list = cp.arange(height) - ycenter
-    xu_mat, yu_mat = cp.meshgrid(xu_list, yu_list)
-    ru_mat = cp.sqrt(xu_mat**2 + yu_mat**2)
-    fact_mat = cp.sum(
-        cp.asarray([factor * ru_mat**i for i, factor in enumerate(list_fact)]), axis=0
-    )
-    xd_mat = cp.asarray(
-        cp.clip(xcenter + fact_mat * xu_mat, 0, width - 1), dtype=cp.float32
-    )
-    yd_mat = cp.asarray(
-        cp.clip(ycenter + fact_mat * yu_mat, 0, height - 1), dtype=cp.float32
-    )
-    indices = [cp.reshape(yd_mat, (-1, 1)), cp.reshape(xd_mat, (-1, 1))]
-    indices = cp.asarray(indices, dtype=cp.float32)
-
-    # Loop over images and unwarp them
-    for i in range(data.shape[0]):
-        mat = map_coordinates(data[i], indices, order=order, mode=mode)
-        mat = cp.reshape(mat, (height, width))
-        data[i] = mat
-
-    return data
-
-
-def _load_metadata_txt(file_path):
-    """
-    Load distortion coefficients from a text file.
-    Order of the infor in the text file:
-    xcenter
-    ycenter
-    factor_0
-    factor_1
-    factor_2
-    ...
-    Parameters
-    ----------
-    file_path : str
-        Path to the file
-    Returns
-    -------
-    tuple of float and list of floats
-        Tuple of (xcenter, ycenter, list_fact).
-    """
-    with open(file_path, "r") as f:
-        x = f.read().splitlines()
-        list_data = []
-        for i in x:
-            list_data.append(float(i.split()[-1]))
-    xcenter = list_data[0]
-    ycenter = list_data[1]
-    list_fact = list_data[2:]
-
-    return xcenter, ycenter, list_fact
-
-
-## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%  ##
diff --git a/httomolibgpu/prep/normalize.py b/httomolibgpu/prep/normalize.py
deleted file mode 100644
index ed04b654..00000000
--- a/httomolibgpu/prep/normalize.py
+++ /dev/null
@@ -1,133 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-# ---------------------------------------------------------------------------
-# Copyright 2022 Diamond Light Source Ltd.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ---------------------------------------------------------------------------
-# Created By  : Tomography Team at DLS <scientificsoftware@diamond.ac.uk>
-# Created Date: 01 November 2022
-# ---------------------------------------------------------------------------
-"""Modules for raw projection data normalization"""
-
-from typing import Tuple
-
-try:
-    import cupy as cp
-    import nvtx
-except ImportError:
-    print("Cupy library is a required dependency for HTTomolibgpu, please install")
-
-import numpy as np
-from numpy import uint16, float32
-
-__all__ = ["normalize"]
-
-
-@nvtx.annotate()
-def normalize(
-    data: cp.ndarray,
-    flats: cp.ndarray,
-    darks: cp.ndarray,
-    cutoff: float = 10.0,
-    minus_log: bool = True,
-    nonnegativity: bool = False,
-    remove_nans: bool = False,
-) -> cp.ndarray:
-    """
-    Normalize raw projection data using the flat and dark field projections.
-    This is a raw CUDA kernel implementation with CuPy wrappers.
-
-    Parameters
-    ----------
-    data : cp.ndarray
-        Projection data as a CuPy array.
-    flats : cp.ndarray
-        3D flat field data as a CuPy array.
-    darks : cp.ndarray
-        3D dark field data as a CuPy array.
-    cutoff : float, optional
-        Permitted maximum value for the normalised data.
-    minus_log : bool, optional
-        Apply negative log to the normalised data.
-    nonnegativity : bool, optional
-        Remove negative values in the normalised data.
-    remove_nans : bool, optional
-        Remove NaN and Inf values in the normalised data.
-
-    Returns
-    -------
-    cp.ndarray
-        Normalised 3D tomographic data as a CuPy array.
-    """
-    from cupy import mean
-
-    _check_valid_input(data, flats, darks)
-
-    dark0 = cp.empty(darks.shape[1:], dtype=float32)
-    flat0 = cp.empty(flats.shape[1:], dtype=float32)
-    out = cp.empty(data.shape, dtype=float32)
-    mean(darks, axis=0, dtype=float32, out=dark0)
-    mean(flats, axis=0, dtype=float32, out=flat0)
-
-    kernel_name = "normalisation"
-    kernel = r"""
-        float denom = float(flats) - float(darks);
-        if (denom < eps) {
-            denom = eps;
-        }
-        float v = (float(data) - float(darks))/denom;
-        """
-    if minus_log:
-        kernel += "v = -log(v);\n"
-        kernel_name += "_mlog"
-    if nonnegativity:
-        kernel += "if (v < 0.0f) v = 0.0f;\n"
-        kernel_name += "_nneg"
-    if remove_nans:
-        kernel += "if (isnan(v)) v = 0.0f;\n"
-        kernel += "if (isinf(v)) v = 0.0f;\n"
-        kernel_name += "_remnan"
-    kernel += "if (v > cutoff) v = cutoff;\n"
-    kernel += "out = v;\n"
-
-    normalisation_kernel = cp.ElementwiseKernel(
-        "T data, U flats, U darks, raw float32 cutoff",
-        "float32 out",
-        kernel,
-        kernel_name,
-        options=("-std=c++11",),
-        loop_prep="constexpr float eps = 1.0e-07;",
-        no_return=True,
-    )
-
-    normalisation_kernel(data, flat0, dark0, float32(cutoff), out)
-
-    return out
-
-
-def _check_valid_input(data, flats, darks) -> None:
-    """Helper function to check the validity of inputs to normalisation functions"""
-    if data.ndim != 3:
-        raise ValueError("Input data must be a 3D stack of projections")
-
-    if flats.ndim not in (2, 3):
-        raise ValueError("Input flats must be 2D or 3D data only")
-
-    if darks.ndim not in (2, 3):
-        raise ValueError("Input darks must be 2D or 3D data only")
-
-    if flats.ndim == 2:
-        flats = flats[cp.newaxis, :, :]
-    if darks.ndim == 2:
-        darks = darks[cp.newaxis, :, :]
diff --git a/httomolibgpu/prep/phase.py b/httomolibgpu/prep/phase.py
deleted file mode 100644
index 1ab3daa9..00000000
--- a/httomolibgpu/prep/phase.py
+++ /dev/null
@@ -1,415 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-# ---------------------------------------------------------------------------
-# Copyright 2022 Diamond Light Source Ltd.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ---------------------------------------------------------------------------
-# Created By  : Tomography Team at DLS <scientificsoftware@diamond.ac.uk>
-# Created Date: 01 November 2022
-# ---------------------------------------------------------------------------
-"""Modules for phase retrieval and phase-contrast enhancement"""
-
-import math
-from typing import Tuple
-
-try:
-    import cupy as cp
-except ImportError:
-    print("Cupy library is a required dependency for HTTomolibgpu, please install")
-
-from numpy import float32
-import numpy as np
-import nvtx
-
-from httomolibgpu.cuda_kernels import load_cuda_module
-
-__all__ = [
-    "paganin_filter_savu",
-    "paganin_filter_tomopy",
-]
-
-# Define constants used in phase retrieval method
-BOLTZMANN_CONSTANT = 1.3806488e-16  # [erg/k]
-SPEED_OF_LIGHT = 299792458e2  # [cm/s]
-PI = 3.14159265359
-PLANCK_CONSTANT = 6.58211928e-19  # [keV*s]
-
-
-## %%%%%%%%%%%%%%%%%%%%%%% paganin_filter %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%  ##
-#: CuPy implementation of Paganin filter from Savu
-@nvtx.annotate()
-def paganin_filter_savu(
-    data: cp.ndarray,
-    ratio: float = 250.0,
-    energy: float = 53.0,
-    distance: float = 1.0,
-    resolution: float = 1.28,
-    pad_y: int = 100,
-    pad_x: int = 100,
-    pad_method: str = "edge",
-    increment: float = 0.0,
-) -> cp.ndarray:
-    """
-    Apply Paganin filter (for denoising or contrast enhancement) to
-    projections.
-
-    Parameters
-    ----------
-    data : cp.ndarray
-        The stack of projections to filter.
-
-    ratio : float, optional
-        Ratio of delta/beta.
-
-    energy : float, optional
-        Beam energy in keV.
-
-    distance : float, optional
-        Distance from sample to detector in metres.
-
-    resolution : float, optional
-        Pixel size in microns.
-
-    pad_y : int, optional
-        Pad the top and bottom of projections.
-
-    pad_x : int, optional
-        Pad the left and right of projections.
-
-    pad_method : str, optional
-        Numpy pad method to use.
-
-    increment : float, optional
-        Increment all values by this amount before taking the log.
-
-    Returns
-    -------
-    cp.ndarray
-        The stack of filtered projections.
-    """
-    import cupyx
-
-    # Check the input data is valid
-    if data.ndim != 3:
-        raise ValueError(
-            f"Invalid number of dimensions in data: {data.ndim},"
-            " please provide a stack of 2D projections."
-        )
-
-    # Setup various values for the filter
-    _, height, width = data.shape
-    micron = 1e-6
-    keV = 1000.0
-    energy *= keV
-    resolution *= micron
-    wavelength = (1240.0 / energy) * 1e-9
-
-    height1 = height + 2 * pad_y
-    width1 = width + 2 * pad_x
-
-    # Define the paganin filter, taking into account the padding that will be
-    # applied to the projections (if any)
-
-    # Using raw kernel her as indexing is direct and it avoids a lot of temporaries
-    # and tiny kernels
-    module = load_cuda_module("paganin_filter_gen")
-    kernel = module.get_function("paganin_filter_gen")
-
-    # Apply padding to all the 2D projections
-    # Note: this takes considerable time on GPU...
-    data = cp.pad(data, ((0, 0), (pad_y, pad_y), (pad_x, pad_x)), mode=pad_method)
-
-    # Define array to hold result, which will not have the padding applied to it
-    precond_kernel_float = cp.ElementwiseKernel(
-        "T data",
-        "T out",
-        """
-        if (isnan(data)) {
-            out = T(0); 
-        } else if (isinf(data)) {
-            out = data < 0.0 ? -3.402823e38f : 3.402823e38f;  // FLT_MAX, not available in cupy
-        } else if (data == 0.0) {
-            out = 1.0;
-        } else {
-            out = data;
-        }
-        """,
-        name="paganin_precond_float",
-        no_return=True,
-    )
-    precond_kernel_int = cp.ElementwiseKernel(
-        "T data",
-        "T out",
-        """out = data == 0 ? 1 : data""",
-        name="paganin_precond_int",
-        no_return=True,
-    )
-
-    if data.dtype in (cp.float32, cp.float64):
-        precond_kernel_float(data, data)
-    else:
-        precond_kernel_int(data, data)
-
-    # avoid normalising in both directions - we include multiplier in the post_kernel
-    data = cp.asarray(data, dtype=cp.complex64)
-    data = cupyx.scipy.fft.fft2(data, axes=(-2, -1), overwrite_x=True, norm="backward")
-
-    # prepare filter here, while the GPU is busy with the FFT
-    filtercomplex = cp.empty((height1, width1), dtype=np.complex64)
-    bx = 16
-    by = 8
-    gx = (width1 + bx - 1) // bx
-    gy = (height1 + by - 1) // by
-    kernel(
-        grid=(gx, gy, 1),
-        block=(bx, by, 1),
-        args=(
-            cp.int32(width1),
-            cp.int32(height1),
-            cp.float32(resolution),
-            cp.float32(wavelength),
-            cp.float32(distance),
-            cp.float32(ratio),
-            filtercomplex,
-        ),
-    )
-    data *= filtercomplex
-
-    data = cupyx.scipy.fft.ifft2(data, axes=(-2, -1), overwrite_x=True, norm="forward")
-
-    post_kernel = cp.ElementwiseKernel(
-        "C pci1, raw float32 increment, raw float32 ratio, raw float32 fft_scale",
-        "T out",
-        "out = -0.5 * ratio * log(abs(pci1) * fft_scale + increment)",
-        name="paganin_post_proc",
-        no_return=True,
-    )
-    fft_scale = 1.0 / (data.shape[1] * data.shape[2])
-    res = cp.empty((data.shape[0], height, width), dtype=np.float32)
-    post_kernel(
-        data[:, pad_y : pad_y + height, pad_x : pad_x + width],
-        np.float32(increment),
-        np.float32(ratio),
-        np.float32(fft_scale),
-        res,
-    )
-
-    return res
-
-
-def _wavelength(energy: float) -> float:
-    return 2 * PI * PLANCK_CONSTANT * SPEED_OF_LIGHT / energy
-
-
-def _paganin_filter_factor(
-    energy: float, dist: float, alpha: float, w2: cp.ndarray
-) -> cp.ndarray:
-    return 1 / (_wavelength(energy) * dist * w2 / (4 * PI) + alpha)
-
-
-def _calc_pad_width(dim: int, pixel_size: float, wavelength: float, dist: float) -> int:
-    pad_pix = cp.ceil(PI * wavelength * dist / pixel_size**2)
-    return int((pow(2, cp.ceil(cp.log2(dim + pad_pix))) - dim) * 0.5)
-
-
-def _calc_pad_val(tomo: cp.ndarray) -> float:
-    return cp.mean((tomo[..., 0] + tomo[..., -1]) * 0.5)
-
-
-def _reciprocal_grid(pixel_size: float, shape_proj: tuple) -> cp.ndarray:
-    """
-    Calculate reciprocal grid.
-
-    Parameters
-    ----------
-    pixel_size : float
-        Detector pixel size in cm.
-    shape_proj : tuple
-        Shape of the reciprocal grid along x and y axes.
-
-    Returns
-    -------
-    ndarray
-        Grid coordinates.
-    """
-    # Sampling in reciprocal space.
-    indx = _reciprocal_coord(pixel_size, shape_proj[0])
-    indy = _reciprocal_coord(pixel_size, shape_proj[1])
-    indx_sq = cp.square(indx)
-    indy_sq = cp.square(indy)
-
-    return cp.add.outer(indx_sq, indy_sq)
-
-
-def _reciprocal_coord(pixel_size: float, num_grid: int) -> cp.ndarray:
-    """
-    Calculate reciprocal grid coordinates for a given pixel size
-    and discretization.
-
-    Parameters
-    ----------
-    pixel_size : float
-        Detector pixel size in cm.
-    num_grid : int
-        Size of the reciprocal grid.
-
-    Returns
-    -------
-    ndarray
-        Grid coordinates.
-    """
-    n = num_grid - 1
-    rc = cp.arange(-n, num_grid, 2, dtype=cp.float32)
-    rc *= 2 * PI / (n * pixel_size)
-    return rc
-
-
-##-------------------------------------------------------------##
-##-------------------------------------------------------------##
-
-
-# Adaptation with some corrections of retrieve_phase (Paganin filter)
-# from TomoPy
-@nvtx.annotate()
-def paganin_filter_tomopy(
-    tomo: cp.ndarray,
-    pixel_size: float = 1e-4,
-    dist: float = 50.0,
-    energy: float = 53.0,
-    alpha: float = 1e-3,
-) -> cp.ndarray:
-    """
-    Perform single-material phase retrieval from flats/darks corrected tomographic measurements
-    :cite:`Paganin:02`.
-
-    Parameters
-    ----------
-    tomo : cp.ndarray
-        3D array of f/d corrected tomographic projections.
-    pixel_size : float, optional
-        Detector pixel size in cm.
-    dist : float, optional
-        Propagation distance of the wavefront in cm.
-    energy : float, optional
-        Energy of incident wave in keV.
-    alpha : float, optional
-        Regularization parameter, the ratio of delta/beta. Larger values lead to more smoothing.
-
-    Returns
-    -------
-    cp.ndarray
-        The 3D array of Paganin phase-filtered projection images.
-    """
-    import cupyx
-
-    # Check the input data is valid
-    if tomo.ndim != 3:
-        raise ValueError(
-            f"Invalid number of dimensions in data: {tomo.ndim},"
-            " please provide a stack of 2D projections."
-        )
-
-    dz_orig, dy_orig, dx_orig = cp.shape(tomo)
-
-    # Perform padding to the power of 2 as FFT is O(n*log(n)) complexity
-    # TODO: adding other options of padding?
-    padded_tomo, pad_tup = _pad_projections_to_second_power(tomo)
-
-    dz, dy, dx = cp.shape(padded_tomo)
-
-    # 3D FFT of tomo data
-    padded_tomo = cp.asarray(padded_tomo, dtype=cp.complex64)
-    fft_tomo = cupyx.scipy.fft.fft2(padded_tomo, axes=(-2, -1), overwrite_x=True)
-
-    # Compute the reciprocal grid.
-    w2 = _reciprocal_grid(pixel_size, (dy, dx))
-
-    # Build filter in the Fourier space.
-    phase_filter = cupyx.scipy.fft.fftshift(
-        _paganin_filter_factor2(energy, dist, alpha, w2)
-    )
-    phase_filter = phase_filter / phase_filter.max()  # normalisation
-
-    # Apply filter and take inverse FFT
-    ifft_filtered_tomo = (
-        cupyx.scipy.fft.ifft2(phase_filter * fft_tomo, axes=(-2, -1), overwrite_x=True)
-    ).real
-
-    # slicing indices for cropping
-    slc_indices = (
-        slice(pad_tup[0][0], pad_tup[0][0] + dz_orig, 1),
-        slice(pad_tup[1][0], pad_tup[1][0] + dy_orig, 1),
-        slice(pad_tup[2][0], pad_tup[2][0] + dx_orig, 1),
-    )
-
-    # crop the padded filtered data:
-    tomo = ifft_filtered_tomo[slc_indices].astype(cp.float32)
-
-    # taking the negative log
-    _log_kernel = cp.ElementwiseKernel(
-        "C tomo",
-        "C out",
-        "out = -log(tomo)",
-        name="log_kernel",
-    )
-
-    return _log_kernel(tomo)
-
-
-def _shift_bit_length(x: int) -> int:
-    return 1 << (x - 1).bit_length()
-
-
-def _pad_projections_to_second_power(tomo: cp.ndarray) -> tuple[cp.ndarray, tuple]:
-    """
-    Performs padding of each projection to the next power of 2.
-    If the shape is not even we also care of that before padding.
-
-    Parameters
-    ----------
-    tomo : cp.ndarray
-        3d projection data
-
-    Returns
-    -------
-    ndarray: padded 3d projection data
-    tuple: a tuple with padding dimensions
-    """
-    full_shape_tomo = cp.shape(tomo)
-
-    pad_tup = []
-    for index, element in enumerate(full_shape_tomo):
-        if index == 0:
-            pad_width = (0, 0)  # do not pad the slicing dim
-        else:
-            diff = _shift_bit_length(element + 1) - element
-            if element % 2 == 0:
-                pad_width = diff // 2
-                pad_width = (pad_width, pad_width)
-            else:
-                # need an uneven padding for odd-number lengths
-                left_pad = diff // 2
-                right_pad = diff - left_pad
-                pad_width = (left_pad, right_pad)
-
-        pad_tup.append(pad_width)
-
-    padded_tomo = cp.pad(tomo, tuple(pad_tup), "edge")
-
-    return padded_tomo, pad_tup
-
-
-def _paganin_filter_factor2(energy, dist, alpha, w2):
-    # Alpha represents the ratio of delta/beta.
-    return 1 / (_wavelength(energy) * dist * w2 / (4 * PI) + alpha)
diff --git a/httomolibgpu/prep/stripe.py b/httomolibgpu/prep/stripe.py
deleted file mode 100644
index 02d15c0f..00000000
--- a/httomolibgpu/prep/stripe.py
+++ /dev/null
@@ -1,388 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-# ---------------------------------------------------------------------------
-# Copyright 2022 Diamond Light Source Ltd.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ---------------------------------------------------------------------------
-# Created By  : Tomography Team at DLS <scientificsoftware@diamond.ac.uk>
-# Created Date: 01 November 2022
-# ---------------------------------------------------------------------------
-"""Modules for stripes removal"""
-from typing import Tuple, Union
-
-try:
-    import cupy as cp
-    import nvtx
-    from cupyx.scipy.ndimage import median_filter
-    from cupyx.scipy.ndimage import binary_dilation
-    from cupyx.scipy.ndimage import uniform_filter1d
-except ImportError:
-    print("Cupy library is a required dependency for HTTomolibgpu, please install")
-import numpy as np
-
-__all__ = [
-    "remove_stripe_based_sorting",
-    "remove_stripe_ti",
-    "remove_all_stripe",
-]
-
-
-@nvtx.annotate()
-def remove_stripe_based_sorting(
-    data: Union[cp.ndarray, np.ndarray],
-    size: int = 11,
-    dim: int = 1,
-) -> Union[cp.ndarray, np.ndarray]:
-    """
-    Remove full and partial stripe artifacts from sinogram using Nghia Vo's
-    approach, algorithm 3 in Ref. [1]. Angular direction is along the axis 0.
-    This algorithm works particularly well for removing partial stripes.
-
-    Steps of the algorithm:
-    1. Sort each column of the sinogram by its grayscale values.
-    2. Apply a smoothing (median) filter on the sorted image along each row.
-    3. Re-sort the smoothed image columns to the original rows to
-       get the corrected sinogram.
-
-    Parameters
-    ----------
-    data : ndarray
-        3D tomographic data as a CuPy or NumPy array.
-    size : int, optional
-        Window size of the median filter.
-    dim : {1, 2}, optional
-        Dimension of the window.
-
-    Returns
-    -------
-    ndarray
-        Corrected 3D tomographic data as a CuPy or NumPy array.
-
-    References
-    ----------
-    .. [1] https://doi.org/10.1364/OE.26.028396
-    """
-
-    if size is None:
-        if data.shape[2] > 2000:
-            size = 21
-        else:
-            size = max(5, int(0.01 * data.shape[2]))
-
-    for m in range(data.shape[1]):
-        data[:, m, :] = _rs_sort(data[:, m, :], size, dim)
-
-    return data
-
-
-@nvtx.annotate()
-def _rs_sort(sinogram, size, dim):
-    """
-    Remove stripes using the sorting technique.
-    """
-    xp = cp.get_array_module(sinogram)
-    sinogram = xp.transpose(sinogram)
-
-    #: Sort each column of the sinogram by its grayscale values
-    #: Keep track of the sorting indices so we can reverse it below
-    sortvals = xp.argsort(sinogram, axis=1)
-    sortvals_reverse = xp.argsort(sortvals, axis=1)
-    sino_sort = xp.take_along_axis(sinogram, sortvals, axis=1)
-
-    #: Now apply the median filter on the sorted image along each row
-    if xp.__name__ == "cupy":
-        from cupyx.scipy.ndimage import median_filter
-    else:
-        from scipy.ndimage import median_filter
-
-    sino_sort = median_filter(sino_sort, (size, 1) if dim == 1 else (size, size))
-
-    #: step 3: re-sort the smoothed image columns to the original rows
-    sino_corrected = xp.take_along_axis(sino_sort, sortvals_reverse, axis=1)
-
-    return xp.transpose(sino_corrected)
-
-
-@nvtx.annotate()
-def remove_stripe_ti(
-    data: Union[cp.ndarray, np.ndarray],
-    beta: float = 0.1,
-) -> Union[cp.ndarray, np.ndarray]:
-    """
-    Removes stripes with the method of V. Titarenko (TomoCuPy implementation)
-
-    Parameters
-    ----------
-    data : ndarray
-        3D stack of projections as a CuPy array.
-    beta : float, optional
-        filter parameter, lower values increase the filter strength.
-        Default is 0.1.
-
-    Returns
-    -------
-    ndarray
-        3D array of de-striped projections.
-    """
-    # TODO: detector dimensions must be even otherwise error
-    xp = cp.get_array_module(data)
-    gamma = beta * ((1 - beta) / (1 + beta)) ** xp.abs(
-        xp.fft.fftfreq(data.shape[-1]) * data.shape[-1]
-    )
-    gamma[0] -= 1
-    v = xp.mean(data, axis=0)
-    v = v - v[:, 0:1]
-    v = xp.fft.irfft(xp.fft.rfft(v) * xp.fft.rfft(gamma)).astype(data.dtype)
-    data[:] += v
-
-    return data
-
-
-######## Optimized version for Vo-all ring removal in tomopy########
-# This function is taken from TomoCuPy package
-# *************************************************************************** #
-#                  Copyright © 2022, UChicago Argonne, LLC                    #
-#                           All Rights Reserved                               #
-#                         Software Name: Tomocupy                             #
-#                     By: Argonne National Laboratory                         #
-#                                                                             #
-#                           OPEN SOURCE LICENSE                               #
-#                                                                             #
-# Redistribution and use in source and binary forms, with or without          #
-# modification, are permitted provided that the following conditions are met: #
-#                                                                             #
-# 1. Redistributions of source code must retain the above copyright notice,   #
-#    this list of conditions and the following disclaimer.                    #
-# 2. Redistributions in binary form must reproduce the above copyright        #
-#    notice, this list of conditions and the following disclaimer in the      #
-#    documentation and/or other materials provided with the distribution.     #
-# 3. Neither the name of the copyright holder nor the names of its            #
-#    contributors may be used to endorse or promote products derived          #
-#    from this software without specific prior written permission.            #
-#                                                                             #
-#                                                                             #
-# *************************************************************************** #
-@nvtx.annotate()
-def remove_all_stripe(
-    data: cp.ndarray,
-    snr: float = 3.0,
-    la_size: int = 61,
-    sm_size: int = 21,
-    dim: int = 1,
-) -> cp.ndarray:
-    """
-    Remove all types of stripe artifacts from sinogram using Nghia Vo's
-    approach :cite:`Vo:18` (combination of algorithm 3,4,5, and 6).
-
-    Parameters
-    ----------
-    data : ndarray
-        3D tomographic data as a CuPy array.
-    snr  : float, optional
-        Ratio used to locate large stripes.
-        Greater is less sensitive.
-    la_size : int, optional
-        Window size of the median filter to remove large stripes.
-    sm_size : int, optional
-        Window size of the median filter to remove small-to-medium stripes.
-    dim : {1, 2}, optional
-        Dimension of the window.
-
-    Returns
-    -------
-    ndarray
-        Corrected 3D tomographic data as a CuPy or NumPy array.
-
-    References
-    ----------
-    .. [1] https://doi.org/10.1364/OE.26.028396
-
-    """
-    matindex = _create_matindex(data.shape[2], data.shape[0])
-    for m in range(data.shape[1]):
-        sino = data[:, m, :]
-        sino = _rs_dead(sino, snr, la_size, matindex)
-        sino = _rs_sort2(sino, sm_size, matindex, dim)
-        data[:, m, :] = sino
-    return data
-
-
-@nvtx.annotate()
-def _rs_sort2(sinogram, size, matindex, dim):
-    """
-    Remove stripes using the sorting technique.
-    """
-    sinogram = cp.transpose(sinogram)
-    matcomb = cp.asarray(cp.dstack((matindex, sinogram)))
-
-    # matsort = cp.asarray([row[row[:, 1].argsort()] for row in matcomb])
-    ids = cp.argsort(matcomb[:, :, 1], axis=1)
-    matsort = matcomb.copy()
-    matsort[:, :, 0] = cp.take_along_axis(matsort[:, :, 0], ids, axis=1)
-    matsort[:, :, 1] = cp.take_along_axis(matsort[:, :, 1], ids, axis=1)
-    if dim == 1:
-        matsort[:, :, 1] = median_filter(matsort[:, :, 1], (size, 1))
-    else:
-        matsort[:, :, 1] = median_filter(matsort[:, :, 1], (size, size))
-
-    # matsortback = cp.asarray([row[row[:, 0].argsort()] for row in matsort])
-
-    ids = cp.argsort(matsort[:, :, 0], axis=1)
-    matsortback = matsort.copy()
-    matsortback[:, :, 0] = cp.take_along_axis(matsortback[:, :, 0], ids, axis=1)
-    matsortback[:, :, 1] = cp.take_along_axis(matsortback[:, :, 1], ids, axis=1)
-
-    sino_corrected = matsortback[:, :, 1]
-    return cp.transpose(sino_corrected)
-
-
-@nvtx.annotate()
-def _mpolyfit(x, y):
-    n = len(x)
-    x_mean = cp.mean(x)
-    y_mean = cp.mean(y)
-
-    Sxy = cp.sum(x * y) - n * x_mean * y_mean
-    Sxx = cp.sum(x * x) - n * x_mean * x_mean
-
-    slope = Sxy / Sxx
-    intercept = y_mean - slope * x_mean
-    return slope, intercept
-
-
-@nvtx.annotate()
-def _detect_stripe(listdata, snr):
-    """
-    Algorithm 4 in :cite:`Vo:18`. Used to locate stripes.
-    """
-    numdata = len(listdata)
-    listsorted = cp.sort(listdata)[::-1]
-    xlist = cp.arange(0, numdata, 1.0)
-    ndrop = cp.int16(0.25 * numdata)
-    # (_slope, _intercept) = cp.polyfit(xlist[ndrop:-ndrop - 1],
-    #   listsorted[ndrop:-ndrop - 1], 1)
-    (_slope, _intercept) = _mpolyfit(
-        xlist[ndrop : -ndrop - 1], listsorted[ndrop : -ndrop - 1]
-    )
-
-    numt1 = _intercept + _slope * xlist[-1]
-    noiselevel = cp.abs(numt1 - _intercept)
-    noiselevel = cp.clip(noiselevel, 1e-6, None)
-    val1 = cp.abs(listsorted[0] - _intercept) / noiselevel
-    val2 = cp.abs(listsorted[-1] - numt1) / noiselevel
-    listmask = cp.zeros_like(listdata)
-    if val1 >= snr:
-        upper_thresh = _intercept + noiselevel * snr * 0.5
-        listmask[listdata > upper_thresh] = 1.0
-    if val2 >= snr:
-        lower_thresh = numt1 - noiselevel * snr * 0.5
-        listmask[listdata <= lower_thresh] = 1.0
-    return listmask
-
-
-@nvtx.annotate()
-def _rs_large(sinogram, snr, size, matindex, drop_ratio=0.1, norm=True):
-    """
-    Remove large stripes.
-    """
-    drop_ratio = max(min(drop_ratio, 0.8), 0)  # = cp.clip(drop_ratio, 0.0, 0.8)
-    (nrow, ncol) = sinogram.shape
-    ndrop = int(0.5 * drop_ratio * nrow)
-    sinosort = cp.sort(sinogram, axis=0)
-    sinosmooth = median_filter(sinosort, (1, size))
-    list1 = cp.mean(sinosort[ndrop : nrow - ndrop], axis=0)
-    list2 = cp.mean(sinosmooth[ndrop : nrow - ndrop], axis=0)
-    # listfact = cp.divide(list1,
-    #                      list2,
-    #                      out=cp.ones_like(list1),
-    #                      where=list2 != 0)
-
-    listfact = list1 / list2
-
-    # Locate stripes
-    listmask = _detect_stripe(listfact, snr)
-    listmask = binary_dilation(listmask, iterations=1).astype(listmask.dtype)
-    matfact = cp.tile(listfact, (nrow, 1))
-    # Normalize
-    if norm is True:
-        sinogram = sinogram / matfact
-    sinogram1 = cp.transpose(sinogram)
-    matcombine = cp.asarray(cp.dstack((matindex, sinogram1)))
-
-    # matsort = cp.asarray([row[row[:, 1].argsort()] for row in matcombine])
-    ids = cp.argsort(matcombine[:, :, 1], axis=1)
-    matsort = matcombine.copy()
-    matsort[:, :, 0] = cp.take_along_axis(matsort[:, :, 0], ids, axis=1)
-    matsort[:, :, 1] = cp.take_along_axis(matsort[:, :, 1], ids, axis=1)
-
-    matsort[:, :, 1] = cp.transpose(sinosmooth)
-    # matsortback = cp.asarray([row[row[:, 0].argsort()] for row in matsort])
-    ids = cp.argsort(matsort[:, :, 0], axis=1)
-    matsortback = matsort.copy()
-    matsortback[:, :, 0] = cp.take_along_axis(matsortback[:, :, 0], ids, axis=1)
-    matsortback[:, :, 1] = cp.take_along_axis(matsortback[:, :, 1], ids, axis=1)
-
-    sino_corrected = cp.transpose(matsortback[:, :, 1])
-    listxmiss = cp.where(listmask > 0.0)[0]
-    sinogram[:, listxmiss] = sino_corrected[:, listxmiss]
-    return sinogram
-
-
-@nvtx.annotate()
-def _rs_dead(sinogram, snr, size, matindex, norm=True):
-    """
-    Remove unresponsive and fluctuating stripes.
-    """
-    sinogram = cp.copy(sinogram)  # Make it mutable
-    (nrow, _) = sinogram.shape
-    # sinosmooth = cp.apply_along_axis(uniform_filter1d, 0, sinogram, 10)
-    sinosmooth = uniform_filter1d(sinogram, 10, axis=0)
-
-    listdiff = cp.sum(cp.abs(sinogram - sinosmooth), axis=0)
-    listdiffbck = median_filter(listdiff, size)
-
-    listfact = listdiff / listdiffbck
-
-    listmask = _detect_stripe(listfact, snr)
-    listmask = binary_dilation(listmask, iterations=1).astype(listmask.dtype)
-    listmask[0:2] = 0.0
-    listmask[-2:] = 0.0
-    listx = cp.where(listmask < 1.0)[0]
-    listy = cp.arange(nrow)
-    matz = sinogram[:, listx]
-
-    listxmiss = cp.where(listmask > 0.0)[0]
-
-    # finter = interpolate.interp2d(listx.get(), listy.get(), matz.get(), kind='linear')
-    if len(listxmiss) > 0:
-        # sinogram_c[:, listxmiss.get()] = finter(listxmiss.get(), listy.get())
-        ids = cp.searchsorted(listx, listxmiss)
-        sinogram[:, listxmiss] = matz[:, ids - 1] + (listxmiss - listx[ids - 1]) * (
-            matz[:, ids] - matz[:, ids - 1]
-        ) / (listx[ids] - listx[ids - 1])
-
-    # Remove residual stripes
-    if norm is True:
-        sinogram = _rs_large(sinogram, snr, size, matindex)
-    return sinogram
-
-
-@nvtx.annotate()
-def _create_matindex(nrow, ncol):
-    """
-    Create a 2D array of indexes used for the sorting technique.
-    """
-    listindex = cp.arange(0.0, ncol, 1.0)
-    matindex = cp.tile(listindex, (nrow, 1))
-    return matindex.astype(np.float32)
diff --git a/httomolibgpu/recon/__init__.py b/httomolibgpu/recon/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/httomolibgpu/recon/algorithm.py b/httomolibgpu/recon/algorithm.py
deleted file mode 100644
index 59f88851..00000000
--- a/httomolibgpu/recon/algorithm.py
+++ /dev/null
@@ -1,282 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-# ---------------------------------------------------------------------------
-# Copyright 2022 Diamond Light Source Ltd.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ---------------------------------------------------------------------------
-# Created By  : Tomography Team at DLS <scientificsoftware@diamond.ac.uk>
-# Changes relative to ToMoBAR 2024.01 version
-# ---------------------------------------------------------------------------
-"""Module for tomographic reconstruction"""
-
-from typing import Optional, Tuple, Union
-from typing import Type
-
-try:
-    import cupy as cp
-    import nvtx
-except ImportError:
-    print("Cupy library is a required dependency for HTTomolibgpu, please install")
-
-from numpy import float32, complex64
-import numpy as np
-
-from tomobar.methodsDIR_CuPy import RecToolsDIRCuPy
-from tomobar.methodsIR_CuPy import RecToolsIRCuPy
-
-__all__ = [
-    "FBP",
-    "SIRT",
-    "CGLS",
-]
-
-input_data_axis_labels = ["angles", "detY", "detX"]  # set the labels of the input data
-
-
-## %%%%%%%%%%%%%%%%%%%%%%% FBP reconstruction %%%%%%%%%%%%%%%%%%%%%%%%%%%%  ##
-@nvtx.annotate()
-def FBP(
-    data: cp.ndarray,
-    angles: np.ndarray,
-    center: Optional[float] = None,
-    filter_freq_cutoff: Optional[float] = 0.6,
-    recon_size: Optional[int] = None,
-    recon_mask_radius: Optional[float] = None,
-    gpu_id: int = 0,
-) -> cp.ndarray:
-    """
-    Perform Filtered Backprojection (FBP) reconstruction using ASTRA toolbox and ToMoBAR wrappers.
-    This is a 3D recon from a CuPy array and a custom built filter.
-
-    Parameters
-    ----------
-    data : cp.ndarray
-        Projection data as a CuPy array.
-    angles : np.ndarray
-        An array of angles given in radians.
-    center : float, optional
-        The center of rotation (CoR).
-    filter_freq_cutoff : float, optional
-        Cutoff frequency parameter for the sinc filter, the lowest values produce more crispy but noisy reconstruction.
-    recon_size : int, optional
-        The [recon_size, recon_size] shape of the reconstructed slice in pixels.
-        By default (None), the reconstructed size will be the dimension of the horizontal detector.
-    recon_mask_radius: float, optional
-        The radius of the circular mask that applies to the reconstructed slice in order to crop
-        out some undesirable artefacts. The values outside the diameter will be set to zero.
-        None by default, to see the effect of the mask try setting the value in the range [0.7-1.0].
-    gpu_id : int, optional
-        A GPU device index to perform operation on.
-
-    Returns
-    -------
-    cp.ndarray
-        The FBP reconstructed volume as a CuPy array.
-    """
-    RecToolsCP = _instantiate_direct_recon_class(
-        data, angles, center, recon_size, gpu_id
-    )
-
-    reconstruction = RecToolsCP.FBP(
-        data,
-        cutoff_freq=filter_freq_cutoff,
-        recon_mask_radius=recon_mask_radius,
-        data_axes_labels_order=input_data_axis_labels,
-    )
-    cp._default_memory_pool.free_all_blocks()
-    return cp.require(cp.swapaxes(reconstruction, 0, 1), requirements="C")
-
-
-## %%%%%%%%%%%%%%%%%%%%%%% SIRT reconstruction %%%%%%%%%%%%%%%%%%%%%%%%%%%%  ##
-@nvtx.annotate()
-def SIRT(
-    data: cp.ndarray,
-    angles: np.ndarray,
-    center: Optional[float] = None,
-    recon_size: Optional[int] = None,
-    iterations: Optional[int] = 300,
-    nonnegativity: Optional[bool] = True,
-    gpu_id: int = 0,
-) -> cp.ndarray:
-    """
-    Perform Simultaneous Iterative Recostruction Technique (SIRT) using ASTRA toolbox and ToMoBAR wrappers.
-    This is 3D recon directly from a CuPy array while using ASTRA GPUlink capability.
-
-    Parameters
-    ----------
-    data : cp.ndarray
-        Projection data as a CuPy array.
-    angles : np.ndarray
-        An array of angles given in radians.
-    center : float, optional
-        The center of rotation (CoR).
-    recon_size : int, optional
-        The [recon_size, recon_size] shape of the reconstructed slice in pixels.
-        By default (None), the reconstructed size will be the dimension of the horizontal detector.
-    iterations : int, optional
-        The number of SIRT iterations.
-    nonnegativity : bool, optional
-        Impose nonnegativity constraint on reconstructed image.
-    gpu_id : int, optional
-        A GPU device index to perform operation on.
-
-    Returns
-    -------
-    cp.ndarray
-        The SIRT reconstructed volume as a CuPy array.
-    """
-
-    RecToolsCP = _instantiate_iterative_recon_class(
-        data, angles, center, recon_size, gpu_id, datafidelity="LS"
-    )
-
-    _data_ = {
-        "projection_norm_data": data,
-        "data_axes_labels_order": input_data_axis_labels,
-    }  # data dictionary
-    _algorithm_ = {
-        "iterations": iterations,
-        "nonnegativity": nonnegativity,
-    }
-    reconstruction = RecToolsCP.SIRT(_data_, _algorithm_)
-    cp._default_memory_pool.free_all_blocks()
-    return cp.require(cp.swapaxes(reconstruction, 0, 1), requirements="C")
-
-
-## %%%%%%%%%%%%%%%%%%%%%%% CGLS reconstruction %%%%%%%%%%%%%%%%%%%%%%%%%%%%  ##
-@nvtx.annotate()
-def CGLS(
-    data: cp.ndarray,
-    angles: np.ndarray,
-    center: Optional[float] = None,
-    recon_size: Optional[int] = None,
-    iterations: Optional[int] = 20,
-    nonnegativity: Optional[bool] = True,
-    gpu_id: int = 0,
-) -> cp.ndarray:
-    """
-    Perform Congugate Gradient Least Squares (CGLS) using ASTRA toolbox and ToMoBAR wrappers.
-    This is 3D recon directly from a CuPy array while using ASTRA GPUlink capability.
-
-    Parameters
-    ----------
-    data : cp.ndarray
-        Projection data as a CuPy array.
-    angles : np.ndarray
-        An array of angles given in radians.
-    center : float, optional
-        The center of rotation (CoR).
-    recon_size : int, optional
-        The [recon_size, recon_size] shape of the reconstructed slice in pixels.
-        By default (None), the reconstructed size will be the dimension of the horizontal detector.
-    iterations : int, optional
-        The number of CGLS iterations.
-    nonnegativity : bool, optional
-        Impose nonnegativity constraint on reconstructed image.
-    gpu_id : int, optional
-        A GPU device index to perform operation on.
-
-    Returns
-    -------
-    cp.ndarray
-        The CGLS reconstructed volume as a CuPy array.
-    """
-    RecToolsCP = _instantiate_iterative_recon_class(
-        data, angles, center, recon_size, gpu_id, datafidelity="LS"
-    )
-
-    _data_ = {
-        "projection_norm_data": data,
-        "data_axes_labels_order": input_data_axis_labels,
-    }  # data dictionary
-    _algorithm_ = {"iterations": iterations, "nonnegativity": nonnegativity}
-    reconstruction = RecToolsCP.CGLS(_data_, _algorithm_)
-    cp._default_memory_pool.free_all_blocks()
-    return cp.require(cp.swapaxes(reconstruction, 0, 1), requirements="C")
-
-
-## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%  ##
-def _instantiate_direct_recon_class(
-    data: cp.ndarray,
-    angles: np.ndarray,
-    center: Optional[float] = None,
-    recon_size: Optional[int] = None,
-    gpu_id: int = 0,
-) -> type[RecToolsDIRCuPy]:
-    """instantiate ToMoBAR's direct recon class
-
-    Args:
-        data (cp.ndarray): data array
-        angles (np.ndarray): angles
-        center (Optional[float], optional): center of recon. Defaults to None.
-        recon_size (Optional[int], optional): recon_size. Defaults to None.
-        gpu_id (int, optional): gpu ID. Defaults to 0.
-
-    Returns:
-        type[RecToolsDIRCuPy]: an instance of the direct recon class
-    """
-    if center is None:
-        center = data.shape[2] // 2  # making a crude guess
-    if recon_size is None:
-        recon_size = data.shape[2]
-    RecToolsCP = RecToolsDIRCuPy(
-        DetectorsDimH=data.shape[2],  # Horizontal detector dimension
-        DetectorsDimV=data.shape[1],  # Vertical detector dimension (3D case)
-        CenterRotOffset=data.shape[2] / 2
-        - center
-        - 0.5,  # Center of Rotation scalar or a vector
-        AnglesVec=-angles,  # A vector of projection angles in radians
-        ObjSize=recon_size,  # Reconstructed object dimensions (scalar)
-        device_projector=gpu_id,
-    )
-    return RecToolsCP
-
-
-def _instantiate_iterative_recon_class(
-    data: cp.ndarray,
-    angles: np.ndarray,
-    center: Optional[float] = None,
-    recon_size: Optional[int] = None,
-    gpu_id: int = 0,
-    datafidelity: str = "LS",
-) -> type[RecToolsIRCuPy]:
-    """instantiate ToMoBAR's iterative recon class
-
-    Args:
-        data (cp.ndarray): data array
-        angles (np.ndarray): angles
-        center (Optional[float], optional): center of recon. Defaults to None.
-        recon_size (Optional[int], optional): recon_size. Defaults to None.
-        datafidelity (str, optional): Data fidelity
-        gpu_id (int, optional): gpu ID. Defaults to 0.
-
-    Returns:
-        type[RecToolsIRCuPy]: an instance of the iterative class
-    """
-    if center is None:
-        center = data.shape[2] // 2  # making a crude guess
-    if recon_size is None:
-        recon_size = data.shape[2]
-    RecToolsCP = RecToolsIRCuPy(
-        DetectorsDimH=data.shape[2],  # Horizontal detector dimension
-        DetectorsDimV=data.shape[1],  # Vertical detector dimension (3D case)
-        CenterRotOffset=data.shape[2] / 2
-        - center
-        - 0.5,  # Center of Rotation scalar or a vector
-        AnglesVec=-angles,  # A vector of projection angles in radians
-        ObjSize=recon_size,  # Reconstructed object dimensions (scalar)
-        datafidelity=datafidelity,
-        device_projector=gpu_id,
-    )
-    return RecToolsCP
diff --git a/httomolibgpu/recon/rotation.py b/httomolibgpu/recon/rotation.py
deleted file mode 100644
index 32b2b742..00000000
--- a/httomolibgpu/recon/rotation.py
+++ /dev/null
@@ -1,741 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-# ---------------------------------------------------------------------------
-# Copyright 2022 Diamond Light Source Ltd.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ---------------------------------------------------------------------------
-# Created By  : Tomography Team at DLS <scientificsoftware@diamond.ac.uk>
-# Created Date: 01 November 2022
-# ---------------------------------------------------------------------------
-"""Modules for finding the axis of rotation"""
-
-import math
-from typing import List, Literal, Optional, Tuple, Union
-
-try:
-    import cupy as cp
-    import cupyx
-    import nvtx
-    import cupyx.scipy.ndimage as cpndi
-    from cupyx.scipy.ndimage import gaussian_filter, shift
-except ImportError:
-    print("Cupy library is a required dependency for HTTomolibgpu, please install")
-
-import numpy as np
-from httomolibgpu.cuda_kernels import load_cuda_module
-
-__all__ = [
-    "find_center_vo",
-    "find_center_360",
-    "find_center_pc",
-]
-
-
-@nvtx.annotate()
-def find_center_vo(
-    data: cp.ndarray,
-    ind: Optional[int] = None,
-    smin: int = -50,
-    smax: int = 50,
-    srad: float = 6.0,
-    step: float = 0.25,
-    ratio: float = 0.5,
-    drop: int = 20,
-) -> float:
-    """
-    Find rotation axis location using Nghia Vo's method. See the paper
-    https://opg.optica.org/oe/fulltext.cfm?uri=oe-22-16-19078&id=297315
-
-    Parameters
-    ----------
-    data : cp.ndarray
-        3D tomographic data or a 2D sinogram as a CuPy array.
-    ind : int, optional
-        Index of the slice to be used to estimate the CoR.
-    smin : int, optional
-        Coarse search radius. Reference to the horizontal center of
-        the sinogram.
-    smax : int, optional
-        Coarse search radius. Reference to the horizontal center of
-        the sinogram.
-    srad : float, optional
-        Fine search radius.
-    step : float, optional
-        Step of fine searching.
-    ratio : float, optional
-        The ratio between the FOV of the camera and the size of object.
-        It's used to generate the mask.
-    drop : int, optional
-        Drop lines around vertical center of the mask.
-
-    Returns
-    -------
-    float
-        Rotation axis location.
-    """
-
-    if data.ndim == 2:
-        data = cp.expand_dims(data, 1)
-        ind = 0
-
-    height = data.shape[1]
-
-    if ind is None:
-        ind = height // 2
-        if height > 10:
-            _sino = cp.mean(data[:, ind - 5 : ind + 5, :], axis=1)
-        else:
-            _sino = data[:, ind, :]
-    else:
-        _sino = data[:, ind, :]
-
-    with nvtx.annotate("gaussian_filter_1", color="green"):
-        _sino_cs = gaussian_filter(_sino, (3, 1), mode="reflect")
-    with nvtx.annotate("gaussian_filter_2", color="green"):
-        _sino_fs = gaussian_filter(_sino, (2, 2), mode="reflect")
-
-    if _sino.shape[0] * _sino.shape[1] > 4e6:
-        # data is large, so downsample it before performing search for
-        # centre of rotation
-        _sino_coarse = _downsample(_sino_cs, 2, 1)
-        init_cen = _search_coarse(_sino_coarse, smin / 4.0, smax / 4.0, ratio, drop)
-        fine_cen = _search_fine(_sino_fs, srad, step, init_cen * 4.0, ratio, drop)
-    else:
-        init_cen = _search_coarse(_sino_cs, smin, smax, ratio, drop)
-        fine_cen = _search_fine(_sino_fs, srad, step, init_cen, ratio, drop)
-
-    return fine_cen.get()
-
-
-@nvtx.annotate()
-def find_center_pc(proj1, proj2, tol=0.5, rotc_guess=None):
-    """
-    Find rotation axis location by finding the offset between the first
-    projection and a mirrored projection 180 degrees apart using
-    phase correlation in Fourier space.
-    The ``phase_cross_correlation`` function uses cross-correlation in Fourier
-    space, optionally employing an upsampled matrix-multiplication DFT to
-    achieve arbitrary subpixel precision. :cite:`Guizar:08`.
-
-    Parameters
-    ----------
-    proj1 : cp.ndarray
-        Projection data
-
-    proj2 : cp.ndarray
-        Projection data
-
-    tol : scalar, optional
-        Subpixel accuracy
-
-    rotc_guess : float, optional
-        Initial guess value for the rotation center
-
-    Returns
-    -------
-    float
-        Rotation axis location.
-    """
-    from cucim.skimage.registration import phase_cross_correlation
-
-    imgshift = 0.0 if rotc_guess is None else rotc_guess - (proj1.shape[1] - 1.0) / 2.0
-
-    proj1 = cpndi.shift(proj1, [0, -imgshift], mode="constant", cval=0)
-    proj2 = cpndi.shift(proj2, [0, -imgshift], mode="constant", cval=0)
-
-    # create reflection of second projection
-    proj2 = cp.fliplr(proj2)
-
-    # using cucim of rapids to do phase cross correlation between two images
-    shift = phase_cross_correlation(
-        reference_image=proj1, moving_image=proj2, upsample_factor=1.0 / tol
-    )
-
-    # Compute center of rotation as the center of first image and the
-    # registered translation with the second image
-    center = (proj1.shape[1] + shift[0][1] - 1.0) / 2.0
-
-    return center + imgshift
-
-
-@nvtx.annotate()
-def _search_coarse(sino, smin, smax, ratio, drop):
-    (nrow, ncol) = sino.shape
-    flip_sino = cp.ascontiguousarray(cp.fliplr(sino))
-    comp_sino = cp.ascontiguousarray(cp.flipud(sino))
-    mask = _create_mask(2 * nrow, ncol, 0.5 * ratio * ncol, drop)
-
-    cen_fliplr = (ncol - 1.0) / 2.0
-    smin_clip_val = max(min(smin + cen_fliplr, ncol - 1), 0)
-    smin = smin_clip_val - cen_fliplr
-    smax_clip_val = max(min(smax + cen_fliplr, ncol - 1), 0)
-    smax = smax_clip_val - cen_fliplr
-    start_cor = ncol // 2 + smin
-    stop_cor = ncol // 2 + smax
-    list_cor = cp.arange(start_cor, stop_cor + 0.5, 0.5, dtype=cp.float32)
-    list_shift = 2.0 * (list_cor - cen_fliplr)
-    list_metric = cp.empty(list_shift.shape, dtype=cp.float32)
-    _calculate_metric(list_shift, sino, flip_sino, comp_sino, mask, list_metric)
-
-    minpos = cp.argmin(list_metric)
-    if minpos == 0:
-        print("WARNING!!!Global minimum is out of searching range")
-        print(f"Please extend smin: {smin}")
-    if minpos == len(list_metric) - 1:
-        print("WARNING!!!Global minimum is out of searching range")
-        print(f"Please extend smax: {smax}")
-    cor = list_cor[minpos]
-    return cor
-
-
-@nvtx.annotate()
-def _search_fine(sino, srad, step, init_cen, ratio, drop):
-    (nrow, ncol) = sino.shape
-
-    flip_sino = cp.ascontiguousarray(cp.fliplr(sino))
-    comp_sino = cp.ascontiguousarray(cp.flipud(sino))
-    mask = _create_mask(2 * nrow, ncol, 0.5 * ratio * ncol, drop)
-
-    cen_fliplr = (ncol - 1.0) / 2.0
-    srad = max(min(abs(float(srad)), ncol / 4.0), 1.0)
-    step = max(min(abs(step), srad), 0.1)
-    init_cen = max(min(init_cen, ncol - srad - 1), srad)
-    list_cor = init_cen + cp.arange(-srad, srad + step, step, dtype=np.float32)
-    list_shift = 2.0 * (list_cor - cen_fliplr)
-    list_metric = cp.empty(list_shift.shape, dtype="float32")
-
-    _calculate_metric(list_shift, sino, flip_sino, comp_sino, mask, out=list_metric)
-    cor = list_cor[cp.argmin(list_metric)]
-    return cor
-
-
-@nvtx.annotate()
-def _create_mask(nrow, ncol, radius, drop):
-    du = 1.0 / ncol
-    dv = (nrow - 1.0) / (nrow * 2.0 * np.pi)
-    cen_row = int(math.ceil(nrow / 2.0) - 1)
-    cen_col = int(math.ceil(ncol / 2.0) - 1)
-    drop = min([drop, int(math.ceil(0.05 * nrow))])
-
-    block_x = 128
-    block_y = 1
-    block_dims = (block_x, block_y)
-    grid_x = (ncol // 2 + 1 + block_x - 1) // block_x
-    grid_y = nrow
-    grid_dims = (grid_x, grid_y)
-    mask = cp.empty((nrow, ncol // 2 + 1), dtype="uint16")
-    params = (
-        ncol,
-        nrow,
-        cen_col,
-        cen_row,
-        cp.float32(du),
-        cp.float32(dv),
-        cp.float32(radius),
-        cp.float32(drop),
-        mask,
-    )
-    module = load_cuda_module("generate_mask")
-    kernel = module.get_function("generate_mask")
-    kernel(grid_dims, block_dims, params)
-    return mask
-
-
-def round_up(x: float) -> int:
-    if x >= 0.0:
-        return int(math.ceil(x))
-    else:
-        return int(math.floor(x))
-
-
-def _get_available_gpu_memory() -> int:
-    dev = cp.cuda.Device()
-    # first, let's make some space
-    cp.get_default_memory_pool().free_all_blocks()
-    cache = cp.fft.config.get_plan_cache()
-    cache.clear()
-    available_memory = dev.mem_info[0] + cp.get_default_memory_pool().free_bytes()
-    return int(available_memory * 0.9)  # 10% safety margin
-
-
-def _calculate_chunks(
-    nshifts: int, shift_size: int, available_memory: Optional[int] = None
-) -> List[int]:
-    if available_memory is None:
-        available_memory = _get_available_gpu_memory()
-
-    available_memory -= shift_size
-    freq_domain_size = (
-        shift_size  # it needs only half (RFFT), but complex64, so it's the same
-    )
-    fft_plan_size = freq_domain_size
-    size_per_shift = fft_plan_size + freq_domain_size + shift_size
-    nshift_max = available_memory // size_per_shift
-    assert nshift_max > 0, "Not enough memory to process"
-    num_chunks = int(np.ceil(nshifts / nshift_max))
-    chunk_size = int(np.ceil(nshifts / num_chunks))
-    chunks = [chunk_size] * (num_chunks - 1)
-    stop_idx = list(np.cumsum(chunks))
-    stop_idx.append(nshifts)
-    return stop_idx
-
-
-@nvtx.annotate()
-def _calculate_metric(list_shift, sino1, sino2, sino3, mask, out):
-    # this tries to simplify - if shift_col is integer, no need to spline interpolate
-    assert list_shift.dtype == cp.float32, "shifts must be single precision floats"
-    assert sino1.dtype == cp.float32, "sino1 must be float32"
-    assert sino2.dtype == cp.float32, "sino1 must be float32"
-    assert sino3.dtype == cp.float32, "sino1 must be float32"
-    assert out.dtype == cp.float32, "sino1 must be float32"
-    assert sino2.flags["C_CONTIGUOUS"], "sino2 must be C-contiguous"
-    assert sino3.flags["C_CONTIGUOUS"], "sino3 must be C-contiguous"
-    assert list_shift.flags["C_CONTIGUOUS"], "list_shift must be C-contiguous"
-    nshifts = list_shift.shape[0]
-    na1 = sino1.shape[0]
-    na2 = sino2.shape[0]
-
-    module = load_cuda_module("center_360_shifts")
-    shift_whole_shifts = module.get_function("shift_whole_shifts")
-    # note: we don't have to calculate the mean here, as we're only looking for minimum metric.
-    # The sum is enough.
-    masked_sum_abs_kernel = cp.ReductionKernel(
-        in_params="complex64 x, uint16 mask",  # input, complex + mask
-        out_params="float32 out",  # output, real
-        map_expr="mask ? abs(x) : 0.0f",
-        reduce_expr="a + b",
-        post_map_expr="out = a",
-        identity="0.0f",
-        reduce_type="float",
-        name="masked_sum_abs",
-    )
-
-    # determine how many shifts we can fit in the available memory
-    # and iterate in chunks
-    chunks = _calculate_chunks(
-        nshifts, (na1 + na2) * sino2.shape[1] * cp.float32().nbytes
-    )
-
-    mat = cp.empty((chunks[0], na1 + na2, sino2.shape[1]), dtype=cp.float32)
-    mat[:, :na1, :] = sino1
-    # explicitly create FFT plan here, so it's not cached and clearly re-used
-    plan = cupyx.scipy.fftpack.get_fft_plan(
-        mat, mat.shape[-2:], axes=(1, 2), value_type="R2C"
-    )
-
-    for i, stop_idx in enumerate(chunks):
-        if i > 0:
-            # more than one iteration means we're tight on memory, so clear up freed blocks
-            mat_freq = None
-            cp.get_default_memory_pool().free_all_blocks()
-
-        start_idx = 0 if i == 0 else chunks[i - 1]
-        size = stop_idx - start_idx
-
-        # first, handle the integer shifts without spline in a raw kernel,
-        # and shift in the sino3 one accordingly
-        bx = 128
-        gx = (sino3.shape[1] + bx - 1) // bx
-        shift_whole_shifts(
-            grid=(gx, na2, size),  ####
-            block=(bx, 1, 1),
-            args=(
-                sino2,
-                sino3,
-                list_shift[start_idx:stop_idx],
-                mat[:, na1:, :],
-                sino3.shape[1],
-                na1 + na2,
-            ),
-        )
-
-        # now we can only look at the spline shifting, the rest is done
-        list_shift_host = cp.asnumpy(list_shift[start_idx:stop_idx])
-        for i in range(list_shift_host.shape[0]):
-            shift_col = float(list_shift_host[i])
-            if not shift_col.is_integer():
-                shifted = shift(sino2, (0, shift_col), order=3, prefilter=True)
-                shift_int = round_up(shift_col)
-                if shift_int >= 0:
-                    mat[i, na1:, shift_int:] = shifted[:, shift_int:]
-                else:
-                    mat[i, na1:, :shift_int] = shifted[:, :shift_int]
-
-        # stack and transform
-        # (we do the full sized mat FFT, even though the last chunk may be smaller, to
-        # make sure we can re-use the same FFT plan as before)
-        mat_freq = cupyx.scipy.fft.rfft2(mat, axes=(1, 2), norm=None, plan=plan)
-        masked_sum_abs_kernel(
-            mat_freq[:size, :, :], mask, out=out[start_idx:stop_idx], axis=(1, 2)
-        )
-
-
-@nvtx.annotate()
-def _downsample(sino, level, axis):
-    assert sino.dtype == cp.float32, "single precision floating point input required"
-    assert sino.flags["C_CONTIGUOUS"], "list_shift must be C-contiguous"
-
-    dx, dz = sino.shape
-    # Determine the new size, dim, of the downsampled dimension
-    dim = int(sino.shape[axis] / math.pow(2, level))
-    shape = [dx, dz]
-    shape[axis] = dim
-    downsampled_data = cp.empty(shape, dtype="float32")
-
-    block_x = 8
-    block_y = 8
-    block_dims = (block_x, block_y)
-    grid_x = (sino.shape[1] + block_x - 1) // block_x
-    grid_y = (sino.shape[0] + block_y - 1) // block_y
-    grid_dims = (grid_x, grid_y)
-    # 8x8 thread-block, which means 16 "lots" of columns to downsample per
-    # thread-block; 4 bytes per float, so allocate 16*6 = 64 bytes of shared
-    # memeory per thread-block
-    shared_mem_bytes = 64
-    params = (sino, dx, dz, level, downsampled_data)
-    module = load_cuda_module("downsample_sino")
-    kernel = module.get_function("downsample_sino")
-    kernel(grid_dims, block_dims, params, shared_mem=shared_mem_bytes)
-    return downsampled_data
-
-
-# --- Center of rotation (COR) estimation method ---#
-@nvtx.annotate()
-def find_center_360(
-    data: cp.ndarray,
-    ind: Optional[int] = None,
-    win_width: int = 10,
-    side: Optional[Literal[0, 1]] = None,
-    denoise: bool = True,
-    norm: bool = False,
-    use_overlap: bool = False,
-) -> Tuple[float, float, Optional[Literal[0, 1]], float]:
-    """
-    Find the center-of-rotation (COR) in a 360-degree scan with offset COR use
-    the method presented in Ref. [1] by Nghia Vo.
-
-    This function supports both numpy and cupy - the implementation is selected
-    by where the input data array resides.
-
-    Parameters
-    ----------
-    data : cp.ndarray
-        3D tomographic data as a Cupy array.
-    ind : int, optional
-        Index of the slice to be used for estimate the CoR and the overlap.
-    win_width : int, optional
-        Window width used for finding the overlap area.
-    side : {None, 0, 1}, optional
-        Overlap size. Only there options: None, 0, or 1. "None" corresponds
-        to fully automated determination. "0" corresponds to the left side.
-        "1" corresponds to the right side.
-    denoise : bool, optional
-        Apply the Gaussian filter if True.
-    norm : bool, optional
-        Apply the normalisation if True.
-    use_overlap : bool, optional
-        Use the combination of images in the overlap area for calculating
-        correlation coefficients if True.
-
-    Returns
-    -------
-    cor : float
-        Center-of-rotation.
-    overlap : float
-        Width of the overlap area between two halves of the sinogram.
-    side : int
-        Overlap side between two halves of the sinogram.
-    overlap_position : float
-        Position of the window in the first image giving the best
-        correlation metric.
-
-    References
-    ----------
-    [1] : https://doi.org/10.1364/OE.418448
-    """
-    if data.ndim != 3:
-        raise ValueError("A 3D array must be provided")
-
-    # this method works with a 360-degree sinogram.
-    if ind is None:
-        _sino = data[:, 0, :]
-    else:
-        _sino = data[:, ind, :]
-
-    (nrow, ncol) = _sino.shape
-    nrow_180 = nrow // 2 + 1
-    sino_top = _sino[0:nrow_180, :]
-    sino_bot = cp.fliplr(_sino[-nrow_180:, :])
-    (overlap, side, overlap_position) = _find_overlap(
-        sino_top, sino_bot, win_width, side, denoise, norm, use_overlap
-    )
-    if side == 0:
-        cor = overlap / 2.0 - 1.0
-    else:
-        cor = ncol - overlap / 2.0 - 1.0
-
-    return cor, overlap, side, overlap_position
-
-
-def _find_overlap(
-    mat1, mat2, win_width, side=None, denoise=True, norm=False, use_overlap=False
-):
-    """
-    Find the overlap area and overlap side between two images (Ref. [1]) where
-    the overlap side referring to the first image.
-
-    Parameters
-    ----------
-    mat1 : array_like
-        2D array. Projection image or sinogram image.
-    mat2 :  array_like
-        2D array. Projection image or sinogram image.
-    win_width : int
-        Width of the searching window.
-    side : {None, 0, 1}, optional
-        Only there options: None, 0, or 1. "None" corresponding to fully
-        automated determination. "0" corresponding to the left side. "1"
-        corresponding to the right side.
-    denoise : bool, optional
-        Apply the Gaussian filter if True.
-    norm : bool, optional
-        Apply the normalization if True.
-    use_overlap : bool, optional
-        Use the combination of images in the overlap area for calculating
-        correlation coefficients if True.
-
-    Returns
-    -------
-    overlap : float
-        Width of the overlap area between two images.
-    side : int
-        Overlap side between two images.
-    overlap_position : float
-        Position of the window in the first image giving the best
-        correlation metric.
-
-    """
-    ncol1 = mat1.shape[1]
-    ncol2 = mat2.shape[1]
-    win_width = int(np.clip(win_width, 6, min(ncol1, ncol2) // 2))
-
-    if side == 1:
-        (list_metric, offset) = _search_overlap(
-            mat1,
-            mat2,
-            win_width,
-            side=side,
-            denoise=denoise,
-            norm=norm,
-            use_overlap=use_overlap,
-        )
-        overlap_position = _calculate_curvature(list_metric)[1]
-        overlap_position += offset
-        overlap = ncol1 - overlap_position + win_width // 2
-    elif side == 0:
-        (list_metric, offset) = _search_overlap(
-            mat1,
-            mat2,
-            win_width,
-            side=side,
-            denoise=denoise,
-            norm=norm,
-            use_overlap=use_overlap,
-        )
-        overlap_position = _calculate_curvature(list_metric)[1]
-        overlap_position += offset
-        overlap = overlap_position + win_width // 2
-    else:
-        (list_metric1, offset1) = _search_overlap(
-            mat1,
-            mat2,
-            win_width,
-            side=1,
-            denoise=denoise,
-            norm=norm,
-            use_overlap=use_overlap,
-        )
-        (list_metric2, offset2) = _search_overlap(
-            mat1,
-            mat2,
-            win_width,
-            side=0,
-            denoise=denoise,
-            norm=norm,
-            use_overlap=use_overlap,
-        )
-
-        (curvature1, overlap_position1) = _calculate_curvature(list_metric1)
-        overlap_position1 += offset1
-        (curvature2, overlap_position2) = _calculate_curvature(list_metric2)
-        overlap_position2 += offset2
-
-        if curvature1 > curvature2:
-            side = 1
-            overlap_position = overlap_position1
-            overlap = ncol1 - overlap_position + win_width // 2
-        else:
-            side = 0
-            overlap_position = overlap_position2
-            overlap = overlap_position + win_width // 2
-
-    return overlap, side, overlap_position
-
-
-@nvtx.annotate()
-def _search_overlap(
-    mat1, mat2, win_width, side, denoise=True, norm=False, use_overlap=False
-):
-    """
-    Calculate the correlation metrics between a rectangular region, defined
-    by the window width, on the utmost left/right side of image 2 and the
-    same size region in image 1 where the region is slided across image 1.
-
-    Parameters
-    ----------
-    mat1 : array_like
-        2D array. Projection image or sinogram image.
-    mat2 : array_like
-        2D array. Projection image or sinogram image.
-    win_width : int
-        Width of the searching window.
-    side : {0, 1}
-        Only two options: 0 or 1. It is used to indicate the overlap side
-        respects to image 1. "0" corresponds to the left side. "1" corresponds
-        to the right side.
-    denoise : bool, optional
-        Apply the Gaussian filter if True.
-    norm : bool, optional
-        Apply the normalization if True.
-    use_overlap : bool, optional
-        Use the combination of images in the overlap area for calculating
-        correlation coefficients if True.
-
-    Returns
-    -------
-    list_metric : array_like
-        1D array. List of the correlation metrics.
-    offset : int
-        Initial position of the searching window where the position
-        corresponds to the center of the window.
-    """
-    if denoise is True:
-        # note: the filtering makes the output contiguous
-        with nvtx.annotate("denoise_filter", color="green"):
-            mat1 = cpndi.gaussian_filter(mat1, (2, 2), mode="reflect")
-            mat2 = cpndi.gaussian_filter(mat2, (2, 2), mode="reflect")
-    else:
-        mat1 = cp.ascontiguousarray(mat1, dtype=cp.float32)
-        mat2 = cp.ascontiguousarray(mat2, dtype=cp.float32)
-
-    (nrow1, ncol1) = mat1.shape
-    (nrow2, ncol2) = mat2.shape
-
-    if nrow1 != nrow2:
-        raise ValueError("Two images are not at the same height!!!")
-
-    win_width = int(np.clip(win_width, 6, min(ncol1, ncol2) // 2 - 1))
-    offset = win_width // 2
-    win_width = 2 * offset  # Make it even
-
-    list_metric = _calc_metrics(mat1, mat2, win_width, side, use_overlap, norm)
-
-    min_metric = cp.min(list_metric)
-    if min_metric != 0.0:
-        list_metric /= min_metric
-
-    return list_metric, offset
-
-
-_calc_metrics_module = load_cuda_module(
-    "calc_metrics",
-    name_expressions=[
-        "calc_metrics_kernel<false, false>",
-        "calc_metrics_kernel<true, false>",
-        "calc_metrics_kernel<false, true>",
-        "calc_metrics_kernel<true, true>",
-    ],
-    options=("--maxrregcount=32",),
-)
-
-
-@nvtx.annotate()
-def _calc_metrics(mat1, mat2, win_width, side, use_overlap, norm):
-    assert mat1.dtype == cp.float32, "only float32 supported"
-    assert mat2.dtype == cp.float32, "only float32 supported"
-    assert mat1.shape[0] == mat2.shape[0]
-    assert mat1.flags.c_contiguous, "only contiguos arrays supported"
-    assert mat2.flags.c_contiguous, "only contiguos arrays supported"
-
-    num_pos = mat1.shape[1] - win_width
-    list_metric = cp.empty(num_pos, dtype=cp.float32)
-
-    args = (
-        mat1,
-        np.int32(mat1.strides[0] / mat1.strides[1]),
-        mat2,
-        np.int32(mat2.strides[0] / mat2.strides[1]),
-        np.int32(win_width),
-        np.int32(mat1.shape[0]),
-        np.int32(side),
-        list_metric,
-    )
-    block = (128, 1, 1)
-    grid = (1, np.int32(num_pos), 1)
-    smem = block[0] * 4 * 6 if use_overlap else block[0] * 4 * 3
-    bool2str = lambda x: "true" if x is True else "false"
-    calc_metrics = _calc_metrics_module.get_function(
-        f"calc_metrics_kernel<{bool2str(norm)}, {bool2str(use_overlap)}>"
-    )
-    calc_metrics(grid=grid, block=block, args=args, shared_mem=smem)
-
-    return list_metric
-
-
-@nvtx.annotate()
-def _calculate_curvature(list_metric):
-    """
-    Calculate the curvature of a fitted curve going through the minimum
-    value of a metric list.
-
-    Parameters
-    ----------
-    list_metric : array_like
-        1D array. List of metrics.
-
-    Returns
-    -------
-    curvature : float
-        Quadratic coefficient of the parabola fitting.
-    min_pos : float
-        Position of the minimum value with sub-pixel accuracy.
-    """
-    radi = 2
-    num_metric = list_metric.size
-    min_metric_idx = int(cp.argmin(list_metric))
-    min_pos = int(np.clip(min_metric_idx, radi, num_metric - radi - 1))
-
-    # work mostly on CPU here - we have very small arrays here
-    list1 = cp.asnumpy(list_metric[min_pos - radi : min_pos + radi + 1])
-    afact1 = np.polyfit(np.arange(0, 2 * radi + 1), list1, 2)[0]
-    list2 = cp.asnumpy(list_metric[min_pos - 1 : min_pos + 2])
-    (afact2, bfact2, _) = np.polyfit(np.arange(min_pos - 1, min_pos + 2), list2, 2)
-
-    curvature = np.abs(afact1)
-    if afact2 != 0.0:
-        num = -bfact2 / (2 * afact2)
-        if (num >= min_pos - 1) and (num <= min_pos + 1):
-            min_pos = num
-
-    return curvature, np.float32(min_pos)

From c72fa1a298a906e3934d3146a316b7aece97af46 Mon Sep 17 00:00:00 2001
From: dkazanc <dkazanc@hotmail.com>
Date: Tue, 30 Apr 2024 13:46:04 +0100
Subject: [PATCH 03/36] try2

---
 conda/recipe/meta.yaml | 2 --
 pyproject.toml         | 2 --
 2 files changed, 4 deletions(-)

diff --git a/conda/recipe/meta.yaml b/conda/recipe/meta.yaml
index 98625e81..cbbb2591 100644
--- a/conda/recipe/meta.yaml
+++ b/conda/recipe/meta.yaml
@@ -41,8 +41,6 @@ test:
   imports:
     - httomolibgpu
     - httomolibgpu.misc
-    - httomolibgpu.prep
-    - httomolibgpu.recon
   source_files:
     - tests/*
   commands:
diff --git a/pyproject.toml b/pyproject.toml
index 282e66ac..ad679a42 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -6,8 +6,6 @@ build-backend = "setuptools.build_meta"
 include-package-data = true
 packages = ["httomolibgpu",
             "httomolibgpu.misc",
-            "httomolibgpu.prep",
-            "httomolibgpu.recon",
             "httomolibgpu.cuda_kernels"]
 
 [tool.setuptools.package-data]

From ddf9725da3051c8547d6d3eecbd784ecf7633a78 Mon Sep 17 00:00:00 2001
From: dkazanc <dkazanc@hotmail.com>
Date: Tue, 30 Apr 2024 14:11:13 +0100
Subject: [PATCH 04/36] try3

---
 httomolibgpu/misc/corr.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/httomolibgpu/misc/corr.py b/httomolibgpu/misc/corr.py
index 277a27a5..02e6e884 100644
--- a/httomolibgpu/misc/corr.py
+++ b/httomolibgpu/misc/corr.py
@@ -22,7 +22,7 @@
 
 try:
     import cupy as cp
-    import nvtx
+    #import nvtx
 except ImportError:
     print("Cupy library is a required dependency for HTTomolibgpu, please install")
 
@@ -36,15 +36,13 @@
 import numpy as np
 from numpy import float32
 
-from httomolibgpu.cuda_kernels import load_cuda_module
-
 __all__ = [
     "median_filter",
     "remove_outlier",
 ]
 
 
-@nvtx.annotate()
+#@nvtx.annotate()
 def median_filter(
     data: cp.ndarray,
     kernel_size: int = 3,
@@ -76,6 +74,8 @@ def median_filter(
     ValueError
         If the input array is not three dimensional.
     """
+    from httomolibgpu.cuda_kernels import load_cuda_module    
+    
     input_type = data.dtype
 
     if input_type not in ["float32", "uint16"]:
@@ -146,7 +146,6 @@ def median_filter(
         thresholding_kernel(data, float32(dif), output)
     return output
 
-
 def remove_outlier(
     data: cp.ndarray, kernel_size: int = 3, axis: int = 0, dif: float = 0.1
 ) -> cp.ndarray:

From 0e4908d9337cd0aff9cd0153936dff28020abaf8 Mon Sep 17 00:00:00 2001
From: dkazanc <dkazanc@hotmail.com>
Date: Tue, 30 Apr 2024 14:19:59 +0100
Subject: [PATCH 05/36] try4

---
 httomolibgpu/misc/corr.py | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/httomolibgpu/misc/corr.py b/httomolibgpu/misc/corr.py
index 02e6e884..f1350e69 100644
--- a/httomolibgpu/misc/corr.py
+++ b/httomolibgpu/misc/corr.py
@@ -20,29 +20,37 @@
 # ---------------------------------------------------------------------------
 """ Module for data correction """
 
+cupy_run = False
 try:
     import cupy as cp
-    #import nvtx
+
+    # import nvtx
+    cupy_run = True
 except ImportError:
     print("Cupy library is a required dependency for HTTomolibgpu, please install")
 
 try:
     from cucim.skimage.filters import median
-    from cucim.skimage.morphology import disk    
+    from cucim.skimage.morphology import disk
 except ImportError:
-    print("Cucim library of RapidsAI is a required dependency for HTTomolibgpu, please install")
+    print(
+        "Cucim library of RapidsAI is a required dependency for HTTomolibgpu, please install"
+    )
 
 from typing import Tuple
 import numpy as np
 from numpy import float32
 
+if cupy_run:
+    from httomolibgpu.cuda_kernels import load_cuda_module
+
 __all__ = [
     "median_filter",
     "remove_outlier",
 ]
 
 
-#@nvtx.annotate()
+# @nvtx.annotate()
 def median_filter(
     data: cp.ndarray,
     kernel_size: int = 3,
@@ -74,8 +82,7 @@ def median_filter(
     ValueError
         If the input array is not three dimensional.
     """
-    from httomolibgpu.cuda_kernels import load_cuda_module    
-    
+
     input_type = data.dtype
 
     if input_type not in ["float32", "uint16"]:
@@ -146,6 +153,7 @@ def median_filter(
         thresholding_kernel(data, float32(dif), output)
     return output
 
+
 def remove_outlier(
     data: cp.ndarray, kernel_size: int = 3, axis: int = 0, dif: float = 0.1
 ) -> cp.ndarray:

From fb8ce77b48bc8fb83e2073705ff2b3547799296a Mon Sep 17 00:00:00 2001
From: dkazanc <dkazanc@hotmail.com>
Date: Tue, 30 Apr 2024 15:13:08 +0100
Subject: [PATCH 06/36] try5

---
 httomolibgpu/misc/corr.py | 39 +++++++++++++++++++++++++++++----------
 1 file changed, 29 insertions(+), 10 deletions(-)

diff --git a/httomolibgpu/misc/corr.py b/httomolibgpu/misc/corr.py
index f1350e69..819c2481 100644
--- a/httomolibgpu/misc/corr.py
+++ b/httomolibgpu/misc/corr.py
@@ -20,14 +20,32 @@
 # ---------------------------------------------------------------------------
 """ Module for data correction """
 
+
+import numpy as xp
+
 cupy_run = False
+
 try:
     import cupy as cp
 
-    # import nvtx
-    cupy_run = True
+    try:
+        cp.cuda.Device(0).compute_capability
+        cupy_run = True
+
+    except cp.cuda.runtime.CUDARuntimeError:
+        print("Cupy library is a required dependency for HTTomolibgpu, please install")
+        import numpy as np
 except ImportError:
-    print("Cupy library is a required dependency for HTTomolibgpu, please install")
+    import numpy as np
+
+# cupy_run = False
+# try:
+#     import cupy as cp
+
+#     # import nvtx
+#     cupy_run = True
+# except ImportError:
+#     print("Cupy library is a required dependency for HTTomolibgpu, please install")
 
 try:
     from cucim.skimage.filters import median
@@ -38,7 +56,8 @@
     )
 
 from typing import Tuple
-import numpy as np
+
+# import numpy as np
 from numpy import float32
 
 if cupy_run:
@@ -52,11 +71,11 @@
 
 # @nvtx.annotate()
 def median_filter(
-    data: cp.ndarray,
+    data: xp.ndarray,
     kernel_size: int = 3,
     axis: int = 0,
     dif: float = 0.0,
-) -> cp.ndarray:
+) -> xp.ndarray:
     """
     Apply 2D or 3D median or dezinger (when dif>0) filter to a 3D array.
 
@@ -101,7 +120,7 @@ def median_filter(
         raise ValueError("The axis should be 0,1,2 or None for full 3d processing")
 
     dz, dy, dx = data.shape
-    output = cp.empty(data.shape, dtype=input_type, order="C")
+    output = xp.empty(data.shape, dtype=input_type, order="C")
 
     if axis == 0:
         for j in range(dz):
@@ -142,7 +161,7 @@ def median_filter(
                 output = data;
             }
             """
-        thresholding_kernel = cp.ElementwiseKernel(
+        thresholding_kernel = xp.ElementwiseKernel(
             "T data, raw float32 dif",
             "T output",
             kernel,
@@ -155,8 +174,8 @@ def median_filter(
 
 
 def remove_outlier(
-    data: cp.ndarray, kernel_size: int = 3, axis: int = 0, dif: float = 0.1
-) -> cp.ndarray:
+    data: xp.ndarray, kernel_size: int = 3, axis: int = 0, dif: float = 0.1
+) -> xp.ndarray:
     """
     Selectively applies 3D median filter to a 3D array to remove outliers. Also called a dezinger.
 

From 08012d38f0b2fa741aa07e1c72f4b22a5f1125c9 Mon Sep 17 00:00:00 2001
From: dkazanc <dkazanc@hotmail.com>
Date: Tue, 30 Apr 2024 16:03:22 +0100
Subject: [PATCH 07/36] working version for corr module

---
 httomolibgpu/misc/corr.py | 27 +++++++--------------------
 1 file changed, 7 insertions(+), 20 deletions(-)

diff --git a/httomolibgpu/misc/corr.py b/httomolibgpu/misc/corr.py
index 819c2481..8c043742 100644
--- a/httomolibgpu/misc/corr.py
+++ b/httomolibgpu/misc/corr.py
@@ -20,45 +20,32 @@
 # ---------------------------------------------------------------------------
 """ Module for data correction """
 
-
 import numpy as xp
 
 cupy_run = False
-
 try:
-    import cupy as cp
+    import cupy as xp
 
     try:
-        cp.cuda.Device(0).compute_capability
+        xp.cuda.Device(0).compute_capability
         cupy_run = True
 
-    except cp.cuda.runtime.CUDARuntimeError:
-        print("Cupy library is a required dependency for HTTomolibgpu, please install")
+    except xp.cuda.runtime.CUDARuntimeError:
+        print("CuPy library is a major dependency for HTTomolibgpu, please install")
         import numpy as np
 except ImportError:
     import numpy as np
 
-# cupy_run = False
-# try:
-#     import cupy as cp
-
-#     # import nvtx
-#     cupy_run = True
-# except ImportError:
-#     print("Cupy library is a required dependency for HTTomolibgpu, please install")
-
 try:
     from cucim.skimage.filters import median
     from cucim.skimage.morphology import disk
 except ImportError:
     print(
-        "Cucim library of RapidsAI is a required dependency for HTTomolibgpu, please install"
+        "Cucim library of RapidsAI is a required dependency for some modules, please install"
     )
 
-from typing import Tuple
-
-# import numpy as np
 from numpy import float32
+import nvtx
 
 if cupy_run:
     from httomolibgpu.cuda_kernels import load_cuda_module
@@ -69,7 +56,7 @@
 ]
 
 
-# @nvtx.annotate()
+@nvtx.annotate()
 def median_filter(
     data: xp.ndarray,
     kernel_size: int = 3,

From cb0c656ec20d9fbaf790f3d3b5634648df32cf0a Mon Sep 17 00:00:00 2001
From: dkazanc <dkazanc@hotmail.com>
Date: Tue, 30 Apr 2024 16:16:24 +0100
Subject: [PATCH 08/36] adding other misc modules

---
 httomolibgpu/__init__.py     |   3 +-
 httomolibgpu/misc/corr.py    |   2 +-
 httomolibgpu/misc/morph.py   | 223 +++++++++++++++++++++++++++++++++++
 httomolibgpu/misc/rescale.py | 101 ++++++++++++++++
 4 files changed, 327 insertions(+), 2 deletions(-)
 create mode 100644 httomolibgpu/misc/morph.py
 create mode 100644 httomolibgpu/misc/rescale.py

diff --git a/httomolibgpu/__init__.py b/httomolibgpu/__init__.py
index 08a5227e..994c904f 100644
--- a/httomolibgpu/__init__.py
+++ b/httomolibgpu/__init__.py
@@ -1,5 +1,6 @@
 from httomolibgpu.misc.corr import *
-#from httomolibgpu.misc.morph import *
+from httomolibgpu.misc.morph import *
+from httomolibgpu.misc.rescale import *
 #from httomolibgpu.prep.alignment import *
 #from httomolibgpu.prep.normalize import *
 #from httomolibgpu.prep.phase import *
diff --git a/httomolibgpu/misc/corr.py b/httomolibgpu/misc/corr.py
index 8c043742..e0607ae5 100644
--- a/httomolibgpu/misc/corr.py
+++ b/httomolibgpu/misc/corr.py
@@ -41,7 +41,7 @@
     from cucim.skimage.morphology import disk
 except ImportError:
     print(
-        "Cucim library of RapidsAI is a required dependency for some modules, please install"
+        "Cucim library of Rapidsai is a required dependency for some modules, please install"
     )
 
 from numpy import float32
diff --git a/httomolibgpu/misc/morph.py b/httomolibgpu/misc/morph.py
new file mode 100644
index 00000000..06bd9ef2
--- /dev/null
+++ b/httomolibgpu/misc/morph.py
@@ -0,0 +1,223 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+# ---------------------------------------------------------------------------
+# Copyright 2023 Diamond Light Source Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ---------------------------------------------------------------------------
+# Created By  : Tomography Team at DLS <scientificsoftware@diamond.ac.uk>
+# Created Date: 23 March 2023
+# ---------------------------------------------------------------------------
+"""Module for data type morphing functions"""
+
+import numpy as xp
+import numpy as np
+
+try:
+    import cupy as xp
+
+    try:
+        xp.cuda.Device(0).compute_capability
+
+    except xp.cuda.runtime.CUDARuntimeError:
+        print("CuPy library is a major dependency for HTTomolibgpu, please install")
+        import numpy as np
+except ImportError:
+    import numpy as np
+import nvtx
+from typing import Literal
+
+__all__ = [
+    "sino_360_to_180",
+    "data_resampler",
+]
+
+
+@nvtx.annotate()
+def sino_360_to_180(
+    data: xp.ndarray, overlap: int = 0, rotation: Literal["left", "right"] = "left"
+) -> xp.ndarray:
+    """
+    Converts 0-360 degrees sinogram to a 0-180 sinogram.
+    If the number of projections in the input data is odd, the last projection
+    will be discarded.
+
+    Parameters
+    ----------
+    data : cp.ndarray
+        Input 3D data.
+    overlap : scalar, optional
+        Overlapping number of pixels.
+    rotation : string, optional
+        'left' if rotation center is close to the left of the
+        field-of-view, 'right' otherwise.
+    Returns
+    -------
+    cp.ndarray
+        Output 3D data.
+    """
+    if data.ndim != 3:
+        raise ValueError("only 3D data is supported")
+
+    dx, dy, dz = data.shape
+
+    overlap = int(np.round(overlap))
+    if overlap >= dz:
+        raise ValueError("overlap must be less than data.shape[2]")
+    if overlap < 0:
+        raise ValueError("only positive overlaps are allowed.")
+
+    n = dx // 2
+
+    out = xp.empty((n, dy, 2 * dz - overlap), dtype=data.dtype)
+
+    if rotation == "left":
+        weights = xp.linspace(0, 1.0, overlap)
+        out[:, :, -dz + overlap :] = data[:n, :, overlap:]
+        out[:, :, : dz - overlap] = data[n : 2 * n, :, overlap:][:, :, ::-1]
+        out[:, :, dz - overlap : dz] = (
+            weights * data[:n, :, :overlap]
+            + (weights * data[n : 2 * n, :, :overlap])[:, :, ::-1]
+        )
+    elif rotation == "right":
+        weights = xp.linspace(1.0, 0, overlap)
+        out[:, :, : dz - overlap] = data[:n, :, :-overlap]
+        out[:, :, -dz + overlap :] = data[n : 2 * n, :, :-overlap][:, :, ::-1]
+        out[:, :, dz - overlap : dz] = (
+            weights * data[:n, :, -overlap:]
+            + (weights * data[n : 2 * n, :, -overlap:])[:, :, ::-1]
+        )
+    else:
+        raise ValueError('rotation parameter must be either "left" or "right"')
+
+    return out
+
+
+@nvtx.annotate()
+def data_resampler(
+    data: xp.ndarray, newshape: list, axis: int = 1, interpolation: str = "linear"
+) -> xp.ndarray:
+    """Down/Up-resampler of the input data implemented through interpn function.
+       Please note that the method will leave the specified axis
+       dimension unchanged, e.g. (128,128,128) -> (128,256,256) for axis = 0 and
+       newshape = [256,256].
+
+    Args:
+        data (cp.ndarray): 3d cupy array.
+        newshape (list): 2d list that defines the 2D slice shape of new shape data.
+        axis (int, optional): Axis along which the scaling is applied. Defaults to 1.
+        interpolation (str, optional): Selection of interpolation method. Defaults to 'linear'.
+
+    Raises:
+        ValueError: When data is not 3D
+
+    Returns:
+        cp.ndarray: Up/Down-scaled 3D cupy array
+    """
+    from cupyx.scipy.interpolate import interpn
+
+    if data.ndim != 3:
+        raise ValueError("only 3D data is supported")
+
+    N, M, Z = xp.shape(data)
+
+    if axis == 0:
+        xaxis = xp.arange(M) - M / 2
+        yaxis = xp.arange(Z) - Z / 2
+        step_x = M / newshape[0]
+        step_y = Z / newshape[1]
+        scaled_data = xp.empty((N, newshape[0], newshape[1]), dtype=xp.float32)
+    elif axis == 1:
+        xaxis = xp.arange(N) - N / 2
+        yaxis = xp.arange(Z) - Z / 2
+        step_x = N / newshape[0]
+        step_y = Z / newshape[1]
+        scaled_data = xp.empty((newshape[0], M, newshape[1]), dtype=xp.float32)
+    elif axis == 2:
+        xaxis = xp.arange(N) - N / 2
+        yaxis = xp.arange(M) - M / 2
+        step_x = N / newshape[0]
+        step_y = M / newshape[1]
+        scaled_data = xp.empty((newshape[0], newshape[1], Z), dtype=xp.float32)
+    else:
+        raise ValueError("Only 0,1,2 values for axes are supported")
+
+    points = (xaxis, yaxis)
+
+    scale_x = 2 / step_x
+    scale_y = 2 / step_y
+
+    y1 = np.linspace(
+        -newshape[0] / scale_x,
+        newshape[0] / scale_x - step_x,
+        num=newshape[0],
+        endpoint=False,
+    ).astype(np.float32)
+    x1 = np.linspace(
+        -newshape[1] / scale_y,
+        newshape[1] / scale_y - step_y,
+        num=newshape[1],
+        endpoint=False,
+    ).astype(np.float32)
+
+    xi_mesh = np.meshgrid(x1, y1)
+    xi = np.empty((2, newshape[0], newshape[1]), dtype=np.float32)
+    xi[0, :, :] = xi_mesh[1]
+    xi[1, :, :] = xi_mesh[0]
+    xi_size = xi.size
+    xi = np.rollaxis(xi, 0, 3)
+    xi = np.reshape(xi, [xi_size // 2, 2])
+    xi = xp.asarray(xi, dtype=xp.float32, order="C")
+
+    if axis == 0:
+        for j in range(N):
+            res = interpn(
+                points,
+                data[j, :, :],
+                xi,
+                method=interpolation,
+                bounds_error=False,
+                fill_value=0.0,
+            )
+            scaled_data[j, :, :] = xp.reshape(
+                res, [newshape[0], newshape[1]], order="C"
+            )
+    elif axis == 1:
+
+        for j in range(M):
+            res = interpn(
+                points,
+                data[:, j, :],
+                xi,
+                method=interpolation,
+                bounds_error=False,
+                fill_value=0.0,
+            )
+            scaled_data[:, j, :] = xp.reshape(
+                res, [newshape[0], newshape[1]], order="C"
+            )
+    else:
+        for j in range(Z):
+            res = interpn(
+                points,
+                data[:, :, j],
+                xi,
+                method=interpolation,
+                bounds_error=False,
+                fill_value=0.0,
+            )
+            scaled_data[:, :, j] = xp.reshape(
+                res, [newshape[0], newshape[1]], order="C"
+            )
+
+    return scaled_data
diff --git a/httomolibgpu/misc/rescale.py b/httomolibgpu/misc/rescale.py
new file mode 100644
index 00000000..fc54e2bb
--- /dev/null
+++ b/httomolibgpu/misc/rescale.py
@@ -0,0 +1,101 @@
+import numpy as xp
+import numpy as np
+
+try:
+    import cupy as xp
+
+    try:
+        xp.cuda.Device(0).compute_capability
+
+    except xp.cuda.runtime.CUDARuntimeError:
+        print("CuPy library is a major dependency for HTTomolibgpu, please install")
+        import numpy as np
+except ImportError:
+    import numpy as np
+import nvtx
+from typing import Literal, Optional, Tuple, Union
+
+__all__ = [
+    "rescale_to_int",
+]
+
+rescale_kernel = xp.ElementwiseKernel(
+    "T x, raw T input_min, raw T input_max, raw T factor",
+    "O out",
+    """
+      T x_clean = isnan(x) || isinf(x) ? T(0) : x;
+      T x_clipped = x_clean < input_min ? input_min : (x_clean > input_max ? input_max : x_clean);
+      T x_rebased = x_clipped - input_min;
+      out = O(x_rebased * factor);
+    """,
+    "rescale_to_int",
+)
+
+
+@nvtx.annotate()
+def rescale_to_int(
+    data: xp.ndarray,
+    perc_range_min: float = 0.0,
+    perc_range_max: float = 100.0,
+    bits: Literal[8, 16, 32] = 8,
+    glob_stats: Optional[Tuple[float, float, float, int]] = None,
+):
+    """
+    Rescales the data and converts it fit into the range of an unsigned integer type
+    with the given number of bits.
+
+    Parameters
+    ----------
+    data : cp.ndarray
+        Required input data array, on GPU
+    perc_range_min: float, optional
+        The lower cutoff point in the input data, in percent of the data range (defaults to 0).
+        The lower bound is computed as min + perc_range_min/100*(max-min)
+    perc_range_max: float, optional
+        The upper cutoff point in the input data, in percent of the data range (defaults to 100).
+        The upper bound is computed as min + perc_range_max/100*(max-min)
+    bits: Literal[8, 16, 32], optional
+        The number of bits in the output integer range (defaults to 8).
+        Allowed values are:
+        - 8 -> uint8
+        - 16 -> uint16
+        - 32 -> uint32
+    glob_stats: tuple, optional
+        Global statistics of the full dataset (beyond the data passed into this call).
+        It's a tuple with (min, max, sum, num_items). If not given, the min/max is
+        computed from the given data.
+
+    Returns
+    -------
+    cp.ndarray
+        The original data, clipped to the range specified with the perc_range_min and
+        perc_range_max, and scaled to the full range of the output integer type
+    """
+
+    if bits == 8:
+        output_dtype: Union[type[np.uint8], type[np.uint16], type[np.uint32]] = np.uint8
+    elif bits == 16:
+        output_dtype = np.uint16
+    else:
+        output_dtype = np.uint32
+
+    # get the min and max integer values of the output type
+    output_min = np.iinfo(output_dtype).min
+    output_max = np.iinfo(output_dtype).max
+
+    if not isinstance(glob_stats, tuple):
+        min_value = float(xp.min(data))
+        max_value = float(xp.max(data))
+    else:
+        min_value = glob_stats[0]
+        max_value = glob_stats[1]
+
+    range_intensity = max_value - min_value
+    input_min = (perc_range_min * (range_intensity) / 100) + min_value
+    input_max = (perc_range_max * (range_intensity) / 100) + min_value
+
+    factor = (output_max - output_min) / (input_max - input_min)
+
+    res = xp.empty(data.shape, dtype=output_dtype)
+    rescale_kernel(data, input_min, input_max, factor, res)
+    return res

From fb0744a685fa616fd0d27593faf77fb22bd575a1 Mon Sep 17 00:00:00 2001
From: dkazanc <dkazanc@hotmail.com>
Date: Tue, 30 Apr 2024 16:42:23 +0100
Subject: [PATCH 09/36] adding prep

---
 httomolibgpu/misc/rescale.py   |  23 +-
 httomolibgpu/prep/__init__.py  |   0
 httomolibgpu/prep/alignment.py | 175 ++++++++++++++
 httomolibgpu/prep/normalize.py | 140 +++++++++++
 httomolibgpu/prep/phase.py     | 422 +++++++++++++++++++++++++++++++++
 httomolibgpu/prep/stripe.py    | 394 ++++++++++++++++++++++++++++++
 tests/test_prep/test_stripe.py |  46 ++--
 7 files changed, 1161 insertions(+), 39 deletions(-)
 create mode 100644 httomolibgpu/prep/__init__.py
 create mode 100644 httomolibgpu/prep/alignment.py
 create mode 100644 httomolibgpu/prep/normalize.py
 create mode 100644 httomolibgpu/prep/phase.py
 create mode 100644 httomolibgpu/prep/stripe.py

diff --git a/httomolibgpu/misc/rescale.py b/httomolibgpu/misc/rescale.py
index fc54e2bb..339808e2 100644
--- a/httomolibgpu/misc/rescale.py
+++ b/httomolibgpu/misc/rescale.py
@@ -19,18 +19,6 @@
     "rescale_to_int",
 ]
 
-rescale_kernel = xp.ElementwiseKernel(
-    "T x, raw T input_min, raw T input_max, raw T factor",
-    "O out",
-    """
-      T x_clean = isnan(x) || isinf(x) ? T(0) : x;
-      T x_clipped = x_clean < input_min ? input_min : (x_clean > input_max ? input_max : x_clean);
-      T x_rebased = x_clipped - input_min;
-      out = O(x_rebased * factor);
-    """,
-    "rescale_to_int",
-)
-
 
 @nvtx.annotate()
 def rescale_to_int(
@@ -97,5 +85,16 @@ def rescale_to_int(
     factor = (output_max - output_min) / (input_max - input_min)
 
     res = xp.empty(data.shape, dtype=output_dtype)
+    rescale_kernel = xp.ElementwiseKernel(
+        "T x, raw T input_min, raw T input_max, raw T factor",
+        "O out",
+        """
+        T x_clean = isnan(x) || isinf(x) ? T(0) : x;
+        T x_clipped = x_clean < input_min ? input_min : (x_clean > input_max ? input_max : x_clean);
+        T x_rebased = x_clipped - input_min;
+        out = O(x_rebased * factor);
+        """,
+        "rescale_to_int",
+    )
     rescale_kernel(data, input_min, input_max, factor, res)
     return res
diff --git a/httomolibgpu/prep/__init__.py b/httomolibgpu/prep/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/httomolibgpu/prep/alignment.py b/httomolibgpu/prep/alignment.py
new file mode 100644
index 00000000..c5ffec32
--- /dev/null
+++ b/httomolibgpu/prep/alignment.py
@@ -0,0 +1,175 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+# ---------------------------------------------------------------------------
+# Copyright 2022 Diamond Light Source Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ---------------------------------------------------------------------------
+# Created By  : Tomography Team at DLS <scientificsoftware@diamond.ac.uk>
+# Created Date: 01 November 2022
+# ---------------------------------------------------------------------------
+"""Modules for data correction"""
+
+import numpy as xp
+import numpy as np
+
+try:
+    import cupy as xp
+    from cupy import mean
+
+    try:
+        xp.cuda.Device(0).compute_capability
+    except xp.cuda.runtime.CUDARuntimeError:
+        print("CuPy library is a major dependency for HTTomolibgpu, please install")
+        import numpy as np
+except ImportError:
+    import numpy as np
+
+from typing import Dict, List
+import nvtx
+
+__all__ = [
+    "distortion_correction_proj_discorpy",
+]
+
+
+# CuPy implementation of distortion correction from Discorpy
+# https://github.com/DiamondLightSource/discorpy/blob/67743842b60bf5dd45b21b8460e369d4a5e94d67/discorpy/post/postprocessing.py#L111-L148
+# (which is the same as the TomoPy version
+# https://github.com/tomopy/tomopy/blob/c236a2969074f5fc70189fb5545f0a165924f916/source/tomopy/prep/alignment.py#L950-L981
+# but with the additional params `order` and `mode`).
+@nvtx.annotate()
+def distortion_correction_proj_discorpy(
+    data: xp.ndarray,
+    metadata_path: str,
+    preview: Dict[str, List[int]],
+    order: int = 1,
+    mode: str = "reflect",
+):
+    """Unwarp a stack of images using a backward model.
+
+    Parameters
+    ----------
+    data : cp.ndarray
+        3D array.
+
+    metadata_path : str
+        The path to the file containing the distortion coefficients for the
+        data.
+
+    preview : Dict[str, List[int]]
+        A dict containing three key-value pairs:
+        - a list containing the `start` value of each dimension
+        - a list containing the `stop` value of each dimension
+        - a list containing the `step` value of each dimension
+
+    order : int, optional.
+        The order of the spline interpolation.
+
+    mode : {'reflect', 'grid-mirror', 'constant', 'grid-constant', 'nearest',
+           'mirror', 'grid-wrap', 'wrap'}, optional
+        To determine how to handle image boundaries.
+
+    Returns
+    -------
+    cp.ndarray
+        3D array. Distortion-corrected image(s).
+    """
+    from cupyx.scipy.ndimage import map_coordinates
+
+    # Check if it's a stack of 2D images, or only a single 2D image
+    if len(data.shape) == 2:
+        data = xp.expand_dims(data, axis=0)
+
+    # Get info from metadata txt file
+    xcenter, ycenter, list_fact = _load_metadata_txt(metadata_path)
+
+    # Use preview information to offset the x and y coords of the center of
+    # distortion
+    shift = preview["starts"]
+    step = preview["steps"]
+    x_dim = 1
+    y_dim = 0
+    step_check = max([step[i] for i in [x_dim, y_dim]]) > 1
+    if step_check:
+        msg = (
+            "\n***********************************************\n"
+            "!!! ERROR !!! -> Method doesn't work with the step in"
+            " the preview larger than 1 \n"
+            "***********************************************\n"
+        )
+        raise ValueError(msg)
+
+    x_offset = shift[x_dim]
+    y_offset = shift[y_dim]
+    xcenter = xcenter - x_offset
+    ycenter = ycenter - y_offset
+
+    height, width = data.shape[y_dim + 1], data.shape[x_dim + 1]
+    xu_list = xp.arange(width) - xcenter
+    yu_list = xp.arange(height) - ycenter
+    xu_mat, yu_mat = xp.meshgrid(xu_list, yu_list)
+    ru_mat = xp.sqrt(xu_mat**2 + yu_mat**2)
+    fact_mat = xp.sum(
+        xp.asarray([factor * ru_mat**i for i, factor in enumerate(list_fact)]), axis=0
+    )
+    xd_mat = xp.asarray(
+        xp.clip(xcenter + fact_mat * xu_mat, 0, width - 1), dtype=xp.float32
+    )
+    yd_mat = xp.asarray(
+        xp.clip(ycenter + fact_mat * yu_mat, 0, height - 1), dtype=xp.float32
+    )
+    indices = [xp.reshape(yd_mat, (-1, 1)), xp.reshape(xd_mat, (-1, 1))]
+    indices = xp.asarray(indices, dtype=xp.float32)
+
+    # Loop over images and unwarp them
+    for i in range(data.shape[0]):
+        mat = map_coordinates(data[i], indices, order=order, mode=mode)
+        mat = xp.reshape(mat, (height, width))
+        data[i] = mat
+
+    return data
+
+
+def _load_metadata_txt(file_path):
+    """
+    Load distortion coefficients from a text file.
+    Order of the infor in the text file:
+    xcenter
+    ycenter
+    factor_0
+    factor_1
+    factor_2
+    ...
+    Parameters
+    ----------
+    file_path : str
+        Path to the file
+    Returns
+    -------
+    tuple of float and list of floats
+        Tuple of (xcenter, ycenter, list_fact).
+    """
+    with open(file_path, "r") as f:
+        x = f.read().splitlines()
+        list_data = []
+        for i in x:
+            list_data.append(float(i.split()[-1]))
+    xcenter = list_data[0]
+    ycenter = list_data[1]
+    list_fact = list_data[2:]
+
+    return xcenter, ycenter, list_fact
+
+
+## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%  ##
diff --git a/httomolibgpu/prep/normalize.py b/httomolibgpu/prep/normalize.py
new file mode 100644
index 00000000..94276142
--- /dev/null
+++ b/httomolibgpu/prep/normalize.py
@@ -0,0 +1,140 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+# ---------------------------------------------------------------------------
+# Copyright 2022 Diamond Light Source Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ---------------------------------------------------------------------------
+# Created By  : Tomography Team at DLS <scientificsoftware@diamond.ac.uk>
+# Created Date: 01 November 2022
+# ---------------------------------------------------------------------------
+"""Modules for raw projection data normalization"""
+
+import numpy as xp
+import numpy as np
+
+try:
+    import cupy as xp
+    from cupy import mean
+
+    try:
+        xp.cuda.Device(0).compute_capability
+    except xp.cuda.runtime.CUDARuntimeError:
+        print("CuPy library is a major dependency for HTTomolibgpu, please install")
+        import numpy as np
+except ImportError:
+    import numpy as np
+
+import nvtx
+from numpy import float32
+from typing import Tuple
+
+__all__ = ["normalize"]
+
+
+@nvtx.annotate()
+def normalize(
+    data: xp.ndarray,
+    flats: xp.ndarray,
+    darks: xp.ndarray,
+    cutoff: float = 10.0,
+    minus_log: bool = True,
+    nonnegativity: bool = False,
+    remove_nans: bool = False,
+) -> xp.ndarray:
+    """
+    Normalize raw projection data using the flat and dark field projections.
+    This is a raw CUDA kernel implementation with CuPy wrappers.
+
+    Parameters
+    ----------
+    data : cp.ndarray
+        Projection data as a CuPy array.
+    flats : cp.ndarray
+        3D flat field data as a CuPy array.
+    darks : cp.ndarray
+        3D dark field data as a CuPy array.
+    cutoff : float, optional
+        Permitted maximum value for the normalised data.
+    minus_log : bool, optional
+        Apply negative log to the normalised data.
+    nonnegativity : bool, optional
+        Remove negative values in the normalised data.
+    remove_nans : bool, optional
+        Remove NaN and Inf values in the normalised data.
+
+    Returns
+    -------
+    cp.ndarray
+        Normalised 3D tomographic data as a CuPy array.
+    """
+
+    _check_valid_input(data, flats, darks)
+
+    dark0 = xp.empty(darks.shape[1:], dtype=float32)
+    flat0 = xp.empty(flats.shape[1:], dtype=float32)
+    out = xp.empty(data.shape, dtype=float32)
+    mean(darks, axis=0, dtype=float32, out=dark0)
+    mean(flats, axis=0, dtype=float32, out=flat0)
+
+    kernel_name = "normalisation"
+    kernel = r"""
+        float denom = float(flats) - float(darks);
+        if (denom < eps) {
+            denom = eps;
+        }
+        float v = (float(data) - float(darks))/denom;
+        """
+    if minus_log:
+        kernel += "v = -log(v);\n"
+        kernel_name += "_mlog"
+    if nonnegativity:
+        kernel += "if (v < 0.0f) v = 0.0f;\n"
+        kernel_name += "_nneg"
+    if remove_nans:
+        kernel += "if (isnan(v)) v = 0.0f;\n"
+        kernel += "if (isinf(v)) v = 0.0f;\n"
+        kernel_name += "_remnan"
+    kernel += "if (v > cutoff) v = cutoff;\n"
+    kernel += "out = v;\n"
+
+    normalisation_kernel = xp.ElementwiseKernel(
+        "T data, U flats, U darks, raw float32 cutoff",
+        "float32 out",
+        kernel,
+        kernel_name,
+        options=("-std=c++11",),
+        loop_prep="constexpr float eps = 1.0e-07;",
+        no_return=True,
+    )
+
+    normalisation_kernel(data, flat0, dark0, float32(cutoff), out)
+
+    return out
+
+
+def _check_valid_input(data, flats, darks) -> None:
+    """Helper function to check the validity of inputs to normalisation functions"""
+    if data.ndim != 3:
+        raise ValueError("Input data must be a 3D stack of projections")
+
+    if flats.ndim not in (2, 3):
+        raise ValueError("Input flats must be 2D or 3D data only")
+
+    if darks.ndim not in (2, 3):
+        raise ValueError("Input darks must be 2D or 3D data only")
+
+    if flats.ndim == 2:
+        flats = flats[xp.newaxis, :, :]
+    if darks.ndim == 2:
+        darks = darks[xp.newaxis, :, :]
diff --git a/httomolibgpu/prep/phase.py b/httomolibgpu/prep/phase.py
new file mode 100644
index 00000000..d126fe37
--- /dev/null
+++ b/httomolibgpu/prep/phase.py
@@ -0,0 +1,422 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+# ---------------------------------------------------------------------------
+# Copyright 2022 Diamond Light Source Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ---------------------------------------------------------------------------
+# Created By  : Tomography Team at DLS <scientificsoftware@diamond.ac.uk>
+# Created Date: 01 November 2022
+# ---------------------------------------------------------------------------
+"""Modules for phase retrieval and phase-contrast enhancement"""
+
+cupy_run = False
+try:
+    import cupy as xp
+
+    try:
+        xp.cuda.Device(0).compute_capability
+        cupy_run = True
+
+    except xp.cuda.runtime.CUDARuntimeError:
+        print("CuPy library is a major dependency for HTTomolibgpu, please install")
+        import numpy as np
+except ImportError:
+    import numpy as np
+
+from numpy import float32
+import numpy as np
+import nvtx
+
+if cupy_run:
+    from httomolibgpu.cuda_kernels import load_cuda_module
+
+__all__ = [
+    "paganin_filter_savu",
+    "paganin_filter_tomopy",
+]
+
+# Define constants used in phase retrieval method
+BOLTZMANN_CONSTANT = 1.3806488e-16  # [erg/k]
+SPEED_OF_LIGHT = 299792458e2  # [cm/s]
+PI = 3.14159265359
+PLANCK_CONSTANT = 6.58211928e-19  # [keV*s]
+
+
+## %%%%%%%%%%%%%%%%%%%%%%% paganin_filter %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%  ##
+#: CuPy implementation of Paganin filter from Savu
+@nvtx.annotate()
+def paganin_filter_savu(
+    data: xp.ndarray,
+    ratio: float = 250.0,
+    energy: float = 53.0,
+    distance: float = 1.0,
+    resolution: float = 1.28,
+    pad_y: int = 100,
+    pad_x: int = 100,
+    pad_method: str = "edge",
+    increment: float = 0.0,
+) -> xp.ndarray:
+    """
+    Apply Paganin filter (for denoising or contrast enhancement) to
+    projections.
+
+    Parameters
+    ----------
+    data : cp.ndarray
+        The stack of projections to filter.
+
+    ratio : float, optional
+        Ratio of delta/beta.
+
+    energy : float, optional
+        Beam energy in keV.
+
+    distance : float, optional
+        Distance from sample to detector in metres.
+
+    resolution : float, optional
+        Pixel size in microns.
+
+    pad_y : int, optional
+        Pad the top and bottom of projections.
+
+    pad_x : int, optional
+        Pad the left and right of projections.
+
+    pad_method : str, optional
+        Numpy pad method to use.
+
+    increment : float, optional
+        Increment all values by this amount before taking the log.
+
+    Returns
+    -------
+    cp.ndarray
+        The stack of filtered projections.
+    """
+    import cupyx
+
+    # Check the input data is valid
+    if data.ndim != 3:
+        raise ValueError(
+            f"Invalid number of dimensions in data: {data.ndim},"
+            " please provide a stack of 2D projections."
+        )
+
+    # Setup various values for the filter
+    _, height, width = data.shape
+    micron = 1e-6
+    keV = 1000.0
+    energy *= keV
+    resolution *= micron
+    wavelength = (1240.0 / energy) * 1e-9
+
+    height1 = height + 2 * pad_y
+    width1 = width + 2 * pad_x
+
+    # Define the paganin filter, taking into account the padding that will be
+    # applied to the projections (if any)
+
+    # Using raw kernel her as indexing is direct and it avoids a lot of temporaries
+    # and tiny kernels
+    module = load_cuda_module("paganin_filter_gen")
+    kernel = module.get_function("paganin_filter_gen")
+
+    # Apply padding to all the 2D projections
+    # Note: this takes considerable time on GPU...
+    data = xp.pad(data, ((0, 0), (pad_y, pad_y), (pad_x, pad_x)), mode=pad_method)
+
+    # Define array to hold result, which will not have the padding applied to it
+    precond_kernel_float = xp.ElementwiseKernel(
+        "T data",
+        "T out",
+        """
+        if (isnan(data)) {
+            out = T(0); 
+        } else if (isinf(data)) {
+            out = data < 0.0 ? -3.402823e38f : 3.402823e38f;  // FLT_MAX, not available in cupy
+        } else if (data == 0.0) {
+            out = 1.0;
+        } else {
+            out = data;
+        }
+        """,
+        name="paganin_precond_float",
+        no_return=True,
+    )
+    precond_kernel_int = xp.ElementwiseKernel(
+        "T data",
+        "T out",
+        """out = data == 0 ? 1 : data""",
+        name="paganin_precond_int",
+        no_return=True,
+    )
+
+    if data.dtype in (xp.float32, xp.float64):
+        precond_kernel_float(data, data)
+    else:
+        precond_kernel_int(data, data)
+
+    # avoid normalising in both directions - we include multiplier in the post_kernel
+    data = xp.asarray(data, dtype=xp.complex64)
+    data = cupyx.scipy.fft.fft2(data, axes=(-2, -1), overwrite_x=True, norm="backward")
+
+    # prepare filter here, while the GPU is busy with the FFT
+    filtercomplex = xp.empty((height1, width1), dtype=np.complex64)
+    bx = 16
+    by = 8
+    gx = (width1 + bx - 1) // bx
+    gy = (height1 + by - 1) // by
+    kernel(
+        grid=(gx, gy, 1),
+        block=(bx, by, 1),
+        args=(
+            xp.int32(width1),
+            xp.int32(height1),
+            xp.float32(resolution),
+            xp.float32(wavelength),
+            xp.float32(distance),
+            xp.float32(ratio),
+            filtercomplex,
+        ),
+    )
+    data *= filtercomplex
+
+    data = cupyx.scipy.fft.ifft2(data, axes=(-2, -1), overwrite_x=True, norm="forward")
+
+    post_kernel = xp.ElementwiseKernel(
+        "C pci1, raw float32 increment, raw float32 ratio, raw float32 fft_scale",
+        "T out",
+        "out = -0.5 * ratio * log(abs(pci1) * fft_scale + increment)",
+        name="paganin_post_proc",
+        no_return=True,
+    )
+    fft_scale = 1.0 / (data.shape[1] * data.shape[2])
+    res = xp.empty((data.shape[0], height, width), dtype=np.float32)
+    post_kernel(
+        data[:, pad_y : pad_y + height, pad_x : pad_x + width],
+        np.float32(increment),
+        np.float32(ratio),
+        np.float32(fft_scale),
+        res,
+    )
+
+    return res
+
+
+def _wavelength(energy: float) -> float:
+    return 2 * PI * PLANCK_CONSTANT * SPEED_OF_LIGHT / energy
+
+
+def _paganin_filter_factor(
+    energy: float, dist: float, alpha: float, w2: xp.ndarray
+) -> xp.ndarray:
+    return 1 / (_wavelength(energy) * dist * w2 / (4 * PI) + alpha)
+
+
+def _calc_pad_width(dim: int, pixel_size: float, wavelength: float, dist: float) -> int:
+    pad_pix = xp.ceil(PI * wavelength * dist / pixel_size**2)
+    return int((pow(2, xp.ceil(xp.log2(dim + pad_pix))) - dim) * 0.5)
+
+
+def _calc_pad_val(tomo: xp.ndarray) -> float:
+    return xp.mean((tomo[..., 0] + tomo[..., -1]) * 0.5)
+
+
+def _reciprocal_grid(pixel_size: float, shape_proj: tuple) -> xp.ndarray:
+    """
+    Calculate reciprocal grid.
+
+    Parameters
+    ----------
+    pixel_size : float
+        Detector pixel size in cm.
+    shape_proj : tuple
+        Shape of the reciprocal grid along x and y axes.
+
+    Returns
+    -------
+    ndarray
+        Grid coordinates.
+    """
+    # Sampling in reciprocal space.
+    indx = _reciprocal_coord(pixel_size, shape_proj[0])
+    indy = _reciprocal_coord(pixel_size, shape_proj[1])
+    indx_sq = xp.square(indx)
+    indy_sq = xp.square(indy)
+
+    return xp.add.outer(indx_sq, indy_sq)
+
+
+def _reciprocal_coord(pixel_size: float, num_grid: int) -> xp.ndarray:
+    """
+    Calculate reciprocal grid coordinates for a given pixel size
+    and discretization.
+
+    Parameters
+    ----------
+    pixel_size : float
+        Detector pixel size in cm.
+    num_grid : int
+        Size of the reciprocal grid.
+
+    Returns
+    -------
+    ndarray
+        Grid coordinates.
+    """
+    n = num_grid - 1
+    rc = xp.arange(-n, num_grid, 2, dtype=xp.float32)
+    rc *= 2 * PI / (n * pixel_size)
+    return rc
+
+
+##-------------------------------------------------------------##
+##-------------------------------------------------------------##
+
+
+# Adaptation with some corrections of retrieve_phase (Paganin filter)
+# from TomoPy
+@nvtx.annotate()
+def paganin_filter_tomopy(
+    tomo: xp.ndarray,
+    pixel_size: float = 1e-4,
+    dist: float = 50.0,
+    energy: float = 53.0,
+    alpha: float = 1e-3,
+) -> xp.ndarray:
+    """
+    Perform single-material phase retrieval from flats/darks corrected tomographic measurements
+    :cite:`Paganin:02`.
+
+    Parameters
+    ----------
+    tomo : cp.ndarray
+        3D array of f/d corrected tomographic projections.
+    pixel_size : float, optional
+        Detector pixel size in cm.
+    dist : float, optional
+        Propagation distance of the wavefront in cm.
+    energy : float, optional
+        Energy of incident wave in keV.
+    alpha : float, optional
+        Regularization parameter, the ratio of delta/beta. Larger values lead to more smoothing.
+
+    Returns
+    -------
+    cp.ndarray
+        The 3D array of Paganin phase-filtered projection images.
+    """
+    import cupyx
+
+    # Check the input data is valid
+    if tomo.ndim != 3:
+        raise ValueError(
+            f"Invalid number of dimensions in data: {tomo.ndim},"
+            " please provide a stack of 2D projections."
+        )
+
+    dz_orig, dy_orig, dx_orig = xp.shape(tomo)
+
+    # Perform padding to the power of 2 as FFT is O(n*log(n)) complexity
+    # TODO: adding other options of padding?
+    padded_tomo, pad_tup = _pad_projections_to_second_power(tomo)
+
+    dz, dy, dx = xp.shape(padded_tomo)
+
+    # 3D FFT of tomo data
+    padded_tomo = xp.asarray(padded_tomo, dtype=xp.complex64)
+    fft_tomo = cupyx.scipy.fft.fft2(padded_tomo, axes=(-2, -1), overwrite_x=True)
+
+    # Compute the reciprocal grid.
+    w2 = _reciprocal_grid(pixel_size, (dy, dx))
+
+    # Build filter in the Fourier space.
+    phase_filter = cupyx.scipy.fft.fftshift(
+        _paganin_filter_factor2(energy, dist, alpha, w2)
+    )
+    phase_filter = phase_filter / phase_filter.max()  # normalisation
+
+    # Apply filter and take inverse FFT
+    ifft_filtered_tomo = (
+        cupyx.scipy.fft.ifft2(phase_filter * fft_tomo, axes=(-2, -1), overwrite_x=True)
+    ).real
+
+    # slicing indices for cropping
+    slc_indices = (
+        slice(pad_tup[0][0], pad_tup[0][0] + dz_orig, 1),
+        slice(pad_tup[1][0], pad_tup[1][0] + dy_orig, 1),
+        slice(pad_tup[2][0], pad_tup[2][0] + dx_orig, 1),
+    )
+
+    # crop the padded filtered data:
+    tomo = ifft_filtered_tomo[slc_indices].astype(xp.float32)
+
+    # taking the negative log
+    _log_kernel = xp.ElementwiseKernel(
+        "C tomo",
+        "C out",
+        "out = -log(tomo)",
+        name="log_kernel",
+    )
+
+    return _log_kernel(tomo)
+
+
+def _shift_bit_length(x: int) -> int:
+    return 1 << (x - 1).bit_length()
+
+
+def _pad_projections_to_second_power(tomo: xp.ndarray) -> tuple[xp.ndarray, tuple]:
+    """
+    Performs padding of each projection to the next power of 2.
+    If the shape is not even we also care of that before padding.
+
+    Parameters
+    ----------
+    tomo : cp.ndarray
+        3d projection data
+
+    Returns
+    -------
+    ndarray: padded 3d projection data
+    tuple: a tuple with padding dimensions
+    """
+    full_shape_tomo = xp.shape(tomo)
+
+    pad_tup = []
+    for index, element in enumerate(full_shape_tomo):
+        if index == 0:
+            pad_width = (0, 0)  # do not pad the slicing dim
+        else:
+            diff = _shift_bit_length(element + 1) - element
+            if element % 2 == 0:
+                pad_width = diff // 2
+                pad_width = (pad_width, pad_width)
+            else:
+                # need an uneven padding for odd-number lengths
+                left_pad = diff // 2
+                right_pad = diff - left_pad
+                pad_width = (left_pad, right_pad)
+
+        pad_tup.append(pad_width)
+
+    padded_tomo = xp.pad(tomo, tuple(pad_tup), "edge")
+
+    return padded_tomo, pad_tup
+
+
+def _paganin_filter_factor2(energy, dist, alpha, w2):
+    # Alpha represents the ratio of delta/beta.
+    return 1 / (_wavelength(energy) * dist * w2 / (4 * PI) + alpha)
diff --git a/httomolibgpu/prep/stripe.py b/httomolibgpu/prep/stripe.py
new file mode 100644
index 00000000..0c37f608
--- /dev/null
+++ b/httomolibgpu/prep/stripe.py
@@ -0,0 +1,394 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+# ---------------------------------------------------------------------------
+# Copyright 2022 Diamond Light Source Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ---------------------------------------------------------------------------
+# Created By  : Tomography Team at DLS <scientificsoftware@diamond.ac.uk>
+# Created Date: 01 November 2022
+# ---------------------------------------------------------------------------
+"""Modules for stripes removal"""
+
+import numpy as xp
+import numpy as np
+
+try:
+    import cupy as xp
+    from cupyx.scipy.ndimage import median_filter
+    from cupyx.scipy.ndimage import binary_dilation
+    from cupyx.scipy.ndimage import uniform_filter1d
+
+    try:
+        xp.cuda.Device(0).compute_capability
+    except xp.cuda.runtime.CUDARuntimeError:
+        print("CuPy library is a major dependency for HTTomolibgpu, please install")
+        import numpy as np
+except ImportError:
+    import numpy as np
+
+import nvtx
+from typing import Union
+
+__all__ = [
+    "remove_stripe_based_sorting",
+    "remove_stripe_ti",
+    "remove_all_stripe",
+]
+
+
+@nvtx.annotate()
+def remove_stripe_based_sorting(
+    data: Union[xp.ndarray, np.ndarray],
+    size: int = 11,
+    dim: int = 1,
+) -> Union[xp.ndarray, np.ndarray]:
+    """
+    Remove full and partial stripe artifacts from sinogram using Nghia Vo's
+    approach, algorithm 3 in Ref. [1]. Angular direction is along the axis 0.
+    This algorithm works particularly well for removing partial stripes.
+
+    Steps of the algorithm:
+    1. Sort each column of the sinogram by its grayscale values.
+    2. Apply a smoothing (median) filter on the sorted image along each row.
+    3. Re-sort the smoothed image columns to the original rows to
+       get the corrected sinogram.
+
+    Parameters
+    ----------
+    data : ndarray
+        3D tomographic data as a CuPy or NumPy array.
+    size : int, optional
+        Window size of the median filter.
+    dim : {1, 2}, optional
+        Dimension of the window.
+
+    Returns
+    -------
+    ndarray
+        Corrected 3D tomographic data as a CuPy or NumPy array.
+
+    References
+    ----------
+    .. [1] https://doi.org/10.1364/OE.26.028396
+    """
+
+    if size is None:
+        if data.shape[2] > 2000:
+            size = 21
+        else:
+            size = max(5, int(0.01 * data.shape[2]))
+
+    for m in range(data.shape[1]):
+        data[:, m, :] = _rs_sort(data[:, m, :], size, dim)
+
+    return data
+
+
+@nvtx.annotate()
+def _rs_sort(sinogram, size, dim):
+    """
+    Remove stripes using the sorting technique.
+    """
+    sinogram = xp.transpose(sinogram)
+
+    #: Sort each column of the sinogram by its grayscale values
+    #: Keep track of the sorting indices so we can reverse it below
+    sortvals = xp.argsort(sinogram, axis=1)
+    sortvals_reverse = xp.argsort(sortvals, axis=1)
+    sino_sort = xp.take_along_axis(sinogram, sortvals, axis=1)
+
+    #: Now apply the median filter on the sorted image along each row
+    if xp.__name__ == "cupy":
+        from cupyx.scipy.ndimage import median_filter
+    else:
+        from scipy.ndimage import median_filter
+
+    sino_sort = median_filter(sino_sort, (size, 1) if dim == 1 else (size, size))
+
+    #: step 3: re-sort the smoothed image columns to the original rows
+    sino_corrected = xp.take_along_axis(sino_sort, sortvals_reverse, axis=1)
+
+    return xp.transpose(sino_corrected)
+
+
+@nvtx.annotate()
+def remove_stripe_ti(
+    data: Union[xp.ndarray, np.ndarray],
+    beta: float = 0.1,
+) -> Union[xp.ndarray, np.ndarray]:
+    """
+    Removes stripes with the method of V. Titarenko (TomoCuPy implementation)
+
+    Parameters
+    ----------
+    data : ndarray
+        3D stack of projections as a CuPy array.
+    beta : float, optional
+        filter parameter, lower values increase the filter strength.
+        Default is 0.1.
+
+    Returns
+    -------
+    ndarray
+        3D array of de-striped projections.
+    """
+    # TODO: detector dimensions must be even otherwise error
+    gamma = beta * ((1 - beta) / (1 + beta)) ** xp.abs(
+        xp.fft.fftfreq(data.shape[-1]) * data.shape[-1]
+    )
+    gamma[0] -= 1
+    v = xp.mean(data, axis=0)
+    v = v - v[:, 0:1]
+    v = xp.fft.irfft(xp.fft.rfft(v) * xp.fft.rfft(gamma)).astype(data.dtype)
+    data[:] += v
+    return data
+
+
+######## Optimized version for Vo-all ring removal in tomopy########
+# This function is taken from TomoCuPy package
+# *************************************************************************** #
+#                  Copyright © 2022, UChicago Argonne, LLC                    #
+#                           All Rights Reserved                               #
+#                         Software Name: Tomocupy                             #
+#                     By: Argonne National Laboratory                         #
+#                                                                             #
+#                           OPEN SOURCE LICENSE                               #
+#                                                                             #
+# Redistribution and use in source and binary forms, with or without          #
+# modification, are permitted provided that the following conditions are met: #
+#                                                                             #
+# 1. Redistributions of source code must retain the above copyright notice,   #
+#    this list of conditions and the following disclaimer.                    #
+# 2. Redistributions in binary form must reproduce the above copyright        #
+#    notice, this list of conditions and the following disclaimer in the      #
+#    documentation and/or other materials provided with the distribution.     #
+# 3. Neither the name of the copyright holder nor the names of its            #
+#    contributors may be used to endorse or promote products derived          #
+#    from this software without specific prior written permission.            #
+#                                                                             #
+#                                                                             #
+# *************************************************************************** #
+@nvtx.annotate()
+def remove_all_stripe(
+    data: xp.ndarray,
+    snr: float = 3.0,
+    la_size: int = 61,
+    sm_size: int = 21,
+    dim: int = 1,
+) -> xp.ndarray:
+    """
+    Remove all types of stripe artifacts from sinogram using Nghia Vo's
+    approach :cite:`Vo:18` (combination of algorithm 3,4,5, and 6).
+
+    Parameters
+    ----------
+    data : ndarray
+        3D tomographic data as a CuPy array.
+    snr  : float, optional
+        Ratio used to locate large stripes.
+        Greater is less sensitive.
+    la_size : int, optional
+        Window size of the median filter to remove large stripes.
+    sm_size : int, optional
+        Window size of the median filter to remove small-to-medium stripes.
+    dim : {1, 2}, optional
+        Dimension of the window.
+
+    Returns
+    -------
+    ndarray
+        Corrected 3D tomographic data as a CuPy or NumPy array.
+
+    References
+    ----------
+    .. [1] https://doi.org/10.1364/OE.26.028396
+
+    """
+    matindex = _create_matindex(data.shape[2], data.shape[0])
+    for m in range(data.shape[1]):
+        sino = data[:, m, :]
+        sino = _rs_dead(sino, snr, la_size, matindex)
+        sino = _rs_sort2(sino, sm_size, matindex, dim)
+        data[:, m, :] = sino
+    return data
+
+
+@nvtx.annotate()
+def _rs_sort2(sinogram, size, matindex, dim):
+    """
+    Remove stripes using the sorting technique.
+    """
+    sinogram = xp.transpose(sinogram)
+    matcomb = xp.asarray(xp.dstack((matindex, sinogram)))
+
+    # matsort = xp.asarray([row[row[:, 1].argsort()] for row in matcomb])
+    ids = xp.argsort(matcomb[:, :, 1], axis=1)
+    matsort = matcomb.copy()
+    matsort[:, :, 0] = xp.take_along_axis(matsort[:, :, 0], ids, axis=1)
+    matsort[:, :, 1] = xp.take_along_axis(matsort[:, :, 1], ids, axis=1)
+    if dim == 1:
+        matsort[:, :, 1] = median_filter(matsort[:, :, 1], (size, 1))
+    else:
+        matsort[:, :, 1] = median_filter(matsort[:, :, 1], (size, size))
+
+    # matsortback = xp.asarray([row[row[:, 0].argsort()] for row in matsort])
+
+    ids = xp.argsort(matsort[:, :, 0], axis=1)
+    matsortback = matsort.copy()
+    matsortback[:, :, 0] = xp.take_along_axis(matsortback[:, :, 0], ids, axis=1)
+    matsortback[:, :, 1] = xp.take_along_axis(matsortback[:, :, 1], ids, axis=1)
+
+    sino_corrected = matsortback[:, :, 1]
+    return xp.transpose(sino_corrected)
+
+
+@nvtx.annotate()
+def _mpolyfit(x, y):
+    n = len(x)
+    x_mean = xp.mean(x)
+    y_mean = xp.mean(y)
+
+    Sxy = xp.sum(x * y) - n * x_mean * y_mean
+    Sxx = xp.sum(x * x) - n * x_mean * x_mean
+
+    slope = Sxy / Sxx
+    intercept = y_mean - slope * x_mean
+    return slope, intercept
+
+
+@nvtx.annotate()
+def _detect_stripe(listdata, snr):
+    """
+    Algorithm 4 in :cite:`Vo:18`. Used to locate stripes.
+    """
+    numdata = len(listdata)
+    listsorted = xp.sort(listdata)[::-1]
+    xlist = xp.arange(0, numdata, 1.0)
+    ndrop = xp.int16(0.25 * numdata)
+    # (_slope, _intercept) = xp.polyfit(xlist[ndrop:-ndrop - 1],
+    #   listsorted[ndrop:-ndrop - 1], 1)
+    (_slope, _intercept) = _mpolyfit(
+        xlist[ndrop : -ndrop - 1], listsorted[ndrop : -ndrop - 1]
+    )
+
+    numt1 = _intercept + _slope * xlist[-1]
+    noiselevel = xp.abs(numt1 - _intercept)
+    noiselevel = xp.clip(noiselevel, 1e-6, None)
+    val1 = xp.abs(listsorted[0] - _intercept) / noiselevel
+    val2 = xp.abs(listsorted[-1] - numt1) / noiselevel
+    listmask = xp.zeros_like(listdata)
+    if val1 >= snr:
+        upper_thresh = _intercept + noiselevel * snr * 0.5
+        listmask[listdata > upper_thresh] = 1.0
+    if val2 >= snr:
+        lower_thresh = numt1 - noiselevel * snr * 0.5
+        listmask[listdata <= lower_thresh] = 1.0
+    return listmask
+
+
+@nvtx.annotate()
+def _rs_large(sinogram, snr, size, matindex, drop_ratio=0.1, norm=True):
+    """
+    Remove large stripes.
+    """
+    drop_ratio = max(min(drop_ratio, 0.8), 0)  # = xp.clip(drop_ratio, 0.0, 0.8)
+    (nrow, ncol) = sinogram.shape
+    ndrop = int(0.5 * drop_ratio * nrow)
+    sinosort = xp.sort(sinogram, axis=0)
+    sinosmooth = median_filter(sinosort, (1, size))
+    list1 = xp.mean(sinosort[ndrop : nrow - ndrop], axis=0)
+    list2 = xp.mean(sinosmooth[ndrop : nrow - ndrop], axis=0)
+    # listfact = xp.divide(list1,
+    #                      list2,
+    #                      out=xp.ones_like(list1),
+    #                      where=list2 != 0)
+
+    listfact = list1 / list2
+
+    # Locate stripes
+    listmask = _detect_stripe(listfact, snr)
+    listmask = binary_dilation(listmask, iterations=1).astype(listmask.dtype)
+    matfact = xp.tile(listfact, (nrow, 1))
+    # Normalize
+    if norm is True:
+        sinogram = sinogram / matfact
+    sinogram1 = xp.transpose(sinogram)
+    matcombine = xp.asarray(xp.dstack((matindex, sinogram1)))
+
+    # matsort = xp.asarray([row[row[:, 1].argsort()] for row in matcombine])
+    ids = xp.argsort(matcombine[:, :, 1], axis=1)
+    matsort = matcombine.copy()
+    matsort[:, :, 0] = xp.take_along_axis(matsort[:, :, 0], ids, axis=1)
+    matsort[:, :, 1] = xp.take_along_axis(matsort[:, :, 1], ids, axis=1)
+
+    matsort[:, :, 1] = xp.transpose(sinosmooth)
+    # matsortback = xp.asarray([row[row[:, 0].argsort()] for row in matsort])
+    ids = xp.argsort(matsort[:, :, 0], axis=1)
+    matsortback = matsort.copy()
+    matsortback[:, :, 0] = xp.take_along_axis(matsortback[:, :, 0], ids, axis=1)
+    matsortback[:, :, 1] = xp.take_along_axis(matsortback[:, :, 1], ids, axis=1)
+
+    sino_corrected = xp.transpose(matsortback[:, :, 1])
+    listxmiss = xp.where(listmask > 0.0)[0]
+    sinogram[:, listxmiss] = sino_corrected[:, listxmiss]
+    return sinogram
+
+
+@nvtx.annotate()
+def _rs_dead(sinogram, snr, size, matindex, norm=True):
+    """
+    Remove unresponsive and fluctuating stripes.
+    """
+    sinogram = xp.copy(sinogram)  # Make it mutable
+    (nrow, _) = sinogram.shape
+    # sinosmooth = xp.apply_along_axis(uniform_filter1d, 0, sinogram, 10)
+    sinosmooth = uniform_filter1d(sinogram, 10, axis=0)
+
+    listdiff = xp.sum(xp.abs(sinogram - sinosmooth), axis=0)
+    listdiffbck = median_filter(listdiff, size)
+
+    listfact = listdiff / listdiffbck
+
+    listmask = _detect_stripe(listfact, snr)
+    listmask = binary_dilation(listmask, iterations=1).astype(listmask.dtype)
+    listmask[0:2] = 0.0
+    listmask[-2:] = 0.0
+    listx = xp.where(listmask < 1.0)[0]
+    listy = xp.arange(nrow)
+    matz = sinogram[:, listx]
+
+    listxmiss = xp.where(listmask > 0.0)[0]
+
+    # finter = interpolate.interp2d(listx.get(), listy.get(), matz.get(), kind='linear')
+    if len(listxmiss) > 0:
+        # sinogram_c[:, listxmiss.get()] = finter(listxmiss.get(), listy.get())
+        ids = xp.searchsorted(listx, listxmiss)
+        sinogram[:, listxmiss] = matz[:, ids - 1] + (listxmiss - listx[ids - 1]) * (
+            matz[:, ids] - matz[:, ids - 1]
+        ) / (listx[ids] - listx[ids - 1])
+
+    # Remove residual stripes
+    if norm is True:
+        sinogram = _rs_large(sinogram, snr, size, matindex)
+    return sinogram
+
+
+@nvtx.annotate()
+def _create_matindex(nrow, ncol):
+    """
+    Create a 2D array of indexes used for the sorting technique.
+    """
+    listindex = xp.arange(0.0, ncol, 1.0)
+    matindex = xp.tile(listindex, (nrow, 1))
+    return matindex.astype(np.float32)
diff --git a/tests/test_prep/test_stripe.py b/tests/test_prep/test_stripe.py
index 230687d7..2650db55 100644
--- a/tests/test_prep/test_stripe.py
+++ b/tests/test_prep/test_stripe.py
@@ -31,25 +31,25 @@ def test_remove_stripe_ti_on_data(data, flats, darks):
     assert data_after_stripe_removal.dtype == np.float32
 
 
-def test_remove_stripe_ti_on_flats(host_flats):
-    #: testing that numpy uint16 arrays can be passed
-    corrected_data = remove_stripe_ti(np.copy(host_flats))
-    assert_allclose(np.mean(corrected_data), 976.558447, rtol=1e-7)
-    assert_allclose(np.mean(corrected_data, axis=(1, 2)).sum(), 19531.168945, rtol=1e-7)
-    assert_allclose(np.median(corrected_data), 976.0, rtol=1e-7)
-
-
-def test_remove_stripe_ti_numpy_vs_cupy_on_random_data():
-    host_data = np.random.random_sample(size=(181, 5, 256)).astype(np.float32) * 2.0
-    corrected_host_data = remove_stripe_ti(np.copy(host_data))
-    corrected_data = remove_stripe_ti(
-        cp.copy(cp.asarray(host_data, dtype=cp.float32))
-    ).get()
-
-    assert_allclose(np.sum(corrected_data), np.sum(corrected_host_data), rtol=1e-6)
-    assert_allclose(
-        np.median(corrected_data), np.median(corrected_host_data), rtol=1e-6
-    )
+# def test_remove_stripe_ti_on_flats(host_flats):
+#     #: testing that numpy uint16 arrays can be passed
+#     corrected_data = remove_stripe_ti(np.copy(host_flats))
+#     assert_allclose(np.mean(corrected_data), 976.558447, rtol=1e-7)
+#     assert_allclose(np.mean(corrected_data, axis=(1, 2)).sum(), 19531.168945, rtol=1e-7)
+#     assert_allclose(np.median(corrected_data), 976.0, rtol=1e-7)
+
+
+# def test_remove_stripe_ti_numpy_vs_cupy_on_random_data():
+#     host_data = np.random.random_sample(size=(181, 5, 256)).astype(np.float32) * 2.0
+#     corrected_host_data = remove_stripe_ti(np.copy(host_data))
+#     corrected_data = remove_stripe_ti(
+#         cp.copy(cp.asarray(host_data, dtype=cp.float32))
+#     ).get()
+
+#     assert_allclose(np.sum(corrected_data), np.sum(corrected_host_data), rtol=1e-6)
+#     assert_allclose(
+#         np.median(corrected_data), np.median(corrected_host_data), rtol=1e-6
+#     )
 
 
 def test_stripe_removal_sorting_cupy(data, flats, darks):
@@ -67,14 +67,6 @@ def test_stripe_removal_sorting_cupy(data, flats, darks):
     assert corrected_data.flags.c_contiguous
 
 
-@cp.testing.numpy_cupy_allclose(rtol=1e-6)
-def test_stripe_removal_sorting_numpy_vs_cupy_on_random_data(ensure_clean_memory, xp):
-    np.random.seed(12345)
-    data = np.random.random_sample(size=(181, 5, 256)).astype(np.float32) * 2.0 + 0.001
-    data = xp.asarray(data)
-    return xp.asarray(remove_stripe_based_sorting(data))
-
-
 @pytest.mark.perf
 def test_stripe_removal_sorting_cupy_performance(ensure_clean_memory):
     data_host = (

From adbfa5e433d6227bf38fb6abbc3fa070e17b3eb3 Mon Sep 17 00:00:00 2001
From: dkazanc <dkazanc@hotmail.com>
Date: Tue, 30 Apr 2024 16:46:48 +0100
Subject: [PATCH 10/36] adds to build

---
 conda/recipe/meta.yaml | 1 +
 pyproject.toml         | 1 +
 2 files changed, 2 insertions(+)

diff --git a/conda/recipe/meta.yaml b/conda/recipe/meta.yaml
index cbbb2591..cb35e294 100644
--- a/conda/recipe/meta.yaml
+++ b/conda/recipe/meta.yaml
@@ -41,6 +41,7 @@ test:
   imports:
     - httomolibgpu
     - httomolibgpu.misc
+    - httomolibgpu.prep
   source_files:
     - tests/*
   commands:
diff --git a/pyproject.toml b/pyproject.toml
index ad679a42..317818fd 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -6,6 +6,7 @@ build-backend = "setuptools.build_meta"
 include-package-data = true
 packages = ["httomolibgpu",
             "httomolibgpu.misc",
+            "httomolibgpu.prep",            
             "httomolibgpu.cuda_kernels"]
 
 [tool.setuptools.package-data]

From e61ebe56bb830d7e7ac2109a40ecf5f0864c27da Mon Sep 17 00:00:00 2001
From: dkazanc <dkazanc@hotmail.com>
Date: Tue, 30 Apr 2024 22:15:14 +0100
Subject: [PATCH 11/36] bringing recon back

---
 conda/recipe/meta.yaml          |   1 +
 httomolibgpu/__init__.py        |  12 +-
 httomolibgpu/recon/__init__.py  |   0
 httomolibgpu/recon/algorithm.py | 292 ++++++++++++
 httomolibgpu/recon/rotation.py  | 757 ++++++++++++++++++++++++++++++++
 pyproject.toml                  |   3 +-
 6 files changed, 1058 insertions(+), 7 deletions(-)
 create mode 100644 httomolibgpu/recon/__init__.py
 create mode 100644 httomolibgpu/recon/algorithm.py
 create mode 100644 httomolibgpu/recon/rotation.py

diff --git a/conda/recipe/meta.yaml b/conda/recipe/meta.yaml
index cb35e294..98625e81 100644
--- a/conda/recipe/meta.yaml
+++ b/conda/recipe/meta.yaml
@@ -42,6 +42,7 @@ test:
     - httomolibgpu
     - httomolibgpu.misc
     - httomolibgpu.prep
+    - httomolibgpu.recon
   source_files:
     - tests/*
   commands:
diff --git a/httomolibgpu/__init__.py b/httomolibgpu/__init__.py
index 994c904f..016c8d77 100644
--- a/httomolibgpu/__init__.py
+++ b/httomolibgpu/__init__.py
@@ -1,9 +1,9 @@
 from httomolibgpu.misc.corr import *
 from httomolibgpu.misc.morph import *
 from httomolibgpu.misc.rescale import *
-#from httomolibgpu.prep.alignment import *
-#from httomolibgpu.prep.normalize import *
-#from httomolibgpu.prep.phase import *
-#from httomolibgpu.prep.stripe import *
-#from httomolibgpu.recon.algorithm import *
-#from httomolibgpu.recon.rotation import *
+from httomolibgpu.prep.alignment import *
+from httomolibgpu.prep.normalize import *
+from httomolibgpu.prep.phase import *
+from httomolibgpu.prep.stripe import *
+from httomolibgpu.recon.algorithm import *
+from httomolibgpu.recon.rotation import *
diff --git a/httomolibgpu/recon/__init__.py b/httomolibgpu/recon/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/httomolibgpu/recon/algorithm.py b/httomolibgpu/recon/algorithm.py
new file mode 100644
index 00000000..4daedc91
--- /dev/null
+++ b/httomolibgpu/recon/algorithm.py
@@ -0,0 +1,292 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+# ---------------------------------------------------------------------------
+# Copyright 2022 Diamond Light Source Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ---------------------------------------------------------------------------
+# Created By  : Tomography Team at DLS <scientificsoftware@diamond.ac.uk>
+# Changes relative to ToMoBAR 2024.01 version
+# ---------------------------------------------------------------------------
+"""Module for tomographic reconstruction"""
+
+import numpy as xp
+import numpy as np
+
+cupy_run = False
+try:
+    import cupy as xp
+
+    try:
+        xp.cuda.Device(0).compute_capability
+        cupy_run = True
+
+    except xp.cuda.runtime.CUDARuntimeError:
+        print("CuPy library is a major dependency for HTTomolibgpu, please install")
+        import numpy as np
+except ImportError:
+    import numpy as np
+
+import nvtx
+from numpy import float32, complex64
+from typing import Optional, Tuple, Union
+from typing import Type
+
+from tomobar.methodsDIR_CuPy import RecToolsDIRCuPy
+from tomobar.methodsIR_CuPy import RecToolsIRCuPy
+
+__all__ = [
+    "FBP",
+    "SIRT",
+    "CGLS",
+]
+
+input_data_axis_labels = ["angles", "detY", "detX"]  # set the labels of the input data
+
+
+## %%%%%%%%%%%%%%%%%%%%%%% FBP reconstruction %%%%%%%%%%%%%%%%%%%%%%%%%%%%  ##
+@nvtx.annotate()
+def FBP(
+    data: xp.ndarray,
+    angles: np.ndarray,
+    center: Optional[float] = None,
+    filter_freq_cutoff: Optional[float] = 0.6,
+    recon_size: Optional[int] = None,
+    recon_mask_radius: Optional[float] = None,
+    gpu_id: int = 0,
+) -> xp.ndarray:
+    """
+    Perform Filtered Backprojection (FBP) reconstruction using ASTRA toolbox and ToMoBAR wrappers.
+    This is a 3D recon from a CuPy array and a custom built filter.
+
+    Parameters
+    ----------
+    data : cp.ndarray
+        Projection data as a CuPy array.
+    angles : np.ndarray
+        An array of angles given in radians.
+    center : float, optional
+        The center of rotation (CoR).
+    filter_freq_cutoff : float, optional
+        Cutoff frequency parameter for the sinc filter, the lowest values produce more crispy but noisy reconstruction.
+    recon_size : int, optional
+        The [recon_size, recon_size] shape of the reconstructed slice in pixels.
+        By default (None), the reconstructed size will be the dimension of the horizontal detector.
+    recon_mask_radius: float, optional
+        The radius of the circular mask that applies to the reconstructed slice in order to crop
+        out some undesirable artefacts. The values outside the diameter will be set to zero.
+        None by default, to see the effect of the mask try setting the value in the range [0.7-1.0].
+    gpu_id : int, optional
+        A GPU device index to perform operation on.
+
+    Returns
+    -------
+    cp.ndarray
+        The FBP reconstructed volume as a CuPy array.
+    """
+    RecToolsCP = _instantiate_direct_recon_class(
+        data, angles, center, recon_size, gpu_id
+    )
+
+    reconstruction = RecToolsCP.FBP(
+        data,
+        cutoff_freq=filter_freq_cutoff,
+        recon_mask_radius=recon_mask_radius,
+        data_axes_labels_order=input_data_axis_labels,
+    )
+    xp._default_memory_pool.free_all_blocks()
+    return xp.require(xp.swapaxes(reconstruction, 0, 1), requirements="C")
+
+
+## %%%%%%%%%%%%%%%%%%%%%%% SIRT reconstruction %%%%%%%%%%%%%%%%%%%%%%%%%%%%  ##
+@nvtx.annotate()
+def SIRT(
+    data: xp.ndarray,
+    angles: np.ndarray,
+    center: Optional[float] = None,
+    recon_size: Optional[int] = None,
+    iterations: Optional[int] = 300,
+    nonnegativity: Optional[bool] = True,
+    gpu_id: int = 0,
+) -> xp.ndarray:
+    """
+    Perform Simultaneous Iterative Recostruction Technique (SIRT) using ASTRA toolbox and ToMoBAR wrappers.
+    This is 3D recon directly from a CuPy array while using ASTRA GPUlink capability.
+
+    Parameters
+    ----------
+    data : cp.ndarray
+        Projection data as a CuPy array.
+    angles : np.ndarray
+        An array of angles given in radians.
+    center : float, optional
+        The center of rotation (CoR).
+    recon_size : int, optional
+        The [recon_size, recon_size] shape of the reconstructed slice in pixels.
+        By default (None), the reconstructed size will be the dimension of the horizontal detector.
+    iterations : int, optional
+        The number of SIRT iterations.
+    nonnegativity : bool, optional
+        Impose nonnegativity constraint on reconstructed image.
+    gpu_id : int, optional
+        A GPU device index to perform operation on.
+
+    Returns
+    -------
+    cp.ndarray
+        The SIRT reconstructed volume as a CuPy array.
+    """
+
+    RecToolsCP = _instantiate_iterative_recon_class(
+        data, angles, center, recon_size, gpu_id, datafidelity="LS"
+    )
+
+    _data_ = {
+        "projection_norm_data": data,
+        "data_axes_labels_order": input_data_axis_labels,
+    }  # data dictionary
+    _algorithm_ = {
+        "iterations": iterations,
+        "nonnegativity": nonnegativity,
+    }
+    reconstruction = RecToolsCP.SIRT(_data_, _algorithm_)
+    xp._default_memory_pool.free_all_blocks()
+    return xp.require(xp.swapaxes(reconstruction, 0, 1), requirements="C")
+
+
+## %%%%%%%%%%%%%%%%%%%%%%% CGLS reconstruction %%%%%%%%%%%%%%%%%%%%%%%%%%%%  ##
+@nvtx.annotate()
+def CGLS(
+    data: xp.ndarray,
+    angles: np.ndarray,
+    center: Optional[float] = None,
+    recon_size: Optional[int] = None,
+    iterations: Optional[int] = 20,
+    nonnegativity: Optional[bool] = True,
+    gpu_id: int = 0,
+) -> xp.ndarray:
+    """
+    Perform Congugate Gradient Least Squares (CGLS) using ASTRA toolbox and ToMoBAR wrappers.
+    This is 3D recon directly from a CuPy array while using ASTRA GPUlink capability.
+
+    Parameters
+    ----------
+    data : cp.ndarray
+        Projection data as a CuPy array.
+    angles : np.ndarray
+        An array of angles given in radians.
+    center : float, optional
+        The center of rotation (CoR).
+    recon_size : int, optional
+        The [recon_size, recon_size] shape of the reconstructed slice in pixels.
+        By default (None), the reconstructed size will be the dimension of the horizontal detector.
+    iterations : int, optional
+        The number of CGLS iterations.
+    nonnegativity : bool, optional
+        Impose nonnegativity constraint on reconstructed image.
+    gpu_id : int, optional
+        A GPU device index to perform operation on.
+
+    Returns
+    -------
+    cp.ndarray
+        The CGLS reconstructed volume as a CuPy array.
+    """
+    RecToolsCP = _instantiate_iterative_recon_class(
+        data, angles, center, recon_size, gpu_id, datafidelity="LS"
+    )
+
+    _data_ = {
+        "projection_norm_data": data,
+        "data_axes_labels_order": input_data_axis_labels,
+    }  # data dictionary
+    _algorithm_ = {"iterations": iterations, "nonnegativity": nonnegativity}
+    reconstruction = RecToolsCP.CGLS(_data_, _algorithm_)
+    xp._default_memory_pool.free_all_blocks()
+    return xp.require(xp.swapaxes(reconstruction, 0, 1), requirements="C")
+
+
+## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%  ##
+def _instantiate_direct_recon_class(
+    data: xp.ndarray,
+    angles: np.ndarray,
+    center: Optional[float] = None,
+    recon_size: Optional[int] = None,
+    gpu_id: int = 0,
+) -> type[RecToolsDIRCuPy]:
+    """instantiate ToMoBAR's direct recon class
+
+    Args:
+        data (cp.ndarray): data array
+        angles (np.ndarray): angles
+        center (Optional[float], optional): center of recon. Defaults to None.
+        recon_size (Optional[int], optional): recon_size. Defaults to None.
+        gpu_id (int, optional): gpu ID. Defaults to 0.
+
+    Returns:
+        type[RecToolsDIRCuPy]: an instance of the direct recon class
+    """
+    if center is None:
+        center = data.shape[2] // 2  # making a crude guess
+    if recon_size is None:
+        recon_size = data.shape[2]
+    RecToolsCP = RecToolsDIRCuPy(
+        DetectorsDimH=data.shape[2],  # Horizontal detector dimension
+        DetectorsDimV=data.shape[1],  # Vertical detector dimension (3D case)
+        CenterRotOffset=data.shape[2] / 2
+        - center
+        - 0.5,  # Center of Rotation scalar or a vector
+        AnglesVec=-angles,  # A vector of projection angles in radians
+        ObjSize=recon_size,  # Reconstructed object dimensions (scalar)
+        device_projector=gpu_id,
+    )
+    return RecToolsCP
+
+
+def _instantiate_iterative_recon_class(
+    data: xp.ndarray,
+    angles: np.ndarray,
+    center: Optional[float] = None,
+    recon_size: Optional[int] = None,
+    gpu_id: int = 0,
+    datafidelity: str = "LS",
+) -> type[RecToolsIRCuPy]:
+    """instantiate ToMoBAR's iterative recon class
+
+    Args:
+        data (cp.ndarray): data array
+        angles (np.ndarray): angles
+        center (Optional[float], optional): center of recon. Defaults to None.
+        recon_size (Optional[int], optional): recon_size. Defaults to None.
+        datafidelity (str, optional): Data fidelity
+        gpu_id (int, optional): gpu ID. Defaults to 0.
+
+    Returns:
+        type[RecToolsIRCuPy]: an instance of the iterative class
+    """
+    if center is None:
+        center = data.shape[2] // 2  # making a crude guess
+    if recon_size is None:
+        recon_size = data.shape[2]
+    RecToolsCP = RecToolsIRCuPy(
+        DetectorsDimH=data.shape[2],  # Horizontal detector dimension
+        DetectorsDimV=data.shape[1],  # Vertical detector dimension (3D case)
+        CenterRotOffset=data.shape[2] / 2
+        - center
+        - 0.5,  # Center of Rotation scalar or a vector
+        AnglesVec=-angles,  # A vector of projection angles in radians
+        ObjSize=recon_size,  # Reconstructed object dimensions (scalar)
+        datafidelity=datafidelity,
+        device_projector=gpu_id,
+    )
+    return RecToolsCP
diff --git a/httomolibgpu/recon/rotation.py b/httomolibgpu/recon/rotation.py
new file mode 100644
index 00000000..edfea414
--- /dev/null
+++ b/httomolibgpu/recon/rotation.py
@@ -0,0 +1,757 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+# ---------------------------------------------------------------------------
+# Copyright 2022 Diamond Light Source Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ---------------------------------------------------------------------------
+# Created By  : Tomography Team at DLS <scientificsoftware@diamond.ac.uk>
+# Created Date: 01 November 2022
+# ---------------------------------------------------------------------------
+"""Modules for finding the axis of rotation"""
+
+import numpy as xp
+import numpy as np
+
+cupy_run = False
+try:
+    import cupy as xp
+    import cupyx
+
+    try:
+        xp.cuda.Device(0).compute_capability
+        cupy_run = True
+
+    except xp.cuda.runtime.CUDARuntimeError:
+        print("CuPy library is a major dependency for HTTomolibgpu, please install")
+        import numpy as np
+except ImportError:
+    import numpy as np
+
+import nvtx
+import math
+from typing import List, Literal, Optional, Tuple
+
+if cupy_run:
+    from httomolibgpu.cuda_kernels import load_cuda_module
+    from cupyx.scipy.ndimage import shift, gaussian_filter
+    from cucim.skimage.registration import phase_cross_correlation
+else:
+    from scipy.ndimage import shift, gaussian_filter
+    from skimage.registration import phase_cross_correlation
+
+__all__ = [
+    "find_center_vo",
+    "find_center_360",
+    "find_center_pc",
+]
+
+
+# %%%%%%%%%%%%%%%%%%%%%%%%%find_center_vo%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+@nvtx.annotate()
+def find_center_vo(
+    data: xp.ndarray,
+    ind: Optional[int] = None,
+    smin: int = -50,
+    smax: int = 50,
+    srad: float = 6.0,
+    step: float = 0.25,
+    ratio: float = 0.5,
+    drop: int = 20,
+) -> float:
+    """
+    Find rotation axis location using Nghia Vo's method. See the paper
+    https://opg.optica.org/oe/fulltext.cfm?uri=oe-22-16-19078&id=297315
+
+    Parameters
+    ----------
+    data : cp.ndarray
+        3D tomographic data or a 2D sinogram as a CuPy array.
+    ind : int, optional
+        Index of the slice to be used to estimate the CoR.
+    smin : int, optional
+        Coarse search radius. Reference to the horizontal center of
+        the sinogram.
+    smax : int, optional
+        Coarse search radius. Reference to the horizontal center of
+        the sinogram.
+    srad : float, optional
+        Fine search radius.
+    step : float, optional
+        Step of fine searching.
+    ratio : float, optional
+        The ratio between the FOV of the camera and the size of object.
+        It's used to generate the mask.
+    drop : int, optional
+        Drop lines around vertical center of the mask.
+
+    Returns
+    -------
+    float
+        Rotation axis location.
+    """
+
+    if data.ndim == 2:
+        data = xp.expand_dims(data, 1)
+        ind = 0
+
+    height = data.shape[1]
+
+    if ind is None:
+        ind = height // 2
+        if height > 10:
+            _sino = xp.mean(data[:, ind - 5 : ind + 5, :], axis=1)
+        else:
+            _sino = data[:, ind, :]
+    else:
+        _sino = data[:, ind, :]
+
+    with nvtx.annotate("gaussian_filter_1", color="green"):
+        _sino_cs = gaussian_filter(_sino, (3, 1), mode="reflect")
+    with nvtx.annotate("gaussian_filter_2", color="green"):
+        _sino_fs = gaussian_filter(_sino, (2, 2), mode="reflect")
+
+    if _sino.shape[0] * _sino.shape[1] > 4e6:
+        # data is large, so downsample it before performing search for
+        # centre of rotation
+        _sino_coarse = _downsample(_sino_cs, 2, 1)
+        init_cen = _search_coarse(_sino_coarse, smin / 4.0, smax / 4.0, ratio, drop)
+        fine_cen = _search_fine(_sino_fs, srad, step, init_cen * 4.0, ratio, drop)
+    else:
+        init_cen = _search_coarse(_sino_cs, smin, smax, ratio, drop)
+        fine_cen = _search_fine(_sino_fs, srad, step, init_cen, ratio, drop)
+
+    return xp.asnumpy(fine_cen)
+
+
+@nvtx.annotate()
+def _search_coarse(sino, smin, smax, ratio, drop):
+    (nrow, ncol) = sino.shape
+    flip_sino = xp.ascontiguousarray(xp.fliplr(sino))
+    comp_sino = xp.ascontiguousarray(xp.flipud(sino))
+    mask = _create_mask(2 * nrow, ncol, 0.5 * ratio * ncol, drop)
+
+    cen_fliplr = (ncol - 1.0) / 2.0
+    smin_clip_val = max(min(smin + cen_fliplr, ncol - 1), 0)
+    smin = smin_clip_val - cen_fliplr
+    smax_clip_val = max(min(smax + cen_fliplr, ncol - 1), 0)
+    smax = smax_clip_val - cen_fliplr
+    start_cor = ncol // 2 + smin
+    stop_cor = ncol // 2 + smax
+    list_cor = xp.arange(start_cor, stop_cor + 0.5, 0.5, dtype=xp.float32)
+    list_shift = 2.0 * (list_cor - cen_fliplr)
+    list_metric = xp.empty(list_shift.shape, dtype=xp.float32)
+    _calculate_metric(list_shift, sino, flip_sino, comp_sino, mask, list_metric)
+
+    minpos = xp.argmin(list_metric)
+    if minpos == 0:
+        print("WARNING!!!Global minimum is out of searching range")
+        print(f"Please extend smin: {smin}")
+    if minpos == len(list_metric) - 1:
+        print("WARNING!!!Global minimum is out of searching range")
+        print(f"Please extend smax: {smax}")
+    cor = list_cor[minpos]
+    return cor
+
+
+@nvtx.annotate()
+def _search_fine(sino, srad, step, init_cen, ratio, drop):
+    (nrow, ncol) = sino.shape
+
+    flip_sino = xp.ascontiguousarray(xp.fliplr(sino))
+    comp_sino = xp.ascontiguousarray(xp.flipud(sino))
+    mask = _create_mask(2 * nrow, ncol, 0.5 * ratio * ncol, drop)
+
+    cen_fliplr = (ncol - 1.0) / 2.0
+    srad = max(min(abs(float(srad)), ncol / 4.0), 1.0)
+    step = max(min(abs(step), srad), 0.1)
+    init_cen = max(min(init_cen, ncol - srad - 1), srad)
+    list_cor = init_cen + xp.arange(-srad, srad + step, step, dtype=np.float32)
+    list_shift = 2.0 * (list_cor - cen_fliplr)
+    list_metric = xp.empty(list_shift.shape, dtype="float32")
+
+    _calculate_metric(list_shift, sino, flip_sino, comp_sino, mask, out=list_metric)
+    cor = list_cor[xp.argmin(list_metric)]
+    return cor
+
+
+@nvtx.annotate()
+def _create_mask(nrow, ncol, radius, drop):
+    du = 1.0 / ncol
+    dv = (nrow - 1.0) / (nrow * 2.0 * np.pi)
+    cen_row = int(math.ceil(nrow / 2.0) - 1)
+    cen_col = int(math.ceil(ncol / 2.0) - 1)
+    drop = min([drop, int(math.ceil(0.05 * nrow))])
+
+    block_x = 128
+    block_y = 1
+    block_dims = (block_x, block_y)
+    grid_x = (ncol // 2 + 1 + block_x - 1) // block_x
+    grid_y = nrow
+    grid_dims = (grid_x, grid_y)
+    mask = xp.empty((nrow, ncol // 2 + 1), dtype="uint16")
+    params = (
+        ncol,
+        nrow,
+        cen_col,
+        cen_row,
+        xp.float32(du),
+        xp.float32(dv),
+        xp.float32(radius),
+        xp.float32(drop),
+        mask,
+    )
+    module = load_cuda_module("generate_mask")
+    kernel = module.get_function("generate_mask")
+    kernel(grid_dims, block_dims, params)
+    return mask
+
+
+def round_up(x: float) -> int:
+    if x >= 0.0:
+        return int(math.ceil(x))
+    else:
+        return int(math.floor(x))
+
+
+def _get_available_gpu_memory() -> int:
+    dev = xp.cuda.Device()
+    # first, let's make some space
+    xp.get_default_memory_pool().free_all_blocks()
+    cache = xp.fft.config.get_plan_cache()
+    cache.clear()
+    available_memory = dev.mem_info[0] + xp.get_default_memory_pool().free_bytes()
+    return int(available_memory * 0.9)  # 10% safety margin
+
+
+def _calculate_chunks(
+    nshifts: int, shift_size: int, available_memory: Optional[int] = None
+) -> List[int]:
+    if available_memory is None:
+        available_memory = _get_available_gpu_memory()
+
+    available_memory -= shift_size
+    freq_domain_size = (
+        shift_size  # it needs only half (RFFT), but complex64, so it's the same
+    )
+    fft_plan_size = freq_domain_size
+    size_per_shift = fft_plan_size + freq_domain_size + shift_size
+    nshift_max = available_memory // size_per_shift
+    assert nshift_max > 0, "Not enough memory to process"
+    num_chunks = int(np.ceil(nshifts / nshift_max))
+    chunk_size = int(np.ceil(nshifts / num_chunks))
+    chunks = [chunk_size] * (num_chunks - 1)
+    stop_idx = list(np.cumsum(chunks))
+    stop_idx.append(nshifts)
+    return stop_idx
+
+
+@nvtx.annotate()
+def _calculate_metric(list_shift, sino1, sino2, sino3, mask, out):
+    # this tries to simplify - if shift_col is integer, no need to spline interpolate
+    assert list_shift.dtype == xp.float32, "shifts must be single precision floats"
+    assert sino1.dtype == xp.float32, "sino1 must be float32"
+    assert sino2.dtype == xp.float32, "sino1 must be float32"
+    assert sino3.dtype == xp.float32, "sino1 must be float32"
+    assert out.dtype == xp.float32, "sino1 must be float32"
+    assert sino2.flags["C_CONTIGUOUS"], "sino2 must be C-contiguous"
+    assert sino3.flags["C_CONTIGUOUS"], "sino3 must be C-contiguous"
+    assert list_shift.flags["C_CONTIGUOUS"], "list_shift must be C-contiguous"
+    nshifts = list_shift.shape[0]
+    na1 = sino1.shape[0]
+    na2 = sino2.shape[0]
+
+    module = load_cuda_module("center_360_shifts")
+    shift_whole_shifts = module.get_function("shift_whole_shifts")
+    # note: we don't have to calculate the mean here, as we're only looking for minimum metric.
+    # The sum is enough.
+    masked_sum_abs_kernel = xp.ReductionKernel(
+        in_params="complex64 x, uint16 mask",  # input, complex + mask
+        out_params="float32 out",  # output, real
+        map_expr="mask ? abs(x) : 0.0f",
+        reduce_expr="a + b",
+        post_map_expr="out = a",
+        identity="0.0f",
+        reduce_type="float",
+        name="masked_sum_abs",
+    )
+
+    # determine how many shifts we can fit in the available memory
+    # and iterate in chunks
+    chunks = _calculate_chunks(
+        nshifts, (na1 + na2) * sino2.shape[1] * xp.float32().nbytes
+    )
+
+    mat = xp.empty((chunks[0], na1 + na2, sino2.shape[1]), dtype=xp.float32)
+    mat[:, :na1, :] = sino1
+    # explicitly create FFT plan here, so it's not cached and clearly re-used
+    plan = cupyx.scipy.fftpack.get_fft_plan(
+        mat, mat.shape[-2:], axes=(1, 2), value_type="R2C"
+    )
+
+    for i, stop_idx in enumerate(chunks):
+        if i > 0:
+            # more than one iteration means we're tight on memory, so clear up freed blocks
+            mat_freq = None
+            xp.get_default_memory_pool().free_all_blocks()
+
+        start_idx = 0 if i == 0 else chunks[i - 1]
+        size = stop_idx - start_idx
+
+        # first, handle the integer shifts without spline in a raw kernel,
+        # and shift in the sino3 one accordingly
+        bx = 128
+        gx = (sino3.shape[1] + bx - 1) // bx
+        shift_whole_shifts(
+            grid=(gx, na2, size),  ####
+            block=(bx, 1, 1),
+            args=(
+                sino2,
+                sino3,
+                list_shift[start_idx:stop_idx],
+                mat[:, na1:, :],
+                sino3.shape[1],
+                na1 + na2,
+            ),
+        )
+
+        # now we can only look at the spline shifting, the rest is done
+        list_shift_host = xp.asnumpy(list_shift[start_idx:stop_idx])
+        for i in range(list_shift_host.shape[0]):
+            shift_col = float(list_shift_host[i])
+            if not shift_col.is_integer():
+                shifted = shift(sino2, (0, shift_col), order=3, prefilter=True)
+                shift_int = round_up(shift_col)
+                if shift_int >= 0:
+                    mat[i, na1:, shift_int:] = shifted[:, shift_int:]
+                else:
+                    mat[i, na1:, :shift_int] = shifted[:, :shift_int]
+
+        # stack and transform
+        # (we do the full sized mat FFT, even though the last chunk may be smaller, to
+        # make sure we can re-use the same FFT plan as before)
+        mat_freq = cupyx.scipy.fft.rfft2(mat, axes=(1, 2), norm=None, plan=plan)
+        masked_sum_abs_kernel(
+            mat_freq[:size, :, :], mask, out=out[start_idx:stop_idx], axis=(1, 2)
+        )
+
+
+@nvtx.annotate()
+def _downsample(sino, level, axis):
+    assert sino.dtype == xp.float32, "single precision floating point input required"
+    assert sino.flags["C_CONTIGUOUS"], "list_shift must be C-contiguous"
+
+    dx, dz = sino.shape
+    # Determine the new size, dim, of the downsampled dimension
+    dim = int(sino.shape[axis] / math.pow(2, level))
+    shape = [dx, dz]
+    shape[axis] = dim
+    downsampled_data = xp.empty(shape, dtype="float32")
+
+    block_x = 8
+    block_y = 8
+    block_dims = (block_x, block_y)
+    grid_x = (sino.shape[1] + block_x - 1) // block_x
+    grid_y = (sino.shape[0] + block_y - 1) // block_y
+    grid_dims = (grid_x, grid_y)
+    # 8x8 thread-block, which means 16 "lots" of columns to downsample per
+    # thread-block; 4 bytes per float, so allocate 16*6 = 64 bytes of shared
+    # memeory per thread-block
+    shared_mem_bytes = 64
+    params = (sino, dx, dz, level, downsampled_data)
+    module = load_cuda_module("downsample_sino")
+    kernel = module.get_function("downsample_sino")
+    kernel(grid_dims, block_dims, params, shared_mem=shared_mem_bytes)
+    return downsampled_data
+
+
+##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+
+# %%%%%%%%%%%%%%%%%%%%%%%%%find_center_360%%%%%%%%%%%%%%%%%%%%%%%%%
+# --- Center of rotation (COR) estimation method ---#
+@nvtx.annotate()
+def find_center_360(
+    data: xp.ndarray,
+    ind: Optional[int] = None,
+    win_width: int = 10,
+    side: Optional[Literal[0, 1]] = None,
+    denoise: bool = True,
+    norm: bool = False,
+    use_overlap: bool = False,
+) -> Tuple[float, float, Optional[Literal[0, 1]], float]:
+    """
+    Find the center-of-rotation (COR) in a 360-degree scan with offset COR use
+    the method presented in Ref. [1] by Nghia Vo.
+
+    This function supports both numpy and cupy - the implementation is selected
+    by where the input data array resides.
+
+    Parameters
+    ----------
+    data : cp.ndarray
+        3D tomographic data as a Cupy array.
+    ind : int, optional
+        Index of the slice to be used for estimate the CoR and the overlap.
+    win_width : int, optional
+        Window width used for finding the overlap area.
+    side : {None, 0, 1}, optional
+        Overlap size. Only there options: None, 0, or 1. "None" corresponds
+        to fully automated determination. "0" corresponds to the left side.
+        "1" corresponds to the right side.
+    denoise : bool, optional
+        Apply the Gaussian filter if True.
+    norm : bool, optional
+        Apply the normalisation if True.
+    use_overlap : bool, optional
+        Use the combination of images in the overlap area for calculating
+        correlation coefficients if True.
+
+    Returns
+    -------
+    cor : float
+        Center-of-rotation.
+    overlap : float
+        Width of the overlap area between two halves of the sinogram.
+    side : int
+        Overlap side between two halves of the sinogram.
+    overlap_position : float
+        Position of the window in the first image giving the best
+        correlation metric.
+
+    References
+    ----------
+    [1] : https://doi.org/10.1364/OE.418448
+    """
+    if data.ndim != 3:
+        raise ValueError("A 3D array must be provided")
+
+    # this method works with a 360-degree sinogram.
+    if ind is None:
+        _sino = data[:, 0, :]
+    else:
+        _sino = data[:, ind, :]
+
+    (nrow, ncol) = _sino.shape
+    nrow_180 = nrow // 2 + 1
+    sino_top = _sino[0:nrow_180, :]
+    sino_bot = xp.fliplr(_sino[-nrow_180:, :])
+    (overlap, side, overlap_position) = _find_overlap(
+        sino_top, sino_bot, win_width, side, denoise, norm, use_overlap
+    )
+    if side == 0:
+        cor = overlap / 2.0 - 1.0
+    else:
+        cor = ncol - overlap / 2.0 - 1.0
+
+    return cor, overlap, side, overlap_position
+
+
+def _find_overlap(
+    mat1, mat2, win_width, side=None, denoise=True, norm=False, use_overlap=False
+):
+    """
+    Find the overlap area and overlap side between two images (Ref. [1]) where
+    the overlap side referring to the first image.
+
+    Parameters
+    ----------
+    mat1 : array_like
+        2D array. Projection image or sinogram image.
+    mat2 :  array_like
+        2D array. Projection image or sinogram image.
+    win_width : int
+        Width of the searching window.
+    side : {None, 0, 1}, optional
+        Only there options: None, 0, or 1. "None" corresponding to fully
+        automated determination. "0" corresponding to the left side. "1"
+        corresponding to the right side.
+    denoise : bool, optional
+        Apply the Gaussian filter if True.
+    norm : bool, optional
+        Apply the normalization if True.
+    use_overlap : bool, optional
+        Use the combination of images in the overlap area for calculating
+        correlation coefficients if True.
+
+    Returns
+    -------
+    overlap : float
+        Width of the overlap area between two images.
+    side : int
+        Overlap side between two images.
+    overlap_position : float
+        Position of the window in the first image giving the best
+        correlation metric.
+
+    """
+    ncol1 = mat1.shape[1]
+    ncol2 = mat2.shape[1]
+    win_width = int(np.clip(win_width, 6, min(ncol1, ncol2) // 2))
+
+    if side == 1:
+        (list_metric, offset) = _search_overlap(
+            mat1,
+            mat2,
+            win_width,
+            side=side,
+            denoise=denoise,
+            norm=norm,
+            use_overlap=use_overlap,
+        )
+        overlap_position = _calculate_curvature(list_metric)[1]
+        overlap_position += offset
+        overlap = ncol1 - overlap_position + win_width // 2
+    elif side == 0:
+        (list_metric, offset) = _search_overlap(
+            mat1,
+            mat2,
+            win_width,
+            side=side,
+            denoise=denoise,
+            norm=norm,
+            use_overlap=use_overlap,
+        )
+        overlap_position = _calculate_curvature(list_metric)[1]
+        overlap_position += offset
+        overlap = overlap_position + win_width // 2
+    else:
+        (list_metric1, offset1) = _search_overlap(
+            mat1,
+            mat2,
+            win_width,
+            side=1,
+            denoise=denoise,
+            norm=norm,
+            use_overlap=use_overlap,
+        )
+        (list_metric2, offset2) = _search_overlap(
+            mat1,
+            mat2,
+            win_width,
+            side=0,
+            denoise=denoise,
+            norm=norm,
+            use_overlap=use_overlap,
+        )
+
+        (curvature1, overlap_position1) = _calculate_curvature(list_metric1)
+        overlap_position1 += offset1
+        (curvature2, overlap_position2) = _calculate_curvature(list_metric2)
+        overlap_position2 += offset2
+
+        if curvature1 > curvature2:
+            side = 1
+            overlap_position = overlap_position1
+            overlap = ncol1 - overlap_position + win_width // 2
+        else:
+            side = 0
+            overlap_position = overlap_position2
+            overlap = overlap_position + win_width // 2
+
+    return overlap, side, overlap_position
+
+
+@nvtx.annotate()
+def _search_overlap(
+    mat1, mat2, win_width, side, denoise=True, norm=False, use_overlap=False
+):
+    """
+    Calculate the correlation metrics between a rectangular region, defined
+    by the window width, on the utmost left/right side of image 2 and the
+    same size region in image 1 where the region is slided across image 1.
+
+    Parameters
+    ----------
+    mat1 : array_like
+        2D array. Projection image or sinogram image.
+    mat2 : array_like
+        2D array. Projection image or sinogram image.
+    win_width : int
+        Width of the searching window.
+    side : {0, 1}
+        Only two options: 0 or 1. It is used to indicate the overlap side
+        respects to image 1. "0" corresponds to the left side. "1" corresponds
+        to the right side.
+    denoise : bool, optional
+        Apply the Gaussian filter if True.
+    norm : bool, optional
+        Apply the normalization if True.
+    use_overlap : bool, optional
+        Use the combination of images in the overlap area for calculating
+        correlation coefficients if True.
+
+    Returns
+    -------
+    list_metric : array_like
+        1D array. List of the correlation metrics.
+    offset : int
+        Initial position of the searching window where the position
+        corresponds to the center of the window.
+    """
+    if denoise is True:
+        # note: the filtering makes the output contiguous
+        with nvtx.annotate("denoise_filter", color="green"):
+            mat1 = gaussian_filter(mat1, (2, 2), mode="reflect")
+            mat2 = gaussian_filter(mat2, (2, 2), mode="reflect")
+    else:
+        mat1 = xp.ascontiguousarray(mat1, dtype=xp.float32)
+        mat2 = xp.ascontiguousarray(mat2, dtype=xp.float32)
+
+    (nrow1, ncol1) = mat1.shape
+    (nrow2, ncol2) = mat2.shape
+
+    if nrow1 != nrow2:
+        raise ValueError("Two images are not at the same height!!!")
+
+    win_width = int(np.clip(win_width, 6, min(ncol1, ncol2) // 2 - 1))
+    offset = win_width // 2
+    win_width = 2 * offset  # Make it even
+
+    list_metric = _calc_metrics(mat1, mat2, win_width, side, use_overlap, norm)
+
+    min_metric = xp.min(list_metric)
+    if min_metric != 0.0:
+        list_metric /= min_metric
+
+    return list_metric, offset
+
+
+_calc_metrics_module = load_cuda_module(
+    "calc_metrics",
+    name_expressions=[
+        "calc_metrics_kernel<false, false>",
+        "calc_metrics_kernel<true, false>",
+        "calc_metrics_kernel<false, true>",
+        "calc_metrics_kernel<true, true>",
+    ],
+    options=("--maxrregcount=32",),
+)
+
+
+@nvtx.annotate()
+def _calc_metrics(mat1, mat2, win_width, side, use_overlap, norm):
+    assert mat1.dtype == xp.float32, "only float32 supported"
+    assert mat2.dtype == xp.float32, "only float32 supported"
+    assert mat1.shape[0] == mat2.shape[0]
+    assert mat1.flags.c_contiguous, "only contiguos arrays supported"
+    assert mat2.flags.c_contiguous, "only contiguos arrays supported"
+
+    num_pos = mat1.shape[1] - win_width
+    list_metric = xp.empty(num_pos, dtype=xp.float32)
+
+    args = (
+        mat1,
+        np.int32(mat1.strides[0] / mat1.strides[1]),
+        mat2,
+        np.int32(mat2.strides[0] / mat2.strides[1]),
+        np.int32(win_width),
+        np.int32(mat1.shape[0]),
+        np.int32(side),
+        list_metric,
+    )
+    block = (128, 1, 1)
+    grid = (1, np.int32(num_pos), 1)
+    smem = block[0] * 4 * 6 if use_overlap else block[0] * 4 * 3
+    bool2str = lambda x: "true" if x is True else "false"
+    calc_metrics = _calc_metrics_module.get_function(
+        f"calc_metrics_kernel<{bool2str(norm)}, {bool2str(use_overlap)}>"
+    )
+    calc_metrics(grid=grid, block=block, args=args, shared_mem=smem)
+
+    return list_metric
+
+
+@nvtx.annotate()
+def _calculate_curvature(list_metric):
+    """
+    Calculate the curvature of a fitted curve going through the minimum
+    value of a metric list.
+
+    Parameters
+    ----------
+    list_metric : array_like
+        1D array. List of metrics.
+
+    Returns
+    -------
+    curvature : float
+        Quadratic coefficient of the parabola fitting.
+    min_pos : float
+        Position of the minimum value with sub-pixel accuracy.
+    """
+    radi = 2
+    num_metric = list_metric.size
+    min_metric_idx = int(xp.argmin(list_metric))
+    min_pos = int(np.clip(min_metric_idx, radi, num_metric - radi - 1))
+
+    # work mostly on CPU here - we have very small arrays here
+    list1 = xp.asnumpy(list_metric[min_pos - radi : min_pos + radi + 1])
+    afact1 = np.polyfit(np.arange(0, 2 * radi + 1), list1, 2)[0]
+    list2 = xp.asnumpy(list_metric[min_pos - 1 : min_pos + 2])
+    (afact2, bfact2, _) = np.polyfit(np.arange(min_pos - 1, min_pos + 2), list2, 2)
+
+    curvature = np.abs(afact1)
+    if afact2 != 0.0:
+        num = -bfact2 / (2 * afact2)
+        if (num >= min_pos - 1) and (num <= min_pos + 1):
+            min_pos = num
+
+    return curvature, np.float32(min_pos)
+
+
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+
+# %%%%%%%%%%%%%%%%%%%%%%find_center_pc%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+@nvtx.annotate()
+def find_center_pc(
+    proj1: xp.ndarray, proj2: xp.ndarray, tol: float = 0.5, rotc_guess: float = None
+) -> float:
+    """Find rotation axis location by finding the offset between the first
+    projection and a mirrored projection 180 degrees apart using
+    phase correlation in Fourier space.
+    The ``phase_cross_correlation`` function uses cross-correlation in Fourier
+    space, optionally employing an upsampled matrix-multiplication DFT to
+    achieve arbitrary subpixel precision. :cite:`Guizar:08`.
+
+    Args:
+        proj1 (xp.ndarray): Projection from the 0th degree
+        proj2 (xp.ndarray): Projection from the 180th degree
+        tol (float, optional): Subpixel accuracy. Defaults to 0.5.
+        rotc_guess (float, optional): Initial guess value for the rotation center. Defaults to None.
+
+    Returns:
+        float: Rotation axis location.
+    """
+    imgshift = 0.0 if rotc_guess is None else rotc_guess - (proj1.shape[1] - 1.0) / 2.0
+
+    proj1 = shift(proj1, [0, -imgshift], mode="constant", cval=0)
+    proj2 = shift(proj2, [0, -imgshift], mode="constant", cval=0)
+
+    # create reflection of second projection
+    proj2 = xp.fliplr(proj2)
+
+    # using cucim of rapids to do phase cross correlation between two images
+    shiftr = phase_cross_correlation(
+        reference_image=proj1, moving_image=proj2, upsample_factor=1.0 / tol
+    )
+
+    # Compute center of rotation as the center of first image and the
+    # registered translation with the second image
+    center = (proj1.shape[1] + shiftr[0][1] - 1.0) / 2.0
+
+    return center + imgshift
+
+
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
diff --git a/pyproject.toml b/pyproject.toml
index 317818fd..282e66ac 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -6,7 +6,8 @@ build-backend = "setuptools.build_meta"
 include-package-data = true
 packages = ["httomolibgpu",
             "httomolibgpu.misc",
-            "httomolibgpu.prep",            
+            "httomolibgpu.prep",
+            "httomolibgpu.recon",
             "httomolibgpu.cuda_kernels"]
 
 [tool.setuptools.package-data]

From 2d5e799ca2cb519cdf33eb845dbbedcef9c9db01 Mon Sep 17 00:00:00 2001
From: dkazanc <dkazanc@hotmail.com>
Date: Tue, 30 Apr 2024 22:25:17 +0100
Subject: [PATCH 12/36] mod of init file

---
 httomolibgpu/__init__.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/httomolibgpu/__init__.py b/httomolibgpu/__init__.py
index 016c8d77..09acafcf 100644
--- a/httomolibgpu/__init__.py
+++ b/httomolibgpu/__init__.py
@@ -3,7 +3,11 @@
 from httomolibgpu.misc.rescale import *
 from httomolibgpu.prep.alignment import *
 from httomolibgpu.prep.normalize import *
-from httomolibgpu.prep.phase import *
-from httomolibgpu.prep.stripe import *
-from httomolibgpu.recon.algorithm import *
-from httomolibgpu.recon.rotation import *
+from httomolibgpu.prep.phase import paganin_filter_savu, paganin_filter_tomopy
+from httomolibgpu.prep.stripe import (
+    remove_stripe_based_sorting,
+    remove_stripe_ti,
+    remove_all_stripe,
+)
+from httomolibgpu.recon.algorithm import FBP, SIRT, CGLS
+from httomolibgpu.recon.rotation import find_center_vo, find_center_360, find_center_pc

From 29affedcbf508995ae90d27fb31ddfec74311018 Mon Sep 17 00:00:00 2001
From: dkazanc <dkazanc@hotmail.com>
Date: Tue, 30 Apr 2024 22:35:02 +0100
Subject: [PATCH 13/36] corr to alignment

---
 httomolibgpu/__init__.py       | 19 ++++++++++---------
 httomolibgpu/prep/alignment.py |  9 +++++++--
 2 files changed, 17 insertions(+), 11 deletions(-)

diff --git a/httomolibgpu/__init__.py b/httomolibgpu/__init__.py
index 09acafcf..d742c19b 100644
--- a/httomolibgpu/__init__.py
+++ b/httomolibgpu/__init__.py
@@ -2,12 +2,13 @@
 from httomolibgpu.misc.morph import *
 from httomolibgpu.misc.rescale import *
 from httomolibgpu.prep.alignment import *
-from httomolibgpu.prep.normalize import *
-from httomolibgpu.prep.phase import paganin_filter_savu, paganin_filter_tomopy
-from httomolibgpu.prep.stripe import (
-    remove_stripe_based_sorting,
-    remove_stripe_ti,
-    remove_all_stripe,
-)
-from httomolibgpu.recon.algorithm import FBP, SIRT, CGLS
-from httomolibgpu.recon.rotation import find_center_vo, find_center_360, find_center_pc
+
+# from httomolibgpu.prep.normalize import *
+# from httomolibgpu.prep.phase import paganin_filter_savu, paganin_filter_tomopy
+# from httomolibgpu.prep.stripe import (
+#     remove_stripe_based_sorting,
+#     remove_stripe_ti,
+#     remove_all_stripe,
+# )
+# from httomolibgpu.recon.algorithm import FBP, SIRT, CGLS
+# from httomolibgpu.recon.rotation import find_center_vo, find_center_360, find_center_pc
diff --git a/httomolibgpu/prep/alignment.py b/httomolibgpu/prep/alignment.py
index c5ffec32..00159093 100644
--- a/httomolibgpu/prep/alignment.py
+++ b/httomolibgpu/prep/alignment.py
@@ -23,12 +23,14 @@
 import numpy as xp
 import numpy as np
 
+cupy_run = False
 try:
     import cupy as xp
     from cupy import mean
 
     try:
         xp.cuda.Device(0).compute_capability
+        cupy_run = True
     except xp.cuda.runtime.CUDARuntimeError:
         print("CuPy library is a major dependency for HTTomolibgpu, please install")
         import numpy as np
@@ -38,6 +40,11 @@
 from typing import Dict, List
 import nvtx
 
+if cupy_run:
+    from cupyx.scipy.ndimage import map_coordinates
+else:
+    from scipy.ndimage import map_coordinates
+
 __all__ = [
     "distortion_correction_proj_discorpy",
 ]
@@ -85,8 +92,6 @@ def distortion_correction_proj_discorpy(
     cp.ndarray
         3D array. Distortion-corrected image(s).
     """
-    from cupyx.scipy.ndimage import map_coordinates
-
     # Check if it's a stack of 2D images, or only a single 2D image
     if len(data.shape) == 2:
         data = xp.expand_dims(data, axis=0)

From eb5da60ed3cb3207ec52c29120877ac53bdbcd8b Mon Sep 17 00:00:00 2001
From: dkazanc <dkazanc@hotmail.com>
Date: Wed, 1 May 2024 10:10:38 +0100
Subject: [PATCH 14/36] fixing imports

---
 httomolibgpu/__init__.py       | 13 ++++++-------
 httomolibgpu/misc/corr.py      | 13 +++++++------
 httomolibgpu/misc/morph.py     | 18 +++++++++++-------
 httomolibgpu/misc/rescale.py   | 29 ++++++++++++++++++++++++-----
 httomolibgpu/prep/alignment.py |  7 ++-----
 httomolibgpu/prep/normalize.py |  7 +++----
 6 files changed, 53 insertions(+), 34 deletions(-)

diff --git a/httomolibgpu/__init__.py b/httomolibgpu/__init__.py
index d742c19b..15a49d24 100644
--- a/httomolibgpu/__init__.py
+++ b/httomolibgpu/__init__.py
@@ -1,10 +1,9 @@
-from httomolibgpu.misc.corr import *
-from httomolibgpu.misc.morph import *
-from httomolibgpu.misc.rescale import *
-from httomolibgpu.prep.alignment import *
-
-# from httomolibgpu.prep.normalize import *
-# from httomolibgpu.prep.phase import paganin_filter_savu, paganin_filter_tomopy
+from httomolibgpu.misc.corr import median_filter, remove_outlier
+from httomolibgpu.misc.morph import sino_360_to_180, data_resampler
+from httomolibgpu.misc.rescale import rescale_to_int
+from httomolibgpu.prep.alignment import distortion_correction_proj_discorpy
+from httomolibgpu.prep.normalize import normalize
+from httomolibgpu.prep.phase import paganin_filter_savu, paganin_filter_tomopy
 # from httomolibgpu.prep.stripe import (
 #     remove_stripe_based_sorting,
 #     remove_stripe_ti,
diff --git a/httomolibgpu/misc/corr.py b/httomolibgpu/misc/corr.py
index e0607ae5..f00e3d88 100644
--- a/httomolibgpu/misc/corr.py
+++ b/httomolibgpu/misc/corr.py
@@ -20,29 +20,30 @@
 # ---------------------------------------------------------------------------
 """ Module for data correction """
 
-import numpy as xp
-
+import numpy as np
 cupy_run = False
 try:
     import cupy as xp
+    from cupy import mean
 
     try:
         xp.cuda.Device(0).compute_capability
         cupy_run = True
-
     except xp.cuda.runtime.CUDARuntimeError:
         print("CuPy library is a major dependency for HTTomolibgpu, please install")
-        import numpy as np
+        import numpy as xp
 except ImportError:
-    import numpy as np
+    import numpy as xp
 
 try:
     from cucim.skimage.filters import median
     from cucim.skimage.morphology import disk
 except ImportError:
     print(
-        "Cucim library of Rapidsai is a required dependency for some modules, please install"
+        "Cucim library of Rapidsai is a required dependency for median_filter and remove_outlier modules, please install"
     )
+    from skimage.filters import median
+    from skimage.morphology import disk
 
 from numpy import float32
 import nvtx
diff --git a/httomolibgpu/misc/morph.py b/httomolibgpu/misc/morph.py
index 06bd9ef2..75fa8774 100644
--- a/httomolibgpu/misc/morph.py
+++ b/httomolibgpu/misc/morph.py
@@ -20,20 +20,25 @@
 # ---------------------------------------------------------------------------
 """Module for data type morphing functions"""
 
-import numpy as xp
 import numpy as np
-
+cupy_run = False
 try:
     import cupy as xp
 
     try:
         xp.cuda.Device(0).compute_capability
-
+        cupy_run = True
     except xp.cuda.runtime.CUDARuntimeError:
         print("CuPy library is a major dependency for HTTomolibgpu, please install")
-        import numpy as np
+        import numpy as xp
 except ImportError:
-    import numpy as np
+    import numpy as xp
+
+if cupy_run:
+    from cupyx.scipy.interpolate import interpn
+else:
+    from scipy.interpolate import interpn
+
 import nvtx
 from typing import Literal
 
@@ -123,8 +128,7 @@ def data_resampler(
 
     Returns:
         cp.ndarray: Up/Down-scaled 3D cupy array
-    """
-    from cupyx.scipy.interpolate import interpn
+    """   
 
     if data.ndim != 3:
         raise ValueError("only 3D data is supported")
diff --git a/httomolibgpu/misc/rescale.py b/httomolibgpu/misc/rescale.py
index 339808e2..274e17f4 100644
--- a/httomolibgpu/misc/rescale.py
+++ b/httomolibgpu/misc/rescale.py
@@ -1,17 +1,36 @@
-import numpy as xp
-import numpy as np
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+# ---------------------------------------------------------------------------
+# Copyright 2023 Diamond Light Source Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ---------------------------------------------------------------------------
+# Created By  : Tomography Team at DLS <scientificsoftware@diamond.ac.uk>
+# Created Date: 1 March 2024
+# ---------------------------------------------------------------------------
 
+import numpy as np
 try:
     import cupy as xp
 
     try:
         xp.cuda.Device(0).compute_capability
-
     except xp.cuda.runtime.CUDARuntimeError:
         print("CuPy library is a major dependency for HTTomolibgpu, please install")
-        import numpy as np
+        import numpy as xp
 except ImportError:
-    import numpy as np
+    import numpy as xp
+
 import nvtx
 from typing import Literal, Optional, Tuple, Union
 
diff --git a/httomolibgpu/prep/alignment.py b/httomolibgpu/prep/alignment.py
index 00159093..484488db 100644
--- a/httomolibgpu/prep/alignment.py
+++ b/httomolibgpu/prep/alignment.py
@@ -20,22 +20,19 @@
 # ---------------------------------------------------------------------------
 """Modules for data correction"""
 
-import numpy as xp
 import numpy as np
-
 cupy_run = False
 try:
     import cupy as xp
-    from cupy import mean
 
     try:
         xp.cuda.Device(0).compute_capability
         cupy_run = True
     except xp.cuda.runtime.CUDARuntimeError:
         print("CuPy library is a major dependency for HTTomolibgpu, please install")
-        import numpy as np
+        import numpy as xp
 except ImportError:
-    import numpy as np
+    import numpy as xp
 
 from typing import Dict, List
 import nvtx
diff --git a/httomolibgpu/prep/normalize.py b/httomolibgpu/prep/normalize.py
index 94276142..c2e65993 100644
--- a/httomolibgpu/prep/normalize.py
+++ b/httomolibgpu/prep/normalize.py
@@ -20,9 +20,8 @@
 # ---------------------------------------------------------------------------
 """Modules for raw projection data normalization"""
 
-import numpy as xp
-import numpy as np
 
+import numpy as np
 try:
     import cupy as xp
     from cupy import mean
@@ -31,9 +30,9 @@
         xp.cuda.Device(0).compute_capability
     except xp.cuda.runtime.CUDARuntimeError:
         print("CuPy library is a major dependency for HTTomolibgpu, please install")
-        import numpy as np
+        import numpy as xp
 except ImportError:
-    import numpy as np
+    import numpy as xp
 
 import nvtx
 from numpy import float32

From 959a3af96405af2077c9b0c0263f86ca65ed1ed6 Mon Sep 17 00:00:00 2001
From: dkazanc <dkazanc@hotmail.com>
Date: Wed, 1 May 2024 10:21:32 +0100
Subject: [PATCH 15/36] fixing imports2

---
 httomolibgpu/__init__.py       |  2 +-
 httomolibgpu/prep/normalize.py | 12 ++++++++----
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/httomolibgpu/__init__.py b/httomolibgpu/__init__.py
index 15a49d24..9585d4d4 100644
--- a/httomolibgpu/__init__.py
+++ b/httomolibgpu/__init__.py
@@ -3,7 +3,7 @@
 from httomolibgpu.misc.rescale import rescale_to_int
 from httomolibgpu.prep.alignment import distortion_correction_proj_discorpy
 from httomolibgpu.prep.normalize import normalize
-from httomolibgpu.prep.phase import paganin_filter_savu, paganin_filter_tomopy
+#from httomolibgpu.prep.phase import paganin_filter_savu, paganin_filter_tomopy
 # from httomolibgpu.prep.stripe import (
 #     remove_stripe_based_sorting,
 #     remove_stripe_ti,
diff --git a/httomolibgpu/prep/normalize.py b/httomolibgpu/prep/normalize.py
index c2e65993..d8c1625a 100644
--- a/httomolibgpu/prep/normalize.py
+++ b/httomolibgpu/prep/normalize.py
@@ -20,21 +20,25 @@
 # ---------------------------------------------------------------------------
 """Modules for raw projection data normalization"""
 
-
-import numpy as np
+cupy_run = False
 try:
-    import cupy as xp
-    from cupy import mean
+    import cupy as xp    
 
     try:
         xp.cuda.Device(0).compute_capability
+        cupy_run = True
     except xp.cuda.runtime.CUDARuntimeError:
         print("CuPy library is a major dependency for HTTomolibgpu, please install")
         import numpy as xp
 except ImportError:
     import numpy as xp
 
+if cupy_run:
+    from cupy import mean
+else:
+    from numpy import mean
 import nvtx
+import numpy as np
 from numpy import float32
 from typing import Tuple
 

From d2ff482b7cc55ee963fdba44b2377b519378de76 Mon Sep 17 00:00:00 2001
From: dkazanc <dkazanc@hotmail.com>
Date: Wed, 1 May 2024 10:27:12 +0100
Subject: [PATCH 16/36] adding scikit image

---
 conda/environment.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/conda/environment.yml b/conda/environment.yml
index aa0974ff..bfc95175 100644
--- a/conda/environment.yml
+++ b/conda/environment.yml
@@ -24,3 +24,4 @@ dependencies:
   - astra-toolbox::astra-toolbox
   - httomo::tomobar
   - rapidsai::cucim
+  - anaconda::scikit-image

From 768cbc365bf5b52ab1f7577092f8fa9e05dbdf17 Mon Sep 17 00:00:00 2001
From: dkazanc <dkazanc@hotmail.com>
Date: Wed, 1 May 2024 10:46:41 +0100
Subject: [PATCH 17/36] get the phase in

---
 httomolibgpu/__init__.py   |  2 +-
 httomolibgpu/prep/phase.py | 19 ++++++++++---------
 2 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/httomolibgpu/__init__.py b/httomolibgpu/__init__.py
index 9585d4d4..15a49d24 100644
--- a/httomolibgpu/__init__.py
+++ b/httomolibgpu/__init__.py
@@ -3,7 +3,7 @@
 from httomolibgpu.misc.rescale import rescale_to_int
 from httomolibgpu.prep.alignment import distortion_correction_proj_discorpy
 from httomolibgpu.prep.normalize import normalize
-#from httomolibgpu.prep.phase import paganin_filter_savu, paganin_filter_tomopy
+from httomolibgpu.prep.phase import paganin_filter_savu, paganin_filter_tomopy
 # from httomolibgpu.prep.stripe import (
 #     remove_stripe_based_sorting,
 #     remove_stripe_ti,
diff --git a/httomolibgpu/prep/phase.py b/httomolibgpu/prep/phase.py
index d126fe37..4261912d 100644
--- a/httomolibgpu/prep/phase.py
+++ b/httomolibgpu/prep/phase.py
@@ -21,6 +21,7 @@
 """Modules for phase retrieval and phase-contrast enhancement"""
 
 cupy_run = False
+import numpy as np
 try:
     import cupy as xp
 
@@ -35,11 +36,13 @@
     import numpy as np
 
 from numpy import float32
-import numpy as np
 import nvtx
 
 if cupy_run:
     from httomolibgpu.cuda_kernels import load_cuda_module
+    from cupyx.scipy.fft import fft2, ifft2, fftshift
+else:
+    from scipy.fft import fft2, ifft2, fftshift
 
 __all__ = [
     "paganin_filter_savu",
@@ -104,8 +107,7 @@ def paganin_filter_savu(
     -------
     cp.ndarray
         The stack of filtered projections.
-    """
-    import cupyx
+    """    
 
     # Check the input data is valid
     if data.ndim != 3:
@@ -170,7 +172,7 @@ def paganin_filter_savu(
 
     # avoid normalising in both directions - we include multiplier in the post_kernel
     data = xp.asarray(data, dtype=xp.complex64)
-    data = cupyx.scipy.fft.fft2(data, axes=(-2, -1), overwrite_x=True, norm="backward")
+    data = fft2(data, axes=(-2, -1), overwrite_x=True, norm="backward")
 
     # prepare filter here, while the GPU is busy with the FFT
     filtercomplex = xp.empty((height1, width1), dtype=np.complex64)
@@ -193,7 +195,7 @@ def paganin_filter_savu(
     )
     data *= filtercomplex
 
-    data = cupyx.scipy.fft.ifft2(data, axes=(-2, -1), overwrite_x=True, norm="forward")
+    data = ifft2(data, axes=(-2, -1), overwrite_x=True, norm="forward")
 
     post_kernel = xp.ElementwiseKernel(
         "C pci1, raw float32 increment, raw float32 ratio, raw float32 fft_scale",
@@ -318,7 +320,6 @@ def paganin_filter_tomopy(
     cp.ndarray
         The 3D array of Paganin phase-filtered projection images.
     """
-    import cupyx
 
     # Check the input data is valid
     if tomo.ndim != 3:
@@ -337,20 +338,20 @@ def paganin_filter_tomopy(
 
     # 3D FFT of tomo data
     padded_tomo = xp.asarray(padded_tomo, dtype=xp.complex64)
-    fft_tomo = cupyx.scipy.fft.fft2(padded_tomo, axes=(-2, -1), overwrite_x=True)
+    fft_tomo = fft2(padded_tomo, axes=(-2, -1), overwrite_x=True)
 
     # Compute the reciprocal grid.
     w2 = _reciprocal_grid(pixel_size, (dy, dx))
 
     # Build filter in the Fourier space.
-    phase_filter = cupyx.scipy.fft.fftshift(
+    phase_filter = fftshift(
         _paganin_filter_factor2(energy, dist, alpha, w2)
     )
     phase_filter = phase_filter / phase_filter.max()  # normalisation
 
     # Apply filter and take inverse FFT
     ifft_filtered_tomo = (
-        cupyx.scipy.fft.ifft2(phase_filter * fft_tomo, axes=(-2, -1), overwrite_x=True)
+        ifft2(phase_filter * fft_tomo, axes=(-2, -1), overwrite_x=True)
     ).real
 
     # slicing indices for cropping

From d9756bb76484d7389fe855615d0a06db451b9925 Mon Sep 17 00:00:00 2001
From: dkazanc <dkazanc@hotmail.com>
Date: Wed, 1 May 2024 11:20:29 +0100
Subject: [PATCH 18/36] phase corr

---
 httomolibgpu/__init__.py    |  2 +-
 httomolibgpu/prep/phase.py  |  7 ++++---
 httomolibgpu/prep/stripe.py | 16 +++++++++-------
 3 files changed, 14 insertions(+), 11 deletions(-)

diff --git a/httomolibgpu/__init__.py b/httomolibgpu/__init__.py
index 15a49d24..e377254f 100644
--- a/httomolibgpu/__init__.py
+++ b/httomolibgpu/__init__.py
@@ -3,7 +3,7 @@
 from httomolibgpu.misc.rescale import rescale_to_int
 from httomolibgpu.prep.alignment import distortion_correction_proj_discorpy
 from httomolibgpu.prep.normalize import normalize
-from httomolibgpu.prep.phase import paganin_filter_savu, paganin_filter_tomopy
+from httomolibgpu.prep.phase import paganin_filter_tomopy
 # from httomolibgpu.prep.stripe import (
 #     remove_stripe_based_sorting,
 #     remove_stripe_ti,
diff --git a/httomolibgpu/prep/phase.py b/httomolibgpu/prep/phase.py
index 4261912d..bdf2f151 100644
--- a/httomolibgpu/prep/phase.py
+++ b/httomolibgpu/prep/phase.py
@@ -36,6 +36,7 @@
     import numpy as np
 
 from numpy import float32
+from typing import Union
 import nvtx
 
 if cupy_run:
@@ -379,7 +380,7 @@ def _shift_bit_length(x: int) -> int:
     return 1 << (x - 1).bit_length()
 
 
-def _pad_projections_to_second_power(tomo: xp.ndarray) -> tuple[xp.ndarray, tuple]:
+def _pad_projections_to_second_power(tomo: xp.ndarray) -> Union[xp.ndarray, tuple]:
     """
     Performs padding of each projection to the next power of 2.
     If the shape is not even we also care of that before padding.
@@ -403,8 +404,8 @@ def _pad_projections_to_second_power(tomo: xp.ndarray) -> tuple[xp.ndarray, tupl
         else:
             diff = _shift_bit_length(element + 1) - element
             if element % 2 == 0:
-                pad_width = diff // 2
-                pad_width = (pad_width, pad_width)
+                pad_width_scalar = diff // 2
+                pad_width = (pad_width_scalar, pad_width_scalar)
             else:
                 # need an uneven padding for odd-number lengths
                 left_pad = diff // 2
diff --git a/httomolibgpu/prep/stripe.py b/httomolibgpu/prep/stripe.py
index 0c37f608..af7c81a5 100644
--- a/httomolibgpu/prep/stripe.py
+++ b/httomolibgpu/prep/stripe.py
@@ -20,22 +20,24 @@
 # ---------------------------------------------------------------------------
 """Modules for stripes removal"""
 
-import numpy as xp
 import numpy as np
-
+cupy_run = False
 try:
     import cupy as xp
-    from cupyx.scipy.ndimage import median_filter
-    from cupyx.scipy.ndimage import binary_dilation
-    from cupyx.scipy.ndimage import uniform_filter1d
 
     try:
         xp.cuda.Device(0).compute_capability
+        cupy_run = True
     except xp.cuda.runtime.CUDARuntimeError:
         print("CuPy library is a major dependency for HTTomolibgpu, please install")
-        import numpy as np
+        import numpy as xp
 except ImportError:
-    import numpy as np
+    import numpy as xp
+
+if cupy_run:
+    from cupyx.scipy.ndimage import median_filter, binary_dilation, uniform_filter1d
+else:
+    from scipy.ndimage import median_filter, binary_dilation, uniform_filter1d
 
 import nvtx
 from typing import Union

From 8e0b01a52409e9c43b18581b9f9eb375177b68f5 Mon Sep 17 00:00:00 2001
From: dkazanc <dkazanc@hotmail.com>
Date: Wed, 1 May 2024 11:25:05 +0100
Subject: [PATCH 19/36] removes phase, get stripes in

---
 httomolibgpu/__init__.py    | 12 ++++++------
 httomolibgpu/prep/stripe.py |  5 -----
 2 files changed, 6 insertions(+), 11 deletions(-)

diff --git a/httomolibgpu/__init__.py b/httomolibgpu/__init__.py
index e377254f..d5b04dd0 100644
--- a/httomolibgpu/__init__.py
+++ b/httomolibgpu/__init__.py
@@ -3,11 +3,11 @@
 from httomolibgpu.misc.rescale import rescale_to_int
 from httomolibgpu.prep.alignment import distortion_correction_proj_discorpy
 from httomolibgpu.prep.normalize import normalize
-from httomolibgpu.prep.phase import paganin_filter_tomopy
-# from httomolibgpu.prep.stripe import (
-#     remove_stripe_based_sorting,
-#     remove_stripe_ti,
-#     remove_all_stripe,
-# )
+#from httomolibgpu.prep.phase import paganin_filter_savu, paganin_filter_tomopy
+from httomolibgpu.prep.stripe import (
+    remove_stripe_based_sorting,
+    remove_stripe_ti,
+    remove_all_stripe,
+)
 # from httomolibgpu.recon.algorithm import FBP, SIRT, CGLS
 # from httomolibgpu.recon.rotation import find_center_vo, find_center_360, find_center_pc
diff --git a/httomolibgpu/prep/stripe.py b/httomolibgpu/prep/stripe.py
index af7c81a5..f55a3e31 100644
--- a/httomolibgpu/prep/stripe.py
+++ b/httomolibgpu/prep/stripe.py
@@ -111,11 +111,6 @@ def _rs_sort(sinogram, size, dim):
     sino_sort = xp.take_along_axis(sinogram, sortvals, axis=1)
 
     #: Now apply the median filter on the sorted image along each row
-    if xp.__name__ == "cupy":
-        from cupyx.scipy.ndimage import median_filter
-    else:
-        from scipy.ndimage import median_filter
-
     sino_sort = median_filter(sino_sort, (size, 1) if dim == 1 else (size, size))
 
     #: step 3: re-sort the smoothed image columns to the original rows

From 69e392f94f7f43656c900066029ff2d9c30a8d0d Mon Sep 17 00:00:00 2001
From: dkazanc <dkazanc@hotmail.com>
Date: Wed, 1 May 2024 11:34:07 +0100
Subject: [PATCH 20/36] get algorithms in

---
 httomolibgpu/__init__.py        |  2 +-
 httomolibgpu/recon/algorithm.py | 18 +++++++-----------
 2 files changed, 8 insertions(+), 12 deletions(-)

diff --git a/httomolibgpu/__init__.py b/httomolibgpu/__init__.py
index d5b04dd0..371d1325 100644
--- a/httomolibgpu/__init__.py
+++ b/httomolibgpu/__init__.py
@@ -9,5 +9,5 @@
     remove_stripe_ti,
     remove_all_stripe,
 )
-# from httomolibgpu.recon.algorithm import FBP, SIRT, CGLS
+from httomolibgpu.recon.algorithm import FBP, SIRT, CGLS
 # from httomolibgpu.recon.rotation import find_center_vo, find_center_360, find_center_pc
diff --git a/httomolibgpu/recon/algorithm.py b/httomolibgpu/recon/algorithm.py
index 4daedc91..1680309a 100644
--- a/httomolibgpu/recon/algorithm.py
+++ b/httomolibgpu/recon/algorithm.py
@@ -20,9 +20,7 @@
 # ---------------------------------------------------------------------------
 """Module for tomographic reconstruction"""
 
-import numpy as xp
 import numpy as np
-
 cupy_run = False
 try:
     import cupy as xp
@@ -30,17 +28,15 @@
     try:
         xp.cuda.Device(0).compute_capability
         cupy_run = True
-
     except xp.cuda.runtime.CUDARuntimeError:
         print("CuPy library is a major dependency for HTTomolibgpu, please install")
-        import numpy as np
+        import numpy as xp
 except ImportError:
-    import numpy as np
+    import numpy as xp
 
 import nvtx
 from numpy import float32, complex64
-from typing import Optional, Tuple, Union
-from typing import Type
+from typing import Optional, Type
 
 from tomobar.methodsDIR_CuPy import RecToolsDIRCuPy
 from tomobar.methodsIR_CuPy import RecToolsIRCuPy
@@ -223,7 +219,7 @@ def _instantiate_direct_recon_class(
     center: Optional[float] = None,
     recon_size: Optional[int] = None,
     gpu_id: int = 0,
-) -> type[RecToolsDIRCuPy]:
+) -> Type[RecToolsDIRCuPy]:
     """instantiate ToMoBAR's direct recon class
 
     Args:
@@ -234,7 +230,7 @@ def _instantiate_direct_recon_class(
         gpu_id (int, optional): gpu ID. Defaults to 0.
 
     Returns:
-        type[RecToolsDIRCuPy]: an instance of the direct recon class
+        Type[RecToolsDIRCuPy]: an instance of the direct recon class
     """
     if center is None:
         center = data.shape[2] // 2  # making a crude guess
@@ -260,7 +256,7 @@ def _instantiate_iterative_recon_class(
     recon_size: Optional[int] = None,
     gpu_id: int = 0,
     datafidelity: str = "LS",
-) -> type[RecToolsIRCuPy]:
+) -> Type[RecToolsIRCuPy]:
     """instantiate ToMoBAR's iterative recon class
 
     Args:
@@ -272,7 +268,7 @@ def _instantiate_iterative_recon_class(
         gpu_id (int, optional): gpu ID. Defaults to 0.
 
     Returns:
-        type[RecToolsIRCuPy]: an instance of the iterative class
+        Type[RecToolsIRCuPy]: an instance of the iterative class
     """
     if center is None:
         center = data.shape[2] // 2  # making a crude guess

From 5032aef26b2509780c9e4ea851ec022fd8ed2948 Mon Sep 17 00:00:00 2001
From: dkazanc <dkazanc@hotmail.com>
Date: Wed, 1 May 2024 11:43:42 +0100
Subject: [PATCH 21/36] algorithms_corr

---
 httomolibgpu/prep/phase.py      | 4 ++--
 httomolibgpu/recon/algorithm.py | 8 ++++++--
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/httomolibgpu/prep/phase.py b/httomolibgpu/prep/phase.py
index bdf2f151..e078e566 100644
--- a/httomolibgpu/prep/phase.py
+++ b/httomolibgpu/prep/phase.py
@@ -20,8 +20,8 @@
 # ---------------------------------------------------------------------------
 """Modules for phase retrieval and phase-contrast enhancement"""
 
-cupy_run = False
 import numpy as np
+cupy_run = False
 try:
     import cupy as xp
 
@@ -176,7 +176,7 @@ def paganin_filter_savu(
     data = fft2(data, axes=(-2, -1), overwrite_x=True, norm="backward")
 
     # prepare filter here, while the GPU is busy with the FFT
-    filtercomplex = xp.empty((height1, width1), dtype=np.complex64)
+    filtercomplex = xp.empty((height1, width1), dtype=xp.complex64)
     bx = 16
     by = 8
     gx = (width1 + bx - 1) // bx
diff --git a/httomolibgpu/recon/algorithm.py b/httomolibgpu/recon/algorithm.py
index 1680309a..4b126066 100644
--- a/httomolibgpu/recon/algorithm.py
+++ b/httomolibgpu/recon/algorithm.py
@@ -38,8 +38,12 @@
 from numpy import float32, complex64
 from typing import Optional, Type
 
-from tomobar.methodsDIR_CuPy import RecToolsDIRCuPy
-from tomobar.methodsIR_CuPy import RecToolsIRCuPy
+if cupy_run:
+    from tomobar.methodsDIR_CuPy import RecToolsDIRCuPy
+    from tomobar.methodsIR_CuPy import RecToolsIRCuPy
+else:
+    from tomobar.methodsDIR import RecToolsDIR as RecToolsDIRCuPy
+    from tomobar.methodsIR import RecToolsIR as RecToolsIRCuPy
 
 __all__ = [
     "FBP",

From ac771dad44a25dfda63e2b0c7f4a57811786e6f5 Mon Sep 17 00:00:00 2001
From: dkazanc <dkazanc@hotmail.com>
Date: Wed, 1 May 2024 11:57:57 +0100
Subject: [PATCH 22/36] phase back

---
 httomolibgpu/__init__.py   |  2 +-
 httomolibgpu/prep/phase.py | 26 +++++++++++---------------
 2 files changed, 12 insertions(+), 16 deletions(-)

diff --git a/httomolibgpu/__init__.py b/httomolibgpu/__init__.py
index 371d1325..02f6bd89 100644
--- a/httomolibgpu/__init__.py
+++ b/httomolibgpu/__init__.py
@@ -3,7 +3,7 @@
 from httomolibgpu.misc.rescale import rescale_to_int
 from httomolibgpu.prep.alignment import distortion_correction_proj_discorpy
 from httomolibgpu.prep.normalize import normalize
-#from httomolibgpu.prep.phase import paganin_filter_savu, paganin_filter_tomopy
+from httomolibgpu.prep.phase import paganin_filter_savu, paganin_filter_tomopy
 from httomolibgpu.prep.stripe import (
     remove_stripe_based_sorting,
     remove_stripe_ti,
diff --git a/httomolibgpu/prep/phase.py b/httomolibgpu/prep/phase.py
index e078e566..073bcdb0 100644
--- a/httomolibgpu/prep/phase.py
+++ b/httomolibgpu/prep/phase.py
@@ -38,6 +38,7 @@
 from numpy import float32
 from typing import Union
 import nvtx
+import math
 
 if cupy_run:
     from httomolibgpu.cuda_kernels import load_cuda_module
@@ -50,13 +51,6 @@
     "paganin_filter_tomopy",
 ]
 
-# Define constants used in phase retrieval method
-BOLTZMANN_CONSTANT = 1.3806488e-16  # [erg/k]
-SPEED_OF_LIGHT = 299792458e2  # [cm/s]
-PI = 3.14159265359
-PLANCK_CONSTANT = 6.58211928e-19  # [keV*s]
-
-
 ## %%%%%%%%%%%%%%%%%%%%%%% paganin_filter %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%  ##
 #: CuPy implementation of Paganin filter from Savu
 @nvtx.annotate()
@@ -206,7 +200,7 @@ def paganin_filter_savu(
         no_return=True,
     )
     fft_scale = 1.0 / (data.shape[1] * data.shape[2])
-    res = xp.empty((data.shape[0], height, width), dtype=np.float32)
+    res = xp.empty((data.shape[0], height, width), dtype=xp.float32)
     post_kernel(
         data[:, pad_y : pad_y + height, pad_x : pad_x + width],
         np.float32(increment),
@@ -219,17 +213,19 @@ def paganin_filter_savu(
 
 
 def _wavelength(energy: float) -> float:
-    return 2 * PI * PLANCK_CONSTANT * SPEED_OF_LIGHT / energy
+    SPEED_OF_LIGHT = 299792458e2  # [cm/s]
+    PLANCK_CONSTANT = 6.58211928e-19  # [keV*s]
+    return 2 * math.pi * PLANCK_CONSTANT * SPEED_OF_LIGHT / energy
 
 
 def _paganin_filter_factor(
     energy: float, dist: float, alpha: float, w2: xp.ndarray
 ) -> xp.ndarray:
-    return 1 / (_wavelength(energy) * dist * w2 / (4 * PI) + alpha)
+    return 1 / (_wavelength(energy) * dist * w2 / (4 * math.pi) + alpha)
 
 
 def _calc_pad_width(dim: int, pixel_size: float, wavelength: float, dist: float) -> int:
-    pad_pix = xp.ceil(PI * wavelength * dist / pixel_size**2)
+    pad_pix = xp.ceil(math.pi * wavelength * dist / pixel_size**2)
     return int((pow(2, xp.ceil(xp.log2(dim + pad_pix))) - dim) * 0.5)
 
 
@@ -281,7 +277,7 @@ def _reciprocal_coord(pixel_size: float, num_grid: int) -> xp.ndarray:
     """
     n = num_grid - 1
     rc = xp.arange(-n, num_grid, 2, dtype=xp.float32)
-    rc *= 2 * PI / (n * pixel_size)
+    rc *= 2 * math.pi / (n * pixel_size)
     return rc
 
 
@@ -329,13 +325,13 @@ def paganin_filter_tomopy(
             " please provide a stack of 2D projections."
         )
 
-    dz_orig, dy_orig, dx_orig = xp.shape(tomo)
+    dz_orig, dy_orig, dx_orig = tomo.shape
 
     # Perform padding to the power of 2 as FFT is O(n*log(n)) complexity
     # TODO: adding other options of padding?
     padded_tomo, pad_tup = _pad_projections_to_second_power(tomo)
 
-    dz, dy, dx = xp.shape(padded_tomo)
+    dz, dy, dx = padded_tomo.shape
 
     # 3D FFT of tomo data
     padded_tomo = xp.asarray(padded_tomo, dtype=xp.complex64)
@@ -421,4 +417,4 @@ def _pad_projections_to_second_power(tomo: xp.ndarray) -> Union[xp.ndarray, tupl
 
 def _paganin_filter_factor2(energy, dist, alpha, w2):
     # Alpha represents the ratio of delta/beta.
-    return 1 / (_wavelength(energy) * dist * w2 / (4 * PI) + alpha)
+    return 1 / (_wavelength(energy) * dist * w2 / (4 * math.pi) + alpha)

From 4890d5b6e114b07c52cedde91268265381adeab2 Mon Sep 17 00:00:00 2001
From: dkazanc <dkazanc@hotmail.com>
Date: Wed, 1 May 2024 12:11:30 +0100
Subject: [PATCH 23/36] phase removed, rotation added

---
 httomolibgpu/__init__.py       |  4 ++--
 httomolibgpu/recon/rotation.py | 21 ++++++++++++++-------
 2 files changed, 16 insertions(+), 9 deletions(-)

diff --git a/httomolibgpu/__init__.py b/httomolibgpu/__init__.py
index 02f6bd89..c6869f2a 100644
--- a/httomolibgpu/__init__.py
+++ b/httomolibgpu/__init__.py
@@ -3,11 +3,11 @@
 from httomolibgpu.misc.rescale import rescale_to_int
 from httomolibgpu.prep.alignment import distortion_correction_proj_discorpy
 from httomolibgpu.prep.normalize import normalize
-from httomolibgpu.prep.phase import paganin_filter_savu, paganin_filter_tomopy
+#from httomolibgpu.prep.phase import paganin_filter_savu, paganin_filter_tomopy
 from httomolibgpu.prep.stripe import (
     remove_stripe_based_sorting,
     remove_stripe_ti,
     remove_all_stripe,
 )
 from httomolibgpu.recon.algorithm import FBP, SIRT, CGLS
-# from httomolibgpu.recon.rotation import find_center_vo, find_center_360, find_center_pc
+from httomolibgpu.recon.rotation import find_center_vo, find_center_360, find_center_pc
diff --git a/httomolibgpu/recon/rotation.py b/httomolibgpu/recon/rotation.py
index edfea414..9df596f4 100644
--- a/httomolibgpu/recon/rotation.py
+++ b/httomolibgpu/recon/rotation.py
@@ -20,13 +20,10 @@
 # ---------------------------------------------------------------------------
 """Modules for finding the axis of rotation"""
 
-import numpy as xp
 import numpy as np
-
 cupy_run = False
 try:
     import cupy as xp
-    import cupyx
 
     try:
         xp.cuda.Device(0).compute_capability
@@ -45,10 +42,20 @@
 if cupy_run:
     from httomolibgpu.cuda_kernels import load_cuda_module
     from cupyx.scipy.ndimage import shift, gaussian_filter
-    from cucim.skimage.registration import phase_cross_correlation
+    from cupyx.scipy.fftpack import get_fft_plan
+    from cupyx.scipy.fft import rfft2
 else:
     from scipy.ndimage import shift, gaussian_filter
-    from skimage.registration import phase_cross_correlation
+    from scipy.fft import fftfreq as get_fft_plan # get_fft_plan doesn't exist in scipyfft
+    from scipy.fft import rfft2
+
+try:
+    from cucim.skimage.registration import phase_cross_correlation
+except ImportError:
+    print(
+        "Cucim library of Rapidsai is a required dependency for find_center_pc module, please install"
+    )
+    from skimage.registration import phase_cross_correlation  
 
 __all__ = [
     "find_center_vo",
@@ -295,7 +302,7 @@ def _calculate_metric(list_shift, sino1, sino2, sino3, mask, out):
     mat = xp.empty((chunks[0], na1 + na2, sino2.shape[1]), dtype=xp.float32)
     mat[:, :na1, :] = sino1
     # explicitly create FFT plan here, so it's not cached and clearly re-used
-    plan = cupyx.scipy.fftpack.get_fft_plan(
+    plan = get_fft_plan(
         mat, mat.shape[-2:], axes=(1, 2), value_type="R2C"
     )
 
@@ -340,7 +347,7 @@ def _calculate_metric(list_shift, sino1, sino2, sino3, mask, out):
         # stack and transform
         # (we do the full sized mat FFT, even though the last chunk may be smaller, to
         # make sure we can re-use the same FFT plan as before)
-        mat_freq = cupyx.scipy.fft.rfft2(mat, axes=(1, 2), norm=None, plan=plan)
+        mat_freq = rfft2(mat, axes=(1, 2), norm=None, plan=plan)
         masked_sum_abs_kernel(
             mat_freq[:size, :, :], mask, out=out[start_idx:stop_idx], axis=(1, 2)
         )

From 4762e62ddcf6ada91a949a5b0c66c4535001bcc0 Mon Sep 17 00:00:00 2001
From: dkazanc <dkazanc@hotmail.com>
Date: Wed, 1 May 2024 12:39:29 +0100
Subject: [PATCH 24/36] corr1

---
 httomolibgpu/__init__.py       |    3 +-
 httomolibgpu/recon/rotation.py | 1317 ++++++++++++++++----------------
 2 files changed, 660 insertions(+), 660 deletions(-)

diff --git a/httomolibgpu/__init__.py b/httomolibgpu/__init__.py
index c6869f2a..5ce9fcef 100644
--- a/httomolibgpu/__init__.py
+++ b/httomolibgpu/__init__.py
@@ -10,4 +10,5 @@
     remove_all_stripe,
 )
 from httomolibgpu.recon.algorithm import FBP, SIRT, CGLS
-from httomolibgpu.recon.rotation import find_center_vo, find_center_360, find_center_pc
+#from httomolibgpu.recon.rotation import find_center_vo, find_center_360, find_center_pc
+from httomolibgpu.recon.rotation import find_center_pc
diff --git a/httomolibgpu/recon/rotation.py b/httomolibgpu/recon/rotation.py
index 9df596f4..a58ef146 100644
--- a/httomolibgpu/recon/rotation.py
+++ b/httomolibgpu/recon/rotation.py
@@ -37,7 +37,7 @@
 
 import nvtx
 import math
-from typing import List, Literal, Optional, Tuple
+from typing import List, Literal, Optional, Tuple, Union
 
 if cupy_run:
     from httomolibgpu.cuda_kernels import load_cuda_module
@@ -58,672 +58,671 @@
     from skimage.registration import phase_cross_correlation  
 
 __all__ = [
-    "find_center_vo",
-    "find_center_360",
+    # "find_center_vo",
+    # "find_center_360",
     "find_center_pc",
 ]
 
-
-# %%%%%%%%%%%%%%%%%%%%%%%%%find_center_vo%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-@nvtx.annotate()
-def find_center_vo(
-    data: xp.ndarray,
-    ind: Optional[int] = None,
-    smin: int = -50,
-    smax: int = 50,
-    srad: float = 6.0,
-    step: float = 0.25,
-    ratio: float = 0.5,
-    drop: int = 20,
-) -> float:
-    """
-    Find rotation axis location using Nghia Vo's method. See the paper
-    https://opg.optica.org/oe/fulltext.cfm?uri=oe-22-16-19078&id=297315
-
-    Parameters
-    ----------
-    data : cp.ndarray
-        3D tomographic data or a 2D sinogram as a CuPy array.
-    ind : int, optional
-        Index of the slice to be used to estimate the CoR.
-    smin : int, optional
-        Coarse search radius. Reference to the horizontal center of
-        the sinogram.
-    smax : int, optional
-        Coarse search radius. Reference to the horizontal center of
-        the sinogram.
-    srad : float, optional
-        Fine search radius.
-    step : float, optional
-        Step of fine searching.
-    ratio : float, optional
-        The ratio between the FOV of the camera and the size of object.
-        It's used to generate the mask.
-    drop : int, optional
-        Drop lines around vertical center of the mask.
-
-    Returns
-    -------
-    float
-        Rotation axis location.
-    """
-
-    if data.ndim == 2:
-        data = xp.expand_dims(data, 1)
-        ind = 0
-
-    height = data.shape[1]
-
-    if ind is None:
-        ind = height // 2
-        if height > 10:
-            _sino = xp.mean(data[:, ind - 5 : ind + 5, :], axis=1)
-        else:
-            _sino = data[:, ind, :]
-    else:
-        _sino = data[:, ind, :]
-
-    with nvtx.annotate("gaussian_filter_1", color="green"):
-        _sino_cs = gaussian_filter(_sino, (3, 1), mode="reflect")
-    with nvtx.annotate("gaussian_filter_2", color="green"):
-        _sino_fs = gaussian_filter(_sino, (2, 2), mode="reflect")
-
-    if _sino.shape[0] * _sino.shape[1] > 4e6:
-        # data is large, so downsample it before performing search for
-        # centre of rotation
-        _sino_coarse = _downsample(_sino_cs, 2, 1)
-        init_cen = _search_coarse(_sino_coarse, smin / 4.0, smax / 4.0, ratio, drop)
-        fine_cen = _search_fine(_sino_fs, srad, step, init_cen * 4.0, ratio, drop)
-    else:
-        init_cen = _search_coarse(_sino_cs, smin, smax, ratio, drop)
-        fine_cen = _search_fine(_sino_fs, srad, step, init_cen, ratio, drop)
-
-    return xp.asnumpy(fine_cen)
-
-
-@nvtx.annotate()
-def _search_coarse(sino, smin, smax, ratio, drop):
-    (nrow, ncol) = sino.shape
-    flip_sino = xp.ascontiguousarray(xp.fliplr(sino))
-    comp_sino = xp.ascontiguousarray(xp.flipud(sino))
-    mask = _create_mask(2 * nrow, ncol, 0.5 * ratio * ncol, drop)
-
-    cen_fliplr = (ncol - 1.0) / 2.0
-    smin_clip_val = max(min(smin + cen_fliplr, ncol - 1), 0)
-    smin = smin_clip_val - cen_fliplr
-    smax_clip_val = max(min(smax + cen_fliplr, ncol - 1), 0)
-    smax = smax_clip_val - cen_fliplr
-    start_cor = ncol // 2 + smin
-    stop_cor = ncol // 2 + smax
-    list_cor = xp.arange(start_cor, stop_cor + 0.5, 0.5, dtype=xp.float32)
-    list_shift = 2.0 * (list_cor - cen_fliplr)
-    list_metric = xp.empty(list_shift.shape, dtype=xp.float32)
-    _calculate_metric(list_shift, sino, flip_sino, comp_sino, mask, list_metric)
-
-    minpos = xp.argmin(list_metric)
-    if minpos == 0:
-        print("WARNING!!!Global minimum is out of searching range")
-        print(f"Please extend smin: {smin}")
-    if minpos == len(list_metric) - 1:
-        print("WARNING!!!Global minimum is out of searching range")
-        print(f"Please extend smax: {smax}")
-    cor = list_cor[minpos]
-    return cor
-
-
-@nvtx.annotate()
-def _search_fine(sino, srad, step, init_cen, ratio, drop):
-    (nrow, ncol) = sino.shape
-
-    flip_sino = xp.ascontiguousarray(xp.fliplr(sino))
-    comp_sino = xp.ascontiguousarray(xp.flipud(sino))
-    mask = _create_mask(2 * nrow, ncol, 0.5 * ratio * ncol, drop)
-
-    cen_fliplr = (ncol - 1.0) / 2.0
-    srad = max(min(abs(float(srad)), ncol / 4.0), 1.0)
-    step = max(min(abs(step), srad), 0.1)
-    init_cen = max(min(init_cen, ncol - srad - 1), srad)
-    list_cor = init_cen + xp.arange(-srad, srad + step, step, dtype=np.float32)
-    list_shift = 2.0 * (list_cor - cen_fliplr)
-    list_metric = xp.empty(list_shift.shape, dtype="float32")
-
-    _calculate_metric(list_shift, sino, flip_sino, comp_sino, mask, out=list_metric)
-    cor = list_cor[xp.argmin(list_metric)]
-    return cor
-
-
-@nvtx.annotate()
-def _create_mask(nrow, ncol, radius, drop):
-    du = 1.0 / ncol
-    dv = (nrow - 1.0) / (nrow * 2.0 * np.pi)
-    cen_row = int(math.ceil(nrow / 2.0) - 1)
-    cen_col = int(math.ceil(ncol / 2.0) - 1)
-    drop = min([drop, int(math.ceil(0.05 * nrow))])
-
-    block_x = 128
-    block_y = 1
-    block_dims = (block_x, block_y)
-    grid_x = (ncol // 2 + 1 + block_x - 1) // block_x
-    grid_y = nrow
-    grid_dims = (grid_x, grid_y)
-    mask = xp.empty((nrow, ncol // 2 + 1), dtype="uint16")
-    params = (
-        ncol,
-        nrow,
-        cen_col,
-        cen_row,
-        xp.float32(du),
-        xp.float32(dv),
-        xp.float32(radius),
-        xp.float32(drop),
-        mask,
-    )
-    module = load_cuda_module("generate_mask")
-    kernel = module.get_function("generate_mask")
-    kernel(grid_dims, block_dims, params)
-    return mask
-
-
-def round_up(x: float) -> int:
-    if x >= 0.0:
-        return int(math.ceil(x))
-    else:
-        return int(math.floor(x))
-
-
-def _get_available_gpu_memory() -> int:
-    dev = xp.cuda.Device()
-    # first, let's make some space
-    xp.get_default_memory_pool().free_all_blocks()
-    cache = xp.fft.config.get_plan_cache()
-    cache.clear()
-    available_memory = dev.mem_info[0] + xp.get_default_memory_pool().free_bytes()
-    return int(available_memory * 0.9)  # 10% safety margin
-
-
-def _calculate_chunks(
-    nshifts: int, shift_size: int, available_memory: Optional[int] = None
-) -> List[int]:
-    if available_memory is None:
-        available_memory = _get_available_gpu_memory()
-
-    available_memory -= shift_size
-    freq_domain_size = (
-        shift_size  # it needs only half (RFFT), but complex64, so it's the same
-    )
-    fft_plan_size = freq_domain_size
-    size_per_shift = fft_plan_size + freq_domain_size + shift_size
-    nshift_max = available_memory // size_per_shift
-    assert nshift_max > 0, "Not enough memory to process"
-    num_chunks = int(np.ceil(nshifts / nshift_max))
-    chunk_size = int(np.ceil(nshifts / num_chunks))
-    chunks = [chunk_size] * (num_chunks - 1)
-    stop_idx = list(np.cumsum(chunks))
-    stop_idx.append(nshifts)
-    return stop_idx
-
-
-@nvtx.annotate()
-def _calculate_metric(list_shift, sino1, sino2, sino3, mask, out):
-    # this tries to simplify - if shift_col is integer, no need to spline interpolate
-    assert list_shift.dtype == xp.float32, "shifts must be single precision floats"
-    assert sino1.dtype == xp.float32, "sino1 must be float32"
-    assert sino2.dtype == xp.float32, "sino1 must be float32"
-    assert sino3.dtype == xp.float32, "sino1 must be float32"
-    assert out.dtype == xp.float32, "sino1 must be float32"
-    assert sino2.flags["C_CONTIGUOUS"], "sino2 must be C-contiguous"
-    assert sino3.flags["C_CONTIGUOUS"], "sino3 must be C-contiguous"
-    assert list_shift.flags["C_CONTIGUOUS"], "list_shift must be C-contiguous"
-    nshifts = list_shift.shape[0]
-    na1 = sino1.shape[0]
-    na2 = sino2.shape[0]
-
-    module = load_cuda_module("center_360_shifts")
-    shift_whole_shifts = module.get_function("shift_whole_shifts")
-    # note: we don't have to calculate the mean here, as we're only looking for minimum metric.
-    # The sum is enough.
-    masked_sum_abs_kernel = xp.ReductionKernel(
-        in_params="complex64 x, uint16 mask",  # input, complex + mask
-        out_params="float32 out",  # output, real
-        map_expr="mask ? abs(x) : 0.0f",
-        reduce_expr="a + b",
-        post_map_expr="out = a",
-        identity="0.0f",
-        reduce_type="float",
-        name="masked_sum_abs",
-    )
-
-    # determine how many shifts we can fit in the available memory
-    # and iterate in chunks
-    chunks = _calculate_chunks(
-        nshifts, (na1 + na2) * sino2.shape[1] * xp.float32().nbytes
-    )
-
-    mat = xp.empty((chunks[0], na1 + na2, sino2.shape[1]), dtype=xp.float32)
-    mat[:, :na1, :] = sino1
-    # explicitly create FFT plan here, so it's not cached and clearly re-used
-    plan = get_fft_plan(
-        mat, mat.shape[-2:], axes=(1, 2), value_type="R2C"
-    )
-
-    for i, stop_idx in enumerate(chunks):
-        if i > 0:
-            # more than one iteration means we're tight on memory, so clear up freed blocks
-            mat_freq = None
-            xp.get_default_memory_pool().free_all_blocks()
-
-        start_idx = 0 if i == 0 else chunks[i - 1]
-        size = stop_idx - start_idx
-
-        # first, handle the integer shifts without spline in a raw kernel,
-        # and shift in the sino3 one accordingly
-        bx = 128
-        gx = (sino3.shape[1] + bx - 1) // bx
-        shift_whole_shifts(
-            grid=(gx, na2, size),  ####
-            block=(bx, 1, 1),
-            args=(
-                sino2,
-                sino3,
-                list_shift[start_idx:stop_idx],
-                mat[:, na1:, :],
-                sino3.shape[1],
-                na1 + na2,
-            ),
-        )
-
-        # now we can only look at the spline shifting, the rest is done
-        list_shift_host = xp.asnumpy(list_shift[start_idx:stop_idx])
-        for i in range(list_shift_host.shape[0]):
-            shift_col = float(list_shift_host[i])
-            if not shift_col.is_integer():
-                shifted = shift(sino2, (0, shift_col), order=3, prefilter=True)
-                shift_int = round_up(shift_col)
-                if shift_int >= 0:
-                    mat[i, na1:, shift_int:] = shifted[:, shift_int:]
-                else:
-                    mat[i, na1:, :shift_int] = shifted[:, :shift_int]
-
-        # stack and transform
-        # (we do the full sized mat FFT, even though the last chunk may be smaller, to
-        # make sure we can re-use the same FFT plan as before)
-        mat_freq = rfft2(mat, axes=(1, 2), norm=None, plan=plan)
-        masked_sum_abs_kernel(
-            mat_freq[:size, :, :], mask, out=out[start_idx:stop_idx], axis=(1, 2)
-        )
-
-
-@nvtx.annotate()
-def _downsample(sino, level, axis):
-    assert sino.dtype == xp.float32, "single precision floating point input required"
-    assert sino.flags["C_CONTIGUOUS"], "list_shift must be C-contiguous"
-
-    dx, dz = sino.shape
-    # Determine the new size, dim, of the downsampled dimension
-    dim = int(sino.shape[axis] / math.pow(2, level))
-    shape = [dx, dz]
-    shape[axis] = dim
-    downsampled_data = xp.empty(shape, dtype="float32")
-
-    block_x = 8
-    block_y = 8
-    block_dims = (block_x, block_y)
-    grid_x = (sino.shape[1] + block_x - 1) // block_x
-    grid_y = (sino.shape[0] + block_y - 1) // block_y
-    grid_dims = (grid_x, grid_y)
-    # 8x8 thread-block, which means 16 "lots" of columns to downsample per
-    # thread-block; 4 bytes per float, so allocate 16*6 = 64 bytes of shared
-    # memeory per thread-block
-    shared_mem_bytes = 64
-    params = (sino, dx, dz, level, downsampled_data)
-    module = load_cuda_module("downsample_sino")
-    kernel = module.get_function("downsample_sino")
-    kernel(grid_dims, block_dims, params, shared_mem=shared_mem_bytes)
-    return downsampled_data
-
-
-##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-
-
-# %%%%%%%%%%%%%%%%%%%%%%%%%find_center_360%%%%%%%%%%%%%%%%%%%%%%%%%
-# --- Center of rotation (COR) estimation method ---#
-@nvtx.annotate()
-def find_center_360(
-    data: xp.ndarray,
-    ind: Optional[int] = None,
-    win_width: int = 10,
-    side: Optional[Literal[0, 1]] = None,
-    denoise: bool = True,
-    norm: bool = False,
-    use_overlap: bool = False,
-) -> Tuple[float, float, Optional[Literal[0, 1]], float]:
-    """
-    Find the center-of-rotation (COR) in a 360-degree scan with offset COR use
-    the method presented in Ref. [1] by Nghia Vo.
-
-    This function supports both numpy and cupy - the implementation is selected
-    by where the input data array resides.
-
-    Parameters
-    ----------
-    data : cp.ndarray
-        3D tomographic data as a Cupy array.
-    ind : int, optional
-        Index of the slice to be used for estimate the CoR and the overlap.
-    win_width : int, optional
-        Window width used for finding the overlap area.
-    side : {None, 0, 1}, optional
-        Overlap size. Only there options: None, 0, or 1. "None" corresponds
-        to fully automated determination. "0" corresponds to the left side.
-        "1" corresponds to the right side.
-    denoise : bool, optional
-        Apply the Gaussian filter if True.
-    norm : bool, optional
-        Apply the normalisation if True.
-    use_overlap : bool, optional
-        Use the combination of images in the overlap area for calculating
-        correlation coefficients if True.
-
-    Returns
-    -------
-    cor : float
-        Center-of-rotation.
-    overlap : float
-        Width of the overlap area between two halves of the sinogram.
-    side : int
-        Overlap side between two halves of the sinogram.
-    overlap_position : float
-        Position of the window in the first image giving the best
-        correlation metric.
-
-    References
-    ----------
-    [1] : https://doi.org/10.1364/OE.418448
-    """
-    if data.ndim != 3:
-        raise ValueError("A 3D array must be provided")
-
-    # this method works with a 360-degree sinogram.
-    if ind is None:
-        _sino = data[:, 0, :]
-    else:
-        _sino = data[:, ind, :]
-
-    (nrow, ncol) = _sino.shape
-    nrow_180 = nrow // 2 + 1
-    sino_top = _sino[0:nrow_180, :]
-    sino_bot = xp.fliplr(_sino[-nrow_180:, :])
-    (overlap, side, overlap_position) = _find_overlap(
-        sino_top, sino_bot, win_width, side, denoise, norm, use_overlap
-    )
-    if side == 0:
-        cor = overlap / 2.0 - 1.0
-    else:
-        cor = ncol - overlap / 2.0 - 1.0
-
-    return cor, overlap, side, overlap_position
-
-
-def _find_overlap(
-    mat1, mat2, win_width, side=None, denoise=True, norm=False, use_overlap=False
-):
-    """
-    Find the overlap area and overlap side between two images (Ref. [1]) where
-    the overlap side referring to the first image.
-
-    Parameters
-    ----------
-    mat1 : array_like
-        2D array. Projection image or sinogram image.
-    mat2 :  array_like
-        2D array. Projection image or sinogram image.
-    win_width : int
-        Width of the searching window.
-    side : {None, 0, 1}, optional
-        Only there options: None, 0, or 1. "None" corresponding to fully
-        automated determination. "0" corresponding to the left side. "1"
-        corresponding to the right side.
-    denoise : bool, optional
-        Apply the Gaussian filter if True.
-    norm : bool, optional
-        Apply the normalization if True.
-    use_overlap : bool, optional
-        Use the combination of images in the overlap area for calculating
-        correlation coefficients if True.
-
-    Returns
-    -------
-    overlap : float
-        Width of the overlap area between two images.
-    side : int
-        Overlap side between two images.
-    overlap_position : float
-        Position of the window in the first image giving the best
-        correlation metric.
-
-    """
-    ncol1 = mat1.shape[1]
-    ncol2 = mat2.shape[1]
-    win_width = int(np.clip(win_width, 6, min(ncol1, ncol2) // 2))
-
-    if side == 1:
-        (list_metric, offset) = _search_overlap(
-            mat1,
-            mat2,
-            win_width,
-            side=side,
-            denoise=denoise,
-            norm=norm,
-            use_overlap=use_overlap,
-        )
-        overlap_position = _calculate_curvature(list_metric)[1]
-        overlap_position += offset
-        overlap = ncol1 - overlap_position + win_width // 2
-    elif side == 0:
-        (list_metric, offset) = _search_overlap(
-            mat1,
-            mat2,
-            win_width,
-            side=side,
-            denoise=denoise,
-            norm=norm,
-            use_overlap=use_overlap,
-        )
-        overlap_position = _calculate_curvature(list_metric)[1]
-        overlap_position += offset
-        overlap = overlap_position + win_width // 2
-    else:
-        (list_metric1, offset1) = _search_overlap(
-            mat1,
-            mat2,
-            win_width,
-            side=1,
-            denoise=denoise,
-            norm=norm,
-            use_overlap=use_overlap,
-        )
-        (list_metric2, offset2) = _search_overlap(
-            mat1,
-            mat2,
-            win_width,
-            side=0,
-            denoise=denoise,
-            norm=norm,
-            use_overlap=use_overlap,
-        )
-
-        (curvature1, overlap_position1) = _calculate_curvature(list_metric1)
-        overlap_position1 += offset1
-        (curvature2, overlap_position2) = _calculate_curvature(list_metric2)
-        overlap_position2 += offset2
-
-        if curvature1 > curvature2:
-            side = 1
-            overlap_position = overlap_position1
-            overlap = ncol1 - overlap_position + win_width // 2
-        else:
-            side = 0
-            overlap_position = overlap_position2
-            overlap = overlap_position + win_width // 2
-
-    return overlap, side, overlap_position
-
-
-@nvtx.annotate()
-def _search_overlap(
-    mat1, mat2, win_width, side, denoise=True, norm=False, use_overlap=False
-):
-    """
-    Calculate the correlation metrics between a rectangular region, defined
-    by the window width, on the utmost left/right side of image 2 and the
-    same size region in image 1 where the region is slided across image 1.
-
-    Parameters
-    ----------
-    mat1 : array_like
-        2D array. Projection image or sinogram image.
-    mat2 : array_like
-        2D array. Projection image or sinogram image.
-    win_width : int
-        Width of the searching window.
-    side : {0, 1}
-        Only two options: 0 or 1. It is used to indicate the overlap side
-        respects to image 1. "0" corresponds to the left side. "1" corresponds
-        to the right side.
-    denoise : bool, optional
-        Apply the Gaussian filter if True.
-    norm : bool, optional
-        Apply the normalization if True.
-    use_overlap : bool, optional
-        Use the combination of images in the overlap area for calculating
-        correlation coefficients if True.
-
-    Returns
-    -------
-    list_metric : array_like
-        1D array. List of the correlation metrics.
-    offset : int
-        Initial position of the searching window where the position
-        corresponds to the center of the window.
-    """
-    if denoise is True:
-        # note: the filtering makes the output contiguous
-        with nvtx.annotate("denoise_filter", color="green"):
-            mat1 = gaussian_filter(mat1, (2, 2), mode="reflect")
-            mat2 = gaussian_filter(mat2, (2, 2), mode="reflect")
-    else:
-        mat1 = xp.ascontiguousarray(mat1, dtype=xp.float32)
-        mat2 = xp.ascontiguousarray(mat2, dtype=xp.float32)
-
-    (nrow1, ncol1) = mat1.shape
-    (nrow2, ncol2) = mat2.shape
-
-    if nrow1 != nrow2:
-        raise ValueError("Two images are not at the same height!!!")
-
-    win_width = int(np.clip(win_width, 6, min(ncol1, ncol2) // 2 - 1))
-    offset = win_width // 2
-    win_width = 2 * offset  # Make it even
-
-    list_metric = _calc_metrics(mat1, mat2, win_width, side, use_overlap, norm)
-
-    min_metric = xp.min(list_metric)
-    if min_metric != 0.0:
-        list_metric /= min_metric
-
-    return list_metric, offset
-
-
-_calc_metrics_module = load_cuda_module(
-    "calc_metrics",
-    name_expressions=[
-        "calc_metrics_kernel<false, false>",
-        "calc_metrics_kernel<true, false>",
-        "calc_metrics_kernel<false, true>",
-        "calc_metrics_kernel<true, true>",
-    ],
-    options=("--maxrregcount=32",),
-)
-
-
-@nvtx.annotate()
-def _calc_metrics(mat1, mat2, win_width, side, use_overlap, norm):
-    assert mat1.dtype == xp.float32, "only float32 supported"
-    assert mat2.dtype == xp.float32, "only float32 supported"
-    assert mat1.shape[0] == mat2.shape[0]
-    assert mat1.flags.c_contiguous, "only contiguos arrays supported"
-    assert mat2.flags.c_contiguous, "only contiguos arrays supported"
-
-    num_pos = mat1.shape[1] - win_width
-    list_metric = xp.empty(num_pos, dtype=xp.float32)
-
-    args = (
-        mat1,
-        np.int32(mat1.strides[0] / mat1.strides[1]),
-        mat2,
-        np.int32(mat2.strides[0] / mat2.strides[1]),
-        np.int32(win_width),
-        np.int32(mat1.shape[0]),
-        np.int32(side),
-        list_metric,
-    )
-    block = (128, 1, 1)
-    grid = (1, np.int32(num_pos), 1)
-    smem = block[0] * 4 * 6 if use_overlap else block[0] * 4 * 3
-    bool2str = lambda x: "true" if x is True else "false"
-    calc_metrics = _calc_metrics_module.get_function(
-        f"calc_metrics_kernel<{bool2str(norm)}, {bool2str(use_overlap)}>"
-    )
-    calc_metrics(grid=grid, block=block, args=args, shared_mem=smem)
-
-    return list_metric
-
-
-@nvtx.annotate()
-def _calculate_curvature(list_metric):
-    """
-    Calculate the curvature of a fitted curve going through the minimum
-    value of a metric list.
-
-    Parameters
-    ----------
-    list_metric : array_like
-        1D array. List of metrics.
-
-    Returns
-    -------
-    curvature : float
-        Quadratic coefficient of the parabola fitting.
-    min_pos : float
-        Position of the minimum value with sub-pixel accuracy.
-    """
-    radi = 2
-    num_metric = list_metric.size
-    min_metric_idx = int(xp.argmin(list_metric))
-    min_pos = int(np.clip(min_metric_idx, radi, num_metric - radi - 1))
-
-    # work mostly on CPU here - we have very small arrays here
-    list1 = xp.asnumpy(list_metric[min_pos - radi : min_pos + radi + 1])
-    afact1 = np.polyfit(np.arange(0, 2 * radi + 1), list1, 2)[0]
-    list2 = xp.asnumpy(list_metric[min_pos - 1 : min_pos + 2])
-    (afact2, bfact2, _) = np.polyfit(np.arange(min_pos - 1, min_pos + 2), list2, 2)
-
-    curvature = np.abs(afact1)
-    if afact2 != 0.0:
-        num = -bfact2 / (2 * afact2)
-        if (num >= min_pos - 1) and (num <= min_pos + 1):
-            min_pos = num
-
-    return curvature, np.float32(min_pos)
-
-
-# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# # %%%%%%%%%%%%%%%%%%%%%%%%%find_center_vo%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# @nvtx.annotate()
+# def find_center_vo(
+#     data: xp.ndarray,
+#     ind: Optional[int] = None,
+#     smin: int = -50,
+#     smax: int = 50,
+#     srad: float = 6.0,
+#     step: float = 0.25,
+#     ratio: float = 0.5,
+#     drop: int = 20,
+# ) -> float:
+#     """
+#     Find rotation axis location using Nghia Vo's method. See the paper
+#     https://opg.optica.org/oe/fulltext.cfm?uri=oe-22-16-19078&id=297315
+
+#     Parameters
+#     ----------
+#     data : cp.ndarray
+#         3D tomographic data or a 2D sinogram as a CuPy array.
+#     ind : int, optional
+#         Index of the slice to be used to estimate the CoR.
+#     smin : int, optional
+#         Coarse search radius. Reference to the horizontal center of
+#         the sinogram.
+#     smax : int, optional
+#         Coarse search radius. Reference to the horizontal center of
+#         the sinogram.
+#     srad : float, optional
+#         Fine search radius.
+#     step : float, optional
+#         Step of fine searching.
+#     ratio : float, optional
+#         The ratio between the FOV of the camera and the size of object.
+#         It's used to generate the mask.
+#     drop : int, optional
+#         Drop lines around vertical center of the mask.
+
+#     Returns
+#     -------
+#     float
+#         Rotation axis location.
+#     """
+
+#     if data.ndim == 2:
+#         data = xp.expand_dims(data, 1)
+#         ind = 0
+
+#     height = data.shape[1]
+
+#     if ind is None:
+#         ind = height // 2
+#         if height > 10:
+#             _sino = xp.mean(data[:, ind - 5 : ind + 5, :], axis=1)
+#         else:
+#             _sino = data[:, ind, :]
+#     else:
+#         _sino = data[:, ind, :]
+
+#     with nvtx.annotate("gaussian_filter_1", color="green"):
+#         _sino_cs = gaussian_filter(_sino, (3, 1), mode="reflect")
+#     with nvtx.annotate("gaussian_filter_2", color="green"):
+#         _sino_fs = gaussian_filter(_sino, (2, 2), mode="reflect")
+
+#     if _sino.shape[0] * _sino.shape[1] > 4e6:
+#         # data is large, so downsample it before performing search for
+#         # centre of rotation
+#         _sino_coarse = _downsample(_sino_cs, 2, 1)
+#         init_cen = _search_coarse(_sino_coarse, smin / 4.0, smax / 4.0, ratio, drop)
+#         fine_cen = _search_fine(_sino_fs, srad, step, init_cen * 4.0, ratio, drop)
+#     else:
+#         init_cen = _search_coarse(_sino_cs, smin, smax, ratio, drop)
+#         fine_cen = _search_fine(_sino_fs, srad, step, init_cen, ratio, drop)
+
+#     return xp.asnumpy(fine_cen)
+
+
+# @nvtx.annotate()
+# def _search_coarse(sino, smin, smax, ratio, drop):
+#     (nrow, ncol) = sino.shape
+#     flip_sino = xp.ascontiguousarray(xp.fliplr(sino))
+#     comp_sino = xp.ascontiguousarray(xp.flipud(sino))
+#     mask = _create_mask(2 * nrow, ncol, 0.5 * ratio * ncol, drop)
+
+#     cen_fliplr = (ncol - 1.0) / 2.0
+#     smin_clip_val = max(min(smin + cen_fliplr, ncol - 1), 0)
+#     smin = smin_clip_val - cen_fliplr
+#     smax_clip_val = max(min(smax + cen_fliplr, ncol - 1), 0)
+#     smax = smax_clip_val - cen_fliplr
+#     start_cor = ncol // 2 + smin
+#     stop_cor = ncol // 2 + smax
+#     list_cor = xp.arange(start_cor, stop_cor + 0.5, 0.5, dtype=xp.float32)
+#     list_shift = 2.0 * (list_cor - cen_fliplr)
+#     list_metric = xp.empty(list_shift.shape, dtype=xp.float32)
+#     _calculate_metric(list_shift, sino, flip_sino, comp_sino, mask, list_metric)
+
+#     minpos = xp.argmin(list_metric)
+#     if minpos == 0:
+#         print("WARNING!!!Global minimum is out of searching range")
+#         print(f"Please extend smin: {smin}")
+#     if minpos == len(list_metric) - 1:
+#         print("WARNING!!!Global minimum is out of searching range")
+#         print(f"Please extend smax: {smax}")
+#     cor = list_cor[minpos]
+#     return cor
+
+
+# @nvtx.annotate()
+# def _search_fine(sino, srad, step, init_cen, ratio, drop):
+#     (nrow, ncol) = sino.shape
+
+#     flip_sino = xp.ascontiguousarray(xp.fliplr(sino))
+#     comp_sino = xp.ascontiguousarray(xp.flipud(sino))
+#     mask = _create_mask(2 * nrow, ncol, 0.5 * ratio * ncol, drop)
+
+#     cen_fliplr = (ncol - 1.0) / 2.0
+#     srad = max(min(abs(float(srad)), ncol / 4.0), 1.0)
+#     step = max(min(abs(step), srad), 0.1)
+#     init_cen = max(min(init_cen, ncol - srad - 1), srad)
+#     list_cor = init_cen + xp.arange(-srad, srad + step, step, dtype=np.float32)
+#     list_shift = 2.0 * (list_cor - cen_fliplr)
+#     list_metric = xp.empty(list_shift.shape, dtype="float32")
+
+#     _calculate_metric(list_shift, sino, flip_sino, comp_sino, mask, out=list_metric)
+#     cor = list_cor[xp.argmin(list_metric)]
+#     return cor
+
+
+# @nvtx.annotate()
+# def _create_mask(nrow, ncol, radius, drop):
+#     du = 1.0 / ncol
+#     dv = (nrow - 1.0) / (nrow * 2.0 * np.pi)
+#     cen_row = int(math.ceil(nrow / 2.0) - 1)
+#     cen_col = int(math.ceil(ncol / 2.0) - 1)
+#     drop = min([drop, int(math.ceil(0.05 * nrow))])
+
+#     block_x = 128
+#     block_y = 1
+#     block_dims = (block_x, block_y)
+#     grid_x = (ncol // 2 + 1 + block_x - 1) // block_x
+#     grid_y = nrow
+#     grid_dims = (grid_x, grid_y)
+#     mask = xp.empty((nrow, ncol // 2 + 1), dtype="uint16")
+#     params = (
+#         ncol,
+#         nrow,
+#         cen_col,
+#         cen_row,
+#         xp.float32(du),
+#         xp.float32(dv),
+#         xp.float32(radius),
+#         xp.float32(drop),
+#         mask,
+#     )
+#     module = load_cuda_module("generate_mask")
+#     kernel = module.get_function("generate_mask")
+#     kernel(grid_dims, block_dims, params)
+#     return mask
+
+
+# def round_up(x: float) -> int:
+#     if x >= 0.0:
+#         return int(math.ceil(x))
+#     else:
+#         return int(math.floor(x))
+
+
+# def _get_available_gpu_memory() -> int:
+#     dev = xp.cuda.Device()
+#     # first, let's make some space
+#     xp.get_default_memory_pool().free_all_blocks()
+#     cache = xp.fft.config.get_plan_cache()
+#     cache.clear()
+#     available_memory = dev.mem_info[0] + xp.get_default_memory_pool().free_bytes()
+#     return int(available_memory * 0.9)  # 10% safety margin
+
+
+# def _calculate_chunks(
+#     nshifts: int, shift_size: int, available_memory: Optional[int] = None
+# ) -> List[int]:
+#     if available_memory is None:
+#         available_memory = _get_available_gpu_memory()
+
+#     available_memory -= shift_size
+#     freq_domain_size = (
+#         shift_size  # it needs only half (RFFT), but complex64, so it's the same
+#     )
+#     fft_plan_size = freq_domain_size
+#     size_per_shift = fft_plan_size + freq_domain_size + shift_size
+#     nshift_max = available_memory // size_per_shift
+#     assert nshift_max > 0, "Not enough memory to process"
+#     num_chunks = int(np.ceil(nshifts / nshift_max))
+#     chunk_size = int(np.ceil(nshifts / num_chunks))
+#     chunks = [chunk_size] * (num_chunks - 1)
+#     stop_idx = list(np.cumsum(chunks))
+#     stop_idx.append(nshifts)
+#     return stop_idx
+
+
+# @nvtx.annotate()
+# def _calculate_metric(list_shift, sino1, sino2, sino3, mask, out):
+#     # this tries to simplify - if shift_col is integer, no need to spline interpolate
+#     assert list_shift.dtype == xp.float32, "shifts must be single precision floats"
+#     assert sino1.dtype == xp.float32, "sino1 must be float32"
+#     assert sino2.dtype == xp.float32, "sino1 must be float32"
+#     assert sino3.dtype == xp.float32, "sino1 must be float32"
+#     assert out.dtype == xp.float32, "sino1 must be float32"
+#     assert sino2.flags["C_CONTIGUOUS"], "sino2 must be C-contiguous"
+#     assert sino3.flags["C_CONTIGUOUS"], "sino3 must be C-contiguous"
+#     assert list_shift.flags["C_CONTIGUOUS"], "list_shift must be C-contiguous"
+#     nshifts = list_shift.shape[0]
+#     na1 = sino1.shape[0]
+#     na2 = sino2.shape[0]
+
+#     module = load_cuda_module("center_360_shifts")
+#     shift_whole_shifts = module.get_function("shift_whole_shifts")
+#     # note: we don't have to calculate the mean here, as we're only looking for minimum metric.
+#     # The sum is enough.
+#     masked_sum_abs_kernel = xp.ReductionKernel(
+#         in_params="complex64 x, uint16 mask",  # input, complex + mask
+#         out_params="float32 out",  # output, real
+#         map_expr="mask ? abs(x) : 0.0f",
+#         reduce_expr="a + b",
+#         post_map_expr="out = a",
+#         identity="0.0f",
+#         reduce_type="float",
+#         name="masked_sum_abs",
+#     )
+
+#     # determine how many shifts we can fit in the available memory
+#     # and iterate in chunks
+#     chunks = _calculate_chunks(
+#         nshifts, (na1 + na2) * sino2.shape[1] * xp.float32().nbytes
+#     )
+
+#     mat = xp.empty((chunks[0], na1 + na2, sino2.shape[1]), dtype=xp.float32)
+#     mat[:, :na1, :] = sino1
+#     # explicitly create FFT plan here, so it's not cached and clearly re-used
+#     plan = get_fft_plan(
+#         mat, mat.shape[-2:], axes=(1, 2), value_type="R2C"
+#     )
+
+#     for i, stop_idx in enumerate(chunks):
+#         if i > 0:
+#             # more than one iteration means we're tight on memory, so clear up freed blocks
+#             mat_freq = None
+#             xp.get_default_memory_pool().free_all_blocks()
+
+#         start_idx = 0 if i == 0 else chunks[i - 1]
+#         size = stop_idx - start_idx
+
+#         # first, handle the integer shifts without spline in a raw kernel,
+#         # and shift in the sino3 one accordingly
+#         bx = 128
+#         gx = (sino3.shape[1] + bx - 1) // bx
+#         shift_whole_shifts(
+#             grid=(gx, na2, size),  ####
+#             block=(bx, 1, 1),
+#             args=(
+#                 sino2,
+#                 sino3,
+#                 list_shift[start_idx:stop_idx],
+#                 mat[:, na1:, :],
+#                 sino3.shape[1],
+#                 na1 + na2,
+#             ),
+#         )
+
+#         # now we can only look at the spline shifting, the rest is done
+#         list_shift_host = xp.asnumpy(list_shift[start_idx:stop_idx])
+#         for i in range(list_shift_host.shape[0]):
+#             shift_col = float(list_shift_host[i])
+#             if not shift_col.is_integer():
+#                 shifted = shift(sino2, (0, shift_col), order=3, prefilter=True)
+#                 shift_int = round_up(shift_col)
+#                 if shift_int >= 0:
+#                     mat[i, na1:, shift_int:] = shifted[:, shift_int:]
+#                 else:
+#                     mat[i, na1:, :shift_int] = shifted[:, :shift_int]
+
+#         # stack and transform
+#         # (we do the full sized mat FFT, even though the last chunk may be smaller, to
+#         # make sure we can re-use the same FFT plan as before)
+#         mat_freq = rfft2(mat, axes=(1, 2), norm=None, plan=plan)
+#         masked_sum_abs_kernel(
+#             mat_freq[:size, :, :], mask, out=out[start_idx:stop_idx], axis=(1, 2)
+#         )
+
+
+# @nvtx.annotate()
+# def _downsample(sino, level, axis):
+#     assert sino.dtype == xp.float32, "single precision floating point input required"
+#     assert sino.flags["C_CONTIGUOUS"], "list_shift must be C-contiguous"
+
+#     dx, dz = sino.shape
+#     # Determine the new size, dim, of the downsampled dimension
+#     dim = int(sino.shape[axis] / math.pow(2, level))
+#     shape = [dx, dz]
+#     shape[axis] = dim
+#     downsampled_data = xp.empty(shape, dtype="float32")
+
+#     block_x = 8
+#     block_y = 8
+#     block_dims = (block_x, block_y)
+#     grid_x = (sino.shape[1] + block_x - 1) // block_x
+#     grid_y = (sino.shape[0] + block_y - 1) // block_y
+#     grid_dims = (grid_x, grid_y)
+#     # 8x8 thread-block, which means 16 "lots" of columns to downsample per
+#     # thread-block; 4 bytes per float, so allocate 16*6 = 64 bytes of shared
+#     # memeory per thread-block
+#     shared_mem_bytes = 64
+#     params = (sino, dx, dz, level, downsampled_data)
+#     module = load_cuda_module("downsample_sino")
+#     kernel = module.get_function("downsample_sino")
+#     kernel(grid_dims, block_dims, params, shared_mem=shared_mem_bytes)
+#     return downsampled_data
+
+
+# ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+
+# # %%%%%%%%%%%%%%%%%%%%%%%%%find_center_360%%%%%%%%%%%%%%%%%%%%%%%%%
+# # --- Center of rotation (COR) estimation method ---#
+# @nvtx.annotate()
+# def find_center_360(
+#     data: xp.ndarray,
+#     ind: Optional[int] = None,
+#     win_width: int = 10,
+#     side: Optional[Literal[0, 1]] = None,
+#     denoise: bool = True,
+#     norm: bool = False,
+#     use_overlap: bool = False,
+# ) -> Tuple[float, float, Optional[Literal[0, 1]], float]:
+#     """
+#     Find the center-of-rotation (COR) in a 360-degree scan with offset COR use
+#     the method presented in Ref. [1] by Nghia Vo.
+
+#     This function supports both numpy and cupy - the implementation is selected
+#     by where the input data array resides.
+
+#     Parameters
+#     ----------
+#     data : cp.ndarray
+#         3D tomographic data as a Cupy array.
+#     ind : int, optional
+#         Index of the slice to be used for estimate the CoR and the overlap.
+#     win_width : int, optional
+#         Window width used for finding the overlap area.
+#     side : {None, 0, 1}, optional
+#         Overlap size. Only there options: None, 0, or 1. "None" corresponds
+#         to fully automated determination. "0" corresponds to the left side.
+#         "1" corresponds to the right side.
+#     denoise : bool, optional
+#         Apply the Gaussian filter if True.
+#     norm : bool, optional
+#         Apply the normalisation if True.
+#     use_overlap : bool, optional
+#         Use the combination of images in the overlap area for calculating
+#         correlation coefficients if True.
+
+#     Returns
+#     -------
+#     cor : float
+#         Center-of-rotation.
+#     overlap : float
+#         Width of the overlap area between two halves of the sinogram.
+#     side : int
+#         Overlap side between two halves of the sinogram.
+#     overlap_position : float
+#         Position of the window in the first image giving the best
+#         correlation metric.
+
+#     References
+#     ----------
+#     [1] : https://doi.org/10.1364/OE.418448
+#     """
+#     if data.ndim != 3:
+#         raise ValueError("A 3D array must be provided")
+
+#     # this method works with a 360-degree sinogram.
+#     if ind is None:
+#         _sino = data[:, 0, :]
+#     else:
+#         _sino = data[:, ind, :]
+
+#     (nrow, ncol) = _sino.shape
+#     nrow_180 = nrow // 2 + 1
+#     sino_top = _sino[0:nrow_180, :]
+#     sino_bot = xp.fliplr(_sino[-nrow_180:, :])
+#     (overlap, side, overlap_position) = _find_overlap(
+#         sino_top, sino_bot, win_width, side, denoise, norm, use_overlap
+#     )
+#     if side == 0:
+#         cor = overlap / 2.0 - 1.0
+#     else:
+#         cor = ncol - overlap / 2.0 - 1.0
+
+#     return cor, overlap, side, overlap_position
+
+
+# def _find_overlap(
+#     mat1, mat2, win_width, side=None, denoise=True, norm=False, use_overlap=False
+# ):
+#     """
+#     Find the overlap area and overlap side between two images (Ref. [1]) where
+#     the overlap side referring to the first image.
+
+#     Parameters
+#     ----------
+#     mat1 : array_like
+#         2D array. Projection image or sinogram image.
+#     mat2 :  array_like
+#         2D array. Projection image or sinogram image.
+#     win_width : int
+#         Width of the searching window.
+#     side : {None, 0, 1}, optional
+#         Only there options: None, 0, or 1. "None" corresponding to fully
+#         automated determination. "0" corresponding to the left side. "1"
+#         corresponding to the right side.
+#     denoise : bool, optional
+#         Apply the Gaussian filter if True.
+#     norm : bool, optional
+#         Apply the normalization if True.
+#     use_overlap : bool, optional
+#         Use the combination of images in the overlap area for calculating
+#         correlation coefficients if True.
+
+#     Returns
+#     -------
+#     overlap : float
+#         Width of the overlap area between two images.
+#     side : int
+#         Overlap side between two images.
+#     overlap_position : float
+#         Position of the window in the first image giving the best
+#         correlation metric.
+
+#     """
+#     ncol1 = mat1.shape[1]
+#     ncol2 = mat2.shape[1]
+#     win_width = int(np.clip(win_width, 6, min(ncol1, ncol2) // 2))
+
+#     if side == 1:
+#         (list_metric, offset) = _search_overlap(
+#             mat1,
+#             mat2,
+#             win_width,
+#             side=side,
+#             denoise=denoise,
+#             norm=norm,
+#             use_overlap=use_overlap,
+#         )
+#         overlap_position = _calculate_curvature(list_metric)[1]
+#         overlap_position += offset
+#         overlap = ncol1 - overlap_position + win_width // 2
+#     elif side == 0:
+#         (list_metric, offset) = _search_overlap(
+#             mat1,
+#             mat2,
+#             win_width,
+#             side=side,
+#             denoise=denoise,
+#             norm=norm,
+#             use_overlap=use_overlap,
+#         )
+#         overlap_position = _calculate_curvature(list_metric)[1]
+#         overlap_position += offset
+#         overlap = overlap_position + win_width // 2
+#     else:
+#         (list_metric1, offset1) = _search_overlap(
+#             mat1,
+#             mat2,
+#             win_width,
+#             side=1,
+#             denoise=denoise,
+#             norm=norm,
+#             use_overlap=use_overlap,
+#         )
+#         (list_metric2, offset2) = _search_overlap(
+#             mat1,
+#             mat2,
+#             win_width,
+#             side=0,
+#             denoise=denoise,
+#             norm=norm,
+#             use_overlap=use_overlap,
+#         )
+
+#         (curvature1, overlap_position1) = _calculate_curvature(list_metric1)
+#         overlap_position1 += offset1
+#         (curvature2, overlap_position2) = _calculate_curvature(list_metric2)
+#         overlap_position2 += offset2
+
+#         if curvature1 > curvature2:
+#             side = 1
+#             overlap_position = overlap_position1
+#             overlap = ncol1 - overlap_position + win_width // 2
+#         else:
+#             side = 0
+#             overlap_position = overlap_position2
+#             overlap = overlap_position + win_width // 2
+
+#     return overlap, side, overlap_position
+
+
+# @nvtx.annotate()
+# def _search_overlap(
+#     mat1, mat2, win_width, side, denoise=True, norm=False, use_overlap=False
+# ):
+#     """
+#     Calculate the correlation metrics between a rectangular region, defined
+#     by the window width, on the utmost left/right side of image 2 and the
+#     same size region in image 1 where the region is slided across image 1.
+
+#     Parameters
+#     ----------
+#     mat1 : array_like
+#         2D array. Projection image or sinogram image.
+#     mat2 : array_like
+#         2D array. Projection image or sinogram image.
+#     win_width : int
+#         Width of the searching window.
+#     side : {0, 1}
+#         Only two options: 0 or 1. It is used to indicate the overlap side
+#         respects to image 1. "0" corresponds to the left side. "1" corresponds
+#         to the right side.
+#     denoise : bool, optional
+#         Apply the Gaussian filter if True.
+#     norm : bool, optional
+#         Apply the normalization if True.
+#     use_overlap : bool, optional
+#         Use the combination of images in the overlap area for calculating
+#         correlation coefficients if True.
+
+#     Returns
+#     -------
+#     list_metric : array_like
+#         1D array. List of the correlation metrics.
+#     offset : int
+#         Initial position of the searching window where the position
+#         corresponds to the center of the window.
+#     """
+#     if denoise is True:
+#         # note: the filtering makes the output contiguous
+#         with nvtx.annotate("denoise_filter", color="green"):
+#             mat1 = gaussian_filter(mat1, (2, 2), mode="reflect")
+#             mat2 = gaussian_filter(mat2, (2, 2), mode="reflect")
+#     else:
+#         mat1 = xp.ascontiguousarray(mat1, dtype=xp.float32)
+#         mat2 = xp.ascontiguousarray(mat2, dtype=xp.float32)
+
+#     (nrow1, ncol1) = mat1.shape
+#     (nrow2, ncol2) = mat2.shape
+
+#     if nrow1 != nrow2:
+#         raise ValueError("Two images are not at the same height!!!")
+
+#     win_width = int(np.clip(win_width, 6, min(ncol1, ncol2) // 2 - 1))
+#     offset = win_width // 2
+#     win_width = 2 * offset  # Make it even
+
+#     list_metric = _calc_metrics(mat1, mat2, win_width, side, use_overlap, norm)
+
+#     min_metric = xp.min(list_metric)
+#     if min_metric != 0.0:
+#         list_metric /= min_metric
+
+#     return list_metric, offset
+
+
+# _calc_metrics_module = load_cuda_module(
+#     "calc_metrics",
+#     name_expressions=[
+#         "calc_metrics_kernel<false, false>",
+#         "calc_metrics_kernel<true, false>",
+#         "calc_metrics_kernel<false, true>",
+#         "calc_metrics_kernel<true, true>",
+#     ],
+#     options=("--maxrregcount=32",),
+# )
+
+
+# @nvtx.annotate()
+# def _calc_metrics(mat1, mat2, win_width, side, use_overlap, norm):
+#     assert mat1.dtype == xp.float32, "only float32 supported"
+#     assert mat2.dtype == xp.float32, "only float32 supported"
+#     assert mat1.shape[0] == mat2.shape[0]
+#     assert mat1.flags.c_contiguous, "only contiguos arrays supported"
+#     assert mat2.flags.c_contiguous, "only contiguos arrays supported"
+
+#     num_pos = mat1.shape[1] - win_width
+#     list_metric = xp.empty(num_pos, dtype=xp.float32)
+
+#     args = (
+#         mat1,
+#         np.int32(mat1.strides[0] / mat1.strides[1]),
+#         mat2,
+#         np.int32(mat2.strides[0] / mat2.strides[1]),
+#         np.int32(win_width),
+#         np.int32(mat1.shape[0]),
+#         np.int32(side),
+#         list_metric,
+#     )
+#     block = (128, 1, 1)
+#     grid = (1, np.int32(num_pos), 1)
+#     smem = block[0] * 4 * 6 if use_overlap else block[0] * 4 * 3
+#     bool2str = lambda x: "true" if x is True else "false"
+#     calc_metrics = _calc_metrics_module.get_function(
+#         f"calc_metrics_kernel<{bool2str(norm)}, {bool2str(use_overlap)}>"
+#     )
+#     calc_metrics(grid=grid, block=block, args=args, shared_mem=smem)
+
+#     return list_metric
+
+
+# @nvtx.annotate()
+# def _calculate_curvature(list_metric):
+#     """
+#     Calculate the curvature of a fitted curve going through the minimum
+#     value of a metric list.
+
+#     Parameters
+#     ----------
+#     list_metric : array_like
+#         1D array. List of metrics.
+
+#     Returns
+#     -------
+#     curvature : float
+#         Quadratic coefficient of the parabola fitting.
+#     min_pos : float
+#         Position of the minimum value with sub-pixel accuracy.
+#     """
+#     radi = 2
+#     num_metric = list_metric.size
+#     min_metric_idx = int(xp.argmin(list_metric))
+#     min_pos = int(np.clip(min_metric_idx, radi, num_metric - radi - 1))
+
+#     # work mostly on CPU here - we have very small arrays here
+#     list1 = xp.asnumpy(list_metric[min_pos - radi : min_pos + radi + 1])
+#     afact1 = np.polyfit(np.arange(0, 2 * radi + 1), list1, 2)[0]
+#     list2 = xp.asnumpy(list_metric[min_pos - 1 : min_pos + 2])
+#     (afact2, bfact2, _) = np.polyfit(np.arange(min_pos - 1, min_pos + 2), list2, 2)
+
+#     curvature = np.abs(afact1)
+#     if afact2 != 0.0:
+#         num = -bfact2 / (2 * afact2)
+#         if (num >= min_pos - 1) and (num <= min_pos + 1):
+#             min_pos = num
+
+#     return curvature, np.float32(min_pos)
+
+
+# # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 
 
 # %%%%%%%%%%%%%%%%%%%%%%find_center_pc%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 @nvtx.annotate()
 def find_center_pc(
-    proj1: xp.ndarray, proj2: xp.ndarray, tol: float = 0.5, rotc_guess: float = None
+    proj1: xp.ndarray, proj2: xp.ndarray, tol: float = 0.5, rotc_guess: Union[float, Optional[str]] = None
 ) -> float:
     """Find rotation axis location by finding the offset between the first
     projection and a mirrored projection 180 degrees apart using

From 12437cdbe0eb7db758f3dc33f295a6f3368f0dc2 Mon Sep 17 00:00:00 2001
From: dkazanc <dkazanc@hotmail.com>
Date: Wed, 1 May 2024 14:08:57 +0100
Subject: [PATCH 25/36] corr2

---
 httomolibgpu/recon/rotation.py | 46 +++++++++++++++++++++-------------
 1 file changed, 29 insertions(+), 17 deletions(-)

diff --git a/httomolibgpu/recon/rotation.py b/httomolibgpu/recon/rotation.py
index a58ef146..61994ebe 100644
--- a/httomolibgpu/recon/rotation.py
+++ b/httomolibgpu/recon/rotation.py
@@ -39,23 +39,23 @@
 import math
 from typing import List, Literal, Optional, Tuple, Union
 
-if cupy_run:
-    from httomolibgpu.cuda_kernels import load_cuda_module
-    from cupyx.scipy.ndimage import shift, gaussian_filter
-    from cupyx.scipy.fftpack import get_fft_plan
-    from cupyx.scipy.fft import rfft2
-else:
-    from scipy.ndimage import shift, gaussian_filter
-    from scipy.fft import fftfreq as get_fft_plan # get_fft_plan doesn't exist in scipyfft
-    from scipy.fft import rfft2
-
-try:
-    from cucim.skimage.registration import phase_cross_correlation
-except ImportError:
-    print(
-        "Cucim library of Rapidsai is a required dependency for find_center_pc module, please install"
-    )
-    from skimage.registration import phase_cross_correlation  
+# if cupy_run:
+#     from httomolibgpu.cuda_kernels import load_cuda_module
+#     from cupyx.scipy.ndimage import shift, gaussian_filter
+#     from cupyx.scipy.fftpack import get_fft_plan
+#     from cupyx.scipy.fft import rfft2
+# else:
+#     from scipy.ndimage import shift, gaussian_filter
+#     from scipy.fft import fftfreq as get_fft_plan # get_fft_plan doesn't exist in scipyfft
+#     from scipy.fft import rfft2
+
+# try:
+#     from cucim.skimage.registration import phase_cross_correlation
+# except ImportError:
+#     print(
+#         "Cucim library of Rapidsai is a required dependency for find_center_pc module, please install"
+#     )
+#     from skimage.registration import phase_cross_correlation  
 
 __all__ = [
     # "find_center_vo",
@@ -740,6 +740,18 @@ def find_center_pc(
     Returns:
         float: Rotation axis location.
     """
+    if cupy_run:
+        from cupyx.scipy.ndimage import shift
+        try:
+            from cucim.skimage.registration import phase_cross_correlation
+        except ImportError:
+            print(
+            "Cucim library of Rapidsai is a required dependency for find_center_pc module, please install"
+            )
+    else:
+        from skimage.registration import phase_cross_correlation  
+        from scipy.ndimage import shift
+    
     imgshift = 0.0 if rotc_guess is None else rotc_guess - (proj1.shape[1] - 1.0) / 2.0
 
     proj1 = shift(proj1, [0, -imgshift], mode="constant", cval=0)

From d8b87c51b70305a179513ec2dcf9e70ea248c814 Mon Sep 17 00:00:00 2001
From: dkazanc <dkazanc@hotmail.com>
Date: Wed, 1 May 2024 14:22:47 +0100
Subject: [PATCH 26/36] corr3

---
 httomolibgpu/recon/rotation.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/httomolibgpu/recon/rotation.py b/httomolibgpu/recon/rotation.py
index 61994ebe..628a80ad 100644
--- a/httomolibgpu/recon/rotation.py
+++ b/httomolibgpu/recon/rotation.py
@@ -727,7 +727,7 @@ def find_center_pc(
     """Find rotation axis location by finding the offset between the first
     projection and a mirrored projection 180 degrees apart using
     phase correlation in Fourier space.
-    The ``phase_cross_correlation`` function uses cross-correlation in Fourier
+    The `phase_cross_correlation` function uses cross-correlation in Fourier
     space, optionally employing an upsampled matrix-multiplication DFT to
     achieve arbitrary subpixel precision. :cite:`Guizar:08`.
 
@@ -740,7 +740,7 @@ def find_center_pc(
     Returns:
         float: Rotation axis location.
     """
-    if cupy_run:
+    if xp.__name__ == "cupy":
         from cupyx.scipy.ndimage import shift
         try:
             from cucim.skimage.registration import phase_cross_correlation

From e7a291cac802b512242d9457ea5dc37909fc3b0d Mon Sep 17 00:00:00 2001
From: dkazanc <dkazanc@hotmail.com>
Date: Wed, 1 May 2024 14:27:59 +0100
Subject: [PATCH 27/36] corr4

---
 httomolibgpu/recon/rotation.py | 83 +++++++++++++++++++++-------------
 1 file changed, 52 insertions(+), 31 deletions(-)

diff --git a/httomolibgpu/recon/rotation.py b/httomolibgpu/recon/rotation.py
index 628a80ad..2e875fcb 100644
--- a/httomolibgpu/recon/rotation.py
+++ b/httomolibgpu/recon/rotation.py
@@ -720,6 +720,57 @@
 
 
 # %%%%%%%%%%%%%%%%%%%%%%find_center_pc%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# @nvtx.annotate()
+# def find_center_pc(
+#     proj1: xp.ndarray, proj2: xp.ndarray, tol: float = 0.5, rotc_guess: Union[float, Optional[str]] = None
+# ) -> float:
+#     """Find rotation axis location by finding the offset between the first
+#     projection and a mirrored projection 180 degrees apart using
+#     phase correlation in Fourier space.
+#     The `phase_cross_correlation` function uses cross-correlation in Fourier
+#     space, optionally employing an upsampled matrix-multiplication DFT to
+#     achieve arbitrary subpixel precision. :cite:`Guizar:08`.
+
+#     Args:
+#         proj1 (xp.ndarray): Projection from the 0th degree
+#         proj2 (xp.ndarray): Projection from the 180th degree
+#         tol (float, optional): Subpixel accuracy. Defaults to 0.5.
+#         rotc_guess (float, optional): Initial guess value for the rotation center. Defaults to None.
+
+#     Returns:
+#         float: Rotation axis location.
+#     """
+#     if xp.__name__ == "cupy":
+#         from cupyx.scipy.ndimage import shift
+#         try:
+#             from cucim.skimage.registration import phase_cross_correlation
+#         except ImportError:
+#             print(
+#             "Cucim library of Rapidsai is a required dependency for find_center_pc module, please install"
+#             )
+#     else:
+#         from skimage.registration import phase_cross_correlation  
+#         from scipy.ndimage import shift
+    
+#     imgshift = 0.0 if rotc_guess is None else rotc_guess - (proj1.shape[1] - 1.0) / 2.0
+
+#     proj1 = shift(proj1, [0, -imgshift], mode="constant", cval=0)
+#     proj2 = shift(proj2, [0, -imgshift], mode="constant", cval=0)
+
+#     # create reflection of second projection
+#     proj2 = xp.fliplr(proj2)
+
+#     # using cucim of rapids to do phase cross correlation between two images
+#     shiftr = phase_cross_correlation(
+#         reference_image=proj1, moving_image=proj2, upsample_factor=1.0 / tol
+#     )
+
+#     # Compute center of rotation as the center of first image and the
+#     # registered translation with the second image
+#     center = (proj1.shape[1] + shiftr[0][1] - 1.0) / 2.0
+
+#     return center + imgshift
+
 @nvtx.annotate()
 def find_center_pc(
     proj1: xp.ndarray, proj2: xp.ndarray, tol: float = 0.5, rotc_guess: Union[float, Optional[str]] = None
@@ -740,36 +791,6 @@ def find_center_pc(
     Returns:
         float: Rotation axis location.
     """
-    if xp.__name__ == "cupy":
-        from cupyx.scipy.ndimage import shift
-        try:
-            from cucim.skimage.registration import phase_cross_correlation
-        except ImportError:
-            print(
-            "Cucim library of Rapidsai is a required dependency for find_center_pc module, please install"
-            )
-    else:
-        from skimage.registration import phase_cross_correlation  
-        from scipy.ndimage import shift
-    
-    imgshift = 0.0 if rotc_guess is None else rotc_guess - (proj1.shape[1] - 1.0) / 2.0
-
-    proj1 = shift(proj1, [0, -imgshift], mode="constant", cval=0)
-    proj2 = shift(proj2, [0, -imgshift], mode="constant", cval=0)
-
-    # create reflection of second projection
-    proj2 = xp.fliplr(proj2)
-
-    # using cucim of rapids to do phase cross correlation between two images
-    shiftr = phase_cross_correlation(
-        reference_image=proj1, moving_image=proj2, upsample_factor=1.0 / tol
-    )
-
-    # Compute center of rotation as the center of first image and the
-    # registered translation with the second image
-    center = (proj1.shape[1] + shiftr[0][1] - 1.0) / 2.0
-
-    return center + imgshift
-
 
+    return 0.5
 # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

From cfe18a94c20e50e8efe1f84e2e082e1832383c25 Mon Sep 17 00:00:00 2001
From: dkazanc <dkazanc@hotmail.com>
Date: Wed, 1 May 2024 14:35:51 +0100
Subject: [PATCH 28/36] corr5

---
 httomolibgpu/__init__.py              |  2 +-
 httomolibgpu/cuda_kernels/__init__.py | 14 +++++++++-----
 2 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/httomolibgpu/__init__.py b/httomolibgpu/__init__.py
index 5ce9fcef..9e2c8cce 100644
--- a/httomolibgpu/__init__.py
+++ b/httomolibgpu/__init__.py
@@ -11,4 +11,4 @@
 )
 from httomolibgpu.recon.algorithm import FBP, SIRT, CGLS
 #from httomolibgpu.recon.rotation import find_center_vo, find_center_360, find_center_pc
-from httomolibgpu.recon.rotation import find_center_pc
+#from httomolibgpu.recon.rotation import find_center_pc
diff --git a/httomolibgpu/cuda_kernels/__init__.py b/httomolibgpu/cuda_kernels/__init__.py
index 72d3415b..71688024 100644
--- a/httomolibgpu/cuda_kernels/__init__.py
+++ b/httomolibgpu/cuda_kernels/__init__.py
@@ -2,16 +2,20 @@
 from typing import List, Optional, Tuple
 
 try:
-    import cupy as cp
+    import cupy as xp
+    try:
+        xp.cuda.Device(0).compute_capability
+    except xp.cuda.runtime.CUDARuntimeError:
+        print("CuPy library is a major dependency for HTTomolibgpu, please install")
+        import numpy as xp
 except ImportError:
-    print("Cupy library is a required dependency for HTTomolibgpu, please install")
-
+    import numpy as xp
 
 def load_cuda_module(
     file: str,
     name_expressions: Optional[List[str]] = None,
     options: Tuple[str, ...] = tuple(),
-) -> cp.RawModule:
+) -> xp.RawModule:
     """Load a CUDA module file, i.e. a .cu file, from the file system,
     compile it, and return is as a CuPy RawModule for further
     processing.
@@ -25,6 +29,6 @@ def load_cuda_module(
     with open(file, "r") as f:
         code += f.read()
 
-    return cp.RawModule(
+    return xp.RawModule(
         options=("-std=c++11", *options), code=code, name_expressions=name_expressions
     )

From 22c803355f4af6d808a0250f2014bb8c2788e752 Mon Sep 17 00:00:00 2001
From: dkazanc <dkazanc@hotmail.com>
Date: Wed, 1 May 2024 14:48:32 +0100
Subject: [PATCH 29/36] corr6

---
 httomolibgpu/__init__.py       |   2 +-
 httomolibgpu/recon/rotation.py | 668 ++++++++++++++++-----------------
 2 files changed, 326 insertions(+), 344 deletions(-)

diff --git a/httomolibgpu/__init__.py b/httomolibgpu/__init__.py
index 9e2c8cce..25b171b7 100644
--- a/httomolibgpu/__init__.py
+++ b/httomolibgpu/__init__.py
@@ -11,4 +11,4 @@
 )
 from httomolibgpu.recon.algorithm import FBP, SIRT, CGLS
 #from httomolibgpu.recon.rotation import find_center_vo, find_center_360, find_center_pc
-#from httomolibgpu.recon.rotation import find_center_pc
+from httomolibgpu.recon.rotation import find_center_vo
diff --git a/httomolibgpu/recon/rotation.py b/httomolibgpu/recon/rotation.py
index 2e875fcb..6e6feba1 100644
--- a/httomolibgpu/recon/rotation.py
+++ b/httomolibgpu/recon/rotation.py
@@ -39,349 +39,353 @@
 import math
 from typing import List, Literal, Optional, Tuple, Union
 
-# if cupy_run:
-#     from httomolibgpu.cuda_kernels import load_cuda_module
-#     from cupyx.scipy.ndimage import shift, gaussian_filter
-#     from cupyx.scipy.fftpack import get_fft_plan
-#     from cupyx.scipy.fft import rfft2
-# else:
-#     from scipy.ndimage import shift, gaussian_filter
-#     from scipy.fft import fftfreq as get_fft_plan # get_fft_plan doesn't exist in scipyfft
-#     from scipy.fft import rfft2
-
-# try:
-#     from cucim.skimage.registration import phase_cross_correlation
-# except ImportError:
-#     print(
-#         "Cucim library of Rapidsai is a required dependency for find_center_pc module, please install"
-#     )
-#     from skimage.registration import phase_cross_correlation  
+if cupy_run:
+    from httomolibgpu.cuda_kernels import load_cuda_module
+    from cupyx.scipy.ndimage import shift, gaussian_filter
+    from cupyx.scipy.fftpack import get_fft_plan
+    from cupyx.scipy.fft import rfft2
+else:
+    from scipy.ndimage import shift, gaussian_filter
+    from scipy.fft import fftfreq as get_fft_plan # get_fft_plan doesn't exist in scipyfft
+    from scipy.fft import rfft2
 
 __all__ = [
-    # "find_center_vo",
-    # "find_center_360",
-    "find_center_pc",
+    "find_center_vo",
+    #"find_center_360",
+    #"find_center_pc",
 ]
 
-# # %%%%%%%%%%%%%%%%%%%%%%%%%find_center_vo%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-# @nvtx.annotate()
-# def find_center_vo(
-#     data: xp.ndarray,
-#     ind: Optional[int] = None,
-#     smin: int = -50,
-#     smax: int = 50,
-#     srad: float = 6.0,
-#     step: float = 0.25,
-#     ratio: float = 0.5,
-#     drop: int = 20,
-# ) -> float:
-#     """
-#     Find rotation axis location using Nghia Vo's method. See the paper
-#     https://opg.optica.org/oe/fulltext.cfm?uri=oe-22-16-19078&id=297315
-
-#     Parameters
-#     ----------
-#     data : cp.ndarray
-#         3D tomographic data or a 2D sinogram as a CuPy array.
-#     ind : int, optional
-#         Index of the slice to be used to estimate the CoR.
-#     smin : int, optional
-#         Coarse search radius. Reference to the horizontal center of
-#         the sinogram.
-#     smax : int, optional
-#         Coarse search radius. Reference to the horizontal center of
-#         the sinogram.
-#     srad : float, optional
-#         Fine search radius.
-#     step : float, optional
-#         Step of fine searching.
-#     ratio : float, optional
-#         The ratio between the FOV of the camera and the size of object.
-#         It's used to generate the mask.
-#     drop : int, optional
-#         Drop lines around vertical center of the mask.
-
-#     Returns
-#     -------
-#     float
-#         Rotation axis location.
-#     """
-
-#     if data.ndim == 2:
-#         data = xp.expand_dims(data, 1)
-#         ind = 0
-
-#     height = data.shape[1]
-
-#     if ind is None:
-#         ind = height // 2
-#         if height > 10:
-#             _sino = xp.mean(data[:, ind - 5 : ind + 5, :], axis=1)
-#         else:
-#             _sino = data[:, ind, :]
-#     else:
-#         _sino = data[:, ind, :]
-
-#     with nvtx.annotate("gaussian_filter_1", color="green"):
-#         _sino_cs = gaussian_filter(_sino, (3, 1), mode="reflect")
-#     with nvtx.annotate("gaussian_filter_2", color="green"):
-#         _sino_fs = gaussian_filter(_sino, (2, 2), mode="reflect")
-
-#     if _sino.shape[0] * _sino.shape[1] > 4e6:
-#         # data is large, so downsample it before performing search for
-#         # centre of rotation
-#         _sino_coarse = _downsample(_sino_cs, 2, 1)
-#         init_cen = _search_coarse(_sino_coarse, smin / 4.0, smax / 4.0, ratio, drop)
-#         fine_cen = _search_fine(_sino_fs, srad, step, init_cen * 4.0, ratio, drop)
-#     else:
-#         init_cen = _search_coarse(_sino_cs, smin, smax, ratio, drop)
-#         fine_cen = _search_fine(_sino_fs, srad, step, init_cen, ratio, drop)
-
-#     return xp.asnumpy(fine_cen)
-
-
-# @nvtx.annotate()
-# def _search_coarse(sino, smin, smax, ratio, drop):
-#     (nrow, ncol) = sino.shape
-#     flip_sino = xp.ascontiguousarray(xp.fliplr(sino))
-#     comp_sino = xp.ascontiguousarray(xp.flipud(sino))
-#     mask = _create_mask(2 * nrow, ncol, 0.5 * ratio * ncol, drop)
-
-#     cen_fliplr = (ncol - 1.0) / 2.0
-#     smin_clip_val = max(min(smin + cen_fliplr, ncol - 1), 0)
-#     smin = smin_clip_val - cen_fliplr
-#     smax_clip_val = max(min(smax + cen_fliplr, ncol - 1), 0)
-#     smax = smax_clip_val - cen_fliplr
-#     start_cor = ncol // 2 + smin
-#     stop_cor = ncol // 2 + smax
-#     list_cor = xp.arange(start_cor, stop_cor + 0.5, 0.5, dtype=xp.float32)
-#     list_shift = 2.0 * (list_cor - cen_fliplr)
-#     list_metric = xp.empty(list_shift.shape, dtype=xp.float32)
-#     _calculate_metric(list_shift, sino, flip_sino, comp_sino, mask, list_metric)
-
-#     minpos = xp.argmin(list_metric)
-#     if minpos == 0:
-#         print("WARNING!!!Global minimum is out of searching range")
-#         print(f"Please extend smin: {smin}")
-#     if minpos == len(list_metric) - 1:
-#         print("WARNING!!!Global minimum is out of searching range")
-#         print(f"Please extend smax: {smax}")
-#     cor = list_cor[minpos]
-#     return cor
-
-
-# @nvtx.annotate()
-# def _search_fine(sino, srad, step, init_cen, ratio, drop):
-#     (nrow, ncol) = sino.shape
+def find_center_vo(
+    data: xp.ndarray,
+    ind: Optional[int] = None,
+    smin: int = -50,
+    smax: int = 50,
+    srad: float = 6.0,
+    step: float = 0.25,
+    ratio: float = 0.5,
+    drop: int = 20,
+) -> float:
+    """
+    Find rotation axis location using Nghia Vo's method. See the paper
+    https://opg.optica.org/oe/fulltext.cfm?uri=oe-22-16-19078&id=297315
+
+    Parameters
+    ----------
+    data : cp.ndarray
+        3D tomographic data or a 2D sinogram as a CuPy array.
+    ind : int, optional
+        Index of the slice to be used to estimate the CoR.
+    smin : int, optional
+        Coarse search radius. Reference to the horizontal center of
+        the sinogram.
+    smax : int, optional
+        Coarse search radius. Reference to the horizontal center of
+        the sinogram.
+    srad : float, optional
+        Fine search radius.
+    step : float, optional
+        Step of fine searching.
+    ratio : float, optional
+        The ratio between the FOV of the camera and the size of object.
+        It's used to generate the mask.
+    drop : int, optional
+        Drop lines around vertical center of the mask.
+
+    Returns
+    -------
+    float
+        Rotation axis location.
+    """
+    return __find_center_vo(data, ind, smin, smax, srad, step, ratio, drop)    
 
-#     flip_sino = xp.ascontiguousarray(xp.fliplr(sino))
-#     comp_sino = xp.ascontiguousarray(xp.flipud(sino))
-#     mask = _create_mask(2 * nrow, ncol, 0.5 * ratio * ncol, drop)
+# %%%%%%%%%%%%%%%%%%%%%%%%%find_center_vo%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+@nvtx.annotate()
+def __find_center_vo(
+    data: xp.ndarray,
+    ind: Optional[int] = None,
+    smin: int = -50,
+    smax: int = 50,
+    srad: float = 6.0,
+    step: float = 0.25,
+    ratio: float = 0.5,
+    drop: int = 20,
+) -> float:
 
-#     cen_fliplr = (ncol - 1.0) / 2.0
-#     srad = max(min(abs(float(srad)), ncol / 4.0), 1.0)
-#     step = max(min(abs(step), srad), 0.1)
-#     init_cen = max(min(init_cen, ncol - srad - 1), srad)
-#     list_cor = init_cen + xp.arange(-srad, srad + step, step, dtype=np.float32)
-#     list_shift = 2.0 * (list_cor - cen_fliplr)
-#     list_metric = xp.empty(list_shift.shape, dtype="float32")
+    if data.ndim == 2:
+        data = xp.expand_dims(data, 1)
+        ind = 0
 
-#     _calculate_metric(list_shift, sino, flip_sino, comp_sino, mask, out=list_metric)
-#     cor = list_cor[xp.argmin(list_metric)]
-#     return cor
+    height = data.shape[1]
 
+    if ind is None:
+        ind = height // 2
+        if height > 10:
+            _sino = xp.mean(data[:, ind - 5 : ind + 5, :], axis=1)
+        else:
+            _sino = data[:, ind, :]
+    else:
+        _sino = data[:, ind, :]
 
-# @nvtx.annotate()
-# def _create_mask(nrow, ncol, radius, drop):
-#     du = 1.0 / ncol
-#     dv = (nrow - 1.0) / (nrow * 2.0 * np.pi)
-#     cen_row = int(math.ceil(nrow / 2.0) - 1)
-#     cen_col = int(math.ceil(ncol / 2.0) - 1)
-#     drop = min([drop, int(math.ceil(0.05 * nrow))])
-
-#     block_x = 128
-#     block_y = 1
-#     block_dims = (block_x, block_y)
-#     grid_x = (ncol // 2 + 1 + block_x - 1) // block_x
-#     grid_y = nrow
-#     grid_dims = (grid_x, grid_y)
-#     mask = xp.empty((nrow, ncol // 2 + 1), dtype="uint16")
-#     params = (
-#         ncol,
-#         nrow,
-#         cen_col,
-#         cen_row,
-#         xp.float32(du),
-#         xp.float32(dv),
-#         xp.float32(radius),
-#         xp.float32(drop),
-#         mask,
-#     )
-#     module = load_cuda_module("generate_mask")
-#     kernel = module.get_function("generate_mask")
-#     kernel(grid_dims, block_dims, params)
-#     return mask
+    with nvtx.annotate("gaussian_filter_1", color="green"):
+        _sino_cs = gaussian_filter(_sino, (3, 1), mode="reflect")
+    with nvtx.annotate("gaussian_filter_2", color="green"):
+        _sino_fs = gaussian_filter(_sino, (2, 2), mode="reflect")
 
+    if _sino.shape[0] * _sino.shape[1] > 4e6:
+        # data is large, so downsample it before performing search for
+        # centre of rotation
+        _sino_coarse = _downsample(_sino_cs, 2, 1)
+        init_cen = _search_coarse(_sino_coarse, smin / 4.0, smax / 4.0, ratio, drop)
+        fine_cen = _search_fine(_sino_fs, srad, step, init_cen * 4.0, ratio, drop)
+    else:
+        init_cen = _search_coarse(_sino_cs, smin, smax, ratio, drop)
+        fine_cen = _search_fine(_sino_fs, srad, step, init_cen, ratio, drop)
 
-# def round_up(x: float) -> int:
-#     if x >= 0.0:
-#         return int(math.ceil(x))
-#     else:
-#         return int(math.floor(x))
-
+    return xp.asnumpy(fine_cen)
 
-# def _get_available_gpu_memory() -> int:
-#     dev = xp.cuda.Device()
-#     # first, let's make some space
-#     xp.get_default_memory_pool().free_all_blocks()
-#     cache = xp.fft.config.get_plan_cache()
-#     cache.clear()
-#     available_memory = dev.mem_info[0] + xp.get_default_memory_pool().free_bytes()
-#     return int(available_memory * 0.9)  # 10% safety margin
 
+@nvtx.annotate()
+def _search_coarse(sino, smin, smax, ratio, drop):
+    (nrow, ncol) = sino.shape
+    flip_sino = xp.ascontiguousarray(xp.fliplr(sino))
+    comp_sino = xp.ascontiguousarray(xp.flipud(sino))
+    mask = _create_mask(2 * nrow, ncol, 0.5 * ratio * ncol, drop)
+
+    cen_fliplr = (ncol - 1.0) / 2.0
+    smin_clip_val = max(min(smin + cen_fliplr, ncol - 1), 0)
+    smin = smin_clip_val - cen_fliplr
+    smax_clip_val = max(min(smax + cen_fliplr, ncol - 1), 0)
+    smax = smax_clip_val - cen_fliplr
+    start_cor = ncol // 2 + smin
+    stop_cor = ncol // 2 + smax
+    list_cor = xp.arange(start_cor, stop_cor + 0.5, 0.5, dtype=xp.float32)
+    list_shift = 2.0 * (list_cor - cen_fliplr)
+    list_metric = xp.empty(list_shift.shape, dtype=xp.float32)
+    _calculate_metric(list_shift, sino, flip_sino, comp_sino, mask, list_metric)
+
+    minpos = xp.argmin(list_metric)
+    if minpos == 0:
+        print("WARNING!!!Global minimum is out of searching range")
+        print(f"Please extend smin: {smin}")
+    if minpos == len(list_metric) - 1:
+        print("WARNING!!!Global minimum is out of searching range")
+        print(f"Please extend smax: {smax}")
+    cor = list_cor[minpos]
+    return cor
 
-# def _calculate_chunks(
-#     nshifts: int, shift_size: int, available_memory: Optional[int] = None
-# ) -> List[int]:
-#     if available_memory is None:
-#         available_memory = _get_available_gpu_memory()
-
-#     available_memory -= shift_size
-#     freq_domain_size = (
-#         shift_size  # it needs only half (RFFT), but complex64, so it's the same
-#     )
-#     fft_plan_size = freq_domain_size
-#     size_per_shift = fft_plan_size + freq_domain_size + shift_size
-#     nshift_max = available_memory // size_per_shift
-#     assert nshift_max > 0, "Not enough memory to process"
-#     num_chunks = int(np.ceil(nshifts / nshift_max))
-#     chunk_size = int(np.ceil(nshifts / num_chunks))
-#     chunks = [chunk_size] * (num_chunks - 1)
-#     stop_idx = list(np.cumsum(chunks))
-#     stop_idx.append(nshifts)
-#     return stop_idx
 
+@nvtx.annotate()
+def _search_fine(sino, srad, step, init_cen, ratio, drop):
+    (nrow, ncol) = sino.shape
 
-# @nvtx.annotate()
-# def _calculate_metric(list_shift, sino1, sino2, sino3, mask, out):
-#     # this tries to simplify - if shift_col is integer, no need to spline interpolate
-#     assert list_shift.dtype == xp.float32, "shifts must be single precision floats"
-#     assert sino1.dtype == xp.float32, "sino1 must be float32"
-#     assert sino2.dtype == xp.float32, "sino1 must be float32"
-#     assert sino3.dtype == xp.float32, "sino1 must be float32"
-#     assert out.dtype == xp.float32, "sino1 must be float32"
-#     assert sino2.flags["C_CONTIGUOUS"], "sino2 must be C-contiguous"
-#     assert sino3.flags["C_CONTIGUOUS"], "sino3 must be C-contiguous"
-#     assert list_shift.flags["C_CONTIGUOUS"], "list_shift must be C-contiguous"
-#     nshifts = list_shift.shape[0]
-#     na1 = sino1.shape[0]
-#     na2 = sino2.shape[0]
-
-#     module = load_cuda_module("center_360_shifts")
-#     shift_whole_shifts = module.get_function("shift_whole_shifts")
-#     # note: we don't have to calculate the mean here, as we're only looking for minimum metric.
-#     # The sum is enough.
-#     masked_sum_abs_kernel = xp.ReductionKernel(
-#         in_params="complex64 x, uint16 mask",  # input, complex + mask
-#         out_params="float32 out",  # output, real
-#         map_expr="mask ? abs(x) : 0.0f",
-#         reduce_expr="a + b",
-#         post_map_expr="out = a",
-#         identity="0.0f",
-#         reduce_type="float",
-#         name="masked_sum_abs",
-#     )
+    flip_sino = xp.ascontiguousarray(xp.fliplr(sino))
+    comp_sino = xp.ascontiguousarray(xp.flipud(sino))
+    mask = _create_mask(2 * nrow, ncol, 0.5 * ratio * ncol, drop)
 
-#     # determine how many shifts we can fit in the available memory
-#     # and iterate in chunks
-#     chunks = _calculate_chunks(
-#         nshifts, (na1 + na2) * sino2.shape[1] * xp.float32().nbytes
-#     )
+    cen_fliplr = (ncol - 1.0) / 2.0
+    srad = max(min(abs(float(srad)), ncol / 4.0), 1.0)
+    step = max(min(abs(step), srad), 0.1)
+    init_cen = max(min(init_cen, ncol - srad - 1), srad)
+    list_cor = init_cen + xp.arange(-srad, srad + step, step, dtype=np.float32)
+    list_shift = 2.0 * (list_cor - cen_fliplr)
+    list_metric = xp.empty(list_shift.shape, dtype="float32")
 
-#     mat = xp.empty((chunks[0], na1 + na2, sino2.shape[1]), dtype=xp.float32)
-#     mat[:, :na1, :] = sino1
-#     # explicitly create FFT plan here, so it's not cached and clearly re-used
-#     plan = get_fft_plan(
-#         mat, mat.shape[-2:], axes=(1, 2), value_type="R2C"
-#     )
+    _calculate_metric(list_shift, sino, flip_sino, comp_sino, mask, out=list_metric)
+    cor = list_cor[xp.argmin(list_metric)]
+    return cor
 
-#     for i, stop_idx in enumerate(chunks):
-#         if i > 0:
-#             # more than one iteration means we're tight on memory, so clear up freed blocks
-#             mat_freq = None
-#             xp.get_default_memory_pool().free_all_blocks()
-
-#         start_idx = 0 if i == 0 else chunks[i - 1]
-#         size = stop_idx - start_idx
-
-#         # first, handle the integer shifts without spline in a raw kernel,
-#         # and shift in the sino3 one accordingly
-#         bx = 128
-#         gx = (sino3.shape[1] + bx - 1) // bx
-#         shift_whole_shifts(
-#             grid=(gx, na2, size),  ####
-#             block=(bx, 1, 1),
-#             args=(
-#                 sino2,
-#                 sino3,
-#                 list_shift[start_idx:stop_idx],
-#                 mat[:, na1:, :],
-#                 sino3.shape[1],
-#                 na1 + na2,
-#             ),
-#         )
 
-#         # now we can only look at the spline shifting, the rest is done
-#         list_shift_host = xp.asnumpy(list_shift[start_idx:stop_idx])
-#         for i in range(list_shift_host.shape[0]):
-#             shift_col = float(list_shift_host[i])
-#             if not shift_col.is_integer():
-#                 shifted = shift(sino2, (0, shift_col), order=3, prefilter=True)
-#                 shift_int = round_up(shift_col)
-#                 if shift_int >= 0:
-#                     mat[i, na1:, shift_int:] = shifted[:, shift_int:]
-#                 else:
-#                     mat[i, na1:, :shift_int] = shifted[:, :shift_int]
-
-#         # stack and transform
-#         # (we do the full sized mat FFT, even though the last chunk may be smaller, to
-#         # make sure we can re-use the same FFT plan as before)
-#         mat_freq = rfft2(mat, axes=(1, 2), norm=None, plan=plan)
-#         masked_sum_abs_kernel(
-#             mat_freq[:size, :, :], mask, out=out[start_idx:stop_idx], axis=(1, 2)
-#         )
+@nvtx.annotate()
+def _create_mask(nrow, ncol, radius, drop):
+    du = 1.0 / ncol
+    dv = (nrow - 1.0) / (nrow * 2.0 * np.pi)
+    cen_row = int(math.ceil(nrow / 2.0) - 1)
+    cen_col = int(math.ceil(ncol / 2.0) - 1)
+    drop = min([drop, int(math.ceil(0.05 * nrow))])
+
+    block_x = 128
+    block_y = 1
+    block_dims = (block_x, block_y)
+    grid_x = (ncol // 2 + 1 + block_x - 1) // block_x
+    grid_y = nrow
+    grid_dims = (grid_x, grid_y)
+    mask = xp.empty((nrow, ncol // 2 + 1), dtype="uint16")
+    params = (
+        ncol,
+        nrow,
+        cen_col,
+        cen_row,
+        xp.float32(du),
+        xp.float32(dv),
+        xp.float32(radius),
+        xp.float32(drop),
+        mask,
+    )
+    module = load_cuda_module("generate_mask")
+    kernel = module.get_function("generate_mask")
+    kernel(grid_dims, block_dims, params)
+    return mask
+
+
+def round_up(x: float) -> int:
+    if x >= 0.0:
+        return int(math.ceil(x))
+    else:
+        return int(math.floor(x))
+
+
+def _get_available_gpu_memory() -> int:
+    dev = xp.cuda.Device()
+    # first, let's make some space
+    xp.get_default_memory_pool().free_all_blocks()
+    cache = xp.fft.config.get_plan_cache()
+    cache.clear()
+    available_memory = dev.mem_info[0] + xp.get_default_memory_pool().free_bytes()
+    return int(available_memory * 0.9)  # 10% safety margin
+
+
+def _calculate_chunks(
+    nshifts: int, shift_size: int, available_memory: Optional[int] = None
+) -> List[int]:
+    if available_memory is None:
+        available_memory = _get_available_gpu_memory()
+
+    available_memory -= shift_size
+    freq_domain_size = (
+        shift_size  # it needs only half (RFFT), but complex64, so it's the same
+    )
+    fft_plan_size = freq_domain_size
+    size_per_shift = fft_plan_size + freq_domain_size + shift_size
+    nshift_max = available_memory // size_per_shift
+    assert nshift_max > 0, "Not enough memory to process"
+    num_chunks = int(np.ceil(nshifts / nshift_max))
+    chunk_size = int(np.ceil(nshifts / num_chunks))
+    chunks = [chunk_size] * (num_chunks - 1)
+    stop_idx = list(np.cumsum(chunks))
+    stop_idx.append(nshifts)
+    return stop_idx
 
 
-# @nvtx.annotate()
-# def _downsample(sino, level, axis):
-#     assert sino.dtype == xp.float32, "single precision floating point input required"
-#     assert sino.flags["C_CONTIGUOUS"], "list_shift must be C-contiguous"
-
-#     dx, dz = sino.shape
-#     # Determine the new size, dim, of the downsampled dimension
-#     dim = int(sino.shape[axis] / math.pow(2, level))
-#     shape = [dx, dz]
-#     shape[axis] = dim
-#     downsampled_data = xp.empty(shape, dtype="float32")
-
-#     block_x = 8
-#     block_y = 8
-#     block_dims = (block_x, block_y)
-#     grid_x = (sino.shape[1] + block_x - 1) // block_x
-#     grid_y = (sino.shape[0] + block_y - 1) // block_y
-#     grid_dims = (grid_x, grid_y)
-#     # 8x8 thread-block, which means 16 "lots" of columns to downsample per
-#     # thread-block; 4 bytes per float, so allocate 16*6 = 64 bytes of shared
-#     # memeory per thread-block
-#     shared_mem_bytes = 64
-#     params = (sino, dx, dz, level, downsampled_data)
-#     module = load_cuda_module("downsample_sino")
-#     kernel = module.get_function("downsample_sino")
-#     kernel(grid_dims, block_dims, params, shared_mem=shared_mem_bytes)
-#     return downsampled_data
+@nvtx.annotate()
+def _calculate_metric(list_shift, sino1, sino2, sino3, mask, out):
+    # this tries to simplify - if shift_col is integer, no need to spline interpolate
+    assert list_shift.dtype == xp.float32, "shifts must be single precision floats"
+    assert sino1.dtype == xp.float32, "sino1 must be float32"
+    assert sino2.dtype == xp.float32, "sino1 must be float32"
+    assert sino3.dtype == xp.float32, "sino1 must be float32"
+    assert out.dtype == xp.float32, "sino1 must be float32"
+    assert sino2.flags["C_CONTIGUOUS"], "sino2 must be C-contiguous"
+    assert sino3.flags["C_CONTIGUOUS"], "sino3 must be C-contiguous"
+    assert list_shift.flags["C_CONTIGUOUS"], "list_shift must be C-contiguous"
+    nshifts = list_shift.shape[0]
+    na1 = sino1.shape[0]
+    na2 = sino2.shape[0]
+
+    module = load_cuda_module("center_360_shifts")
+    shift_whole_shifts = module.get_function("shift_whole_shifts")
+    # note: we don't have to calculate the mean here, as we're only looking for minimum metric.
+    # The sum is enough.
+    masked_sum_abs_kernel = xp.ReductionKernel(
+        in_params="complex64 x, uint16 mask",  # input, complex + mask
+        out_params="float32 out",  # output, real
+        map_expr="mask ? abs(x) : 0.0f",
+        reduce_expr="a + b",
+        post_map_expr="out = a",
+        identity="0.0f",
+        reduce_type="float",
+        name="masked_sum_abs",
+    )
+
+    # determine how many shifts we can fit in the available memory
+    # and iterate in chunks
+    chunks = _calculate_chunks(
+        nshifts, (na1 + na2) * sino2.shape[1] * xp.float32().nbytes
+    )
+
+    mat = xp.empty((chunks[0], na1 + na2, sino2.shape[1]), dtype=xp.float32)
+    mat[:, :na1, :] = sino1
+    # explicitly create FFT plan here, so it's not cached and clearly re-used
+    plan = get_fft_plan(
+        mat, mat.shape[-2:], axes=(1, 2), value_type="R2C"
+    )
+
+    for i, stop_idx in enumerate(chunks):
+        if i > 0:
+            # more than one iteration means we're tight on memory, so clear up freed blocks
+            mat_freq = None
+            xp.get_default_memory_pool().free_all_blocks()
+
+        start_idx = 0 if i == 0 else chunks[i - 1]
+        size = stop_idx - start_idx
+
+        # first, handle the integer shifts without spline in a raw kernel,
+        # and shift in the sino3 one accordingly
+        bx = 128
+        gx = (sino3.shape[1] + bx - 1) // bx
+        shift_whole_shifts(
+            grid=(gx, na2, size),  ####
+            block=(bx, 1, 1),
+            args=(
+                sino2,
+                sino3,
+                list_shift[start_idx:stop_idx],
+                mat[:, na1:, :],
+                sino3.shape[1],
+                na1 + na2,
+            ),
+        )
+
+        # now we can only look at the spline shifting, the rest is done
+        list_shift_host = xp.asnumpy(list_shift[start_idx:stop_idx])
+        for i in range(list_shift_host.shape[0]):
+            shift_col = float(list_shift_host[i])
+            if not shift_col.is_integer():
+                shifted = shift(sino2, (0, shift_col), order=3, prefilter=True)
+                shift_int = round_up(shift_col)
+                if shift_int >= 0:
+                    mat[i, na1:, shift_int:] = shifted[:, shift_int:]
+                else:
+                    mat[i, na1:, :shift_int] = shifted[:, :shift_int]
+
+        # stack and transform
+        # (we do the full sized mat FFT, even though the last chunk may be smaller, to
+        # make sure we can re-use the same FFT plan as before)
+        mat_freq = rfft2(mat, axes=(1, 2), norm=None, plan=plan)
+        masked_sum_abs_kernel(
+            mat_freq[:size, :, :], mask, out=out[start_idx:stop_idx], axis=(1, 2)
+        )
 
 
-# ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+@nvtx.annotate()
+def _downsample(sino, level, axis):
+    assert sino.dtype == xp.float32, "single precision floating point input required"
+    assert sino.flags["C_CONTIGUOUS"], "list_shift must be C-contiguous"
+
+    dx, dz = sino.shape
+    # Determine the new size, dim, of the downsampled dimension
+    dim = int(sino.shape[axis] / math.pow(2, level))
+    shape = [dx, dz]
+    shape[axis] = dim
+    downsampled_data = xp.empty(shape, dtype="float32")
+
+    block_x = 8
+    block_y = 8
+    block_dims = (block_x, block_y)
+    grid_x = (sino.shape[1] + block_x - 1) // block_x
+    grid_y = (sino.shape[0] + block_y - 1) // block_y
+    grid_dims = (grid_x, grid_y)
+    # 8x8 thread-block, which means 16 "lots" of columns to downsample per
+    # thread-block; 4 bytes per float, so allocate 16*6 = 64 bytes of shared
+    # memeory per thread-block
+    shared_mem_bytes = 64
+    params = (sino, dx, dz, level, downsampled_data)
+    module = load_cuda_module("downsample_sino")
+    kernel = module.get_function("downsample_sino")
+    kernel(grid_dims, block_dims, params, shared_mem=shared_mem_bytes)
+    return downsampled_data
+
+
+##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 
 
 # # %%%%%%%%%%%%%%%%%%%%%%%%%find_center_360%%%%%%%%%%%%%%%%%%%%%%%%%
@@ -719,7 +723,7 @@
 # # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 
 
-# %%%%%%%%%%%%%%%%%%%%%%find_center_pc%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# ## %%%%%%%%%%%%%%%%%%%%%%find_center_pc%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 # @nvtx.annotate()
 # def find_center_pc(
 #     proj1: xp.ndarray, proj2: xp.ndarray, tol: float = 0.5, rotc_guess: Union[float, Optional[str]] = None
@@ -771,26 +775,4 @@
 
 #     return center + imgshift
 
-@nvtx.annotate()
-def find_center_pc(
-    proj1: xp.ndarray, proj2: xp.ndarray, tol: float = 0.5, rotc_guess: Union[float, Optional[str]] = None
-) -> float:
-    """Find rotation axis location by finding the offset between the first
-    projection and a mirrored projection 180 degrees apart using
-    phase correlation in Fourier space.
-    The `phase_cross_correlation` function uses cross-correlation in Fourier
-    space, optionally employing an upsampled matrix-multiplication DFT to
-    achieve arbitrary subpixel precision. :cite:`Guizar:08`.
-
-    Args:
-        proj1 (xp.ndarray): Projection from the 0th degree
-        proj2 (xp.ndarray): Projection from the 180th degree
-        tol (float, optional): Subpixel accuracy. Defaults to 0.5.
-        rotc_guess (float, optional): Initial guess value for the rotation center. Defaults to None.
-
-    Returns:
-        float: Rotation axis location.
-    """
-
-    return 0.5
-# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

From 706e8e3d353d73994b77d8f1466137f745dc3819 Mon Sep 17 00:00:00 2001
From: dkazanc <dkazanc@hotmail.com>
Date: Wed, 1 May 2024 15:09:07 +0100
Subject: [PATCH 30/36] corr7

---
 httomolibgpu/__init__.py  |  2 +-
 httomolibgpu/misc/corr.py | 36 ++++++++++++++++++++++++------------
 2 files changed, 25 insertions(+), 13 deletions(-)

diff --git a/httomolibgpu/__init__.py b/httomolibgpu/__init__.py
index 25b171b7..ccb48651 100644
--- a/httomolibgpu/__init__.py
+++ b/httomolibgpu/__init__.py
@@ -11,4 +11,4 @@
 )
 from httomolibgpu.recon.algorithm import FBP, SIRT, CGLS
 #from httomolibgpu.recon.rotation import find_center_vo, find_center_360, find_center_pc
-from httomolibgpu.recon.rotation import find_center_vo
+#from httomolibgpu.recon.rotation import find_center_vo
diff --git a/httomolibgpu/misc/corr.py b/httomolibgpu/misc/corr.py
index f00e3d88..4215eac4 100644
--- a/httomolibgpu/misc/corr.py
+++ b/httomolibgpu/misc/corr.py
@@ -35,28 +35,27 @@
 except ImportError:
     import numpy as xp
 
-try:
-    from cucim.skimage.filters import median
-    from cucim.skimage.morphology import disk
-except ImportError:
-    print(
-        "Cucim library of Rapidsai is a required dependency for median_filter and remove_outlier modules, please install"
-    )
-    from skimage.filters import median
-    from skimage.morphology import disk
+# try:
+#     from cucim.skimage.filters import median
+#     from cucim.skimage.morphology import disk
+# except ImportError:
+#     print(
+#         "Cucim library of Rapidsai is a required dependency for median_filter and remove_outlier modules, please install"
+#     )
+#     from skimage.filters import median
+#     from skimage.morphology import disk
 
 from numpy import float32
 import nvtx
 
-if cupy_run:
-    from httomolibgpu.cuda_kernels import load_cuda_module
+# if cupy_run:
+#     from httomolibgpu.cuda_kernels import load_cuda_module
 
 __all__ = [
     "median_filter",
     "remove_outlier",
 ]
 
-
 @nvtx.annotate()
 def median_filter(
     data: xp.ndarray,
@@ -89,6 +88,19 @@ def median_filter(
     ValueError
         If the input array is not three dimensional.
     """
+    return __median_filter(data,kernel_size,axis,dif)
+
+
+def __median_filter(
+    data: xp.ndarray,
+    kernel_size: int = 3,
+    axis: int = 0,
+    dif: float = 0.0,
+) -> xp.ndarray:
+
+    from cucim.skimage.filters import median
+    from cucim.skimage.morphology import disk
+    from httomolibgpu.cuda_kernels import load_cuda_module
 
     input_type = data.dtype
 

From 08967e39ee48b3db6477a052f871e168c4c0c977 Mon Sep 17 00:00:00 2001
From: dkazanc <dkazanc@hotmail.com>
Date: Wed, 1 May 2024 15:36:07 +0100
Subject: [PATCH 31/36] corr8

---
 httomolibgpu/__init__.py   |  2 +-
 httomolibgpu/misc/corr.py  | 37 ++++++++++----------
 httomolibgpu/prep/phase.py | 69 ++++++++++++++++++++------------------
 3 files changed, 55 insertions(+), 53 deletions(-)

diff --git a/httomolibgpu/__init__.py b/httomolibgpu/__init__.py
index ccb48651..08251bc6 100644
--- a/httomolibgpu/__init__.py
+++ b/httomolibgpu/__init__.py
@@ -3,7 +3,7 @@
 from httomolibgpu.misc.rescale import rescale_to_int
 from httomolibgpu.prep.alignment import distortion_correction_proj_discorpy
 from httomolibgpu.prep.normalize import normalize
-#from httomolibgpu.prep.phase import paganin_filter_savu, paganin_filter_tomopy
+from httomolibgpu.prep.phase import paganin_filter_savu, paganin_filter_tomopy
 from httomolibgpu.prep.stripe import (
     remove_stripe_based_sorting,
     remove_stripe_ti,
diff --git a/httomolibgpu/misc/corr.py b/httomolibgpu/misc/corr.py
index 4215eac4..5ea0d387 100644
--- a/httomolibgpu/misc/corr.py
+++ b/httomolibgpu/misc/corr.py
@@ -24,7 +24,6 @@
 cupy_run = False
 try:
     import cupy as xp
-    from cupy import mean
 
     try:
         xp.cuda.Device(0).compute_capability
@@ -35,22 +34,9 @@
 except ImportError:
     import numpy as xp
 
-# try:
-#     from cucim.skimage.filters import median
-#     from cucim.skimage.morphology import disk
-# except ImportError:
-#     print(
-#         "Cucim library of Rapidsai is a required dependency for median_filter and remove_outlier modules, please install"
-#     )
-#     from skimage.filters import median
-#     from skimage.morphology import disk
-
 from numpy import float32
 import nvtx
 
-# if cupy_run:
-#     from httomolibgpu.cuda_kernels import load_cuda_module
-
 __all__ = [
     "median_filter",
     "remove_outlier",
@@ -88,7 +74,11 @@ def median_filter(
     ValueError
         If the input array is not three dimensional.
     """
-    return __median_filter(data,kernel_size,axis,dif)
+    if cupy_run:
+        return __median_filter(data, kernel_size, axis, dif)
+    else:
+        print("median_filter won't be executed because CuPy is not installed")
+        return data
 
 
 def __median_filter(
@@ -98,8 +88,13 @@ def __median_filter(
     dif: float = 0.0,
 ) -> xp.ndarray:
 
-    from cucim.skimage.filters import median
-    from cucim.skimage.morphology import disk
+    try:
+        from cucim.skimage.filters import median
+        from cucim.skimage.morphology import disk
+    except ImportError:
+        print(
+            "Cucim library of Rapidsai is a required dependency for median_filter and remove_outlier modules, please install"
+        )
     from httomolibgpu.cuda_kernels import load_cuda_module
 
     input_type = data.dtype
@@ -172,7 +167,7 @@ def __median_filter(
         thresholding_kernel(data, float32(dif), output)
     return output
 
-
+@nvtx.annotate()
 def remove_outlier(
     data: xp.ndarray, kernel_size: int = 3, axis: int = 0, dif: float = 0.1
 ) -> xp.ndarray:
@@ -205,4 +200,8 @@ def remove_outlier(
     if dif <= 0.0:
         raise ValueError("Threshold value (dif) must be positive and nonzero.")
 
-    return median_filter(data=data, kernel_size=kernel_size, axis=axis, dif=dif)
+    if cupy_run:
+        return __median_filter(data, kernel_size, axis, dif)
+    else:
+        print("remove_outlier won't be executed because CuPy is not installed")
+        return data
diff --git a/httomolibgpu/prep/phase.py b/httomolibgpu/prep/phase.py
index 073bcdb0..54e69e45 100644
--- a/httomolibgpu/prep/phase.py
+++ b/httomolibgpu/prep/phase.py
@@ -24,11 +24,9 @@
 cupy_run = False
 try:
     import cupy as xp
-
     try:
         xp.cuda.Device(0).compute_capability
         cupy_run = True
-
     except xp.cuda.runtime.CUDARuntimeError:
         print("CuPy library is a major dependency for HTTomolibgpu, please install")
         import numpy as np
@@ -40,12 +38,6 @@
 import nvtx
 import math
 
-if cupy_run:
-    from httomolibgpu.cuda_kernels import load_cuda_module
-    from cupyx.scipy.fft import fft2, ifft2, fftshift
-else:
-    from scipy.fft import fft2, ifft2, fftshift
-
 __all__ = [
     "paganin_filter_savu",
     "paganin_filter_tomopy",
@@ -102,7 +94,27 @@ def paganin_filter_savu(
     -------
     cp.ndarray
         The stack of filtered projections.
-    """    
+    """   
+    if cupy_run:
+        return __paganin_filter_savu(data, ratio, energy, distance, resolution,  pad_y, pad_x, pad_method, increment)
+    else:
+        print("__paganin_filter_savu won't be executed because CuPy is not installed")
+        return data
+
+def __paganin_filter_savu(
+    data: xp.ndarray,
+    ratio: float = 250.0,
+    energy: float = 53.0,
+    distance: float = 1.0,
+    resolution: float = 1.28,
+    pad_y: int = 100,
+    pad_x: int = 100,
+    pad_method: str = "edge",
+    increment: float = 0.0,
+) -> xp.ndarray:
+    
+    from httomolibgpu.cuda_kernels import load_cuda_module
+    from cupyx.scipy.fft import fft2, ifft2
 
     # Check the input data is valid
     if data.ndim != 3:
@@ -208,31 +220,13 @@ def paganin_filter_savu(
         np.float32(fft_scale),
         res,
     )
-
     return res
 
-
 def _wavelength(energy: float) -> float:
     SPEED_OF_LIGHT = 299792458e2  # [cm/s]
     PLANCK_CONSTANT = 6.58211928e-19  # [keV*s]
     return 2 * math.pi * PLANCK_CONSTANT * SPEED_OF_LIGHT / energy
 
-
-def _paganin_filter_factor(
-    energy: float, dist: float, alpha: float, w2: xp.ndarray
-) -> xp.ndarray:
-    return 1 / (_wavelength(energy) * dist * w2 / (4 * math.pi) + alpha)
-
-
-def _calc_pad_width(dim: int, pixel_size: float, wavelength: float, dist: float) -> int:
-    pad_pix = xp.ceil(math.pi * wavelength * dist / pixel_size**2)
-    return int((pow(2, xp.ceil(xp.log2(dim + pad_pix))) - dim) * 0.5)
-
-
-def _calc_pad_val(tomo: xp.ndarray) -> float:
-    return xp.mean((tomo[..., 0] + tomo[..., -1]) * 0.5)
-
-
 def _reciprocal_grid(pixel_size: float, shape_proj: tuple) -> xp.ndarray:
     """
     Calculate reciprocal grid.
@@ -257,7 +251,6 @@ def _reciprocal_grid(pixel_size: float, shape_proj: tuple) -> xp.ndarray:
 
     return xp.add.outer(indx_sq, indy_sq)
 
-
 def _reciprocal_coord(pixel_size: float, num_grid: int) -> xp.ndarray:
     """
     Calculate reciprocal grid coordinates for a given pixel size
@@ -280,11 +273,8 @@ def _reciprocal_coord(pixel_size: float, num_grid: int) -> xp.ndarray:
     rc *= 2 * math.pi / (n * pixel_size)
     return rc
 
-
 ##-------------------------------------------------------------##
 ##-------------------------------------------------------------##
-
-
 # Adaptation with some corrections of retrieve_phase (Paganin filter)
 # from TomoPy
 @nvtx.annotate()
@@ -317,6 +307,21 @@ def paganin_filter_tomopy(
     cp.ndarray
         The 3D array of Paganin phase-filtered projection images.
     """
+    if cupy_run:
+        return __paganin_filter_tomopy(tomo, pixel_size, dist, energy, alpha)
+    else:
+        print("paganin_filter_tomopy won't be executed because CuPy is not installed")
+        return tomo
+
+def __paganin_filter_tomopy(
+    tomo: xp.ndarray,
+    pixel_size: float = 1e-4,
+    dist: float = 50.0,
+    energy: float = 53.0,
+    alpha: float = 1e-3,
+) -> xp.ndarray:
+
+    from cupyx.scipy.fft import fft2, ifft2, fftshift
 
     # Check the input data is valid
     if tomo.ndim != 3:
@@ -375,7 +380,6 @@ def paganin_filter_tomopy(
 def _shift_bit_length(x: int) -> int:
     return 1 << (x - 1).bit_length()
 
-
 def _pad_projections_to_second_power(tomo: xp.ndarray) -> Union[xp.ndarray, tuple]:
     """
     Performs padding of each projection to the next power of 2.
@@ -414,7 +418,6 @@ def _pad_projections_to_second_power(tomo: xp.ndarray) -> Union[xp.ndarray, tupl
 
     return padded_tomo, pad_tup
 
-
 def _paganin_filter_factor2(energy, dist, alpha, w2):
     # Alpha represents the ratio of delta/beta.
     return 1 / (_wavelength(energy) * dist * w2 / (4 * math.pi) + alpha)

From a1369aa4abfe20abd4290f0c3ad2312aebf3bf21 Mon Sep 17 00:00:00 2001
From: dkazanc <dkazanc@hotmail.com>
Date: Wed, 1 May 2024 15:47:57 +0100
Subject: [PATCH 32/36] corr8

---
 httomolibgpu/__init__.py              |  2 +-
 httomolibgpu/cuda_kernels/__init__.py | 15 ++++--------
 httomolibgpu/misc/corr.py             | 34 ++++++++++-----------------
 3 files changed, 17 insertions(+), 34 deletions(-)

diff --git a/httomolibgpu/__init__.py b/httomolibgpu/__init__.py
index 08251bc6..ccb48651 100644
--- a/httomolibgpu/__init__.py
+++ b/httomolibgpu/__init__.py
@@ -3,7 +3,7 @@
 from httomolibgpu.misc.rescale import rescale_to_int
 from httomolibgpu.prep.alignment import distortion_correction_proj_discorpy
 from httomolibgpu.prep.normalize import normalize
-from httomolibgpu.prep.phase import paganin_filter_savu, paganin_filter_tomopy
+#from httomolibgpu.prep.phase import paganin_filter_savu, paganin_filter_tomopy
 from httomolibgpu.prep.stripe import (
     remove_stripe_based_sorting,
     remove_stripe_ti,
diff --git a/httomolibgpu/cuda_kernels/__init__.py b/httomolibgpu/cuda_kernels/__init__.py
index 71688024..32d4312d 100644
--- a/httomolibgpu/cuda_kernels/__init__.py
+++ b/httomolibgpu/cuda_kernels/__init__.py
@@ -1,21 +1,14 @@
 import os
 from typing import List, Optional, Tuple
 
-try:
-    import cupy as xp
-    try:
-        xp.cuda.Device(0).compute_capability
-    except xp.cuda.runtime.CUDARuntimeError:
-        print("CuPy library is a major dependency for HTTomolibgpu, please install")
-        import numpy as xp
-except ImportError:
-    import numpy as xp
+from httomolibgpu import cupywrapper
+cp = cupywrapper.cp
 
 def load_cuda_module(
     file: str,
     name_expressions: Optional[List[str]] = None,
     options: Tuple[str, ...] = tuple(),
-) -> xp.RawModule:
+) -> cp.RawModule:
     """Load a CUDA module file, i.e. a .cu file, from the file system,
     compile it, and return is as a CuPy RawModule for further
     processing.
@@ -29,6 +22,6 @@ def load_cuda_module(
     with open(file, "r") as f:
         code += f.read()
 
-    return xp.RawModule(
+    return cp.RawModule(
         options=("-std=c++11", *options), code=code, name_expressions=name_expressions
     )
diff --git a/httomolibgpu/misc/corr.py b/httomolibgpu/misc/corr.py
index 5ea0d387..28da4b67 100644
--- a/httomolibgpu/misc/corr.py
+++ b/httomolibgpu/misc/corr.py
@@ -21,18 +21,8 @@
 """ Module for data correction """
 
 import numpy as np
-cupy_run = False
-try:
-    import cupy as xp
-
-    try:
-        xp.cuda.Device(0).compute_capability
-        cupy_run = True
-    except xp.cuda.runtime.CUDARuntimeError:
-        print("CuPy library is a major dependency for HTTomolibgpu, please install")
-        import numpy as xp
-except ImportError:
-    import numpy as xp
+from httomolibgpu import cupywrapper
+cp = cupywrapper.cp
 
 from numpy import float32
 import nvtx
@@ -44,11 +34,11 @@
 
 @nvtx.annotate()
 def median_filter(
-    data: xp.ndarray,
+    data: cp.ndarray,
     kernel_size: int = 3,
     axis: int = 0,
     dif: float = 0.0,
-) -> xp.ndarray:
+) -> cp.ndarray:
     """
     Apply 2D or 3D median or dezinger (when dif>0) filter to a 3D array.
 
@@ -74,7 +64,7 @@ def median_filter(
     ValueError
         If the input array is not three dimensional.
     """
-    if cupy_run:
+    if cupywrapper.cupy_run:
         return __median_filter(data, kernel_size, axis, dif)
     else:
         print("median_filter won't be executed because CuPy is not installed")
@@ -82,11 +72,11 @@ def median_filter(
 
 
 def __median_filter(
-    data: xp.ndarray,
+    data: cp.ndarray,
     kernel_size: int = 3,
     axis: int = 0,
     dif: float = 0.0,
-) -> xp.ndarray:
+) -> cp.ndarray:
 
     try:
         from cucim.skimage.filters import median
@@ -115,7 +105,7 @@ def __median_filter(
         raise ValueError("The axis should be 0,1,2 or None for full 3d processing")
 
     dz, dy, dx = data.shape
-    output = xp.empty(data.shape, dtype=input_type, order="C")
+    output = cp.empty(data.shape, dtype=input_type, order="C")
 
     if axis == 0:
         for j in range(dz):
@@ -156,7 +146,7 @@ def __median_filter(
                 output = data;
             }
             """
-        thresholding_kernel = xp.ElementwiseKernel(
+        thresholding_kernel = cp.ElementwiseKernel(
             "T data, raw float32 dif",
             "T output",
             kernel,
@@ -169,8 +159,8 @@ def __median_filter(
 
 @nvtx.annotate()
 def remove_outlier(
-    data: xp.ndarray, kernel_size: int = 3, axis: int = 0, dif: float = 0.1
-) -> xp.ndarray:
+    data: cp.ndarray, kernel_size: int = 3, axis: int = 0, dif: float = 0.1
+) -> cp.ndarray:
     """
     Selectively applies 3D median filter to a 3D array to remove outliers. Also called a dezinger.
 
@@ -200,7 +190,7 @@ def remove_outlier(
     if dif <= 0.0:
         raise ValueError("Threshold value (dif) must be positive and nonzero.")
 
-    if cupy_run:
+    if cupywrapper.cupy_run:
         return __median_filter(data, kernel_size, axis, dif)
     else:
         print("remove_outlier won't be executed because CuPy is not installed")

From 5bec95de2efd3c306f0b2eadab16f71b45973aa5 Mon Sep 17 00:00:00 2001
From: dkazanc <dkazanc@hotmail.com>
Date: Wed, 1 May 2024 16:00:01 +0100
Subject: [PATCH 33/36] corr9

---
 conda/recipe/meta.yaml | 1 +
 pyproject.toml         | 1 +
 2 files changed, 2 insertions(+)

diff --git a/conda/recipe/meta.yaml b/conda/recipe/meta.yaml
index 98625e81..5dba8804 100644
--- a/conda/recipe/meta.yaml
+++ b/conda/recipe/meta.yaml
@@ -40,6 +40,7 @@ test:
     - pytest
   imports:
     - httomolibgpu
+    - httomolibgpu.cupywrapper
     - httomolibgpu.misc
     - httomolibgpu.prep
     - httomolibgpu.recon
diff --git a/pyproject.toml b/pyproject.toml
index 282e66ac..2ca79416 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -5,6 +5,7 @@ build-backend = "setuptools.build_meta"
 [tool.setuptools]
 include-package-data = true
 packages = ["httomolibgpu",
+            "httomolibgpu.cupywrapper",
             "httomolibgpu.misc",
             "httomolibgpu.prep",
             "httomolibgpu.recon",

From aad6294a6788cc9ca8abf323288e1ad4f45b8050 Mon Sep 17 00:00:00 2001
From: dkazanc <dkazanc@hotmail.com>
Date: Wed, 1 May 2024 16:05:27 +0100
Subject: [PATCH 34/36] corr10

---
 conda/recipe/meta.yaml      |  1 -
 httomolibgpu/cupywrapper.py | 11 +++++++++++
 pyproject.toml              |  1 -
 3 files changed, 11 insertions(+), 2 deletions(-)
 create mode 100644 httomolibgpu/cupywrapper.py

diff --git a/conda/recipe/meta.yaml b/conda/recipe/meta.yaml
index 5dba8804..98625e81 100644
--- a/conda/recipe/meta.yaml
+++ b/conda/recipe/meta.yaml
@@ -40,7 +40,6 @@ test:
     - pytest
   imports:
     - httomolibgpu
-    - httomolibgpu.cupywrapper
     - httomolibgpu.misc
     - httomolibgpu.prep
     - httomolibgpu.recon
diff --git a/httomolibgpu/cupywrapper.py b/httomolibgpu/cupywrapper.py
new file mode 100644
index 00000000..a5b858cf
--- /dev/null
+++ b/httomolibgpu/cupywrapper.py
@@ -0,0 +1,11 @@
+cupy_run = False
+try:
+    import cupy as cp
+    try:
+        cp.cuda.Device(0).compute_capability
+        cupy_run = True
+    except cp.cuda.runtime.CUDARuntimeError:
+        print("CuPy library is a major dependency for HTTomolibgpu, please install")
+        import numpy as cp
+except ImportError:
+    import numpy as cp
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
index 2ca79416..282e66ac 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -5,7 +5,6 @@ build-backend = "setuptools.build_meta"
 [tool.setuptools]
 include-package-data = true
 packages = ["httomolibgpu",
-            "httomolibgpu.cupywrapper",
             "httomolibgpu.misc",
             "httomolibgpu.prep",
             "httomolibgpu.recon",

From 7e88035affd17bb71278177f159ab829f9c95126 Mon Sep 17 00:00:00 2001
From: dkazanc <dkazanc@hotmail.com>
Date: Wed, 1 May 2024 16:31:01 +0100
Subject: [PATCH 35/36] corr11

---
 httomolibgpu/misc/morph.py   | 86 +++++++++++++++++++-----------------
 httomolibgpu/misc/rescale.py | 70 +++++++++++++++++++++--------
 2 files changed, 98 insertions(+), 58 deletions(-)

diff --git a/httomolibgpu/misc/morph.py b/httomolibgpu/misc/morph.py
index 75fa8774..0a9c0b94 100644
--- a/httomolibgpu/misc/morph.py
+++ b/httomolibgpu/misc/morph.py
@@ -21,23 +21,8 @@
 """Module for data type morphing functions"""
 
 import numpy as np
-cupy_run = False
-try:
-    import cupy as xp
-
-    try:
-        xp.cuda.Device(0).compute_capability
-        cupy_run = True
-    except xp.cuda.runtime.CUDARuntimeError:
-        print("CuPy library is a major dependency for HTTomolibgpu, please install")
-        import numpy as xp
-except ImportError:
-    import numpy as xp
-
-if cupy_run:
-    from cupyx.scipy.interpolate import interpn
-else:
-    from scipy.interpolate import interpn
+from httomolibgpu import cupywrapper
+cp = cupywrapper.cp
 
 import nvtx
 from typing import Literal
@@ -47,11 +32,10 @@
     "data_resampler",
 ]
 
-
 @nvtx.annotate()
 def sino_360_to_180(
-    data: xp.ndarray, overlap: int = 0, rotation: Literal["left", "right"] = "left"
-) -> xp.ndarray:
+    data: cp.ndarray, overlap: int = 0, rotation: Literal["left", "right"] = "left"
+) -> cp.ndarray:
     """
     Converts 0-360 degrees sinogram to a 0-180 sinogram.
     If the number of projections in the input data is odd, the last projection
@@ -71,6 +55,17 @@ def sino_360_to_180(
     cp.ndarray
         Output 3D data.
     """
+    if cupywrapper.cupy_run:
+        return __sino_360_to_180(data, overlap, rotation)
+    else:
+        print("sino_360_to_180 won't be executed because CuPy is not installed")
+        return data
+
+
+def __sino_360_to_180(
+    data: cp.ndarray, overlap: int = 0, rotation: Literal["left", "right"] = "left"
+) -> cp.ndarray:
+
     if data.ndim != 3:
         raise ValueError("only 3D data is supported")
 
@@ -84,10 +79,10 @@ def sino_360_to_180(
 
     n = dx // 2
 
-    out = xp.empty((n, dy, 2 * dz - overlap), dtype=data.dtype)
+    out = cp.empty((n, dy, 2 * dz - overlap), dtype=data.dtype)
 
     if rotation == "left":
-        weights = xp.linspace(0, 1.0, overlap)
+        weights = cp.linspace(0, 1.0, overlap)
         out[:, :, -dz + overlap :] = data[:n, :, overlap:]
         out[:, :, : dz - overlap] = data[n : 2 * n, :, overlap:][:, :, ::-1]
         out[:, :, dz - overlap : dz] = (
@@ -95,7 +90,7 @@ def sino_360_to_180(
             + (weights * data[n : 2 * n, :, :overlap])[:, :, ::-1]
         )
     elif rotation == "right":
-        weights = xp.linspace(1.0, 0, overlap)
+        weights = cp.linspace(1.0, 0, overlap)
         out[:, :, : dz - overlap] = data[:n, :, :-overlap]
         out[:, :, -dz + overlap :] = data[n : 2 * n, :, :-overlap][:, :, ::-1]
         out[:, :, dz - overlap : dz] = (
@@ -110,8 +105,8 @@ def sino_360_to_180(
 
 @nvtx.annotate()
 def data_resampler(
-    data: xp.ndarray, newshape: list, axis: int = 1, interpolation: str = "linear"
-) -> xp.ndarray:
+    data: cp.ndarray, newshape: list, axis: int = 1, interpolation: str = "linear"
+) -> cp.ndarray:
     """Down/Up-resampler of the input data implemented through interpn function.
        Please note that the method will leave the specified axis
        dimension unchanged, e.g. (128,128,128) -> (128,256,256) for axis = 0 and
@@ -128,31 +123,42 @@ def data_resampler(
 
     Returns:
         cp.ndarray: Up/Down-scaled 3D cupy array
-    """   
+    """  
+    if cupywrapper.cupy_run:
+        return __data_resampler(data, newshape, axis, interpolation)
+    else:
+        print("data_resampler won't be executed because CuPy is not installed")
+        return data
+
+def __data_resampler(
+    data: cp.ndarray, newshape: list, axis: int = 1, interpolation: str = "linear"
+) -> cp.ndarray:
+    
+    from cupyx.scipy.interpolate import interpn
 
     if data.ndim != 3:
         raise ValueError("only 3D data is supported")
 
-    N, M, Z = xp.shape(data)
+    N, M, Z = cp.shape(data)
 
     if axis == 0:
-        xaxis = xp.arange(M) - M / 2
-        yaxis = xp.arange(Z) - Z / 2
+        xaxis = cp.arange(M) - M / 2
+        yaxis = cp.arange(Z) - Z / 2
         step_x = M / newshape[0]
         step_y = Z / newshape[1]
-        scaled_data = xp.empty((N, newshape[0], newshape[1]), dtype=xp.float32)
+        scaled_data = cp.empty((N, newshape[0], newshape[1]), dtype=cp.float32)
     elif axis == 1:
-        xaxis = xp.arange(N) - N / 2
-        yaxis = xp.arange(Z) - Z / 2
+        xaxis = cp.arange(N) - N / 2
+        yaxis = cp.arange(Z) - Z / 2
         step_x = N / newshape[0]
         step_y = Z / newshape[1]
-        scaled_data = xp.empty((newshape[0], M, newshape[1]), dtype=xp.float32)
+        scaled_data = cp.empty((newshape[0], M, newshape[1]), dtype=cp.float32)
     elif axis == 2:
-        xaxis = xp.arange(N) - N / 2
-        yaxis = xp.arange(M) - M / 2
+        xaxis = cp.arange(N) - N / 2
+        yaxis = cp.arange(M) - M / 2
         step_x = N / newshape[0]
         step_y = M / newshape[1]
-        scaled_data = xp.empty((newshape[0], newshape[1], Z), dtype=xp.float32)
+        scaled_data = cp.empty((newshape[0], newshape[1], Z), dtype=cp.float32)
     else:
         raise ValueError("Only 0,1,2 values for axes are supported")
 
@@ -181,7 +187,7 @@ def data_resampler(
     xi_size = xi.size
     xi = np.rollaxis(xi, 0, 3)
     xi = np.reshape(xi, [xi_size // 2, 2])
-    xi = xp.asarray(xi, dtype=xp.float32, order="C")
+    xi = cp.asarray(xi, dtype=cp.float32, order="C")
 
     if axis == 0:
         for j in range(N):
@@ -193,7 +199,7 @@ def data_resampler(
                 bounds_error=False,
                 fill_value=0.0,
             )
-            scaled_data[j, :, :] = xp.reshape(
+            scaled_data[j, :, :] = cp.reshape(
                 res, [newshape[0], newshape[1]], order="C"
             )
     elif axis == 1:
@@ -207,7 +213,7 @@ def data_resampler(
                 bounds_error=False,
                 fill_value=0.0,
             )
-            scaled_data[:, j, :] = xp.reshape(
+            scaled_data[:, j, :] = cp.reshape(
                 res, [newshape[0], newshape[1]], order="C"
             )
     else:
@@ -220,7 +226,7 @@ def data_resampler(
                 bounds_error=False,
                 fill_value=0.0,
             )
-            scaled_data[:, :, j] = xp.reshape(
+            scaled_data[:, :, j] = cp.reshape(
                 res, [newshape[0], newshape[1]], order="C"
             )
 
diff --git a/httomolibgpu/misc/rescale.py b/httomolibgpu/misc/rescale.py
index 274e17f4..506db058 100644
--- a/httomolibgpu/misc/rescale.py
+++ b/httomolibgpu/misc/rescale.py
@@ -11,7 +11,7 @@
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either ecpress or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ---------------------------------------------------------------------------
@@ -20,16 +20,8 @@
 # ---------------------------------------------------------------------------
 
 import numpy as np
-try:
-    import cupy as xp
-
-    try:
-        xp.cuda.Device(0).compute_capability
-    except xp.cuda.runtime.CUDARuntimeError:
-        print("CuPy library is a major dependency for HTTomolibgpu, please install")
-        import numpy as xp
-except ImportError:
-    import numpy as xp
+from httomolibgpu import cupywrapper
+cp = cupywrapper.cp
 
 import nvtx
 from typing import Literal, Optional, Tuple, Union
@@ -37,11 +29,53 @@
 __all__ = [
     "rescale_to_int",
 ]
-
-
 @nvtx.annotate()
 def rescale_to_int(
-    data: xp.ndarray,
+    data: cp.ndarray,
+    perc_range_min: float = 0.0,
+    perc_range_max: float = 100.0,
+    bits: Literal[8, 16, 32] = 8,
+    glob_stats: Optional[Tuple[float, float, float, int]] = None,
+):
+    """
+    Rescales the data and converts it fit into the range of an unsigned integer type
+    with the given number of bits.
+
+    Parameters
+    ----------
+    data : cp.ndarray
+        Required input data array, on GPU
+    perc_range_min: float, optional
+        The lower cutoff point in the input data, in percent of the data range (defaults to 0).
+        The lower bound is computed as min + perc_range_min/100*(max-min)
+    perc_range_max: float, optional
+        The upper cutoff point in the input data, in percent of the data range (defaults to 100).
+        The upper bound is computed as min + perc_range_max/100*(max-min)
+    bits: Literal[8, 16, 32], optional
+        The number of bits in the output integer range (defaults to 8).
+        Allowed values are:
+        - 8 -> uint8
+        - 16 -> uint16
+        - 32 -> uint32
+    glob_stats: tuple, optional
+        Global statistics of the full dataset (beyond the data passed into this call).
+        It's a tuple with (min, max, sum, num_items). If not given, the min/max is
+        computed from the given data.
+
+    Returns
+    -------
+    cp.ndarray
+        The original data, clipped to the range specified with the perc_range_min and
+        perc_range_max, and scaled to the full range of the output integer type
+    """
+    if cupywrapper.cupy_run:
+        return __rescale_to_int(data, perc_range_min, perc_range_max, bits, glob_stats)
+    else:
+        print("rescale_to_int won't be executed because CuPy is not installed")
+        return data    
+
+def __rescale_to_int(
+    data: cp.ndarray,
     perc_range_min: float = 0.0,
     perc_range_max: float = 100.0,
     bits: Literal[8, 16, 32] = 8,
@@ -91,8 +125,8 @@ def rescale_to_int(
     output_max = np.iinfo(output_dtype).max
 
     if not isinstance(glob_stats, tuple):
-        min_value = float(xp.min(data))
-        max_value = float(xp.max(data))
+        min_value = float(cp.min(data))
+        max_value = float(cp.max(data))
     else:
         min_value = glob_stats[0]
         max_value = glob_stats[1]
@@ -103,8 +137,8 @@ def rescale_to_int(
 
     factor = (output_max - output_min) / (input_max - input_min)
 
-    res = xp.empty(data.shape, dtype=output_dtype)
-    rescale_kernel = xp.ElementwiseKernel(
+    res = cp.empty(data.shape, dtype=output_dtype)
+    rescale_kernel = cp.ElementwiseKernel(
         "T x, raw T input_min, raw T input_max, raw T factor",
         "O out",
         """

From 73911e7ea58e7654e6f108ef46f60e81b9676357 Mon Sep 17 00:00:00 2001
From: dkazanc <dkazanc@hotmail.com>
Date: Thu, 2 May 2024 10:33:44 +0100
Subject: [PATCH 36/36] fully working local version

---
 httomolibgpu/__init__.py        |   6 +-
 httomolibgpu/cupywrapper.py     |  12 +-
 httomolibgpu/misc/corr.py       |   8 +-
 httomolibgpu/misc/morph.py      |  13 +-
 httomolibgpu/misc/rescale.py    |  10 +-
 httomolibgpu/prep/alignment.py  | 104 ++--
 httomolibgpu/prep/normalize.py  |  66 ++-
 httomolibgpu/prep/phase.py      | 125 +++--
 httomolibgpu/prep/stripe.py     | 221 ++++----
 httomolibgpu/recon/algorithm.py | 142 +++--
 httomolibgpu/recon/rotation.py  | 930 ++++++++++++++++----------------
 11 files changed, 917 insertions(+), 720 deletions(-)

diff --git a/httomolibgpu/__init__.py b/httomolibgpu/__init__.py
index ccb48651..e2181454 100644
--- a/httomolibgpu/__init__.py
+++ b/httomolibgpu/__init__.py
@@ -3,12 +3,12 @@
 from httomolibgpu.misc.rescale import rescale_to_int
 from httomolibgpu.prep.alignment import distortion_correction_proj_discorpy
 from httomolibgpu.prep.normalize import normalize
-#from httomolibgpu.prep.phase import paganin_filter_savu, paganin_filter_tomopy
+from httomolibgpu.prep.phase import paganin_filter_savu, paganin_filter_tomopy
 from httomolibgpu.prep.stripe import (
     remove_stripe_based_sorting,
     remove_stripe_ti,
     remove_all_stripe,
 )
+
 from httomolibgpu.recon.algorithm import FBP, SIRT, CGLS
-#from httomolibgpu.recon.rotation import find_center_vo, find_center_360, find_center_pc
-#from httomolibgpu.recon.rotation import find_center_vo
+from httomolibgpu.recon.rotation import find_center_vo, find_center_360, find_center_pc
diff --git a/httomolibgpu/cupywrapper.py b/httomolibgpu/cupywrapper.py
index a5b858cf..f3cccf61 100644
--- a/httomolibgpu/cupywrapper.py
+++ b/httomolibgpu/cupywrapper.py
@@ -1,11 +1,19 @@
 cupy_run = False
 try:
     import cupy as cp
+    import nvtx
+
     try:
         cp.cuda.Device(0).compute_capability
         cupy_run = True
     except cp.cuda.runtime.CUDARuntimeError:
         print("CuPy library is a major dependency for HTTomolibgpu, please install")
         import numpy as cp
-except ImportError:
-    import numpy as cp
\ No newline at end of file
+except ImportError as e:
+    print(
+        f"Failed to import module in {__file__} with error: {e}; defaulting to CPU-only mode"
+    )
+    from unittest.mock import Mock
+    import numpy as cp
+
+    nvtx = Mock()
diff --git a/httomolibgpu/misc/corr.py b/httomolibgpu/misc/corr.py
index 28da4b67..49fe1521 100644
--- a/httomolibgpu/misc/corr.py
+++ b/httomolibgpu/misc/corr.py
@@ -22,17 +22,18 @@
 
 import numpy as np
 from httomolibgpu import cupywrapper
+
 cp = cupywrapper.cp
 
+nvtx = cupywrapper.nvtx
 from numpy import float32
-import nvtx
 
 __all__ = [
     "median_filter",
     "remove_outlier",
 ]
 
-@nvtx.annotate()
+
 def median_filter(
     data: cp.ndarray,
     kernel_size: int = 3,
@@ -71,6 +72,7 @@ def median_filter(
         return data
 
 
+@nvtx.annotate()
 def __median_filter(
     data: cp.ndarray,
     kernel_size: int = 3,
@@ -157,7 +159,7 @@ def __median_filter(
         thresholding_kernel(data, float32(dif), output)
     return output
 
-@nvtx.annotate()
+
 def remove_outlier(
     data: cp.ndarray, kernel_size: int = 3, axis: int = 0, dif: float = 0.1
 ) -> cp.ndarray:
diff --git a/httomolibgpu/misc/morph.py b/httomolibgpu/misc/morph.py
index 0a9c0b94..098d1712 100644
--- a/httomolibgpu/misc/morph.py
+++ b/httomolibgpu/misc/morph.py
@@ -22,9 +22,10 @@
 
 import numpy as np
 from httomolibgpu import cupywrapper
+
 cp = cupywrapper.cp
 
-import nvtx
+nvtx = cupywrapper.nvtx
 from typing import Literal
 
 __all__ = [
@@ -32,7 +33,7 @@
     "data_resampler",
 ]
 
-@nvtx.annotate()
+
 def sino_360_to_180(
     data: cp.ndarray, overlap: int = 0, rotation: Literal["left", "right"] = "left"
 ) -> cp.ndarray:
@@ -62,6 +63,7 @@ def sino_360_to_180(
         return data
 
 
+@nvtx.annotate()
 def __sino_360_to_180(
     data: cp.ndarray, overlap: int = 0, rotation: Literal["left", "right"] = "left"
 ) -> cp.ndarray:
@@ -103,7 +105,6 @@ def __sino_360_to_180(
     return out
 
 
-@nvtx.annotate()
 def data_resampler(
     data: cp.ndarray, newshape: list, axis: int = 1, interpolation: str = "linear"
 ) -> cp.ndarray:
@@ -123,17 +124,19 @@ def data_resampler(
 
     Returns:
         cp.ndarray: Up/Down-scaled 3D cupy array
-    """  
+    """
     if cupywrapper.cupy_run:
         return __data_resampler(data, newshape, axis, interpolation)
     else:
         print("data_resampler won't be executed because CuPy is not installed")
         return data
 
+
+@nvtx.annotate()
 def __data_resampler(
     data: cp.ndarray, newshape: list, axis: int = 1, interpolation: str = "linear"
 ) -> cp.ndarray:
-    
+
     from cupyx.scipy.interpolate import interpn
 
     if data.ndim != 3:
diff --git a/httomolibgpu/misc/rescale.py b/httomolibgpu/misc/rescale.py
index 506db058..3cdb25f1 100644
--- a/httomolibgpu/misc/rescale.py
+++ b/httomolibgpu/misc/rescale.py
@@ -21,15 +21,17 @@
 
 import numpy as np
 from httomolibgpu import cupywrapper
+
 cp = cupywrapper.cp
 
-import nvtx
+nvtx = cupywrapper.nvtx
 from typing import Literal, Optional, Tuple, Union
 
 __all__ = [
     "rescale_to_int",
 ]
-@nvtx.annotate()
+
+
 def rescale_to_int(
     data: cp.ndarray,
     perc_range_min: float = 0.0,
@@ -72,8 +74,10 @@ def rescale_to_int(
         return __rescale_to_int(data, perc_range_min, perc_range_max, bits, glob_stats)
     else:
         print("rescale_to_int won't be executed because CuPy is not installed")
-        return data    
+        return data
+
 
+@nvtx.annotate()
 def __rescale_to_int(
     data: cp.ndarray,
     perc_range_min: float = 0.0,
diff --git a/httomolibgpu/prep/alignment.py b/httomolibgpu/prep/alignment.py
index 484488db..480db1af 100644
--- a/httomolibgpu/prep/alignment.py
+++ b/httomolibgpu/prep/alignment.py
@@ -21,26 +21,12 @@
 """Modules for data correction"""
 
 import numpy as np
-cupy_run = False
-try:
-    import cupy as xp
-
-    try:
-        xp.cuda.Device(0).compute_capability
-        cupy_run = True
-    except xp.cuda.runtime.CUDARuntimeError:
-        print("CuPy library is a major dependency for HTTomolibgpu, please install")
-        import numpy as xp
-except ImportError:
-    import numpy as xp
+from httomolibgpu import cupywrapper
 
-from typing import Dict, List
-import nvtx
+cp = cupywrapper.cp
+nvtx = cupywrapper.nvtx
 
-if cupy_run:
-    from cupyx.scipy.ndimage import map_coordinates
-else:
-    from scipy.ndimage import map_coordinates
+from typing import Dict, List
 
 __all__ = [
     "distortion_correction_proj_discorpy",
@@ -52,9 +38,56 @@
 # (which is the same as the TomoPy version
 # https://github.com/tomopy/tomopy/blob/c236a2969074f5fc70189fb5545f0a165924f916/source/tomopy/prep/alignment.py#L950-L981
 # but with the additional params `order` and `mode`).
-@nvtx.annotate()
 def distortion_correction_proj_discorpy(
-    data: xp.ndarray,
+    data: cp.ndarray,
+    metadata_path: str,
+    preview: Dict[str, List[int]],
+    order: int = 1,
+    mode: str = "reflect",
+):
+    """Unwarp a stack of images using a backward model.
+
+    Parameters
+    ----------
+    data : cp.ndarray
+        3D array.
+
+    metadata_path : str
+        The path to the file containing the distortion coefficients for the
+        data.
+
+    preview : Dict[str, List[int]]
+        A dict containing three key-value pairs:
+        - a list containing the `start` value of each dimension
+        - a list containing the `stop` value of each dimension
+        - a list containing the `step` value of each dimension
+
+    order : int, optional.
+        The order of the spline interpolation.
+
+    mode : {'reflect', 'grid-mirror', 'constant', 'grid-constant', 'nearest',
+           'mirror', 'grid-wrap', 'wrap'}, optional
+        To determine how to handle image boundaries.
+
+    Returns
+    -------
+    cp.ndarray
+        3D array. Distortion-corrected image(s).
+    """
+    if cupywrapper.cupy_run:
+        return __distortion_correction_proj_discorpy(
+            data, metadata_path, preview, order, mode
+        )
+    else:
+        print(
+            "distortion_correction_proj_discorpy won't be executed because CuPy is not installed"
+        )
+        return data
+
+
+@nvtx.annotate()
+def __distortion_correction_proj_discorpy(
+    data: cp.ndarray,
     metadata_path: str,
     preview: Dict[str, List[int]],
     order: int = 1,
@@ -89,9 +122,12 @@ def distortion_correction_proj_discorpy(
     cp.ndarray
         3D array. Distortion-corrected image(s).
     """
+
+    from cupyx.scipy.ndimage import map_coordinates
+
     # Check if it's a stack of 2D images, or only a single 2D image
     if len(data.shape) == 2:
-        data = xp.expand_dims(data, axis=0)
+        data = cp.expand_dims(data, axis=0)
 
     # Get info from metadata txt file
     xcenter, ycenter, list_fact = _load_metadata_txt(metadata_path)
@@ -118,26 +154,26 @@ def distortion_correction_proj_discorpy(
     ycenter = ycenter - y_offset
 
     height, width = data.shape[y_dim + 1], data.shape[x_dim + 1]
-    xu_list = xp.arange(width) - xcenter
-    yu_list = xp.arange(height) - ycenter
-    xu_mat, yu_mat = xp.meshgrid(xu_list, yu_list)
-    ru_mat = xp.sqrt(xu_mat**2 + yu_mat**2)
-    fact_mat = xp.sum(
-        xp.asarray([factor * ru_mat**i for i, factor in enumerate(list_fact)]), axis=0
+    xu_list = cp.arange(width) - xcenter
+    yu_list = cp.arange(height) - ycenter
+    xu_mat, yu_mat = cp.meshgrid(xu_list, yu_list)
+    ru_mat = cp.sqrt(xu_mat**2 + yu_mat**2)
+    fact_mat = cp.sum(
+        cp.asarray([factor * ru_mat**i for i, factor in enumerate(list_fact)]), axis=0
     )
-    xd_mat = xp.asarray(
-        xp.clip(xcenter + fact_mat * xu_mat, 0, width - 1), dtype=xp.float32
+    xd_mat = cp.asarray(
+        cp.clip(xcenter + fact_mat * xu_mat, 0, width - 1), dtype=cp.float32
     )
-    yd_mat = xp.asarray(
-        xp.clip(ycenter + fact_mat * yu_mat, 0, height - 1), dtype=xp.float32
+    yd_mat = cp.asarray(
+        cp.clip(ycenter + fact_mat * yu_mat, 0, height - 1), dtype=cp.float32
     )
-    indices = [xp.reshape(yd_mat, (-1, 1)), xp.reshape(xd_mat, (-1, 1))]
-    indices = xp.asarray(indices, dtype=xp.float32)
+    indices = [cp.reshape(yd_mat, (-1, 1)), cp.reshape(xd_mat, (-1, 1))]
+    indices = cp.asarray(indices, dtype=cp.float32)
 
     # Loop over images and unwarp them
     for i in range(data.shape[0]):
         mat = map_coordinates(data[i], indices, order=order, mode=mode)
-        mat = xp.reshape(mat, (height, width))
+        mat = cp.reshape(mat, (height, width))
         data[i] = mat
 
     return data
diff --git a/httomolibgpu/prep/normalize.py b/httomolibgpu/prep/normalize.py
index d8c1625a..bcbb2c20 100644
--- a/httomolibgpu/prep/normalize.py
+++ b/httomolibgpu/prep/normalize.py
@@ -20,41 +20,28 @@
 # ---------------------------------------------------------------------------
 """Modules for raw projection data normalization"""
 
-cupy_run = False
-try:
-    import cupy as xp    
-
-    try:
-        xp.cuda.Device(0).compute_capability
-        cupy_run = True
-    except xp.cuda.runtime.CUDARuntimeError:
-        print("CuPy library is a major dependency for HTTomolibgpu, please install")
-        import numpy as xp
-except ImportError:
-    import numpy as xp
-
-if cupy_run:
-    from cupy import mean
-else:
-    from numpy import mean
-import nvtx
 import numpy as np
+from httomolibgpu import cupywrapper
+
+cp = cupywrapper.cp
+
+nvtx = cupywrapper.nvtx
+
 from numpy import float32
 from typing import Tuple
 
 __all__ = ["normalize"]
 
 
-@nvtx.annotate()
 def normalize(
-    data: xp.ndarray,
-    flats: xp.ndarray,
-    darks: xp.ndarray,
+    data: cp.ndarray,
+    flats: cp.ndarray,
+    darks: cp.ndarray,
     cutoff: float = 10.0,
     minus_log: bool = True,
     nonnegativity: bool = False,
     remove_nans: bool = False,
-) -> xp.ndarray:
+) -> cp.ndarray:
     """
     Normalize raw projection data using the flat and dark field projections.
     This is a raw CUDA kernel implementation with CuPy wrappers.
@@ -81,12 +68,33 @@ def normalize(
     cp.ndarray
         Normalised 3D tomographic data as a CuPy array.
     """
+    if cupywrapper.cupy_run:
+        return __normalize(
+            data, flats, darks, cutoff, minus_log, nonnegativity, remove_nans
+        )
+    else:
+        print("normalize won't be executed because CuPy is not installed")
+        return data
+
+
+@nvtx.annotate()
+def __normalize(
+    data: cp.ndarray,
+    flats: cp.ndarray,
+    darks: cp.ndarray,
+    cutoff: float = 10.0,
+    minus_log: bool = True,
+    nonnegativity: bool = False,
+    remove_nans: bool = False,
+) -> cp.ndarray:
+
+    from cupy import mean
 
     _check_valid_input(data, flats, darks)
 
-    dark0 = xp.empty(darks.shape[1:], dtype=float32)
-    flat0 = xp.empty(flats.shape[1:], dtype=float32)
-    out = xp.empty(data.shape, dtype=float32)
+    dark0 = cp.empty(darks.shape[1:], dtype=float32)
+    flat0 = cp.empty(flats.shape[1:], dtype=float32)
+    out = cp.empty(data.shape, dtype=float32)
     mean(darks, axis=0, dtype=float32, out=dark0)
     mean(flats, axis=0, dtype=float32, out=flat0)
 
@@ -111,7 +119,7 @@ def normalize(
     kernel += "if (v > cutoff) v = cutoff;\n"
     kernel += "out = v;\n"
 
-    normalisation_kernel = xp.ElementwiseKernel(
+    normalisation_kernel = cp.ElementwiseKernel(
         "T data, U flats, U darks, raw float32 cutoff",
         "float32 out",
         kernel,
@@ -138,6 +146,6 @@ def _check_valid_input(data, flats, darks) -> None:
         raise ValueError("Input darks must be 2D or 3D data only")
 
     if flats.ndim == 2:
-        flats = flats[xp.newaxis, :, :]
+        flats = flats[cp.newaxis, :, :]
     if darks.ndim == 2:
-        darks = darks[xp.newaxis, :, :]
+        darks = darks[cp.newaxis, :, :]
diff --git a/httomolibgpu/prep/phase.py b/httomolibgpu/prep/phase.py
index 54e69e45..a6febe10 100644
--- a/httomolibgpu/prep/phase.py
+++ b/httomolibgpu/prep/phase.py
@@ -21,21 +21,14 @@
 """Modules for phase retrieval and phase-contrast enhancement"""
 
 import numpy as np
-cupy_run = False
-try:
-    import cupy as xp
-    try:
-        xp.cuda.Device(0).compute_capability
-        cupy_run = True
-    except xp.cuda.runtime.CUDARuntimeError:
-        print("CuPy library is a major dependency for HTTomolibgpu, please install")
-        import numpy as np
-except ImportError:
-    import numpy as np
+from httomolibgpu import cupywrapper
+
+cp = cupywrapper.cp
+
+nvtx = cupywrapper.nvtx
 
 from numpy import float32
 from typing import Union
-import nvtx
 import math
 
 __all__ = [
@@ -43,11 +36,11 @@
     "paganin_filter_tomopy",
 ]
 
+
 ## %%%%%%%%%%%%%%%%%%%%%%% paganin_filter %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%  ##
 #: CuPy implementation of Paganin filter from Savu
-@nvtx.annotate()
 def paganin_filter_savu(
-    data: xp.ndarray,
+    data: cp.ndarray,
     ratio: float = 250.0,
     energy: float = 53.0,
     distance: float = 1.0,
@@ -56,7 +49,7 @@ def paganin_filter_savu(
     pad_x: int = 100,
     pad_method: str = "edge",
     increment: float = 0.0,
-) -> xp.ndarray:
+) -> cp.ndarray:
     """
     Apply Paganin filter (for denoising or contrast enhancement) to
     projections.
@@ -94,15 +87,27 @@ def paganin_filter_savu(
     -------
     cp.ndarray
         The stack of filtered projections.
-    """   
-    if cupy_run:
-        return __paganin_filter_savu(data, ratio, energy, distance, resolution,  pad_y, pad_x, pad_method, increment)
+    """
+    if cupywrapper.cupy_run:
+        return __paganin_filter_savu(
+            data,
+            ratio,
+            energy,
+            distance,
+            resolution,
+            pad_y,
+            pad_x,
+            pad_method,
+            increment,
+        )
     else:
         print("__paganin_filter_savu won't be executed because CuPy is not installed")
         return data
 
+
+@nvtx.annotate()
 def __paganin_filter_savu(
-    data: xp.ndarray,
+    data: cp.ndarray,
     ratio: float = 250.0,
     energy: float = 53.0,
     distance: float = 1.0,
@@ -111,8 +116,8 @@ def __paganin_filter_savu(
     pad_x: int = 100,
     pad_method: str = "edge",
     increment: float = 0.0,
-) -> xp.ndarray:
-    
+) -> cp.ndarray:
+
     from httomolibgpu.cuda_kernels import load_cuda_module
     from cupyx.scipy.fft import fft2, ifft2
 
@@ -144,10 +149,10 @@ def __paganin_filter_savu(
 
     # Apply padding to all the 2D projections
     # Note: this takes considerable time on GPU...
-    data = xp.pad(data, ((0, 0), (pad_y, pad_y), (pad_x, pad_x)), mode=pad_method)
+    data = cp.pad(data, ((0, 0), (pad_y, pad_y), (pad_x, pad_x)), mode=pad_method)
 
     # Define array to hold result, which will not have the padding applied to it
-    precond_kernel_float = xp.ElementwiseKernel(
+    precond_kernel_float = cp.ElementwiseKernel(
         "T data",
         "T out",
         """
@@ -164,7 +169,7 @@ def __paganin_filter_savu(
         name="paganin_precond_float",
         no_return=True,
     )
-    precond_kernel_int = xp.ElementwiseKernel(
+    precond_kernel_int = cp.ElementwiseKernel(
         "T data",
         "T out",
         """out = data == 0 ? 1 : data""",
@@ -172,17 +177,17 @@ def __paganin_filter_savu(
         no_return=True,
     )
 
-    if data.dtype in (xp.float32, xp.float64):
+    if data.dtype in (cp.float32, cp.float64):
         precond_kernel_float(data, data)
     else:
         precond_kernel_int(data, data)
 
     # avoid normalising in both directions - we include multiplier in the post_kernel
-    data = xp.asarray(data, dtype=xp.complex64)
+    data = cp.asarray(data, dtype=cp.complex64)
     data = fft2(data, axes=(-2, -1), overwrite_x=True, norm="backward")
 
     # prepare filter here, while the GPU is busy with the FFT
-    filtercomplex = xp.empty((height1, width1), dtype=xp.complex64)
+    filtercomplex = cp.empty((height1, width1), dtype=cp.complex64)
     bx = 16
     by = 8
     gx = (width1 + bx - 1) // bx
@@ -191,12 +196,12 @@ def __paganin_filter_savu(
         grid=(gx, gy, 1),
         block=(bx, by, 1),
         args=(
-            xp.int32(width1),
-            xp.int32(height1),
-            xp.float32(resolution),
-            xp.float32(wavelength),
-            xp.float32(distance),
-            xp.float32(ratio),
+            cp.int32(width1),
+            cp.int32(height1),
+            cp.float32(resolution),
+            cp.float32(wavelength),
+            cp.float32(distance),
+            cp.float32(ratio),
             filtercomplex,
         ),
     )
@@ -204,7 +209,7 @@ def __paganin_filter_savu(
 
     data = ifft2(data, axes=(-2, -1), overwrite_x=True, norm="forward")
 
-    post_kernel = xp.ElementwiseKernel(
+    post_kernel = cp.ElementwiseKernel(
         "C pci1, raw float32 increment, raw float32 ratio, raw float32 fft_scale",
         "T out",
         "out = -0.5 * ratio * log(abs(pci1) * fft_scale + increment)",
@@ -212,7 +217,7 @@ def __paganin_filter_savu(
         no_return=True,
     )
     fft_scale = 1.0 / (data.shape[1] * data.shape[2])
-    res = xp.empty((data.shape[0], height, width), dtype=xp.float32)
+    res = cp.empty((data.shape[0], height, width), dtype=cp.float32)
     post_kernel(
         data[:, pad_y : pad_y + height, pad_x : pad_x + width],
         np.float32(increment),
@@ -222,12 +227,14 @@ def __paganin_filter_savu(
     )
     return res
 
+
 def _wavelength(energy: float) -> float:
     SPEED_OF_LIGHT = 299792458e2  # [cm/s]
     PLANCK_CONSTANT = 6.58211928e-19  # [keV*s]
     return 2 * math.pi * PLANCK_CONSTANT * SPEED_OF_LIGHT / energy
 
-def _reciprocal_grid(pixel_size: float, shape_proj: tuple) -> xp.ndarray:
+
+def _reciprocal_grid(pixel_size: float, shape_proj: tuple) -> cp.ndarray:
     """
     Calculate reciprocal grid.
 
@@ -246,12 +253,13 @@ def _reciprocal_grid(pixel_size: float, shape_proj: tuple) -> xp.ndarray:
     # Sampling in reciprocal space.
     indx = _reciprocal_coord(pixel_size, shape_proj[0])
     indy = _reciprocal_coord(pixel_size, shape_proj[1])
-    indx_sq = xp.square(indx)
-    indy_sq = xp.square(indy)
+    indx_sq = cp.square(indx)
+    indy_sq = cp.square(indy)
+
+    return cp.add.outer(indx_sq, indy_sq)
 
-    return xp.add.outer(indx_sq, indy_sq)
 
-def _reciprocal_coord(pixel_size: float, num_grid: int) -> xp.ndarray:
+def _reciprocal_coord(pixel_size: float, num_grid: int) -> cp.ndarray:
     """
     Calculate reciprocal grid coordinates for a given pixel size
     and discretization.
@@ -269,22 +277,21 @@ def _reciprocal_coord(pixel_size: float, num_grid: int) -> xp.ndarray:
         Grid coordinates.
     """
     n = num_grid - 1
-    rc = xp.arange(-n, num_grid, 2, dtype=xp.float32)
+    rc = cp.arange(-n, num_grid, 2, dtype=cp.float32)
     rc *= 2 * math.pi / (n * pixel_size)
     return rc
 
+
 ##-------------------------------------------------------------##
 ##-------------------------------------------------------------##
-# Adaptation with some corrections of retrieve_phase (Paganin filter)
-# from TomoPy
-@nvtx.annotate()
+# Adaptation of retrieve_phase (Paganin filter) from TomoPy
 def paganin_filter_tomopy(
-    tomo: xp.ndarray,
+    tomo: cp.ndarray,
     pixel_size: float = 1e-4,
     dist: float = 50.0,
     energy: float = 53.0,
     alpha: float = 1e-3,
-) -> xp.ndarray:
+) -> cp.ndarray:
     """
     Perform single-material phase retrieval from flats/darks corrected tomographic measurements
     :cite:`Paganin:02`.
@@ -307,19 +314,21 @@ def paganin_filter_tomopy(
     cp.ndarray
         The 3D array of Paganin phase-filtered projection images.
     """
-    if cupy_run:
+    if cupywrapper.cupy_run:
         return __paganin_filter_tomopy(tomo, pixel_size, dist, energy, alpha)
     else:
         print("paganin_filter_tomopy won't be executed because CuPy is not installed")
         return tomo
 
+
+@nvtx.annotate()
 def __paganin_filter_tomopy(
-    tomo: xp.ndarray,
+    tomo: cp.ndarray,
     pixel_size: float = 1e-4,
     dist: float = 50.0,
     energy: float = 53.0,
     alpha: float = 1e-3,
-) -> xp.ndarray:
+) -> cp.ndarray:
 
     from cupyx.scipy.fft import fft2, ifft2, fftshift
 
@@ -339,16 +348,14 @@ def __paganin_filter_tomopy(
     dz, dy, dx = padded_tomo.shape
 
     # 3D FFT of tomo data
-    padded_tomo = xp.asarray(padded_tomo, dtype=xp.complex64)
+    padded_tomo = cp.asarray(padded_tomo, dtype=cp.complex64)
     fft_tomo = fft2(padded_tomo, axes=(-2, -1), overwrite_x=True)
 
     # Compute the reciprocal grid.
     w2 = _reciprocal_grid(pixel_size, (dy, dx))
 
     # Build filter in the Fourier space.
-    phase_filter = fftshift(
-        _paganin_filter_factor2(energy, dist, alpha, w2)
-    )
+    phase_filter = fftshift(_paganin_filter_factor2(energy, dist, alpha, w2))
     phase_filter = phase_filter / phase_filter.max()  # normalisation
 
     # Apply filter and take inverse FFT
@@ -364,10 +371,10 @@ def __paganin_filter_tomopy(
     )
 
     # crop the padded filtered data:
-    tomo = ifft_filtered_tomo[slc_indices].astype(xp.float32)
+    tomo = ifft_filtered_tomo[slc_indices].astype(cp.float32)
 
     # taking the negative log
-    _log_kernel = xp.ElementwiseKernel(
+    _log_kernel = cp.ElementwiseKernel(
         "C tomo",
         "C out",
         "out = -log(tomo)",
@@ -380,7 +387,8 @@ def __paganin_filter_tomopy(
 def _shift_bit_length(x: int) -> int:
     return 1 << (x - 1).bit_length()
 
-def _pad_projections_to_second_power(tomo: xp.ndarray) -> Union[xp.ndarray, tuple]:
+
+def _pad_projections_to_second_power(tomo: cp.ndarray) -> Union[cp.ndarray, tuple]:
     """
     Performs padding of each projection to the next power of 2.
     If the shape is not even we also care of that before padding.
@@ -395,7 +403,7 @@ def _pad_projections_to_second_power(tomo: xp.ndarray) -> Union[xp.ndarray, tupl
     ndarray: padded 3d projection data
     tuple: a tuple with padding dimensions
     """
-    full_shape_tomo = xp.shape(tomo)
+    full_shape_tomo = cp.shape(tomo)
 
     pad_tup = []
     for index, element in enumerate(full_shape_tomo):
@@ -414,10 +422,11 @@ def _pad_projections_to_second_power(tomo: xp.ndarray) -> Union[xp.ndarray, tupl
 
         pad_tup.append(pad_width)
 
-    padded_tomo = xp.pad(tomo, tuple(pad_tup), "edge")
+    padded_tomo = cp.pad(tomo, tuple(pad_tup), "edge")
 
     return padded_tomo, pad_tup
 
+
 def _paganin_filter_factor2(energy, dist, alpha, w2):
     # Alpha represents the ratio of delta/beta.
     return 1 / (_wavelength(energy) * dist * w2 / (4 * math.pi) + alpha)
diff --git a/httomolibgpu/prep/stripe.py b/httomolibgpu/prep/stripe.py
index f55a3e31..46ba04a0 100644
--- a/httomolibgpu/prep/stripe.py
+++ b/httomolibgpu/prep/stripe.py
@@ -21,25 +21,11 @@
 """Modules for stripes removal"""
 
 import numpy as np
-cupy_run = False
-try:
-    import cupy as xp
-
-    try:
-        xp.cuda.Device(0).compute_capability
-        cupy_run = True
-    except xp.cuda.runtime.CUDARuntimeError:
-        print("CuPy library is a major dependency for HTTomolibgpu, please install")
-        import numpy as xp
-except ImportError:
-    import numpy as xp
-
-if cupy_run:
-    from cupyx.scipy.ndimage import median_filter, binary_dilation, uniform_filter1d
-else:
-    from scipy.ndimage import median_filter, binary_dilation, uniform_filter1d
-
-import nvtx
+from httomolibgpu import cupywrapper
+
+cp = cupywrapper.cp
+
+nvtx = cupywrapper.nvtx
 from typing import Union
 
 __all__ = [
@@ -49,12 +35,11 @@
 ]
 
 
-@nvtx.annotate()
 def remove_stripe_based_sorting(
-    data: Union[xp.ndarray, np.ndarray],
+    data: Union[cp.ndarray, np.ndarray],
     size: int = 11,
     dim: int = 1,
-) -> Union[xp.ndarray, np.ndarray]:
+) -> Union[cp.ndarray, np.ndarray]:
     """
     Remove full and partial stripe artifacts from sinogram using Nghia Vo's
     approach, algorithm 3 in Ref. [1]. Angular direction is along the axis 0.
@@ -84,6 +69,21 @@ def remove_stripe_based_sorting(
     ----------
     .. [1] https://doi.org/10.1364/OE.26.028396
     """
+    if cupywrapper.cupy_run:
+        return __remove_stripe_based_sorting(data, size, dim)
+    else:
+        print(
+            "remove_stripe_based_sorting won't be executed because CuPy is not installed"
+        )
+        return data
+
+
+@nvtx.annotate()
+def __remove_stripe_based_sorting(
+    data: Union[cp.ndarray, np.ndarray],
+    size: int = 11,
+    dim: int = 1,
+) -> Union[cp.ndarray, np.ndarray]:
 
     if size is None:
         if data.shape[2] > 2000:
@@ -102,28 +102,29 @@ def _rs_sort(sinogram, size, dim):
     """
     Remove stripes using the sorting technique.
     """
-    sinogram = xp.transpose(sinogram)
+    from cupyx.scipy.ndimage import median_filter
+
+    sinogram = cp.transpose(sinogram)
 
     #: Sort each column of the sinogram by its grayscale values
     #: Keep track of the sorting indices so we can reverse it below
-    sortvals = xp.argsort(sinogram, axis=1)
-    sortvals_reverse = xp.argsort(sortvals, axis=1)
-    sino_sort = xp.take_along_axis(sinogram, sortvals, axis=1)
+    sortvals = cp.argsort(sinogram, axis=1)
+    sortvals_reverse = cp.argsort(sortvals, axis=1)
+    sino_sort = cp.take_along_axis(sinogram, sortvals, axis=1)
 
     #: Now apply the median filter on the sorted image along each row
     sino_sort = median_filter(sino_sort, (size, 1) if dim == 1 else (size, size))
 
     #: step 3: re-sort the smoothed image columns to the original rows
-    sino_corrected = xp.take_along_axis(sino_sort, sortvals_reverse, axis=1)
+    sino_corrected = cp.take_along_axis(sino_sort, sortvals_reverse, axis=1)
 
-    return xp.transpose(sino_corrected)
+    return cp.transpose(sino_corrected)
 
 
-@nvtx.annotate()
 def remove_stripe_ti(
-    data: Union[xp.ndarray, np.ndarray],
+    data: Union[cp.ndarray, np.ndarray],
     beta: float = 0.1,
-) -> Union[xp.ndarray, np.ndarray]:
+) -> Union[cp.ndarray, np.ndarray]:
     """
     Removes stripes with the method of V. Titarenko (TomoCuPy implementation)
 
@@ -140,14 +141,27 @@ def remove_stripe_ti(
     ndarray
         3D array of de-striped projections.
     """
+    if cupywrapper.cupy_run:
+        return __remove_stripe_ti(data, beta)
+    else:
+        print("remove_stripe_ti won't be executed because CuPy is not installed")
+        return data
+
+
+@nvtx.annotate()
+def __remove_stripe_ti(
+    data: Union[cp.ndarray, np.ndarray],
+    beta: float = 0.1,
+) -> Union[cp.ndarray, np.ndarray]:
+
     # TODO: detector dimensions must be even otherwise error
-    gamma = beta * ((1 - beta) / (1 + beta)) ** xp.abs(
-        xp.fft.fftfreq(data.shape[-1]) * data.shape[-1]
+    gamma = beta * ((1 - beta) / (1 + beta)) ** cp.abs(
+        cp.fft.fftfreq(data.shape[-1]) * data.shape[-1]
     )
     gamma[0] -= 1
-    v = xp.mean(data, axis=0)
+    v = cp.mean(data, axis=0)
     v = v - v[:, 0:1]
-    v = xp.fft.irfft(xp.fft.rfft(v) * xp.fft.rfft(gamma)).astype(data.dtype)
+    v = cp.fft.irfft(cp.fft.rfft(v) * cp.fft.rfft(gamma)).astype(data.dtype)
     data[:] += v
     return data
 
@@ -176,14 +190,13 @@ def remove_stripe_ti(
 #                                                                             #
 #                                                                             #
 # *************************************************************************** #
-@nvtx.annotate()
 def remove_all_stripe(
-    data: xp.ndarray,
+    data: cp.ndarray,
     snr: float = 3.0,
     la_size: int = 61,
     sm_size: int = 21,
     dim: int = 1,
-) -> xp.ndarray:
+) -> cp.ndarray:
     """
     Remove all types of stripe artifacts from sinogram using Nghia Vo's
     approach :cite:`Vo:18` (combination of algorithm 3,4,5, and 6).
@@ -212,6 +225,22 @@ def remove_all_stripe(
     .. [1] https://doi.org/10.1364/OE.26.028396
 
     """
+    if cupywrapper.cupy_run:
+        return __remove_all_stripe(data, snr, la_size, sm_size, dim)
+    else:
+        print("remove_all_stripe won't be executed because CuPy is not installed")
+        return data
+
+
+@nvtx.annotate()
+def __remove_all_stripe(
+    data: cp.ndarray,
+    snr: float = 3.0,
+    la_size: int = 61,
+    sm_size: int = 21,
+    dim: int = 1,
+) -> cp.ndarray:
+
     matindex = _create_matindex(data.shape[2], data.shape[0])
     for m in range(data.shape[1]):
         sino = data[:, m, :]
@@ -226,38 +255,40 @@ def _rs_sort2(sinogram, size, matindex, dim):
     """
     Remove stripes using the sorting technique.
     """
-    sinogram = xp.transpose(sinogram)
-    matcomb = xp.asarray(xp.dstack((matindex, sinogram)))
+    from cupyx.scipy.ndimage import median_filter
+
+    sinogram = cp.transpose(sinogram)
+    matcomb = cp.asarray(cp.dstack((matindex, sinogram)))
 
-    # matsort = xp.asarray([row[row[:, 1].argsort()] for row in matcomb])
-    ids = xp.argsort(matcomb[:, :, 1], axis=1)
+    # matsort = cp.asarray([row[row[:, 1].argsort()] for row in matcomb])
+    ids = cp.argsort(matcomb[:, :, 1], axis=1)
     matsort = matcomb.copy()
-    matsort[:, :, 0] = xp.take_along_axis(matsort[:, :, 0], ids, axis=1)
-    matsort[:, :, 1] = xp.take_along_axis(matsort[:, :, 1], ids, axis=1)
+    matsort[:, :, 0] = cp.take_along_axis(matsort[:, :, 0], ids, axis=1)
+    matsort[:, :, 1] = cp.take_along_axis(matsort[:, :, 1], ids, axis=1)
     if dim == 1:
         matsort[:, :, 1] = median_filter(matsort[:, :, 1], (size, 1))
     else:
         matsort[:, :, 1] = median_filter(matsort[:, :, 1], (size, size))
 
-    # matsortback = xp.asarray([row[row[:, 0].argsort()] for row in matsort])
+    # matsortback = cp.asarray([row[row[:, 0].argsort()] for row in matsort])
 
-    ids = xp.argsort(matsort[:, :, 0], axis=1)
+    ids = cp.argsort(matsort[:, :, 0], axis=1)
     matsortback = matsort.copy()
-    matsortback[:, :, 0] = xp.take_along_axis(matsortback[:, :, 0], ids, axis=1)
-    matsortback[:, :, 1] = xp.take_along_axis(matsortback[:, :, 1], ids, axis=1)
+    matsortback[:, :, 0] = cp.take_along_axis(matsortback[:, :, 0], ids, axis=1)
+    matsortback[:, :, 1] = cp.take_along_axis(matsortback[:, :, 1], ids, axis=1)
 
     sino_corrected = matsortback[:, :, 1]
-    return xp.transpose(sino_corrected)
+    return cp.transpose(sino_corrected)
 
 
 @nvtx.annotate()
 def _mpolyfit(x, y):
     n = len(x)
-    x_mean = xp.mean(x)
-    y_mean = xp.mean(y)
+    x_mean = cp.mean(x)
+    y_mean = cp.mean(y)
 
-    Sxy = xp.sum(x * y) - n * x_mean * y_mean
-    Sxx = xp.sum(x * x) - n * x_mean * x_mean
+    Sxy = cp.sum(x * y) - n * x_mean * y_mean
+    Sxx = cp.sum(x * x) - n * x_mean * x_mean
 
     slope = Sxy / Sxx
     intercept = y_mean - slope * x_mean
@@ -269,22 +300,23 @@ def _detect_stripe(listdata, snr):
     """
     Algorithm 4 in :cite:`Vo:18`. Used to locate stripes.
     """
+
     numdata = len(listdata)
-    listsorted = xp.sort(listdata)[::-1]
-    xlist = xp.arange(0, numdata, 1.0)
-    ndrop = xp.int16(0.25 * numdata)
-    # (_slope, _intercept) = xp.polyfit(xlist[ndrop:-ndrop - 1],
+    listsorted = cp.sort(listdata)[::-1]
+    xlist = cp.arange(0, numdata, 1.0)
+    ndrop = cp.int16(0.25 * numdata)
+    # (_slope, _intercept) = cp.polyfit(xlist[ndrop:-ndrop - 1],
     #   listsorted[ndrop:-ndrop - 1], 1)
     (_slope, _intercept) = _mpolyfit(
         xlist[ndrop : -ndrop - 1], listsorted[ndrop : -ndrop - 1]
     )
 
     numt1 = _intercept + _slope * xlist[-1]
-    noiselevel = xp.abs(numt1 - _intercept)
-    noiselevel = xp.clip(noiselevel, 1e-6, None)
-    val1 = xp.abs(listsorted[0] - _intercept) / noiselevel
-    val2 = xp.abs(listsorted[-1] - numt1) / noiselevel
-    listmask = xp.zeros_like(listdata)
+    noiselevel = cp.abs(numt1 - _intercept)
+    noiselevel = cp.clip(noiselevel, 1e-6, None)
+    val1 = cp.abs(listsorted[0] - _intercept) / noiselevel
+    val2 = cp.abs(listsorted[-1] - numt1) / noiselevel
+    listmask = cp.zeros_like(listdata)
     if val1 >= snr:
         upper_thresh = _intercept + noiselevel * snr * 0.5
         listmask[listdata > upper_thresh] = 1.0
@@ -299,16 +331,19 @@ def _rs_large(sinogram, snr, size, matindex, drop_ratio=0.1, norm=True):
     """
     Remove large stripes.
     """
-    drop_ratio = max(min(drop_ratio, 0.8), 0)  # = xp.clip(drop_ratio, 0.0, 0.8)
+    from cupyx.scipy.ndimage import median_filter
+    from cupyx.scipy.ndimage import binary_dilation
+
+    drop_ratio = max(min(drop_ratio, 0.8), 0)  # = cp.clip(drop_ratio, 0.0, 0.8)
     (nrow, ncol) = sinogram.shape
     ndrop = int(0.5 * drop_ratio * nrow)
-    sinosort = xp.sort(sinogram, axis=0)
+    sinosort = cp.sort(sinogram, axis=0)
     sinosmooth = median_filter(sinosort, (1, size))
-    list1 = xp.mean(sinosort[ndrop : nrow - ndrop], axis=0)
-    list2 = xp.mean(sinosmooth[ndrop : nrow - ndrop], axis=0)
-    # listfact = xp.divide(list1,
+    list1 = cp.mean(sinosort[ndrop : nrow - ndrop], axis=0)
+    list2 = cp.mean(sinosmooth[ndrop : nrow - ndrop], axis=0)
+    # listfact = cp.divide(list1,
     #                      list2,
-    #                      out=xp.ones_like(list1),
+    #                      out=cp.ones_like(list1),
     #                      where=list2 != 0)
 
     listfact = list1 / list2
@@ -316,28 +351,28 @@ def _rs_large(sinogram, snr, size, matindex, drop_ratio=0.1, norm=True):
     # Locate stripes
     listmask = _detect_stripe(listfact, snr)
     listmask = binary_dilation(listmask, iterations=1).astype(listmask.dtype)
-    matfact = xp.tile(listfact, (nrow, 1))
+    matfact = cp.tile(listfact, (nrow, 1))
     # Normalize
     if norm is True:
         sinogram = sinogram / matfact
-    sinogram1 = xp.transpose(sinogram)
-    matcombine = xp.asarray(xp.dstack((matindex, sinogram1)))
+    sinogram1 = cp.transpose(sinogram)
+    matcombine = cp.asarray(cp.dstack((matindex, sinogram1)))
 
-    # matsort = xp.asarray([row[row[:, 1].argsort()] for row in matcombine])
-    ids = xp.argsort(matcombine[:, :, 1], axis=1)
+    # matsort = cp.asarray([row[row[:, 1].argsort()] for row in matcombine])
+    ids = cp.argsort(matcombine[:, :, 1], axis=1)
     matsort = matcombine.copy()
-    matsort[:, :, 0] = xp.take_along_axis(matsort[:, :, 0], ids, axis=1)
-    matsort[:, :, 1] = xp.take_along_axis(matsort[:, :, 1], ids, axis=1)
+    matsort[:, :, 0] = cp.take_along_axis(matsort[:, :, 0], ids, axis=1)
+    matsort[:, :, 1] = cp.take_along_axis(matsort[:, :, 1], ids, axis=1)
 
-    matsort[:, :, 1] = xp.transpose(sinosmooth)
-    # matsortback = xp.asarray([row[row[:, 0].argsort()] for row in matsort])
-    ids = xp.argsort(matsort[:, :, 0], axis=1)
+    matsort[:, :, 1] = cp.transpose(sinosmooth)
+    # matsortback = cp.asarray([row[row[:, 0].argsort()] for row in matsort])
+    ids = cp.argsort(matsort[:, :, 0], axis=1)
     matsortback = matsort.copy()
-    matsortback[:, :, 0] = xp.take_along_axis(matsortback[:, :, 0], ids, axis=1)
-    matsortback[:, :, 1] = xp.take_along_axis(matsortback[:, :, 1], ids, axis=1)
+    matsortback[:, :, 0] = cp.take_along_axis(matsortback[:, :, 0], ids, axis=1)
+    matsortback[:, :, 1] = cp.take_along_axis(matsortback[:, :, 1], ids, axis=1)
 
-    sino_corrected = xp.transpose(matsortback[:, :, 1])
-    listxmiss = xp.where(listmask > 0.0)[0]
+    sino_corrected = cp.transpose(matsortback[:, :, 1])
+    listxmiss = cp.where(listmask > 0.0)[0]
     sinogram[:, listxmiss] = sino_corrected[:, listxmiss]
     return sinogram
 
@@ -347,12 +382,16 @@ def _rs_dead(sinogram, snr, size, matindex, norm=True):
     """
     Remove unresponsive and fluctuating stripes.
     """
-    sinogram = xp.copy(sinogram)  # Make it mutable
+    from cupyx.scipy.ndimage import median_filter
+    from cupyx.scipy.ndimage import binary_dilation
+    from cupyx.scipy.ndimage import uniform_filter1d
+
+    sinogram = cp.copy(sinogram)  # Make it mutable
     (nrow, _) = sinogram.shape
-    # sinosmooth = xp.apply_along_axis(uniform_filter1d, 0, sinogram, 10)
+    # sinosmooth = cp.apply_along_axis(uniform_filter1d, 0, sinogram, 10)
     sinosmooth = uniform_filter1d(sinogram, 10, axis=0)
 
-    listdiff = xp.sum(xp.abs(sinogram - sinosmooth), axis=0)
+    listdiff = cp.sum(cp.abs(sinogram - sinosmooth), axis=0)
     listdiffbck = median_filter(listdiff, size)
 
     listfact = listdiff / listdiffbck
@@ -361,16 +400,16 @@ def _rs_dead(sinogram, snr, size, matindex, norm=True):
     listmask = binary_dilation(listmask, iterations=1).astype(listmask.dtype)
     listmask[0:2] = 0.0
     listmask[-2:] = 0.0
-    listx = xp.where(listmask < 1.0)[0]
-    listy = xp.arange(nrow)
+    listx = cp.where(listmask < 1.0)[0]
+    listy = cp.arange(nrow)
     matz = sinogram[:, listx]
 
-    listxmiss = xp.where(listmask > 0.0)[0]
+    listxmiss = cp.where(listmask > 0.0)[0]
 
     # finter = interpolate.interp2d(listx.get(), listy.get(), matz.get(), kind='linear')
     if len(listxmiss) > 0:
         # sinogram_c[:, listxmiss.get()] = finter(listxmiss.get(), listy.get())
-        ids = xp.searchsorted(listx, listxmiss)
+        ids = cp.searchsorted(listx, listxmiss)
         sinogram[:, listxmiss] = matz[:, ids - 1] + (listxmiss - listx[ids - 1]) * (
             matz[:, ids] - matz[:, ids - 1]
         ) / (listx[ids] - listx[ids - 1])
@@ -386,6 +425,6 @@ def _create_matindex(nrow, ncol):
     """
     Create a 2D array of indexes used for the sorting technique.
     """
-    listindex = xp.arange(0.0, ncol, 1.0)
-    matindex = xp.tile(listindex, (nrow, 1))
+    listindex = cp.arange(0.0, ncol, 1.0)
+    matindex = cp.tile(listindex, (nrow, 1))
     return matindex.astype(np.float32)
diff --git a/httomolibgpu/recon/algorithm.py b/httomolibgpu/recon/algorithm.py
index 4b126066..1de5400d 100644
--- a/httomolibgpu/recon/algorithm.py
+++ b/httomolibgpu/recon/algorithm.py
@@ -21,30 +21,14 @@
 """Module for tomographic reconstruction"""
 
 import numpy as np
-cupy_run = False
-try:
-    import cupy as xp
-
-    try:
-        xp.cuda.Device(0).compute_capability
-        cupy_run = True
-    except xp.cuda.runtime.CUDARuntimeError:
-        print("CuPy library is a major dependency for HTTomolibgpu, please install")
-        import numpy as xp
-except ImportError:
-    import numpy as xp
-
-import nvtx
+from httomolibgpu import cupywrapper
+
+cp = cupywrapper.cp
+
+nvtx = cupywrapper.nvtx
 from numpy import float32, complex64
 from typing import Optional, Type
 
-if cupy_run:
-    from tomobar.methodsDIR_CuPy import RecToolsDIRCuPy
-    from tomobar.methodsIR_CuPy import RecToolsIRCuPy
-else:
-    from tomobar.methodsDIR import RecToolsDIR as RecToolsDIRCuPy
-    from tomobar.methodsIR import RecToolsIR as RecToolsIRCuPy
-
 __all__ = [
     "FBP",
     "SIRT",
@@ -55,16 +39,15 @@
 
 
 ## %%%%%%%%%%%%%%%%%%%%%%% FBP reconstruction %%%%%%%%%%%%%%%%%%%%%%%%%%%%  ##
-@nvtx.annotate()
 def FBP(
-    data: xp.ndarray,
+    data: cp.ndarray,
     angles: np.ndarray,
     center: Optional[float] = None,
     filter_freq_cutoff: Optional[float] = 0.6,
     recon_size: Optional[int] = None,
     recon_mask_radius: Optional[float] = None,
     gpu_id: int = 0,
-) -> xp.ndarray:
+) -> cp.ndarray:
     """
     Perform Filtered Backprojection (FBP) reconstruction using ASTRA toolbox and ToMoBAR wrappers.
     This is a 3D recon from a CuPy array and a custom built filter.
@@ -94,6 +77,32 @@ def FBP(
     cp.ndarray
         The FBP reconstructed volume as a CuPy array.
     """
+    if cupywrapper.cupy_run:
+        return __FBP(
+            data,
+            angles,
+            center,
+            filter_freq_cutoff,
+            recon_size,
+            recon_mask_radius,
+            gpu_id,
+        )
+    else:
+        print("FBP won't be executed because CuPy is not installed")
+        return data
+
+
+@nvtx.annotate()
+def __FBP(
+    data: cp.ndarray,
+    angles: np.ndarray,
+    center: Optional[float] = None,
+    filter_freq_cutoff: Optional[float] = 0.6,
+    recon_size: Optional[int] = None,
+    recon_mask_radius: Optional[float] = None,
+    gpu_id: int = 0,
+) -> cp.ndarray:
+
     RecToolsCP = _instantiate_direct_recon_class(
         data, angles, center, recon_size, gpu_id
     )
@@ -104,21 +113,20 @@ def FBP(
         recon_mask_radius=recon_mask_radius,
         data_axes_labels_order=input_data_axis_labels,
     )
-    xp._default_memory_pool.free_all_blocks()
-    return xp.require(xp.swapaxes(reconstruction, 0, 1), requirements="C")
+    cp._default_memory_pool.free_all_blocks()
+    return cp.require(cp.swapaxes(reconstruction, 0, 1), requirements="C")
 
 
 ## %%%%%%%%%%%%%%%%%%%%%%% SIRT reconstruction %%%%%%%%%%%%%%%%%%%%%%%%%%%%  ##
-@nvtx.annotate()
 def SIRT(
-    data: xp.ndarray,
+    data: cp.ndarray,
     angles: np.ndarray,
     center: Optional[float] = None,
     recon_size: Optional[int] = None,
     iterations: Optional[int] = 300,
     nonnegativity: Optional[bool] = True,
     gpu_id: int = 0,
-) -> xp.ndarray:
+) -> cp.ndarray:
     """
     Perform Simultaneous Iterative Recostruction Technique (SIRT) using ASTRA toolbox and ToMoBAR wrappers.
     This is 3D recon directly from a CuPy array while using ASTRA GPUlink capability.
@@ -146,6 +154,31 @@ def SIRT(
     cp.ndarray
         The SIRT reconstructed volume as a CuPy array.
     """
+    if cupywrapper.cupy_run:
+        return __SIRT(
+            data,
+            angles,
+            center,
+            recon_size,
+            iterations,
+            nonnegativity,
+            gpu_id,
+        )
+    else:
+        print("SIRT won't be executed because CuPy is not installed")
+        return data
+
+
+@nvtx.annotate()
+def __SIRT(
+    data: cp.ndarray,
+    angles: np.ndarray,
+    center: Optional[float] = None,
+    recon_size: Optional[int] = None,
+    iterations: Optional[int] = 300,
+    nonnegativity: Optional[bool] = True,
+    gpu_id: int = 0,
+) -> cp.ndarray:
 
     RecToolsCP = _instantiate_iterative_recon_class(
         data, angles, center, recon_size, gpu_id, datafidelity="LS"
@@ -160,21 +193,20 @@ def SIRT(
         "nonnegativity": nonnegativity,
     }
     reconstruction = RecToolsCP.SIRT(_data_, _algorithm_)
-    xp._default_memory_pool.free_all_blocks()
-    return xp.require(xp.swapaxes(reconstruction, 0, 1), requirements="C")
+    cp._default_memory_pool.free_all_blocks()
+    return cp.require(cp.swapaxes(reconstruction, 0, 1), requirements="C")
 
 
 ## %%%%%%%%%%%%%%%%%%%%%%% CGLS reconstruction %%%%%%%%%%%%%%%%%%%%%%%%%%%%  ##
-@nvtx.annotate()
 def CGLS(
-    data: xp.ndarray,
+    data: cp.ndarray,
     angles: np.ndarray,
     center: Optional[float] = None,
     recon_size: Optional[int] = None,
     iterations: Optional[int] = 20,
     nonnegativity: Optional[bool] = True,
     gpu_id: int = 0,
-) -> xp.ndarray:
+) -> cp.ndarray:
     """
     Perform Congugate Gradient Least Squares (CGLS) using ASTRA toolbox and ToMoBAR wrappers.
     This is 3D recon directly from a CuPy array while using ASTRA GPUlink capability.
@@ -202,6 +234,32 @@ def CGLS(
     cp.ndarray
         The CGLS reconstructed volume as a CuPy array.
     """
+    if cupywrapper.cupy_run:
+        return __CGLS(
+            data,
+            angles,
+            center,
+            recon_size,
+            iterations,
+            nonnegativity,
+            gpu_id,
+        )
+    else:
+        print("CGLS won't be executed because CuPy is not installed")
+        return data
+
+
+@nvtx.annotate()
+def __CGLS(
+    data: cp.ndarray,
+    angles: np.ndarray,
+    center: Optional[float] = None,
+    recon_size: Optional[int] = None,
+    iterations: Optional[int] = 20,
+    nonnegativity: Optional[bool] = True,
+    gpu_id: int = 0,
+) -> cp.ndarray:
+
     RecToolsCP = _instantiate_iterative_recon_class(
         data, angles, center, recon_size, gpu_id, datafidelity="LS"
     )
@@ -212,18 +270,18 @@ def CGLS(
     }  # data dictionary
     _algorithm_ = {"iterations": iterations, "nonnegativity": nonnegativity}
     reconstruction = RecToolsCP.CGLS(_data_, _algorithm_)
-    xp._default_memory_pool.free_all_blocks()
-    return xp.require(xp.swapaxes(reconstruction, 0, 1), requirements="C")
+    cp._default_memory_pool.free_all_blocks()
+    return cp.require(cp.swapaxes(reconstruction, 0, 1), requirements="C")
 
 
 ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%  ##
 def _instantiate_direct_recon_class(
-    data: xp.ndarray,
+    data: cp.ndarray,
     angles: np.ndarray,
     center: Optional[float] = None,
     recon_size: Optional[int] = None,
     gpu_id: int = 0,
-) -> Type[RecToolsDIRCuPy]:
+) -> Type:
     """instantiate ToMoBAR's direct recon class
 
     Args:
@@ -236,6 +294,8 @@ def _instantiate_direct_recon_class(
     Returns:
         Type[RecToolsDIRCuPy]: an instance of the direct recon class
     """
+    from tomobar.methodsDIR_CuPy import RecToolsDIRCuPy
+
     if center is None:
         center = data.shape[2] // 2  # making a crude guess
     if recon_size is None:
@@ -254,13 +314,13 @@ def _instantiate_direct_recon_class(
 
 
 def _instantiate_iterative_recon_class(
-    data: xp.ndarray,
+    data: cp.ndarray,
     angles: np.ndarray,
     center: Optional[float] = None,
     recon_size: Optional[int] = None,
     gpu_id: int = 0,
     datafidelity: str = "LS",
-) -> Type[RecToolsIRCuPy]:
+) -> Type:
     """instantiate ToMoBAR's iterative recon class
 
     Args:
@@ -274,6 +334,8 @@ def _instantiate_iterative_recon_class(
     Returns:
         Type[RecToolsIRCuPy]: an instance of the iterative class
     """
+    from tomobar.methodsIR_CuPy import RecToolsIRCuPy
+
     if center is None:
         center = data.shape[2] // 2  # making a crude guess
     if recon_size is None:
diff --git a/httomolibgpu/recon/rotation.py b/httomolibgpu/recon/rotation.py
index 6e6feba1..dbade838 100644
--- a/httomolibgpu/recon/rotation.py
+++ b/httomolibgpu/recon/rotation.py
@@ -21,42 +21,24 @@
 """Modules for finding the axis of rotation"""
 
 import numpy as np
-cupy_run = False
-try:
-    import cupy as xp
+from httomolibgpu import cupywrapper
 
-    try:
-        xp.cuda.Device(0).compute_capability
-        cupy_run = True
+cp = cupywrapper.cp
 
-    except xp.cuda.runtime.CUDARuntimeError:
-        print("CuPy library is a major dependency for HTTomolibgpu, please install")
-        import numpy as np
-except ImportError:
-    import numpy as np
+nvtx = cupywrapper.nvtx
 
-import nvtx
 import math
 from typing import List, Literal, Optional, Tuple, Union
 
-if cupy_run:
-    from httomolibgpu.cuda_kernels import load_cuda_module
-    from cupyx.scipy.ndimage import shift, gaussian_filter
-    from cupyx.scipy.fftpack import get_fft_plan
-    from cupyx.scipy.fft import rfft2
-else:
-    from scipy.ndimage import shift, gaussian_filter
-    from scipy.fft import fftfreq as get_fft_plan # get_fft_plan doesn't exist in scipyfft
-    from scipy.fft import rfft2
-
 __all__ = [
     "find_center_vo",
-    #"find_center_360",
-    #"find_center_pc",
+    "find_center_360",
+    "find_center_pc",
 ]
 
+
 def find_center_vo(
-    data: xp.ndarray,
+    data: cp.ndarray,
     ind: Optional[int] = None,
     smin: int = -50,
     smax: int = 50,
@@ -96,12 +78,17 @@ def find_center_vo(
     float
         Rotation axis location.
     """
-    return __find_center_vo(data, ind, smin, smax, srad, step, ratio, drop)    
+    if cupywrapper.cupy_run:
+        return __find_center_vo(data, ind, smin, smax, srad, step, ratio, drop)
+    else:
+        print("find_center_vo won't be executed because CuPy is not installed")
+        return 0.0
+
 
 # %%%%%%%%%%%%%%%%%%%%%%%%%find_center_vo%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 @nvtx.annotate()
 def __find_center_vo(
-    data: xp.ndarray,
+    data: cp.ndarray,
     ind: Optional[int] = None,
     smin: int = -50,
     smax: int = 50,
@@ -111,8 +98,10 @@ def __find_center_vo(
     drop: int = 20,
 ) -> float:
 
+    from cupyx.scipy.ndimage import gaussian_filter
+
     if data.ndim == 2:
-        data = xp.expand_dims(data, 1)
+        data = cp.expand_dims(data, 1)
         ind = 0
 
     height = data.shape[1]
@@ -120,7 +109,7 @@ def __find_center_vo(
     if ind is None:
         ind = height // 2
         if height > 10:
-            _sino = xp.mean(data[:, ind - 5 : ind + 5, :], axis=1)
+            _sino = cp.mean(data[:, ind - 5 : ind + 5, :], axis=1)
         else:
             _sino = data[:, ind, :]
     else:
@@ -141,14 +130,14 @@ def __find_center_vo(
         init_cen = _search_coarse(_sino_cs, smin, smax, ratio, drop)
         fine_cen = _search_fine(_sino_fs, srad, step, init_cen, ratio, drop)
 
-    return xp.asnumpy(fine_cen)
+    return cp.asnumpy(fine_cen)
 
 
 @nvtx.annotate()
 def _search_coarse(sino, smin, smax, ratio, drop):
     (nrow, ncol) = sino.shape
-    flip_sino = xp.ascontiguousarray(xp.fliplr(sino))
-    comp_sino = xp.ascontiguousarray(xp.flipud(sino))
+    flip_sino = cp.ascontiguousarray(cp.fliplr(sino))
+    comp_sino = cp.ascontiguousarray(cp.flipud(sino))
     mask = _create_mask(2 * nrow, ncol, 0.5 * ratio * ncol, drop)
 
     cen_fliplr = (ncol - 1.0) / 2.0
@@ -158,12 +147,12 @@ def _search_coarse(sino, smin, smax, ratio, drop):
     smax = smax_clip_val - cen_fliplr
     start_cor = ncol // 2 + smin
     stop_cor = ncol // 2 + smax
-    list_cor = xp.arange(start_cor, stop_cor + 0.5, 0.5, dtype=xp.float32)
+    list_cor = cp.arange(start_cor, stop_cor + 0.5, 0.5, dtype=cp.float32)
     list_shift = 2.0 * (list_cor - cen_fliplr)
-    list_metric = xp.empty(list_shift.shape, dtype=xp.float32)
+    list_metric = cp.empty(list_shift.shape, dtype=cp.float32)
     _calculate_metric(list_shift, sino, flip_sino, comp_sino, mask, list_metric)
 
-    minpos = xp.argmin(list_metric)
+    minpos = cp.argmin(list_metric)
     if minpos == 0:
         print("WARNING!!!Global minimum is out of searching range")
         print(f"Please extend smin: {smin}")
@@ -178,25 +167,27 @@ def _search_coarse(sino, smin, smax, ratio, drop):
 def _search_fine(sino, srad, step, init_cen, ratio, drop):
     (nrow, ncol) = sino.shape
 
-    flip_sino = xp.ascontiguousarray(xp.fliplr(sino))
-    comp_sino = xp.ascontiguousarray(xp.flipud(sino))
+    flip_sino = cp.ascontiguousarray(cp.fliplr(sino))
+    comp_sino = cp.ascontiguousarray(cp.flipud(sino))
     mask = _create_mask(2 * nrow, ncol, 0.5 * ratio * ncol, drop)
 
     cen_fliplr = (ncol - 1.0) / 2.0
     srad = max(min(abs(float(srad)), ncol / 4.0), 1.0)
     step = max(min(abs(step), srad), 0.1)
     init_cen = max(min(init_cen, ncol - srad - 1), srad)
-    list_cor = init_cen + xp.arange(-srad, srad + step, step, dtype=np.float32)
+    list_cor = init_cen + cp.arange(-srad, srad + step, step, dtype=np.float32)
     list_shift = 2.0 * (list_cor - cen_fliplr)
-    list_metric = xp.empty(list_shift.shape, dtype="float32")
+    list_metric = cp.empty(list_shift.shape, dtype="float32")
 
     _calculate_metric(list_shift, sino, flip_sino, comp_sino, mask, out=list_metric)
-    cor = list_cor[xp.argmin(list_metric)]
+    cor = list_cor[cp.argmin(list_metric)]
     return cor
 
 
 @nvtx.annotate()
 def _create_mask(nrow, ncol, radius, drop):
+    from httomolibgpu.cuda_kernels import load_cuda_module
+
     du = 1.0 / ncol
     dv = (nrow - 1.0) / (nrow * 2.0 * np.pi)
     cen_row = int(math.ceil(nrow / 2.0) - 1)
@@ -209,16 +200,16 @@ def _create_mask(nrow, ncol, radius, drop):
     grid_x = (ncol // 2 + 1 + block_x - 1) // block_x
     grid_y = nrow
     grid_dims = (grid_x, grid_y)
-    mask = xp.empty((nrow, ncol // 2 + 1), dtype="uint16")
+    mask = cp.empty((nrow, ncol // 2 + 1), dtype="uint16")
     params = (
         ncol,
         nrow,
         cen_col,
         cen_row,
-        xp.float32(du),
-        xp.float32(dv),
-        xp.float32(radius),
-        xp.float32(drop),
+        cp.float32(du),
+        cp.float32(dv),
+        cp.float32(radius),
+        cp.float32(drop),
         mask,
     )
     module = load_cuda_module("generate_mask")
@@ -235,12 +226,12 @@ def round_up(x: float) -> int:
 
 
 def _get_available_gpu_memory() -> int:
-    dev = xp.cuda.Device()
+    dev = cp.cuda.Device()
     # first, let's make some space
-    xp.get_default_memory_pool().free_all_blocks()
-    cache = xp.fft.config.get_plan_cache()
+    cp.get_default_memory_pool().free_all_blocks()
+    cache = cp.fft.config.get_plan_cache()
     cache.clear()
-    available_memory = dev.mem_info[0] + xp.get_default_memory_pool().free_bytes()
+    available_memory = dev.mem_info[0] + cp.get_default_memory_pool().free_bytes()
     return int(available_memory * 0.9)  # 10% safety margin
 
 
@@ -268,12 +259,17 @@ def _calculate_chunks(
 
 @nvtx.annotate()
 def _calculate_metric(list_shift, sino1, sino2, sino3, mask, out):
+    from httomolibgpu.cuda_kernels import load_cuda_module
+    from cupyx.scipy.ndimage import shift
+    from cupyx.scipy.fftpack import get_fft_plan
+    from cupyx.scipy.fft import rfft2
+
     # this tries to simplify - if shift_col is integer, no need to spline interpolate
-    assert list_shift.dtype == xp.float32, "shifts must be single precision floats"
-    assert sino1.dtype == xp.float32, "sino1 must be float32"
-    assert sino2.dtype == xp.float32, "sino1 must be float32"
-    assert sino3.dtype == xp.float32, "sino1 must be float32"
-    assert out.dtype == xp.float32, "sino1 must be float32"
+    assert list_shift.dtype == cp.float32, "shifts must be single precision floats"
+    assert sino1.dtype == cp.float32, "sino1 must be float32"
+    assert sino2.dtype == cp.float32, "sino1 must be float32"
+    assert sino3.dtype == cp.float32, "sino1 must be float32"
+    assert out.dtype == cp.float32, "sino1 must be float32"
     assert sino2.flags["C_CONTIGUOUS"], "sino2 must be C-contiguous"
     assert sino3.flags["C_CONTIGUOUS"], "sino3 must be C-contiguous"
     assert list_shift.flags["C_CONTIGUOUS"], "list_shift must be C-contiguous"
@@ -285,7 +281,7 @@ def _calculate_metric(list_shift, sino1, sino2, sino3, mask, out):
     shift_whole_shifts = module.get_function("shift_whole_shifts")
     # note: we don't have to calculate the mean here, as we're only looking for minimum metric.
     # The sum is enough.
-    masked_sum_abs_kernel = xp.ReductionKernel(
+    masked_sum_abs_kernel = cp.ReductionKernel(
         in_params="complex64 x, uint16 mask",  # input, complex + mask
         out_params="float32 out",  # output, real
         map_expr="mask ? abs(x) : 0.0f",
@@ -299,21 +295,19 @@ def _calculate_metric(list_shift, sino1, sino2, sino3, mask, out):
     # determine how many shifts we can fit in the available memory
     # and iterate in chunks
     chunks = _calculate_chunks(
-        nshifts, (na1 + na2) * sino2.shape[1] * xp.float32().nbytes
+        nshifts, (na1 + na2) * sino2.shape[1] * cp.float32().nbytes
     )
 
-    mat = xp.empty((chunks[0], na1 + na2, sino2.shape[1]), dtype=xp.float32)
+    mat = cp.empty((chunks[0], na1 + na2, sino2.shape[1]), dtype=cp.float32)
     mat[:, :na1, :] = sino1
     # explicitly create FFT plan here, so it's not cached and clearly re-used
-    plan = get_fft_plan(
-        mat, mat.shape[-2:], axes=(1, 2), value_type="R2C"
-    )
+    plan = get_fft_plan(mat, mat.shape[-2:], axes=(1, 2), value_type="R2C")
 
     for i, stop_idx in enumerate(chunks):
         if i > 0:
             # more than one iteration means we're tight on memory, so clear up freed blocks
             mat_freq = None
-            xp.get_default_memory_pool().free_all_blocks()
+            cp.get_default_memory_pool().free_all_blocks()
 
         start_idx = 0 if i == 0 else chunks[i - 1]
         size = stop_idx - start_idx
@@ -336,7 +330,7 @@ def _calculate_metric(list_shift, sino1, sino2, sino3, mask, out):
         )
 
         # now we can only look at the spline shifting, the rest is done
-        list_shift_host = xp.asnumpy(list_shift[start_idx:stop_idx])
+        list_shift_host = cp.asnumpy(list_shift[start_idx:stop_idx])
         for i in range(list_shift_host.shape[0]):
             shift_col = float(list_shift_host[i])
             if not shift_col.is_integer():
@@ -358,7 +352,9 @@ def _calculate_metric(list_shift, sino1, sino2, sino3, mask, out):
 
 @nvtx.annotate()
 def _downsample(sino, level, axis):
-    assert sino.dtype == xp.float32, "single precision floating point input required"
+    from httomolibgpu.cuda_kernels import load_cuda_module
+
+    assert sino.dtype == cp.float32, "single precision floating point input required"
     assert sino.flags["C_CONTIGUOUS"], "list_shift must be C-contiguous"
 
     dx, dz = sino.shape
@@ -366,7 +362,7 @@ def _downsample(sino, level, axis):
     dim = int(sino.shape[axis] / math.pow(2, level))
     shape = [dx, dz]
     shape[axis] = dim
-    downsampled_data = xp.empty(shape, dtype="float32")
+    downsampled_data = cp.empty(shape, dtype="float32")
 
     block_x = 8
     block_y = 8
@@ -386,393 +382,423 @@ def _downsample(sino, level, axis):
 
 
 ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# # %%%%%%%%%%%%%%%%%%%%%%%%%find_center_360%%%%%%%%%%%%%%%%%%%%%%%%%
+# --- Center of rotation (COR) estimation method ---#
 
 
-# # %%%%%%%%%%%%%%%%%%%%%%%%%find_center_360%%%%%%%%%%%%%%%%%%%%%%%%%
-# # --- Center of rotation (COR) estimation method ---#
-# @nvtx.annotate()
-# def find_center_360(
-#     data: xp.ndarray,
-#     ind: Optional[int] = None,
-#     win_width: int = 10,
-#     side: Optional[Literal[0, 1]] = None,
-#     denoise: bool = True,
-#     norm: bool = False,
-#     use_overlap: bool = False,
-# ) -> Tuple[float, float, Optional[Literal[0, 1]], float]:
-#     """
-#     Find the center-of-rotation (COR) in a 360-degree scan with offset COR use
-#     the method presented in Ref. [1] by Nghia Vo.
-
-#     This function supports both numpy and cupy - the implementation is selected
-#     by where the input data array resides.
-
-#     Parameters
-#     ----------
-#     data : cp.ndarray
-#         3D tomographic data as a Cupy array.
-#     ind : int, optional
-#         Index of the slice to be used for estimate the CoR and the overlap.
-#     win_width : int, optional
-#         Window width used for finding the overlap area.
-#     side : {None, 0, 1}, optional
-#         Overlap size. Only there options: None, 0, or 1. "None" corresponds
-#         to fully automated determination. "0" corresponds to the left side.
-#         "1" corresponds to the right side.
-#     denoise : bool, optional
-#         Apply the Gaussian filter if True.
-#     norm : bool, optional
-#         Apply the normalisation if True.
-#     use_overlap : bool, optional
-#         Use the combination of images in the overlap area for calculating
-#         correlation coefficients if True.
-
-#     Returns
-#     -------
-#     cor : float
-#         Center-of-rotation.
-#     overlap : float
-#         Width of the overlap area between two halves of the sinogram.
-#     side : int
-#         Overlap side between two halves of the sinogram.
-#     overlap_position : float
-#         Position of the window in the first image giving the best
-#         correlation metric.
-
-#     References
-#     ----------
-#     [1] : https://doi.org/10.1364/OE.418448
-#     """
-#     if data.ndim != 3:
-#         raise ValueError("A 3D array must be provided")
-
-#     # this method works with a 360-degree sinogram.
-#     if ind is None:
-#         _sino = data[:, 0, :]
-#     else:
-#         _sino = data[:, ind, :]
-
-#     (nrow, ncol) = _sino.shape
-#     nrow_180 = nrow // 2 + 1
-#     sino_top = _sino[0:nrow_180, :]
-#     sino_bot = xp.fliplr(_sino[-nrow_180:, :])
-#     (overlap, side, overlap_position) = _find_overlap(
-#         sino_top, sino_bot, win_width, side, denoise, norm, use_overlap
-#     )
-#     if side == 0:
-#         cor = overlap / 2.0 - 1.0
-#     else:
-#         cor = ncol - overlap / 2.0 - 1.0
-
-#     return cor, overlap, side, overlap_position
-
-
-# def _find_overlap(
-#     mat1, mat2, win_width, side=None, denoise=True, norm=False, use_overlap=False
-# ):
-#     """
-#     Find the overlap area and overlap side between two images (Ref. [1]) where
-#     the overlap side referring to the first image.
-
-#     Parameters
-#     ----------
-#     mat1 : array_like
-#         2D array. Projection image or sinogram image.
-#     mat2 :  array_like
-#         2D array. Projection image or sinogram image.
-#     win_width : int
-#         Width of the searching window.
-#     side : {None, 0, 1}, optional
-#         Only there options: None, 0, or 1. "None" corresponding to fully
-#         automated determination. "0" corresponding to the left side. "1"
-#         corresponding to the right side.
-#     denoise : bool, optional
-#         Apply the Gaussian filter if True.
-#     norm : bool, optional
-#         Apply the normalization if True.
-#     use_overlap : bool, optional
-#         Use the combination of images in the overlap area for calculating
-#         correlation coefficients if True.
-
-#     Returns
-#     -------
-#     overlap : float
-#         Width of the overlap area between two images.
-#     side : int
-#         Overlap side between two images.
-#     overlap_position : float
-#         Position of the window in the first image giving the best
-#         correlation metric.
-
-#     """
-#     ncol1 = mat1.shape[1]
-#     ncol2 = mat2.shape[1]
-#     win_width = int(np.clip(win_width, 6, min(ncol1, ncol2) // 2))
-
-#     if side == 1:
-#         (list_metric, offset) = _search_overlap(
-#             mat1,
-#             mat2,
-#             win_width,
-#             side=side,
-#             denoise=denoise,
-#             norm=norm,
-#             use_overlap=use_overlap,
-#         )
-#         overlap_position = _calculate_curvature(list_metric)[1]
-#         overlap_position += offset
-#         overlap = ncol1 - overlap_position + win_width // 2
-#     elif side == 0:
-#         (list_metric, offset) = _search_overlap(
-#             mat1,
-#             mat2,
-#             win_width,
-#             side=side,
-#             denoise=denoise,
-#             norm=norm,
-#             use_overlap=use_overlap,
-#         )
-#         overlap_position = _calculate_curvature(list_metric)[1]
-#         overlap_position += offset
-#         overlap = overlap_position + win_width // 2
-#     else:
-#         (list_metric1, offset1) = _search_overlap(
-#             mat1,
-#             mat2,
-#             win_width,
-#             side=1,
-#             denoise=denoise,
-#             norm=norm,
-#             use_overlap=use_overlap,
-#         )
-#         (list_metric2, offset2) = _search_overlap(
-#             mat1,
-#             mat2,
-#             win_width,
-#             side=0,
-#             denoise=denoise,
-#             norm=norm,
-#             use_overlap=use_overlap,
-#         )
-
-#         (curvature1, overlap_position1) = _calculate_curvature(list_metric1)
-#         overlap_position1 += offset1
-#         (curvature2, overlap_position2) = _calculate_curvature(list_metric2)
-#         overlap_position2 += offset2
-
-#         if curvature1 > curvature2:
-#             side = 1
-#             overlap_position = overlap_position1
-#             overlap = ncol1 - overlap_position + win_width // 2
-#         else:
-#             side = 0
-#             overlap_position = overlap_position2
-#             overlap = overlap_position + win_width // 2
-
-#     return overlap, side, overlap_position
-
-
-# @nvtx.annotate()
-# def _search_overlap(
-#     mat1, mat2, win_width, side, denoise=True, norm=False, use_overlap=False
-# ):
-#     """
-#     Calculate the correlation metrics between a rectangular region, defined
-#     by the window width, on the utmost left/right side of image 2 and the
-#     same size region in image 1 where the region is slided across image 1.
-
-#     Parameters
-#     ----------
-#     mat1 : array_like
-#         2D array. Projection image or sinogram image.
-#     mat2 : array_like
-#         2D array. Projection image or sinogram image.
-#     win_width : int
-#         Width of the searching window.
-#     side : {0, 1}
-#         Only two options: 0 or 1. It is used to indicate the overlap side
-#         respects to image 1. "0" corresponds to the left side. "1" corresponds
-#         to the right side.
-#     denoise : bool, optional
-#         Apply the Gaussian filter if True.
-#     norm : bool, optional
-#         Apply the normalization if True.
-#     use_overlap : bool, optional
-#         Use the combination of images in the overlap area for calculating
-#         correlation coefficients if True.
-
-#     Returns
-#     -------
-#     list_metric : array_like
-#         1D array. List of the correlation metrics.
-#     offset : int
-#         Initial position of the searching window where the position
-#         corresponds to the center of the window.
-#     """
-#     if denoise is True:
-#         # note: the filtering makes the output contiguous
-#         with nvtx.annotate("denoise_filter", color="green"):
-#             mat1 = gaussian_filter(mat1, (2, 2), mode="reflect")
-#             mat2 = gaussian_filter(mat2, (2, 2), mode="reflect")
-#     else:
-#         mat1 = xp.ascontiguousarray(mat1, dtype=xp.float32)
-#         mat2 = xp.ascontiguousarray(mat2, dtype=xp.float32)
-
-#     (nrow1, ncol1) = mat1.shape
-#     (nrow2, ncol2) = mat2.shape
-
-#     if nrow1 != nrow2:
-#         raise ValueError("Two images are not at the same height!!!")
-
-#     win_width = int(np.clip(win_width, 6, min(ncol1, ncol2) // 2 - 1))
-#     offset = win_width // 2
-#     win_width = 2 * offset  # Make it even
-
-#     list_metric = _calc_metrics(mat1, mat2, win_width, side, use_overlap, norm)
-
-#     min_metric = xp.min(list_metric)
-#     if min_metric != 0.0:
-#         list_metric /= min_metric
-
-#     return list_metric, offset
-
-
-# _calc_metrics_module = load_cuda_module(
-#     "calc_metrics",
-#     name_expressions=[
-#         "calc_metrics_kernel<false, false>",
-#         "calc_metrics_kernel<true, false>",
-#         "calc_metrics_kernel<false, true>",
-#         "calc_metrics_kernel<true, true>",
-#     ],
-#     options=("--maxrregcount=32",),
-# )
-
-
-# @nvtx.annotate()
-# def _calc_metrics(mat1, mat2, win_width, side, use_overlap, norm):
-#     assert mat1.dtype == xp.float32, "only float32 supported"
-#     assert mat2.dtype == xp.float32, "only float32 supported"
-#     assert mat1.shape[0] == mat2.shape[0]
-#     assert mat1.flags.c_contiguous, "only contiguos arrays supported"
-#     assert mat2.flags.c_contiguous, "only contiguos arrays supported"
-
-#     num_pos = mat1.shape[1] - win_width
-#     list_metric = xp.empty(num_pos, dtype=xp.float32)
-
-#     args = (
-#         mat1,
-#         np.int32(mat1.strides[0] / mat1.strides[1]),
-#         mat2,
-#         np.int32(mat2.strides[0] / mat2.strides[1]),
-#         np.int32(win_width),
-#         np.int32(mat1.shape[0]),
-#         np.int32(side),
-#         list_metric,
-#     )
-#     block = (128, 1, 1)
-#     grid = (1, np.int32(num_pos), 1)
-#     smem = block[0] * 4 * 6 if use_overlap else block[0] * 4 * 3
-#     bool2str = lambda x: "true" if x is True else "false"
-#     calc_metrics = _calc_metrics_module.get_function(
-#         f"calc_metrics_kernel<{bool2str(norm)}, {bool2str(use_overlap)}>"
-#     )
-#     calc_metrics(grid=grid, block=block, args=args, shared_mem=smem)
-
-#     return list_metric
-
-
-# @nvtx.annotate()
-# def _calculate_curvature(list_metric):
-#     """
-#     Calculate the curvature of a fitted curve going through the minimum
-#     value of a metric list.
-
-#     Parameters
-#     ----------
-#     list_metric : array_like
-#         1D array. List of metrics.
-
-#     Returns
-#     -------
-#     curvature : float
-#         Quadratic coefficient of the parabola fitting.
-#     min_pos : float
-#         Position of the minimum value with sub-pixel accuracy.
-#     """
-#     radi = 2
-#     num_metric = list_metric.size
-#     min_metric_idx = int(xp.argmin(list_metric))
-#     min_pos = int(np.clip(min_metric_idx, radi, num_metric - radi - 1))
-
-#     # work mostly on CPU here - we have very small arrays here
-#     list1 = xp.asnumpy(list_metric[min_pos - radi : min_pos + radi + 1])
-#     afact1 = np.polyfit(np.arange(0, 2 * radi + 1), list1, 2)[0]
-#     list2 = xp.asnumpy(list_metric[min_pos - 1 : min_pos + 2])
-#     (afact2, bfact2, _) = np.polyfit(np.arange(min_pos - 1, min_pos + 2), list2, 2)
-
-#     curvature = np.abs(afact1)
-#     if afact2 != 0.0:
-#         num = -bfact2 / (2 * afact2)
-#         if (num >= min_pos - 1) and (num <= min_pos + 1):
-#             min_pos = num
-
-#     return curvature, np.float32(min_pos)
-
-
-# # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-
-
-# ## %%%%%%%%%%%%%%%%%%%%%%find_center_pc%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-# @nvtx.annotate()
-# def find_center_pc(
-#     proj1: xp.ndarray, proj2: xp.ndarray, tol: float = 0.5, rotc_guess: Union[float, Optional[str]] = None
-# ) -> float:
-#     """Find rotation axis location by finding the offset between the first
-#     projection and a mirrored projection 180 degrees apart using
-#     phase correlation in Fourier space.
-#     The `phase_cross_correlation` function uses cross-correlation in Fourier
-#     space, optionally employing an upsampled matrix-multiplication DFT to
-#     achieve arbitrary subpixel precision. :cite:`Guizar:08`.
-
-#     Args:
-#         proj1 (xp.ndarray): Projection from the 0th degree
-#         proj2 (xp.ndarray): Projection from the 180th degree
-#         tol (float, optional): Subpixel accuracy. Defaults to 0.5.
-#         rotc_guess (float, optional): Initial guess value for the rotation center. Defaults to None.
-
-#     Returns:
-#         float: Rotation axis location.
-#     """
-#     if xp.__name__ == "cupy":
-#         from cupyx.scipy.ndimage import shift
-#         try:
-#             from cucim.skimage.registration import phase_cross_correlation
-#         except ImportError:
-#             print(
-#             "Cucim library of Rapidsai is a required dependency for find_center_pc module, please install"
-#             )
-#     else:
-#         from skimage.registration import phase_cross_correlation  
-#         from scipy.ndimage import shift
-    
-#     imgshift = 0.0 if rotc_guess is None else rotc_guess - (proj1.shape[1] - 1.0) / 2.0
-
-#     proj1 = shift(proj1, [0, -imgshift], mode="constant", cval=0)
-#     proj2 = shift(proj2, [0, -imgshift], mode="constant", cval=0)
-
-#     # create reflection of second projection
-#     proj2 = xp.fliplr(proj2)
-
-#     # using cucim of rapids to do phase cross correlation between two images
-#     shiftr = phase_cross_correlation(
-#         reference_image=proj1, moving_image=proj2, upsample_factor=1.0 / tol
-#     )
-
-#     # Compute center of rotation as the center of first image and the
-#     # registered translation with the second image
-#     center = (proj1.shape[1] + shiftr[0][1] - 1.0) / 2.0
-
-#     return center + imgshift
-
-# ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+def find_center_360(
+    data: cp.ndarray,
+    ind: Optional[int] = None,
+    win_width: int = 10,
+    side: Optional[Literal[0, 1]] = None,
+    denoise: bool = True,
+    norm: bool = False,
+    use_overlap: bool = False,
+) -> Tuple[float, float, Optional[Literal[0, 1]], float]:
+    """
+    Find the center-of-rotation (COR) in a 360-degree scan with offset COR use
+    the method presented in Ref. [1] by Nghia Vo.
+
+    This function supports both numpy and cupy - the implementation is selected
+    by where the input data array resides.
+
+    Parameters
+    ----------
+    data : cp.ndarray
+        3D tomographic data as a Cupy array.
+    ind : int, optional
+        Index of the slice to be used for estimate the CoR and the overlap.
+    win_width : int, optional
+        Window width used for finding the overlap area.
+    side : {None, 0, 1}, optional
+        Overlap size. Only there options: None, 0, or 1. "None" corresponds
+        to fully automated determination. "0" corresponds to the left side.
+        "1" corresponds to the right side.
+    denoise : bool, optional
+        Apply the Gaussian filter if True.
+    norm : bool, optional
+        Apply the normalisation if True.
+    use_overlap : bool, optional
+        Use the combination of images in the overlap area for calculating
+        correlation coefficients if True.
+
+    Returns
+    -------
+    cor : float
+        Center-of-rotation.
+    overlap : float
+        Width of the overlap area between two halves of the sinogram.
+    side : int
+        Overlap side between two halves of the sinogram.
+    overlap_position : float
+        Position of the window in the first image giving the best
+        correlation metric.
+
+    References
+    ----------
+    [1] : https://doi.org/10.1364/OE.418448
+    """
+
+    if cupywrapper.cupy_run:
+        return __find_center_360(data, ind, win_width, side, denoise, norm, use_overlap)
+    else:
+        print("find_center_360 won't be executed because CuPy is not installed")
+        return (0, 0, 0, 0)
+
+
+@nvtx.annotate()
+def __find_center_360(
+    data: cp.ndarray,
+    ind: Optional[int] = None,
+    win_width: int = 10,
+    side: Optional[Literal[0, 1]] = None,
+    denoise: bool = True,
+    norm: bool = False,
+    use_overlap: bool = False,
+) -> Tuple[float, float, Optional[Literal[0, 1]], float]:
+
+    if data.ndim != 3:
+        raise ValueError("A 3D array must be provided")
+
+    # this method works with a 360-degree sinogram.
+    if ind is None:
+        _sino = data[:, 0, :]
+    else:
+        _sino = data[:, ind, :]
+
+    (nrow, ncol) = _sino.shape
+    nrow_180 = nrow // 2 + 1
+    sino_top = _sino[0:nrow_180, :]
+    sino_bot = cp.fliplr(_sino[-nrow_180:, :])
+    (overlap, side, overlap_position) = _find_overlap(
+        sino_top, sino_bot, win_width, side, denoise, norm, use_overlap
+    )
+    if side == 0:
+        cor = overlap / 2.0 - 1.0
+    else:
+        cor = ncol - overlap / 2.0 - 1.0
+
+    return cor, overlap, side, overlap_position
+
+
+def _find_overlap(
+    mat1, mat2, win_width, side=None, denoise=True, norm=False, use_overlap=False
+):
+    """
+    Find the overlap area and overlap side between two images (Ref. [1]) where
+    the overlap side referring to the first image.
+
+    Parameters
+    ----------
+    mat1 : array_like
+        2D array. Projection image or sinogram image.
+    mat2 :  array_like
+        2D array. Projection image or sinogram image.
+    win_width : int
+        Width of the searching window.
+    side : {None, 0, 1}, optional
+        Only there options: None, 0, or 1. "None" corresponding to fully
+        automated determination. "0" corresponding to the left side. "1"
+        corresponding to the right side.
+    denoise : bool, optional
+        Apply the Gaussian filter if True.
+    norm : bool, optional
+        Apply the normalization if True.
+    use_overlap : bool, optional
+        Use the combination of images in the overlap area for calculating
+        correlation coefficients if True.
+
+    Returns
+    -------
+    overlap : float
+        Width of the overlap area between two images.
+    side : int
+        Overlap side between two images.
+    overlap_position : float
+        Position of the window in the first image giving the best
+        correlation metric.
+
+    """
+    ncol1 = mat1.shape[1]
+    ncol2 = mat2.shape[1]
+    win_width = int(np.clip(win_width, 6, min(ncol1, ncol2) // 2))
+
+    if side == 1:
+        (list_metric, offset) = _search_overlap(
+            mat1,
+            mat2,
+            win_width,
+            side=side,
+            denoise=denoise,
+            norm=norm,
+            use_overlap=use_overlap,
+        )
+        overlap_position = _calculate_curvature(list_metric)[1]
+        overlap_position += offset
+        overlap = ncol1 - overlap_position + win_width // 2
+    elif side == 0:
+        (list_metric, offset) = _search_overlap(
+            mat1,
+            mat2,
+            win_width,
+            side=side,
+            denoise=denoise,
+            norm=norm,
+            use_overlap=use_overlap,
+        )
+        overlap_position = _calculate_curvature(list_metric)[1]
+        overlap_position += offset
+        overlap = overlap_position + win_width // 2
+    else:
+        (list_metric1, offset1) = _search_overlap(
+            mat1,
+            mat2,
+            win_width,
+            side=1,
+            denoise=denoise,
+            norm=norm,
+            use_overlap=use_overlap,
+        )
+        (list_metric2, offset2) = _search_overlap(
+            mat1,
+            mat2,
+            win_width,
+            side=0,
+            denoise=denoise,
+            norm=norm,
+            use_overlap=use_overlap,
+        )
+
+        (curvature1, overlap_position1) = _calculate_curvature(list_metric1)
+        overlap_position1 += offset1
+        (curvature2, overlap_position2) = _calculate_curvature(list_metric2)
+        overlap_position2 += offset2
+
+        if curvature1 > curvature2:
+            side = 1
+            overlap_position = overlap_position1
+            overlap = ncol1 - overlap_position + win_width // 2
+        else:
+            side = 0
+            overlap_position = overlap_position2
+            overlap = overlap_position + win_width // 2
+
+    return overlap, side, overlap_position
+
+
+@nvtx.annotate()
+def _search_overlap(
+    mat1, mat2, win_width, side, denoise=True, norm=False, use_overlap=False
+):
+    """
+    Calculate the correlation metrics between a rectangular region, defined
+    by the window width, on the utmost left/right side of image 2 and the
+    same size region in image 1 where the region is slided across image 1.
+
+    Parameters
+    ----------
+    mat1 : array_like
+        2D array. Projection image or sinogram image.
+    mat2 : array_like
+        2D array. Projection image or sinogram image.
+    win_width : int
+        Width of the searching window.
+    side : {0, 1}
+        Only two options: 0 or 1. It is used to indicate the overlap side
+        respects to image 1. "0" corresponds to the left side. "1" corresponds
+        to the right side.
+    denoise : bool, optional
+        Apply the Gaussian filter if True.
+    norm : bool, optional
+        Apply the normalization if True.
+    use_overlap : bool, optional
+        Use the combination of images in the overlap area for calculating
+        correlation coefficients if True.
+
+    Returns
+    -------
+    list_metric : array_like
+        1D array. List of the correlation metrics.
+    offset : int
+        Initial position of the searching window where the position
+        corresponds to the center of the window.
+    """
+    from cupyx.scipy.ndimage import gaussian_filter
+
+    if denoise is True:
+        # note: the filtering makes the output contiguous
+        with nvtx.annotate("denoise_filter", color="green"):
+            mat1 = gaussian_filter(mat1, (2, 2), mode="reflect")
+            mat2 = gaussian_filter(mat2, (2, 2), mode="reflect")
+    else:
+        mat1 = cp.ascontiguousarray(mat1, dtype=cp.float32)
+        mat2 = cp.ascontiguousarray(mat2, dtype=cp.float32)
+
+    (nrow1, ncol1) = mat1.shape
+    (nrow2, ncol2) = mat2.shape
+
+    if nrow1 != nrow2:
+        raise ValueError("Two images are not at the same height!!!")
+
+    win_width = int(np.clip(win_width, 6, min(ncol1, ncol2) // 2 - 1))
+    offset = win_width // 2
+    win_width = 2 * offset  # Make it even
+
+    list_metric = _calc_metrics(mat1, mat2, win_width, side, use_overlap, norm)
+
+    min_metric = cp.min(list_metric)
+    if min_metric != 0.0:
+        list_metric /= min_metric
+
+    return list_metric, offset
+
+
+@nvtx.annotate()
+def _calc_metrics(mat1, mat2, win_width, side, use_overlap, norm):
+    assert mat1.dtype == cp.float32, "only float32 supported"
+    assert mat2.dtype == cp.float32, "only float32 supported"
+    assert mat1.shape[0] == mat2.shape[0]
+    assert mat1.flags.c_contiguous, "only contiguos arrays supported"
+    assert mat2.flags.c_contiguous, "only contiguos arrays supported"
+
+    from httomolibgpu.cuda_kernels import load_cuda_module
+
+    _calc_metrics_module = load_cuda_module(
+        "calc_metrics",
+        name_expressions=[
+            "calc_metrics_kernel<false, false>",
+            "calc_metrics_kernel<true, false>",
+            "calc_metrics_kernel<false, true>",
+            "calc_metrics_kernel<true, true>",
+        ],
+        options=("--maxrregcount=32",),
+    )
+
+    num_pos = mat1.shape[1] - win_width
+    list_metric = cp.empty(num_pos, dtype=cp.float32)
+
+    args = (
+        mat1,
+        np.int32(mat1.strides[0] / mat1.strides[1]),
+        mat2,
+        np.int32(mat2.strides[0] / mat2.strides[1]),
+        np.int32(win_width),
+        np.int32(mat1.shape[0]),
+        np.int32(side),
+        list_metric,
+    )
+    block = (128, 1, 1)
+    grid = (1, np.int32(num_pos), 1)
+    smem = block[0] * 4 * 6 if use_overlap else block[0] * 4 * 3
+    bool2str = lambda x: "true" if x is True else "false"
+    calc_metrics = _calc_metrics_module.get_function(
+        f"calc_metrics_kernel<{bool2str(norm)}, {bool2str(use_overlap)}>"
+    )
+    calc_metrics(grid=grid, block=block, args=args, shared_mem=smem)
+
+    return list_metric
+
+
+@nvtx.annotate()
+def _calculate_curvature(list_metric):
+    """
+    Calculate the curvature of a fitted curve going through the minimum
+    value of a metric list.
+
+    Parameters
+    ----------
+    list_metric : array_like
+        1D array. List of metrics.
+
+    Returns
+    -------
+    curvature : float
+        Quadratic coefficient of the parabola fitting.
+    min_pos : float
+        Position of the minimum value with sub-pixel accuracy.
+    """
+    radi = 2
+    num_metric = list_metric.size
+    min_metric_idx = int(cp.argmin(list_metric))
+    min_pos = int(np.clip(min_metric_idx, radi, num_metric - radi - 1))
+
+    # work mostly on CPU here - we have very small arrays here
+    list1 = cp.asnumpy(list_metric[min_pos - radi : min_pos + radi + 1])
+    afact1 = np.polyfit(np.arange(0, 2 * radi + 1), list1, 2)[0]
+    list2 = cp.asnumpy(list_metric[min_pos - 1 : min_pos + 2])
+    (afact2, bfact2, _) = np.polyfit(np.arange(min_pos - 1, min_pos + 2), list2, 2)
+
+    curvature = np.abs(afact1)
+    if afact2 != 0.0:
+        num = -bfact2 / (2 * afact2)
+        if (num >= min_pos - 1) and (num <= min_pos + 1):
+            min_pos = num
+
+    return curvature, np.float32(min_pos)
+
+
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+
+## %%%%%%%%%%%%%%%%%%%%%%find_center_pc%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+def find_center_pc(
+    proj1: cp.ndarray,
+    proj2: cp.ndarray,
+    tol: float = 0.5,
+    rotc_guess: Union[float, Optional[str]] = None,
+) -> float:
+    """Find rotation axis location by finding the offset between the first
+    projection and a mirrored projection 180 degrees apart using
+    phase correlation in Fourier space.
+    The `phase_cross_correlation` function uses cross-correlation in Fourier
+    space, optionally employing an upsampled matrix-multiplication DFT to
+    achieve arbitrary subpixel precision. :cite:`Guizar:08`.
+
+    Args:
+        proj1 (cp.ndarray): Projection from the 0th degree
+        proj2 (cp.ndarray): Projection from the 180th degree
+        tol (float, optional): Subpixel accuracy. Defaults to 0.5.
+        rotc_guess (float, optional): Initial guess value for the rotation center. Defaults to None.
+
+    Returns:
+        float: Rotation axis location.
+    """
+    if cupywrapper.cupy_run:
+        return __find_center_pc(proj1, proj2, tol, rotc_guess)
+    else:
+        print("find_center_pc won't be executed because CuPy is not installed")
+        return 0
+
+
+@nvtx.annotate()
+def __find_center_pc(
+    proj1: cp.ndarray,
+    proj2: cp.ndarray,
+    tol: float = 0.5,
+    rotc_guess: Union[float, Optional[str]] = None,
+) -> float:
+
+    from cupyx.scipy.ndimage import shift
+    from cucim.skimage.registration import phase_cross_correlation
+
+    imgshift = 0.0 if rotc_guess is None else rotc_guess - (proj1.shape[1] - 1.0) / 2.0
+
+    proj1 = shift(proj1, [0, -imgshift], mode="constant", cval=0)
+    proj2 = shift(proj2, [0, -imgshift], mode="constant", cval=0)
+
+    # create reflection of second projection
+    proj2 = cp.fliplr(proj2)
+
+    # using cucim of rapids to do phase cross correlation between two images
+    shiftr = phase_cross_correlation(
+        reference_image=proj1, moving_image=proj2, upsample_factor=1.0 / tol
+    )
+
+    # Compute center of rotation as the center of first image and the
+    # registered translation with the second image
+    center = (proj1.shape[1] + shiftr[0][1] - 1.0) / 2.0
+
+    return center + imgshift
+
+
+##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%