diff --git a/httomolibgpu/__init__.py b/httomolibgpu/__init__.py index ccb48651..e2181454 100644 --- a/httomolibgpu/__init__.py +++ b/httomolibgpu/__init__.py @@ -3,12 +3,12 @@ from httomolibgpu.misc.rescale import rescale_to_int from httomolibgpu.prep.alignment import distortion_correction_proj_discorpy from httomolibgpu.prep.normalize import normalize -#from httomolibgpu.prep.phase import paganin_filter_savu, paganin_filter_tomopy +from httomolibgpu.prep.phase import paganin_filter_savu, paganin_filter_tomopy from httomolibgpu.prep.stripe import ( remove_stripe_based_sorting, remove_stripe_ti, remove_all_stripe, ) + from httomolibgpu.recon.algorithm import FBP, SIRT, CGLS -#from httomolibgpu.recon.rotation import find_center_vo, find_center_360, find_center_pc -#from httomolibgpu.recon.rotation import find_center_vo +from httomolibgpu.recon.rotation import find_center_vo, find_center_360, find_center_pc diff --git a/httomolibgpu/cupywrapper.py b/httomolibgpu/cupywrapper.py index a5b858cf..f3cccf61 100644 --- a/httomolibgpu/cupywrapper.py +++ b/httomolibgpu/cupywrapper.py @@ -1,11 +1,19 @@ cupy_run = False try: import cupy as cp + import nvtx + try: cp.cuda.Device(0).compute_capability cupy_run = True except cp.cuda.runtime.CUDARuntimeError: print("CuPy library is a major dependency for HTTomolibgpu, please install") import numpy as cp -except ImportError: - import numpy as cp \ No newline at end of file +except ImportError as e: + print( + f"Failed to import module in {__file__} with error: {e}; defaulting to CPU-only mode" + ) + from unittest.mock import Mock + import numpy as cp + + nvtx = Mock() diff --git a/httomolibgpu/misc/corr.py b/httomolibgpu/misc/corr.py index 28da4b67..49fe1521 100644 --- a/httomolibgpu/misc/corr.py +++ b/httomolibgpu/misc/corr.py @@ -22,17 +22,18 @@ import numpy as np from httomolibgpu import cupywrapper + cp = cupywrapper.cp +nvtx = cupywrapper.nvtx from numpy import float32 -import nvtx __all__ = [ "median_filter", "remove_outlier", ] -@nvtx.annotate() + def median_filter( data: cp.ndarray, kernel_size: int = 3, @@ -71,6 +72,7 @@ def median_filter( return data +@nvtx.annotate() def __median_filter( data: cp.ndarray, kernel_size: int = 3, @@ -157,7 +159,7 @@ def __median_filter( thresholding_kernel(data, float32(dif), output) return output -@nvtx.annotate() + def remove_outlier( data: cp.ndarray, kernel_size: int = 3, axis: int = 0, dif: float = 0.1 ) -> cp.ndarray: diff --git a/httomolibgpu/misc/morph.py b/httomolibgpu/misc/morph.py index 0a9c0b94..098d1712 100644 --- a/httomolibgpu/misc/morph.py +++ b/httomolibgpu/misc/morph.py @@ -22,9 +22,10 @@ import numpy as np from httomolibgpu import cupywrapper + cp = cupywrapper.cp -import nvtx +nvtx = cupywrapper.nvtx from typing import Literal __all__ = [ @@ -32,7 +33,7 @@ "data_resampler", ] -@nvtx.annotate() + def sino_360_to_180( data: cp.ndarray, overlap: int = 0, rotation: Literal["left", "right"] = "left" ) -> cp.ndarray: @@ -62,6 +63,7 @@ def sino_360_to_180( return data +@nvtx.annotate() def __sino_360_to_180( data: cp.ndarray, overlap: int = 0, rotation: Literal["left", "right"] = "left" ) -> cp.ndarray: @@ -103,7 +105,6 @@ def __sino_360_to_180( return out -@nvtx.annotate() def data_resampler( data: cp.ndarray, newshape: list, axis: int = 1, interpolation: str = "linear" ) -> cp.ndarray: @@ -123,17 +124,19 @@ def data_resampler( Returns: cp.ndarray: Up/Down-scaled 3D cupy array - """ + """ if cupywrapper.cupy_run: return __data_resampler(data, newshape, axis, interpolation) else: print("data_resampler 
won't be executed because CuPy is not installed") return data + +@nvtx.annotate() def __data_resampler( data: cp.ndarray, newshape: list, axis: int = 1, interpolation: str = "linear" ) -> cp.ndarray: - + from cupyx.scipy.interpolate import interpn if data.ndim != 3: diff --git a/httomolibgpu/misc/rescale.py b/httomolibgpu/misc/rescale.py index 506db058..3cdb25f1 100644 --- a/httomolibgpu/misc/rescale.py +++ b/httomolibgpu/misc/rescale.py @@ -21,15 +21,17 @@ import numpy as np from httomolibgpu import cupywrapper + cp = cupywrapper.cp -import nvtx +nvtx = cupywrapper.nvtx from typing import Literal, Optional, Tuple, Union __all__ = [ "rescale_to_int", ] -@nvtx.annotate() + + def rescale_to_int( data: cp.ndarray, perc_range_min: float = 0.0, @@ -72,8 +74,10 @@ def rescale_to_int( return __rescale_to_int(data, perc_range_min, perc_range_max, bits, glob_stats) else: print("rescale_to_int won't be executed because CuPy is not installed") - return data + return data + +@nvtx.annotate() def __rescale_to_int( data: cp.ndarray, perc_range_min: float = 0.0, diff --git a/httomolibgpu/prep/alignment.py b/httomolibgpu/prep/alignment.py index 484488db..480db1af 100644 --- a/httomolibgpu/prep/alignment.py +++ b/httomolibgpu/prep/alignment.py @@ -21,26 +21,12 @@ """Modules for data correction""" import numpy as np -cupy_run = False -try: - import cupy as xp - - try: - xp.cuda.Device(0).compute_capability - cupy_run = True - except xp.cuda.runtime.CUDARuntimeError: - print("CuPy library is a major dependency for HTTomolibgpu, please install") - import numpy as xp -except ImportError: - import numpy as xp +from httomolibgpu import cupywrapper -from typing import Dict, List -import nvtx +cp = cupywrapper.cp +nvtx = cupywrapper.nvtx -if cupy_run: - from cupyx.scipy.ndimage import map_coordinates -else: - from scipy.ndimage import map_coordinates +from typing import Dict, List __all__ = [ "distortion_correction_proj_discorpy", @@ -52,9 +38,56 @@ # (which is the same as the TomoPy version # https://github.com/tomopy/tomopy/blob/c236a2969074f5fc70189fb5545f0a165924f916/source/tomopy/prep/alignment.py#L950-L981 # but with the additional params `order` and `mode`). -@nvtx.annotate() def distortion_correction_proj_discorpy( - data: xp.ndarray, + data: cp.ndarray, + metadata_path: str, + preview: Dict[str, List[int]], + order: int = 1, + mode: str = "reflect", +): + """Unwarp a stack of images using a backward model. + + Parameters + ---------- + data : cp.ndarray + 3D array. + + metadata_path : str + The path to the file containing the distortion coefficients for the + data. + + preview : Dict[str, List[int]] + A dict containing three key-value pairs: + - a list containing the `start` value of each dimension + - a list containing the `stop` value of each dimension + - a list containing the `step` value of each dimension + + order : int, optional. + The order of the spline interpolation. + + mode : {'reflect', 'grid-mirror', 'constant', 'grid-constant', 'nearest', + 'mirror', 'grid-wrap', 'wrap'}, optional + To determine how to handle image boundaries. + + Returns + ------- + cp.ndarray + 3D array. Distortion-corrected image(s). 
+ """ + if cupywrapper.cupy_run: + return __distortion_correction_proj_discorpy( + data, metadata_path, preview, order, mode + ) + else: + print( + "distortion_correction_proj_discorpy won't be executed because CuPy is not installed" + ) + return data + + +@nvtx.annotate() +def __distortion_correction_proj_discorpy( + data: cp.ndarray, metadata_path: str, preview: Dict[str, List[int]], order: int = 1, @@ -89,9 +122,12 @@ def distortion_correction_proj_discorpy( cp.ndarray 3D array. Distortion-corrected image(s). """ + + from cupyx.scipy.ndimage import map_coordinates + # Check if it's a stack of 2D images, or only a single 2D image if len(data.shape) == 2: - data = xp.expand_dims(data, axis=0) + data = cp.expand_dims(data, axis=0) # Get info from metadata txt file xcenter, ycenter, list_fact = _load_metadata_txt(metadata_path) @@ -118,26 +154,26 @@ def distortion_correction_proj_discorpy( ycenter = ycenter - y_offset height, width = data.shape[y_dim + 1], data.shape[x_dim + 1] - xu_list = xp.arange(width) - xcenter - yu_list = xp.arange(height) - ycenter - xu_mat, yu_mat = xp.meshgrid(xu_list, yu_list) - ru_mat = xp.sqrt(xu_mat**2 + yu_mat**2) - fact_mat = xp.sum( - xp.asarray([factor * ru_mat**i for i, factor in enumerate(list_fact)]), axis=0 + xu_list = cp.arange(width) - xcenter + yu_list = cp.arange(height) - ycenter + xu_mat, yu_mat = cp.meshgrid(xu_list, yu_list) + ru_mat = cp.sqrt(xu_mat**2 + yu_mat**2) + fact_mat = cp.sum( + cp.asarray([factor * ru_mat**i for i, factor in enumerate(list_fact)]), axis=0 ) - xd_mat = xp.asarray( - xp.clip(xcenter + fact_mat * xu_mat, 0, width - 1), dtype=xp.float32 + xd_mat = cp.asarray( + cp.clip(xcenter + fact_mat * xu_mat, 0, width - 1), dtype=cp.float32 ) - yd_mat = xp.asarray( - xp.clip(ycenter + fact_mat * yu_mat, 0, height - 1), dtype=xp.float32 + yd_mat = cp.asarray( + cp.clip(ycenter + fact_mat * yu_mat, 0, height - 1), dtype=cp.float32 ) - indices = [xp.reshape(yd_mat, (-1, 1)), xp.reshape(xd_mat, (-1, 1))] - indices = xp.asarray(indices, dtype=xp.float32) + indices = [cp.reshape(yd_mat, (-1, 1)), cp.reshape(xd_mat, (-1, 1))] + indices = cp.asarray(indices, dtype=cp.float32) # Loop over images and unwarp them for i in range(data.shape[0]): mat = map_coordinates(data[i], indices, order=order, mode=mode) - mat = xp.reshape(mat, (height, width)) + mat = cp.reshape(mat, (height, width)) data[i] = mat return data diff --git a/httomolibgpu/prep/normalize.py b/httomolibgpu/prep/normalize.py index d8c1625a..bcbb2c20 100644 --- a/httomolibgpu/prep/normalize.py +++ b/httomolibgpu/prep/normalize.py @@ -20,41 +20,28 @@ # --------------------------------------------------------------------------- """Modules for raw projection data normalization""" -cupy_run = False -try: - import cupy as xp - - try: - xp.cuda.Device(0).compute_capability - cupy_run = True - except xp.cuda.runtime.CUDARuntimeError: - print("CuPy library is a major dependency for HTTomolibgpu, please install") - import numpy as xp -except ImportError: - import numpy as xp - -if cupy_run: - from cupy import mean -else: - from numpy import mean -import nvtx import numpy as np +from httomolibgpu import cupywrapper + +cp = cupywrapper.cp + +nvtx = cupywrapper.nvtx + from numpy import float32 from typing import Tuple __all__ = ["normalize"] -@nvtx.annotate() def normalize( - data: xp.ndarray, - flats: xp.ndarray, - darks: xp.ndarray, + data: cp.ndarray, + flats: cp.ndarray, + darks: cp.ndarray, cutoff: float = 10.0, minus_log: bool = True, nonnegativity: bool = False, remove_nans: bool = 
False, -) -> xp.ndarray: +) -> cp.ndarray: """ Normalize raw projection data using the flat and dark field projections. This is a raw CUDA kernel implementation with CuPy wrappers. @@ -81,12 +68,33 @@ def normalize( cp.ndarray Normalised 3D tomographic data as a CuPy array. """ + if cupywrapper.cupy_run: + return __normalize( + data, flats, darks, cutoff, minus_log, nonnegativity, remove_nans + ) + else: + print("normalize won't be executed because CuPy is not installed") + return data + + +@nvtx.annotate() +def __normalize( + data: cp.ndarray, + flats: cp.ndarray, + darks: cp.ndarray, + cutoff: float = 10.0, + minus_log: bool = True, + nonnegativity: bool = False, + remove_nans: bool = False, +) -> cp.ndarray: + + from cupy import mean _check_valid_input(data, flats, darks) - dark0 = xp.empty(darks.shape[1:], dtype=float32) - flat0 = xp.empty(flats.shape[1:], dtype=float32) - out = xp.empty(data.shape, dtype=float32) + dark0 = cp.empty(darks.shape[1:], dtype=float32) + flat0 = cp.empty(flats.shape[1:], dtype=float32) + out = cp.empty(data.shape, dtype=float32) mean(darks, axis=0, dtype=float32, out=dark0) mean(flats, axis=0, dtype=float32, out=flat0) @@ -111,7 +119,7 @@ def normalize( kernel += "if (v > cutoff) v = cutoff;\n" kernel += "out = v;\n" - normalisation_kernel = xp.ElementwiseKernel( + normalisation_kernel = cp.ElementwiseKernel( "T data, U flats, U darks, raw float32 cutoff", "float32 out", kernel, @@ -138,6 +146,6 @@ def _check_valid_input(data, flats, darks) -> None: raise ValueError("Input darks must be 2D or 3D data only") if flats.ndim == 2: - flats = flats[xp.newaxis, :, :] + flats = flats[cp.newaxis, :, :] if darks.ndim == 2: - darks = darks[xp.newaxis, :, :] + darks = darks[cp.newaxis, :, :] diff --git a/httomolibgpu/prep/phase.py b/httomolibgpu/prep/phase.py index 54e69e45..a6febe10 100644 --- a/httomolibgpu/prep/phase.py +++ b/httomolibgpu/prep/phase.py @@ -21,21 +21,14 @@ """Modules for phase retrieval and phase-contrast enhancement""" import numpy as np -cupy_run = False -try: - import cupy as xp - try: - xp.cuda.Device(0).compute_capability - cupy_run = True - except xp.cuda.runtime.CUDARuntimeError: - print("CuPy library is a major dependency for HTTomolibgpu, please install") - import numpy as np -except ImportError: - import numpy as np +from httomolibgpu import cupywrapper + +cp = cupywrapper.cp + +nvtx = cupywrapper.nvtx from numpy import float32 from typing import Union -import nvtx import math __all__ = [ @@ -43,11 +36,11 @@ "paganin_filter_tomopy", ] + ## %%%%%%%%%%%%%%%%%%%%%%% paganin_filter %%%%%%%%%%%%%%%%%%%%%%%%%%%%%% ## #: CuPy implementation of Paganin filter from Savu -@nvtx.annotate() def paganin_filter_savu( - data: xp.ndarray, + data: cp.ndarray, ratio: float = 250.0, energy: float = 53.0, distance: float = 1.0, @@ -56,7 +49,7 @@ def paganin_filter_savu( pad_x: int = 100, pad_method: str = "edge", increment: float = 0.0, -) -> xp.ndarray: +) -> cp.ndarray: """ Apply Paganin filter (for denoising or contrast enhancement) to projections. @@ -94,15 +87,27 @@ def paganin_filter_savu( ------- cp.ndarray The stack of filtered projections. 
- """ - if cupy_run: - return __paganin_filter_savu(data, ratio, energy, distance, resolution, pad_y, pad_x, pad_method, increment) + """ + if cupywrapper.cupy_run: + return __paganin_filter_savu( + data, + ratio, + energy, + distance, + resolution, + pad_y, + pad_x, + pad_method, + increment, + ) else: print("__paganin_filter_savu won't be executed because CuPy is not installed") return data + +@nvtx.annotate() def __paganin_filter_savu( - data: xp.ndarray, + data: cp.ndarray, ratio: float = 250.0, energy: float = 53.0, distance: float = 1.0, @@ -111,8 +116,8 @@ def __paganin_filter_savu( pad_x: int = 100, pad_method: str = "edge", increment: float = 0.0, -) -> xp.ndarray: - +) -> cp.ndarray: + from httomolibgpu.cuda_kernels import load_cuda_module from cupyx.scipy.fft import fft2, ifft2 @@ -144,10 +149,10 @@ def __paganin_filter_savu( # Apply padding to all the 2D projections # Note: this takes considerable time on GPU... - data = xp.pad(data, ((0, 0), (pad_y, pad_y), (pad_x, pad_x)), mode=pad_method) + data = cp.pad(data, ((0, 0), (pad_y, pad_y), (pad_x, pad_x)), mode=pad_method) # Define array to hold result, which will not have the padding applied to it - precond_kernel_float = xp.ElementwiseKernel( + precond_kernel_float = cp.ElementwiseKernel( "T data", "T out", """ @@ -164,7 +169,7 @@ def __paganin_filter_savu( name="paganin_precond_float", no_return=True, ) - precond_kernel_int = xp.ElementwiseKernel( + precond_kernel_int = cp.ElementwiseKernel( "T data", "T out", """out = data == 0 ? 1 : data""", @@ -172,17 +177,17 @@ def __paganin_filter_savu( no_return=True, ) - if data.dtype in (xp.float32, xp.float64): + if data.dtype in (cp.float32, cp.float64): precond_kernel_float(data, data) else: precond_kernel_int(data, data) # avoid normalising in both directions - we include multiplier in the post_kernel - data = xp.asarray(data, dtype=xp.complex64) + data = cp.asarray(data, dtype=cp.complex64) data = fft2(data, axes=(-2, -1), overwrite_x=True, norm="backward") # prepare filter here, while the GPU is busy with the FFT - filtercomplex = xp.empty((height1, width1), dtype=xp.complex64) + filtercomplex = cp.empty((height1, width1), dtype=cp.complex64) bx = 16 by = 8 gx = (width1 + bx - 1) // bx @@ -191,12 +196,12 @@ def __paganin_filter_savu( grid=(gx, gy, 1), block=(bx, by, 1), args=( - xp.int32(width1), - xp.int32(height1), - xp.float32(resolution), - xp.float32(wavelength), - xp.float32(distance), - xp.float32(ratio), + cp.int32(width1), + cp.int32(height1), + cp.float32(resolution), + cp.float32(wavelength), + cp.float32(distance), + cp.float32(ratio), filtercomplex, ), ) @@ -204,7 +209,7 @@ def __paganin_filter_savu( data = ifft2(data, axes=(-2, -1), overwrite_x=True, norm="forward") - post_kernel = xp.ElementwiseKernel( + post_kernel = cp.ElementwiseKernel( "C pci1, raw float32 increment, raw float32 ratio, raw float32 fft_scale", "T out", "out = -0.5 * ratio * log(abs(pci1) * fft_scale + increment)", @@ -212,7 +217,7 @@ def __paganin_filter_savu( no_return=True, ) fft_scale = 1.0 / (data.shape[1] * data.shape[2]) - res = xp.empty((data.shape[0], height, width), dtype=xp.float32) + res = cp.empty((data.shape[0], height, width), dtype=cp.float32) post_kernel( data[:, pad_y : pad_y + height, pad_x : pad_x + width], np.float32(increment), @@ -222,12 +227,14 @@ def __paganin_filter_savu( ) return res + def _wavelength(energy: float) -> float: SPEED_OF_LIGHT = 299792458e2 # [cm/s] PLANCK_CONSTANT = 6.58211928e-19 # [keV*s] return 2 * math.pi * PLANCK_CONSTANT * SPEED_OF_LIGHT / energy 
-def _reciprocal_grid(pixel_size: float, shape_proj: tuple) -> xp.ndarray: + +def _reciprocal_grid(pixel_size: float, shape_proj: tuple) -> cp.ndarray: """ Calculate reciprocal grid. @@ -246,12 +253,13 @@ def _reciprocal_grid(pixel_size: float, shape_proj: tuple) -> xp.ndarray: # Sampling in reciprocal space. indx = _reciprocal_coord(pixel_size, shape_proj[0]) indy = _reciprocal_coord(pixel_size, shape_proj[1]) - indx_sq = xp.square(indx) - indy_sq = xp.square(indy) + indx_sq = cp.square(indx) + indy_sq = cp.square(indy) + + return cp.add.outer(indx_sq, indy_sq) - return xp.add.outer(indx_sq, indy_sq) -def _reciprocal_coord(pixel_size: float, num_grid: int) -> xp.ndarray: +def _reciprocal_coord(pixel_size: float, num_grid: int) -> cp.ndarray: """ Calculate reciprocal grid coordinates for a given pixel size and discretization. @@ -269,22 +277,21 @@ def _reciprocal_coord(pixel_size: float, num_grid: int) -> xp.ndarray: Grid coordinates. """ n = num_grid - 1 - rc = xp.arange(-n, num_grid, 2, dtype=xp.float32) + rc = cp.arange(-n, num_grid, 2, dtype=cp.float32) rc *= 2 * math.pi / (n * pixel_size) return rc + ##-------------------------------------------------------------## ##-------------------------------------------------------------## -# Adaptation with some corrections of retrieve_phase (Paganin filter) -# from TomoPy -@nvtx.annotate() +# Adaptation of retrieve_phase (Paganin filter) from TomoPy def paganin_filter_tomopy( - tomo: xp.ndarray, + tomo: cp.ndarray, pixel_size: float = 1e-4, dist: float = 50.0, energy: float = 53.0, alpha: float = 1e-3, -) -> xp.ndarray: +) -> cp.ndarray: """ Perform single-material phase retrieval from flats/darks corrected tomographic measurements :cite:`Paganin:02`. @@ -307,19 +314,21 @@ def paganin_filter_tomopy( cp.ndarray The 3D array of Paganin phase-filtered projection images. """ - if cupy_run: + if cupywrapper.cupy_run: return __paganin_filter_tomopy(tomo, pixel_size, dist, energy, alpha) else: print("paganin_filter_tomopy won't be executed because CuPy is not installed") return tomo + +@nvtx.annotate() def __paganin_filter_tomopy( - tomo: xp.ndarray, + tomo: cp.ndarray, pixel_size: float = 1e-4, dist: float = 50.0, energy: float = 53.0, alpha: float = 1e-3, -) -> xp.ndarray: +) -> cp.ndarray: from cupyx.scipy.fft import fft2, ifft2, fftshift @@ -339,16 +348,14 @@ def __paganin_filter_tomopy( dz, dy, dx = padded_tomo.shape # 3D FFT of tomo data - padded_tomo = xp.asarray(padded_tomo, dtype=xp.complex64) + padded_tomo = cp.asarray(padded_tomo, dtype=cp.complex64) fft_tomo = fft2(padded_tomo, axes=(-2, -1), overwrite_x=True) # Compute the reciprocal grid. w2 = _reciprocal_grid(pixel_size, (dy, dx)) # Build filter in the Fourier space. 
- phase_filter = fftshift( - _paganin_filter_factor2(energy, dist, alpha, w2) - ) + phase_filter = fftshift(_paganin_filter_factor2(energy, dist, alpha, w2)) phase_filter = phase_filter / phase_filter.max() # normalisation # Apply filter and take inverse FFT @@ -364,10 +371,10 @@ def __paganin_filter_tomopy( ) # crop the padded filtered data: - tomo = ifft_filtered_tomo[slc_indices].astype(xp.float32) + tomo = ifft_filtered_tomo[slc_indices].astype(cp.float32) # taking the negative log - _log_kernel = xp.ElementwiseKernel( + _log_kernel = cp.ElementwiseKernel( "C tomo", "C out", "out = -log(tomo)", @@ -380,7 +387,8 @@ def __paganin_filter_tomopy( def _shift_bit_length(x: int) -> int: return 1 << (x - 1).bit_length() -def _pad_projections_to_second_power(tomo: xp.ndarray) -> Union[xp.ndarray, tuple]: + +def _pad_projections_to_second_power(tomo: cp.ndarray) -> Union[cp.ndarray, tuple]: """ Performs padding of each projection to the next power of 2. If the shape is not even we also care of that before padding. @@ -395,7 +403,7 @@ def _pad_projections_to_second_power(tomo: xp.ndarray) -> Union[xp.ndarray, tupl ndarray: padded 3d projection data tuple: a tuple with padding dimensions """ - full_shape_tomo = xp.shape(tomo) + full_shape_tomo = cp.shape(tomo) pad_tup = [] for index, element in enumerate(full_shape_tomo): @@ -414,10 +422,11 @@ def _pad_projections_to_second_power(tomo: xp.ndarray) -> Union[xp.ndarray, tupl pad_tup.append(pad_width) - padded_tomo = xp.pad(tomo, tuple(pad_tup), "edge") + padded_tomo = cp.pad(tomo, tuple(pad_tup), "edge") return padded_tomo, pad_tup + def _paganin_filter_factor2(energy, dist, alpha, w2): # Alpha represents the ratio of delta/beta. return 1 / (_wavelength(energy) * dist * w2 / (4 * math.pi) + alpha) diff --git a/httomolibgpu/prep/stripe.py b/httomolibgpu/prep/stripe.py index f55a3e31..46ba04a0 100644 --- a/httomolibgpu/prep/stripe.py +++ b/httomolibgpu/prep/stripe.py @@ -21,25 +21,11 @@ """Modules for stripes removal""" import numpy as np -cupy_run = False -try: - import cupy as xp - - try: - xp.cuda.Device(0).compute_capability - cupy_run = True - except xp.cuda.runtime.CUDARuntimeError: - print("CuPy library is a major dependency for HTTomolibgpu, please install") - import numpy as xp -except ImportError: - import numpy as xp - -if cupy_run: - from cupyx.scipy.ndimage import median_filter, binary_dilation, uniform_filter1d -else: - from scipy.ndimage import median_filter, binary_dilation, uniform_filter1d - -import nvtx +from httomolibgpu import cupywrapper + +cp = cupywrapper.cp + +nvtx = cupywrapper.nvtx from typing import Union __all__ = [ @@ -49,12 +35,11 @@ ] -@nvtx.annotate() def remove_stripe_based_sorting( - data: Union[xp.ndarray, np.ndarray], + data: Union[cp.ndarray, np.ndarray], size: int = 11, dim: int = 1, -) -> Union[xp.ndarray, np.ndarray]: +) -> Union[cp.ndarray, np.ndarray]: """ Remove full and partial stripe artifacts from sinogram using Nghia Vo's approach, algorithm 3 in Ref. [1]. Angular direction is along the axis 0. @@ -84,6 +69,21 @@ def remove_stripe_based_sorting( ---------- .. 
[1] https://doi.org/10.1364/OE.26.028396 """ + if cupywrapper.cupy_run: + return __remove_stripe_based_sorting(data, size, dim) + else: + print( + "remove_stripe_based_sorting won't be executed because CuPy is not installed" + ) + return data + + +@nvtx.annotate() +def __remove_stripe_based_sorting( + data: Union[cp.ndarray, np.ndarray], + size: int = 11, + dim: int = 1, +) -> Union[cp.ndarray, np.ndarray]: if size is None: if data.shape[2] > 2000: @@ -102,28 +102,29 @@ def _rs_sort(sinogram, size, dim): """ Remove stripes using the sorting technique. """ - sinogram = xp.transpose(sinogram) + from cupyx.scipy.ndimage import median_filter + + sinogram = cp.transpose(sinogram) #: Sort each column of the sinogram by its grayscale values #: Keep track of the sorting indices so we can reverse it below - sortvals = xp.argsort(sinogram, axis=1) - sortvals_reverse = xp.argsort(sortvals, axis=1) - sino_sort = xp.take_along_axis(sinogram, sortvals, axis=1) + sortvals = cp.argsort(sinogram, axis=1) + sortvals_reverse = cp.argsort(sortvals, axis=1) + sino_sort = cp.take_along_axis(sinogram, sortvals, axis=1) #: Now apply the median filter on the sorted image along each row sino_sort = median_filter(sino_sort, (size, 1) if dim == 1 else (size, size)) #: step 3: re-sort the smoothed image columns to the original rows - sino_corrected = xp.take_along_axis(sino_sort, sortvals_reverse, axis=1) + sino_corrected = cp.take_along_axis(sino_sort, sortvals_reverse, axis=1) - return xp.transpose(sino_corrected) + return cp.transpose(sino_corrected) -@nvtx.annotate() def remove_stripe_ti( - data: Union[xp.ndarray, np.ndarray], + data: Union[cp.ndarray, np.ndarray], beta: float = 0.1, -) -> Union[xp.ndarray, np.ndarray]: +) -> Union[cp.ndarray, np.ndarray]: """ Removes stripes with the method of V. Titarenko (TomoCuPy implementation) @@ -140,14 +141,27 @@ def remove_stripe_ti( ndarray 3D array of de-striped projections. """ + if cupywrapper.cupy_run: + return __remove_stripe_ti(data, beta) + else: + print("remove_stripe_ti won't be executed because CuPy is not installed") + return data + + +@nvtx.annotate() +def __remove_stripe_ti( + data: Union[cp.ndarray, np.ndarray], + beta: float = 0.1, +) -> Union[cp.ndarray, np.ndarray]: + # TODO: detector dimensions must be even otherwise error - gamma = beta * ((1 - beta) / (1 + beta)) ** xp.abs( - xp.fft.fftfreq(data.shape[-1]) * data.shape[-1] + gamma = beta * ((1 - beta) / (1 + beta)) ** cp.abs( + cp.fft.fftfreq(data.shape[-1]) * data.shape[-1] ) gamma[0] -= 1 - v = xp.mean(data, axis=0) + v = cp.mean(data, axis=0) v = v - v[:, 0:1] - v = xp.fft.irfft(xp.fft.rfft(v) * xp.fft.rfft(gamma)).astype(data.dtype) + v = cp.fft.irfft(cp.fft.rfft(v) * cp.fft.rfft(gamma)).astype(data.dtype) data[:] += v return data @@ -176,14 +190,13 @@ def remove_stripe_ti( # # # # # *************************************************************************** # -@nvtx.annotate() def remove_all_stripe( - data: xp.ndarray, + data: cp.ndarray, snr: float = 3.0, la_size: int = 61, sm_size: int = 21, dim: int = 1, -) -> xp.ndarray: +) -> cp.ndarray: """ Remove all types of stripe artifacts from sinogram using Nghia Vo's approach :cite:`Vo:18` (combination of algorithm 3,4,5, and 6). @@ -212,6 +225,22 @@ def remove_all_stripe( .. 
[1] https://doi.org/10.1364/OE.26.028396 """ + if cupywrapper.cupy_run: + return __remove_all_stripe(data, snr, la_size, sm_size, dim) + else: + print("remove_all_stripe won't be executed because CuPy is not installed") + return data + + +@nvtx.annotate() +def __remove_all_stripe( + data: cp.ndarray, + snr: float = 3.0, + la_size: int = 61, + sm_size: int = 21, + dim: int = 1, +) -> cp.ndarray: + matindex = _create_matindex(data.shape[2], data.shape[0]) for m in range(data.shape[1]): sino = data[:, m, :] @@ -226,38 +255,40 @@ def _rs_sort2(sinogram, size, matindex, dim): """ Remove stripes using the sorting technique. """ - sinogram = xp.transpose(sinogram) - matcomb = xp.asarray(xp.dstack((matindex, sinogram))) + from cupyx.scipy.ndimage import median_filter + + sinogram = cp.transpose(sinogram) + matcomb = cp.asarray(cp.dstack((matindex, sinogram))) - # matsort = xp.asarray([row[row[:, 1].argsort()] for row in matcomb]) - ids = xp.argsort(matcomb[:, :, 1], axis=1) + # matsort = cp.asarray([row[row[:, 1].argsort()] for row in matcomb]) + ids = cp.argsort(matcomb[:, :, 1], axis=1) matsort = matcomb.copy() - matsort[:, :, 0] = xp.take_along_axis(matsort[:, :, 0], ids, axis=1) - matsort[:, :, 1] = xp.take_along_axis(matsort[:, :, 1], ids, axis=1) + matsort[:, :, 0] = cp.take_along_axis(matsort[:, :, 0], ids, axis=1) + matsort[:, :, 1] = cp.take_along_axis(matsort[:, :, 1], ids, axis=1) if dim == 1: matsort[:, :, 1] = median_filter(matsort[:, :, 1], (size, 1)) else: matsort[:, :, 1] = median_filter(matsort[:, :, 1], (size, size)) - # matsortback = xp.asarray([row[row[:, 0].argsort()] for row in matsort]) + # matsortback = cp.asarray([row[row[:, 0].argsort()] for row in matsort]) - ids = xp.argsort(matsort[:, :, 0], axis=1) + ids = cp.argsort(matsort[:, :, 0], axis=1) matsortback = matsort.copy() - matsortback[:, :, 0] = xp.take_along_axis(matsortback[:, :, 0], ids, axis=1) - matsortback[:, :, 1] = xp.take_along_axis(matsortback[:, :, 1], ids, axis=1) + matsortback[:, :, 0] = cp.take_along_axis(matsortback[:, :, 0], ids, axis=1) + matsortback[:, :, 1] = cp.take_along_axis(matsortback[:, :, 1], ids, axis=1) sino_corrected = matsortback[:, :, 1] - return xp.transpose(sino_corrected) + return cp.transpose(sino_corrected) @nvtx.annotate() def _mpolyfit(x, y): n = len(x) - x_mean = xp.mean(x) - y_mean = xp.mean(y) + x_mean = cp.mean(x) + y_mean = cp.mean(y) - Sxy = xp.sum(x * y) - n * x_mean * y_mean - Sxx = xp.sum(x * x) - n * x_mean * x_mean + Sxy = cp.sum(x * y) - n * x_mean * y_mean + Sxx = cp.sum(x * x) - n * x_mean * x_mean slope = Sxy / Sxx intercept = y_mean - slope * x_mean @@ -269,22 +300,23 @@ def _detect_stripe(listdata, snr): """ Algorithm 4 in :cite:`Vo:18`. Used to locate stripes. 
""" + numdata = len(listdata) - listsorted = xp.sort(listdata)[::-1] - xlist = xp.arange(0, numdata, 1.0) - ndrop = xp.int16(0.25 * numdata) - # (_slope, _intercept) = xp.polyfit(xlist[ndrop:-ndrop - 1], + listsorted = cp.sort(listdata)[::-1] + xlist = cp.arange(0, numdata, 1.0) + ndrop = cp.int16(0.25 * numdata) + # (_slope, _intercept) = cp.polyfit(xlist[ndrop:-ndrop - 1], # listsorted[ndrop:-ndrop - 1], 1) (_slope, _intercept) = _mpolyfit( xlist[ndrop : -ndrop - 1], listsorted[ndrop : -ndrop - 1] ) numt1 = _intercept + _slope * xlist[-1] - noiselevel = xp.abs(numt1 - _intercept) - noiselevel = xp.clip(noiselevel, 1e-6, None) - val1 = xp.abs(listsorted[0] - _intercept) / noiselevel - val2 = xp.abs(listsorted[-1] - numt1) / noiselevel - listmask = xp.zeros_like(listdata) + noiselevel = cp.abs(numt1 - _intercept) + noiselevel = cp.clip(noiselevel, 1e-6, None) + val1 = cp.abs(listsorted[0] - _intercept) / noiselevel + val2 = cp.abs(listsorted[-1] - numt1) / noiselevel + listmask = cp.zeros_like(listdata) if val1 >= snr: upper_thresh = _intercept + noiselevel * snr * 0.5 listmask[listdata > upper_thresh] = 1.0 @@ -299,16 +331,19 @@ def _rs_large(sinogram, snr, size, matindex, drop_ratio=0.1, norm=True): """ Remove large stripes. """ - drop_ratio = max(min(drop_ratio, 0.8), 0) # = xp.clip(drop_ratio, 0.0, 0.8) + from cupyx.scipy.ndimage import median_filter + from cupyx.scipy.ndimage import binary_dilation + + drop_ratio = max(min(drop_ratio, 0.8), 0) # = cp.clip(drop_ratio, 0.0, 0.8) (nrow, ncol) = sinogram.shape ndrop = int(0.5 * drop_ratio * nrow) - sinosort = xp.sort(sinogram, axis=0) + sinosort = cp.sort(sinogram, axis=0) sinosmooth = median_filter(sinosort, (1, size)) - list1 = xp.mean(sinosort[ndrop : nrow - ndrop], axis=0) - list2 = xp.mean(sinosmooth[ndrop : nrow - ndrop], axis=0) - # listfact = xp.divide(list1, + list1 = cp.mean(sinosort[ndrop : nrow - ndrop], axis=0) + list2 = cp.mean(sinosmooth[ndrop : nrow - ndrop], axis=0) + # listfact = cp.divide(list1, # list2, - # out=xp.ones_like(list1), + # out=cp.ones_like(list1), # where=list2 != 0) listfact = list1 / list2 @@ -316,28 +351,28 @@ def _rs_large(sinogram, snr, size, matindex, drop_ratio=0.1, norm=True): # Locate stripes listmask = _detect_stripe(listfact, snr) listmask = binary_dilation(listmask, iterations=1).astype(listmask.dtype) - matfact = xp.tile(listfact, (nrow, 1)) + matfact = cp.tile(listfact, (nrow, 1)) # Normalize if norm is True: sinogram = sinogram / matfact - sinogram1 = xp.transpose(sinogram) - matcombine = xp.asarray(xp.dstack((matindex, sinogram1))) + sinogram1 = cp.transpose(sinogram) + matcombine = cp.asarray(cp.dstack((matindex, sinogram1))) - # matsort = xp.asarray([row[row[:, 1].argsort()] for row in matcombine]) - ids = xp.argsort(matcombine[:, :, 1], axis=1) + # matsort = cp.asarray([row[row[:, 1].argsort()] for row in matcombine]) + ids = cp.argsort(matcombine[:, :, 1], axis=1) matsort = matcombine.copy() - matsort[:, :, 0] = xp.take_along_axis(matsort[:, :, 0], ids, axis=1) - matsort[:, :, 1] = xp.take_along_axis(matsort[:, :, 1], ids, axis=1) + matsort[:, :, 0] = cp.take_along_axis(matsort[:, :, 0], ids, axis=1) + matsort[:, :, 1] = cp.take_along_axis(matsort[:, :, 1], ids, axis=1) - matsort[:, :, 1] = xp.transpose(sinosmooth) - # matsortback = xp.asarray([row[row[:, 0].argsort()] for row in matsort]) - ids = xp.argsort(matsort[:, :, 0], axis=1) + matsort[:, :, 1] = cp.transpose(sinosmooth) + # matsortback = cp.asarray([row[row[:, 0].argsort()] for row in matsort]) + ids = cp.argsort(matsort[:, :, 
0], axis=1) matsortback = matsort.copy() - matsortback[:, :, 0] = xp.take_along_axis(matsortback[:, :, 0], ids, axis=1) - matsortback[:, :, 1] = xp.take_along_axis(matsortback[:, :, 1], ids, axis=1) + matsortback[:, :, 0] = cp.take_along_axis(matsortback[:, :, 0], ids, axis=1) + matsortback[:, :, 1] = cp.take_along_axis(matsortback[:, :, 1], ids, axis=1) - sino_corrected = xp.transpose(matsortback[:, :, 1]) - listxmiss = xp.where(listmask > 0.0)[0] + sino_corrected = cp.transpose(matsortback[:, :, 1]) + listxmiss = cp.where(listmask > 0.0)[0] sinogram[:, listxmiss] = sino_corrected[:, listxmiss] return sinogram @@ -347,12 +382,16 @@ def _rs_dead(sinogram, snr, size, matindex, norm=True): """ Remove unresponsive and fluctuating stripes. """ - sinogram = xp.copy(sinogram) # Make it mutable + from cupyx.scipy.ndimage import median_filter + from cupyx.scipy.ndimage import binary_dilation + from cupyx.scipy.ndimage import uniform_filter1d + + sinogram = cp.copy(sinogram) # Make it mutable (nrow, _) = sinogram.shape - # sinosmooth = xp.apply_along_axis(uniform_filter1d, 0, sinogram, 10) + # sinosmooth = cp.apply_along_axis(uniform_filter1d, 0, sinogram, 10) sinosmooth = uniform_filter1d(sinogram, 10, axis=0) - listdiff = xp.sum(xp.abs(sinogram - sinosmooth), axis=0) + listdiff = cp.sum(cp.abs(sinogram - sinosmooth), axis=0) listdiffbck = median_filter(listdiff, size) listfact = listdiff / listdiffbck @@ -361,16 +400,16 @@ def _rs_dead(sinogram, snr, size, matindex, norm=True): listmask = binary_dilation(listmask, iterations=1).astype(listmask.dtype) listmask[0:2] = 0.0 listmask[-2:] = 0.0 - listx = xp.where(listmask < 1.0)[0] - listy = xp.arange(nrow) + listx = cp.where(listmask < 1.0)[0] + listy = cp.arange(nrow) matz = sinogram[:, listx] - listxmiss = xp.where(listmask > 0.0)[0] + listxmiss = cp.where(listmask > 0.0)[0] # finter = interpolate.interp2d(listx.get(), listy.get(), matz.get(), kind='linear') if len(listxmiss) > 0: # sinogram_c[:, listxmiss.get()] = finter(listxmiss.get(), listy.get()) - ids = xp.searchsorted(listx, listxmiss) + ids = cp.searchsorted(listx, listxmiss) sinogram[:, listxmiss] = matz[:, ids - 1] + (listxmiss - listx[ids - 1]) * ( matz[:, ids] - matz[:, ids - 1] ) / (listx[ids] - listx[ids - 1]) @@ -386,6 +425,6 @@ def _create_matindex(nrow, ncol): """ Create a 2D array of indexes used for the sorting technique. 
""" - listindex = xp.arange(0.0, ncol, 1.0) - matindex = xp.tile(listindex, (nrow, 1)) + listindex = cp.arange(0.0, ncol, 1.0) + matindex = cp.tile(listindex, (nrow, 1)) return matindex.astype(np.float32) diff --git a/httomolibgpu/recon/algorithm.py b/httomolibgpu/recon/algorithm.py index 4b126066..1de5400d 100644 --- a/httomolibgpu/recon/algorithm.py +++ b/httomolibgpu/recon/algorithm.py @@ -21,30 +21,14 @@ """Module for tomographic reconstruction""" import numpy as np -cupy_run = False -try: - import cupy as xp - - try: - xp.cuda.Device(0).compute_capability - cupy_run = True - except xp.cuda.runtime.CUDARuntimeError: - print("CuPy library is a major dependency for HTTomolibgpu, please install") - import numpy as xp -except ImportError: - import numpy as xp - -import nvtx +from httomolibgpu import cupywrapper + +cp = cupywrapper.cp + +nvtx = cupywrapper.nvtx from numpy import float32, complex64 from typing import Optional, Type -if cupy_run: - from tomobar.methodsDIR_CuPy import RecToolsDIRCuPy - from tomobar.methodsIR_CuPy import RecToolsIRCuPy -else: - from tomobar.methodsDIR import RecToolsDIR as RecToolsDIRCuPy - from tomobar.methodsIR import RecToolsIR as RecToolsIRCuPy - __all__ = [ "FBP", "SIRT", @@ -55,16 +39,15 @@ ## %%%%%%%%%%%%%%%%%%%%%%% FBP reconstruction %%%%%%%%%%%%%%%%%%%%%%%%%%%% ## -@nvtx.annotate() def FBP( - data: xp.ndarray, + data: cp.ndarray, angles: np.ndarray, center: Optional[float] = None, filter_freq_cutoff: Optional[float] = 0.6, recon_size: Optional[int] = None, recon_mask_radius: Optional[float] = None, gpu_id: int = 0, -) -> xp.ndarray: +) -> cp.ndarray: """ Perform Filtered Backprojection (FBP) reconstruction using ASTRA toolbox and ToMoBAR wrappers. This is a 3D recon from a CuPy array and a custom built filter. @@ -94,6 +77,32 @@ def FBP( cp.ndarray The FBP reconstructed volume as a CuPy array. """ + if cupywrapper.cupy_run: + return __FBP( + data, + angles, + center, + filter_freq_cutoff, + recon_size, + recon_mask_radius, + gpu_id, + ) + else: + print("FBP won't be executed because CuPy is not installed") + return data + + +@nvtx.annotate() +def __FBP( + data: cp.ndarray, + angles: np.ndarray, + center: Optional[float] = None, + filter_freq_cutoff: Optional[float] = 0.6, + recon_size: Optional[int] = None, + recon_mask_radius: Optional[float] = None, + gpu_id: int = 0, +) -> cp.ndarray: + RecToolsCP = _instantiate_direct_recon_class( data, angles, center, recon_size, gpu_id ) @@ -104,21 +113,20 @@ def FBP( recon_mask_radius=recon_mask_radius, data_axes_labels_order=input_data_axis_labels, ) - xp._default_memory_pool.free_all_blocks() - return xp.require(xp.swapaxes(reconstruction, 0, 1), requirements="C") + cp._default_memory_pool.free_all_blocks() + return cp.require(cp.swapaxes(reconstruction, 0, 1), requirements="C") ## %%%%%%%%%%%%%%%%%%%%%%% SIRT reconstruction %%%%%%%%%%%%%%%%%%%%%%%%%%%% ## -@nvtx.annotate() def SIRT( - data: xp.ndarray, + data: cp.ndarray, angles: np.ndarray, center: Optional[float] = None, recon_size: Optional[int] = None, iterations: Optional[int] = 300, nonnegativity: Optional[bool] = True, gpu_id: int = 0, -) -> xp.ndarray: +) -> cp.ndarray: """ Perform Simultaneous Iterative Recostruction Technique (SIRT) using ASTRA toolbox and ToMoBAR wrappers. This is 3D recon directly from a CuPy array while using ASTRA GPUlink capability. @@ -146,6 +154,31 @@ def SIRT( cp.ndarray The SIRT reconstructed volume as a CuPy array. 
""" + if cupywrapper.cupy_run: + return __SIRT( + data, + angles, + center, + recon_size, + iterations, + nonnegativity, + gpu_id, + ) + else: + print("SIRT won't be executed because CuPy is not installed") + return data + + +@nvtx.annotate() +def __SIRT( + data: cp.ndarray, + angles: np.ndarray, + center: Optional[float] = None, + recon_size: Optional[int] = None, + iterations: Optional[int] = 300, + nonnegativity: Optional[bool] = True, + gpu_id: int = 0, +) -> cp.ndarray: RecToolsCP = _instantiate_iterative_recon_class( data, angles, center, recon_size, gpu_id, datafidelity="LS" @@ -160,21 +193,20 @@ def SIRT( "nonnegativity": nonnegativity, } reconstruction = RecToolsCP.SIRT(_data_, _algorithm_) - xp._default_memory_pool.free_all_blocks() - return xp.require(xp.swapaxes(reconstruction, 0, 1), requirements="C") + cp._default_memory_pool.free_all_blocks() + return cp.require(cp.swapaxes(reconstruction, 0, 1), requirements="C") ## %%%%%%%%%%%%%%%%%%%%%%% CGLS reconstruction %%%%%%%%%%%%%%%%%%%%%%%%%%%% ## -@nvtx.annotate() def CGLS( - data: xp.ndarray, + data: cp.ndarray, angles: np.ndarray, center: Optional[float] = None, recon_size: Optional[int] = None, iterations: Optional[int] = 20, nonnegativity: Optional[bool] = True, gpu_id: int = 0, -) -> xp.ndarray: +) -> cp.ndarray: """ Perform Congugate Gradient Least Squares (CGLS) using ASTRA toolbox and ToMoBAR wrappers. This is 3D recon directly from a CuPy array while using ASTRA GPUlink capability. @@ -202,6 +234,32 @@ def CGLS( cp.ndarray The CGLS reconstructed volume as a CuPy array. """ + if cupywrapper.cupy_run: + return __CGLS( + data, + angles, + center, + recon_size, + iterations, + nonnegativity, + gpu_id, + ) + else: + print("CGLS won't be executed because CuPy is not installed") + return data + + +@nvtx.annotate() +def __CGLS( + data: cp.ndarray, + angles: np.ndarray, + center: Optional[float] = None, + recon_size: Optional[int] = None, + iterations: Optional[int] = 20, + nonnegativity: Optional[bool] = True, + gpu_id: int = 0, +) -> cp.ndarray: + RecToolsCP = _instantiate_iterative_recon_class( data, angles, center, recon_size, gpu_id, datafidelity="LS" ) @@ -212,18 +270,18 @@ def CGLS( } # data dictionary _algorithm_ = {"iterations": iterations, "nonnegativity": nonnegativity} reconstruction = RecToolsCP.CGLS(_data_, _algorithm_) - xp._default_memory_pool.free_all_blocks() - return xp.require(xp.swapaxes(reconstruction, 0, 1), requirements="C") + cp._default_memory_pool.free_all_blocks() + return cp.require(cp.swapaxes(reconstruction, 0, 1), requirements="C") ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% ## def _instantiate_direct_recon_class( - data: xp.ndarray, + data: cp.ndarray, angles: np.ndarray, center: Optional[float] = None, recon_size: Optional[int] = None, gpu_id: int = 0, -) -> Type[RecToolsDIRCuPy]: +) -> Type: """instantiate ToMoBAR's direct recon class Args: @@ -236,6 +294,8 @@ def _instantiate_direct_recon_class( Returns: Type[RecToolsDIRCuPy]: an instance of the direct recon class """ + from tomobar.methodsDIR_CuPy import RecToolsDIRCuPy + if center is None: center = data.shape[2] // 2 # making a crude guess if recon_size is None: @@ -254,13 +314,13 @@ def _instantiate_direct_recon_class( def _instantiate_iterative_recon_class( - data: xp.ndarray, + data: cp.ndarray, angles: np.ndarray, center: Optional[float] = None, recon_size: Optional[int] = None, gpu_id: int = 0, datafidelity: str = "LS", -) -> Type[RecToolsIRCuPy]: +) -> Type: """instantiate ToMoBAR's iterative recon 
class Args: @@ -274,6 +334,8 @@ def _instantiate_iterative_recon_class( Returns: Type[RecToolsIRCuPy]: an instance of the iterative class """ + from tomobar.methodsIR_CuPy import RecToolsIRCuPy + if center is None: center = data.shape[2] // 2 # making a crude guess if recon_size is None: diff --git a/httomolibgpu/recon/rotation.py b/httomolibgpu/recon/rotation.py index 6e6feba1..dbade838 100644 --- a/httomolibgpu/recon/rotation.py +++ b/httomolibgpu/recon/rotation.py @@ -21,42 +21,24 @@ """Modules for finding the axis of rotation""" import numpy as np -cupy_run = False -try: - import cupy as xp +from httomolibgpu import cupywrapper - try: - xp.cuda.Device(0).compute_capability - cupy_run = True +cp = cupywrapper.cp - except xp.cuda.runtime.CUDARuntimeError: - print("CuPy library is a major dependency for HTTomolibgpu, please install") - import numpy as np -except ImportError: - import numpy as np +nvtx = cupywrapper.nvtx -import nvtx import math from typing import List, Literal, Optional, Tuple, Union -if cupy_run: - from httomolibgpu.cuda_kernels import load_cuda_module - from cupyx.scipy.ndimage import shift, gaussian_filter - from cupyx.scipy.fftpack import get_fft_plan - from cupyx.scipy.fft import rfft2 -else: - from scipy.ndimage import shift, gaussian_filter - from scipy.fft import fftfreq as get_fft_plan # get_fft_plan doesn't exist in scipyfft - from scipy.fft import rfft2 - __all__ = [ "find_center_vo", - #"find_center_360", - #"find_center_pc", + "find_center_360", + "find_center_pc", ] + def find_center_vo( - data: xp.ndarray, + data: cp.ndarray, ind: Optional[int] = None, smin: int = -50, smax: int = 50, @@ -96,12 +78,17 @@ def find_center_vo( float Rotation axis location. """ - return __find_center_vo(data, ind, smin, smax, srad, step, ratio, drop) + if cupywrapper.cupy_run: + return __find_center_vo(data, ind, smin, smax, srad, step, ratio, drop) + else: + print("find_center_vo won't be executed because CuPy is not installed") + return 0.0 + # %%%%%%%%%%%%%%%%%%%%%%%%%find_center_vo%%%%%%%%%%%%%%%%%%%%%%%%%%%% @nvtx.annotate() def __find_center_vo( - data: xp.ndarray, + data: cp.ndarray, ind: Optional[int] = None, smin: int = -50, smax: int = 50, @@ -111,8 +98,10 @@ def __find_center_vo( drop: int = 20, ) -> float: + from cupyx.scipy.ndimage import gaussian_filter + if data.ndim == 2: - data = xp.expand_dims(data, 1) + data = cp.expand_dims(data, 1) ind = 0 height = data.shape[1] @@ -120,7 +109,7 @@ def __find_center_vo( if ind is None: ind = height // 2 if height > 10: - _sino = xp.mean(data[:, ind - 5 : ind + 5, :], axis=1) + _sino = cp.mean(data[:, ind - 5 : ind + 5, :], axis=1) else: _sino = data[:, ind, :] else: @@ -141,14 +130,14 @@ def __find_center_vo( init_cen = _search_coarse(_sino_cs, smin, smax, ratio, drop) fine_cen = _search_fine(_sino_fs, srad, step, init_cen, ratio, drop) - return xp.asnumpy(fine_cen) + return cp.asnumpy(fine_cen) @nvtx.annotate() def _search_coarse(sino, smin, smax, ratio, drop): (nrow, ncol) = sino.shape - flip_sino = xp.ascontiguousarray(xp.fliplr(sino)) - comp_sino = xp.ascontiguousarray(xp.flipud(sino)) + flip_sino = cp.ascontiguousarray(cp.fliplr(sino)) + comp_sino = cp.ascontiguousarray(cp.flipud(sino)) mask = _create_mask(2 * nrow, ncol, 0.5 * ratio * ncol, drop) cen_fliplr = (ncol - 1.0) / 2.0 @@ -158,12 +147,12 @@ def _search_coarse(sino, smin, smax, ratio, drop): smax = smax_clip_val - cen_fliplr start_cor = ncol // 2 + smin stop_cor = ncol // 2 + smax - list_cor = xp.arange(start_cor, stop_cor + 0.5, 0.5, dtype=xp.float32) + 
list_cor = cp.arange(start_cor, stop_cor + 0.5, 0.5, dtype=cp.float32) list_shift = 2.0 * (list_cor - cen_fliplr) - list_metric = xp.empty(list_shift.shape, dtype=xp.float32) + list_metric = cp.empty(list_shift.shape, dtype=cp.float32) _calculate_metric(list_shift, sino, flip_sino, comp_sino, mask, list_metric) - minpos = xp.argmin(list_metric) + minpos = cp.argmin(list_metric) if minpos == 0: print("WARNING!!!Global minimum is out of searching range") print(f"Please extend smin: {smin}") @@ -178,25 +167,27 @@ def _search_coarse(sino, smin, smax, ratio, drop): def _search_fine(sino, srad, step, init_cen, ratio, drop): (nrow, ncol) = sino.shape - flip_sino = xp.ascontiguousarray(xp.fliplr(sino)) - comp_sino = xp.ascontiguousarray(xp.flipud(sino)) + flip_sino = cp.ascontiguousarray(cp.fliplr(sino)) + comp_sino = cp.ascontiguousarray(cp.flipud(sino)) mask = _create_mask(2 * nrow, ncol, 0.5 * ratio * ncol, drop) cen_fliplr = (ncol - 1.0) / 2.0 srad = max(min(abs(float(srad)), ncol / 4.0), 1.0) step = max(min(abs(step), srad), 0.1) init_cen = max(min(init_cen, ncol - srad - 1), srad) - list_cor = init_cen + xp.arange(-srad, srad + step, step, dtype=np.float32) + list_cor = init_cen + cp.arange(-srad, srad + step, step, dtype=np.float32) list_shift = 2.0 * (list_cor - cen_fliplr) - list_metric = xp.empty(list_shift.shape, dtype="float32") + list_metric = cp.empty(list_shift.shape, dtype="float32") _calculate_metric(list_shift, sino, flip_sino, comp_sino, mask, out=list_metric) - cor = list_cor[xp.argmin(list_metric)] + cor = list_cor[cp.argmin(list_metric)] return cor @nvtx.annotate() def _create_mask(nrow, ncol, radius, drop): + from httomolibgpu.cuda_kernels import load_cuda_module + du = 1.0 / ncol dv = (nrow - 1.0) / (nrow * 2.0 * np.pi) cen_row = int(math.ceil(nrow / 2.0) - 1) @@ -209,16 +200,16 @@ def _create_mask(nrow, ncol, radius, drop): grid_x = (ncol // 2 + 1 + block_x - 1) // block_x grid_y = nrow grid_dims = (grid_x, grid_y) - mask = xp.empty((nrow, ncol // 2 + 1), dtype="uint16") + mask = cp.empty((nrow, ncol // 2 + 1), dtype="uint16") params = ( ncol, nrow, cen_col, cen_row, - xp.float32(du), - xp.float32(dv), - xp.float32(radius), - xp.float32(drop), + cp.float32(du), + cp.float32(dv), + cp.float32(radius), + cp.float32(drop), mask, ) module = load_cuda_module("generate_mask") @@ -235,12 +226,12 @@ def round_up(x: float) -> int: def _get_available_gpu_memory() -> int: - dev = xp.cuda.Device() + dev = cp.cuda.Device() # first, let's make some space - xp.get_default_memory_pool().free_all_blocks() - cache = xp.fft.config.get_plan_cache() + cp.get_default_memory_pool().free_all_blocks() + cache = cp.fft.config.get_plan_cache() cache.clear() - available_memory = dev.mem_info[0] + xp.get_default_memory_pool().free_bytes() + available_memory = dev.mem_info[0] + cp.get_default_memory_pool().free_bytes() return int(available_memory * 0.9) # 10% safety margin @@ -268,12 +259,17 @@ def _calculate_chunks( @nvtx.annotate() def _calculate_metric(list_shift, sino1, sino2, sino3, mask, out): + from httomolibgpu.cuda_kernels import load_cuda_module + from cupyx.scipy.ndimage import shift + from cupyx.scipy.fftpack import get_fft_plan + from cupyx.scipy.fft import rfft2 + # this tries to simplify - if shift_col is integer, no need to spline interpolate - assert list_shift.dtype == xp.float32, "shifts must be single precision floats" - assert sino1.dtype == xp.float32, "sino1 must be float32" - assert sino2.dtype == xp.float32, "sino1 must be float32" - assert sino3.dtype == xp.float32, "sino1 
must be float32" - assert out.dtype == xp.float32, "sino1 must be float32" + assert list_shift.dtype == cp.float32, "shifts must be single precision floats" + assert sino1.dtype == cp.float32, "sino1 must be float32" + assert sino2.dtype == cp.float32, "sino1 must be float32" + assert sino3.dtype == cp.float32, "sino1 must be float32" + assert out.dtype == cp.float32, "sino1 must be float32" assert sino2.flags["C_CONTIGUOUS"], "sino2 must be C-contiguous" assert sino3.flags["C_CONTIGUOUS"], "sino3 must be C-contiguous" assert list_shift.flags["C_CONTIGUOUS"], "list_shift must be C-contiguous" @@ -285,7 +281,7 @@ def _calculate_metric(list_shift, sino1, sino2, sino3, mask, out): shift_whole_shifts = module.get_function("shift_whole_shifts") # note: we don't have to calculate the mean here, as we're only looking for minimum metric. # The sum is enough. - masked_sum_abs_kernel = xp.ReductionKernel( + masked_sum_abs_kernel = cp.ReductionKernel( in_params="complex64 x, uint16 mask", # input, complex + mask out_params="float32 out", # output, real map_expr="mask ? abs(x) : 0.0f", @@ -299,21 +295,19 @@ def _calculate_metric(list_shift, sino1, sino2, sino3, mask, out): # determine how many shifts we can fit in the available memory # and iterate in chunks chunks = _calculate_chunks( - nshifts, (na1 + na2) * sino2.shape[1] * xp.float32().nbytes + nshifts, (na1 + na2) * sino2.shape[1] * cp.float32().nbytes ) - mat = xp.empty((chunks[0], na1 + na2, sino2.shape[1]), dtype=xp.float32) + mat = cp.empty((chunks[0], na1 + na2, sino2.shape[1]), dtype=cp.float32) mat[:, :na1, :] = sino1 # explicitly create FFT plan here, so it's not cached and clearly re-used - plan = get_fft_plan( - mat, mat.shape[-2:], axes=(1, 2), value_type="R2C" - ) + plan = get_fft_plan(mat, mat.shape[-2:], axes=(1, 2), value_type="R2C") for i, stop_idx in enumerate(chunks): if i > 0: # more than one iteration means we're tight on memory, so clear up freed blocks mat_freq = None - xp.get_default_memory_pool().free_all_blocks() + cp.get_default_memory_pool().free_all_blocks() start_idx = 0 if i == 0 else chunks[i - 1] size = stop_idx - start_idx @@ -336,7 +330,7 @@ def _calculate_metric(list_shift, sino1, sino2, sino3, mask, out): ) # now we can only look at the spline shifting, the rest is done - list_shift_host = xp.asnumpy(list_shift[start_idx:stop_idx]) + list_shift_host = cp.asnumpy(list_shift[start_idx:stop_idx]) for i in range(list_shift_host.shape[0]): shift_col = float(list_shift_host[i]) if not shift_col.is_integer(): @@ -358,7 +352,9 @@ def _calculate_metric(list_shift, sino1, sino2, sino3, mask, out): @nvtx.annotate() def _downsample(sino, level, axis): - assert sino.dtype == xp.float32, "single precision floating point input required" + from httomolibgpu.cuda_kernels import load_cuda_module + + assert sino.dtype == cp.float32, "single precision floating point input required" assert sino.flags["C_CONTIGUOUS"], "list_shift must be C-contiguous" dx, dz = sino.shape @@ -366,7 +362,7 @@ def _downsample(sino, level, axis): dim = int(sino.shape[axis] / math.pow(2, level)) shape = [dx, dz] shape[axis] = dim - downsampled_data = xp.empty(shape, dtype="float32") + downsampled_data = cp.empty(shape, dtype="float32") block_x = 8 block_y = 8 @@ -386,393 +382,423 @@ def _downsample(sino, level, axis): ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +# # %%%%%%%%%%%%%%%%%%%%%%%%%find_center_360%%%%%%%%%%%%%%%%%%%%%%%%% +# --- Center of rotation (COR) estimation method ---# -# # 
-# ## %%%%%%%%%%%%%%%%%%%%%%%%%find_center_360%%%%%%%%%%%%%%%%%%%%%%%%%
-# # --- Center of rotation (COR) estimation method ---#
-# @nvtx.annotate()
-# def find_center_360(
-#     data: xp.ndarray,
-#     ind: Optional[int] = None,
-#     win_width: int = 10,
-#     side: Optional[Literal[0, 1]] = None,
-#     denoise: bool = True,
-#     norm: bool = False,
-#     use_overlap: bool = False,
-# ) -> Tuple[float, float, Optional[Literal[0, 1]], float]:
-#     """
-#     Find the center-of-rotation (COR) in a 360-degree scan with offset COR use
-#     the method presented in Ref. [1] by Nghia Vo.
-
-#     This function supports both numpy and cupy - the implementation is selected
-#     by where the input data array resides.
-
-#     Parameters
-#     ----------
-#     data : cp.ndarray
-#         3D tomographic data as a Cupy array.
-#     ind : int, optional
-#         Index of the slice to be used for estimate the CoR and the overlap.
-#     win_width : int, optional
-#         Window width used for finding the overlap area.
-#     side : {None, 0, 1}, optional
-#         Overlap size. Only there options: None, 0, or 1. "None" corresponds
-#         to fully automated determination. "0" corresponds to the left side.
-#         "1" corresponds to the right side.
-#     denoise : bool, optional
-#         Apply the Gaussian filter if True.
-#     norm : bool, optional
-#         Apply the normalisation if True.
-#     use_overlap : bool, optional
-#         Use the combination of images in the overlap area for calculating
-#         correlation coefficients if True.
-
-#     Returns
-#     -------
-#     cor : float
-#         Center-of-rotation.
-#     overlap : float
-#         Width of the overlap area between two halves of the sinogram.
-#     side : int
-#         Overlap side between two halves of the sinogram.
-#     overlap_position : float
-#         Position of the window in the first image giving the best
-#         correlation metric.
-
-#     References
-#     ----------
-#     [1] : https://doi.org/10.1364/OE.418448
-#     """
-#     if data.ndim != 3:
-#         raise ValueError("A 3D array must be provided")
-
-#     # this method works with a 360-degree sinogram.
-#     if ind is None:
-#         _sino = data[:, 0, :]
-#     else:
-#         _sino = data[:, ind, :]
-
-#     (nrow, ncol) = _sino.shape
-#     nrow_180 = nrow // 2 + 1
-#     sino_top = _sino[0:nrow_180, :]
-#     sino_bot = xp.fliplr(_sino[-nrow_180:, :])
-#     (overlap, side, overlap_position) = _find_overlap(
-#         sino_top, sino_bot, win_width, side, denoise, norm, use_overlap
-#     )
-#     if side == 0:
-#         cor = overlap / 2.0 - 1.0
-#     else:
-#         cor = ncol - overlap / 2.0 - 1.0
-
-#     return cor, overlap, side, overlap_position
-
-
-# def _find_overlap(
-#     mat1, mat2, win_width, side=None, denoise=True, norm=False, use_overlap=False
-# ):
-#     """
-#     Find the overlap area and overlap side between two images (Ref. [1]) where
-#     the overlap side referring to the first image.
-
-#     Parameters
-#     ----------
-#     mat1 : array_like
-#         2D array. Projection image or sinogram image.
-#     mat2 : array_like
-#         2D array. Projection image or sinogram image.
-#     win_width : int
-#         Width of the searching window.
-#     side : {None, 0, 1}, optional
-#         Only there options: None, 0, or 1. "None" corresponding to fully
-#         automated determination. "0" corresponding to the left side. "1"
-#         corresponding to the right side.
-#     denoise : bool, optional
-#         Apply the Gaussian filter if True.
-#     norm : bool, optional
-#         Apply the normalization if True.
-#     use_overlap : bool, optional
-#         Use the combination of images in the overlap area for calculating
-#         correlation coefficients if True.
-
-#     Returns
-#     -------
-#     overlap : float
-#         Width of the overlap area between two images.
-#     side : int
-#         Overlap side between two images.
-#     overlap_position : float
-#         Position of the window in the first image giving the best
-#         correlation metric.
-
-#     """
-#     ncol1 = mat1.shape[1]
-#     ncol2 = mat2.shape[1]
-#     win_width = int(np.clip(win_width, 6, min(ncol1, ncol2) // 2))
-
-#     if side == 1:
-#         (list_metric, offset) = _search_overlap(
-#             mat1,
-#             mat2,
-#             win_width,
-#             side=side,
-#             denoise=denoise,
-#             norm=norm,
-#             use_overlap=use_overlap,
-#         )
-#         overlap_position = _calculate_curvature(list_metric)[1]
-#         overlap_position += offset
-#         overlap = ncol1 - overlap_position + win_width // 2
-#     elif side == 0:
-#         (list_metric, offset) = _search_overlap(
-#             mat1,
-#             mat2,
-#             win_width,
-#             side=side,
-#             denoise=denoise,
-#             norm=norm,
-#             use_overlap=use_overlap,
-#         )
-#         overlap_position = _calculate_curvature(list_metric)[1]
-#         overlap_position += offset
-#         overlap = overlap_position + win_width // 2
-#     else:
-#         (list_metric1, offset1) = _search_overlap(
-#             mat1,
-#             mat2,
-#             win_width,
-#             side=1,
-#             denoise=denoise,
-#             norm=norm,
-#             use_overlap=use_overlap,
-#         )
-#         (list_metric2, offset2) = _search_overlap(
-#             mat1,
-#             mat2,
-#             win_width,
-#             side=0,
-#             denoise=denoise,
-#             norm=norm,
-#             use_overlap=use_overlap,
-#         )
-
-#         (curvature1, overlap_position1) = _calculate_curvature(list_metric1)
-#         overlap_position1 += offset1
-#         (curvature2, overlap_position2) = _calculate_curvature(list_metric2)
-#         overlap_position2 += offset2
-
-#         if curvature1 > curvature2:
-#             side = 1
-#             overlap_position = overlap_position1
-#             overlap = ncol1 - overlap_position + win_width // 2
-#         else:
-#             side = 0
-#             overlap_position = overlap_position2
-#             overlap = overlap_position + win_width // 2
-
-#     return overlap, side, overlap_position
-
-
-# @nvtx.annotate()
-# def _search_overlap(
-#     mat1, mat2, win_width, side, denoise=True, norm=False, use_overlap=False
-# ):
-#     """
-#     Calculate the correlation metrics between a rectangular region, defined
-#     by the window width, on the utmost left/right side of image 2 and the
-#     same size region in image 1 where the region is slided across image 1.
-
-#     Parameters
-#     ----------
-#     mat1 : array_like
-#         2D array. Projection image or sinogram image.
-#     mat2 : array_like
-#         2D array. Projection image or sinogram image.
-#     win_width : int
-#         Width of the searching window.
-#     side : {0, 1}
-#         Only two options: 0 or 1. It is used to indicate the overlap side
-#         respects to image 1. "0" corresponds to the left side. "1" corresponds
-#         to the right side.
-#     denoise : bool, optional
-#         Apply the Gaussian filter if True.
-#     norm : bool, optional
-#         Apply the normalization if True.
-#     use_overlap : bool, optional
-#         Use the combination of images in the overlap area for calculating
-#         correlation coefficients if True.
-
-#     Returns
-#     -------
-#     list_metric : array_like
-#         1D array. List of the correlation metrics.
-#     offset : int
-#         Initial position of the searching window where the position
-#         corresponds to the center of the window.
-#     """
-#     if denoise is True:
-#         # note: the filtering makes the output contiguous
-#         with nvtx.annotate("denoise_filter", color="green"):
-#             mat1 = gaussian_filter(mat1, (2, 2), mode="reflect")
-#             mat2 = gaussian_filter(mat2, (2, 2), mode="reflect")
-#     else:
-#         mat1 = xp.ascontiguousarray(mat1, dtype=xp.float32)
-#         mat2 = xp.ascontiguousarray(mat2, dtype=xp.float32)
-
-#     (nrow1, ncol1) = mat1.shape
-#     (nrow2, ncol2) = mat2.shape
-
-#     if nrow1 != nrow2:
-#         raise ValueError("Two images are not at the same height!!!")
-
-#     win_width = int(np.clip(win_width, 6, min(ncol1, ncol2) // 2 - 1))
-#     offset = win_width // 2
-#     win_width = 2 * offset  # Make it even
-
-#     list_metric = _calc_metrics(mat1, mat2, win_width, side, use_overlap, norm)
-
-#     min_metric = xp.min(list_metric)
-#     if min_metric != 0.0:
-#         list_metric /= min_metric
-
-#     return list_metric, offset
-
-
-# _calc_metrics_module = load_cuda_module(
-#     "calc_metrics",
-#     name_expressions=[
-#         "calc_metrics_kernel<false, false>",
-#         "calc_metrics_kernel<false, true>",
-#         "calc_metrics_kernel<true, false>",
-#         "calc_metrics_kernel<true, true>",
-#     ],
-#     options=("--maxrregcount=32",),
-# )
-
-
-# @nvtx.annotate()
-# def _calc_metrics(mat1, mat2, win_width, side, use_overlap, norm):
-#     assert mat1.dtype == xp.float32, "only float32 supported"
-#     assert mat2.dtype == xp.float32, "only float32 supported"
-#     assert mat1.shape[0] == mat2.shape[0]
-#     assert mat1.flags.c_contiguous, "only contiguos arrays supported"
-#     assert mat2.flags.c_contiguous, "only contiguos arrays supported"
-
-#     num_pos = mat1.shape[1] - win_width
-#     list_metric = xp.empty(num_pos, dtype=xp.float32)
-
-#     args = (
-#         mat1,
-#         np.int32(mat1.strides[0] / mat1.strides[1]),
-#         mat2,
-#         np.int32(mat2.strides[0] / mat2.strides[1]),
-#         np.int32(win_width),
-#         np.int32(mat1.shape[0]),
-#         np.int32(side),
-#         list_metric,
-#     )
-#     block = (128, 1, 1)
-#     grid = (1, np.int32(num_pos), 1)
-#     smem = block[0] * 4 * 6 if use_overlap else block[0] * 4 * 3
-#     bool2str = lambda x: "true" if x is True else "false"
-#     calc_metrics = _calc_metrics_module.get_function(
-#         f"calc_metrics_kernel<{bool2str(norm)}, {bool2str(use_overlap)}>"
-#     )
-#     calc_metrics(grid=grid, block=block, args=args, shared_mem=smem)
-
-#     return list_metric
-
-
-# @nvtx.annotate()
-# def _calculate_curvature(list_metric):
-#     """
-#     Calculate the curvature of a fitted curve going through the minimum
-#     value of a metric list.
-
-#     Parameters
-#     ----------
-#     list_metric : array_like
-#         1D array. List of metrics.
-
-#     Returns
-#     -------
-#     curvature : float
-#         Quadratic coefficient of the parabola fitting.
-#     min_pos : float
-#         Position of the minimum value with sub-pixel accuracy.
-#     """
-#     radi = 2
-#     num_metric = list_metric.size
-#     min_metric_idx = int(xp.argmin(list_metric))
-#     min_pos = int(np.clip(min_metric_idx, radi, num_metric - radi - 1))
-
-#     # work mostly on CPU here - we have very small arrays here
-#     list1 = xp.asnumpy(list_metric[min_pos - radi : min_pos + radi + 1])
-#     afact1 = np.polyfit(np.arange(0, 2 * radi + 1), list1, 2)[0]
-#     list2 = xp.asnumpy(list_metric[min_pos - 1 : min_pos + 2])
-#     (afact2, bfact2, _) = np.polyfit(np.arange(min_pos - 1, min_pos + 2), list2, 2)
-
-#     curvature = np.abs(afact1)
-#     if afact2 != 0.0:
-#         num = -bfact2 / (2 * afact2)
-#         if (num >= min_pos - 1) and (num <= min_pos + 1):
-#             min_pos = num
-
-#     return curvature, np.float32(min_pos)
-
-
-# # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-
-
-# ## %%%%%%%%%%%%%%%%%%%%%%find_center_pc%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-# @nvtx.annotate()
-# def find_center_pc(
-#     proj1: xp.ndarray, proj2: xp.ndarray, tol: float = 0.5, rotc_guess: Union[float, Optional[str]] = None
-# ) -> float:
-#     """Find rotation axis location by finding the offset between the first
-#     projection and a mirrored projection 180 degrees apart using
-#     phase correlation in Fourier space.
-#     The `phase_cross_correlation` function uses cross-correlation in Fourier
-#     space, optionally employing an upsampled matrix-multiplication DFT to
-#     achieve arbitrary subpixel precision. :cite:`Guizar:08`.
-
-#     Args:
-#         proj1 (xp.ndarray): Projection from the 0th degree
-#         proj2 (xp.ndarray): Projection from the 180th degree
-#         tol (float, optional): Subpixel accuracy. Defaults to 0.5.
-#         rotc_guess (float, optional): Initial guess value for the rotation center. Defaults to None.
-
-#     Returns:
-#         float: Rotation axis location.
-#     """
-#     if xp.__name__ == "cupy":
-#         from cupyx.scipy.ndimage import shift
-#         try:
-#             from cucim.skimage.registration import phase_cross_correlation
-#         except ImportError:
-#             print(
-#                 "Cucim library of Rapidsai is a required dependency for find_center_pc module, please install"
-#             )
-#     else:
-#         from skimage.registration import phase_cross_correlation
-#         from scipy.ndimage import shift
-
-#     imgshift = 0.0 if rotc_guess is None else rotc_guess - (proj1.shape[1] - 1.0) / 2.0
-
-#     proj1 = shift(proj1, [0, -imgshift], mode="constant", cval=0)
-#     proj2 = shift(proj2, [0, -imgshift], mode="constant", cval=0)
-
-#     # create reflection of second projection
-#     proj2 = xp.fliplr(proj2)
-
-#     # using cucim of rapids to do phase cross correlation between two images
-#     shiftr = phase_cross_correlation(
-#         reference_image=proj1, moving_image=proj2, upsample_factor=1.0 / tol
-#     )
-
-#     # Compute center of rotation as the center of first image and the
-#     # registered translation with the second image
-#     center = (proj1.shape[1] + shiftr[0][1] - 1.0) / 2.0
-
-#     return center + imgshift
-
-# ##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
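Review note (not part of the change itself): the additions below replace the commented-out block above, and every public method in this diff follows the same CPU-safe dispatch pattern. A minimal sketch of that pattern, with illustrative names that are not in the library:

    def public_method(data):
        # run the GPU implementation only when CuPy imported and a device is visible
        if cupywrapper.cupy_run:
            return __private_gpu_impl(data)
        print("public_method won't be executed because CuPy is not installed")
        return data  # benign placeholder so pipelines can still be composed

    @nvtx.annotate()  # NVTX ranges now live only on the private GPU path
    def __private_gpu_impl(data):
        ...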
+def find_center_360(
+    data: cp.ndarray,
+    ind: Optional[int] = None,
+    win_width: int = 10,
+    side: Optional[Literal[0, 1]] = None,
+    denoise: bool = True,
+    norm: bool = False,
+    use_overlap: bool = False,
+) -> Tuple[float, float, Optional[Literal[0, 1]], float]:
+    """
+    Find the center-of-rotation (COR) in a 360-degree scan with offset COR,
+    using the method presented in Ref. [1] by Nghia Vo.
+
+    This function requires CuPy; if CuPy is not installed, the computation
+    is skipped and a placeholder result is returned.
+
+    Parameters
+    ----------
+    data : cp.ndarray
+        3D tomographic data as a CuPy array.
+    ind : int, optional
+        Index of the slice to be used for estimating the CoR and the overlap.
+    win_width : int, optional
+        Window width used for finding the overlap area.
+    side : {None, 0, 1}, optional
+        Overlap side. Only three options: None, 0, or 1. "None" corresponds
+        to fully automated determination. "0" corresponds to the left side.
+        "1" corresponds to the right side.
+    denoise : bool, optional
+        Apply the Gaussian filter if True.
+    norm : bool, optional
+        Apply the normalisation if True.
+    use_overlap : bool, optional
+        Use the combination of images in the overlap area for calculating
+        correlation coefficients if True.
+
+    Returns
+    -------
+    cor : float
+        Center-of-rotation.
+    overlap : float
+        Width of the overlap area between two halves of the sinogram.
+    side : int
+        Overlap side between two halves of the sinogram.
+    overlap_position : float
+        Position of the window in the first image giving the best
+        correlation metric.
+
+    References
+    ----------
+    [1] : https://doi.org/10.1364/OE.418448
+    """
+
+    if cupywrapper.cupy_run:
+        return __find_center_360(data, ind, win_width, side, denoise, norm, use_overlap)
+    else:
+        print("find_center_360 won't be executed because CuPy is not installed")
+        return (0, 0, 0, 0)
+
+
+@nvtx.annotate()
+def __find_center_360(
+    data: cp.ndarray,
+    ind: Optional[int] = None,
+    win_width: int = 10,
+    side: Optional[Literal[0, 1]] = None,
+    denoise: bool = True,
+    norm: bool = False,
+    use_overlap: bool = False,
+) -> Tuple[float, float, Optional[Literal[0, 1]], float]:
+
+    if data.ndim != 3:
+        raise ValueError("A 3D array must be provided")
+
+    # this method works with a 360-degree sinogram.
+    if ind is None:
+        _sino = data[:, 0, :]
+    else:
+        _sino = data[:, ind, :]
+
+    (nrow, ncol) = _sino.shape
+    nrow_180 = nrow // 2 + 1
+    sino_top = _sino[0:nrow_180, :]
+    sino_bot = cp.fliplr(_sino[-nrow_180:, :])
+    (overlap, side, overlap_position) = _find_overlap(
+        sino_top, sino_bot, win_width, side, denoise, norm, use_overlap
+    )
+    if side == 0:
+        cor = overlap / 2.0 - 1.0
+    else:
+        cor = ncol - overlap / 2.0 - 1.0
+
+    return cor, overlap, side, overlap_position
+
+
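A minimal usage sketch for the re-exported function; the shapes are hypothetical, not taken from the library's tests:

    import cupy as cp
    from httomolibgpu.recon.rotation import find_center_360

    # synthetic 360-degree dataset: (angles, slices, detector-x)
    data = cp.random.random((1800, 3, 2560), dtype=cp.float32)

    # estimate the COR and overlap from slice index 1
    cor, overlap, side, overlap_pos = find_center_360(data, ind=1, win_width=10)
    print(cor, overlap, side, overlap_pos)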
+def _find_overlap(
+    mat1, mat2, win_width, side=None, denoise=True, norm=False, use_overlap=False
+):
+    """
+    Find the overlap area and overlap side between two images (Ref. [1]),
+    where the overlap side refers to the first image.
+
+    Parameters
+    ----------
+    mat1 : array_like
+        2D array. Projection image or sinogram image.
+    mat2 : array_like
+        2D array. Projection image or sinogram image.
+    win_width : int
+        Width of the searching window.
+    side : {None, 0, 1}, optional
+        Only three options: None, 0, or 1. "None" corresponds to fully
+        automated determination. "0" corresponds to the left side. "1"
+        corresponds to the right side.
+    denoise : bool, optional
+        Apply the Gaussian filter if True.
+    norm : bool, optional
+        Apply the normalization if True.
+    use_overlap : bool, optional
+        Use the combination of images in the overlap area for calculating
+        correlation coefficients if True.
+
+    Returns
+    -------
+    overlap : float
+        Width of the overlap area between two images.
+    side : int
+        Overlap side between two images.
+    overlap_position : float
+        Position of the window in the first image giving the best
+        correlation metric.
+
+    """
+    ncol1 = mat1.shape[1]
+    ncol2 = mat2.shape[1]
+    win_width = int(np.clip(win_width, 6, min(ncol1, ncol2) // 2))
+
+    if side == 1:
+        (list_metric, offset) = _search_overlap(
+            mat1,
+            mat2,
+            win_width,
+            side=side,
+            denoise=denoise,
+            norm=norm,
+            use_overlap=use_overlap,
+        )
+        overlap_position = _calculate_curvature(list_metric)[1]
+        overlap_position += offset
+        overlap = ncol1 - overlap_position + win_width // 2
+    elif side == 0:
+        (list_metric, offset) = _search_overlap(
+            mat1,
+            mat2,
+            win_width,
+            side=side,
+            denoise=denoise,
+            norm=norm,
+            use_overlap=use_overlap,
+        )
+        overlap_position = _calculate_curvature(list_metric)[1]
+        overlap_position += offset
+        overlap = overlap_position + win_width // 2
+    else:
+        (list_metric1, offset1) = _search_overlap(
+            mat1,
+            mat2,
+            win_width,
+            side=1,
+            denoise=denoise,
+            norm=norm,
+            use_overlap=use_overlap,
+        )
+        (list_metric2, offset2) = _search_overlap(
+            mat1,
+            mat2,
+            win_width,
+            side=0,
+            denoise=denoise,
+            norm=norm,
+            use_overlap=use_overlap,
+        )
+
+        (curvature1, overlap_position1) = _calculate_curvature(list_metric1)
+        overlap_position1 += offset1
+        (curvature2, overlap_position2) = _calculate_curvature(list_metric2)
+        overlap_position2 += offset2
+
+        # the side with the sharper metric minimum (larger curvature) wins
+        if curvature1 > curvature2:
+            side = 1
+            overlap_position = overlap_position1
+            overlap = ncol1 - overlap_position + win_width // 2
+        else:
+            side = 0
+            overlap_position = overlap_position2
+            overlap = overlap_position + win_width // 2
+
+    return overlap, side, overlap_position
+
+
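A worked example of the overlap geometry above, with hypothetical numbers (in the automatic branch, `_find_overlap` picks whichever side yields the sharper, i.e. higher-curvature, metric minimum):

    # right side (side=1), 2560-column sinogram halves, window width 10:
    ncol1, win_width, overlap_position = 2560, 10, 2400.0
    overlap = ncol1 - overlap_position + win_width // 2  # 165.0 overlapping columns
    # __find_center_360 then converts this overlap into the COR:
    cor = ncol1 - overlap / 2.0 - 1.0                    # 2476.5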
+@nvtx.annotate()
+def _search_overlap(
+    mat1, mat2, win_width, side, denoise=True, norm=False, use_overlap=False
+):
+    """
+    Calculate the correlation metrics between a rectangular region, defined
+    by the window width, on the outermost left/right side of image 2 and a
+    same-sized region in image 1, where the region is slid across image 1.
+
+    Parameters
+    ----------
+    mat1 : array_like
+        2D array. Projection image or sinogram image.
+    mat2 : array_like
+        2D array. Projection image or sinogram image.
+    win_width : int
+        Width of the searching window.
+    side : {0, 1}
+        Only two options: 0 or 1. It is used to indicate the overlap side
+        with respect to image 1. "0" corresponds to the left side. "1"
+        corresponds to the right side.
+    denoise : bool, optional
+        Apply the Gaussian filter if True.
+    norm : bool, optional
+        Apply the normalization if True.
+    use_overlap : bool, optional
+        Use the combination of images in the overlap area for calculating
+        correlation coefficients if True.
+
+    Returns
+    -------
+    list_metric : array_like
+        1D array. List of the correlation metrics.
+    offset : int
+        Initial position of the searching window where the position
+        corresponds to the center of the window.
+    """
+    from cupyx.scipy.ndimage import gaussian_filter
+
+    if denoise is True:
+        # note: the filtering makes the output contiguous
+        with nvtx.annotate("denoise_filter", color="green"):
+            mat1 = gaussian_filter(mat1, (2, 2), mode="reflect")
+            mat2 = gaussian_filter(mat2, (2, 2), mode="reflect")
+    else:
+        mat1 = cp.ascontiguousarray(mat1, dtype=cp.float32)
+        mat2 = cp.ascontiguousarray(mat2, dtype=cp.float32)
+
+    (nrow1, ncol1) = mat1.shape
+    (nrow2, ncol2) = mat2.shape
+
+    if nrow1 != nrow2:
+        raise ValueError("Two images are not at the same height")
+
+    win_width = int(np.clip(win_width, 6, min(ncol1, ncol2) // 2 - 1))
+    offset = win_width // 2
+    win_width = 2 * offset  # Make it even
+
+    list_metric = _calc_metrics(mat1, mat2, win_width, side, use_overlap, norm)
+
+    min_metric = cp.min(list_metric)
+    if min_metric != 0.0:
+        list_metric /= min_metric
+
+    return list_metric, offset
+
+
+@nvtx.annotate()
+def _calc_metrics(mat1, mat2, win_width, side, use_overlap, norm):
+    assert mat1.dtype == cp.float32, "only float32 supported"
+    assert mat2.dtype == cp.float32, "only float32 supported"
+    assert mat1.shape[0] == mat2.shape[0]
+    assert mat1.flags.c_contiguous, "only contiguous arrays supported"
+    assert mat2.flags.c_contiguous, "only contiguous arrays supported"
+
+    from httomolibgpu.cuda_kernels import load_cuda_module
+
+    _calc_metrics_module = load_cuda_module(
+        "calc_metrics",
+        name_expressions=[
+            "calc_metrics_kernel<false, false>",
+            "calc_metrics_kernel<false, true>",
+            "calc_metrics_kernel<true, false>",
+            "calc_metrics_kernel<true, true>",
+        ],
+        options=("--maxrregcount=32",),
+    )
+
+    num_pos = mat1.shape[1] - win_width
+    list_metric = cp.empty(num_pos, dtype=cp.float32)
+
+    args = (
+        mat1,
+        np.int32(mat1.strides[0] / mat1.strides[1]),
+        mat2,
+        np.int32(mat2.strides[0] / mat2.strides[1]),
+        np.int32(win_width),
+        np.int32(mat1.shape[0]),
+        np.int32(side),
+        list_metric,
+    )
+    block = (128, 1, 1)
+    grid = (1, np.int32(num_pos), 1)
+    smem = block[0] * 4 * 6 if use_overlap else block[0] * 4 * 3
+    bool2str = lambda x: "true" if x is True else "false"
+    calc_metrics = _calc_metrics_module.get_function(
+        f"calc_metrics_kernel<{bool2str(norm)}, {bool2str(use_overlap)}>"
+    )
+    calc_metrics(grid=grid, block=block, args=args, shared_mem=smem)
+
+    return list_metric
+
+
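Spelling out the launch-configuration arithmetic above with hypothetical sizes (float32 is 4 bytes); the runtime kernel name is assembled from the two template booleans, which is why all four instantiations must be listed in `name_expressions`:

    num_pos = 2560 - 10       # one CUDA block per candidate window position
    block = (128, 1, 1)       # 128 threads reduce one window position
    use_overlap = True
    smem = 128 * 4 * (6 if use_overlap else 3)  # 3072 vs 1536 bytes of shared memory
    # norm=True, use_overlap=True selects "calc_metrics_kernel<true, true>"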
+@nvtx.annotate()
+def _calculate_curvature(list_metric):
+    """
+    Calculate the curvature of a fitted curve going through the minimum
+    value of a metric list.
+
+    Parameters
+    ----------
+    list_metric : array_like
+        1D array. List of metrics.
+
+    Returns
+    -------
+    curvature : float
+        Quadratic coefficient of the parabola fitting.
+    min_pos : float
+        Position of the minimum value with sub-pixel accuracy.
+    """
+    radi = 2
+    num_metric = list_metric.size
+    min_metric_idx = int(cp.argmin(list_metric))
+    min_pos = int(np.clip(min_metric_idx, radi, num_metric - radi - 1))
+
+    # work mostly on CPU here - we have very small arrays here
+    list1 = cp.asnumpy(list_metric[min_pos - radi : min_pos + radi + 1])
+    afact1 = np.polyfit(np.arange(0, 2 * radi + 1), list1, 2)[0]
+    list2 = cp.asnumpy(list_metric[min_pos - 1 : min_pos + 2])
+    (afact2, bfact2, _) = np.polyfit(np.arange(min_pos - 1, min_pos + 2), list2, 2)
+
+    curvature = np.abs(afact1)
+    if afact2 != 0.0:
+        num = -bfact2 / (2 * afact2)  # vertex of the local parabola fit
+        if (num >= min_pos - 1) and (num <= min_pos + 1):
+            min_pos = num
+
+    return curvature, np.float32(min_pos)
+
+
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+
+## %%%%%%%%%%%%%%%%%%%%%%find_center_pc%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+def find_center_pc(
+    proj1: cp.ndarray,
+    proj2: cp.ndarray,
+    tol: float = 0.5,
+    rotc_guess: Optional[float] = None,
+) -> float:
+    """Find rotation axis location by finding the offset between the first
+    projection and a mirrored projection 180 degrees apart using
+    phase correlation in Fourier space.
+    The `phase_cross_correlation` function uses cross-correlation in Fourier
+    space, optionally employing an upsampled matrix-multiplication DFT to
+    achieve arbitrary subpixel precision. :cite:`Guizar:08`.
+
+    Args:
+        proj1 (cp.ndarray): Projection at 0 degrees.
+        proj2 (cp.ndarray): Projection at 180 degrees.
+        tol (float, optional): Subpixel accuracy. Defaults to 0.5.
+        rotc_guess (float, optional): Initial guess value for the rotation center. Defaults to None.
+
+    Returns:
+        float: Rotation axis location.
+    """
+    if cupywrapper.cupy_run:
+        return __find_center_pc(proj1, proj2, tol, rotc_guess)
+    else:
+        print("find_center_pc won't be executed because CuPy is not installed")
+        return 0
+
+
+@nvtx.annotate()
+def __find_center_pc(
+    proj1: cp.ndarray,
+    proj2: cp.ndarray,
+    tol: float = 0.5,
+    rotc_guess: Optional[float] = None,
+) -> float:
+
+    from cupyx.scipy.ndimage import shift
+    from cucim.skimage.registration import phase_cross_correlation
+
+    imgshift = 0.0 if rotc_guess is None else rotc_guess - (proj1.shape[1] - 1.0) / 2.0
+
+    proj1 = shift(proj1, [0, -imgshift], mode="constant", cval=0)
+    proj2 = shift(proj2, [0, -imgshift], mode="constant", cval=0)
+
+    # create reflection of second projection
+    proj2 = cp.fliplr(proj2)
+
+    # use cuCIM (RAPIDS) to compute the phase cross-correlation between the two images
+    shiftr = phase_cross_correlation(
+        reference_image=proj1, moving_image=proj2, upsample_factor=1.0 / tol
+    )
+
+    # Compute the center of rotation from the center of the first image and the
+    # registered translation with the second image
+    center = (proj1.shape[1] + shiftr[0][1] - 1.0) / 2.0
+
+    return center + imgshift
+
+
+##%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
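A usage sketch for the phase-correlation variant; projection shapes are hypothetical, and the GPU path additionally requires cuCIM (imported above from cucim.skimage.registration):

    import cupy as cp
    from httomolibgpu.recon.rotation import find_center_pc

    proj1 = cp.random.random((2160, 2560), dtype=cp.float32)  # 0-degree projection
    proj2 = cp.random.random((2160, 2560), dtype=cp.float32)  # 180-degree projection

    center = find_center_pc(proj1, proj2, tol=0.5)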