diff --git a/httomolibgpu/recon/rotation.py b/httomolibgpu/recon/rotation.py
index b61f579f..3c26fb42 100644
--- a/httomolibgpu/recon/rotation.py
+++ b/httomolibgpu/recon/rotation.py
@@ -53,89 +53,674 @@
 if cupy_run:
     from httomolibgpu.cuda_kernels import load_cuda_module
 
+# __all__ = [
+#     "find_center_vo",
+#     "find_center_360",
+#     "find_center_pc",
+# ]
+
 __all__ = [
-    "find_center_vo",
-    "find_center_360",
     "find_center_pc",
 ]
 
 
-@nvtx.annotate()
-def find_center_vo(
-    data: xp.ndarray,
-    ind: Optional[int] = None,
-    smin: int = -50,
-    smax: int = 50,
-    srad: float = 6.0,
-    step: float = 0.25,
-    ratio: float = 0.5,
-    drop: int = 20,
-) -> float:
-    """
-    Find rotation axis location using Nghia Vo's method. See the paper
-    https://opg.optica.org/oe/fulltext.cfm?uri=oe-22-16-19078&id=297315
-
-    Parameters
-    ----------
-    data : cp.ndarray
-        3D tomographic data or a 2D sinogram as a CuPy array.
-    ind : int, optional
-        Index of the slice to be used to estimate the CoR.
-    smin : int, optional
-        Coarse search radius. Reference to the horizontal center of
-        the sinogram.
-    smax : int, optional
-        Coarse search radius. Reference to the horizontal center of
-        the sinogram.
-    srad : float, optional
-        Fine search radius.
-    step : float, optional
-        Step of fine searching.
-    ratio : float, optional
-        The ratio between the FOV of the camera and the size of object.
-        It's used to generate the mask.
-    drop : int, optional
-        Drop lines around vertical center of the mask.
-
-    Returns
-    -------
-    float
-        Rotation axis location.
-    """
-
-    if data.ndim == 2:
-        data = xp.expand_dims(data, 1)
-        ind = 0
-
-    height = data.shape[1]
-
-    if ind is None:
-        ind = height // 2
-        if height > 10:
-            _sino = xp.mean(data[:, ind - 5 : ind + 5, :], axis=1)
-        else:
-            _sino = data[:, ind, :]
-    else:
-        _sino = data[:, ind, :]
-
-    with nvtx.annotate("gaussian_filter_1", color="green"):
-        _sino_cs = gaussian_filter(_sino, (3, 1), mode="reflect")
-    with nvtx.annotate("gaussian_filter_2", color="green"):
-        _sino_fs = gaussian_filter(_sino, (2, 2), mode="reflect")
-
-    if _sino.shape[0] * _sino.shape[1] > 4e6:
-        # data is large, so downsample it before performing search for
-        # centre of rotation
-        _sino_coarse = _downsample(_sino_cs, 2, 1)
-        init_cen = _search_coarse(_sino_coarse, smin / 4.0, smax / 4.0, ratio, drop)
-        fine_cen = _search_fine(_sino_fs, srad, step, init_cen * 4.0, ratio, drop)
-    else:
-        init_cen = _search_coarse(_sino_cs, smin, smax, ratio, drop)
-        fine_cen = _search_fine(_sino_fs, srad, step, init_cen, ratio, drop)
-
-    return fine_cen.get()
-
-
+# %%%%%%%%%%%%%%%%%%%%%%%%%find_center_vo%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+# @nvtx.annotate()
+# def find_center_vo(
+#     data: xp.ndarray,
+#     ind: Optional[int] = None,
+#     smin: int = -50,
+#     smax: int = 50,
+#     srad: float = 6.0,
+#     step: float = 0.25,
+#     ratio: float = 0.5,
+#     drop: int = 20,
+# ) -> float:
+#     """
+#     Find rotation axis location using Nghia Vo's method. See the paper
+#     https://opg.optica.org/oe/fulltext.cfm?uri=oe-22-16-19078&id=297315
+
+#     Parameters
+#     ----------
+#     data : cp.ndarray
+#         3D tomographic data or a 2D sinogram as a CuPy array.
+#     ind : int, optional
+#         Index of the slice to be used to estimate the CoR.
+#     smin : int, optional
+#         Coarse search radius. Reference to the horizontal center of
+#         the sinogram.
+#     smax : int, optional
+#         Coarse search radius. Reference to the horizontal center of
+#         the sinogram.
+#     srad : float, optional
+#         Fine search radius.
+#     step : float, optional
+#         Step of fine searching.
+#     ratio : float, optional
+#         The ratio between the FOV of the camera and the size of object.
+#         It's used to generate the mask.
+#     drop : int, optional
+#         Drop lines around vertical center of the mask.
+
+#     Returns
+#     -------
+#     float
+#         Rotation axis location.
+#     """
+
+#     if data.ndim == 2:
+#         data = xp.expand_dims(data, 1)
+#         ind = 0
+
+#     height = data.shape[1]
+
+#     if ind is None:
+#         ind = height // 2
+#         if height > 10:
+#             _sino = xp.mean(data[:, ind - 5 : ind + 5, :], axis=1)
+#         else:
+#             _sino = data[:, ind, :]
+#     else:
+#         _sino = data[:, ind, :]
+
+#     with nvtx.annotate("gaussian_filter_1", color="green"):
+#         _sino_cs = gaussian_filter(_sino, (3, 1), mode="reflect")
+#     with nvtx.annotate("gaussian_filter_2", color="green"):
+#         _sino_fs = gaussian_filter(_sino, (2, 2), mode="reflect")
+
+#     if _sino.shape[0] * _sino.shape[1] > 4e6:
+#         # data is large, so downsample it before performing search for
+#         # centre of rotation
+#         _sino_coarse = _downsample(_sino_cs, 2, 1)
+#         init_cen = _search_coarse(_sino_coarse, smin / 4.0, smax / 4.0, ratio, drop)
+#         fine_cen = _search_fine(_sino_fs, srad, step, init_cen * 4.0, ratio, drop)
+#     else:
+#         init_cen = _search_coarse(_sino_cs, smin, smax, ratio, drop)
+#         fine_cen = _search_fine(_sino_fs, srad, step, init_cen, ratio, drop)
+
+#     return xp.asnumpy(fine_cen)
+
+
+# @nvtx.annotate()
+# def _search_coarse(sino, smin, smax, ratio, drop):
+#     (nrow, ncol) = sino.shape
+#     flip_sino = xp.ascontiguousarray(xp.fliplr(sino))
+#     comp_sino = xp.ascontiguousarray(xp.flipud(sino))
+#     mask = _create_mask(2 * nrow, ncol, 0.5 * ratio * ncol, drop)
+
+#     cen_fliplr = (ncol - 1.0) / 2.0
+#     smin_clip_val = max(min(smin + cen_fliplr, ncol - 1), 0)
+#     smin = smin_clip_val - cen_fliplr
+#     smax_clip_val = max(min(smax + cen_fliplr, ncol - 1), 0)
+#     smax = smax_clip_val - cen_fliplr
+#     start_cor = ncol // 2 + smin
+#     stop_cor = ncol // 2 + smax
+#     list_cor = xp.arange(start_cor, stop_cor + 0.5, 0.5, dtype=xp.float32)
+#     list_shift = 2.0 * (list_cor - cen_fliplr)
+#     list_metric = xp.empty(list_shift.shape, dtype=xp.float32)
+#     _calculate_metric(list_shift, sino, flip_sino, comp_sino, mask, list_metric)
+
+#     minpos = xp.argmin(list_metric)
+#     if minpos == 0:
+#         print("WARNING!!!Global minimum is out of searching range")
+#         print(f"Please extend smin: {smin}")
+#     if minpos == len(list_metric) - 1:
+#         print("WARNING!!!Global minimum is out of searching range")
+#         print(f"Please extend smax: {smax}")
+#     cor = list_cor[minpos]
+#     return cor
+
+
+# @nvtx.annotate()
+# def _search_fine(sino, srad, step, init_cen, ratio, drop):
+#     (nrow, ncol) = sino.shape
+
+#     flip_sino = xp.ascontiguousarray(xp.fliplr(sino))
+#     comp_sino = xp.ascontiguousarray(xp.flipud(sino))
+#     mask = _create_mask(2 * nrow, ncol, 0.5 * ratio * ncol, drop)
+
+#     cen_fliplr = (ncol - 1.0) / 2.0
+#     srad = max(min(abs(float(srad)), ncol / 4.0), 1.0)
+#     step = max(min(abs(step), srad), 0.1)
+#     init_cen = max(min(init_cen, ncol - srad - 1), srad)
+#     list_cor = init_cen + xp.arange(-srad, srad + step, step, dtype=np.float32)
+#     list_shift = 2.0 * (list_cor - cen_fliplr)
+#     list_metric = xp.empty(list_shift.shape, dtype="float32")
+
+#     _calculate_metric(list_shift, sino, flip_sino, comp_sino, mask, out=list_metric)
+#     cor = list_cor[xp.argmin(list_metric)]
+#     return cor
+
+
+# @nvtx.annotate()
+# def _create_mask(nrow, ncol, radius, drop):
+#     du = 1.0 / ncol
+#     dv = (nrow - 1.0) / (nrow * 2.0 * np.pi)
+#     cen_row = int(math.ceil(nrow / 2.0) - 1)
+#     cen_col = int(math.ceil(ncol / 2.0) - 1)
+#     drop = min([drop, int(math.ceil(0.05 * nrow))])
+
+#     block_x = 128
+#     block_y = 1
+#     block_dims = (block_x, block_y)
+#     grid_x = (ncol // 2 + 1 + block_x - 1) // block_x
+#     grid_y = nrow
+#     grid_dims = (grid_x, grid_y)
+#     mask = xp.empty((nrow, ncol // 2 + 1), dtype="uint16")
+#     params = (
+#         ncol,
+#         nrow,
+#         cen_col,
+#         cen_row,
+#         xp.float32(du),
+#         xp.float32(dv),
+#         xp.float32(radius),
+#         xp.float32(drop),
+#         mask,
+#     )
+#     module = load_cuda_module("generate_mask")
+#     kernel = module.get_function("generate_mask")
+#     kernel(grid_dims, block_dims, params)
+#     return mask
+
+
+# def round_up(x: float) -> int:
+#     if x >= 0.0:
+#         return int(math.ceil(x))
+#     else:
+#         return int(math.floor(x))
+
+
+# def _get_available_gpu_memory() -> int:
+#     dev = xp.cuda.Device()
+#     # first, let's make some space
+#     xp.get_default_memory_pool().free_all_blocks()
+#     cache = xp.fft.config.get_plan_cache()
+#     cache.clear()
+#     available_memory = dev.mem_info[0] + xp.get_default_memory_pool().free_bytes()
+#     return int(available_memory * 0.9)  # 10% safety margin
+
+
+# def _calculate_chunks(
+#     nshifts: int, shift_size: int, available_memory: Optional[int] = None
+# ) -> List[int]:
+#     if available_memory is None:
+#         available_memory = _get_available_gpu_memory()
+
+#     available_memory -= shift_size
+#     freq_domain_size = (
+#         shift_size  # it needs only half (RFFT), but complex64, so it's the same
+#     )
+#     fft_plan_size = freq_domain_size
+#     size_per_shift = fft_plan_size + freq_domain_size + shift_size
+#     nshift_max = available_memory // size_per_shift
+#     assert nshift_max > 0, "Not enough memory to process"
+#     num_chunks = int(np.ceil(nshifts / nshift_max))
+#     chunk_size = int(np.ceil(nshifts / num_chunks))
+#     chunks = [chunk_size] * (num_chunks - 1)
+#     stop_idx = list(np.cumsum(chunks))
+#     stop_idx.append(nshifts)
+#     return stop_idx
+
+
+# @nvtx.annotate()
+# def _calculate_metric(list_shift, sino1, sino2, sino3, mask, out):
+#     # this tries to simplify - if shift_col is integer, no need to spline interpolate
+#     assert list_shift.dtype == xp.float32, "shifts must be single precision floats"
+#     assert sino1.dtype == xp.float32, "sino1 must be float32"
+#     assert sino2.dtype == xp.float32, "sino1 must be float32"
+#     assert sino3.dtype == xp.float32, "sino1 must be float32"
+#     assert out.dtype == xp.float32, "sino1 must be float32"
+#     assert sino2.flags["C_CONTIGUOUS"], "sino2 must be C-contiguous"
+#     assert sino3.flags["C_CONTIGUOUS"], "sino3 must be C-contiguous"
+#     assert list_shift.flags["C_CONTIGUOUS"], "list_shift must be C-contiguous"
+#     nshifts = list_shift.shape[0]
+#     na1 = sino1.shape[0]
+#     na2 = sino2.shape[0]
+
+#     module = load_cuda_module("center_360_shifts")
+#     shift_whole_shifts = module.get_function("shift_whole_shifts")
+#     # note: we don't have to calculate the mean here, as we're only looking for minimum metric.
+#     # The sum is enough.
+#     masked_sum_abs_kernel = xp.ReductionKernel(
+#         in_params="complex64 x, uint16 mask",  # input, complex + mask
+#         out_params="float32 out",  # output, real
+#         map_expr="mask ? abs(x) : 0.0f",
+#         reduce_expr="a + b",
+#         post_map_expr="out = a",
+#         identity="0.0f",
+#         reduce_type="float",
+#         name="masked_sum_abs",
+#     )
+
+#     # determine how many shifts we can fit in the available memory
+#     # and iterate in chunks
+#     chunks = _calculate_chunks(
+#         nshifts, (na1 + na2) * sino2.shape[1] * xp.float32().nbytes
+#     )
+
+#     mat = xp.empty((chunks[0], na1 + na2, sino2.shape[1]), dtype=xp.float32)
+#     mat[:, :na1, :] = sino1
+#     # explicitly create FFT plan here, so it's not cached and clearly re-used
+#     plan = cupyx.scipy.fftpack.get_fft_plan(
+#         mat, mat.shape[-2:], axes=(1, 2), value_type="R2C"
+#     )
+
+#     for i, stop_idx in enumerate(chunks):
+#         if i > 0:
+#             # more than one iteration means we're tight on memory, so clear up freed blocks
+#             mat_freq = None
+#             xp.get_default_memory_pool().free_all_blocks()
+
+#         start_idx = 0 if i == 0 else chunks[i - 1]
+#         size = stop_idx - start_idx
+
+#         # first, handle the integer shifts without spline in a raw kernel,
+#         # and shift in the sino3 one accordingly
+#         bx = 128
+#         gx = (sino3.shape[1] + bx - 1) // bx
+#         shift_whole_shifts(
+#             grid=(gx, na2, size),  ####
+#             block=(bx, 1, 1),
+#             args=(
+#                 sino2,
+#                 sino3,
+#                 list_shift[start_idx:stop_idx],
+#                 mat[:, na1:, :],
+#                 sino3.shape[1],
+#                 na1 + na2,
+#             ),
+#         )
+
+#         # now we can only look at the spline shifting, the rest is done
+#         list_shift_host = xp.asnumpy(list_shift[start_idx:stop_idx])
+#         for i in range(list_shift_host.shape[0]):
+#             shift_col = float(list_shift_host[i])
+#             if not shift_col.is_integer():
+#                 shifted = shift(sino2, (0, shift_col), order=3, prefilter=True)
+#                 shift_int = round_up(shift_col)
+#                 if shift_int >= 0:
+#                     mat[i, na1:, shift_int:] = shifted[:, shift_int:]
+#                 else:
+#                     mat[i, na1:, :shift_int] = shifted[:, :shift_int]
+
+#         # stack and transform
+#         # (we do the full sized mat FFT, even though the last chunk may be smaller, to
+#         # make sure we can re-use the same FFT plan as before)
+#         mat_freq = cupyx.scipy.fft.rfft2(mat, axes=(1, 2), norm=None, plan=plan)
+#         masked_sum_abs_kernel(
+#             mat_freq[:size, :, :], mask, out=out[start_idx:stop_idx], axis=(1, 2)
+#         )
+
+
+# @nvtx.annotate()
+# def _downsample(sino, level, axis):
+#     assert sino.dtype == xp.float32, "single precision floating point input required"
+#     assert sino.flags["C_CONTIGUOUS"], "list_shift must be C-contiguous"
+
+#     dx, dz = sino.shape
+#     # Determine the new size, dim, of the downsampled dimension
+#     dim = int(sino.shape[axis] / math.pow(2, level))
+#     shape = [dx, dz]
+#     shape[axis] = dim
+#     downsampled_data = xp.empty(shape, dtype="float32")
+
+#     block_x = 8
+#     block_y = 8
+#     block_dims = (block_x, block_y)
+#     grid_x = (sino.shape[1] + block_x - 1) // block_x
+#     grid_y = (sino.shape[0] + block_y - 1) // block_y
+#     grid_dims = (grid_x, grid_y)
+#     # 8x8 thread-block, which means 16 "lots" of columns to downsample per
+#     # thread-block; 4 bytes per float, so allocate 16*6 = 64 bytes of shared
+#     # memeory per thread-block
+#     shared_mem_bytes = 64
+#     params = (sino, dx, dz, level, downsampled_data)
+#     module = load_cuda_module("downsample_sino")
+#     kernel = module.get_function("downsample_sino")
+#     kernel(grid_dims, block_dims, params, shared_mem=shared_mem_bytes)
+#     return downsampled_data
+
+
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+
+# # %%%%%%%%%%%%%%%%%%%%%%%%%find_center_360%%%%%%%%%%%%%%%%%%%%%%%%%
+# # --- Center of rotation (COR) estimation method ---#
+# @nvtx.annotate()
+# def find_center_360(
+#     data: xp.ndarray,
+#     ind: Optional[int] = None,
+#     win_width: int = 10,
+#     side: Optional[Literal[0, 1]] = None,
+#     denoise: bool = True,
+#     norm: bool = False,
+#     use_overlap: bool = False,
+# ) -> Tuple[float, float, Optional[Literal[0, 1]], float]:
+#     """
+#     Find the center-of-rotation (COR) in a 360-degree scan with offset COR use
+#     the method presented in Ref. [1] by Nghia Vo.
+
+#     This function supports both numpy and cupy - the implementation is selected
+#     by where the input data array resides.
+
+#     Parameters
+#     ----------
+#     data : cp.ndarray
+#         3D tomographic data as a Cupy array.
+#     ind : int, optional
+#         Index of the slice to be used for estimate the CoR and the overlap.
+#     win_width : int, optional
+#         Window width used for finding the overlap area.
+#     side : {None, 0, 1}, optional
+#         Overlap size. Only there options: None, 0, or 1. "None" corresponds
+#         to fully automated determination. "0" corresponds to the left side.
+#         "1" corresponds to the right side.
+#     denoise : bool, optional
+#         Apply the Gaussian filter if True.
+#     norm : bool, optional
+#         Apply the normalisation if True.
+#     use_overlap : bool, optional
+#         Use the combination of images in the overlap area for calculating
+#         correlation coefficients if True.
+
+#     Returns
+#     -------
+#     cor : float
+#         Center-of-rotation.
+#     overlap : float
+#         Width of the overlap area between two halves of the sinogram.
+#     side : int
+#         Overlap side between two halves of the sinogram.
+#     overlap_position : float
+#         Position of the window in the first image giving the best
+#         correlation metric.
+
+#     References
+#     ----------
+#     [1] : https://doi.org/10.1364/OE.418448
+#     """
+#     if data.ndim != 3:
+#         raise ValueError("A 3D array must be provided")
+
+#     # this method works with a 360-degree sinogram.
+#     if ind is None:
+#         _sino = data[:, 0, :]
+#     else:
+#         _sino = data[:, ind, :]
+
+#     (nrow, ncol) = _sino.shape
+#     nrow_180 = nrow // 2 + 1
+#     sino_top = _sino[0:nrow_180, :]
+#     sino_bot = xp.fliplr(_sino[-nrow_180:, :])
+#     (overlap, side, overlap_position) = _find_overlap(
+#         sino_top, sino_bot, win_width, side, denoise, norm, use_overlap
+#     )
+#     if side == 0:
+#         cor = overlap / 2.0 - 1.0
+#     else:
+#         cor = ncol - overlap / 2.0 - 1.0
+
+#     return cor, overlap, side, overlap_position
+
+
+# def _find_overlap(
+#     mat1, mat2, win_width, side=None, denoise=True, norm=False, use_overlap=False
+# ):
+#     """
+#     Find the overlap area and overlap side between two images (Ref. [1]) where
+#     the overlap side referring to the first image.
+
+#     Parameters
+#     ----------
+#     mat1 : array_like
+#         2D array. Projection image or sinogram image.
+#     mat2 :  array_like
+#         2D array. Projection image or sinogram image.
+#     win_width : int
+#         Width of the searching window.
+#     side : {None, 0, 1}, optional
+#         Only there options: None, 0, or 1. "None" corresponding to fully
+#         automated determination. "0" corresponding to the left side. "1"
+#         corresponding to the right side.
+#     denoise : bool, optional
+#         Apply the Gaussian filter if True.
+#     norm : bool, optional
+#         Apply the normalization if True.
+#     use_overlap : bool, optional
+#         Use the combination of images in the overlap area for calculating
+#         correlation coefficients if True.
+
+#     Returns
+#     -------
+#     overlap : float
+#         Width of the overlap area between two images.
+#     side : int
+#         Overlap side between two images.
+#     overlap_position : float
+#         Position of the window in the first image giving the best
+#         correlation metric.
+
+#     """
+#     ncol1 = mat1.shape[1]
+#     ncol2 = mat2.shape[1]
+#     win_width = int(np.clip(win_width, 6, min(ncol1, ncol2) // 2))
+
+#     if side == 1:
+#         (list_metric, offset) = _search_overlap(
+#             mat1,
+#             mat2,
+#             win_width,
+#             side=side,
+#             denoise=denoise,
+#             norm=norm,
+#             use_overlap=use_overlap,
+#         )
+#         overlap_position = _calculate_curvature(list_metric)[1]
+#         overlap_position += offset
+#         overlap = ncol1 - overlap_position + win_width // 2
+#     elif side == 0:
+#         (list_metric, offset) = _search_overlap(
+#             mat1,
+#             mat2,
+#             win_width,
+#             side=side,
+#             denoise=denoise,
+#             norm=norm,
+#             use_overlap=use_overlap,
+#         )
+#         overlap_position = _calculate_curvature(list_metric)[1]
+#         overlap_position += offset
+#         overlap = overlap_position + win_width // 2
+#     else:
+#         (list_metric1, offset1) = _search_overlap(
+#             mat1,
+#             mat2,
+#             win_width,
+#             side=1,
+#             denoise=denoise,
+#             norm=norm,
+#             use_overlap=use_overlap,
+#         )
+#         (list_metric2, offset2) = _search_overlap(
+#             mat1,
+#             mat2,
+#             win_width,
+#             side=0,
+#             denoise=denoise,
+#             norm=norm,
+#             use_overlap=use_overlap,
+#         )
+
+#         (curvature1, overlap_position1) = _calculate_curvature(list_metric1)
+#         overlap_position1 += offset1
+#         (curvature2, overlap_position2) = _calculate_curvature(list_metric2)
+#         overlap_position2 += offset2
+
+#         if curvature1 > curvature2:
+#             side = 1
+#             overlap_position = overlap_position1
+#             overlap = ncol1 - overlap_position + win_width // 2
+#         else:
+#             side = 0
+#             overlap_position = overlap_position2
+#             overlap = overlap_position + win_width // 2
+
+#     return overlap, side, overlap_position
+
+
+# @nvtx.annotate()
+# def _search_overlap(
+#     mat1, mat2, win_width, side, denoise=True, norm=False, use_overlap=False
+# ):
+#     """
+#     Calculate the correlation metrics between a rectangular region, defined
+#     by the window width, on the utmost left/right side of image 2 and the
+#     same size region in image 1 where the region is slided across image 1.
+
+#     Parameters
+#     ----------
+#     mat1 : array_like
+#         2D array. Projection image or sinogram image.
+#     mat2 : array_like
+#         2D array. Projection image or sinogram image.
+#     win_width : int
+#         Width of the searching window.
+#     side : {0, 1}
+#         Only two options: 0 or 1. It is used to indicate the overlap side
+#         respects to image 1. "0" corresponds to the left side. "1" corresponds
+#         to the right side.
+#     denoise : bool, optional
+#         Apply the Gaussian filter if True.
+#     norm : bool, optional
+#         Apply the normalization if True.
+#     use_overlap : bool, optional
+#         Use the combination of images in the overlap area for calculating
+#         correlation coefficients if True.
+
+#     Returns
+#     -------
+#     list_metric : array_like
+#         1D array. List of the correlation metrics.
+#     offset : int
+#         Initial position of the searching window where the position
+#         corresponds to the center of the window.
+#     """
+#     if denoise is True:
+#         # note: the filtering makes the output contiguous
+#         with nvtx.annotate("denoise_filter", color="green"):
+#             mat1 = gaussian_filter(mat1, (2, 2), mode="reflect")
+#             mat2 = gaussian_filter(mat2, (2, 2), mode="reflect")
+#     else:
+#         mat1 = xp.ascontiguousarray(mat1, dtype=xp.float32)
+#         mat2 = xp.ascontiguousarray(mat2, dtype=xp.float32)
+
+#     (nrow1, ncol1) = mat1.shape
+#     (nrow2, ncol2) = mat2.shape
+
+#     if nrow1 != nrow2:
+#         raise ValueError("Two images are not at the same height!!!")
+
+#     win_width = int(np.clip(win_width, 6, min(ncol1, ncol2) // 2 - 1))
+#     offset = win_width // 2
+#     win_width = 2 * offset  # Make it even
+
+#     list_metric = _calc_metrics(mat1, mat2, win_width, side, use_overlap, norm)
+
+#     min_metric = xp.min(list_metric)
+#     if min_metric != 0.0:
+#         list_metric /= min_metric
+
+#     return list_metric, offset
+
+
+# _calc_metrics_module = load_cuda_module(
+#     "calc_metrics",
+#     name_expressions=[
+#         "calc_metrics_kernel<false, false>",
+#         "calc_metrics_kernel<true, false>",
+#         "calc_metrics_kernel<false, true>",
+#         "calc_metrics_kernel<true, true>",
+#     ],
+#     options=("--maxrregcount=32",),
+# )
+
+
+# @nvtx.annotate()
+# def _calc_metrics(mat1, mat2, win_width, side, use_overlap, norm):
+#     assert mat1.dtype == xp.float32, "only float32 supported"
+#     assert mat2.dtype == xp.float32, "only float32 supported"
+#     assert mat1.shape[0] == mat2.shape[0]
+#     assert mat1.flags.c_contiguous, "only contiguos arrays supported"
+#     assert mat2.flags.c_contiguous, "only contiguos arrays supported"
+
+#     num_pos = mat1.shape[1] - win_width
+#     list_metric = xp.empty(num_pos, dtype=xp.float32)
+
+#     args = (
+#         mat1,
+#         np.int32(mat1.strides[0] / mat1.strides[1]),
+#         mat2,
+#         np.int32(mat2.strides[0] / mat2.strides[1]),
+#         np.int32(win_width),
+#         np.int32(mat1.shape[0]),
+#         np.int32(side),
+#         list_metric,
+#     )
+#     block = (128, 1, 1)
+#     grid = (1, np.int32(num_pos), 1)
+#     smem = block[0] * 4 * 6 if use_overlap else block[0] * 4 * 3
+#     bool2str = lambda x: "true" if x is True else "false"
+#     calc_metrics = _calc_metrics_module.get_function(
+#         f"calc_metrics_kernel<{bool2str(norm)}, {bool2str(use_overlap)}>"
+#     )
+#     calc_metrics(grid=grid, block=block, args=args, shared_mem=smem)
+
+#     return list_metric
+
+
+# @nvtx.annotate()
+# def _calculate_curvature(list_metric):
+#     """
+#     Calculate the curvature of a fitted curve going through the minimum
+#     value of a metric list.
+
+#     Parameters
+#     ----------
+#     list_metric : array_like
+#         1D array. List of metrics.
+
+#     Returns
+#     -------
+#     curvature : float
+#         Quadratic coefficient of the parabola fitting.
+#     min_pos : float
+#         Position of the minimum value with sub-pixel accuracy.
+#     """
+#     radi = 2
+#     num_metric = list_metric.size
+#     min_metric_idx = int(xp.argmin(list_metric))
+#     min_pos = int(np.clip(min_metric_idx, radi, num_metric - radi - 1))
+
+#     # work mostly on CPU here - we have very small arrays here
+#     list1 = xp.asnumpy(list_metric[min_pos - radi : min_pos + radi + 1])
+#     afact1 = np.polyfit(np.arange(0, 2 * radi + 1), list1, 2)[0]
+#     list2 = xp.asnumpy(list_metric[min_pos - 1 : min_pos + 2])
+#     (afact2, bfact2, _) = np.polyfit(np.arange(min_pos - 1, min_pos + 2), list2, 2)
+
+#     curvature = np.abs(afact1)
+#     if afact2 != 0.0:
+#         num = -bfact2 / (2 * afact2)
+#         if (num >= min_pos - 1) and (num <= min_pos + 1):
+#             min_pos = num
+
+#     return curvature, np.float32(min_pos)
+
+
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+
+# %%%%%%%%%%%%%%%%%%%%%%find_center_pc%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 @nvtx.annotate()
 def find_center_pc(proj1, proj2, tol=0.5, rotc_guess=None):
     """
@@ -186,573 +771,4 @@ def find_center_pc(proj1, proj2, tol=0.5, rotc_guess=None):
     return center + imgshift
 
 
-@nvtx.annotate()
-def _search_coarse(sino, smin, smax, ratio, drop):
-    (nrow, ncol) = sino.shape
-    flip_sino = xp.ascontiguousarray(xp.fliplr(sino))
-    comp_sino = xp.ascontiguousarray(xp.flipud(sino))
-    mask = _create_mask(2 * nrow, ncol, 0.5 * ratio * ncol, drop)
-
-    cen_fliplr = (ncol - 1.0) / 2.0
-    smin_clip_val = max(min(smin + cen_fliplr, ncol - 1), 0)
-    smin = smin_clip_val - cen_fliplr
-    smax_clip_val = max(min(smax + cen_fliplr, ncol - 1), 0)
-    smax = smax_clip_val - cen_fliplr
-    start_cor = ncol // 2 + smin
-    stop_cor = ncol // 2 + smax
-    list_cor = xp.arange(start_cor, stop_cor + 0.5, 0.5, dtype=xp.float32)
-    list_shift = 2.0 * (list_cor - cen_fliplr)
-    list_metric = xp.empty(list_shift.shape, dtype=xp.float32)
-    _calculate_metric(list_shift, sino, flip_sino, comp_sino, mask, list_metric)
-
-    minpos = xp.argmin(list_metric)
-    if minpos == 0:
-        print("WARNING!!!Global minimum is out of searching range")
-        print(f"Please extend smin: {smin}")
-    if minpos == len(list_metric) - 1:
-        print("WARNING!!!Global minimum is out of searching range")
-        print(f"Please extend smax: {smax}")
-    cor = list_cor[minpos]
-    return cor
-
-
-@nvtx.annotate()
-def _search_fine(sino, srad, step, init_cen, ratio, drop):
-    (nrow, ncol) = sino.shape
-
-    flip_sino = xp.ascontiguousarray(xp.fliplr(sino))
-    comp_sino = xp.ascontiguousarray(xp.flipud(sino))
-    mask = _create_mask(2 * nrow, ncol, 0.5 * ratio * ncol, drop)
-
-    cen_fliplr = (ncol - 1.0) / 2.0
-    srad = max(min(abs(float(srad)), ncol / 4.0), 1.0)
-    step = max(min(abs(step), srad), 0.1)
-    init_cen = max(min(init_cen, ncol - srad - 1), srad)
-    list_cor = init_cen + xp.arange(-srad, srad + step, step, dtype=np.float32)
-    list_shift = 2.0 * (list_cor - cen_fliplr)
-    list_metric = xp.empty(list_shift.shape, dtype="float32")
-
-    _calculate_metric(list_shift, sino, flip_sino, comp_sino, mask, out=list_metric)
-    cor = list_cor[xp.argmin(list_metric)]
-    return cor
-
-
-@nvtx.annotate()
-def _create_mask(nrow, ncol, radius, drop):
-    du = 1.0 / ncol
-    dv = (nrow - 1.0) / (nrow * 2.0 * np.pi)
-    cen_row = int(math.ceil(nrow / 2.0) - 1)
-    cen_col = int(math.ceil(ncol / 2.0) - 1)
-    drop = min([drop, int(math.ceil(0.05 * nrow))])
-
-    block_x = 128
-    block_y = 1
-    block_dims = (block_x, block_y)
-    grid_x = (ncol // 2 + 1 + block_x - 1) // block_x
-    grid_y = nrow
-    grid_dims = (grid_x, grid_y)
-    mask = xp.empty((nrow, ncol // 2 + 1), dtype="uint16")
-    params = (
-        ncol,
-        nrow,
-        cen_col,
-        cen_row,
-        xp.float32(du),
-        xp.float32(dv),
-        xp.float32(radius),
-        xp.float32(drop),
-        mask,
-    )
-    module = load_cuda_module("generate_mask")
-    kernel = module.get_function("generate_mask")
-    kernel(grid_dims, block_dims, params)
-    return mask
-
-
-def round_up(x: float) -> int:
-    if x >= 0.0:
-        return int(math.ceil(x))
-    else:
-        return int(math.floor(x))
-
-
-def _get_available_gpu_memory() -> int:
-    dev = xp.cuda.Device()
-    # first, let's make some space
-    xp.get_default_memory_pool().free_all_blocks()
-    cache = xp.fft.config.get_plan_cache()
-    cache.clear()
-    available_memory = dev.mem_info[0] + xp.get_default_memory_pool().free_bytes()
-    return int(available_memory * 0.9)  # 10% safety margin
-
-
-def _calculate_chunks(
-    nshifts: int, shift_size: int, available_memory: Optional[int] = None
-) -> List[int]:
-    if available_memory is None:
-        available_memory = _get_available_gpu_memory()
-
-    available_memory -= shift_size
-    freq_domain_size = (
-        shift_size  # it needs only half (RFFT), but complex64, so it's the same
-    )
-    fft_plan_size = freq_domain_size
-    size_per_shift = fft_plan_size + freq_domain_size + shift_size
-    nshift_max = available_memory // size_per_shift
-    assert nshift_max > 0, "Not enough memory to process"
-    num_chunks = int(np.ceil(nshifts / nshift_max))
-    chunk_size = int(np.ceil(nshifts / num_chunks))
-    chunks = [chunk_size] * (num_chunks - 1)
-    stop_idx = list(np.cumsum(chunks))
-    stop_idx.append(nshifts)
-    return stop_idx
-
-
-@nvtx.annotate()
-def _calculate_metric(list_shift, sino1, sino2, sino3, mask, out):
-    # this tries to simplify - if shift_col is integer, no need to spline interpolate
-    assert list_shift.dtype == xp.float32, "shifts must be single precision floats"
-    assert sino1.dtype == xp.float32, "sino1 must be float32"
-    assert sino2.dtype == xp.float32, "sino1 must be float32"
-    assert sino3.dtype == xp.float32, "sino1 must be float32"
-    assert out.dtype == xp.float32, "sino1 must be float32"
-    assert sino2.flags["C_CONTIGUOUS"], "sino2 must be C-contiguous"
-    assert sino3.flags["C_CONTIGUOUS"], "sino3 must be C-contiguous"
-    assert list_shift.flags["C_CONTIGUOUS"], "list_shift must be C-contiguous"
-    nshifts = list_shift.shape[0]
-    na1 = sino1.shape[0]
-    na2 = sino2.shape[0]
-
-    module = load_cuda_module("center_360_shifts")
-    shift_whole_shifts = module.get_function("shift_whole_shifts")
-    # note: we don't have to calculate the mean here, as we're only looking for minimum metric.
-    # The sum is enough.
-    masked_sum_abs_kernel = xp.ReductionKernel(
-        in_params="complex64 x, uint16 mask",  # input, complex + mask
-        out_params="float32 out",  # output, real
-        map_expr="mask ? abs(x) : 0.0f",
-        reduce_expr="a + b",
-        post_map_expr="out = a",
-        identity="0.0f",
-        reduce_type="float",
-        name="masked_sum_abs",
-    )
-
-    # determine how many shifts we can fit in the available memory
-    # and iterate in chunks
-    chunks = _calculate_chunks(
-        nshifts, (na1 + na2) * sino2.shape[1] * xp.float32().nbytes
-    )
-
-    mat = xp.empty((chunks[0], na1 + na2, sino2.shape[1]), dtype=xp.float32)
-    mat[:, :na1, :] = sino1
-    # explicitly create FFT plan here, so it's not cached and clearly re-used
-    plan = cupyx.scipy.fftpack.get_fft_plan(
-        mat, mat.shape[-2:], axes=(1, 2), value_type="R2C"
-    )
-
-    for i, stop_idx in enumerate(chunks):
-        if i > 0:
-            # more than one iteration means we're tight on memory, so clear up freed blocks
-            mat_freq = None
-            xp.get_default_memory_pool().free_all_blocks()
-
-        start_idx = 0 if i == 0 else chunks[i - 1]
-        size = stop_idx - start_idx
-
-        # first, handle the integer shifts without spline in a raw kernel,
-        # and shift in the sino3 one accordingly
-        bx = 128
-        gx = (sino3.shape[1] + bx - 1) // bx
-        shift_whole_shifts(
-            grid=(gx, na2, size),  ####
-            block=(bx, 1, 1),
-            args=(
-                sino2,
-                sino3,
-                list_shift[start_idx:stop_idx],
-                mat[:, na1:, :],
-                sino3.shape[1],
-                na1 + na2,
-            ),
-        )
-
-        # now we can only look at the spline shifting, the rest is done
-        list_shift_host = xp.asnumpy(list_shift[start_idx:stop_idx])
-        for i in range(list_shift_host.shape[0]):
-            shift_col = float(list_shift_host[i])
-            if not shift_col.is_integer():
-                shifted = shift(sino2, (0, shift_col), order=3, prefilter=True)
-                shift_int = round_up(shift_col)
-                if shift_int >= 0:
-                    mat[i, na1:, shift_int:] = shifted[:, shift_int:]
-                else:
-                    mat[i, na1:, :shift_int] = shifted[:, :shift_int]
-
-        # stack and transform
-        # (we do the full sized mat FFT, even though the last chunk may be smaller, to
-        # make sure we can re-use the same FFT plan as before)
-        mat_freq = cupyx.scipy.fft.rfft2(mat, axes=(1, 2), norm=None, plan=plan)
-        masked_sum_abs_kernel(
-            mat_freq[:size, :, :], mask, out=out[start_idx:stop_idx], axis=(1, 2)
-        )
-
-
-@nvtx.annotate()
-def _downsample(sino, level, axis):
-    assert sino.dtype == xp.float32, "single precision floating point input required"
-    assert sino.flags["C_CONTIGUOUS"], "list_shift must be C-contiguous"
-
-    dx, dz = sino.shape
-    # Determine the new size, dim, of the downsampled dimension
-    dim = int(sino.shape[axis] / math.pow(2, level))
-    shape = [dx, dz]
-    shape[axis] = dim
-    downsampled_data = xp.empty(shape, dtype="float32")
-
-    block_x = 8
-    block_y = 8
-    block_dims = (block_x, block_y)
-    grid_x = (sino.shape[1] + block_x - 1) // block_x
-    grid_y = (sino.shape[0] + block_y - 1) // block_y
-    grid_dims = (grid_x, grid_y)
-    # 8x8 thread-block, which means 16 "lots" of columns to downsample per
-    # thread-block; 4 bytes per float, so allocate 16*6 = 64 bytes of shared
-    # memeory per thread-block
-    shared_mem_bytes = 64
-    params = (sino, dx, dz, level, downsampled_data)
-    module = load_cuda_module("downsample_sino")
-    kernel = module.get_function("downsample_sino")
-    kernel(grid_dims, block_dims, params, shared_mem=shared_mem_bytes)
-    return downsampled_data
-
-
-# --- Center of rotation (COR) estimation method ---#
-@nvtx.annotate()
-def find_center_360(
-    data: xp.ndarray,
-    ind: Optional[int] = None,
-    win_width: int = 10,
-    side: Optional[Literal[0, 1]] = None,
-    denoise: bool = True,
-    norm: bool = False,
-    use_overlap: bool = False,
-) -> Tuple[float, float, Optional[Literal[0, 1]], float]:
-    """
-    Find the center-of-rotation (COR) in a 360-degree scan with offset COR use
-    the method presented in Ref. [1] by Nghia Vo.
-
-    This function supports both numpy and cupy - the implementation is selected
-    by where the input data array resides.
-
-    Parameters
-    ----------
-    data : cp.ndarray
-        3D tomographic data as a Cupy array.
-    ind : int, optional
-        Index of the slice to be used for estimate the CoR and the overlap.
-    win_width : int, optional
-        Window width used for finding the overlap area.
-    side : {None, 0, 1}, optional
-        Overlap size. Only there options: None, 0, or 1. "None" corresponds
-        to fully automated determination. "0" corresponds to the left side.
-        "1" corresponds to the right side.
-    denoise : bool, optional
-        Apply the Gaussian filter if True.
-    norm : bool, optional
-        Apply the normalisation if True.
-    use_overlap : bool, optional
-        Use the combination of images in the overlap area for calculating
-        correlation coefficients if True.
-
-    Returns
-    -------
-    cor : float
-        Center-of-rotation.
-    overlap : float
-        Width of the overlap area between two halves of the sinogram.
-    side : int
-        Overlap side between two halves of the sinogram.
-    overlap_position : float
-        Position of the window in the first image giving the best
-        correlation metric.
-
-    References
-    ----------
-    [1] : https://doi.org/10.1364/OE.418448
-    """
-    if data.ndim != 3:
-        raise ValueError("A 3D array must be provided")
-
-    # this method works with a 360-degree sinogram.
-    if ind is None:
-        _sino = data[:, 0, :]
-    else:
-        _sino = data[:, ind, :]
-
-    (nrow, ncol) = _sino.shape
-    nrow_180 = nrow // 2 + 1
-    sino_top = _sino[0:nrow_180, :]
-    sino_bot = xp.fliplr(_sino[-nrow_180:, :])
-    (overlap, side, overlap_position) = _find_overlap(
-        sino_top, sino_bot, win_width, side, denoise, norm, use_overlap
-    )
-    if side == 0:
-        cor = overlap / 2.0 - 1.0
-    else:
-        cor = ncol - overlap / 2.0 - 1.0
-
-    return cor, overlap, side, overlap_position
-
-
-def _find_overlap(
-    mat1, mat2, win_width, side=None, denoise=True, norm=False, use_overlap=False
-):
-    """
-    Find the overlap area and overlap side between two images (Ref. [1]) where
-    the overlap side referring to the first image.
-
-    Parameters
-    ----------
-    mat1 : array_like
-        2D array. Projection image or sinogram image.
-    mat2 :  array_like
-        2D array. Projection image or sinogram image.
-    win_width : int
-        Width of the searching window.
-    side : {None, 0, 1}, optional
-        Only there options: None, 0, or 1. "None" corresponding to fully
-        automated determination. "0" corresponding to the left side. "1"
-        corresponding to the right side.
-    denoise : bool, optional
-        Apply the Gaussian filter if True.
-    norm : bool, optional
-        Apply the normalization if True.
-    use_overlap : bool, optional
-        Use the combination of images in the overlap area for calculating
-        correlation coefficients if True.
-
-    Returns
-    -------
-    overlap : float
-        Width of the overlap area between two images.
-    side : int
-        Overlap side between two images.
-    overlap_position : float
-        Position of the window in the first image giving the best
-        correlation metric.
-
-    """
-    ncol1 = mat1.shape[1]
-    ncol2 = mat2.shape[1]
-    win_width = int(np.clip(win_width, 6, min(ncol1, ncol2) // 2))
-
-    if side == 1:
-        (list_metric, offset) = _search_overlap(
-            mat1,
-            mat2,
-            win_width,
-            side=side,
-            denoise=denoise,
-            norm=norm,
-            use_overlap=use_overlap,
-        )
-        overlap_position = _calculate_curvature(list_metric)[1]
-        overlap_position += offset
-        overlap = ncol1 - overlap_position + win_width // 2
-    elif side == 0:
-        (list_metric, offset) = _search_overlap(
-            mat1,
-            mat2,
-            win_width,
-            side=side,
-            denoise=denoise,
-            norm=norm,
-            use_overlap=use_overlap,
-        )
-        overlap_position = _calculate_curvature(list_metric)[1]
-        overlap_position += offset
-        overlap = overlap_position + win_width // 2
-    else:
-        (list_metric1, offset1) = _search_overlap(
-            mat1,
-            mat2,
-            win_width,
-            side=1,
-            denoise=denoise,
-            norm=norm,
-            use_overlap=use_overlap,
-        )
-        (list_metric2, offset2) = _search_overlap(
-            mat1,
-            mat2,
-            win_width,
-            side=0,
-            denoise=denoise,
-            norm=norm,
-            use_overlap=use_overlap,
-        )
-
-        (curvature1, overlap_position1) = _calculate_curvature(list_metric1)
-        overlap_position1 += offset1
-        (curvature2, overlap_position2) = _calculate_curvature(list_metric2)
-        overlap_position2 += offset2
-
-        if curvature1 > curvature2:
-            side = 1
-            overlap_position = overlap_position1
-            overlap = ncol1 - overlap_position + win_width // 2
-        else:
-            side = 0
-            overlap_position = overlap_position2
-            overlap = overlap_position + win_width // 2
-
-    return overlap, side, overlap_position
-
-
-@nvtx.annotate()
-def _search_overlap(
-    mat1, mat2, win_width, side, denoise=True, norm=False, use_overlap=False
-):
-    """
-    Calculate the correlation metrics between a rectangular region, defined
-    by the window width, on the utmost left/right side of image 2 and the
-    same size region in image 1 where the region is slided across image 1.
-
-    Parameters
-    ----------
-    mat1 : array_like
-        2D array. Projection image or sinogram image.
-    mat2 : array_like
-        2D array. Projection image or sinogram image.
-    win_width : int
-        Width of the searching window.
-    side : {0, 1}
-        Only two options: 0 or 1. It is used to indicate the overlap side
-        respects to image 1. "0" corresponds to the left side. "1" corresponds
-        to the right side.
-    denoise : bool, optional
-        Apply the Gaussian filter if True.
-    norm : bool, optional
-        Apply the normalization if True.
-    use_overlap : bool, optional
-        Use the combination of images in the overlap area for calculating
-        correlation coefficients if True.
-
-    Returns
-    -------
-    list_metric : array_like
-        1D array. List of the correlation metrics.
-    offset : int
-        Initial position of the searching window where the position
-        corresponds to the center of the window.
-    """
-    if denoise is True:
-        # note: the filtering makes the output contiguous
-        with nvtx.annotate("denoise_filter", color="green"):
-            mat1 = gaussian_filter(mat1, (2, 2), mode="reflect")
-            mat2 = gaussian_filter(mat2, (2, 2), mode="reflect")
-    else:
-        mat1 = xp.ascontiguousarray(mat1, dtype=xp.float32)
-        mat2 = xp.ascontiguousarray(mat2, dtype=xp.float32)
-
-    (nrow1, ncol1) = mat1.shape
-    (nrow2, ncol2) = mat2.shape
-
-    if nrow1 != nrow2:
-        raise ValueError("Two images are not at the same height!!!")
-
-    win_width = int(np.clip(win_width, 6, min(ncol1, ncol2) // 2 - 1))
-    offset = win_width // 2
-    win_width = 2 * offset  # Make it even
-
-    list_metric = _calc_metrics(mat1, mat2, win_width, side, use_overlap, norm)
-
-    min_metric = xp.min(list_metric)
-    if min_metric != 0.0:
-        list_metric /= min_metric
-
-    return list_metric, offset
-
-
-_calc_metrics_module = load_cuda_module(
-    "calc_metrics",
-    name_expressions=[
-        "calc_metrics_kernel<false, false>",
-        "calc_metrics_kernel<true, false>",
-        "calc_metrics_kernel<false, true>",
-        "calc_metrics_kernel<true, true>",
-    ],
-    options=("--maxrregcount=32",),
-)
-
-
-@nvtx.annotate()
-def _calc_metrics(mat1, mat2, win_width, side, use_overlap, norm):
-    assert mat1.dtype == xp.float32, "only float32 supported"
-    assert mat2.dtype == xp.float32, "only float32 supported"
-    assert mat1.shape[0] == mat2.shape[0]
-    assert mat1.flags.c_contiguous, "only contiguos arrays supported"
-    assert mat2.flags.c_contiguous, "only contiguos arrays supported"
-
-    num_pos = mat1.shape[1] - win_width
-    list_metric = xp.empty(num_pos, dtype=xp.float32)
-
-    args = (
-        mat1,
-        np.int32(mat1.strides[0] / mat1.strides[1]),
-        mat2,
-        np.int32(mat2.strides[0] / mat2.strides[1]),
-        np.int32(win_width),
-        np.int32(mat1.shape[0]),
-        np.int32(side),
-        list_metric,
-    )
-    block = (128, 1, 1)
-    grid = (1, np.int32(num_pos), 1)
-    smem = block[0] * 4 * 6 if use_overlap else block[0] * 4 * 3
-    bool2str = lambda x: "true" if x is True else "false"
-    calc_metrics = _calc_metrics_module.get_function(
-        f"calc_metrics_kernel<{bool2str(norm)}, {bool2str(use_overlap)}>"
-    )
-    calc_metrics(grid=grid, block=block, args=args, shared_mem=smem)
-
-    return list_metric
-
-
-@nvtx.annotate()
-def _calculate_curvature(list_metric):
-    """
-    Calculate the curvature of a fitted curve going through the minimum
-    value of a metric list.
-
-    Parameters
-    ----------
-    list_metric : array_like
-        1D array. List of metrics.
-
-    Returns
-    -------
-    curvature : float
-        Quadratic coefficient of the parabola fitting.
-    min_pos : float
-        Position of the minimum value with sub-pixel accuracy.
-    """
-    radi = 2
-    num_metric = list_metric.size
-    min_metric_idx = int(xp.argmin(list_metric))
-    min_pos = int(np.clip(min_metric_idx, radi, num_metric - radi - 1))
-
-    # work mostly on CPU here - we have very small arrays here
-    list1 = xp.asnumpy(list_metric[min_pos - radi : min_pos + radi + 1])
-    afact1 = np.polyfit(np.arange(0, 2 * radi + 1), list1, 2)[0]
-    list2 = xp.asnumpy(list_metric[min_pos - 1 : min_pos + 2])
-    (afact2, bfact2, _) = np.polyfit(np.arange(min_pos - 1, min_pos + 2), list2, 2)
-
-    curvature = np.abs(afact1)
-    if afact2 != 0.0:
-        num = -bfact2 / (2 * afact2)
-        if (num >= min_pos - 1) and (num <= min_pos + 1):
-            min_pos = num
-
-    return curvature, np.float32(min_pos)
+# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%