From 05e366217e7977f56a2d6bb6e5e396617784e99a Mon Sep 17 00:00:00 2001 From: dkazanc Date: Thu, 21 Sep 2023 16:36:48 +0100 Subject: [PATCH 01/26] test normalise processed --- tests/test_prep/test_normalize.py | 26 ++------------------------ 1 file changed, 2 insertions(+), 24 deletions(-) diff --git a/tests/test_prep/test_normalize.py b/tests/test_prep/test_normalize.py index 092ffbc0..1ef42557 100644 --- a/tests/test_prep/test_normalize.py +++ b/tests/test_prep/test_normalize.py @@ -7,10 +7,8 @@ from numpy.testing import assert_allclose, assert_equal from httomolibgpu.prep.normalize import normalize from httomolibgpu import method_registry -from cupy.cuda import nvtx - -from tests import MaxMemoryHook +from cupy.cuda import nvtx @cp.testing.gpu def test_normalize_1D_raises(data, flats, darks, ensure_clean_memory): @@ -37,26 +35,6 @@ def test_normalize(data, flats, darks, ensure_clean_memory): assert_allclose(np.std(data_normalize), 0.524382, rtol=1e-06) -@cp.testing.gpu -def test_normalize_meta(data, flats, darks, ensure_clean_memory): - # --- testing normalize ---# - hook = MaxMemoryHook() - with hook: - data_normalize = normalize(cp.copy(data), flats, darks, minus_log=True).get() - - # make sure estimator function is within range (80% min, 100% max) - max_mem = hook.max_mem - actual_slices = data.shape[0] - estimated_slices, dtype_out, output_dims = normalize.meta.calc_max_slices(0, - (data.shape[1], data.shape[2]), - data.dtype, max_mem, - flats=flats, darks=darks) - assert estimated_slices <= actual_slices - assert estimated_slices / actual_slices >= 0.8 - assert normalize.meta.pattern == 'projection' - assert 'normalize' in method_registry['httomolibgpu']['prep']['normalize'] - - @cp.testing.gpu @pytest.mark.perf def test_normalize_performance(ensure_clean_memory): @@ -105,4 +83,4 @@ def test_normalize_performance(ensure_clean_memory): duration_ms = float(end - start) * 1e-6 / 10 assert "performance in ms" == duration_ms - \ No newline at end of file + From 4da4eb1a0e9c97ea52c1263456a4fe8321959ee4 Mon Sep 17 00:00:00 2001 From: Daniil Kazantsev Date: Fri, 22 Sep 2023 11:48:21 +0100 Subject: [PATCH 02/26] remove fresnel filter for now as it requires more work around automation of different patterns with httomo --- httomolibgpu/prep/phase.py | 180 ---------------------------------- tests/test_prep/test_phase.py | 126 +++--------------------- 2 files changed, 11 insertions(+), 295 deletions(-) diff --git a/httomolibgpu/prep/phase.py b/httomolibgpu/prep/phase.py index fb36b50f..6efb2d7f 100644 --- a/httomolibgpu/prep/phase.py +++ b/httomolibgpu/prep/phase.py @@ -32,7 +32,6 @@ from httomolibgpu.decorator import method_proj __all__ = [ - "fresnel_filter", "paganin_filter_savu", "paganin_filter_tomopy", ] @@ -43,138 +42,6 @@ PI = 3.14159265359 PLANCK_CONSTANT = 6.58211928e-19 # [keV*s] - -def _calc_max_slices_fresnel( - non_slice_dims_shape: Tuple[int, int], - dtype: np.dtype, - available_memory: int, - **kwargs, -) -> Tuple[int, np.dtype, Tuple[int, int]]: - height1, width1 = non_slice_dims_shape - window_size = (height1 * width1) * np.float64().nbytes - pad_width = min(150, int(0.1 * width1)) - padded_height = height1 + 2 * pad_width - padded_width = width1 * 2 * pad_width - in_slice_size = height1 * width1 * dtype.itemsize - # float64 here as res is internally computed in double - internal_slice_size = padded_height * padded_width * np.float64().nbytes - out_slice_size = padded_height * padded_width * np.float32().nbytes - slice_size = in_slice_size + out_slice_size + internal_slice_size 
- # the internal data is computed using a for loop, so the temporaries won't have - # a significant impact. But we add a safety margin for that - safety = in_slice_size * 4 - - available_memory -= window_size + safety - return (available_memory // slice_size, float32(), non_slice_dims_shape) - - -# CuPy implementation of Fresnel filter ported from Savu -@method_proj(_calc_max_slices_fresnel) -def fresnel_filter( - mat: cp.ndarray, pattern: str, ratio: float, apply_log: bool = True -) -> cp.ndarray: - """ - Apply Fresnel filter. - - Parameters - ---------- - mat : cp.ndarray - The data to apply filtering to. - - pattern : str - Choose 'PROJECTION' for filtering projection, otherwise, will be handled - generically for other cases. - - ratio : float - Control the strength of the filter. Greater is stronger. - - apply_log : bool, optional - Apply negative log function to the data being filtered. - - Returns - ------- - cp.ndarray - The filtered data. - """ - - if mat.ndim == 2: - mat = cp.expand_dims(mat, 0) - - if mat.ndim != 3: - raise ValueError( - f"Invalid number of dimensions in data: {mat.ndim}," - " please provide a stack of 2D projections." - ) - - if apply_log is True: - mat = -cp.log(mat) - - # Define window - (depth1, height1, width1) = mat.shape[:3] - window = _make_window(height1, width1, ratio, pattern) - pad_width = min(150, int(0.1 * width1)) - - # Regardless of working with projections or sinograms, the rows and columns - # in the images to filter are in the same dimensions of the data: rows in - # dimension 1, columns in dimension 2 (ie, for projection images, `nrow` is - # the number of rows in a projection image, and for sinogram images, `nrow` - # is the number of rows in a sinogram image). - (_, nrow, ncol) = mat.shape - - # Define array to hold result. Note that, due to the padding applied, the - # shape of the filtered images are different to the shape of the - # original/unfiltered images. 
- padded_height = mat.shape[1] + pad_width * 2 - res_height = min(nrow, padded_height - pad_width) - padded_width = mat.shape[2] + pad_width * 2 - res_width = min(ncol, padded_width - pad_width) - res = cp.zeros((mat.shape[0], res_height, res_width)) - - # Loop over images and apply filter - for i in range(mat.shape[0]): - if pattern == "PROJECTION": - top_drop = 10 # To remove the time stamp in some data - mat_pad = cp.pad( - mat[i][top_drop:], - ((pad_width + top_drop, pad_width), (pad_width, pad_width)), - mode="edge", - ) - win_pad = cp.pad(window, pad_width, mode="edge") - mat_dec = cp.fft.ifft2(cp.fft.fft2(mat_pad) / cp.fft.ifftshift(win_pad)) - mat_dec = cp.real( - mat_dec[pad_width : pad_width + nrow, pad_width : pad_width + ncol] - ) - res[i] = mat_dec - else: - mat_pad = cp.pad(mat[i], ((0, 0), (pad_width, pad_width)), mode="edge") - win_pad = cp.pad(window, ((0, 0), (pad_width, pad_width)), mode="edge") - mat_fft = cp.fft.fftshift(cp.fft.fft(mat_pad), axes=1) / win_pad - mat_dec = cp.fft.ifft(cp.fft.ifftshift(mat_fft, axes=1)) - mat_dec = cp.real(mat_dec[:, pad_width : pad_width + ncol]) - res[i] = mat_dec - - if apply_log is True: - res = cp.exp(-res) - - return cp.asarray(res, dtype=cp.float32) - - -def _make_window(height, width, ratio, pattern): - center_hei = int(cp.ceil((height - 1) * 0.5)) - center_wid = int(cp.ceil((width - 1) * 0.5)) - if pattern == "PROJECTION": - ulist = (1.0 * cp.arange(0, width) - center_wid) / width - vlist = (1.0 * cp.arange(0, height) - center_hei) / height - u, v = cp.meshgrid(ulist, vlist) - win2d = 1.0 + ratio * (u**2 + v**2) - else: - ulist = (1.0 * cp.arange(0, width) - center_wid) / width - win1d = 1.0 + ratio * ulist**2 - win2d = cp.tile(win1d, (height, 1)) - - return win2d - - def _calc_max_slices_paganin_filter_savu( non_slice_dims_shape: Tuple[int, int], dtype: np.dtype, @@ -426,54 +293,7 @@ def _reciprocal_coord(pixel_size: float, num_grid: int) -> cp.ndarray: rc *= 2 * PI / (n * pixel_size) return rc - -def _calc_max_slices_paganin_filter_tomopy( - non_slice_dims_shape: Tuple[int, int], - dtype: np.dtype, - available_memory: int, - **kwargs, -) -> Tuple[int, np.dtype, Tuple[int, int]]: - pad_tup = [] - for index, element in enumerate(non_slice_dims_shape): - diff = _shift_bit_length(element + 1) - element - if element % 2 == 0: - pad_width = diff // 2 - pad_width = (pad_width, pad_width) - else: - # need an uneven padding for odd-number lengths - left_pad = diff // 2 - right_pad = diff - left_pad - pad_width = (left_pad, right_pad) - pad_tup.append(pad_width) - - input_size = np.prod(non_slice_dims_shape) * dtype.itemsize - in_slice_size = ( - (non_slice_dims_shape[0] + pad_tup[0][0]+pad_tup[0][1]) - * (non_slice_dims_shape[1] + pad_tup[1][0]+pad_tup[1][1]) - * dtype.itemsize - ) - out_slice_size = ( - (non_slice_dims_shape[0] + pad_tup[0][0]+pad_tup[0][1]) - * (non_slice_dims_shape[1] + pad_tup[1][0]+pad_tup[1][1]) - * np.float32().nbytes - ) - - # FFT needs complex inputs, so copy to complex happens first - complex_slice = in_slice_size / dtype.itemsize * np.complex64().nbytes - fftplan_slice = complex_slice - grid_size = np.prod(non_slice_dims_shape) * np.float32().nbytes - filter_size = grid_size - res_slice = grid_size - - tot_memory = input_size + in_slice_size + out_slice_size + 2*complex_slice + fftplan_slice + res_slice - available_memory -= filter_size - available_memory -= grid_size - - return (int(available_memory // tot_memory), float32(), non_slice_dims_shape) - - # Adaptation with some corrections of retrieve_phase 
(Paganin filter) from TomoPy -@method_proj(_calc_max_slices_paganin_filter_tomopy) @nvtx.annotate() def paganin_filter_tomopy( tomo: cp.ndarray, diff --git a/tests/test_prep/test_phase.py b/tests/test_prep/test_phase.py index a07f0966..4b84ba95 100644 --- a/tests/test_prep/test_phase.py +++ b/tests/test_prep/test_phase.py @@ -4,56 +4,14 @@ import numpy as np import pytest from cupy.cuda import nvtx -from httomolibgpu.prep.phase import fresnel_filter, paganin_filter_savu, paganin_filter_tomopy +from httomolibgpu.prep.phase import paganin_filter_savu, paganin_filter_tomopy from numpy.testing import assert_allclose from httomolibgpu import method_registry -from tests import MaxMemoryHook eps = 1e-6 - -@cp.testing.gpu -def test_fresnel_filter_projection(data): - # --- testing the Fresnel filter on tomo_standard ---# - pattern = "PROJECTION" - ratio = 100.0 - filtered_data = fresnel_filter(data, pattern, ratio).get() - - assert_allclose(np.mean(filtered_data), 802.1125, rtol=eps) - assert_allclose(np.max(filtered_data), 1039.5293) - assert_allclose(np.min(filtered_data), 95.74562) - - #: make sure the output is float32 - assert filtered_data.dtype == np.float32 - assert fresnel_filter.meta.pattern == 'projection' - assert 'fresnel_filter' in method_registry['httomolibgpu']['prep']['phase'] - - -@cp.testing.gpu -def test_fresnel_filter_sinogram(data): - pattern = "SINOGRAM" - ratio = 100.0 - filtered_data = fresnel_filter(data, pattern, ratio).get() - - assert_allclose(np.mean(filtered_data), 806.74347, rtol=eps) - assert_allclose(np.max(filtered_data), 1063.7007) - assert_allclose(np.min(filtered_data), 87.91508) - - #: make sure the output is float32 - assert filtered_data.dtype == np.float32 - - @cp.testing.gpu -def test_fresnel_filter_1D_raises(ensure_clean_memory): - _data = cp.ones(10) - with pytest.raises(ValueError): - fresnel_filter(_data, "SINOGRAM", 100.0) - - _data = None #: free up GPU memory - - -@cp.testing.gpu -def test_paganin_filter(data): +def test_paganin_savu_filter(data): # --- testing the Paganin filter on tomo_standard ---# filtered_data = paganin_filter_savu(data).get() @@ -64,39 +22,8 @@ def test_paganin_filter(data): #: make sure the output is float32 assert filtered_data.dtype == np.float32 - @cp.testing.gpu -@pytest.mark.parametrize("pad", [0, 31, 100]) -@pytest.mark.parametrize("slices", [15, 51, 160]) -@pytest.mark.parametrize("dtype", [np.uint16, np.float32]) -def test_paganin_filter_meta(pad, slices, dtype, ensure_clean_memory): - # --- testing the Paganin filter on tomo_standard ---# - cache = cp.fft.config.get_plan_cache() - cache.clear() - kwargs = dict(pad_x=pad, pad_y=pad) - data = cp.random.random_sample((slices, 111, 121), dtype=np.float32) - if dtype == np.uint16: - data = cp.asarray(data * 300.0, dtype=np.uint16) - hook = MaxMemoryHook(data.size * data.itemsize) - with hook: - paganin_filter_savu(data, **kwargs) - - # make sure estimator function is within range (80% min, 100% max) - max_mem = hook.max_mem - actual_slices = data.shape[0] - estimated_slices, dtype_out, output_dims = paganin_filter_savu.meta.calc_max_slices( - 0, - (data.shape[1], data.shape[2]), - data.dtype, max_mem, **kwargs) - assert estimated_slices <= actual_slices - assert estimated_slices / actual_slices >= 0.8 - - assert paganin_filter_savu.meta.pattern == 'projection' - assert 'paganin_filter_savu' in method_registry['httomolibgpu']['prep']['phase'] - - -@cp.testing.gpu -def test_paganin_filter_energy100(data): +def test_paganin_filter_savu_energy100(data): filtered_data = 
paganin_filter_savu(data, energy=100.0).get() assert_allclose(np.mean(filtered_data), -778.61926, rtol=1e-05) @@ -107,7 +34,7 @@ def test_paganin_filter_energy100(data): @cp.testing.gpu -def test_paganin_filter_padmean(data): +def test_paganin_filter_savu_padmean(data): filtered_data = paganin_filter_savu(data, pad_method="mean").get() assert_allclose(np.mean(filtered_data), -765.3401, rtol=eps) @@ -131,7 +58,7 @@ def test_paganin_filter_padmean(data): @cp.testing.gpu @pytest.mark.perf -def test_paganin_filter_performance(ensure_clean_memory): +def test_paganin_filter_savu_performance(ensure_clean_memory): # Note: low/high and size values taken from sample2_medium.yaml real run # this test needs ~20GB of memory with 1801 - we'll divide depending on GPU memory @@ -184,14 +111,14 @@ def test_paganin_filter_performance(ensure_clean_memory): @cp.testing.gpu -def test_paganin_filter_1D_raises(ensure_clean_memory): +def test_paganin_filter_savu_1D_raises(ensure_clean_memory): _data = cp.ones(10) with pytest.raises(ValueError): paganin_filter_savu(_data) _data = None #: free up GPU memory - +# paganin filter tomopy @cp.testing.gpu def test_paganin_filter_tomopy_1D_raises(ensure_clean_memory): _data = cp.ones(10) @@ -200,7 +127,6 @@ def test_paganin_filter_tomopy_1D_raises(ensure_clean_memory): _data = None #: free up GPU memory - @cp.testing.gpu def test_paganin_filter_tomopy(data): # --- testing the Paganin filter from TomoPy on tomo_standard ---# @@ -215,7 +141,7 @@ def test_paganin_filter_tomopy(data): @cp.testing.gpu -def test_paganin_filter2_energy100(data): +def test_paganin_filter_tomopy_energy100(data): filtered_data = paganin_filter_tomopy(data, energy=100.0).get() assert_allclose(np.mean(filtered_data), -6.73455, rtol=1e-05) @@ -226,7 +152,7 @@ def test_paganin_filter2_energy100(data): @cp.testing.gpu -def test_paganin_filter2_dist75(data): +def test_paganin_filter_tomopy_dist75(data): filtered_data = paganin_filter_tomopy(data, dist=75.0, alpha=1e-6).get() assert_allclose(np.sum(np.mean(filtered_data, axis=(1, 2))), -1215.4985, rtol=1e-6) @@ -237,7 +163,7 @@ def test_paganin_filter2_dist75(data): @cp.testing.gpu @pytest.mark.perf -def test_paganin_filter2_performance(ensure_clean_memory): +def test_paganin_filter_tomopy_performance(ensure_clean_memory): # Note: low/high and size values taken from sample2_medium.yaml real run # this test needs ~20GB of memory with 1801 - we'll divide depending on GPU memory @@ -267,34 +193,4 @@ def test_paganin_filter2_performance(ensure_clean_memory): dev.synchronize() duration_ms = float(time.perf_counter_ns() - start) * 1e-6 / 10 - assert "performance in ms" == duration_ms - - -@cp.testing.gpu -@pytest.mark.parametrize("slices", [15, 51, 160]) -@pytest.mark.parametrize("dtype", [np.uint16, np.float32]) -def test_paganin_filter2_meta(slices, dtype, ensure_clean_memory): - cache = cp.fft.config.get_plan_cache() - cache.clear() - kwargs = {} - data = cp.random.random_sample((slices, 111, 121), dtype=np.float32) - if dtype == np.uint16: - data = cp.asarray(data * 300.0, dtype=np.uint16) - hook = MaxMemoryHook(data.size * data.itemsize) - with hook: - paganin_filter_tomopy(data, **kwargs) - - assert paganin_filter_tomopy.meta.pattern == 'projection' - assert 'paganin_filter_tomopy' in method_registry['httomolibgpu']['prep']['phase'] - assert not paganin_filter_tomopy.meta.cpu - assert paganin_filter_tomopy.meta.gpu - - # make sure estimator function is within range (80% min, 100% max) - max_mem = hook.max_mem - actual_slices = data.shape[0] - 
estimated_slices, dtype_out, output_dims = paganin_filter_tomopy.meta.calc_max_slices( - 0, - (data.shape[1], data.shape[2]), - data.dtype, max_mem, **kwargs) - assert estimated_slices <= actual_slices - assert estimated_slices / actual_slices >= 0.8 + assert "performance in ms" == duration_ms \ No newline at end of file From 95da65d0167a9a379c0b7ab0f324428900622f08 Mon Sep 17 00:00:00 2001 From: Daniil Kazantsev Date: Fri, 22 Sep 2023 14:57:53 +0100 Subject: [PATCH 03/26] removes savu's implementation of distortion correction --- httomolibgpu/misc/corr.py | 5 - httomolibgpu/misc/morph.py | 17 --- httomolibgpu/prep/alignment.py | 182 +----------------------------- httomolibgpu/prep/phase.py | 31 +---- tests/test_misc/test_corr.py | 22 ---- tests/test_misc/test_morph.py | 28 ----- tests/test_prep/test_alignment.py | 43 +------ 7 files changed, 8 insertions(+), 320 deletions(-) diff --git a/httomolibgpu/misc/corr.py b/httomolibgpu/misc/corr.py index 7dcde7d9..9d9b8ad4 100644 --- a/httomolibgpu/misc/corr.py +++ b/httomolibgpu/misc/corr.py @@ -29,15 +29,12 @@ import nvtx from httomolibgpu.cuda_kernels import load_cuda_module -from httomolibgpu.decorator import calc_max_slices_default, method_all __all__ = [ "median_filter3d", "remove_outlier3d", ] - -@method_all(calc_max_slices=calc_max_slices_default) @nvtx.annotate() def median_filter3d( data: cp.ndarray, kernel_size: int = 3, dif: float = 0.0 @@ -102,8 +99,6 @@ def median_filter3d( return out - -@method_all(calc_max_slices=calc_max_slices_default) def remove_outlier3d( data: cp.ndarray, kernel_size: int = 3, dif: float = 0.1 ) -> cp.ndarray: diff --git a/httomolibgpu/misc/morph.py b/httomolibgpu/misc/morph.py index c0b9fbcc..d4c580cc 100644 --- a/httomolibgpu/misc/morph.py +++ b/httomolibgpu/misc/morph.py @@ -24,29 +24,12 @@ import numpy as np import nvtx from typing import Literal, Tuple -from httomolibgpu.decorator import method_sino __all__ = [ "sino_360_to_180", ] -def _calc_max_slices_sino_360_to_180( - other_dims: Tuple[int, int], dtype: np.dtype, available_memory: int, **kwargs -) -> Tuple[int, np.dtype]: - assert 'overlap' in kwargs, "Overlap not given" - overlap = int(np.round(kwargs['overlap'])) - in_slice = np.prod(other_dims) * dtype.itemsize - out_slice = other_dims[0] * (other_dims[1] * 2 - overlap) / 2 * dtype.itemsize - # we have to leave this as 64bit to match tomopy? - weights = overlap * np.float64().nbytes - - available_memory -= weights - return int(np.floor(available_memory / (in_slice + out_slice))), dtype - - - -@method_sino(_calc_max_slices_sino_360_to_180) @nvtx.annotate() def sino_360_to_180( data: cp.ndarray, overlap: int = 0, rotation: Literal["left", "right"] = "left" diff --git a/httomolibgpu/prep/alignment.py b/httomolibgpu/prep/alignment.py index 484fe9a3..b7181365 100644 --- a/httomolibgpu/prep/alignment.py +++ b/httomolibgpu/prep/alignment.py @@ -28,195 +28,15 @@ import nvtx import numpy as np - -from httomolibgpu.decorator import method_proj - __all__ = [ - "distortion_correction_proj", + "distortion_correction_proj_discorpy", ] - -def _calc_max_slices_distortion_correction_proj( - non_slice_dims_shape: Tuple[int, int], - dtype: np.dtype, - available_memory: int, **kwargs -) -> Tuple[int, np.dtype, Tuple[int, int]]: - # calculating memory is a bit more involved as various small temporary arrays - # are used. 
We revert to a rough estimation using only the larger elements and - # a safety margin - height, width = non_slice_dims_shape[0], non_slice_dims_shape[1] - lists_size = (width + height) * np.float64().nbytes - meshgrid_size = (width * height * 2) * np.float64().nbytes - ru_mat_size = meshgrid_size // 2 - fact_mat_size = ru_mat_size - xd_mat_size = yd_mat_size = fact_mat_size // 2 # float32 - indices_size = xd_mat_size + yd_mat_size - - slice_size = np.prod(non_slice_dims_shape) * dtype.itemsize - processing_size = slice_size * 4 # temporaries in final for loop - - available_memory -= ( - lists_size - + meshgrid_size - + ru_mat_size - + fact_mat_size * 2 - + xd_mat_size - + yd_mat_size - + processing_size - + indices_size * 3 # The x 3 here is for additional safety margin - ) # to allow for memory for temporaries - - return (int(available_memory // slice_size), dtype, non_slice_dims_shape) - - -## %%%%%%%%%%%%%%%%%%%%%%%%%distortion_correction_proj%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% ## -# CuPy implementation of distortion correction from Savu -# TODO: Needs to be implementation from TomoPy -@method_proj(_calc_max_slices_distortion_correction_proj) -@nvtx.annotate() -def distortion_correction_proj( - data: cp.ndarray, - metadata_path: str, - preview: Dict[str, List[int]], - center_from_left: Optional[float] = None, - center_from_top: Optional[float] = None, - polynomial_coeffs: Optional[List[float]] = None, - crop: int = 0, -): - """Correct the radial distortion in the given stack of 2D images. - - Parameters - ---------- - data : cp.ndarray - The stack of 2D images to apply the distortion correction to. - - metadata_path : str - The path to the file containing the distortion coefficients for the - data. - - preview : Dict[str, List[int]] - A dict containing three key-value pairs: - - a list containing the `start` value of each dimension - - a list containing the `stop` value of each dimension - - a list containing the `step` value of each dimension - - center_from_left : optional, float - If no metadata file with the distortion coefficients is provided, then - the x center can be provided as a parameter by passing the horizontal - distortion center as measured from the left of the image. - - center_from_top : optional, float - If no metadata file with the distortion coefficients is provided, then - the y center can be provided as a parameter by passing the horizontal - distortion center as measured from the top of the image. - - polynomial_coeffs : optional, List[float] - If no metadata file with the distortion coefficients is provided, then - the polynomial coefficients for the distortion correction can be - provided as a parameter by passing a list of floats. - - crop : optional, int - The amount to crop both the height and width of the stack of images - being corrected. - - Returns - ------- - cp.ndarray - The corrected stack of 2D images. - """ - # Check if it's a stack of 2D images, or only a single 2D image - if len(data.shape) == 2: - data = cp.expand_dims(data, axis=0) - - # Get info from metadat txt file - x_center, y_center, list_fact = _load_metadata_txt(metadata_path) - - shift = preview["starts"] - step = preview["steps"] - x_dim = 1 - y_dim = 0 - step_check = max([step[i] for i in [x_dim, y_dim]]) > 1 - if step_check: - msg = ( - "\n***********************************************\n" - "!!! ERROR !!! 
-> Method doesn't work with the step in" - " the preview larger than 1 \n" - "***********************************************\n" - ) - raise ValueError(msg) - - x_offset = shift[x_dim] - y_offset = shift[y_dim] - msg = "" - x_center = 0.0 - y_center = 0.0 - if metadata_path is None: - x_center = cp.asarray(center_from_left, dtype=cp.float32) - x_offset - y_center = cp.asarray(center_from_top, dtype=cp.float32) - y_offset - list_fact = cp.float32(tuple(polynomial_coeffs)) - else: - if not os.path.isfile(metadata_path): - msg = "!!! No such file: %s !!!" " Please check the file path" % str( - metadata_path - ) - raise ValueError(msg) - try: - (x_center, y_center, list_fact) = _load_metadata_txt(metadata_path) - x_center = x_center - x_offset - y_center = y_center - y_offset - except IOError as exc: - msg = ( - "\n*****************************************\n" - "!!! ERROR !!! -> Can't open this file: %s \n" - "*****************************************\n\ - " - % str(metadata_path) - ) - raise ValueError(msg) from exc - - height, width = data.shape[y_dim + 1], data.shape[x_dim + 1] - xu_list = cp.arange(width) - x_center - yu_list = cp.arange(height) - y_center - xu_mat, yu_mat = cp.meshgrid(xu_list, yu_list) - ru_mat = cp.sqrt(xu_mat**2 + yu_mat**2) - fact_mat = cp.sum( - cp.asarray([factor * ru_mat**i for i, factor in enumerate(list_fact)]), axis=0 - ) - xd_mat = cp.asarray( - cp.clip(x_center + fact_mat * xu_mat, 0, width - 1), dtype=cp.float32 - ) - yd_mat = cp.asarray( - cp.clip(y_center + fact_mat * yu_mat, 0, height - 1), dtype=cp.float32 - ) - - diff_y = cp.max(yd_mat) - cp.min(yd_mat) - if diff_y < 1: - msg = ( - "\n*****************************************\n\n" - "!!! ERROR !!! -> You need to increase the preview" - " size for this plugin to work \n\n" - "*****************************************\n" - ) - raise ValueError(msg) - - indices = [cp.reshape(yd_mat, (-1, 1)), cp.reshape(xd_mat, (-1, 1))] - indices = cp.asarray(indices, dtype=cp.float32) - - # Loop over images and unwarp them - for i in range(data.shape[0]): - mat_corrected = cp.reshape( - map_coordinates(data[i], indices, order=1, mode="reflect"), (height, width) - ) - data[i] = mat_corrected[crop : height - crop, crop : width - crop] - return data - - # CuPy implementation of distortion correction from Discorpy # https://github.com/DiamondLightSource/discorpy/blob/67743842b60bf5dd45b21b8460e369d4a5e94d67/discorpy/post/postprocessing.py#L111-L148 # (which is the same as the TomoPy version # https://github.com/tomopy/tomopy/blob/c236a2969074f5fc70189fb5545f0a165924f916/source/tomopy/prep/alignment.py#L950-L981 # but with the additional params `order` and `mode`). 
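For reference, a minimal single-image sketch of the backward polynomial model that the removed Savu port above and the Discorpy-based implementation below both apply; the helper name and the inputs xc, yc (distortion centre) and coeffs (polynomial factors) are illustrative assumptions, not part of this patch:

    import cupy as cp
    from cupyx.scipy.ndimage import map_coordinates

    def _unwarp_sketch(image, xc, yc, coeffs):
        # Radial distance of every pixel from the distortion centre
        h, w = image.shape
        xu, yu = cp.meshgrid(cp.arange(w) - xc, cp.arange(h) - yc)
        ru = cp.sqrt(xu**2 + yu**2)
        # Polynomial scale factor: sum_i coeffs[i] * ru**i
        fact = sum(c * ru**i for i, c in enumerate(coeffs))
        xd = cp.clip(xc + fact * xu, 0, w - 1)
        yd = cp.clip(yc + fact * yu, 0, h - 1)
        # Linear interpolation with reflective boundaries, matching the
        # order=1, mode="reflect" call in the Savu port above
        coords = cp.stack([yd.ravel(), xd.ravel()])
        return map_coordinates(image, coords, order=1, mode="reflect").reshape(h, w)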
-@method_proj(_calc_max_slices_distortion_correction_proj) @nvtx.annotate() def distortion_correction_proj_discorpy( data: cp.ndarray, diff --git a/httomolibgpu/prep/phase.py b/httomolibgpu/prep/phase.py index 6efb2d7f..dab052d7 100644 --- a/httomolibgpu/prep/phase.py +++ b/httomolibgpu/prep/phase.py @@ -42,33 +42,8 @@ PI = 3.14159265359 PLANCK_CONSTANT = 6.58211928e-19 # [keV*s] -def _calc_max_slices_paganin_filter_savu( - non_slice_dims_shape: Tuple[int, int], - dtype: np.dtype, - available_memory: int, - **kwargs, -) -> Tuple[int, np.dtype, Tuple[int, int]]: - pad_x = kwargs["pad_x"] - pad_y = kwargs["pad_y"] - input_size = np.prod(non_slice_dims_shape) * dtype.itemsize - in_slice_size = ( - (non_slice_dims_shape[0] + 2 * pad_y) - * (non_slice_dims_shape[1] + 2 * pad_x) - * dtype.itemsize - ) - # FFT needs complex inputs, so copy to complex happens first - complex_slice = in_slice_size / dtype.itemsize * np.complex64().nbytes - fftplan_slice = complex_slice - filter_size = complex_slice - res_slice = np.prod(non_slice_dims_shape) * np.float32().nbytes - slice_size = input_size + in_slice_size + complex_slice + fftplan_slice + res_slice - available_memory -= filter_size - return (int(available_memory // slice_size), float32(), non_slice_dims_shape) - - ## %%%%%%%%%%%%%%%%%%%%%%% paganin_filter %%%%%%%%%%%%%%%%%%%%%%%%%%%%%% ## #: CuPy implementation of Paganin filter from Savu -@method_proj(_calc_max_slices_paganin_filter_savu) @nvtx.annotate() def paganin_filter_savu( data: cp.ndarray, @@ -293,7 +268,11 @@ def _reciprocal_coord(pixel_size: float, num_grid: int) -> cp.ndarray: rc *= 2 * PI / (n * pixel_size) return rc -# Adaptation with some corrections of retrieve_phase (Paganin filter) from TomoPy +##-------------------------------------------------------------## +##-------------------------------------------------------------## + +# Adaptation with some corrections of retrieve_phase (Paganin filter) +# from TomoPy @nvtx.annotate() def paganin_filter_tomopy( tomo: cp.ndarray, diff --git a/tests/test_misc/test_corr.py b/tests/test_misc/test_corr.py index e250e20b..cf0a432c 100644 --- a/tests/test_misc/test_corr.py +++ b/tests/test_misc/test_corr.py @@ -89,28 +89,6 @@ def test_median_filter3d(data): ) -@cp.testing.gpu -def test_median_filter3d_memory_calc(): - dy = 5 - dx = 2560 - available_memory = (dx*dy*200 + 42) * 2 - args=dict(kernel_size=3, dif=1.5) - - assert 'median_filter3d' in method_registry['httomolibgpu']['misc']['corr'] - assert median_filter3d.meta.calc_max_slices(0, - (dy, dx), - np.uint16(), available_memory, **args) == (100, np.uint16(), (dy, dx)) - assert median_filter3d.meta.calc_max_slices(0, - (dy, dx), - np.float32(), available_memory, **args) == (50, np.float32(), (dy, dx)) - assert median_filter3d.meta.calc_max_slices(1, - (dy, dx), - np.uint16(), available_memory, **args) == (100, np.uint16(), (dy, dx)) - assert median_filter3d.meta.calc_max_slices(1, - (dy, dx), - np.float32(), available_memory, **args) == (50, np.float32(), (dy, dx)) - - @cp.testing.gpu @pytest.mark.perf def test_median_filter3d_performance(ensure_clean_memory): diff --git a/tests/test_misc/test_morph.py b/tests/test_misc/test_morph.py index c8927cb2..1a90d8f6 100644 --- a/tests/test_misc/test_morph.py +++ b/tests/test_misc/test_morph.py @@ -3,32 +3,9 @@ import numpy as np from cupy.cuda import nvtx import pytest -#from tomopy.misc.morph import sino_360_to_180 as tomopy_sino_360_to_180 from httomolibgpu.misc.morph import sino_360_to_180 -""" -@cp.testing.gpu 
-@pytest.mark.parametrize("overlap", [0, 1, 3, 15, 32]) -@pytest.mark.parametrize("rotation", ["left", "right"]) -@cp.testing.numpy_cupy_allclose(rtol=1e-6) -def test_sino_360_to_180_unity(ensure_clean_memory, xp, overlap, rotation): - # this combination has a bug in tomopy, so we'll skip it for now - if rotation == "right" and overlap == 0: - pytest.skip("Skipping test due to bug in tomopy") - - np.random.seed(12345) - data_host = ( - np.random.random_sample(size=(123, 54, 128)).astype(np.float32) * 200.0 - 100.0 - ) - data = xp.asarray(data_host) - - if xp.__name__ == "numpy": - return tomopy_sino_360_to_180(data, overlap, rotation) - else: - return sino_360_to_180(data, overlap, rotation) -""" - @pytest.mark.parametrize( "overlap, rotation", [ @@ -55,11 +32,6 @@ def test_sino_360_to_180_wrong_dims(ensure_clean_memory, shape): sino_360_to_180(cp.ones(shape, dtype=cp.float32)) -def test_sino_360_to_180_meta(): - assert sino_360_to_180.meta.gpu is True - assert sino_360_to_180.meta.pattern == 'sinogram' - - @pytest.mark.parametrize("rotation", ["left", "right"]) @pytest.mark.perf @cp.testing.gpu diff --git a/tests/test_prep/test_alignment.py b/tests/test_prep/test_alignment.py index 1499c771..9d4588e0 100644 --- a/tests/test_prep/test_alignment.py +++ b/tests/test_prep/test_alignment.py @@ -4,15 +4,12 @@ import numpy as np import pytest from httomolibgpu.prep.alignment import ( - distortion_correction_proj, distortion_correction_proj_discorpy, ) from httomolibgpu import method_registry from imageio.v2 import imread from numpy.testing import assert_allclose -from tests import MaxMemoryHook - @cp.testing.gpu @pytest.mark.parametrize( @@ -26,8 +23,8 @@ ) @pytest.mark.parametrize( "implementation", - [distortion_correction_proj, distortion_correction_proj_discorpy], - ids=["cupy", "tomopy"], + [distortion_correction_proj_discorpy], + ids=["tomopy"], ) def test_correct_distortion( distortion_correction_path, @@ -50,40 +47,4 @@ def test_correct_distortion( assert_allclose(np.mean(corrected_data), mean_value) assert np.max(corrected_data) == max_value - assert corrected_data.dtype == np.uint8 - - -@pytest.mark.parametrize("stack_size", [20, 50, 100]) -@pytest.mark.parametrize( - "implementation", - [distortion_correction_proj, distortion_correction_proj_discorpy], - ids=["cupy", "tomopy"], -) -def test_distortion_correction_meta(distortion_correction_path, stack_size, implementation): - distortion_coeffs_path = os.path.join( - distortion_correction_path, "distortion-coeffs.txt" - ) - - path = os.path.join(distortion_correction_path, "dot_pattern_03.tif") - im_host = np.asarray(imread(path)) - im_host = np.expand_dims(im_host, axis=0) - # replicate into a stack of images - im_stack = cp.asarray(np.tile(im_host, (stack_size, 1, 1))) - hook = MaxMemoryHook(im_stack.size * im_stack.itemsize) - preview = {"starts": [0, 0], "stops": [im_stack.shape[1], im_stack.shape[2]], "steps": [1, 1]} - - with hook: - implementation(im_stack, distortion_coeffs_path, preview) - - # make sure estimator function is within range (80% min, 100% max) - max_mem = hook.max_mem - actual_slices = im_stack.shape[0] - estimated_slices, dtype_out, output_dims = implementation.meta.calc_max_slices( - 0, (im_stack.shape[1], im_stack.shape[2]), - im_stack.dtype, max_mem, - metadata_path=distortion_coeffs_path, preview=preview) - assert estimated_slices <= actual_slices - assert estimated_slices / actual_slices >= 0.8 - - assert implementation.__name__ in method_registry['httomolibgpu']['prep']['alignment'] From 
db38b82461c09e09404b49aaa86264bab8c8378d Mon Sep 17 00:00:00 2001 From: Daniil Kazantsev Date: Fri, 22 Sep 2023 16:30:52 +0100 Subject: [PATCH 04/26] stripe removal updates --- httomolibgpu/prep/stripe.py | 37 +------------------------ tests/test_prep/test_stripe.py | 50 ---------------------------------- 2 files changed, 1 insertion(+), 86 deletions(-) diff --git a/httomolibgpu/prep/stripe.py b/httomolibgpu/prep/stripe.py index d9d200ee..0438b211 100644 --- a/httomolibgpu/prep/stripe.py +++ b/httomolibgpu/prep/stripe.py @@ -24,7 +24,6 @@ import cupy as cp import numpy as np import nvtx -from httomolibgpu.decorator import method_sino __all__ = [ "remove_stripe_based_sorting", @@ -34,22 +33,6 @@ # TODO: port 'remove_all_stripe', 'remove_large_stripe' and 'remove_dead_stripe' # from https://github.com/tomopy/tomopy/blob/master/source/tomopy/prep/stripe.py - -def _calc_max_slices_stripe_based_sorting( - non_slice_dims_shape: Tuple[int, int], - dtype: np.dtype, available_memory: int, **kwargs -) -> Tuple[int, np.dtype, Tuple[int, int]]: - # the algorithm calls _rsort for each slice independenty, and it needs - # several temporaries in the order of the input slice. - # Those temporaries are independent of the number of slices and represent a fixed - # offset. Also, the data is updated in-place - slice_mem = np.prod(non_slice_dims_shape) * dtype.itemsize * 1.25 - temp_mem = slice_mem * 8 - available_memory -= int(temp_mem) - return (int(available_memory // slice_mem), dtype, non_slice_dims_shape) - - -@method_sino(_calc_max_slices_stripe_based_sorting, cpugpu=True) @nvtx.annotate() def remove_stripe_based_sorting( data: Union[cp.ndarray, np.ndarray], @@ -125,25 +108,6 @@ def _rs_sort(sinogram, size, dim): return xp.transpose(sino_corrected) - -def _calc_max_slices_remove_stripe_ti( - non_slice_dims_shape: Tuple[int, int], - dtype: np.dtype, available_memory: int, **kwargs -) -> Tuple[int, np.dtype, Tuple[int, int]]: - # This is admittedly a rough estimation, but it should be about right - gamma_mem = non_slice_dims_shape[1] * np.float64().itemsize - - in_slice_mem = np.prod(non_slice_dims_shape) * dtype.itemsize - slice_mean_mem = non_slice_dims_shape[1] * dtype.itemsize * 2 - slice_fft_plan_mem = slice_mean_mem * 3 - extra_temp_mem = slice_mean_mem * 8 - - available_memory -= int(gamma_mem) - maxslices = int(available_memory // (in_slice_mem + slice_mean_mem + slice_fft_plan_mem + extra_temp_mem)) - return (maxslices, dtype, non_slice_dims_shape) - - -@method_sino(_calc_max_slices_remove_stripe_ti, cpugpu=True) @nvtx.annotate() def remove_stripe_ti( data: Union[cp.ndarray, np.ndarray], @@ -165,6 +129,7 @@ def remove_stripe_ti( ndarray 3D array of de-striped projections. 
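    Examples
    --------
    A hedged sketch with made-up values; the function accepts CuPy or NumPy
    arrays, as the tests further below exercise both::

        import cupy as cp
        projs = cp.random.random_sample((180, 5, 160), dtype=cp.float32)
        destriped = remove_stripe_ti(projs)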
""" + # TODO: detector dimensions must be even otherwise error xp = cp.get_array_module(data) gamma = beta * ((1 - beta) / (1 + beta)) ** xp.abs( xp.fft.fftfreq(data.shape[-1]) * data.shape[-1] diff --git a/tests/test_prep/test_stripe.py b/tests/test_prep/test_stripe.py index a9c57aa3..8a29ba4d 100644 --- a/tests/test_prep/test_stripe.py +++ b/tests/test_prep/test_stripe.py @@ -11,9 +11,6 @@ from httomolibgpu import method_registry from numpy.testing import assert_allclose -from tests import MaxMemoryHook - - @cp.testing.gpu def test_remove_stripe_ti_on_data(data, flats, darks): # --- testing the CuPy implementation from TomoCupy ---# @@ -32,29 +29,6 @@ def test_remove_stripe_ti_on_data(data, flats, darks): # make sure the output is float32 assert data_after_stripe_removal.dtype == np.float32 -@cp.testing.gpu -def test_remove_stripe_ti_on_data_meta(data, flats, darks): - # --- testing the CuPy implementation from TomoCupy ---# - data = normalize(data, flats, darks, cutoff=10, minus_log=True) - - hook = MaxMemoryHook() - with hook: - data_after_stripe_removal = remove_stripe_ti(cp.copy(data)).get() - - # make sure estimator function is within range (80% min, 100% max) - max_mem = hook.max_mem - actual_slices = data.shape[1] - estimated_slices, dtype_out, output_dims = remove_stripe_ti.meta.calc_max_slices(1, - (data.shape[0], data.shape[2]), - data.dtype, max_mem) - assert estimated_slices <= actual_slices - assert estimated_slices / actual_slices >= 0.8 - - data = None #: free up GPU memory - assert remove_stripe_ti.meta.pattern == 'sinogram' - assert 'remove_stripe_ti' in method_registry['httomolibgpu']['prep']['stripe'] - - def test_remove_stripe_ti_on_flats(host_flats): #: testing that numpy uint16 arrays can be passed corrected_data = remove_stripe_ti(np.copy(host_flats)) @@ -62,7 +36,6 @@ def test_remove_stripe_ti_on_flats(host_flats): assert_allclose(np.mean(corrected_data, axis=(1, 2)).sum(), 19531.168945, rtol=1e-7) assert_allclose(np.median(corrected_data), 976.0, rtol=1e-7) - @cp.testing.gpu def test_remove_stripe_ti_numpy_vs_cupy_on_random_data(): host_data = np.random.random_sample(size=(181, 5, 256)).astype(np.float32) * 2.0 @@ -76,7 +49,6 @@ def test_remove_stripe_ti_numpy_vs_cupy_on_random_data(): np.median(corrected_data), np.median(corrected_host_data), rtol=1e-6 ) - @cp.testing.gpu def test_stripe_removal_sorting_cupy(data, flats, darks): # --- testing the CuPy port of TomoPy's implementation ---# @@ -91,28 +63,6 @@ def test_stripe_removal_sorting_cupy(data, flats, darks): # make sure the output is float32 assert corrected_data.dtype == np.float32 -@cp.testing.gpu -def test_stripe_removal_sorting_cupy_meta(data, flats, darks): - # --- testing the CuPy port of TomoPy's implementation ---# - data = normalize(data, flats, darks, cutoff=10, minus_log=True) - hook = MaxMemoryHook(data.size * data.itemsize) - with hook: - corrected_data = remove_stripe_based_sorting(data).get() - - # make sure estimator function is within range (80% min, 100% max) - max_mem = hook.max_mem - actual_slices = data.shape[1] - estimated_slices, dtype_out, output_dims = remove_stripe_based_sorting.meta.calc_max_slices(1, - (data.shape[0], data.shape[2]), - data.dtype, max_mem) - assert estimated_slices <= actual_slices - assert estimated_slices / actual_slices >= 0.8 - - data = None #: free up GPU memory - assert remove_stripe_based_sorting.meta.pattern == 'sinogram' - assert 'remove_stripe_based_sorting' in method_registry['httomolibgpu']['prep']['stripe'] - - @cp.testing.gpu 
@cp.testing.numpy_cupy_allclose(rtol=1e-6) def test_stripe_removal_sorting_numpy_vs_cupy_on_random_data(ensure_clean_memory, xp): From 65f740ce40e22633f56f148aa2347abba7abebd8 Mon Sep 17 00:00:00 2001 From: Daniil Kazantsev Date: Mon, 25 Sep 2023 16:48:08 +0100 Subject: [PATCH 05/26] added a mask data removal feature in reconstructions, tests updated --- httomolibgpu/recon/algorithm.py | 77 +++++-- tests/test_recon/test_algorithm.py | 318 ++++++++++++++--------------- 2 files changed, 209 insertions(+), 186 deletions(-) diff --git a/httomolibgpu/recon/algorithm.py b/httomolibgpu/recon/algorithm.py index e1dbe9ea..6d3ea005 100644 --- a/httomolibgpu/recon/algorithm.py +++ b/httomolibgpu/recon/algorithm.py @@ -62,6 +62,18 @@ def _calc_max_slices_FBP( slices_max = available_memory // int(2*in_slice_size + filtered_in_data + freq_slice + fftplan_size + 2*astra_out_size) return (slices_max, float32(), output_dims) +def _apply_circular_mask(data, recon_mask_radius): + + recon_size = data.shape[1] + Y, X = cp.ogrid[:recon_size, :recon_size] + half_size = recon_size//2 + dist_from_center = cp.sqrt((X - half_size)**2 + (Y-half_size)**2) + if recon_mask_radius <= 1.0: + mask = dist_from_center <= half_size - abs(half_size - half_size/recon_mask_radius) + else: + mask = dist_from_center <= half_size + abs(half_size - half_size/recon_mask_radius) + return data*mask + ## %%%%%%%%%%%%%%%%%%%%%%% FBP reconstruction %%%%%%%%%%%%%%%%%%%%%%%%%%%% ## @method_sino(_calc_max_slices_FBP) @@ -70,12 +82,13 @@ def FBP( data: cp.ndarray, angles: np.ndarray, center: Optional[float] = None, - objsize: Optional[int] = None, + recon_size: Optional[int] = None, + recon_mask_radius: Optional[float] = None, gpu_id: int = 0, ) -> cp.ndarray: """ Perform Filtered Backprojection (FBP) reconstruction using ASTRA toolbox and ToMoBAR wrappers. - This is 3D recon from a CuPy array and a custom built filter. + This is a 3D recon from a CuPy array and a custom built filter. Parameters ---------- @@ -85,8 +98,13 @@ def FBP( An array of angles given in radians. center : float, optional The center of rotation (CoR). - objsize : int, optional - The size in pixels of the reconstructed object. + recon_size : int, optional + The [recon_size, recon_size] shape of the reconstructed slice in pixels. + By default (None), the reconstructed size will be the dimension of the horizontal detector. + recon_mask_radius: int, optional + The radius of the circular mask that applies to the reconstructed slice in order to crop + out some undesirable artefacts. The values outside the diameter will be set to zero. + None by default, to see the effect of the mask try setting the value in the range [0.7-1.0]. gpu_id : int, optional A GPU device index to perform operation on. 
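A hedged usage sketch of FBP's new signature; the sinogram below is a random stand-in rather than real normalised data, and the centre value is arbitrary:

    import numpy as np
    import cupy as cp
    from httomolibgpu.recon.algorithm import FBP

    angles = np.linspace(0.0, np.pi, 180)
    sino = cp.random.random_sample((180, 8, 160), dtype=cp.float32)
    vol = FBP(sino, angles, center=79.5, recon_size=160, recon_mask_radius=0.95)

Note that for recon_mask_radius <= 1.0 the threshold computed by _apply_circular_mask is half_size - |half_size - half_size/recon_mask_radius|, so a value of 0.95 keeps a disc slightly smaller than 0.95 of the slice half-width.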
@@ -99,17 +117,20 @@ def FBP( if center is None: center = data.shape[2] // 2 # making a crude guess - if objsize is None: - objsize = data.shape[2] + if recon_size is None: + recon_size = data.shape[2] RecToolsCP = RecToolsDIRCuPy(DetectorsDimH=data.shape[2], # Horizontal detector dimension DetectorsDimV=data.shape[1], # Vertical detector dimension (3D case) CenterRotOffset=data.shape[2] / 2 - center - 0.5, # Center of Rotation scalar or a vector AnglesVec=-angles, # A vector of projection angles in radians - ObjSize=objsize, # Reconstructed object dimensions (scalar) + ObjSize=recon_size, # Reconstructed object dimensions (scalar) device_projector=gpu_id, ) reconstruction = RecToolsCP.FBP3D(data) cp._default_memory_pool.free_all_blocks() + # perform masking to the result of reconstruction if needed + if recon_mask_radius is not None: + reconstruction = _apply_circular_mask(reconstruction, recon_mask_radius) return cp.swapaxes(reconstruction,0,1) ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% ## @@ -149,9 +170,10 @@ def SIRT( data: cp.ndarray, angles: np.ndarray, center: Optional[float] = None, - objsize: Optional[int] = None, + recon_size: Optional[int] = None, iterations: Optional[int] = 300, nonnegativity: Optional[bool] = True, + recon_mask_radius: Optional[float] = None, gpu_id: int = 0, ) -> cp.ndarray: """ @@ -166,12 +188,17 @@ def SIRT( An array of angles given in radians. center : float, optional The center of rotation (CoR). - objsize : int, optional - The size in pixels of the reconstructed object. + recon_size : int, optional + The [recon_size, recon_size] shape of the reconstructed slice in pixels. + By default (None), the reconstructed size will be the dimension of the horizontal detector. iterations : int, optional The number of SIRT iterations. nonnegativity : bool, optional Impose nonnegativity constraint on reconstructed image. + recon_mask_radius: int, optional + The radius of the circular mask that applies to the reconstructed slice in order to crop + out some undesirable artefacts. The values outside the diameter will be set to zero. + None by default, to see the effect of the mask try setting the value in the range [0.7-1.0]. gpu_id : int, optional A GPU device index to perform operation on. 
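Analogous hedged usage for the iterative solver below, reusing the illustrative sino and angles from the FBP sketch above; the iteration count is picked arbitrarily:

    vol = SIRT(sino, angles, center=79.5, recon_size=160,
               iterations=150, nonnegativity=True, recon_mask_radius=0.9)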
@@ -184,20 +211,23 @@ def SIRT( if center is None: center = data.shape[2] // 2 # making a crude guess - if objsize is None: - objsize = data.shape[2] + if recon_size is None: + recon_size = data.shape[2] RecToolsCP = RecToolsIRCuPy(DetectorsDimH=data.shape[2], # Horizontal detector dimension DetectorsDimV=data.shape[1], # Vertical detector dimension (3D case) CenterRotOffset=data.shape[2] / 2 - center - 0.5, # Center of Rotation scalar or a vector AnglesVec=-angles, # A vector of projection angles in radians - ObjSize=objsize, # Reconstructed object dimensions (scalar) + ObjSize=recon_size, # Reconstructed object dimensions (scalar) device_projector=gpu_id, ) _data_ = {"projection_norm_data": data} # data dictionary _algorithm_ = {"iterations": iterations, "nonnegativity": nonnegativity} reconstruction = RecToolsCP.SIRT(_data_, _algorithm_) cp._default_memory_pool.free_all_blocks() + # perform masking to the result of reconstruction if needed + if recon_mask_radius is not None: + reconstruction = _apply_circular_mask(reconstruction, recon_mask_radius) return cp.swapaxes(reconstruction,0,1) ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% ## @@ -235,9 +265,10 @@ def CGLS( data: cp.ndarray, angles: np.ndarray, center: Optional[float] = None, - objsize: Optional[int] = None, + recon_size: Optional[int] = None, iterations: Optional[int] = 20, nonnegativity: Optional[bool] = True, + recon_mask_radius: Optional[float] = None, gpu_id: int = 0, ) -> cp.ndarray: """ @@ -252,12 +283,17 @@ def CGLS( An array of angles given in radians. center : float, optional The center of rotation (CoR). - objsize : int, optional - The size in pixels of the reconstructed object. + recon_size : int, optional + The [recon_size, recon_size] shape of the reconstructed slice in pixels. + By default (None), the reconstructed size will be the dimension of the horizontal detector. iterations : int, optional The number of CGLS iterations. nonnegativity : bool, optional Impose nonnegativity constraint on reconstructed image. + recon_mask_radius: int, optional + The radius of the circular mask that applies to the reconstructed slice in order to crop + out some undesirable artefacts. The values outside the diameter will be set to zero. + None by default, to see the effect of the mask try setting the value in the range [0.7-1.0]. gpu_id : int, optional A GPU device index to perform operation on. 
@@ -270,20 +306,23 @@ def CGLS( if center is None: center = data.shape[2] // 2 # making a crude guess - if objsize is None: - objsize = data.shape[2] + if recon_size is None: + recon_size = data.shape[2] RecToolsCP = RecToolsIRCuPy(DetectorsDimH=data.shape[2], # Horizontal detector dimension DetectorsDimV=data.shape[1], # Vertical detector dimension (3D case) CenterRotOffset=data.shape[2] / 2 - center - 0.5, # Center of Rotation scalar or a vector AnglesVec=-angles, # A vector of projection angles in radians - ObjSize=objsize, # Reconstructed object dimensions (scalar) + ObjSize=recon_size, # Reconstructed object dimensions (scalar) device_projector=gpu_id, ) _data_ = {"projection_norm_data": data} # data dictionary _algorithm_ = {"iterations": iterations, "nonnegativity": nonnegativity} reconstruction = RecToolsCP.CGLS(_data_, _algorithm_) cp._default_memory_pool.free_all_blocks() + # perform masking to the result of reconstruction if needed + if recon_mask_radius is not None: + reconstruction = _apply_circular_mask(reconstruction, recon_mask_radius) return cp.swapaxes(reconstruction,0,1) ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% ## \ No newline at end of file diff --git a/tests/test_recon/test_algorithm.py b/tests/test_recon/test_algorithm.py index 66fb3ac8..c843eca2 100644 --- a/tests/test_recon/test_algorithm.py +++ b/tests/test_recon/test_algorithm.py @@ -11,8 +11,6 @@ import pytest from cupy.cuda import nvtx -from tests import MaxMemoryHook - @cp.testing.gpu def test_reconstruct_FBP_1(data, flats, darks, ensure_clean_memory): @@ -27,6 +25,7 @@ def test_reconstruct_FBP_1(data, flats, darks, ensure_clean_memory): assert_allclose(np.std(recon_data), 0.006293, rtol=1e-07, atol=1e-6) assert_allclose(np.median(recon_data), -0.000555, rtol=1e-07, atol=1e-6) assert recon_data.dtype == np.float32 + assert recon_data.shape == (160, 128, 160) @cp.testing.gpu @@ -47,38 +46,23 @@ def test_reconstruct_FBP_2(data, flats, darks, ensure_clean_memory): @cp.testing.gpu -def test_reconstruct_FBP_hook(data, flats, darks, ensure_clean_memory): - normalized = normalize_cupy(data, flats, darks, cutoff=10, minus_log=True) - - cp.get_default_memory_pool().free_all_blocks() - cache = cp.fft.config.get_plan_cache() - cache.clear() - - objrecon_size = data.shape[2] - hook = MaxMemoryHook(normalized.size * normalized.itemsize) - with hook: - recon_data = FBP( - normalized, - np.linspace(0.0 * np.pi / 180.0, 180.0 * np.pi / 180.0, data.shape[0]), - 79.5, - objsize=objrecon_size, - ) - - # make sure estimator function is within range (80% min, 100% max) - max_mem = hook.max_mem - actual_slices = data.shape[1] - estimated_slices, dtype_out, output_dims = FBP.meta.calc_max_slices(1, - (data.shape[0], data.shape[2]), - normalized.dtype, - max_mem, - objsize=objrecon_size) - assert estimated_slices <= actual_slices - assert estimated_slices / actual_slices >= 0.5 # lowering because hook doesn't extend to ASTRA functions +def test_reconstruct_FBP_3(data, flats, darks, ensure_clean_memory): + recon_data = FBP( + normalize_cupy(data, flats, darks, cutoff=20.5, minus_log=False), + np.linspace(5.0 * np.pi / 360.0, 180.0 * np.pi / 360.0, + data.shape[0]), + 79, # center + 210, # recon_size + 0.9, # recon_mask_radius + ) recon_data = recon_data.get() - assert_allclose(np.mean(recon_data), 0.00079770206, rtol=1e-6) - assert_allclose(np.mean(recon_data, axis=(0, 2)).sum(), 0.10210582, rtol=1e-6) - + assert_allclose(np.mean(recon_data), -0.000252, atol=1e-6) + assert_allclose( + 
np.mean(recon_data, axis=(0, 2)).sum(), -0.03229, rtol=1e-06, atol=1e-5 + ) + assert recon_data.dtype == np.float32 + assert recon_data.shape == (210, 128, 210) @cp.testing.gpu def test_reconstruct_SIRT(data, flats, darks, ensure_clean_memory): @@ -87,7 +71,7 @@ def test_reconstruct_SIRT(data, flats, darks, ensure_clean_memory): normalize_cupy(data, flats, darks, cutoff=10, minus_log=True), np.linspace(0.0 * np.pi / 180.0, 180.0 * np.pi / 180.0, data.shape[0]), 79.5, - objsize=objrecon_size, + recon_size=objrecon_size, iterations=10, nonnegativity=True, ) @@ -97,72 +81,72 @@ def test_reconstruct_SIRT(data, flats, darks, ensure_clean_memory): assert recon_data.dtype == np.float32 -@cp.testing.gpu -def test_reconstruct_SIRT_hook(data, flats, darks, ensure_clean_memory): - objrecon_size = data.shape[2] - normalized = normalize_cupy(data, flats, darks, cutoff=10, minus_log=True) - - cp.get_default_memory_pool().free_all_blocks() - cache = cp.fft.config.get_plan_cache() - cache.clear() - - objrecon_size = data.shape[2] - hook = MaxMemoryHook(normalized.size * normalized.itemsize) - with hook: - recon_data = SIRT( - normalized, - np.linspace(0.0 * np.pi / 180.0, 180.0 * np.pi / 180.0, data.shape[0]), - 79.5, - objsize=objrecon_size, - iterations=2, - nonnegativity=True, - ) - - # make sure estimator function is within range (80% min, 100% max) - max_mem = hook.max_mem - actual_slices = data.shape[1] - estimated_slices, dtype_out, output_dims = SIRT.meta.calc_max_slices(1, - (data.shape[0], data.shape[2]), - normalized.dtype, - max_mem, - objsize=objrecon_size) - assert estimated_slices <= actual_slices - assert estimated_slices / actual_slices >= 0.5 # lowering because hook doesn't extend to ASTRA functions - - -@cp.testing.gpu -def test_reconstruct_SIRT_hook2(ensure_clean_memory): - np.random.seed(12345) - data_host = np.random.random_sample(size=(1801, 10, 2560)).astype(np.float32) * 2.0 - data = cp.asarray(data_host, dtype=np.float32) - - objrecon_size = data.shape[2] - cp.get_default_memory_pool().free_all_blocks() - cache = cp.fft.config.get_plan_cache() - cache.clear() - - objrecon_size = data.shape[2] - hook = MaxMemoryHook(data.size * data.itemsize) - with hook: - recon_data = SIRT( - data, - np.linspace(0.0 * np.pi / 180.0, 180.0 * np.pi / 180.0, data.shape[0]), - 1200, - objsize=objrecon_size, - iterations=2, - nonnegativity=True, - ) - - # make sure estimator function is within range (80% min, 100% max) - max_mem = hook.max_mem - actual_slices = data.shape[1] - estimated_slices, dtype_out, output_dims = SIRT.meta.calc_max_slices(1, - (data.shape[0], data.shape[2]), - data.dtype, - max_mem, - objsize=objrecon_size) - assert estimated_slices <= actual_slices - assert estimated_slices / actual_slices >= 0.5 # lowering because hook doesn't extend to ASTRA functions +# @cp.testing.gpu +# def test_reconstruct_SIRT_hook(data, flats, darks, ensure_clean_memory): +# objrecon_size = data.shape[2] +# normalized = normalize_cupy(data, flats, darks, cutoff=10, minus_log=True) + +# cp.get_default_memory_pool().free_all_blocks() +# cache = cp.fft.config.get_plan_cache() +# cache.clear() + +# objrecon_size = data.shape[2] +# hook = MaxMemoryHook(normalized.size * normalized.itemsize) +# with hook: +# recon_data = SIRT( +# normalized, +# np.linspace(0.0 * np.pi / 180.0, 180.0 * np.pi / 180.0, data.shape[0]), +# 79.5, +# objsize=objrecon_size, +# iterations=2, +# nonnegativity=True, +# ) + +# # make sure estimator function is within range (80% min, 100% max) +# max_mem = hook.max_mem +# actual_slices = 
data.shape[1] +# estimated_slices, dtype_out, output_dims = SIRT.meta.calc_max_slices(1, +# (data.shape[0], data.shape[2]), +# normalized.dtype, +# max_mem, +# objsize=objrecon_size) +# assert estimated_slices <= actual_slices +# assert estimated_slices / actual_slices >= 0.5 # lowering because hook doesn't extend to ASTRA functions + + +# @cp.testing.gpu +# def test_reconstruct_SIRT_hook2(ensure_clean_memory): +# np.random.seed(12345) +# data_host = np.random.random_sample(size=(1801, 10, 2560)).astype(np.float32) * 2.0 +# data = cp.asarray(data_host, dtype=np.float32) + +# objrecon_size = data.shape[2] +# cp.get_default_memory_pool().free_all_blocks() +# cache = cp.fft.config.get_plan_cache() +# cache.clear() + +# objrecon_size = data.shape[2] +# hook = MaxMemoryHook(data.size * data.itemsize) +# with hook: +# recon_data = SIRT( +# data, +# np.linspace(0.0 * np.pi / 180.0, 180.0 * np.pi / 180.0, data.shape[0]), +# 1200, +# objsize=objrecon_size, +# iterations=2, +# nonnegativity=True, +# ) + +# # make sure estimator function is within range (80% min, 100% max) +# max_mem = hook.max_mem +# actual_slices = data.shape[1] +# estimated_slices, dtype_out, output_dims = SIRT.meta.calc_max_slices(1, +# (data.shape[0], data.shape[2]), +# data.dtype, +# max_mem, +# objsize=objrecon_size) +# assert estimated_slices <= actual_slices +# assert estimated_slices / actual_slices >= 0.5 # lowering because hook doesn't extend to ASTRA functions @cp.testing.gpu @@ -172,7 +156,7 @@ def test_reconstruct_CGLS(data, flats, darks, ensure_clean_memory): normalize_cupy(data, flats, darks, cutoff=10, minus_log=True), np.linspace(0.0 * np.pi / 180.0, 180.0 * np.pi / 180.0, data.shape[0]), 79.5, - objsize=objrecon_size, + recon_size=objrecon_size, iterations=5, nonnegativity=True, ) @@ -182,73 +166,73 @@ def test_reconstruct_CGLS(data, flats, darks, ensure_clean_memory): assert recon_data.dtype == np.float32 -@cp.testing.gpu -def test_reconstruct_CGLS_hook(data, flats, darks, ensure_clean_memory): - objrecon_size = data.shape[2] - normalized = normalize_cupy(data, flats, darks, cutoff=10, minus_log=True) - - cp.get_default_memory_pool().free_all_blocks() - cache = cp.fft.config.get_plan_cache() - cache.clear() - - objrecon_size = data.shape[2] - hook = MaxMemoryHook(normalized.size * normalized.itemsize) - with hook: - recon_data = CGLS( - normalized, - np.linspace(0.0 * np.pi / 180.0, 180.0 * np.pi / 180.0, data.shape[0]), - 79.5, - objsize=objrecon_size, - iterations=2, - nonnegativity=True, - ) - - # make sure estimator function is within range (80% min, 100% max) - max_mem = hook.max_mem - actual_slices = data.shape[1] - estimated_slices, dtype_out, output_dims = CGLS.meta.calc_max_slices(1, - (data.shape[0], data.shape[2]), - normalized.dtype, - max_mem, - objsize=objrecon_size) - - assert estimated_slices <= actual_slices - assert estimated_slices / actual_slices >= 0.5 # lowering because hook doesn't extend to ASTRA functions - - -@cp.testing.gpu -def test_reconstruct_CGLS_hook2(ensure_clean_memory): - np.random.seed(12345) - data_host = np.random.random_sample(size=(1801, 10, 2560)).astype(np.float32) * 2.0 - data = cp.asarray(data_host, dtype=np.float32) - - objrecon_size = data.shape[2] - cp.get_default_memory_pool().free_all_blocks() - cache = cp.fft.config.get_plan_cache() - cache.clear() - - objrecon_size = data.shape[2] - hook = MaxMemoryHook(data.size * data.itemsize) - with hook: - recon_data = CGLS( - data, - np.linspace(0.0 * np.pi / 180.0, 180.0 * np.pi / 180.0, data.shape[0]), - 1200, - 
objsize=objrecon_size, - iterations=2, - nonnegativity=True, - ) - - # make sure estimator function is within range (80% min, 100% max) - max_mem = hook.max_mem - actual_slices = data.shape[1] - estimated_slices, dtype_out, output_dims = CGLS.meta.calc_max_slices(1, - (data.shape[0], data.shape[2]), - data.dtype, - max_mem, - objsize=objrecon_size) - assert estimated_slices <= actual_slices - assert estimated_slices / actual_slices >= 0.5 # lowering because hook doesn't extend to ASTRA functions +# @cp.testing.gpu +# def test_reconstruct_CGLS_hook(data, flats, darks, ensure_clean_memory): +# objrecon_size = data.shape[2] +# normalized = normalize_cupy(data, flats, darks, cutoff=10, minus_log=True) + +# cp.get_default_memory_pool().free_all_blocks() +# cache = cp.fft.config.get_plan_cache() +# cache.clear() + +# objrecon_size = data.shape[2] +# hook = MaxMemoryHook(normalized.size * normalized.itemsize) +# with hook: +# recon_data = CGLS( +# normalized, +# np.linspace(0.0 * np.pi / 180.0, 180.0 * np.pi / 180.0, data.shape[0]), +# 79.5, +# objsize=objrecon_size, +# iterations=2, +# nonnegativity=True, +# ) + +# # make sure estimator function is within range (80% min, 100% max) +# max_mem = hook.max_mem +# actual_slices = data.shape[1] +# estimated_slices, dtype_out, output_dims = CGLS.meta.calc_max_slices(1, +# (data.shape[0], data.shape[2]), +# normalized.dtype, +# max_mem, +# objsize=objrecon_size) + +# assert estimated_slices <= actual_slices +# assert estimated_slices / actual_slices >= 0.5 # lowering because hook doesn't extend to ASTRA functions + + +# @cp.testing.gpu +# def test_reconstruct_CGLS_hook2(ensure_clean_memory): +# np.random.seed(12345) +# data_host = np.random.random_sample(size=(1801, 10, 2560)).astype(np.float32) * 2.0 +# data = cp.asarray(data_host, dtype=np.float32) + +# objrecon_size = data.shape[2] +# cp.get_default_memory_pool().free_all_blocks() +# cache = cp.fft.config.get_plan_cache() +# cache.clear() + +# objrecon_size = data.shape[2] +# hook = MaxMemoryHook(data.size * data.itemsize) +# with hook: +# recon_data = CGLS( +# data, +# np.linspace(0.0 * np.pi / 180.0, 180.0 * np.pi / 180.0, data.shape[0]), +# 1200, +# objsize=objrecon_size, +# iterations=2, +# nonnegativity=True, +# ) + +# # make sure estimator function is within range (80% min, 100% max) +# max_mem = hook.max_mem +# actual_slices = data.shape[1] +# estimated_slices, dtype_out, output_dims = CGLS.meta.calc_max_slices(1, +# (data.shape[0], data.shape[2]), +# data.dtype, +# max_mem, +# objsize=objrecon_size) +# assert estimated_slices <= actual_slices +# assert estimated_slices / actual_slices >= 0.5 # lowering because hook doesn't extend to ASTRA functions @cp.testing.gpu From 354e676054a9ac8d082a6c3a9d848ebae9e3b769 Mon Sep 17 00:00:00 2001 From: dkazanc Date: Tue, 26 Sep 2023 12:32:43 +0100 Subject: [PATCH 06/26] fixes the remaining reconstruction methods --- httomolibgpu/recon/algorithm.py | 86 ------------------ tests/test_recon/test_algorithm.py | 138 ----------------------------- 2 files changed, 224 deletions(-) diff --git a/httomolibgpu/recon/algorithm.py b/httomolibgpu/recon/algorithm.py index 6d3ea005..3f290fda 100644 --- a/httomolibgpu/recon/algorithm.py +++ b/httomolibgpu/recon/algorithm.py @@ -27,7 +27,6 @@ import cupyx import numpy as np import nvtx -from httomolibgpu.decorator import method_sino from httomolibgpu.cuda_kernels import load_cuda_module @@ -37,31 +36,6 @@ "CGLS", ] -def _calc_max_slices_FBP( - non_slice_dims_shape: Tuple[int, int], - dtype: np.dtype, - available_memory: 
int, - **kwargs -) -> Tuple[int, np.dtype, Tuple[int, int]]: - # we first run filtersync, and calc the memory for that - DetectorsLengthH = non_slice_dims_shape[1] - in_slice_size = np.prod(non_slice_dims_shape) * dtype.itemsize - filter_size = (DetectorsLengthH//2+1) * float32().itemsize - freq_slice = non_slice_dims_shape[0] * (DetectorsLengthH//2+1) * complex64().itemsize - fftplan_size = freq_slice * 2 - filtered_in_data = np.prod(non_slice_dims_shape) * float32().itemsize - # calculate the output shape - objsize = kwargs['objsize'] - if objsize is None: - objsize = DetectorsLengthH - output_dims = (objsize, objsize) - # astra backprojection will generate an output array - astra_out_size = (np.prod(output_dims) * float32().itemsize) - - available_memory -= filter_size - slices_max = available_memory // int(2*in_slice_size + filtered_in_data + freq_slice + fftplan_size + 2*astra_out_size) - return (slices_max, float32(), output_dims) - def _apply_circular_mask(data, recon_mask_radius): recon_size = data.shape[1] @@ -76,7 +50,6 @@ def _apply_circular_mask(data, recon_mask_radius): ## %%%%%%%%%%%%%%%%%%%%%%% FBP reconstruction %%%%%%%%%%%%%%%%%%%%%%%%%%%% ## -@method_sino(_calc_max_slices_FBP) @nvtx.annotate() def FBP( data: cp.ndarray, @@ -135,36 +108,7 @@ def FBP( ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% ## -def _calc_max_slices_SIRT( - non_slice_dims_shape: Tuple[int, int], - dtype: np.dtype, - available_memory: int, **kwargs -) -> Tuple[int, np.dtype, Tuple[int, int]]: - # calculate the output shape - DetectorsLengthH = non_slice_dims_shape[1] - objsize = kwargs['objsize'] - if objsize is None: - objsize = DetectorsLengthH - output_dims = (objsize, objsize) - - # input/output - data_out = np.prod(non_slice_dims_shape) * dtype.itemsize - x_rec = np.prod(output_dims) * dtype.itemsize - # preconditioning matrices R and C - R_mat = data_out - C_mat = x_rec - # update_term - C_R_res = C_mat + 2*R_mat - # a guess for astra toolbox memory usage for projection/backprojection - astra_size = 2*(x_rec+data_out) - - total_mem = int(data_out + x_rec + R_mat + C_mat + C_R_res + astra_size) - slices_max = available_memory // total_mem - return (slices_max, float32(), output_dims) - - ## %%%%%%%%%%%%%%%%%%%%%%% SIRT reconstruction %%%%%%%%%%%%%%%%%%%%%%%%%%%% ## -@method_sino(_calc_max_slices_SIRT) @nvtx.annotate() def SIRT( data: cp.ndarray, @@ -230,36 +174,7 @@ def SIRT( reconstruction = _apply_circular_mask(reconstruction, recon_mask_radius) return cp.swapaxes(reconstruction,0,1) -## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% ## -def _calc_max_slices_CGLS( - non_slice_dims_shape: Tuple[int, int], - dtype: np.dtype, - available_memory: int, **kwargs -) -> Tuple[int, np.dtype, Tuple[int, int]]: - # calculate the output shape - DetectorsLengthH = non_slice_dims_shape[1] - objsize = kwargs['objsize'] - if objsize is None: - objsize = DetectorsLengthH - output_dims = (objsize, objsize) - - # input/output - data_out = np.prod(non_slice_dims_shape) * dtype.itemsize - x_rec = np.prod(output_dims) * dtype.itemsize - # d and r vectors - d = x_rec - r = data_out - Ad = 2*data_out - s = x_rec - # a guess for astra toolbox memory usage for projection/backprojection - astra_size = 2*(x_rec+data_out) - - total_mem = int(data_out + x_rec + d + r + Ad + s + astra_size) - slices_max = available_memory // total_mem - return (slices_max, float32(), output_dims) - ## %%%%%%%%%%%%%%%%%%%%%%% CGLS reconstruction %%%%%%%%%%%%%%%%%%%%%%%%%%%% ## 
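For context on what is being dropped in this hunk: every `_calc_max_slices_*` estimator followed the same recipe, totalling the per-slice byte cost of each buffer the algorithm allocates (input, output, temporaries, plus a guess for the ASTRA toolbox's internal buffers) and dividing the available GPU memory by that total. A minimal standalone sketch of the pattern, with illustrative names that are not part of the library API:

import numpy as np

def max_slices_sketch(non_slice_dims_shape, dtype, available_memory, recon_size=None):
    # per-slice cost of one sinogram slice in the input dtype
    in_slice = int(np.prod(non_slice_dims_shape)) * np.dtype(dtype).itemsize
    # per-slice cost of one reconstructed slice (float32 output)
    if recon_size is None:
        recon_size = non_slice_dims_shape[1]
    out_slice = recon_size * recon_size * np.float32().nbytes
    # crude allowance for ASTRA's projection/backprojection work buffers
    astra_slice = 2 * (in_slice + out_slice)
    return available_memory // (in_slice + out_slice + astra_slice)

Judging by the commit messages in this series, this accounting is presumably meant to move to the calling framework (httomo) rather than live inside each method.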
-@method_sino(_calc_max_slices_CGLS) @nvtx.annotate() def CGLS( data: cp.ndarray, @@ -324,5 +239,4 @@ def CGLS( if recon_mask_radius is not None: reconstruction = _apply_circular_mask(reconstruction, recon_mask_radius) return cp.swapaxes(reconstruction,0,1) - ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% ## \ No newline at end of file diff --git a/tests/test_recon/test_algorithm.py b/tests/test_recon/test_algorithm.py index c843eca2..6a3b7d84 100644 --- a/tests/test_recon/test_algorithm.py +++ b/tests/test_recon/test_algorithm.py @@ -80,75 +80,6 @@ def test_reconstruct_SIRT(data, flats, darks, ensure_clean_memory): assert_allclose(np.mean(recon_data, axis=(0, 2)).sum(), 0.23612846, rtol=1e-05) assert recon_data.dtype == np.float32 - -# @cp.testing.gpu -# def test_reconstruct_SIRT_hook(data, flats, darks, ensure_clean_memory): -# objrecon_size = data.shape[2] -# normalized = normalize_cupy(data, flats, darks, cutoff=10, minus_log=True) - -# cp.get_default_memory_pool().free_all_blocks() -# cache = cp.fft.config.get_plan_cache() -# cache.clear() - -# objrecon_size = data.shape[2] -# hook = MaxMemoryHook(normalized.size * normalized.itemsize) -# with hook: -# recon_data = SIRT( -# normalized, -# np.linspace(0.0 * np.pi / 180.0, 180.0 * np.pi / 180.0, data.shape[0]), -# 79.5, -# objsize=objrecon_size, -# iterations=2, -# nonnegativity=True, -# ) - -# # make sure estimator function is within range (80% min, 100% max) -# max_mem = hook.max_mem -# actual_slices = data.shape[1] -# estimated_slices, dtype_out, output_dims = SIRT.meta.calc_max_slices(1, -# (data.shape[0], data.shape[2]), -# normalized.dtype, -# max_mem, -# objsize=objrecon_size) -# assert estimated_slices <= actual_slices -# assert estimated_slices / actual_slices >= 0.5 # lowering because hook doesn't extend to ASTRA functions - - -# @cp.testing.gpu -# def test_reconstruct_SIRT_hook2(ensure_clean_memory): -# np.random.seed(12345) -# data_host = np.random.random_sample(size=(1801, 10, 2560)).astype(np.float32) * 2.0 -# data = cp.asarray(data_host, dtype=np.float32) - -# objrecon_size = data.shape[2] -# cp.get_default_memory_pool().free_all_blocks() -# cache = cp.fft.config.get_plan_cache() -# cache.clear() - -# objrecon_size = data.shape[2] -# hook = MaxMemoryHook(data.size * data.itemsize) -# with hook: -# recon_data = SIRT( -# data, -# np.linspace(0.0 * np.pi / 180.0, 180.0 * np.pi / 180.0, data.shape[0]), -# 1200, -# objsize=objrecon_size, -# iterations=2, -# nonnegativity=True, -# ) - -# # make sure estimator function is within range (80% min, 100% max) -# max_mem = hook.max_mem -# actual_slices = data.shape[1] -# estimated_slices, dtype_out, output_dims = SIRT.meta.calc_max_slices(1, -# (data.shape[0], data.shape[2]), -# data.dtype, -# max_mem, -# objsize=objrecon_size) -# assert estimated_slices <= actual_slices -# assert estimated_slices / actual_slices >= 0.5 # lowering because hook doesn't extend to ASTRA functions - - @cp.testing.gpu def test_reconstruct_CGLS(data, flats, darks, ensure_clean_memory): objrecon_size = data.shape[2] @@ -166,75 +97,6 @@ def test_reconstruct_CGLS(data, flats, darks, ensure_clean_memory): assert recon_data.dtype == np.float32 -# @cp.testing.gpu -# def test_reconstruct_CGLS_hook(data, flats, darks, ensure_clean_memory): -# objrecon_size = data.shape[2] -# normalized = normalize_cupy(data, flats, darks, cutoff=10, minus_log=True) - -# cp.get_default_memory_pool().free_all_blocks() -# cache = cp.fft.config.get_plan_cache() -# cache.clear() - -# objrecon_size = 
data.shape[2] -# hook = MaxMemoryHook(normalized.size * normalized.itemsize) -# with hook: -# recon_data = CGLS( -# normalized, -# np.linspace(0.0 * np.pi / 180.0, 180.0 * np.pi / 180.0, data.shape[0]), -# 79.5, -# objsize=objrecon_size, -# iterations=2, -# nonnegativity=True, -# ) - -# # make sure estimator function is within range (80% min, 100% max) -# max_mem = hook.max_mem -# actual_slices = data.shape[1] -# estimated_slices, dtype_out, output_dims = CGLS.meta.calc_max_slices(1, -# (data.shape[0], data.shape[2]), -# normalized.dtype, -# max_mem, -# objsize=objrecon_size) - -# assert estimated_slices <= actual_slices -# assert estimated_slices / actual_slices >= 0.5 # lowering because hook doesn't extend to ASTRA functions - - -# @cp.testing.gpu -# def test_reconstruct_CGLS_hook2(ensure_clean_memory): -# np.random.seed(12345) -# data_host = np.random.random_sample(size=(1801, 10, 2560)).astype(np.float32) * 2.0 -# data = cp.asarray(data_host, dtype=np.float32) - -# objrecon_size = data.shape[2] -# cp.get_default_memory_pool().free_all_blocks() -# cache = cp.fft.config.get_plan_cache() -# cache.clear() - -# objrecon_size = data.shape[2] -# hook = MaxMemoryHook(data.size * data.itemsize) -# with hook: -# recon_data = CGLS( -# data, -# np.linspace(0.0 * np.pi / 180.0, 180.0 * np.pi / 180.0, data.shape[0]), -# 1200, -# objsize=objrecon_size, -# iterations=2, -# nonnegativity=True, -# ) - -# # make sure estimator function is within range (80% min, 100% max) -# max_mem = hook.max_mem -# actual_slices = data.shape[1] -# estimated_slices, dtype_out, output_dims = CGLS.meta.calc_max_slices(1, -# (data.shape[0], data.shape[2]), -# data.dtype, -# max_mem, -# objsize=objrecon_size) -# assert estimated_slices <= actual_slices -# assert estimated_slices / actual_slices >= 0.5 # lowering because hook doesn't extend to ASTRA functions - - @cp.testing.gpu @pytest.mark.perf def test_FBP_performance(ensure_clean_memory): From 41ed059dbb60885c4514c11d44faf13e0339abdb Mon Sep 17 00:00:00 2001 From: dkazanc Date: Tue, 26 Sep 2023 16:30:47 +0100 Subject: [PATCH 07/26] fixes rotation --- httomolibgpu/recon/rotation.py | 21 --------------------- tests/test_recon/test_rotation.py | 28 ++-------------------------- 2 files changed, 2 insertions(+), 47 deletions(-) diff --git a/httomolibgpu/recon/rotation.py b/httomolibgpu/recon/rotation.py index b9de8888..78f4e5b8 100644 --- a/httomolibgpu/recon/rotation.py +++ b/httomolibgpu/recon/rotation.py @@ -32,7 +32,6 @@ from cupyx.scipy.ndimage import gaussian_filter, shift from httomolibgpu.cuda_kernels import load_cuda_module -from httomolibgpu.decorator import method_sino __all__ = [ "find_center_vo", @@ -40,15 +39,6 @@ ] -def _calc_max_slices_center_vo( - non_slice_dims_shape: Tuple[int, int], - dtype: np.dtype, available_memory: int, **kwargs -) -> Tuple[int, np.dtype, Tuple[int, int]]: - # the function works on one slice from the sinogram all the way through (picks a specific index) - # so memory is not restricted as long as a single slice can fit - return (1000000, dtype, non_slice_dims_shape) - -@method_sino(_calc_max_slices_center_vo) @nvtx.annotate() def find_center_vo( data: cp.ndarray, @@ -345,18 +335,7 @@ def _downsample(sino, level, axis): kernel(grid_dims, block_dims, params, shared_mem=shared_mem_bytes) return downsampled_data - -def _calc_max_slices_center_360( - non_slice_dims_shape: Tuple[int, int], - dtype: np.dtype, available_memory: int, **kwargs -) -> Tuple[int, np.dtype, Tuple[int, int]]: - # the function works on one slice from the sinogram all 
the way through (picks 0 or a specific index) - # so memory is not restricted as long as a single slice can fit - return (1000000, dtype, non_slice_dims_shape) - - # --- Center of rotation (COR) estimation method ---# -@method_sino(_calc_max_slices_center_360) @nvtx.annotate() def find_center_360( data: cp.ndarray, diff --git a/tests/test_recon/test_rotation.py b/tests/test_recon/test_rotation.py index 8c883af2..97f8ead3 100644 --- a/tests/test_recon/test_rotation.py +++ b/tests/test_recon/test_rotation.py @@ -25,8 +25,6 @@ def test_find_center_vo(data, flats, darks): #: Check that we only get a float32 output assert cor.dtype == np.float32 - assert find_center_vo.meta.pattern == 'sinogram' - assert 'find_center_vo' in method_registry['httomolibgpu']['recon']['rotation'] @cp.testing.gpu @@ -38,17 +36,13 @@ def test_find_center_vo_ones(ensure_clean_memory): mat = None #: free up GPU memory - @cp.testing.gpu def test_find_center_vo_random(ensure_clean_memory): np.random.seed(12345) - data_host = np.random.random_sample(size=(1801, 5, 2560)).astype(np.float32) * 2.0 + data_host = np.random.random_sample(size=(900, 1, 1280)).astype(np.float32) * 2.0 data = cp.asarray(data_host, dtype=np.float32) - cent = find_center_vo(data) - - assert_allclose(cent, 1246.75) - + assert_allclose(cent, 680.75) @cp.testing.gpu def test_find_center_vo_calculate_chunks(): @@ -78,19 +72,6 @@ def test_find_center_vo_calculate_chunks(): np.testing.assert_array_equal(diffs[:-1], diffs[0]) assert diffs[-1] <= diffs[0] - -@cp.testing.gpu -def test_find_center_vo_data_chunked(data, flats, darks): - data = normalize(data, flats, darks) - - # we emulate less memory here - with this amount, we get 12 chunks and 3 chunks in the 2 calls, - # and the data should be the same as when it's all fitting - with mock.patch('httomolibgpu.recon.rotation._get_available_gpu_memory', return_value=10000000): - cor = find_center_vo(data) - - assert_allclose(cor, 79.5) - - @cp.testing.gpu @pytest.mark.perf def test_find_center_vo_performance(): @@ -134,11 +115,6 @@ def test_find_center_360_data(data): assert side == 1 assert_allclose(overlap_pos, 111.906334, rtol=eps) - #: Check meta - assert find_center_360.meta.pattern == 'sinogram' - assert 'find_center_360' in method_registry['httomolibgpu']['recon']['rotation'] - - @cp.testing.gpu def test_find_center_360_1D_raises(data): #: 360-degree sinogram must be a 3d array From 3c182069641b5baaa3ad52e480fbbf3fabe451f4 Mon Sep 17 00:00:00 2001 From: Daniil Kazantsev Date: Thu, 28 Sep 2023 10:08:41 +0100 Subject: [PATCH 08/26] tests pass --- httomolibgpu/__init__.py | 1 - httomolibgpu/decorator.py | 329 ------------------------------ httomolibgpu/prep/normalize.py | 24 --- httomolibgpu/prep/phase.py | 1 - tests/__init__.py | 26 --- tests/test_decorator.py | 66 ------ tests/test_misc/test_corr.py | 1 - tests/test_prep/test_alignment.py | 1 - tests/test_prep/test_normalize.py | 1 - tests/test_prep/test_phase.py | 1 - tests/test_prep/test_stripe.py | 1 - tests/test_recon/test_rotation.py | 1 - 12 files changed, 453 deletions(-) delete mode 100644 httomolibgpu/decorator.py delete mode 100644 tests/__init__.py delete mode 100644 tests/test_decorator.py diff --git a/httomolibgpu/__init__.py b/httomolibgpu/__init__.py index c89258b5..9643e39b 100644 --- a/httomolibgpu/__init__.py +++ b/httomolibgpu/__init__.py @@ -1,4 +1,3 @@ -from .decorator import * from httomolibgpu.misc.corr import * from httomolibgpu.misc.morph import * from httomolibgpu.prep.alignment import * diff --git a/httomolibgpu/decorator.py 
b/httomolibgpu/decorator.py deleted file mode 100644 index f27d1765..00000000 --- a/httomolibgpu/decorator.py +++ /dev/null @@ -1,329 +0,0 @@ -import inspect - -from typing import Callable, Dict, List, Literal, Tuple, Protocol, TypeAlias, Union -import numpy as np -from dataclasses import dataclass, field - - -class MemoryFunction(Protocol): - """ - A callable signature for a function to determine the maximum chunk size supported, - given - - the slicing dimension, - - the size of the remaining (after slicing) input dimensions, - - the data type for the input, - - and the available memory in bytes. - It takes the actual method parameters as kwargs, which it may use if needed. - - It returns the maximum size in slice_dim dimension that can be supported - within the given memory. - """ - - def __call__( - self, - slice_dim: int, - non_slice_dims_shape: Tuple[int, int], - dtype: np.dtype, - available_memory: int, - **kwargs, - ) -> Tuple[int, np.dtype, Tuple[int, int]]: - """ - Calculate the maximum number of slices that can fit in the given memory, - for a method with the 'all' pattern. - - Parameters - ---------- - slice_dim : int - The dimension in which the slicing happens (0 for projection, 1 for sinogram) - non_slice_dims_shape : Tuple[int, int] - Shape of the data input in the other 2 dimensions that are not sliced - dtype : np.dtype - The numpy datatype for the input data - available_memory : int - The available memory to fit the slices, in bytes - kwargs : dict - Dictionary of the extra method parameters (apart from the data input) - - Returns - ------- - Tuple[int, np.dtype] - Tuple consisting of: - - the maximum number of slices that it can fit into the given available memory - - the output dtype for the given input dtype - - the output data shape - - """ - ... - - -class MemorySinglePattern(Protocol): - """ - Signature for a function to calculate the max chunk size for a method that - supports only one pattern. - It avoids the slice_dim parameter, as that is redundant in that case. - """ - - def __call__( - self, - non_slice_dims_shape: Tuple[int, int], - dtype: np.dtype, - available_memory: int, - **kwargs, - ) -> Tuple[int, np.dtype, Tuple[int, int]]: - """ - Calculate the maximum number of slices that can fit in the given memory, - for a method with the 'projection' or 'sinogram' pattern. - - Parameters - ---------- - non_slice_dims_shape : Tuple[int, int] - Shape of the data input in the other 2 dimensions that are not sliced - dtype : np.dtype - The numpy datatype for the input data - available_memory : int - The available memory to fit the slices, in bytes - kwargs : dict - Dictionary of the extra method paramters (apart from the data input) - - Returns - ------- - Tuple[int, np.dtype] - Tuple consisting of: - - the maximum number of slices that it can fit into the given available memory - - the output dtype for the given input dtype - - the output data shape - - """ - ... - - -@dataclass(frozen=True) -class MethodMeta: - """ - Class for meta properties of a method (to be stored as func.meta by the decorator). - - Attributes - ---------- - method_name : str - Name of the method as a string - signature : Signature - An inspect signature of the method with its arguments - module : List[str] - A list representing the module hierarchy where the method is defined, e.g. - ['httomolibgpu', 'prep', 'normlize'] for 'httomolibgpu/prep/normalize.py' - calc_max_slices : MemoryFunction - Method to calculate the maximum number of slices that can fit in the given - available memory. 
- pattern : Literal["projection", "sinogram", "all"] - The pattern supported by the method. - cpu : bool - Whether the method supports CPU data (numpy) - gpu : bool - Whether the method supports GPU data (cupy) - function : Callable - The actual method itself - others : dict - Dictionary of additional arbitrary meta information - """ - - method_name: str - signature: inspect.Signature - module: List[str] - calc_max_slices: MemoryFunction - pattern: Literal["projection", "sinogram", "all"] - cpu: bool - gpu: bool - function: Callable - others: dict = field(default_factory=dict) - - def __call__(self, *args, **kwargs): - return self.function(*args, **kwargs) - - -MetaDict: TypeAlias = Dict[str, MethodMeta] -method_registry: Dict[str, Union[MetaDict, MethodMeta]] = dict() - -def calc_max_slices_default( - slice_dim: int, - non_slice_dims_shape: Tuple[int, int], - dtype: np.dtype, - available_memory: int, - **kwargs, -) -> Tuple[int, np.dtype, Tuple[int, int]]: - """ - Default function for calculating maximum slices, which simply assumes - space for input and output only is required, both with the same datatype, - and no temporaries. - """ - slices_max = available_memory // int((np.prod(non_slice_dims_shape) + np.prod(non_slice_dims_shape)) * dtype.itemsize) - output_dims = non_slice_dims_shape - return (slices_max, dtype, output_dims) - -def calc_max_slices_single_pattern_default( - non_slice_dims_shape: Tuple[int, int], - dtype: np.dtype, - available_memory: int, - **kwargs -) -> Tuple[int, np.dtype, Tuple[int, int]]: - """ - Default function for calculating maximum slices, which simply assumes - space for input and output only is required, both with the same datatype, - and no temporaries. - """ - - return calc_max_slices_default(0, - non_slice_dims_shape, - dtype, - available_memory, - **kwargs) - -def method( - calc_max_slices: MemoryFunction = calc_max_slices_default, - cpuonly=False, - cpugpu=False, - **others, -): - """ - Decorator for exported tomography methods, annotating the method with - a function to calculate the memory requirements as well as other properties. - - Parameters - ---------- - calc_max_slices: MemoryFunction, optional - Function to calculate how many slices can fit in the given GPU memory at max. - If not given, it assumes memory is only needed for input and output and no - temporaries is needed. - It is not used for CPU-only functions. - cpuonly: bool, optional - Marks the method as CPU only (default: False). - cpugpu: bool, optional - Marks the method as supporting both CPU and GPU input arrays (default: False). - pattern: string, optional - Sets the patterns supported by the method: 'sinogram', 'projection', 'all'. (default: 'all') - **other: dict, optional - Any other keyword arguments will be added to the meta info as a dictionary. 
- """ - - def _method(func): - pattern = others["pattern"] if "pattern" in others else "all" - gpu = not cpuonly - cpu = cpugpu or cpuonly - assert not (cpuonly and cpugpu), "cpuonly and cpugpu are mutually exclusive" - func.meta = MethodMeta( - method_name=func.__name__, - signature=inspect.signature(func), - module=inspect.getmodule(func).__name__.split("."), - calc_max_slices=calc_max_slices, - pattern=pattern, - gpu=gpu, - cpu=cpu, - others=others, - function=func, - ) - - # register the method - lvl = method_registry - for m in func.meta.module: - if not m in lvl: - lvl[m] = dict() - lvl = lvl[m] - lvl[func.meta.method_name] = func.meta - - return func - - return _method - - -def method_sino( - calc_max_slices: MemorySinglePattern = calc_max_slices_single_pattern_default, - cpuonly=False, - cpugpu=False, - **others, -): - """ - Decorator for exported tomography methods, annotating the method with - a function to calculate the memory requirements as well as other properties. - - This is a convenience version for sinogram pattern. - - Parameters - ---------- - calc_max_slices: MemoryFunction, optional - Function to calculate how many slices can fit in the given GPU memory at max. - If not given, it assumes memory is only needed for input and output and no - temporaries is needed. - It is not used for CPU-only functions. - cpuonly: bool, optional - Marks the method as CPU only (default: False). - cpugpu: bool, optional - Marks the method as supporting both CPU and GPU input arrays (default: False). - **other: dict, optional - Any other keyword arguments will be added to the meta info as a dictionary. - """ - - def _calc_max_slices( - slice_dim: int, - non_slice_dims_shape: Tuple[int, int], - dtype: np.dtype, - available_memory: int, - **kwargs, - ) -> Tuple[int, np.dtype, Tuple[int, int]]: - return calc_max_slices(non_slice_dims_shape, - dtype, - available_memory, - **kwargs) - - return method(_calc_max_slices, - cpuonly, - cpugpu, - **others, - pattern="sinogram") - - -def method_proj( - calc_max_slices: MemorySinglePattern = calc_max_slices_single_pattern_default, - cpuonly=False, - cpugpu=False, - **others, -): - """ - Decorator for exported tomography methods, annotating the method with - a function to calculate the memory requirements as well as other properties. - - This is a convenience version for projection pattern. - - Parameters - ---------- - calc_max_slices: MemoryFunction, optional - Function to calculate how many slices can fit in the given GPU memory at max. - If not given, it assumes memory is only needed for input and output and no - temporaries is needed. - It is not used for CPU-only functions. - cpuonly: bool, optional - Marks the method as CPU only (default: False). - cpugpu: bool, optional - Marks the method as supporting both CPU and GPU input arrays (default: False). - **other: dict, optional - Any other keyword arguments will be added to the meta info as a dictionary. 
- """ - - def _calc_max_slices( - slice_dim: int, - non_slice_dims_shape: Tuple[int, int], - dtype: np.dtype, - available_memory: int, - **others, - ) -> Tuple[int, np.dtype, Tuple[int, int]]: - return calc_max_slices(non_slice_dims_shape, - dtype, - available_memory, - **others) - - return method(_calc_max_slices, - cpuonly, - cpugpu, - **others, - pattern="projection") - - -method_all = method diff --git a/httomolibgpu/prep/normalize.py b/httomolibgpu/prep/normalize.py index bea8ab3d..2c5b87f1 100644 --- a/httomolibgpu/prep/normalize.py +++ b/httomolibgpu/prep/normalize.py @@ -25,33 +25,9 @@ import numpy as np import nvtx from cupy import uint16, float32, mean -from httomolibgpu.decorator import method_proj __all__ = ["normalize"] - -def _normalize_max_slices( - non_slice_dims_shape: Tuple[int, int], - dtype: cp.dtype, available_memory: int, **kwargs -) -> Tuple[int, np.dtype, Tuple[int, int]]: - """Calculate the max chunk size it can fit in the available memory""" - - # normalize needs space to store the darks + flats and their means as a fixed cost - flats_mean_space = np.prod(non_slice_dims_shape) * float32().nbytes - darks_mean_space = np.prod(non_slice_dims_shape) * float32().nbytes - available_memory -= flats_mean_space + darks_mean_space - - # it also needs space for data input and output (we don't care about slice_dim) - # data: [x, 10, 20], dtype => other_dims = [10, 20] - in_slice_memory = np.prod(non_slice_dims_shape) * dtype.itemsize - out_slice_memory = np.prod(non_slice_dims_shape) * float32().nbytes - slice_memory = in_slice_memory + out_slice_memory - max_slices = available_memory // slice_memory # rounds down - - return (max_slices, float32(), non_slice_dims_shape) - - -@method_proj(calc_max_slices=_normalize_max_slices) @nvtx.annotate() def normalize( data: cp.ndarray, diff --git a/httomolibgpu/prep/phase.py b/httomolibgpu/prep/phase.py index dab052d7..79433c0f 100644 --- a/httomolibgpu/prep/phase.py +++ b/httomolibgpu/prep/phase.py @@ -29,7 +29,6 @@ import nvtx from httomolibgpu.cuda_kernels import load_cuda_module -from httomolibgpu.decorator import method_proj __all__ = [ "paganin_filter_savu", diff --git a/tests/__init__.py b/tests/__init__.py deleted file mode 100644 index 91b3c5a0..00000000 --- a/tests/__init__.py +++ /dev/null @@ -1,26 +0,0 @@ -import cupy as cp - -class MaxMemoryHook(cp.cuda.MemoryHook): - - def __init__(self, initial=0): - self.max_mem = initial - self.current = initial - - def malloc_postprocess(self, device_id: int, size: int, mem_size: int, mem_ptr: int, pmem_id: int): - self.current += mem_size - self.max_mem = max(self.max_mem, self.current) - - def free_postprocess(self, device_id: int, mem_size: int, mem_ptr: int, pmem_id: int): - self.current -= mem_size - - def alloc_preprocess(self, **kwargs): - pass - - def alloc_postprocess(self, device_id: int, mem_size: int, mem_ptr: int): - pass - - def free_preprocess(self, **kwargs): - pass - - def malloc_preprocess(self, **kwargs): - pass diff --git a/tests/test_decorator.py b/tests/test_decorator.py deleted file mode 100644 index e1acfecf..00000000 --- a/tests/test_decorator.py +++ /dev/null @@ -1,66 +0,0 @@ -from httomolibgpu import method_all, method_proj, method_sino, method_registry -import inspect -import numpy as np -from numpy import int32 - -def test_adds_metdata(): - @method_all( - calc_max_slices=lambda slice_dim, otherdims, dtype, available_memory, **kwargs: (available_memory - // dtype().itemsize - // np.prod(otherdims) - // 2, dtype) - ) - def myfunc(a: int) -> int: - return a**2 
- - assert myfunc.meta.method_name == "myfunc" - assert myfunc.meta.module == ["tests", "test_decorator"] - assert myfunc.meta.pattern == "all" - assert myfunc.meta.cpu is False - assert myfunc.meta.gpu is True - # last parameter '2' is mapped to the kwargs - assert myfunc.meta.calc_max_slices(0, - (10, 10), - int32, 40000, a=2) == (50, int32) - assert myfunc.__name__ == "myfunc" - assert inspect.getfullargspec(myfunc).args == ["a"] - assert myfunc(2) == 4 - - # also make sure it's in the method registry - assert method_registry["tests"]["test_decorator"]["myfunc"] == myfunc.meta - assert method_registry["tests"]["test_decorator"]["myfunc"](2) == 4 - - - -def test_metadata_sino(): - @method_sino(calc_max_slices=None) - def otherfunc(a: int): - pass - - assert otherfunc.meta.pattern == "sinogram" - - -def test_metadata_proj(): - @method_proj(calc_max_slices=None) - def otherfunc(a: int): - pass - - assert otherfunc.meta.pattern == "projection" - - -def test_metadata_cpu(): - @method_all(cpuonly=True) - def otherfunc(a: int): - pass - - assert otherfunc.meta.cpu is True - assert otherfunc.meta.gpu is False - - -def test_metadata_cpu_and_gpu(): - @method_all(cpugpu=True) - def otherfunc(a: int): - pass - - assert otherfunc.meta.cpu is True - assert otherfunc.meta.gpu is True \ No newline at end of file diff --git a/tests/test_misc/test_corr.py b/tests/test_misc/test_corr.py index cf0a432c..4217c57d 100644 --- a/tests/test_misc/test_corr.py +++ b/tests/test_misc/test_corr.py @@ -10,7 +10,6 @@ median_filter3d, remove_outlier3d, ) -from httomolibgpu import method_registry from numpy.testing import assert_allclose, assert_equal eps = 1e-6 diff --git a/tests/test_prep/test_alignment.py b/tests/test_prep/test_alignment.py index 9d4588e0..55e312c6 100644 --- a/tests/test_prep/test_alignment.py +++ b/tests/test_prep/test_alignment.py @@ -6,7 +6,6 @@ from httomolibgpu.prep.alignment import ( distortion_correction_proj_discorpy, ) -from httomolibgpu import method_registry from imageio.v2 import imread from numpy.testing import assert_allclose diff --git a/tests/test_prep/test_normalize.py b/tests/test_prep/test_normalize.py index 1ef42557..1505a0b8 100644 --- a/tests/test_prep/test_normalize.py +++ b/tests/test_prep/test_normalize.py @@ -6,7 +6,6 @@ from numpy.testing import assert_allclose, assert_equal from httomolibgpu.prep.normalize import normalize -from httomolibgpu import method_registry from cupy.cuda import nvtx diff --git a/tests/test_prep/test_phase.py b/tests/test_prep/test_phase.py index 4b84ba95..96deab6c 100644 --- a/tests/test_prep/test_phase.py +++ b/tests/test_prep/test_phase.py @@ -6,7 +6,6 @@ from cupy.cuda import nvtx from httomolibgpu.prep.phase import paganin_filter_savu, paganin_filter_tomopy from numpy.testing import assert_allclose -from httomolibgpu import method_registry eps = 1e-6 diff --git a/tests/test_prep/test_stripe.py b/tests/test_prep/test_stripe.py index 8a29ba4d..5377d4e6 100644 --- a/tests/test_prep/test_stripe.py +++ b/tests/test_prep/test_stripe.py @@ -8,7 +8,6 @@ remove_stripe_based_sorting, remove_stripe_ti, ) -from httomolibgpu import method_registry from numpy.testing import assert_allclose @cp.testing.gpu diff --git a/tests/test_recon/test_rotation.py b/tests/test_recon/test_rotation.py index 97f8ead3..17dadf40 100644 --- a/tests/test_recon/test_rotation.py +++ b/tests/test_recon/test_rotation.py @@ -8,7 +8,6 @@ import pytest from httomolibgpu.prep.normalize import normalize from httomolibgpu.recon.rotation import _calculate_chunks, find_center_360, 
find_center_vo
-from httomolibgpu import method_registry
 from numpy.testing import assert_allclose
 from .rotation_cpu_reference import find_center_360_numpy

From 6b86d3b765795e3f594d7456abacb1fe1be0c3c7 Mon Sep 17 00:00:00 2001
From: Daniil Kazantsev
Date: Thu, 28 Sep 2023 16:01:11 +0100
Subject: [PATCH 09/26] adding remove_all_stripe algorithm from tomocupy

---
 httomolibgpu/prep/stripe.py    | 238 ++++++++++++++++++++++++++++++++-
 tests/test_prep/test_stripe.py |  20 +++
 2 files changed, 255 insertions(+), 3 deletions(-)

diff --git a/httomolibgpu/prep/stripe.py b/httomolibgpu/prep/stripe.py
index 0438b211..5aebfb41 100644
--- a/httomolibgpu/prep/stripe.py
+++ b/httomolibgpu/prep/stripe.py
@@ -24,15 +24,17 @@
 import cupy as cp
 import numpy as np
 import nvtx
+from cupyx.scipy.ndimage import median_filter
+from cupyx.scipy import signal
+from cupyx.scipy.ndimage import binary_dilation
+from cupyx.scipy.ndimage import uniform_filter1d

 __all__ = [
     "remove_stripe_based_sorting",
     "remove_stripe_ti",
+    "remove_all_stripe",
 ]

-# TODO: port 'remove_all_stripe', 'remove_large_stripe' and 'remove_dead_stripe'
-# from https://github.com/tomopy/tomopy/blob/master/source/tomopy/prep/stripe.py
-
 @nvtx.annotate()
 def remove_stripe_based_sorting(
     data: Union[cp.ndarray, np.ndarray],
@@ -141,3 +143,233 @@ def remove_stripe_ti(
     data[:] += v
     return data
+
+
+######## Optimized version of Vo's all-stripe ring removal from TomoPy ########
+# This function is taken from the TomoCuPy package
+# *************************************************************************** #
+#                  Copyright © 2022, UChicago Argonne, LLC                    #
+#                           All Rights Reserved                               #
+#                         Software Name: Tomocupy                             #
+#                     By: Argonne National Laboratory                         #
+#                                                                             #
+#                           OPEN SOURCE LICENSE                               #
+#                                                                             #
+# Redistribution and use in source and binary forms, with or without          #
+# modification, are permitted provided that the following conditions are met: #
+#                                                                             #
+# 1. Redistributions of source code must retain the above copyright notice,  #
+#    this list of conditions and the following disclaimer.                    #
+# 2. Redistributions in binary form must reproduce the above copyright       #
+#    notice, this list of conditions and the following disclaimer in the     #
+#    documentation and/or other materials provided with the distribution.    #
+# 3. Neither the name of the copyright holder nor the names of its           #
+#    contributors may be used to endorse or promote products derived         #
+#    from this software without specific prior written permission.           #
+#                                                                             #
+#                                                                             #
+# *************************************************************************** #
+@nvtx.annotate()
+def remove_all_stripe(
+    data: cp.ndarray,
+    snr: float = 3.0,
+    la_size: int = 61,
+    sm_size: int = 21,
+    dim: int = 1,
+) -> cp.ndarray:
+    """
+    Remove all types of stripe artifacts from a sinogram using Nghia Vo's
+    approach :cite:`Vo:18` (a combination of algorithms 3, 4, 5, and 6).
+
+    Parameters
+    ----------
+    data : ndarray
+        3D tomographic data as a CuPy array.
+    snr : float, optional
+        Ratio used to locate large stripes.
+        Larger values are less sensitive.
+    la_size : int, optional
+        Window size of the median filter to remove large stripes.
+    sm_size : int, optional
+        Window size of the median filter to remove small-to-medium stripes.
+    dim : {1, 2}, optional
+        Dimension of the window.
+
+    Returns
+    -------
+    ndarray
+        Corrected 3D tomographic data as a CuPy array.
+
+    References
+    ----------
+    .. [1] https://doi.org/10.1364/OE.26.028396
+
+    """
+    matindex = _create_matindex(data.shape[2], data.shape[0])
+    for m in range(data.shape[1]):
+        sino = data[:, m, :]
+        sino = _rs_dead(sino, snr, la_size, matindex)
+        sino = _rs_sort2(sino, sm_size, matindex, dim)
+        data[:, m, :] = sino
+    return data
+
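A minimal usage sketch for the function above, mirroring the test added further down in this patch; `data`, `flats` and `darks` are assumed to be CuPy arrays of raw projections, flat fields and dark fields and are not defined here. Note that `remove_all_stripe` writes back into its input array, which is why a copy is passed:

import cupy as cp
from httomolibgpu.prep.normalize import normalize
from httomolibgpu.prep.stripe import remove_all_stripe

# flat/dark-field normalisation first, then stripe suppression sinogram by sinogram
normalised = normalize(data, flats, darks, cutoff=10, minus_log=True)
destriped = remove_all_stripe(cp.copy(normalised), snr=3.0, la_size=61, sm_size=21, dim=1)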
+@nvtx.annotate()
+def _rs_sort2(sinogram, size, matindex, dim):
+    """
+    Remove stripes using the sorting technique.
+    """
+    sinogram = cp.transpose(sinogram)
+    matcomb = cp.asarray(cp.dstack((matindex, sinogram)))
+
+    # matsort = cp.asarray([row[row[:, 1].argsort()] for row in matcomb])
+    ids = cp.argsort(matcomb[:, :, 1], axis=1)
+    matsort = matcomb.copy()
+    matsort[:, :, 0] = cp.take_along_axis(matsort[:, :, 0], ids, axis=1)
+    matsort[:, :, 1] = cp.take_along_axis(matsort[:, :, 1], ids, axis=1)
+    if dim == 1:
+        matsort[:, :, 1] = median_filter(matsort[:, :, 1], (size, 1))
+    else:
+        matsort[:, :, 1] = median_filter(matsort[:, :, 1], (size, size))
+
+    # matsortback = cp.asarray([row[row[:, 0].argsort()] for row in matsort])
+    ids = cp.argsort(matsort[:, :, 0], axis=1)
+    matsortback = matsort.copy()
+    matsortback[:, :, 0] = cp.take_along_axis(matsortback[:, :, 0], ids, axis=1)
+    matsortback[:, :, 1] = cp.take_along_axis(matsortback[:, :, 1], ids, axis=1)
+
+    sino_corrected = matsortback[:, :, 1]
+    return cp.transpose(sino_corrected)
+
+@nvtx.annotate()
+def _mpolyfit(x, y):
+    n = len(x)
+    x_mean = cp.mean(x)
+    y_mean = cp.mean(y)
+
+    Sxy = cp.sum(x * y) - n * x_mean * y_mean
+    Sxx = cp.sum(x * x) - n * x_mean * x_mean
+
+    slope = Sxy / Sxx
+    intercept = y_mean - slope * x_mean
+    return slope, intercept
+
+@nvtx.annotate()
+def _detect_stripe(listdata, snr):
+    """
+    Algorithm 4 in :cite:`Vo:18`. Used to locate stripes.
+    """
+    numdata = len(listdata)
+    listsorted = cp.sort(listdata)[::-1]
+    xlist = cp.arange(0, numdata, 1.0)
+    ndrop = cp.int16(0.25 * numdata)
+    # (_slope, _intercept) = cp.polyfit(xlist[ndrop:-ndrop - 1],
+    #                                   listsorted[ndrop:-ndrop - 1], 1)
+    (_slope, _intercept) = _mpolyfit(xlist[ndrop:-ndrop - 1], listsorted[ndrop:-ndrop - 1])
+
+    numt1 = _intercept + _slope * xlist[-1]
+    noiselevel = cp.abs(numt1 - _intercept)
+    noiselevel = cp.clip(noiselevel, 1e-6, None)
+    val1 = cp.abs(listsorted[0] - _intercept) / noiselevel
+    val2 = cp.abs(listsorted[-1] - numt1) / noiselevel
+    listmask = cp.zeros_like(listdata)
+    if val1 >= snr:
+        upper_thresh = _intercept + noiselevel * snr * 0.5
+        listmask[listdata > upper_thresh] = 1.0
+    if val2 >= snr:
+        lower_thresh = numt1 - noiselevel * snr * 0.5
+        listmask[listdata <= lower_thresh] = 1.0
+    return listmask
+
+@nvtx.annotate()
+def _rs_large(sinogram, snr, size, matindex, drop_ratio=0.1, norm=True):
+    """
+    Remove large stripes.
+ """ + drop_ratio = max(min(drop_ratio,0.8),0)# = cp.clip(drop_ratio, 0.0, 0.8) + (nrow, ncol) = sinogram.shape + ndrop = int(0.5 * drop_ratio * nrow) + sinosort = cp.sort(sinogram, axis=0) + sinosmooth = median_filter(sinosort, (1, size)) + list1 = cp.mean(sinosort[ndrop:nrow - ndrop], axis=0) + list2 = cp.mean(sinosmooth[ndrop:nrow - ndrop], axis=0) + # listfact = cp.divide(list1, + # list2, + # out=cp.ones_like(list1), + # where=list2 != 0) + + listfact = list1/list2 + + # Locate stripes + listmask = _detect_stripe(listfact, snr) + listmask = binary_dilation(listmask, iterations=1).astype(listmask.dtype) + matfact = cp.tile(listfact, (nrow, 1)) + # Normalize + if norm is True: + sinogram = sinogram / matfact + sinogram1 = cp.transpose(sinogram) + matcombine = cp.asarray(cp.dstack((matindex, sinogram1))) + + # matsort = cp.asarray([row[row[:, 1].argsort()] for row in matcombine]) + ids = cp.argsort(matcombine[:,:,1],axis=1) + matsort = matcombine.copy() + matsort[:,:,0] = cp.take_along_axis(matsort[:,:,0],ids,axis=1) + matsort[:,:,1] = cp.take_along_axis(matsort[:,:,1],ids,axis=1) + + matsort[:, :, 1] = cp.transpose(sinosmooth) + # matsortback = cp.asarray([row[row[:, 0].argsort()] for row in matsort]) + ids = cp.argsort(matsort[:,:,0],axis=1) + matsortback = matsort.copy() + matsortback[:,:,0] = cp.take_along_axis(matsortback[:,:,0],ids,axis=1) + matsortback[:,:,1] = cp.take_along_axis(matsortback[:,:,1],ids,axis=1) + + sino_corrected = cp.transpose(matsortback[:, :, 1]) + listxmiss = cp.where(listmask > 0.0)[0] + sinogram[:, listxmiss] = sino_corrected[:, listxmiss] + return sinogram + +@nvtx.annotate() +def _rs_dead(sinogram, snr, size, matindex, norm=True): + """ + Remove unresponsive and fluctuating stripes. + """ + sinogram = cp.copy(sinogram) # Make it mutable + (nrow, _) = sinogram.shape + # sinosmooth = cp.apply_along_axis(uniform_filter1d, 0, sinogram, 10) + sinosmooth = uniform_filter1d(sinogram, 10, axis=0) + + listdiff = cp.sum(cp.abs(sinogram - sinosmooth), axis=0) + listdiffbck = median_filter(listdiff, size) + + + listfact = listdiff/listdiffbck + + listmask = _detect_stripe(listfact, snr) + listmask = binary_dilation(listmask, iterations=1).astype(listmask.dtype) + listmask[0:2] = 0.0 + listmask[-2:] = 0.0 + listx = cp.where(listmask < 1.0)[0] + listy = cp.arange(nrow) + matz = sinogram[:, listx] + + listxmiss = cp.where(listmask > 0.0)[0] + + # finter = interpolate.interp2d(listx.get(), listy.get(), matz.get(), kind='linear') + if len(listxmiss) > 0: + # sinogram_c[:, listxmiss.get()] = finter(listxmiss.get(), listy.get()) + ids = cp.searchsorted(listx, listxmiss) + sinogram[:,listxmiss] = matz[:,ids-1]+(listxmiss-listx[ids-1])*(matz[:,ids]-matz[:,ids-1])/(listx[ids]-listx[ids-1]) + + # Remove residual stripes + if norm is True: + sinogram = _rs_large(sinogram, snr, size, matindex) + return sinogram + +@nvtx.annotate() +def _create_matindex(nrow, ncol): + """ + Create a 2D array of indexes used for the sorting technique. 
+ """ + listindex = cp.arange(0.0, ncol, 1.0) + matindex = cp.tile(listindex, (nrow, 1)) + return matindex \ No newline at end of file diff --git a/tests/test_prep/test_stripe.py b/tests/test_prep/test_stripe.py index 5377d4e6..847316ad 100644 --- a/tests/test_prep/test_stripe.py +++ b/tests/test_prep/test_stripe.py @@ -7,6 +7,7 @@ from httomolibgpu.prep.stripe import ( remove_stripe_based_sorting, remove_stripe_ti, + remove_all_stripe, ) from numpy.testing import assert_allclose @@ -121,3 +122,22 @@ def test_remove_stripe_ti_performance(ensure_clean_memory): duration_ms = float(time.perf_counter_ns() - start) * 1e-6 / 10 assert "performance in ms" == duration_ms + + +@cp.testing.gpu +def test_remove_all_stripe_on_data(data, flats, darks): + # --- testing the CuPy implementation from TomoCupy ---# + data = normalize(data, flats, darks, cutoff=10, minus_log=True) + + data_after_stripe_removal = remove_all_stripe(cp.copy(data)).get() + + assert_allclose(np.mean(data_after_stripe_removal), 0.266914, rtol=1e-05) + assert_allclose( + np.mean(data_after_stripe_removal, axis=(1, 2)).sum(), 48.04459, rtol=1e-06 + ) + assert_allclose(np.median(data_after_stripe_removal), 0.015338, rtol=1e-04) + assert_allclose(np.max(data_after_stripe_removal), 2.298123, rtol=1e-05) + + data = None #: free up GPU memory + # make sure the output is float32 + assert data_after_stripe_removal.dtype == np.float32 \ No newline at end of file From c229785289fabee740ecedcc3be1ead171ea5e37 Mon Sep 17 00:00:00 2001 From: algol Date: Thu, 14 Dec 2023 21:58:41 +0000 Subject: [PATCH 10/26] adds labeling to reconstructors, fixing tests --- httomolibgpu/recon/algorithm.py | 142 +++++++++++++++-------------- tests/test_misc/test_corr.py | 10 -- tests/test_misc/test_morph.py | 3 - tests/test_prep/test_alignment.py | 2 - tests/test_prep/test_normalize.py | 3 - tests/test_prep/test_phase.py | 11 --- tests/test_prep/test_stripe.py | 7 -- tests/test_recon/test_algorithm.py | 9 +- tests/test_recon/test_rotation.py | 11 --- 9 files changed, 77 insertions(+), 121 deletions(-) diff --git a/httomolibgpu/recon/algorithm.py b/httomolibgpu/recon/algorithm.py index 3f290fda..e1c6f20d 100644 --- a/httomolibgpu/recon/algorithm.py +++ b/httomolibgpu/recon/algorithm.py @@ -22,6 +22,8 @@ from typing import Optional, Tuple, Union +from typing import Type + import cupy as cp from cupy import float32, complex64 import cupyx @@ -30,25 +32,15 @@ from httomolibgpu.cuda_kernels import load_cuda_module +from tomobar.methodsDIR_CuPy import RecToolsDIRCuPy +from tomobar.methodsIR_CuPy import RecToolsIRCuPy + __all__ = [ "FBP", "SIRT", "CGLS", ] -def _apply_circular_mask(data, recon_mask_radius): - - recon_size = data.shape[1] - Y, X = cp.ogrid[:recon_size, :recon_size] - half_size = recon_size//2 - dist_from_center = cp.sqrt((X - half_size)**2 + (Y-half_size)**2) - if recon_mask_radius <= 1.0: - mask = dist_from_center <= half_size - abs(half_size - half_size/recon_mask_radius) - else: - mask = dist_from_center <= half_size + abs(half_size - half_size/recon_mask_radius) - return data*mask - - ## %%%%%%%%%%%%%%%%%%%%%%% FBP reconstruction %%%%%%%%%%%%%%%%%%%%%%%%%%%% ## @nvtx.annotate() def FBP( @@ -74,7 +66,7 @@ def FBP( recon_size : int, optional The [recon_size, recon_size] shape of the reconstructed slice in pixels. By default (None), the reconstructed size will be the dimension of the horizontal detector. 
- recon_mask_radius: int, optional + recon_mask_radius: float, optional The radius of the circular mask that applies to the reconstructed slice in order to crop out some undesirable artefacts. The values outside the diameter will be set to zero. None by default, to see the effect of the mask try setting the value in the range [0.7-1.0]. @@ -86,24 +78,10 @@ def FBP( cp.ndarray The FBP reconstructed volume as a CuPy array. """ - from tomobar.methodsDIR_CuPy import RecToolsDIRCuPy + RecToolsCP = _instantiate_direct_recon_class(data, angles, center, recon_size, gpu_id) - if center is None: - center = data.shape[2] // 2 # making a crude guess - if recon_size is None: - recon_size = data.shape[2] - RecToolsCP = RecToolsDIRCuPy(DetectorsDimH=data.shape[2], # Horizontal detector dimension - DetectorsDimV=data.shape[1], # Vertical detector dimension (3D case) - CenterRotOffset=data.shape[2] / 2 - center - 0.5, # Center of Rotation scalar or a vector - AnglesVec=-angles, # A vector of projection angles in radians - ObjSize=recon_size, # Reconstructed object dimensions (scalar) - device_projector=gpu_id, - ) - reconstruction = RecToolsCP.FBP3D(data) + reconstruction = RecToolsCP.FBP(data, recon_mask_radius=recon_mask_radius) cp._default_memory_pool.free_all_blocks() - # perform masking to the result of reconstruction if needed - if recon_mask_radius is not None: - reconstruction = _apply_circular_mask(reconstruction, recon_mask_radius) return cp.swapaxes(reconstruction,0,1) ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% ## @@ -117,7 +95,6 @@ def SIRT( recon_size: Optional[int] = None, iterations: Optional[int] = 300, nonnegativity: Optional[bool] = True, - recon_mask_radius: Optional[float] = None, gpu_id: int = 0, ) -> cp.ndarray: """ @@ -139,10 +116,6 @@ def SIRT( The number of SIRT iterations. nonnegativity : bool, optional Impose nonnegativity constraint on reconstructed image. - recon_mask_radius: int, optional - The radius of the circular mask that applies to the reconstructed slice in order to crop - out some undesirable artefacts. The values outside the diameter will be set to zero. - None by default, to see the effect of the mask try setting the value in the range [0.7-1.0]. gpu_id : int, optional A GPU device index to perform operation on. @@ -151,27 +124,13 @@ def SIRT( cp.ndarray The SIRT reconstructed volume as a CuPy array. 
""" - from tomobar.methodsIR_CuPy import RecToolsIRCuPy - if center is None: - center = data.shape[2] // 2 # making a crude guess - if recon_size is None: - recon_size = data.shape[2] - - RecToolsCP = RecToolsIRCuPy(DetectorsDimH=data.shape[2], # Horizontal detector dimension - DetectorsDimV=data.shape[1], # Vertical detector dimension (3D case) - CenterRotOffset=data.shape[2] / 2 - center - 0.5, # Center of Rotation scalar or a vector - AnglesVec=-angles, # A vector of projection angles in radians - ObjSize=recon_size, # Reconstructed object dimensions (scalar) - device_projector=gpu_id, - ) + RecToolsCP = _instantiate_iterative_recon_class(data, angles, center, recon_size, gpu_id, datafidelity='LS') + _data_ = {"projection_norm_data": data} # data dictionary _algorithm_ = {"iterations": iterations, "nonnegativity": nonnegativity} reconstruction = RecToolsCP.SIRT(_data_, _algorithm_) cp._default_memory_pool.free_all_blocks() - # perform masking to the result of reconstruction if needed - if recon_mask_radius is not None: - reconstruction = _apply_circular_mask(reconstruction, recon_mask_radius) return cp.swapaxes(reconstruction,0,1) ## %%%%%%%%%%%%%%%%%%%%%%% CGLS reconstruction %%%%%%%%%%%%%%%%%%%%%%%%%%%% ## @@ -183,7 +142,6 @@ def CGLS( recon_size: Optional[int] = None, iterations: Optional[int] = 20, nonnegativity: Optional[bool] = True, - recon_mask_radius: Optional[float] = None, gpu_id: int = 0, ) -> cp.ndarray: """ @@ -205,10 +163,6 @@ def CGLS( The number of CGLS iterations. nonnegativity : bool, optional Impose nonnegativity constraint on reconstructed image. - recon_mask_radius: int, optional - The radius of the circular mask that applies to the reconstructed slice in order to crop - out some undesirable artefacts. The values outside the diameter will be set to zero. - None by default, to see the effect of the mask try setting the value in the range [0.7-1.0]. gpu_id : int, optional A GPU device index to perform operation on. @@ -217,26 +171,80 @@ def CGLS( cp.ndarray The CGLS reconstructed volume as a CuPy array. """ - from tomobar.methodsIR_CuPy import RecToolsIRCuPy + RecToolsCP = _instantiate_iterative_recon_class(data, angles, center, recon_size, gpu_id, datafidelity='LS') + + _data_ = {"projection_norm_data": data} # data dictionary + _algorithm_ = {"iterations": iterations, "nonnegativity": nonnegativity} + reconstruction = RecToolsCP.CGLS(_data_, _algorithm_) + cp._default_memory_pool.free_all_blocks() + return cp.swapaxes(reconstruction,0,1) +## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% ## + +def _instantiate_direct_recon_class(data: cp.ndarray, + angles: np.ndarray, + center: Optional[float] = None, + recon_size: Optional[int] = None, + gpu_id: int = 0) -> type[RecToolsDIRCuPy]: + """instantiate ToMoBAR's direct recon class + + Args: + data (cp.ndarray): data array + angles (np.ndarray): angles + center (Optional[float], optional): center of recon. Defaults to None. + recon_size (Optional[int], optional): recon_size. Defaults to None. + gpu_id (int, optional): gpu ID. Defaults to 0. 
+def _instantiate_iterative_recon_class(data: cp.ndarray,
+                                       angles: np.ndarray,
+                                       center: Optional[float] = None,
+                                       recon_size: Optional[int] = None,
+                                       gpu_id: int = 0,
+                                       datafidelity: str = 'LS') -> RecToolsIRCuPy:
+    """Instantiate ToMoBAR's iterative reconstruction class.
+
+    Args:
+        data (cp.ndarray): data array
+        angles (np.ndarray): angles
+        center (Optional[float], optional): center of recon. Defaults to None.
+        recon_size (Optional[int], optional): recon_size. Defaults to None.
+        gpu_id (int, optional): gpu ID. Defaults to 0.
+        datafidelity (str, optional): data fidelity type. Defaults to 'LS'.
+
+    Returns:
+        RecToolsIRCuPy: an instance of the class
+    """
+    input_data_axis_labels = ["angles", "detY", "detX"]  # set the labels of the input data
     if center is None:
         center = data.shape[2] // 2  # making a crude guess
     if recon_size is None:
         recon_size = data.shape[2]
-
     RecToolsCP = RecToolsIRCuPy(DetectorsDimH=data.shape[2],  # Horizontal detector dimension
                                 DetectorsDimV=data.shape[1],  # Vertical detector dimension (3D case)
                                 CenterRotOffset=data.shape[2] / 2 - center - 0.5,  # Center of Rotation scalar or a vector
                                 AnglesVec=-angles,  # A vector of projection angles in radians
                                 ObjSize=recon_size,  # Reconstructed object dimensions (scalar)
+                                datafidelity=datafidelity,
                                 device_projector=gpu_id,
+                                data_axis_labels=input_data_axis_labels,
                                 )
-    _data_ = {"projection_norm_data": data}  # data dictionary
-    _algorithm_ = {"iterations": iterations, "nonnegativity": nonnegativity}
-    reconstruction = RecToolsCP.CGLS(_data_, _algorithm_)
-    cp._default_memory_pool.free_all_blocks()
-    # perform masking to the result of reconstruction if needed
-    if recon_mask_radius is not None:
-        reconstruction = _apply_circular_mask(reconstruction, recon_mask_radius)
-    return cp.swapaxes(reconstruction,0,1)
-## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% ##
\ No newline at end of file
+    return RecToolsCP
\ No newline at end of file
diff --git a/tests/test_misc/test_corr.py b/tests/test_misc/test_corr.py
index 4217c57d..c377dc48 100644
--- a/tests/test_misc/test_corr.py
+++ b/tests/test_misc/test_corr.py
@@ -15,7 +15,6 @@
 eps = 1e-6

-@cp.testing.gpu
 def test_median_filter3d_vs_scipy_on_arange(ensure_clean_memory):
     mat = np.arange(4 * 5 * 6).reshape(4, 5, 6)
     assert_equal(
@@ -24,7 +23,6 @@
     )

-@cp.testing.gpu
 def test_median_filter3d_vs_scipy(host_data, ensure_clean_memory):
     assert_equal(
         scipy.ndimage.median_filter(np.float32(host_data), size=3),
@@ -44,33 +42,28 @@ def test_scipy_median_filter_benchmark(data, ensure_clean_memory, benchmark, siz
     benchmark(median_filter_cupy, data.astype(cp.float32), size=size)

-@cp.testing.gpu
 def test_median_filter3d_1D_raises(ensure_clean_memory):
     _data =
cp.ones(10) with pytest.raises(ValueError): median_filter3d(_data, kernel_size=3) -@cp.testing.gpu def test_median_filter3d_zero_dim(ensure_clean_memory): _data = cp.ones(shape=(10, 10, 0)) * 100 with pytest.raises(ValueError): median_filter3d(_data, kernel_size=3) -@cp.testing.gpu def test_median_filter3d_even_kernel_size(data): with pytest.raises(ValueError): median_filter3d(data, kernel_size=4) -@cp.testing.gpu def test_median_filter3d_wrong_dtype(data): with pytest.raises(ValueError): median_filter3d(data.astype(cp.float64), kernel_size=3) -@cp.testing.gpu def test_median_filter3d(data): filtered_data = median_filter3d(data, kernel_size=3).get() @@ -87,8 +80,6 @@ def test_median_filter3d(data): == np.float32 ) - -@cp.testing.gpu @pytest.mark.perf def test_median_filter3d_performance(ensure_clean_memory): dev = cp.cuda.Device() @@ -110,7 +101,6 @@ def test_median_filter3d_performance(ensure_clean_memory): assert "performance in ms" == duration_ms -@cp.testing.gpu def test_remove_outlier3d(data): filtered_data = remove_outlier3d(data, kernel_size=3, dif=1.5).get() diff --git a/tests/test_misc/test_morph.py b/tests/test_misc/test_morph.py index 1a90d8f6..c8e8ed25 100644 --- a/tests/test_misc/test_morph.py +++ b/tests/test_misc/test_morph.py @@ -17,7 +17,6 @@ (0, ""), ], ) -@cp.testing.gpu def test_sino_360_to_180_invalid(ensure_clean_memory, overlap, rotation): data = cp.ones((10, 10, 10), dtype=cp.float32) @@ -26,7 +25,6 @@ def test_sino_360_to_180_invalid(ensure_clean_memory, overlap, rotation): @pytest.mark.parametrize("shape", [(10,), (10, 10)]) -@cp.testing.gpu def test_sino_360_to_180_wrong_dims(ensure_clean_memory, shape): with pytest.raises(ValueError): sino_360_to_180(cp.ones(shape, dtype=cp.float32)) @@ -34,7 +32,6 @@ def test_sino_360_to_180_wrong_dims(ensure_clean_memory, shape): @pytest.mark.parametrize("rotation", ["left", "right"]) @pytest.mark.perf -@cp.testing.gpu def test_sino_360_to_180_performance(ensure_clean_memory, rotation): # as this is just an index shuffling operation, the data itself doesn't matter data = cp.ones((1801, 400, 2560), dtype=np.float32) diff --git a/tests/test_prep/test_alignment.py b/tests/test_prep/test_alignment.py index 55e312c6..54d11cf7 100644 --- a/tests/test_prep/test_alignment.py +++ b/tests/test_prep/test_alignment.py @@ -9,8 +9,6 @@ from imageio.v2 import imread from numpy.testing import assert_allclose - -@cp.testing.gpu @pytest.mark.parametrize( "image, max_value, mean_value", [ diff --git a/tests/test_prep/test_normalize.py b/tests/test_prep/test_normalize.py index 1505a0b8..b9d57c65 100644 --- a/tests/test_prep/test_normalize.py +++ b/tests/test_prep/test_normalize.py @@ -9,7 +9,6 @@ from cupy.cuda import nvtx -@cp.testing.gpu def test_normalize_1D_raises(data, flats, darks, ensure_clean_memory): _data_1d = cp.ones(10) @@ -21,7 +20,6 @@ def test_normalize_1D_raises(data, flats, darks, ensure_clean_memory): with pytest.raises(ValueError): normalize(data, _data_1d, darks) -@cp.testing.gpu def test_normalize(data, flats, darks, ensure_clean_memory): # --- testing normalize ---# data_normalize = normalize(cp.copy(data), flats, darks, minus_log=True).get() @@ -34,7 +32,6 @@ def test_normalize(data, flats, darks, ensure_clean_memory): assert_allclose(np.std(data_normalize), 0.524382, rtol=1e-06) -@cp.testing.gpu @pytest.mark.perf def test_normalize_performance(ensure_clean_memory): # Note: low/high and size values taken from sample2_medium.yaml real run diff --git a/tests/test_prep/test_phase.py b/tests/test_prep/test_phase.py index 
96deab6c..89f26649 100644 --- a/tests/test_prep/test_phase.py +++ b/tests/test_prep/test_phase.py @@ -9,7 +9,6 @@ eps = 1e-6 -@cp.testing.gpu def test_paganin_savu_filter(data): # --- testing the Paganin filter on tomo_standard ---# filtered_data = paganin_filter_savu(data).get() @@ -21,7 +20,6 @@ def test_paganin_savu_filter(data): #: make sure the output is float32 assert filtered_data.dtype == np.float32 -@cp.testing.gpu def test_paganin_filter_savu_energy100(data): filtered_data = paganin_filter_savu(data, energy=100.0).get() @@ -32,7 +30,6 @@ def test_paganin_filter_savu_energy100(data): assert filtered_data.dtype == np.float32 -@cp.testing.gpu def test_paganin_filter_savu_padmean(data): filtered_data = paganin_filter_savu(data, pad_method="mean").get() @@ -55,7 +52,6 @@ def test_paganin_filter_savu_padmean(data): ) -@cp.testing.gpu @pytest.mark.perf def test_paganin_filter_savu_performance(ensure_clean_memory): # Note: low/high and size values taken from sample2_medium.yaml real run @@ -109,7 +105,6 @@ def test_paganin_filter_savu_performance(ensure_clean_memory): assert "performance in ms" == duration_ms -@cp.testing.gpu def test_paganin_filter_savu_1D_raises(ensure_clean_memory): _data = cp.ones(10) with pytest.raises(ValueError): @@ -118,7 +113,6 @@ def test_paganin_filter_savu_1D_raises(ensure_clean_memory): _data = None #: free up GPU memory # paganin filter tomopy -@cp.testing.gpu def test_paganin_filter_tomopy_1D_raises(ensure_clean_memory): _data = cp.ones(10) with pytest.raises(ValueError): @@ -126,7 +120,6 @@ def test_paganin_filter_tomopy_1D_raises(ensure_clean_memory): _data = None #: free up GPU memory -@cp.testing.gpu def test_paganin_filter_tomopy(data): # --- testing the Paganin filter from TomoPy on tomo_standard ---# filtered_data = paganin_filter_tomopy(data).get() @@ -138,8 +131,6 @@ def test_paganin_filter_tomopy(data): #: make sure the output is float32 assert filtered_data.dtype == np.float32 - -@cp.testing.gpu def test_paganin_filter_tomopy_energy100(data): filtered_data = paganin_filter_tomopy(data, energy=100.0).get() @@ -150,7 +141,6 @@ def test_paganin_filter_tomopy_energy100(data): assert filtered_data.dtype == np.float32 -@cp.testing.gpu def test_paganin_filter_tomopy_dist75(data): filtered_data = paganin_filter_tomopy(data, dist=75.0, alpha=1e-6).get() @@ -160,7 +150,6 @@ def test_paganin_filter_tomopy_dist75(data): assert_allclose(np.sum(filtered_data[50:100, 40, 1]), -343.5908, rtol=1e-6) -@cp.testing.gpu @pytest.mark.perf def test_paganin_filter_tomopy_performance(ensure_clean_memory): # Note: low/high and size values taken from sample2_medium.yaml real run diff --git a/tests/test_prep/test_stripe.py b/tests/test_prep/test_stripe.py index 847316ad..5ee3a59d 100644 --- a/tests/test_prep/test_stripe.py +++ b/tests/test_prep/test_stripe.py @@ -11,7 +11,6 @@ ) from numpy.testing import assert_allclose -@cp.testing.gpu def test_remove_stripe_ti_on_data(data, flats, darks): # --- testing the CuPy implementation from TomoCupy ---# data = normalize(data, flats, darks, cutoff=10, minus_log=True) @@ -36,7 +35,6 @@ def test_remove_stripe_ti_on_flats(host_flats): assert_allclose(np.mean(corrected_data, axis=(1, 2)).sum(), 19531.168945, rtol=1e-7) assert_allclose(np.median(corrected_data), 976.0, rtol=1e-7) -@cp.testing.gpu def test_remove_stripe_ti_numpy_vs_cupy_on_random_data(): host_data = np.random.random_sample(size=(181, 5, 256)).astype(np.float32) * 2.0 corrected_host_data = remove_stripe_ti(np.copy(host_data)) @@ -49,7 +47,6 @@ def 
test_remove_stripe_ti_numpy_vs_cupy_on_random_data(): np.median(corrected_data), np.median(corrected_host_data), rtol=1e-6 ) -@cp.testing.gpu def test_stripe_removal_sorting_cupy(data, flats, darks): # --- testing the CuPy port of TomoPy's implementation ---# data = normalize(data, flats, darks, cutoff=10, minus_log=True) @@ -63,7 +60,6 @@ def test_stripe_removal_sorting_cupy(data, flats, darks): # make sure the output is float32 assert corrected_data.dtype == np.float32 -@cp.testing.gpu @cp.testing.numpy_cupy_allclose(rtol=1e-6) def test_stripe_removal_sorting_numpy_vs_cupy_on_random_data(ensure_clean_memory, xp): np.random.seed(12345) @@ -72,7 +68,6 @@ def test_stripe_removal_sorting_numpy_vs_cupy_on_random_data(ensure_clean_memory return xp.asarray(remove_stripe_based_sorting(data)) -@cp.testing.gpu @pytest.mark.perf def test_stripe_removal_sorting_cupy_performance(ensure_clean_memory): data_host = ( @@ -98,7 +93,6 @@ def test_stripe_removal_sorting_cupy_performance(ensure_clean_memory): assert "performance in ms" == duration_ms -@cp.testing.gpu @pytest.mark.perf def test_remove_stripe_ti_performance(ensure_clean_memory): data_host = ( @@ -124,7 +118,6 @@ def test_remove_stripe_ti_performance(ensure_clean_memory): assert "performance in ms" == duration_ms -@cp.testing.gpu def test_remove_all_stripe_on_data(data, flats, darks): # --- testing the CuPy implementation from TomoCupy ---# data = normalize(data, flats, darks, cutoff=10, minus_log=True) diff --git a/tests/test_recon/test_algorithm.py b/tests/test_recon/test_algorithm.py index 6a3b7d84..e8d7f7d9 100644 --- a/tests/test_recon/test_algorithm.py +++ b/tests/test_recon/test_algorithm.py @@ -12,7 +12,6 @@ from cupy.cuda import nvtx -@cp.testing.gpu def test_reconstruct_FBP_1(data, flats, darks, ensure_clean_memory): recon_data = FBP( normalize_cupy(data, flats, darks, cutoff=10, minus_log=True), @@ -28,7 +27,6 @@ def test_reconstruct_FBP_1(data, flats, darks, ensure_clean_memory): assert recon_data.shape == (160, 128, 160) -@cp.testing.gpu def test_reconstruct_FBP_2(data, flats, darks, ensure_clean_memory): recon_data = FBP( normalize_cupy(data, flats, darks, cutoff=20.5, minus_log=False), @@ -45,7 +43,6 @@ def test_reconstruct_FBP_2(data, flats, darks, ensure_clean_memory): assert recon_data.dtype == np.float32 -@cp.testing.gpu def test_reconstruct_FBP_3(data, flats, darks, ensure_clean_memory): recon_data = FBP( normalize_cupy(data, flats, darks, cutoff=20.5, minus_log=False), @@ -64,7 +61,7 @@ def test_reconstruct_FBP_3(data, flats, darks, ensure_clean_memory): assert recon_data.dtype == np.float32 assert recon_data.shape == (210, 128, 210) -@cp.testing.gpu + def test_reconstruct_SIRT(data, flats, darks, ensure_clean_memory): objrecon_size = data.shape[2] recon_data = SIRT( @@ -80,7 +77,7 @@ def test_reconstruct_SIRT(data, flats, darks, ensure_clean_memory): assert_allclose(np.mean(recon_data, axis=(0, 2)).sum(), 0.23612846, rtol=1e-05) assert recon_data.dtype == np.float32 -@cp.testing.gpu + def test_reconstruct_CGLS(data, flats, darks, ensure_clean_memory): objrecon_size = data.shape[2] recon_data = CGLS( @@ -96,8 +93,6 @@ def test_reconstruct_CGLS(data, flats, darks, ensure_clean_memory): assert_allclose(np.mean(recon_data, axis=(0, 2)).sum(), 0.279187, rtol=1e-03) assert recon_data.dtype == np.float32 - -@cp.testing.gpu @pytest.mark.perf def test_FBP_performance(ensure_clean_memory): dev = cp.cuda.Device() diff --git a/tests/test_recon/test_rotation.py b/tests/test_recon/test_rotation.py index 17dadf40..ba5e22b5 100644 --- 
a/tests/test_recon/test_rotation.py +++ b/tests/test_recon/test_rotation.py @@ -11,8 +11,6 @@ from numpy.testing import assert_allclose from .rotation_cpu_reference import find_center_360_numpy - -@cp.testing.gpu def test_find_center_vo(data, flats, darks): data = normalize(data, flats, darks) @@ -25,8 +23,6 @@ def test_find_center_vo(data, flats, darks): #: Check that we only get a float32 output assert cor.dtype == np.float32 - -@cp.testing.gpu def test_find_center_vo_ones(ensure_clean_memory): mat = cp.ones(shape=(103, 450, 230), dtype=cp.float32) cor = find_center_vo(mat) @@ -35,7 +31,6 @@ def test_find_center_vo_ones(ensure_clean_memory): mat = None #: free up GPU memory -@cp.testing.gpu def test_find_center_vo_random(ensure_clean_memory): np.random.seed(12345) data_host = np.random.random_sample(size=(900, 1, 1280)).astype(np.float32) * 2.0 @@ -43,7 +38,6 @@ def test_find_center_vo_random(ensure_clean_memory): cent = find_center_vo(data) assert_allclose(cent, 680.75) -@cp.testing.gpu def test_find_center_vo_calculate_chunks(): # we need the split to fit into the available memory, and also make sure # that the last chunk is either the same or smaller than the previous ones @@ -71,7 +65,6 @@ def test_find_center_vo_calculate_chunks(): np.testing.assert_array_equal(diffs[:-1], diffs[0]) assert diffs[-1] <= diffs[0] -@cp.testing.gpu @pytest.mark.perf def test_find_center_vo_performance(): dev = cp.cuda.Device() @@ -92,7 +85,6 @@ def test_find_center_vo_performance(): assert "performance in ms" == duration_ms -@cp.testing.gpu def test_find_center_360_ones(): mat = cp.ones(shape=(100, 100, 100), dtype=cp.float32) @@ -104,7 +96,6 @@ def test_find_center_360_ones(): assert_allclose(overlap_position, 7.0) -@cp.testing.gpu def test_find_center_360_data(data): eps = 1e-5 (cor, overlap, side, overlap_pos) = find_center_360(data, norm=True, denoise=False) @@ -114,7 +105,6 @@ def test_find_center_360_data(data): assert side == 1 assert_allclose(overlap_pos, 111.906334, rtol=eps) -@cp.testing.gpu def test_find_center_360_1D_raises(data): #: 360-degree sinogram must be a 3d array with pytest.raises(ValueError): @@ -124,7 +114,6 @@ def test_find_center_360_1D_raises(data): find_center_360(cp.ones(10)) -@cp.testing.gpu @pytest.mark.parametrize("norm", [False, True], ids=["no_normalise", "normalise"]) @pytest.mark.parametrize("overlap", [False, True], ids=["no_overlap", "overlap"]) @pytest.mark.parametrize("denoise", [False, True], ids=["no_denoise", "denoise"]) From e1292fd7ff7af2a7e1287754db84a28462710111 Mon Sep 17 00:00:00 2001 From: dkazanc Date: Wed, 10 Jan 2024 16:44:05 +0000 Subject: [PATCH 11/26] correction to fit tomobar 2024.01 version --- httomolibgpu/recon/algorithm.py | 57 +++++++++++++++++---------------- 1 file changed, 30 insertions(+), 27 deletions(-) diff --git a/httomolibgpu/recon/algorithm.py b/httomolibgpu/recon/algorithm.py index e1c6f20d..2a7a37fc 100644 --- a/httomolibgpu/recon/algorithm.py +++ b/httomolibgpu/recon/algorithm.py @@ -16,7 +16,7 @@ # limitations under the License. 
# --------------------------------------------------------------------------- # Created By : Tomography Team at DLS -# Created Date: 01 November 2022 +# Changes relative to ToMoBAR 2024.01 version # --------------------------------------------------------------------------- """Module for tomographic reconstruction""" @@ -30,8 +30,6 @@ import numpy as np import nvtx -from httomolibgpu.cuda_kernels import load_cuda_module - from tomobar.methodsDIR_CuPy import RecToolsDIRCuPy from tomobar.methodsIR_CuPy import RecToolsIRCuPy @@ -41,6 +39,8 @@ "CGLS", ] +input_data_axis_labels=["angles", "detY", "detX"] # set the labels of the input data + ## %%%%%%%%%%%%%%%%%%%%%%% FBP reconstruction %%%%%%%%%%%%%%%%%%%%%%%%%%%% ## @nvtx.annotate() def FBP( @@ -78,9 +78,11 @@ def FBP( cp.ndarray The FBP reconstructed volume as a CuPy array. """ - RecToolsCP = _instantiate_direct_recon_class(data, angles, center, recon_size, gpu_id) + RecToolsCP = _instantiate_direct_recon_class(data, angles, center, recon_size, gpu_id) - reconstruction = RecToolsCP.FBP(data, recon_mask_radius=recon_mask_radius) + reconstruction = RecToolsCP.FBP(data, + recon_mask_radius=recon_mask_radius, + data_axes_labels_order=input_data_axis_labels) cp._default_memory_pool.free_all_blocks() return cp.swapaxes(reconstruction,0,1) @@ -127,8 +129,12 @@ def SIRT( RecToolsCP = _instantiate_iterative_recon_class(data, angles, center, recon_size, gpu_id, datafidelity='LS') - _data_ = {"projection_norm_data": data} # data dictionary - _algorithm_ = {"iterations": iterations, "nonnegativity": nonnegativity} + _data_ = {"projection_norm_data": data, + "data_axes_labels_order": input_data_axis_labels, + } # data dictionary + _algorithm_ = {"iterations": iterations, + "nonnegativity": nonnegativity, + } reconstruction = RecToolsCP.SIRT(_data_, _algorithm_) cp._default_memory_pool.free_all_blocks() return cp.swapaxes(reconstruction,0,1) @@ -173,18 +179,21 @@ def CGLS( """ RecToolsCP = _instantiate_iterative_recon_class(data, angles, center, recon_size, gpu_id, datafidelity='LS') - _data_ = {"projection_norm_data": data} # data dictionary - _algorithm_ = {"iterations": iterations, "nonnegativity": nonnegativity} + _data_ = {"projection_norm_data": data, + "data_axes_labels_order": input_data_axis_labels, + } # data dictionary + _algorithm_ = {"iterations": iterations, + "nonnegativity": nonnegativity} reconstruction = RecToolsCP.CGLS(_data_, _algorithm_) cp._default_memory_pool.free_all_blocks() return cp.swapaxes(reconstruction,0,1) ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% ## def _instantiate_direct_recon_class(data: cp.ndarray, - angles: np.ndarray, - center: Optional[float] = None, - recon_size: Optional[int] = None, - gpu_id: int = 0) -> type[RecToolsDIRCuPy]: + angles: np.ndarray, + center: Optional[float] = None, + recon_size: Optional[int] = None, + gpu_id: int = 0) -> type[RecToolsDIRCuPy]: """instantiate ToMoBAR's direct recon class Args: @@ -195,10 +204,8 @@ def _instantiate_direct_recon_class(data: cp.ndarray, gpu_id (int, optional): gpu ID. Defaults to 0. 
Returns: - type[RecToolsDIRCuPy]: an instance of the class + type[RecToolsDIRCuPy]: an instance of the direct recon class """ - input_data_axis_labels=["angles", "detY", "detX"] # set the labels of the input data - if center is None: center = data.shape[2] // 2 # making a crude guess if recon_size is None: @@ -208,17 +215,16 @@ def _instantiate_direct_recon_class(data: cp.ndarray, CenterRotOffset=data.shape[2] / 2 - center - 0.5, # Center of Rotation scalar or a vector AnglesVec=-angles, # A vector of projection angles in radians ObjSize=recon_size, # Reconstructed object dimensions (scalar) - device_projector=gpu_id, - data_axis_labels=input_data_axis_labels, + device_projector=gpu_id, ) return RecToolsCP def _instantiate_iterative_recon_class(data: cp.ndarray, - angles: np.ndarray, - center: Optional[float] = None, - recon_size: Optional[int] = None, - gpu_id: int = 0, - datafidelity: str = 'LS') -> type[RecToolsIRCuPy]: + angles: np.ndarray, + center: Optional[float] = None, + recon_size: Optional[int] = None, + gpu_id: int = 0, + datafidelity: str = 'LS') -> type[RecToolsIRCuPy]: """instantiate ToMoBAR's iterative recon class Args: @@ -230,10 +236,8 @@ def _instantiate_iterative_recon_class(data: cp.ndarray, gpu_id (int, optional): gpu ID. Defaults to 0. Returns: - type[RecToolsIRCuPy]: an instance of the class + type[RecToolsIRCuPy]: an instance of the iterative class """ - input_data_axis_labels=["angles", "detY", "detX"] # set the labels of the input data - if center is None: center = data.shape[2] // 2 # making a crude guess if recon_size is None: @@ -245,6 +249,5 @@ def _instantiate_iterative_recon_class(data: cp.ndarray, ObjSize=recon_size, # Reconstructed object dimensions (scalar) datafidelity=datafidelity, device_projector=gpu_id, - data_axis_labels=input_data_axis_labels, ) return RecToolsCP \ No newline at end of file From ac850a37a52e5403b3cbf9388921e3485607f139 Mon Sep 17 00:00:00 2001 From: dkazanc Date: Wed, 31 Jan 2024 13:51:45 +0000 Subject: [PATCH 12/26] fixing the bug for reconstruction not being contiguous --- httomolibgpu/recon/algorithm.py | 137 +++++++++++++++++++------------- 1 file changed, 80 insertions(+), 57 deletions(-) diff --git a/httomolibgpu/recon/algorithm.py b/httomolibgpu/recon/algorithm.py index 2a7a37fc..c499b022 100644 --- a/httomolibgpu/recon/algorithm.py +++ b/httomolibgpu/recon/algorithm.py @@ -39,7 +39,8 @@ "CGLS", ] -input_data_axis_labels=["angles", "detY", "detX"] # set the labels of the input data +input_data_axis_labels = ["angles", "detY", "detX"] # set the labels of the input data + ## %%%%%%%%%%%%%%%%%%%%%%% FBP reconstruction %%%%%%%%%%%%%%%%%%%%%%%%%%%% ## @nvtx.annotate() @@ -64,12 +65,12 @@ def FBP( center : float, optional The center of rotation (CoR). recon_size : int, optional - The [recon_size, recon_size] shape of the reconstructed slice in pixels. + The [recon_size, recon_size] shape of the reconstructed slice in pixels. By default (None), the reconstructed size will be the dimension of the horizontal detector. recon_mask_radius: float, optional The radius of the circular mask that applies to the reconstructed slice in order to crop - out some undesirable artefacts. The values outside the diameter will be set to zero. - None by default, to see the effect of the mask try setting the value in the range [0.7-1.0]. + out some undesirable artefacts. The values outside the diameter will be set to zero. + None by default, to see the effect of the mask try setting the value in the range [0.7-1.0]. 
gpu_id : int, optional A GPU device index to perform operation on. @@ -78,15 +79,18 @@ def FBP( cp.ndarray The FBP reconstructed volume as a CuPy array. """ - RecToolsCP = _instantiate_direct_recon_class(data, angles, center, recon_size, gpu_id) + RecToolsCP = _instantiate_direct_recon_class( + data, angles, center, recon_size, gpu_id + ) - reconstruction = RecToolsCP.FBP(data, - recon_mask_radius=recon_mask_radius, - data_axes_labels_order=input_data_axis_labels) + reconstruction = RecToolsCP.FBP( + data, + recon_mask_radius=recon_mask_radius, + data_axes_labels_order=input_data_axis_labels, + ) cp._default_memory_pool.free_all_blocks() - return cp.swapaxes(reconstruction,0,1) + return cp.require(cp.swapaxes(reconstruction, 0, 1), requirements="C") -## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% ## ## %%%%%%%%%%%%%%%%%%%%%%% SIRT reconstruction %%%%%%%%%%%%%%%%%%%%%%%%%%%% ## @nvtx.annotate() @@ -94,7 +98,7 @@ def SIRT( data: cp.ndarray, angles: np.ndarray, center: Optional[float] = None, - recon_size: Optional[int] = None, + recon_size: Optional[int] = None, iterations: Optional[int] = 300, nonnegativity: Optional[bool] = True, gpu_id: int = 0, @@ -112,7 +116,7 @@ def SIRT( center : float, optional The center of rotation (CoR). recon_size : int, optional - The [recon_size, recon_size] shape of the reconstructed slice in pixels. + The [recon_size, recon_size] shape of the reconstructed slice in pixels. By default (None), the reconstructed size will be the dimension of the horizontal detector. iterations : int, optional The number of SIRT iterations. @@ -127,17 +131,22 @@ def SIRT( The SIRT reconstructed volume as a CuPy array. """ - RecToolsCP = _instantiate_iterative_recon_class(data, angles, center, recon_size, gpu_id, datafidelity='LS') + RecToolsCP = _instantiate_iterative_recon_class( + data, angles, center, recon_size, gpu_id, datafidelity="LS" + ) - _data_ = {"projection_norm_data": data, - "data_axes_labels_order": input_data_axis_labels, - } # data dictionary - _algorithm_ = {"iterations": iterations, - "nonnegativity": nonnegativity, - } + _data_ = { + "projection_norm_data": data, + "data_axes_labels_order": input_data_axis_labels, + } # data dictionary + _algorithm_ = { + "iterations": iterations, + "nonnegativity": nonnegativity, + } reconstruction = RecToolsCP.SIRT(_data_, _algorithm_) cp._default_memory_pool.free_all_blocks() - return cp.swapaxes(reconstruction,0,1) + return cp.require(cp.swapaxes(reconstruction, 0, 1), requirements="C") + ## %%%%%%%%%%%%%%%%%%%%%%% CGLS reconstruction %%%%%%%%%%%%%%%%%%%%%%%%%%%% ## @nvtx.annotate() @@ -163,7 +172,7 @@ def CGLS( center : float, optional The center of rotation (CoR). recon_size : int, optional - The [recon_size, recon_size] shape of the reconstructed slice in pixels. + The [recon_size, recon_size] shape of the reconstructed slice in pixels. By default (None), the reconstructed size will be the dimension of the horizontal detector. iterations : int, optional The number of CGLS iterations. @@ -177,23 +186,28 @@ def CGLS( cp.ndarray The CGLS reconstructed volume as a CuPy array. 
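
    Examples
    --------
    A minimal usage sketch (``proj_norm`` and ``angles`` are placeholder
    names, not part of this module): ``proj_norm`` is the normalised
    projection volume on the GPU in (angles, detY, detX) order, and
    ``angles`` holds the projection angles in radians.

    >>> recon = CGLS(proj_norm, angles, center=79.5, iterations=5)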
""" - RecToolsCP = _instantiate_iterative_recon_class(data, angles, center, recon_size, gpu_id, datafidelity='LS') + RecToolsCP = _instantiate_iterative_recon_class( + data, angles, center, recon_size, gpu_id, datafidelity="LS" + ) - _data_ = {"projection_norm_data": data, - "data_axes_labels_order": input_data_axis_labels, - } # data dictionary - _algorithm_ = {"iterations": iterations, - "nonnegativity": nonnegativity} + _data_ = { + "projection_norm_data": data, + "data_axes_labels_order": input_data_axis_labels, + } # data dictionary + _algorithm_ = {"iterations": iterations, "nonnegativity": nonnegativity} reconstruction = RecToolsCP.CGLS(_data_, _algorithm_) - cp._default_memory_pool.free_all_blocks() - return cp.swapaxes(reconstruction,0,1) -## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% ## + cp._default_memory_pool.free_all_blocks() + return cp.require(cp.swapaxes(reconstruction, 0, 1), requirements="C") + -def _instantiate_direct_recon_class(data: cp.ndarray, - angles: np.ndarray, - center: Optional[float] = None, - recon_size: Optional[int] = None, - gpu_id: int = 0) -> type[RecToolsDIRCuPy]: +## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% ## +def _instantiate_direct_recon_class( + data: cp.ndarray, + angles: np.ndarray, + center: Optional[float] = None, + recon_size: Optional[int] = None, + gpu_id: int = 0, +) -> type[RecToolsDIRCuPy]: """instantiate ToMoBAR's direct recon class Args: @@ -210,21 +224,27 @@ def _instantiate_direct_recon_class(data: cp.ndarray, center = data.shape[2] // 2 # making a crude guess if recon_size is None: recon_size = data.shape[2] - RecToolsCP = RecToolsDIRCuPy(DetectorsDimH=data.shape[2], # Horizontal detector dimension - DetectorsDimV=data.shape[1], # Vertical detector dimension (3D case) - CenterRotOffset=data.shape[2] / 2 - center - 0.5, # Center of Rotation scalar or a vector - AnglesVec=-angles, # A vector of projection angles in radians - ObjSize=recon_size, # Reconstructed object dimensions (scalar) - device_projector=gpu_id, - ) + RecToolsCP = RecToolsDIRCuPy( + DetectorsDimH=data.shape[2], # Horizontal detector dimension + DetectorsDimV=data.shape[1], # Vertical detector dimension (3D case) + CenterRotOffset=data.shape[2] / 2 + - center + - 0.5, # Center of Rotation scalar or a vector + AnglesVec=-angles, # A vector of projection angles in radians + ObjSize=recon_size, # Reconstructed object dimensions (scalar) + device_projector=gpu_id, + ) return RecToolsCP -def _instantiate_iterative_recon_class(data: cp.ndarray, - angles: np.ndarray, - center: Optional[float] = None, - recon_size: Optional[int] = None, - gpu_id: int = 0, - datafidelity: str = 'LS') -> type[RecToolsIRCuPy]: + +def _instantiate_iterative_recon_class( + data: cp.ndarray, + angles: np.ndarray, + center: Optional[float] = None, + recon_size: Optional[int] = None, + gpu_id: int = 0, + datafidelity: str = "LS", +) -> type[RecToolsIRCuPy]: """instantiate ToMoBAR's iterative recon class Args: @@ -242,12 +262,15 @@ def _instantiate_iterative_recon_class(data: cp.ndarray, center = data.shape[2] // 2 # making a crude guess if recon_size is None: recon_size = data.shape[2] - RecToolsCP = RecToolsIRCuPy(DetectorsDimH=data.shape[2], # Horizontal detector dimension - DetectorsDimV=data.shape[1], # Vertical detector dimension (3D case) - CenterRotOffset=data.shape[2] / 2 - center - 0.5, # Center of Rotation scalar or a vector - AnglesVec=-angles, # A vector of projection angles in radians - ObjSize=recon_size, # 
Reconstructed object dimensions (scalar) - datafidelity=datafidelity, - device_projector=gpu_id, - ) - return RecToolsCP \ No newline at end of file + RecToolsCP = RecToolsIRCuPy( + DetectorsDimH=data.shape[2], # Horizontal detector dimension + DetectorsDimV=data.shape[1], # Vertical detector dimension (3D case) + CenterRotOffset=data.shape[2] / 2 + - center + - 0.5, # Center of Rotation scalar or a vector + AnglesVec=-angles, # A vector of projection angles in radians + ObjSize=recon_size, # Reconstructed object dimensions (scalar) + datafidelity=datafidelity, + device_projector=gpu_id, + ) + return RecToolsCP From 34bf0dd3c4f0c094505ed1343960ed3c2c580c67 Mon Sep 17 00:00:00 2001 From: dkazanc Date: Mon, 5 Feb 2024 16:42:20 +0000 Subject: [PATCH 13/26] adds data_scaler method and tests --- conda/environment.yml | 4 +- httomolibgpu/misc/morph.py | 112 +++++++++++++++++++++++++++++++++- tests/test_misc/test_morph.py | 22 ++++++- 3 files changed, 134 insertions(+), 4 deletions(-) diff --git a/conda/environment.yml b/conda/environment.yml index 90d26457..1f6bdc9e 100644 --- a/conda/environment.yml +++ b/conda/environment.yml @@ -5,8 +5,8 @@ channels: - astra-toolbox - httomo dependencies: - - conda-forge::cupy - - conda-forge::numpy<=1.23 + - conda-forge::cupy==12.3.0 + - conda-forge::numpy<=1.24 - conda-forge::nvtx - conda-forge::scipy - conda-forge::python diff --git a/httomolibgpu/misc/morph.py b/httomolibgpu/misc/morph.py index d4c580cc..720b7e96 100644 --- a/httomolibgpu/misc/morph.py +++ b/httomolibgpu/misc/morph.py @@ -20,13 +20,17 @@ # --------------------------------------------------------------------------- """Module for data type morphing functions""" -import cupy as cp +import cupy as cp import numpy as np import nvtx from typing import Literal, Tuple +from cupyx.scipy.interpolate import interpn + + __all__ = [ "sino_360_to_180", + "data_scaler", ] @@ -88,3 +92,109 @@ def sino_360_to_180( raise ValueError('rotation parameter must be either "left" or "right"') return out + + +@nvtx.annotate() +def data_scaler( + data: cp.ndarray, newshape: tuple, axis: int = 1, method: str = "linear" +) -> cp.ndarray: + """Down/Up-scaler of the input data implemented through interpn function. + Please note that the method will leave the specified axis + dimension unchanged, e.g. (128,128,128) -> (128,256,256) for axis = 0 and + newshape = (256,256). + + Args: + data (cp.ndarray): 3d cupy array. + newshape (tuple): 2d tuple that defines the new shape size. + axis (int, optional): Axis along which the scaling is applied. Defaults to 1. + method (str, optional): Selection of interpn method for interpolation. Defaults to 'linear'. 
+ + Raises: + ValueError: When data is not 3D + + Returns: + cp.ndarray: Up/Down-scaled 3D cupy array + """ + + if data.ndim != 3: + raise ValueError("only 3D data is supported") + + N, M, Z = cp.shape(data) + + if axis == 0: + xaxis = cp.arange(M) - M / 2 + yaxis = cp.arange(Z) - Z / 2 + step_x = M / newshape[0] + step_y = Z / newshape[1] + scaled_data = cp.empty((N, newshape[0], newshape[1]), dtype=cp.float32) + elif axis == 1: + xaxis = cp.arange(N) - N / 2 + yaxis = cp.arange(Z) - Z / 2 + step_x = N / newshape[0] + step_y = Z / newshape[1] + scaled_data = cp.empty((newshape[0], M, newshape[1]), dtype=cp.float32) + elif axis == 2: + xaxis = cp.arange(N) - N / 2 + yaxis = cp.arange(M) - M / 2 + step_x = N / newshape[0] + step_y = M / newshape[1] + scaled_data = cp.empty((newshape[0], newshape[1], Z), dtype=cp.float32) + else: + raise ValueError("Only 0,1,2 values for axes are supported") + + points = (xaxis, yaxis) + + scale_x = 2 / step_x + scale_y = 2 / step_y + + xi = np.mgrid[ + -newshape[0] / scale_x : newshape[0] / scale_x : step_x, + -newshape[1] / scale_y : newshape[1] / scale_y : step_y, + ] + xi_size = xi.size + xi = np.rollaxis(xi, 0, 3) + xi = np.reshape(xi, [xi_size // 2, 2]) + xi = cp.asarray(xi, dtype=cp.float32, order="C") + + if axis == 0: + for j in range(N): + res = interpn( + points, + data[j, :, :], + xi, + method=method, + bounds_error=False, + fill_value=0.0, + ) + scaled_data[j, :, :] = cp.reshape( + res, [newshape[0], newshape[1]], order="C" + ) + elif axis == 1: + + for j in range(M): + res = interpn( + points, + data[:, j, :], + xi, + method=method, + bounds_error=False, + fill_value=0.0, + ) + scaled_data[:, j, :] = cp.reshape( + res, [newshape[0], newshape[1]], order="C" + ) + else: + for j in range(Z): + res = interpn( + points, + data[:, :, j], + xi, + method=method, + bounds_error=False, + fill_value=0.0, + ) + scaled_data[:, :, j] = cp.reshape( + res, [newshape[0], newshape[1]], order="C" + ) + + return scaled_data diff --git a/tests/test_misc/test_morph.py b/tests/test_misc/test_morph.py index c8e8ed25..287681a3 100644 --- a/tests/test_misc/test_morph.py +++ b/tests/test_misc/test_morph.py @@ -3,7 +3,8 @@ import numpy as np from cupy.cuda import nvtx import pytest -from httomolibgpu.misc.morph import sino_360_to_180 +from numpy.testing import assert_allclose +from httomolibgpu.misc.morph import sino_360_to_180, data_scaler @pytest.mark.parametrize( @@ -30,6 +31,25 @@ def test_sino_360_to_180_wrong_dims(ensure_clean_memory, shape): sino_360_to_180(cp.ones(shape, dtype=cp.float32)) +@pytest.mark.parametrize("axis", [0, 1, 2]) +def test_data_scaler(data, axis, ensure_clean_memory): + newshape = (200, 250) + scaled_data = data_scaler(data, newshape=newshape, axis=axis, method="linear").get() + + assert scaled_data.ndim == 3 + if axis == 0: + assert scaled_data.shape == (180, newshape[0], newshape[1]) + assert_allclose(np.max(scaled_data), 1113.2) + if axis == 1: + assert scaled_data.shape == (newshape[0], 128, newshape[1]) + assert_allclose(np.max(scaled_data), 1118.0) + if axis == 2: + assert scaled_data.shape == (newshape[0], newshape[1], 160) + assert_allclose(np.max(scaled_data), 1115.5359) + assert_allclose(np.min(scaled_data), 0.0) + assert scaled_data.dtype == np.float32 + + @pytest.mark.parametrize("rotation", ["left", "right"]) @pytest.mark.perf def test_sino_360_to_180_performance(ensure_clean_memory, rotation): From e3180ac48d5180cdbb8bc9fe0fd54ab04e680068 Mon Sep 17 00:00:00 2001 From: dkazanc Date: Mon, 5 Feb 2024 21:53:02 +0000 Subject: [PATCH 
14/26] add c-contiguous checks in tests --- tests/test_misc/test_corr.py | 15 ++++++++++++--- tests/test_misc/test_morph.py | 1 + tests/test_prep/test_alignment.py | 4 +++- tests/test_prep/test_normalize.py | 4 +++- tests/test_prep/test_phase.py | 13 ++++++++++--- tests/test_prep/test_stripe.py | 22 +++++++++++++++------- tests/test_recon/test_algorithm.py | 15 +++++++++------ 7 files changed, 53 insertions(+), 21 deletions(-) diff --git a/tests/test_misc/test_corr.py b/tests/test_misc/test_corr.py index c377dc48..49b425af 100644 --- a/tests/test_misc/test_corr.py +++ b/tests/test_misc/test_corr.py @@ -32,8 +32,14 @@ def test_median_filter3d_vs_scipy(host_data, ensure_clean_memory): @pytest.mark.perf @pytest.mark.parametrize("kernel_size", [3, 5]) -def test_median_filter3d_benchmark(host_data, ensure_clean_memory, kernel_size, benchmark): - benchmark(median_filter3d, cp.asarray(host_data, dtype=cp.float32), kernel_size=kernel_size) +def test_median_filter3d_benchmark( + host_data, ensure_clean_memory, kernel_size, benchmark +): + benchmark( + median_filter3d, + cp.asarray(host_data, dtype=cp.float32), + kernel_size=kernel_size, + ) @pytest.mark.perf @@ -74,12 +80,14 @@ def test_median_filter3d(data): assert_allclose(np.min(filtered_data), 89.0) assert filtered_data.dtype == np.uint16 + assert filtered_data.flags.c_contiguous assert ( median_filter3d(data.astype(cp.float32), kernel_size=5, dif=1.5).get().dtype == np.float32 ) + @pytest.mark.perf def test_median_filter3d_performance(ensure_clean_memory): dev = cp.cuda.Device() @@ -107,7 +115,7 @@ def test_remove_outlier3d(data): assert filtered_data.ndim == 3 assert_allclose(np.mean(filtered_data), 808.753494, rtol=eps) assert_allclose(np.mean(filtered_data, axis=(1, 2)).sum(), 145575.628906) - assert_allclose(np.median(filtered_data), 976.) 
+ assert_allclose(np.median(filtered_data), 976.0) assert_allclose(np.median(filtered_data, axis=(1, 2)).sum(), 175741.5) assert filtered_data.dtype == np.uint16 @@ -116,3 +124,4 @@ def test_remove_outlier3d(data): remove_outlier3d(data.astype(cp.float32), kernel_size=5, dif=1.5).get().dtype == np.float32 ) + assert filtered_data.flags.c_contiguous diff --git a/tests/test_misc/test_morph.py b/tests/test_misc/test_morph.py index 287681a3..18a408b7 100644 --- a/tests/test_misc/test_morph.py +++ b/tests/test_misc/test_morph.py @@ -48,6 +48,7 @@ def test_data_scaler(data, axis, ensure_clean_memory): assert_allclose(np.max(scaled_data), 1115.5359) assert_allclose(np.min(scaled_data), 0.0) assert scaled_data.dtype == np.float32 + assert scaled_data.flags.c_contiguous @pytest.mark.parametrize("rotation", ["left", "right"]) diff --git a/tests/test_prep/test_alignment.py b/tests/test_prep/test_alignment.py index 54d11cf7..c4b484cf 100644 --- a/tests/test_prep/test_alignment.py +++ b/tests/test_prep/test_alignment.py @@ -9,6 +9,7 @@ from imageio.v2 import imread from numpy.testing import assert_allclose + @pytest.mark.parametrize( "image, max_value, mean_value", [ @@ -41,7 +42,8 @@ def test_correct_distortion( preview = {"starts": [0, 0], "stops": [im.shape[0], im.shape[1]], "steps": [1, 1]} corrected_data = implementation(im, distortion_coeffs_path, preview).get() - + assert_allclose(np.mean(corrected_data), mean_value) assert np.max(corrected_data) == max_value assert corrected_data.dtype == np.uint8 + assert corrected_data.flags.c_contiguous diff --git a/tests/test_prep/test_normalize.py b/tests/test_prep/test_normalize.py index b9d57c65..4ad41328 100644 --- a/tests/test_prep/test_normalize.py +++ b/tests/test_prep/test_normalize.py @@ -9,6 +9,7 @@ from cupy.cuda import nvtx + def test_normalize_1D_raises(data, flats, darks, ensure_clean_memory): _data_1d = cp.ones(10) @@ -20,6 +21,7 @@ def test_normalize_1D_raises(data, flats, darks, ensure_clean_memory): with pytest.raises(ValueError): normalize(data, _data_1d, darks) + def test_normalize(data, flats, darks, ensure_clean_memory): # --- testing normalize ---# data_normalize = normalize(cp.copy(data), flats, darks, minus_log=True).get() @@ -30,6 +32,7 @@ def test_normalize(data, flats, darks, ensure_clean_memory): assert_allclose(np.mean(data_normalize, axis=(1, 2)).sum(), 52.064465, rtol=1e-06) assert_allclose(np.median(data_normalize), 0.01723744, rtol=1e-06) assert_allclose(np.std(data_normalize), 0.524382, rtol=1e-06) + assert data_normalize.flags.c_contiguous @pytest.mark.perf @@ -79,4 +82,3 @@ def test_normalize_performance(ensure_clean_memory): duration_ms = float(end - start) * 1e-6 / 10 assert "performance in ms" == duration_ms - diff --git a/tests/test_prep/test_phase.py b/tests/test_prep/test_phase.py index 89f26649..93b7eed3 100644 --- a/tests/test_prep/test_phase.py +++ b/tests/test_prep/test_phase.py @@ -9,16 +9,19 @@ eps = 1e-6 + def test_paganin_savu_filter(data): # --- testing the Paganin filter on tomo_standard ---# filtered_data = paganin_filter_savu(data).get() - + assert filtered_data.ndim == 3 assert_allclose(np.mean(filtered_data), -770.5339, rtol=eps) assert_allclose(np.max(filtered_data), -679.80945, rtol=eps) #: make sure the output is float32 assert filtered_data.dtype == np.float32 + assert filtered_data.flags.c_contiguous + def test_paganin_filter_savu_energy100(data): filtered_data = paganin_filter_savu(data, energy=100.0).get() @@ -112,6 +115,7 @@ def test_paganin_filter_savu_1D_raises(ensure_clean_memory): _data = 
None #: free up GPU memory + # paganin filter tomopy def test_paganin_filter_tomopy_1D_raises(ensure_clean_memory): _data = cp.ones(10) @@ -120,6 +124,7 @@ def test_paganin_filter_tomopy_1D_raises(ensure_clean_memory): _data = None #: free up GPU memory + def test_paganin_filter_tomopy(data): # --- testing the Paganin filter from TomoPy on tomo_standard ---# filtered_data = paganin_filter_tomopy(data).get() @@ -130,6 +135,8 @@ def test_paganin_filter_tomopy(data): #: make sure the output is float32 assert filtered_data.dtype == np.float32 + assert filtered_data.flags.c_contiguous + def test_paganin_filter_tomopy_energy100(data): filtered_data = paganin_filter_tomopy(data, energy=100.0).get() @@ -145,7 +152,7 @@ def test_paganin_filter_tomopy_dist75(data): filtered_data = paganin_filter_tomopy(data, dist=75.0, alpha=1e-6).get() assert_allclose(np.sum(np.mean(filtered_data, axis=(1, 2))), -1215.4985, rtol=1e-6) - assert_allclose(np.sum(filtered_data), -24893412., rtol=1e-6) + assert_allclose(np.sum(filtered_data), -24893412.0, rtol=1e-6) assert_allclose(np.mean(filtered_data[0, 60:63, 90]), -6.645878, rtol=1e-6) assert_allclose(np.sum(filtered_data[50:100, 40, 1]), -343.5908, rtol=1e-6) @@ -181,4 +188,4 @@ def test_paganin_filter_tomopy_performance(ensure_clean_memory): dev.synchronize() duration_ms = float(time.perf_counter_ns() - start) * 1e-6 / 10 - assert "performance in ms" == duration_ms \ No newline at end of file + assert "performance in ms" == duration_ms diff --git a/tests/test_prep/test_stripe.py b/tests/test_prep/test_stripe.py index 5ee3a59d..230687d7 100644 --- a/tests/test_prep/test_stripe.py +++ b/tests/test_prep/test_stripe.py @@ -11,10 +11,11 @@ ) from numpy.testing import assert_allclose + def test_remove_stripe_ti_on_data(data, flats, darks): # --- testing the CuPy implementation from TomoCupy ---# data = normalize(data, flats, darks, cutoff=10, minus_log=True) - + data_after_stripe_removal = remove_stripe_ti(cp.copy(data)).get() assert_allclose(np.mean(data_after_stripe_removal), 0.28924704, rtol=1e-05) @@ -23,11 +24,13 @@ def test_remove_stripe_ti_on_data(data, flats, darks): ) assert_allclose(np.median(data_after_stripe_removal), 0.026177486, rtol=1e-05) assert_allclose(np.max(data_after_stripe_removal), 2.715983, rtol=1e-05) - + assert data_after_stripe_removal.flags.c_contiguous + data = None #: free up GPU memory # make sure the output is float32 assert data_after_stripe_removal.dtype == np.float32 + def test_remove_stripe_ti_on_flats(host_flats): #: testing that numpy uint16 arrays can be passed corrected_data = remove_stripe_ti(np.copy(host_flats)) @@ -35,6 +38,7 @@ def test_remove_stripe_ti_on_flats(host_flats): assert_allclose(np.mean(corrected_data, axis=(1, 2)).sum(), 19531.168945, rtol=1e-7) assert_allclose(np.median(corrected_data), 976.0, rtol=1e-7) + def test_remove_stripe_ti_numpy_vs_cupy_on_random_data(): host_data = np.random.random_sample(size=(181, 5, 256)).astype(np.float32) * 2.0 corrected_host_data = remove_stripe_ti(np.copy(host_data)) @@ -47,11 +51,12 @@ def test_remove_stripe_ti_numpy_vs_cupy_on_random_data(): np.median(corrected_data), np.median(corrected_host_data), rtol=1e-6 ) + def test_stripe_removal_sorting_cupy(data, flats, darks): # --- testing the CuPy port of TomoPy's implementation ---# data = normalize(data, flats, darks, cutoff=10, minus_log=True) - corrected_data = remove_stripe_based_sorting(data).get() - + corrected_data = remove_stripe_based_sorting(data).get() + data = None #: free up GPU memory 
assert_allclose(np.mean(corrected_data), 0.288198, rtol=1e-06) assert_allclose(np.mean(corrected_data, axis=(1, 2)).sum(), 51.87565, rtol=1e-06) @@ -59,6 +64,8 @@ def test_stripe_removal_sorting_cupy(data, flats, darks): # make sure the output is float32 assert corrected_data.dtype == np.float32 + assert corrected_data.flags.c_contiguous + @cp.testing.numpy_cupy_allclose(rtol=1e-6) def test_stripe_removal_sorting_numpy_vs_cupy_on_random_data(ensure_clean_memory, xp): @@ -121,7 +128,7 @@ def test_remove_stripe_ti_performance(ensure_clean_memory): def test_remove_all_stripe_on_data(data, flats, darks): # --- testing the CuPy implementation from TomoCupy ---# data = normalize(data, flats, darks, cutoff=10, minus_log=True) - + data_after_stripe_removal = remove_all_stripe(cp.copy(data)).get() assert_allclose(np.mean(data_after_stripe_removal), 0.266914, rtol=1e-05) @@ -130,7 +137,8 @@ def test_remove_all_stripe_on_data(data, flats, darks): ) assert_allclose(np.median(data_after_stripe_removal), 0.015338, rtol=1e-04) assert_allclose(np.max(data_after_stripe_removal), 2.298123, rtol=1e-05) - + data = None #: free up GPU memory # make sure the output is float32 - assert data_after_stripe_removal.dtype == np.float32 \ No newline at end of file + assert data_after_stripe_removal.dtype == np.float32 + assert data_after_stripe_removal.flags.c_contiguous diff --git a/tests/test_recon/test_algorithm.py b/tests/test_recon/test_algorithm.py index e8d7f7d9..769e52f4 100644 --- a/tests/test_recon/test_algorithm.py +++ b/tests/test_recon/test_algorithm.py @@ -18,6 +18,7 @@ def test_reconstruct_FBP_1(data, flats, darks, ensure_clean_memory): np.linspace(0.0 * np.pi / 180.0, 180.0 * np.pi / 180.0, data.shape[0]), 79.5, ) + assert recon_data.flags.c_contiguous recon_data = recon_data.get() assert_allclose(np.mean(recon_data), 0.000798, rtol=1e-07, atol=1e-6) assert_allclose(np.mean(recon_data, axis=(0, 2)).sum(), 0.102106, rtol=1e-05) @@ -46,17 +47,16 @@ def test_reconstruct_FBP_2(data, flats, darks, ensure_clean_memory): def test_reconstruct_FBP_3(data, flats, darks, ensure_clean_memory): recon_data = FBP( normalize_cupy(data, flats, darks, cutoff=20.5, minus_log=False), - np.linspace(5.0 * np.pi / 360.0, 180.0 * np.pi / 360.0, - data.shape[0]), - 79, # center - 210, # recon_size - 0.9, # recon_mask_radius + np.linspace(5.0 * np.pi / 360.0, 180.0 * np.pi / 360.0, data.shape[0]), + 79, # center + 210, # recon_size + 0.9, # recon_mask_radius ) recon_data = recon_data.get() assert_allclose(np.mean(recon_data), -0.000252, atol=1e-6) assert_allclose( - np.mean(recon_data, axis=(0, 2)).sum(), -0.03229, rtol=1e-06, atol=1e-5 + np.mean(recon_data, axis=(0, 2)).sum(), -0.03229, rtol=1e-06, atol=1e-5 ) assert recon_data.dtype == np.float32 assert recon_data.shape == (210, 128, 210) @@ -72,6 +72,7 @@ def test_reconstruct_SIRT(data, flats, darks, ensure_clean_memory): iterations=10, nonnegativity=True, ) + assert recon_data.flags.c_contiguous recon_data = recon_data.get() assert_allclose(np.mean(recon_data), 0.0018447536, rtol=1e-07, atol=1e-6) assert_allclose(np.mean(recon_data, axis=(0, 2)).sum(), 0.23612846, rtol=1e-05) @@ -88,11 +89,13 @@ def test_reconstruct_CGLS(data, flats, darks, ensure_clean_memory): iterations=5, nonnegativity=True, ) + assert recon_data.flags.c_contiguous recon_data = recon_data.get() assert_allclose(np.mean(recon_data), 0.0021818762, rtol=1e-07, atol=1e-6) assert_allclose(np.mean(recon_data, axis=(0, 2)).sum(), 0.279187, rtol=1e-03) assert recon_data.dtype == np.float32 + @pytest.mark.perf 
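 # Note on the perf-test idiom used throughout this suite: the final
 # `assert "performance in ms" == duration_ms` fails by design, so that the
 # measured duration is printed in pytest's failure report.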
 def test_FBP_performance(ensure_clean_memory):
     dev = cp.cuda.Device()

From b08325927130e52e31dd1fdd93a811ffddb7fc3c Mon Sep 17 00:00:00 2001
From: dkazanc
Date: Wed, 7 Feb 2024 16:13:37 +0000
Subject: [PATCH 15/26] updating the name for data rescaler

---
 httomolibgpu/misc/morph.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/httomolibgpu/misc/morph.py b/httomolibgpu/misc/morph.py
index 720b7e96..4af50008 100644
--- a/httomolibgpu/misc/morph.py
+++ b/httomolibgpu/misc/morph.py
@@ -30,7 +30,7 @@

 __all__ = [
     "sino_360_to_180",
-    "data_scaler",
+    "data_resampler",
 ]

@@ -95,10 +95,10 @@ def sino_360_to_180(

 @nvtx.annotate()
-def data_scaler(
+def data_resampler(
     data: cp.ndarray, newshape: tuple, axis: int = 1, method: str = "linear"
 ) -> cp.ndarray:
-    """Down/Up-scaler of the input data implemented through interpn function.
+    """Down/Up-resampler of the input data implemented through interpn function.
     Please note that the method will leave the specified axis
     dimension unchanged, e.g. (128,128,128) -> (128,256,256) for axis = 0 and
     newshape = (256,256).

From 4cad258e40b6c244ed637d331bf666b0489d3e95 Mon Sep 17 00:00:00 2001
From: Team GPU <121804917+team-gpu@users.noreply.github.com>
Date: Thu, 8 Feb 2024 10:25:05 +0000
Subject: [PATCH 16/26] Adds a rescale_to_int method for mapping float data to integer range

---
 httomolibgpu/misc/rescale.py    |  85 +++++++++++++++++++++
 tests/test_misc/test_rescale.py | 131 ++++++++++++++++++++++++++++++++
 2 files changed, 216 insertions(+)
 create mode 100644 httomolibgpu/misc/rescale.py
 create mode 100644 tests/test_misc/test_rescale.py

diff --git a/httomolibgpu/misc/rescale.py b/httomolibgpu/misc/rescale.py
new file mode 100644
index 00000000..8d934a03
--- /dev/null
+++ b/httomolibgpu/misc/rescale.py
@@ -0,0 +1,85 @@
+import cupy as cp
+import numpy as np
+from typing import Literal, Optional, Tuple, Union
+
+__all__ = [
+    "rescale_to_int",
+]
+
+
+rescale_kernel = cp.ElementwiseKernel(
+    'T x, raw T input_min, raw T input_max, raw T factor',
+    'O out',
+    '''
+    T x_clean = isnan(x) || isinf(x) ? T(0) : x;
+    T x_clipped = x_clean < input_min ? input_min : (x_clean > input_max ? input_max : x_clean);
+    T x_rebased = x_clipped - input_min;
+    out = O(x_rebased * factor);
+    ''',
+    'rescale_to_int'
+)
+
+def rescale_to_int(data: cp.ndarray,
+                   perc_range_min: float = 0.0,
+                   perc_range_max: float = 100.0,
+                   bits: Literal[8, 16, 32] = 8,
+                   glob_stats: Optional[Tuple[float, float, float, int]] = None):
+    """
+    Rescales the data and converts it to fit into the range of an unsigned integer type
+    with the given number of bits.
+
+    Parameters
+    ----------
+    data : cp.ndarray
+        Required input data array, on GPU
+    perc_range_min: float, optional
+        The lower cutoff point in the input data, in percent of the data range (defaults to 0).
+        The lower bound is computed as min + perc_range_min/100*(max-min)
+    perc_range_max: float, optional
+        The upper cutoff point in the input data, in percent of the data range (defaults to 100).
+        The upper bound is computed as min + perc_range_max/100*(max-min)
+    bits: Literal[8, 16, 32], optional
+        The number of bits in the output integer range (defaults to 8).
+        Allowed values are:
+        - 8 -> uint8
+        - 16 -> uint16
+        - 32 -> uint32
+    glob_stats: tuple, optional
+        Global statistics of the full dataset (beyond the data passed into this call).
+        It's a tuple with (min, max, sum, num_items). If not given, the min/max is
+        computed from the given data.
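+
+    Notes
+    -----
+    Summary of the element-wise kernel above: values are first sanitised
+    (NaN/Inf become 0), clipped to [input_min, input_max], rebased by
+    subtracting input_min, and scaled by
+    factor = (output_max - output_min) / (input_max - input_min)
+    before being cast to the chosen unsigned integer type.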
+ + Returns + ------- + cp.ndarray + The original data, clipped to the range specified with the perc_range_min and + perc_range_max, and scaled to the full range of the output integer type + """ + + if bits == 8: + output_dtype: Union[type[np.uint8], type[np.uint16], type[np.uint32]] = np.uint8 + elif bits == 16: + output_dtype = np.uint16 + else: + output_dtype = np.uint32 + + # get the min and max integer values of the output type + output_min = np.iinfo(output_dtype).min + output_max = np.iinfo(output_dtype).max + + if not isinstance(glob_stats, tuple): + min_value = float(cp.min(data)) + max_value = float(cp.max(data)) + else: + min_value = glob_stats[0] + max_value = glob_stats[1] + + range_intensity = max_value - min_value + input_min = (perc_range_min * (range_intensity) / 100) + min_value + input_max = (perc_range_max * (range_intensity) / 100) + min_value + + factor = (output_max - output_min) / (input_max - input_min) + + res = cp.empty(data.shape, dtype=output_dtype) + rescale_kernel(data, input_min, input_max, factor, res) + return res \ No newline at end of file diff --git a/tests/test_misc/test_rescale.py b/tests/test_misc/test_rescale.py new file mode 100644 index 00000000..f39dc2fc --- /dev/null +++ b/tests/test_misc/test_rescale.py @@ -0,0 +1,131 @@ +import time +from typing import Literal +import numpy as np +import cupy as cp +import pytest +from cupy.cuda import nvtx + +from httomolibgpu.misc.rescale import rescale_to_int + + +def test_rescale_no_change(): + data = np.random.randint(0, 255, size=(50, 50), dtype=np.uint8).astype(np.float32) + data_dev = cp.asarray(data) + res_dev = rescale_to_int( + data_dev, bits=8, glob_stats=(0.0, 255.0, 100.0, data.size) + ) + + res = cp.asnumpy(res_dev).astype(np.float32) + + assert res_dev.dtype == np.uint8 + np.testing.assert_array_equal(res, data) + + +@pytest.mark.parametrize("bits", [8, 16, 32]) +def test_rescale_no_change_no_stats(bits: Literal[8, 16, 32]): + data = np.ones((30, 50), dtype=np.float32) + data[0, 0] = 0.0 + data[13, 1] = (2**bits) - 1 + data_dev = cp.asarray(data) + res_dev = rescale_to_int(data_dev, bits=bits) + + res = cp.asnumpy(res_dev).astype(np.float32) + + assert res_dev.dtype.itemsize == bits // 8 + np.testing.assert_array_equal(res, data) + + +def test_rescale_double(): + data = np.ones((30, 50), dtype=np.float32) + + data_dev = cp.asarray(data) + res_dev = rescale_to_int(data_dev, bits=8, glob_stats=(0, 2, 100, data.size)) + + res = cp.asnumpy(res_dev).astype(np.float32) + + np.testing.assert_array_almost_equal(res, 127.0) + + +def test_rescale_handles_nan_inf(): + data = np.ones((30, 50), dtype=np.float32) + data[0, 0] = float("nan") + data[0, 1] = float("inf") + data[0, 2] = float("-inf") + + data_dev = cp.asarray(data) + res_dev = rescale_to_int(data_dev, bits=8, glob_stats=(0, 2, 100, data.size)) + + res = cp.asnumpy(res_dev).astype(np.float32) + + np.testing.assert_array_equal(res[0, 0:3], 0.0) + + +def test_rescale_double_offset(): + data = np.ones((30, 50), dtype=np.float32) + 10 + + data_dev = cp.asarray(data) + res_dev = rescale_to_int(data_dev, bits=8, glob_stats=(10, 12, 100, data.size)) + + res = cp.asnumpy(res_dev).astype(np.float32) + + np.testing.assert_array_almost_equal(res, 127.0) + + +@pytest.mark.parametrize("bits", [8, 16]) +def test_rescale_double_offset_min_percentage(bits: Literal[8, 16, 32]): + data = np.ones((30, 50), dtype=np.float32) + 15 + data[0, 0] = 10 + data[0, 1] = 20 + + data_dev = cp.asarray(data) + res_dev = rescale_to_int( + data_dev, + bits=bits, + glob_stats=(10, 
20, 100, data.size), + perc_range_min=10.0, + perc_range_max=90.0, + ) + + res = cp.asnumpy(res_dev).astype(np.float32) + + max = (2**bits) - 1 + num = int(5 / 8 * max) + # note: with 32bit, the float type actually overflows and the result is not full precision + np.testing.assert_array_almost_equal(res[1:, :], num) + assert res[0, 0] == 0.0 + assert res[0, 1] == max + + +@pytest.mark.perf +def test_rescale_performance(): + data = cp.random.random((1801, 400, 2560), dtype=np.float32) * 500 + 20 + in_min = float(cp.min(data)) + in_max = float(cp.max(data)) + + # do a cold run first + rescale_to_int( + data, + bits=8, + glob_stats=(in_min, in_max, 100.0, data.size), + perc_range_max=95.0, + perc_range_min=5.0, + ) + + dev = cp.cuda.Device() + dev.synchronize() + + start = time.perf_counter_ns() + nvtx.RangePush("Core") + for _ in range(20): + rescale_to_int( + data, + bits=8, + glob_stats=(in_min, in_max, 100.0, data.size), + perc_range_max=95.0, + perc_range_min=5.0, + ) + nvtx.RangePop() + dev.synchronize() + duration_ms = float(time.perf_counter_ns() - start) * 1e-6 / 20 + + assert "performance in ms" == duration_ms From e63efe3bc1daa7bd15bf53858ff94977028d98c3 Mon Sep 17 00:00:00 2001 From: Team GPU <121804917+team-gpu@users.noreply.github.com> Date: Thu, 8 Feb 2024 11:09:26 +0000 Subject: [PATCH 17/26] Adds nvtx annotation for profiler view --- httomolibgpu/misc/rescale.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/httomolibgpu/misc/rescale.py b/httomolibgpu/misc/rescale.py index 8d934a03..daddc4a5 100644 --- a/httomolibgpu/misc/rescale.py +++ b/httomolibgpu/misc/rescale.py @@ -1,6 +1,7 @@ import cupy as cp import numpy as np from typing import Literal, Optional, Tuple, Union +import nvtx __all__ = [ "rescale_to_int", @@ -19,6 +20,7 @@ 'rescale_to_int' ) +@nvtx.annotate() def rescale_to_int(data: cp.ndarray, perc_range_min: float = 0.0, perc_range_max: float = 100.0, From ef1af5a3379d8622de4f84624d34210a4ba782fa Mon Sep 17 00:00:00 2001 From: dkazanc Date: Thu, 8 Feb 2024 15:27:11 +0000 Subject: [PATCH 18/26] corrections to data_resampler and tests --- httomolibgpu/misc/morph.py | 21 ++++-- httomolibgpu/prep/stripe.py | 121 ++++++++++++++++++---------------- tests/test_misc/test_morph.py | 17 ++--- 3 files changed, 92 insertions(+), 67 deletions(-) diff --git a/httomolibgpu/misc/morph.py b/httomolibgpu/misc/morph.py index 4af50008..c7bcf95b 100644 --- a/httomolibgpu/misc/morph.py +++ b/httomolibgpu/misc/morph.py @@ -147,10 +147,23 @@ def data_resampler( scale_x = 2 / step_x scale_y = 2 / step_y - xi = np.mgrid[ - -newshape[0] / scale_x : newshape[0] / scale_x : step_x, - -newshape[1] / scale_y : newshape[1] / scale_y : step_y, - ] + y1 = np.linspace( + -newshape[0] / scale_x, + newshape[0] / scale_x - step_x, + num=newshape[0], + endpoint=False, + ).astype(np.float32) + x1 = np.linspace( + -newshape[1] / scale_y, + newshape[1] / scale_y - step_y, + num=newshape[1], + endpoint=False, + ).astype(np.float32) + + xi_mesh = np.meshgrid(x1, y1) + xi = np.empty((2, newshape[0], newshape[1]), dtype=np.float32) + xi[0, :, :] = xi_mesh[1] + xi[1, :, :] = xi_mesh[0] xi_size = xi.size xi = np.rollaxis(xi, 0, 3) xi = np.reshape(xi, [xi_size // 2, 2]) diff --git a/httomolibgpu/prep/stripe.py b/httomolibgpu/prep/stripe.py index 5aebfb41..d4a29908 100644 --- a/httomolibgpu/prep/stripe.py +++ b/httomolibgpu/prep/stripe.py @@ -35,6 +35,7 @@ "remove_all_stripe", ] + @nvtx.annotate() def remove_stripe_based_sorting( data: Union[cp.ndarray, np.ndarray], @@ -110,6 +111,7 @@ def _rs_sort(sinogram, 
size, dim): return xp.transpose(sino_corrected) + @nvtx.annotate() def remove_stripe_ti( data: Union[cp.ndarray, np.ndarray], @@ -146,7 +148,7 @@ def remove_stripe_ti( ######## Optimized version for Vo-all ring removal in tomopy######## - # This function is taken from TomoCuPy package +# This function is taken from TomoCuPy package # *************************************************************************** # # Copyright © 2022, UChicago Argonne, LLC # # All Rights Reserved # @@ -193,8 +195,8 @@ def remove_all_stripe( sm_size : int, optional Window size of the median filter to remove small-to-medium stripes. dim : {1, 2}, optional - Dimension of the window. - + Dimension of the window. + Returns ------- ndarray @@ -202,9 +204,9 @@ def remove_all_stripe( References ---------- - .. [1] https://doi.org/10.1364/OE.26.028396 + .. [1] https://doi.org/10.1364/OE.26.028396 - """ + """ matindex = _create_matindex(data.shape[2], data.shape[0]) for m in range(data.shape[1]): sino = data[:, m, :] @@ -213,6 +215,7 @@ def remove_all_stripe( data[:, m, :] = sino return data + @nvtx.annotate() def _rs_sort2(sinogram, size, matindex, dim): """ @@ -220,39 +223,41 @@ def _rs_sort2(sinogram, size, matindex, dim): """ sinogram = cp.transpose(sinogram) matcomb = cp.asarray(cp.dstack((matindex, sinogram))) - + # matsort = cp.asarray([row[row[:, 1].argsort()] for row in matcomb]) - ids = cp.argsort(matcomb[:,:,1],axis=1) + ids = cp.argsort(matcomb[:, :, 1], axis=1) matsort = matcomb.copy() - matsort[:,:,0] = cp.take_along_axis(matsort[:,:,0],ids,axis=1) - matsort[:,:,1] = cp.take_along_axis(matsort[:,:,1],ids,axis=1) + matsort[:, :, 0] = cp.take_along_axis(matsort[:, :, 0], ids, axis=1) + matsort[:, :, 1] = cp.take_along_axis(matsort[:, :, 1], ids, axis=1) if dim == 1: matsort[:, :, 1] = median_filter(matsort[:, :, 1], (size, 1)) else: matsort[:, :, 1] = median_filter(matsort[:, :, 1], (size, size)) - + # matsortback = cp.asarray([row[row[:, 0].argsort()] for row in matsort]) - - ids = cp.argsort(matsort[:,:,0],axis=1) + + ids = cp.argsort(matsort[:, :, 0], axis=1) matsortback = matsort.copy() - matsortback[:,:,0] = cp.take_along_axis(matsortback[:,:,0],ids,axis=1) - matsortback[:,:,1] = cp.take_along_axis(matsortback[:,:,1],ids,axis=1) - + matsortback[:, :, 0] = cp.take_along_axis(matsortback[:, :, 0], ids, axis=1) + matsortback[:, :, 1] = cp.take_along_axis(matsortback[:, :, 1], ids, axis=1) + sino_corrected = matsortback[:, :, 1] return cp.transpose(sino_corrected) + @nvtx.annotate() -def _mpolyfit(x,y): - n= len(x) +def _mpolyfit(x, y): + n = len(x) x_mean = cp.mean(x) y_mean = cp.mean(y) - - Sxy = cp.sum(x*y) - n*x_mean*y_mean - Sxx = cp.sum(x*x) - n*x_mean*x_mean - + + Sxy = cp.sum(x * y) - n * x_mean * y_mean + Sxx = cp.sum(x * x) - n * x_mean * x_mean + slope = Sxy / Sxx - intercept = y_mean - slope*x_mean - return slope,intercept + intercept = y_mean - slope * x_mean + return slope, intercept + @nvtx.annotate() def _detect_stripe(listdata, snr): @@ -264,8 +269,10 @@ def _detect_stripe(listdata, snr): xlist = cp.arange(0, numdata, 1.0) ndrop = cp.int16(0.25 * numdata) # (_slope, _intercept) = cp.polyfit(xlist[ndrop:-ndrop - 1], - # listsorted[ndrop:-ndrop - 1], 1) - (_slope, _intercept) = _mpolyfit(xlist[ndrop:-ndrop - 1], listsorted[ndrop:-ndrop - 1]) + # listsorted[ndrop:-ndrop - 1], 1) + (_slope, _intercept) = _mpolyfit( + xlist[ndrop : -ndrop - 1], listsorted[ndrop : -ndrop - 1] + ) numt1 = _intercept + _slope * xlist[-1] noiselevel = cp.abs(numt1 - _intercept) @@ -273,33 +280,34 @@ def 
_detect_stripe(listdata, snr): val1 = cp.abs(listsorted[0] - _intercept) / noiselevel val2 = cp.abs(listsorted[-1] - numt1) / noiselevel listmask = cp.zeros_like(listdata) - if (val1 >= snr): + if val1 >= snr: upper_thresh = _intercept + noiselevel * snr * 0.5 listmask[listdata > upper_thresh] = 1.0 - if (val2 >= snr): + if val2 >= snr: lower_thresh = numt1 - noiselevel * snr * 0.5 listmask[listdata <= lower_thresh] = 1.0 return listmask + @nvtx.annotate() def _rs_large(sinogram, snr, size, matindex, drop_ratio=0.1, norm=True): """ Remove large stripes. """ - drop_ratio = max(min(drop_ratio,0.8),0)# = cp.clip(drop_ratio, 0.0, 0.8) + drop_ratio = max(min(drop_ratio, 0.8), 0) # = cp.clip(drop_ratio, 0.0, 0.8) (nrow, ncol) = sinogram.shape ndrop = int(0.5 * drop_ratio * nrow) sinosort = cp.sort(sinogram, axis=0) sinosmooth = median_filter(sinosort, (1, size)) - list1 = cp.mean(sinosort[ndrop:nrow - ndrop], axis=0) - list2 = cp.mean(sinosmooth[ndrop:nrow - ndrop], axis=0) + list1 = cp.mean(sinosort[ndrop : nrow - ndrop], axis=0) + list2 = cp.mean(sinosmooth[ndrop : nrow - ndrop], axis=0) # listfact = cp.divide(list1, # list2, # out=cp.ones_like(list1), # where=list2 != 0) - - listfact = list1/list2 - + + listfact = list1 / list2 + # Locate stripes listmask = _detect_stripe(listfact, snr) listmask = binary_dilation(listmask, iterations=1).astype(listmask.dtype) @@ -309,25 +317,26 @@ def _rs_large(sinogram, snr, size, matindex, drop_ratio=0.1, norm=True): sinogram = sinogram / matfact sinogram1 = cp.transpose(sinogram) matcombine = cp.asarray(cp.dstack((matindex, sinogram1))) - + # matsort = cp.asarray([row[row[:, 1].argsort()] for row in matcombine]) - ids = cp.argsort(matcombine[:,:,1],axis=1) + ids = cp.argsort(matcombine[:, :, 1], axis=1) matsort = matcombine.copy() - matsort[:,:,0] = cp.take_along_axis(matsort[:,:,0],ids,axis=1) - matsort[:,:,1] = cp.take_along_axis(matsort[:,:,1],ids,axis=1) - + matsort[:, :, 0] = cp.take_along_axis(matsort[:, :, 0], ids, axis=1) + matsort[:, :, 1] = cp.take_along_axis(matsort[:, :, 1], ids, axis=1) + matsort[:, :, 1] = cp.transpose(sinosmooth) # matsortback = cp.asarray([row[row[:, 0].argsort()] for row in matsort]) - ids = cp.argsort(matsort[:,:,0],axis=1) + ids = cp.argsort(matsort[:, :, 0], axis=1) matsortback = matsort.copy() - matsortback[:,:,0] = cp.take_along_axis(matsortback[:,:,0],ids,axis=1) - matsortback[:,:,1] = cp.take_along_axis(matsortback[:,:,1],ids,axis=1) - + matsortback[:, :, 0] = cp.take_along_axis(matsortback[:, :, 0], ids, axis=1) + matsortback[:, :, 1] = cp.take_along_axis(matsortback[:, :, 1], ids, axis=1) + sino_corrected = cp.transpose(matsortback[:, :, 1]) listxmiss = cp.where(listmask > 0.0)[0] sinogram[:, listxmiss] = sino_corrected[:, listxmiss] return sinogram + @nvtx.annotate() def _rs_dead(sinogram, snr, size, matindex, norm=True): """ @@ -337,13 +346,12 @@ def _rs_dead(sinogram, snr, size, matindex, norm=True): (nrow, _) = sinogram.shape # sinosmooth = cp.apply_along_axis(uniform_filter1d, 0, sinogram, 10) sinosmooth = uniform_filter1d(sinogram, 10, axis=0) - + listdiff = cp.sum(cp.abs(sinogram - sinosmooth), axis=0) listdiffbck = median_filter(listdiff, size) - - - listfact = listdiff/listdiffbck - + + listfact = listdiff / listdiffbck + listmask = _detect_stripe(listfact, snr) listmask = binary_dilation(listmask, iterations=1).astype(listmask.dtype) listmask[0:2] = 0.0 @@ -351,20 +359,23 @@ def _rs_dead(sinogram, snr, size, matindex, norm=True): listx = cp.where(listmask < 1.0)[0] listy = cp.arange(nrow) matz = 
sinogram[:, listx] - - listxmiss = cp.where(listmask > 0.0)[0] - - # finter = interpolate.interp2d(listx.get(), listy.get(), matz.get(), kind='linear') + + listxmiss = cp.where(listmask > 0.0)[0] + + # finter = interpolate.interp2d(listx.get(), listy.get(), matz.get(), kind='linear') if len(listxmiss) > 0: - # sinogram_c[:, listxmiss.get()] = finter(listxmiss.get(), listy.get()) + # sinogram_c[:, listxmiss.get()] = finter(listxmiss.get(), listy.get()) ids = cp.searchsorted(listx, listxmiss) - sinogram[:,listxmiss] = matz[:,ids-1]+(listxmiss-listx[ids-1])*(matz[:,ids]-matz[:,ids-1])/(listx[ids]-listx[ids-1]) - + sinogram[:, listxmiss] = matz[:, ids - 1] + (listxmiss - listx[ids - 1]) * ( + matz[:, ids] - matz[:, ids - 1] + ) / (listx[ids] - listx[ids - 1]) + # Remove residual stripes if norm is True: sinogram = _rs_large(sinogram, snr, size, matindex) return sinogram + @nvtx.annotate() def _create_matindex(nrow, ncol): """ @@ -372,4 +383,4 @@ def _create_matindex(nrow, ncol): """ listindex = cp.arange(0.0, ncol, 1.0) matindex = cp.tile(listindex, (nrow, 1)) - return matindex \ No newline at end of file + return matindex.astype(np.float32) diff --git a/tests/test_misc/test_morph.py b/tests/test_misc/test_morph.py index 18a408b7..d03f8a5d 100644 --- a/tests/test_misc/test_morph.py +++ b/tests/test_misc/test_morph.py @@ -4,7 +4,7 @@ from cupy.cuda import nvtx import pytest from numpy.testing import assert_allclose -from httomolibgpu.misc.morph import sino_360_to_180, data_scaler +from httomolibgpu.misc.morph import sino_360_to_180, data_resampler @pytest.mark.parametrize( @@ -32,21 +32,22 @@ def test_sino_360_to_180_wrong_dims(ensure_clean_memory, shape): @pytest.mark.parametrize("axis", [0, 1, 2]) -def test_data_scaler(data, axis, ensure_clean_memory): - newshape = (200, 250) - scaled_data = data_scaler(data, newshape=newshape, axis=axis, method="linear").get() +def test_data_resampler(data, axis, ensure_clean_memory): + newshape = (60, 80) + scaled_data = data_resampler( + data, newshape=newshape, axis=axis, method="linear" + ).get() assert scaled_data.ndim == 3 if axis == 0: assert scaled_data.shape == (180, newshape[0], newshape[1]) - assert_allclose(np.max(scaled_data), 1113.2) + assert_allclose(np.max(scaled_data), 1111.7404) if axis == 1: assert scaled_data.shape == (newshape[0], 128, newshape[1]) - assert_allclose(np.max(scaled_data), 1118.0) + assert_allclose(np.max(scaled_data), 1102.0) if axis == 2: assert scaled_data.shape == (newshape[0], newshape[1], 160) - assert_allclose(np.max(scaled_data), 1115.5359) - assert_allclose(np.min(scaled_data), 0.0) + assert_allclose(np.max(scaled_data), 1113.2761) assert scaled_data.dtype == np.float32 assert scaled_data.flags.c_contiguous From 0395d31842da507d7121cf780c8f5050d292cf8b Mon Sep 17 00:00:00 2001 From: dkazanc Date: Thu, 8 Feb 2024 17:38:55 +0000 Subject: [PATCH 19/26] args name corrections --- httomolibgpu/misc/morph.py | 14 +++++++------- tests/test_misc/test_morph.py | 4 ++-- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/httomolibgpu/misc/morph.py b/httomolibgpu/misc/morph.py index c7bcf95b..93608dbe 100644 --- a/httomolibgpu/misc/morph.py +++ b/httomolibgpu/misc/morph.py @@ -96,18 +96,18 @@ def sino_360_to_180( @nvtx.annotate() def data_resampler( - data: cp.ndarray, newshape: tuple, axis: int = 1, method: str = "linear" + data: cp.ndarray, newshape: list, axis: int = 1, interpolation: str = "linear" ) -> cp.ndarray: """Down/Up-resampler of the input data implemented through interpn function. 
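    For example, a minimal call might be (a sketch only; the input volume here
    is assumed purely for illustration):

        import cupy as cp
        vol = cp.random.random((128, 128, 128), dtype=cp.float32)
        resampled = data_resampler(vol, newshape=[256, 256], axis=0)  # -> (128, 256, 256)
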
Please note that the method will leave the specified axis dimension unchanged, e.g. (128,128,128) -> (128,256,256) for axis = 0 and - newshape = (256,256). + newshape = [256,256]. Args: data (cp.ndarray): 3d cupy array. - newshape (tuple): 2d tuple that defines the new shape size. + newshape (list): 2d list that defines the 2D slice shape of new shape data. axis (int, optional): Axis along which the scaling is applied. Defaults to 1. - method (str, optional): Selection of interpn method for interpolation. Defaults to 'linear'. + interpolation (str, optional): Selection of interpolation method. Defaults to 'linear'. Raises: ValueError: When data is not 3D @@ -175,7 +175,7 @@ def data_resampler( points, data[j, :, :], xi, - method=method, + method=interpolation, bounds_error=False, fill_value=0.0, ) @@ -189,7 +189,7 @@ def data_resampler( points, data[:, j, :], xi, - method=method, + method=interpolation, bounds_error=False, fill_value=0.0, ) @@ -202,7 +202,7 @@ def data_resampler( points, data[:, :, j], xi, - method=method, + method=interpolation, bounds_error=False, fill_value=0.0, ) diff --git a/tests/test_misc/test_morph.py b/tests/test_misc/test_morph.py index d03f8a5d..d7b015a1 100644 --- a/tests/test_misc/test_morph.py +++ b/tests/test_misc/test_morph.py @@ -33,9 +33,9 @@ def test_sino_360_to_180_wrong_dims(ensure_clean_memory, shape): @pytest.mark.parametrize("axis", [0, 1, 2]) def test_data_resampler(data, axis, ensure_clean_memory): - newshape = (60, 80) + newshape = [60, 80] scaled_data = data_resampler( - data, newshape=newshape, axis=axis, method="linear" + data, newshape=newshape, axis=axis, interpolation="linear" ).get() assert scaled_data.ndim == 3 From 152ec1d7170d9365a7293337ca60cf3def3bddd1 Mon Sep 17 00:00:00 2001 From: dkazanc Date: Thu, 15 Feb 2024 10:23:56 +0000 Subject: [PATCH 20/26] adding find_center_pc and cucim dependency --- conda/environment.yml | 5 +- conda/recipe/meta.yaml | 1 + httomolibgpu/recon/rotation.py | 119 ++++++++++++++++++++++++------ pyproject.toml | 1 + tests/test_recon/test_rotation.py | 61 ++++++++++++++- 5 files changed, 159 insertions(+), 28 deletions(-) diff --git a/conda/environment.yml b/conda/environment.yml index 1f6bdc9e..d3f916db 100644 --- a/conda/environment.yml +++ b/conda/environment.yml @@ -4,8 +4,9 @@ channels: - anaconda - astra-toolbox - httomo + - rapidsai dependencies: - - conda-forge::cupy==12.3.0 + - conda-forge::cupy - conda-forge::numpy<=1.24 - conda-forge::nvtx - conda-forge::scipy @@ -22,4 +23,4 @@ dependencies: - anaconda::h5py - astra-toolbox::astra-toolbox - httomo::tomobar - + - rapidsai::cucim diff --git a/conda/recipe/meta.yaml b/conda/recipe/meta.yaml index 9079fa6f..57e825fd 100644 --- a/conda/recipe/meta.yaml +++ b/conda/recipe/meta.yaml @@ -23,6 +23,7 @@ requirements: - python - numpy - cupy + - cucim - astra-toolbox - nvtx - tomobar diff --git a/httomolibgpu/recon/rotation.py b/httomolibgpu/recon/rotation.py index 78f4e5b8..338782df 100644 --- a/httomolibgpu/recon/rotation.py +++ b/httomolibgpu/recon/rotation.py @@ -36,6 +36,7 @@ __all__ = [ "find_center_vo", "find_center_360", + "find_center_pc", ] @@ -45,7 +46,7 @@ def find_center_vo( ind: Optional[int] = None, smin: int = -50, smax: int = 50, - srad: float = 6., + srad: float = 6.0, step: float = 0.25, ratio: float = 0.5, drop: int = 20, @@ -114,6 +115,58 @@ def find_center_vo( return fine_cen.get() + +@nvtx.annotate() +def find_center_pc(proj1, proj2, tol=0.5, rotc_guess=None): + """ + Find rotation axis location by finding the offset between the first + 
projection and a mirrored projection 180 degrees apart using + phase correlation in Fourier space. + The ``phase_cross_correlation`` function uses cross-correlation in Fourier + space, optionally employing an upsampled matrix-multiplication DFT to + achieve arbitrary subpixel precision. :cite:`Guizar:08`. + + Parameters + ---------- + proj1 : cp.ndarray + Projection data + + proj2 : cp.ndarray + Projection data + + tol : scalar, optional + Subpixel accuracy + + rotc_guess : float, optional + Initial guess value for the rotation center + + Returns + ------- + float + Rotation axis location. + """ + from cucim.skimage.registration import phase_cross_correlation + + imgshift = 0.0 if rotc_guess is None else rotc_guess - (proj1.shape[1] - 1.0) / 2.0 + + proj1 = cpndi.shift(proj1, [0, -imgshift], mode="constant", cval=0) + proj2 = cpndi.shift(proj2, [0, -imgshift], mode="constant", cval=0) + + # create reflection of second projection + proj2 = cp.fliplr(proj2) + + # using cucim of rapids to do phase cross correlation between two images + shift = phase_cross_correlation( + reference_image=proj1, moving_image=proj2, upsample_factor=1.0 / tol + ) + + # Compute center of rotation as the center of first image and the + # registered translation with the second image + center = (proj1.shape[1] + shift[0][1] - 1.0) / 2.0 + + return center + imgshift + + @nvtx.annotate() def _search_coarse(sino, smin, smax, ratio, drop): (nrow, ncol) = sino.shape @@ -176,10 +229,10 @@ def _create_mask(nrow, ncol, radius, drop): block_x = 128 block_y = 1 block_dims = (block_x, block_y) - grid_x = (ncol//2+1 + block_x - 1) // block_x + grid_x = (ncol // 2 + 1 + block_x - 1) // block_x grid_y = nrow grid_dims = (grid_x, grid_y) - mask = cp.empty((nrow, ncol//2+1), dtype="uint16") + mask = cp.empty((nrow, ncol // 2 + 1), dtype="uint16") params = ( ncol, nrow, @@ -203,6 +256,7 @@ def round_up(x: float) -> int: else: return int(math.floor(x)) + def _get_available_gpu_memory() -> int: dev = cp.cuda.Device() # first, let's make some space @@ -212,12 +266,17 @@ def _get_available_gpu_memory() -> int: available_memory = dev.mem_info[0] + cp.get_default_memory_pool().free_bytes() return int(available_memory * 0.9) # 10% safety margin -def _calculate_chunks(nshifts: int, shift_size: int, available_memory: Optional[int] = None) -> List[int]: + +def _calculate_chunks( + nshifts: int, shift_size: int, available_memory: Optional[int] = None +) -> List[int]: if available_memory is None: available_memory = _get_available_gpu_memory() - + available_memory -= shift_size - freq_domain_size = shift_size # it needs only half (RFFT), but complex64, so it's the same + freq_domain_size = ( + shift_size # it needs only half (RFFT), but complex64, so it's the same + ) fft_plan_size = freq_domain_size size_per_shift = fft_plan_size + freq_domain_size + shift_size nshift_max = available_memory // size_per_shift @@ -250,42 +309,53 @@ def _calculate_metric(list_shift, sino1, sino2, sino3, mask, out): # note: we don't have to calculate the mean here, as we're only looking for minimum metric. # The sum is enough. masked_sum_abs_kernel = cp.ReductionKernel( - in_params="complex64 x, uint16 mask", # input, complex + mask - out_params="float32 out", # output, real + in_params="complex64 x, uint16 mask", # input, complex + mask + out_params="float32 out", # output, real map_expr="mask ? 
abs(x) : 0.0f", - reduce_expr="a + b", - post_map_expr="out = a", - identity='0.0f', + reduce_expr="a + b", + post_map_expr="out = a", + identity="0.0f", reduce_type="float", - name="masked_sum_abs", + name="masked_sum_abs", ) # determine how many shifts we can fit in the available memory # and iterate in chunks - chunks = _calculate_chunks(nshifts, (na1 + na2) * sino2.shape[1] * cp.float32().nbytes) + chunks = _calculate_chunks( + nshifts, (na1 + na2) * sino2.shape[1] * cp.float32().nbytes + ) mat = cp.empty((chunks[0], na1 + na2, sino2.shape[1]), dtype=cp.float32) mat[:, :na1, :] = sino1 # explicitly create FFT plan here, so it's not cached and clearly re-used - plan = cupyx.scipy.fftpack.get_fft_plan(mat, mat.shape[-2:], axes=(1, 2), value_type='R2C') + plan = cupyx.scipy.fftpack.get_fft_plan( + mat, mat.shape[-2:], axes=(1, 2), value_type="R2C" + ) - for i,stop_idx in enumerate(chunks): + for i, stop_idx in enumerate(chunks): if i > 0: # more than one iteration means we're tight on memory, so clear up freed blocks mat_freq = None cp.get_default_memory_pool().free_all_blocks() - - start_idx = 0 if i == 0 else chunks[i-1] + + start_idx = 0 if i == 0 else chunks[i - 1] size = stop_idx - start_idx - + # first, handle the integer shifts without spline in a raw kernel, # and shift in the sino3 one accordingly bx = 128 gx = (sino3.shape[1] + bx - 1) // bx shift_whole_shifts( - grid=(gx, na2, size), #### + grid=(gx, na2, size), #### block=(bx, 1, 1), - args=(sino2, sino3, list_shift[start_idx:stop_idx], mat[:, na1:, :], sino3.shape[1], na1 + na2), + args=( + sino2, + sino3, + list_shift[start_idx:stop_idx], + mat[:, na1:, :], + sino3.shape[1], + na1 + na2, + ), ) # now we can only look at the spline shifting, the rest is done @@ -301,10 +371,12 @@ def _calculate_metric(list_shift, sino1, sino2, sino3, mask, out): mat[i, na1:, :shift_int] = shifted[:, :shift_int] # stack and transform - # (we do the full sized mat FFT, even though the last chunk may be smaller, to + # (we do the full sized mat FFT, even though the last chunk may be smaller, to # make sure we can re-use the same FFT plan as before) mat_freq = cupyx.scipy.fft.rfft2(mat, axes=(1, 2), norm=None, plan=plan) - masked_sum_abs_kernel(mat_freq[:size, :, :], mask, out=out[start_idx:stop_idx], axis=(1, 2)) + masked_sum_abs_kernel( + mat_freq[:size, :, :], mask, out=out[start_idx:stop_idx], axis=(1, 2) + ) @nvtx.annotate() @@ -335,6 +407,7 @@ def _downsample(sino, level, axis): kernel(grid_dims, block_dims, params, shared_mem=shared_mem_bytes) return downsampled_data + # --- Center of rotation (COR) estimation method ---# @nvtx.annotate() def find_center_360( @@ -415,7 +488,7 @@ def find_center_360( def _find_overlap( mat1, mat2, win_width, side=None, denoise=True, norm=False, use_overlap=False -) : +): """ Find the overlap area and overlap side between two images (Ref. [1]) where the overlap side referring to the first image. 
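A minimal usage sketch for the new entry point, assuming a normalized projection stack spanning 0--180 degrees (the shapes are illustrative; the tests added below exercise the real data path):

    import cupy as cp
    import numpy as np
    from httomolibgpu.recon.rotation import find_center_pc

    # assumed normalized projections: (angles, det_y, det_x) over 0--180 degrees
    projs = cp.asarray(np.random.random_sample((180, 128, 160)), dtype=cp.float32)

    # correlate the first projection with the mirrored 180-degree projection
    cor = find_center_pc(projs[0, :, :], projs[-1, :, :], tol=0.5)
    print(cor)  # rotation axis location in detector pixels

Since upsample_factor is 1.0 / tol, a smaller tol buys finer subpixel accuracy at the cost of a larger upsampled DFT.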
diff --git a/pyproject.toml b/pyproject.toml index a4f193ed..282e66ac 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -39,6 +39,7 @@ requires-python = ">=3.9" dynamic = ["version"] dependencies = [ "cupy", + "cucim", "numpy", "scipy", "pillow", diff --git a/tests/test_recon/test_rotation.py b/tests/test_recon/test_rotation.py index ba5e22b5..93ee9871 100644 --- a/tests/test_recon/test_rotation.py +++ b/tests/test_recon/test_rotation.py @@ -4,13 +4,20 @@ from unittest import mock import cupy as cp from cupy.cuda import nvtx +from cupyx.scipy.ndimage import shift import numpy as np import pytest from httomolibgpu.prep.normalize import normalize -from httomolibgpu.recon.rotation import _calculate_chunks, find_center_360, find_center_vo +from httomolibgpu.recon.rotation import ( + _calculate_chunks, + find_center_360, + find_center_vo, + find_center_pc, +) from numpy.testing import assert_allclose from .rotation_cpu_reference import find_center_360_numpy + def test_find_center_vo(data, flats, darks): data = normalize(data, flats, darks) @@ -23,6 +30,7 @@ def test_find_center_vo(data, flats, darks): #: Check that we only get a float32 output assert cor.dtype == np.float32 + def test_find_center_vo_ones(ensure_clean_memory): mat = cp.ones(shape=(103, 450, 230), dtype=cp.float32) cor = find_center_vo(mat) @@ -36,7 +44,8 @@ def test_find_center_vo_random(ensure_clean_memory): data_host = np.random.random_sample(size=(900, 1, 1280)).astype(np.float32) * 2.0 data = cp.asarray(data_host, dtype=np.float32) cent = find_center_vo(data) - assert_allclose(cent, 680.75) + assert_allclose(cent, 680.75) + def test_find_center_vo_calculate_chunks(): # we need the split to fit into the available memory, and also make sure @@ -53,7 +62,9 @@ def test_find_center_vo_calculate_chunks(): # add a bit of randomness here, to check basic assumptions random.seed(123456) for _ in range(100): - available = random.randint(1*300+100, 100*300 + 100) # memory to fit anywhere between 1 and 100 shifts + available = random.randint( + 1 * 300 + 100, 100 * 300 + 100 + ) # memory to fit anywhere between 1 and 100 shifts nshifts = random.randint(1, 1000) chunks = _calculate_chunks(nshifts, 100, available) assert len(chunks) > 0 @@ -65,6 +76,7 @@ def test_find_center_vo_calculate_chunks(): np.testing.assert_array_equal(diffs[:-1], diffs[0]) assert diffs[-1] <= diffs[0] + @pytest.mark.perf def test_find_center_vo_performance(): dev = cp.cuda.Device() @@ -105,6 +117,7 @@ def test_find_center_360_data(data): assert side == 1 assert_allclose(overlap_pos, 111.906334, rtol=eps) + def test_find_center_360_1D_raises(data): #: 360-degree sinogram must be a 3d array with pytest.raises(ValueError): @@ -160,3 +173,45 @@ def test_find_center_360_performance(ensure_clean_memory): duration_ms = float(time.perf_counter_ns() - start) * 1e-6 / 10 assert "performance in ms" == duration_ms + + +def test_find_center_pc(data, flats, darks, ensure_clean_memory): + data = normalize(data, flats, darks)[:, :, 3] + shifted_data = shift(cp.fliplr(data), (0, 18.75), mode="reflect") + + # --- testing the center of rotation on tomo_standard ---# + cor = find_center_pc(data, shifted_data) + + assert_allclose(cor, 73.0, rtol=1e-7) + + +def test_find_center_pc2(data, flats, darks, ensure_clean_memory): + data = normalize(data, flats, darks) + proj1 = data[0, :, :] + proj2 = data[179, :, :] + + # --- testing the center of rotation on tomo_standard ---# + cor = find_center_pc(proj1, proj2) + + assert_allclose(cor, 79.5, rtol=1e-7) + + +@pytest.mark.perf +def 
test_find_center_pc_performance(ensure_clean_memory):
+    dev = cp.cuda.Device()
+    data_host = np.random.random_sample(size=(1801, 2560, 5)).astype(np.float32) * 2.0
+    data = cp.asarray(data_host, dtype=np.float32)[:, :, 3]
+    shifted_data = shift(cp.fliplr(data), (0, 18.75), mode="reflect")
+
+    # cold run first
+    find_center_pc(data, shifted_data)
+
+    start = time.perf_counter_ns()
+    nvtx.RangePush("Core")
+    for _ in range(10):
+        find_center_pc(data, shifted_data)
+    nvtx.RangePop()
+    dev.synchronize()
+    duration_ms = float(time.perf_counter_ns() - start) * 1e-6 / 10
+
+    assert "performance in ms" == duration_ms

From f01bb13b84b4b2b5819bf7100252c864cc6d60bf Mon Sep 17 00:00:00 2001
From: dkazanc
Date: Tue, 5 Mar 2024 14:47:32 +0000
Subject: [PATCH 21/26] env file cupy version fixed

---
 conda/environment.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/conda/environment.yml b/conda/environment.yml
index d3f916db..aa0974ff 100644
--- a/conda/environment.yml
+++ b/conda/environment.yml
@@ -6,7 +6,7 @@ channels:
   - httomo
   - rapidsai
 dependencies:
-  - conda-forge::cupy
+  - conda-forge::cupy=12.3.0
   - conda-forge::numpy<=1.24
   - conda-forge::nvtx
   - conda-forge::scipy

From bedb524f92e4491d104e325d2ebdbbace2f5b74f Mon Sep 17 00:00:00 2001
From: dkazanc
Date: Mon, 11 Mar 2024 11:09:29 +0000
Subject: [PATCH 22/26] adding frequency parameter for FBP and fixing tests

---
 httomolibgpu/recon/algorithm.py    | 6 +++++-
 tests/test_recon/test_algorithm.py | 7 ++++++-
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/httomolibgpu/recon/algorithm.py b/httomolibgpu/recon/algorithm.py
index c499b022..ad2110bc 100644
--- a/httomolibgpu/recon/algorithm.py
+++ b/httomolibgpu/recon/algorithm.py
@@ -48,6 +48,7 @@ def FBP(
     data: cp.ndarray,
     angles: np.ndarray,
     center: Optional[float] = None,
+    filter_freq_cutoff: Optional[float] = 0.6,
     recon_size: Optional[int] = None,
     recon_mask_radius: Optional[float] = None,
     gpu_id: int = 0,
@@ -64,6 +65,8 @@ def FBP(
         An array of angles given in radians.
     center : float, optional
         The center of rotation (CoR).
+    filter_freq_cutoff : float, optional
+        Cutoff frequency parameter for the sinc filter; lower values produce a sharper but noisier reconstruction.
     recon_size : int, optional
         The [recon_size, recon_size] shape of the reconstructed slice in pixels.
         By default (None), the reconstructed size will be the dimension of the horizontal detector.
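A sketch of the new parameter in use, assuming normalized projection data with illustrative values (the updated tests below pass filter_freq_cutoff explicitly in the same way):

    import cupy as cp
    import numpy as np
    from httomolibgpu.recon.algorithm import FBP

    # assumed normalized data: (angles, det_y, det_x)
    data = cp.asarray(np.random.random_sample((180, 128, 160)), dtype=cp.float32)
    angles = np.linspace(0.0, np.pi, data.shape[0])

    # lower cutoff -> sharper but noisier slices; 0.6 is the default
    recon = FBP(data, angles, center=79.5, filter_freq_cutoff=1.1)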
@@ -85,7 +88,8 @@ def FBP( reconstruction = RecToolsCP.FBP( data, - recon_mask_radius=recon_mask_radius, + cutoff_freq=filter_freq_cutoff, + recon_mask_radius=recon_mask_radius, data_axes_labels_order=input_data_axis_labels, ) cp._default_memory_pool.free_all_blocks() diff --git a/tests/test_recon/test_algorithm.py b/tests/test_recon/test_algorithm.py index 769e52f4..ddd846af 100644 --- a/tests/test_recon/test_algorithm.py +++ b/tests/test_recon/test_algorithm.py @@ -17,6 +17,7 @@ def test_reconstruct_FBP_1(data, flats, darks, ensure_clean_memory): normalize_cupy(data, flats, darks, cutoff=10, minus_log=True), np.linspace(0.0 * np.pi / 180.0, 180.0 * np.pi / 180.0, data.shape[0]), 79.5, + filter_freq_cutoff=1.1, ) assert recon_data.flags.c_contiguous recon_data = recon_data.get() @@ -33,6 +34,7 @@ def test_reconstruct_FBP_2(data, flats, darks, ensure_clean_memory): normalize_cupy(data, flats, darks, cutoff=20.5, minus_log=False), np.linspace(5.0 * np.pi / 360.0, 180.0 * np.pi / 360.0, data.shape[0]), 15.5, + filter_freq_cutoff=1.1, ) recon_data = recon_data.get() @@ -49,6 +51,7 @@ def test_reconstruct_FBP_3(data, flats, darks, ensure_clean_memory): normalize_cupy(data, flats, darks, cutoff=20.5, minus_log=False), np.linspace(5.0 * np.pi / 360.0, 180.0 * np.pi / 360.0, data.shape[0]), 79, # center + 1.1, # filter_freq_cutoff 210, # recon_size 0.9, # recon_mask_radius ) @@ -103,9 +106,11 @@ def test_FBP_performance(ensure_clean_memory): data = cp.asarray(data_host, dtype=np.float32) angles = np.linspace(0.0 * np.pi / 180.0, 180.0 * np.pi / 180.0, data.shape[0]) cor = 79.5 + filter_freq_cutoff=1.1 + # cold run first - FBP(data, angles, cor) + FBP(data, angles, cor, filter_freq_cutoff) dev.synchronize() start = time.perf_counter_ns() From 6da0f842ff858c44208b632835eecc40672e5ed2 Mon Sep 17 00:00:00 2001 From: dkazanc Date: Mon, 11 Mar 2024 16:33:27 +0000 Subject: [PATCH 23/26] work on median2d with cucim --- httomolibgpu/cuda_kernels/median_kernel.cu | 54 +++++++++++- httomolibgpu/misc/corr.py | 96 +++++++++++++++------- tests/test_misc/test_corr.py | 51 ++++++++---- 3 files changed, 154 insertions(+), 47 deletions(-) diff --git a/httomolibgpu/cuda_kernels/median_kernel.cu b/httomolibgpu/cuda_kernels/median_kernel.cu index 2c0febe3..a79db596 100644 --- a/httomolibgpu/cuda_kernels/median_kernel.cu +++ b/httomolibgpu/cuda_kernels/median_kernel.cu @@ -1,5 +1,5 @@ template -__global__ void median_general_kernel(const Type *in, Type *out, float dif, +__global__ void median_general_kernel3d(const Type *in, Type *out, float dif, int Z, int M, int N) { constexpr int radius = diameter / 2; constexpr int d3 = diameter * diameter * diameter; @@ -53,4 +53,56 @@ __global__ void median_general_kernel(const Type *in, Type *out, float dif, out[index] = fabsf(in_value - ValVec[midpoint]) >= dif ? 
ValVec[midpoint] : in_value; } +} + + +template +__global__ void median_general_kernel2d(const Type *in, Type *out, float dif, + int M, int N) { + constexpr int radius = diameter / 2; + constexpr int d3 = diameter * diameter; + constexpr int midpoint = d3 / 2; + + Type ValVec[d3]; + const long i = blockDim.x * blockIdx.x + threadIdx.x; + const long j = blockDim.y * blockIdx.y + threadIdx.y; + + if (i >= N || j >= M ) + return; + + int counter = 0; + for (int i_m = -radius; i_m <= radius; i_m++) { + long long i1 = i + i_m; // using long long to avoid integer overflows + if ((i1 < 0) || (i1 >= N)) + i1 = i; + for (int j_m = -radius; j_m <= radius; j_m++) { + long long j1 = j + j_m; + if ((j1 < 0) || (j1 >= M)) + j1 = j; + ValVec[counter] = in[i1 + N * j1]; + counter++; + } + } + + /* do bubble sort here */ + for (int x = 0; x < d3 - 1; x++) { + for (int y = 0; y < d3 - x - 1; y++) { + if (ValVec[y] > ValVec[y + 1]) { + Type temp = ValVec[y]; + ValVec[y] = ValVec[y + 1]; + ValVec[y + 1] = temp; + } + } + } + + /* perform median filtration */ + long long index = static_cast(i) + N * static_cast(j); + if (dif == 0.0f) + out[index] = ValVec[midpoint]; + else { + /* perform dezingering */ + Type in_value = in[index]; + out[index] = + fabsf(in_value - ValVec[midpoint]) >= dif ? ValVec[midpoint] : in_value; + } } \ No newline at end of file diff --git a/httomolibgpu/misc/corr.py b/httomolibgpu/misc/corr.py index 9d9b8ad4..702a99f2 100644 --- a/httomolibgpu/misc/corr.py +++ b/httomolibgpu/misc/corr.py @@ -28,26 +28,34 @@ import numpy as np import nvtx +from cucim.skimage.filters import median +from cucim.skimage.morphology import disk + from httomolibgpu.cuda_kernels import load_cuda_module __all__ = [ - "median_filter3d", - "remove_outlier3d", + "median_filter", + "remove_outlier", ] @nvtx.annotate() -def median_filter3d( - data: cp.ndarray, kernel_size: int = 3, dif: float = 0.0 +def median_filter( + data: cp.ndarray, + kernel_size: int = 3, + axis: int = 0, + dif: float = 0.0, ) -> cp.ndarray: """ - Apply 3D median or dezinger (when dif>0) filter to a 3D array. + Apply 2D or 3D median or dezinger (when dif>0) filter to a 3D array. Parameters ---------- data : cp.ndarray Input CuPy 3D array either float32 or uint16 data type. kernel_size : int, optional - The size of the filter's kernel. + The size of the filter's kernel (a diameter). + axis (int, optional): + Axis along which the 2D filter kernel should be applied. If set to None, then the kernel is 3D. dif : float, optional Expected difference value between outlier value and the median value of the array, leave equal to 0 for classical median. 
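To illustrate the intended axis semantics, a sketch with an assumed random volume (axis selects a 2D slice-by-slice filter; None keeps the original 3D behaviour):

    import cupy as cp
    from httomolibgpu.misc.corr import median_filter

    vol = cp.random.random((128, 128, 128), dtype=cp.float32)

    filt2d = median_filter(vol, kernel_size=3, axis=0)     # cucim 2D disk kernel per slice
    filt3d = median_filter(vol, kernel_size=3, axis=None)  # full 3D CUDA kernel, as before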
@@ -76,31 +84,55 @@ def median_filter3d( if kernel_size not in [3, 5, 7, 9, 11, 13]: raise ValueError("Please select a correct kernel size: 3, 5, 7, 9, 11, 13") - kernel_args = "median_general_kernel<{0}, {1}>".format( - "float" if input_type == "float32" else "unsigned short", kernel_size - ) - median_module = load_cuda_module("median_kernel", name_expressions=[kernel_args]) - median3d = median_module.get_function(kernel_args) - - out = cp.empty(data.shape, dtype=input_type, order="C") + if axis not in [0, 1, 2, None]: + raise ValueError("The axis should be 0,1,2 or None for full 3d processing") dz, dy, dx = data.shape - # setting grid/block parameters - block_x = 128 - block_dims = (block_x, 1, 1) - grid_x = (dx + block_x - 1) // block_x - grid_y = dy - grid_z = dz - grid_dims = (grid_x, grid_y, grid_z) - - params = (data, out, dif, dz, dy, dx) - - median3d(grid_dims, block_dims, params) - - return out - -def remove_outlier3d( - data: cp.ndarray, kernel_size: int = 3, dif: float = 0.1 + output = cp.empty(data.shape, dtype=input_type, order="C") + + if axis == 0: + for j in range(dz): + median(data[j, :, :],footprint=disk(kernel_size // 2), out=output[j, :, :]) + elif axis == 1: + for j in range(dy): + median(data[:, j, :],footprint=disk(kernel_size // 2), out=output[:, j, :]) + elif axis == 2: + for j in range(dx): + median(data[:, :, j],footprint=disk(kernel_size // 2), out=output[:, :, j]) + else: + kernel_args = "median_general_kernel3d<{0}, {1}>".format( + "float" if input_type == "float32" else "unsigned short", kernel_size + ) + block_x = 128 + # setting grid/block parameters + block_dims = (block_x, 1, 1) + grid_x = (dx + block_x - 1) // block_x + grid_y = dy + grid_z = dz + grid_dims = (grid_x, grid_y, grid_z) + params = (data, output, dif, dz, dy, dx) + + median_module = load_cuda_module("median_kernel", name_expressions=[kernel_args]) + median_filt = median_module.get_function(kernel_args) + + median_filt(grid_dims, block_dims, params) + if axis is not None: + # perform dezingering + kernel_name = "thresholding" + kernel = r""" + float dif_curr = abs(float(data) - float(output)); + if (dif_curr < dif) { + output = data; + } + """ + print("boo") + return output + +def remove_outlier( + data: cp.ndarray, + kernel_size: int = 3, + axis: int = 0, + dif: float = 0.1 ) -> cp.ndarray: """ Selectively applies 3D median filter to a 3D array to remove outliers. Also called a dezinger. @@ -110,7 +142,9 @@ def remove_outlier3d( data : cp.ndarray Input CuPy 3D array either float32 or uint16 data type. kernel_size : int, optional - The size of the filter's kernel. + The size of the filter's kernel (a diameter). + axis (int, optional): + Axis along which the 2D filter kernel should be applied. If set to None, then the kernel is 3D. dif : float, optional Expected difference value between outlier value and the median value of the array. @@ -125,4 +159,4 @@ def remove_outlier3d( ValueError If the input array is not three dimensional. 
""" - return median_filter3d(data=data, kernel_size=kernel_size, dif=dif) + return median_filter(data=data, kernel_size=kernel_size, axis=axis, dif=dif) diff --git a/tests/test_misc/test_corr.py b/tests/test_misc/test_corr.py index 49b425af..e7321870 100644 --- a/tests/test_misc/test_corr.py +++ b/tests/test_misc/test_corr.py @@ -7,8 +7,8 @@ from cupy.cuda import nvtx from cupyx.scipy.ndimage import median_filter as median_filter_cupy from httomolibgpu.misc.corr import ( - median_filter3d, - remove_outlier3d, + median_filter, + remove_outlier, ) from numpy.testing import assert_allclose, assert_equal @@ -19,14 +19,14 @@ def test_median_filter3d_vs_scipy_on_arange(ensure_clean_memory): mat = np.arange(4 * 5 * 6).reshape(4, 5, 6) assert_equal( scipy.ndimage.median_filter(np.float32(mat), size=3), - median_filter3d(cp.asarray(mat, dtype=cp.float32), kernel_size=3).get(), + median_filter(cp.asarray(mat, dtype=cp.float32), kernel_size=3, axis=None).get(), ) def test_median_filter3d_vs_scipy(host_data, ensure_clean_memory): assert_equal( scipy.ndimage.median_filter(np.float32(host_data), size=3), - median_filter3d(cp.asarray(host_data, dtype=cp.float32), kernel_size=3).get(), + median_filter(cp.asarray(host_data, dtype=cp.float32), kernel_size=3, axis=None).get(), ) @@ -36,9 +36,10 @@ def test_median_filter3d_benchmark( host_data, ensure_clean_memory, kernel_size, benchmark ): benchmark( - median_filter3d, + median_filter, cp.asarray(host_data, dtype=cp.float32), kernel_size=kernel_size, + axis=None ) @@ -51,27 +52,27 @@ def test_scipy_median_filter_benchmark(data, ensure_clean_memory, benchmark, siz def test_median_filter3d_1D_raises(ensure_clean_memory): _data = cp.ones(10) with pytest.raises(ValueError): - median_filter3d(_data, kernel_size=3) + median_filter(_data, kernel_size=3) def test_median_filter3d_zero_dim(ensure_clean_memory): _data = cp.ones(shape=(10, 10, 0)) * 100 with pytest.raises(ValueError): - median_filter3d(_data, kernel_size=3) + median_filter(_data, kernel_size=3) def test_median_filter3d_even_kernel_size(data): with pytest.raises(ValueError): - median_filter3d(data, kernel_size=4) + median_filter(data, kernel_size=4) def test_median_filter3d_wrong_dtype(data): with pytest.raises(ValueError): - median_filter3d(data.astype(cp.float64), kernel_size=3) + median_filter(data.astype(cp.float64), kernel_size=3) def test_median_filter3d(data): - filtered_data = median_filter3d(data, kernel_size=3).get() + filtered_data = median_filter(data, kernel_size=3, axis=None).get() assert filtered_data.ndim == 3 assert_allclose(np.mean(filtered_data), 808.753494, rtol=eps) @@ -83,10 +84,30 @@ def test_median_filter3d(data): assert filtered_data.flags.c_contiguous assert ( - median_filter3d(data.astype(cp.float32), kernel_size=5, dif=1.5).get().dtype + median_filter(data.astype(cp.float32), kernel_size=5, axis=None, dif=1.5).get().dtype == np.float32 ) +@pytest.mark.parametrize("axis", [0, 1, 2]) +def test_median_filter2d_axes(data, axis): + filtered_data = median_filter(data, kernel_size=3, axis=axis).get() + + assert filtered_data.ndim == 3 + assert filtered_data.dtype == np.uint16 + assert filtered_data.flags.c_contiguous + +def test_median_filter2d(data): + filtered_data = median_filter(data, kernel_size=3, axis=0).get() + + assert filtered_data.ndim == 3 + assert_allclose(np.mean(filtered_data), 808.86170708, rtol=eps) + assert_allclose(np.mean(filtered_data, axis=(1, 2)).sum(), 145595.1072753) + assert_allclose(np.max(filtered_data), 1080) + assert_allclose(np.min(filtered_data), 80) + 
+ assert filtered_data.dtype == np.uint16 + assert filtered_data.flags.c_contiguous + @pytest.mark.perf def test_median_filter3d_performance(ensure_clean_memory): @@ -95,13 +116,13 @@ def test_median_filter3d_performance(ensure_clean_memory): data = cp.asarray(data_host, dtype=cp.float32) # warm up - median_filter3d(data, kernel_size=3) + median_filter(data, kernel_size=3, axis=None) dev.synchronize() start = time.perf_counter_ns() nvtx.RangePush("Core") for _ in range(10): - median_filter3d(data, kernel_size=3) + median_filter(data, kernel_size=3, axis=None) nvtx.RangePop() dev.synchronize() duration_ms = float(time.perf_counter_ns() - start) * 1e-6 / 10 @@ -110,7 +131,7 @@ def test_median_filter3d_performance(ensure_clean_memory): def test_remove_outlier3d(data): - filtered_data = remove_outlier3d(data, kernel_size=3, dif=1.5).get() + filtered_data = remove_outlier(data, kernel_size=3, axis=None, dif=1.5).get() assert filtered_data.ndim == 3 assert_allclose(np.mean(filtered_data), 808.753494, rtol=eps) @@ -121,7 +142,7 @@ def test_remove_outlier3d(data): assert filtered_data.dtype == np.uint16 assert ( - remove_outlier3d(data.astype(cp.float32), kernel_size=5, dif=1.5).get().dtype + remove_outlier(data.astype(cp.float32), kernel_size=5, dif=1.5).get().dtype == np.float32 ) assert filtered_data.flags.c_contiguous From 46bbb1b9e5b9b71c82b701bbb16dc4ce93911e46 Mon Sep 17 00:00:00 2001 From: dkazanc Date: Mon, 11 Mar 2024 16:36:49 +0000 Subject: [PATCH 24/26] kernel changes --- httomolibgpu/cuda_kernels/median_kernel.cu | 52 ---------------------- 1 file changed, 52 deletions(-) diff --git a/httomolibgpu/cuda_kernels/median_kernel.cu b/httomolibgpu/cuda_kernels/median_kernel.cu index a79db596..570d40c1 100644 --- a/httomolibgpu/cuda_kernels/median_kernel.cu +++ b/httomolibgpu/cuda_kernels/median_kernel.cu @@ -54,55 +54,3 @@ __global__ void median_general_kernel3d(const Type *in, Type *out, float dif, fabsf(in_value - ValVec[midpoint]) >= dif ? ValVec[midpoint] : in_value; } } - - -template -__global__ void median_general_kernel2d(const Type *in, Type *out, float dif, - int M, int N) { - constexpr int radius = diameter / 2; - constexpr int d3 = diameter * diameter; - constexpr int midpoint = d3 / 2; - - Type ValVec[d3]; - const long i = blockDim.x * blockIdx.x + threadIdx.x; - const long j = blockDim.y * blockIdx.y + threadIdx.y; - - if (i >= N || j >= M ) - return; - - int counter = 0; - for (int i_m = -radius; i_m <= radius; i_m++) { - long long i1 = i + i_m; // using long long to avoid integer overflows - if ((i1 < 0) || (i1 >= N)) - i1 = i; - for (int j_m = -radius; j_m <= radius; j_m++) { - long long j1 = j + j_m; - if ((j1 < 0) || (j1 >= M)) - j1 = j; - ValVec[counter] = in[i1 + N * j1]; - counter++; - } - } - - /* do bubble sort here */ - for (int x = 0; x < d3 - 1; x++) { - for (int y = 0; y < d3 - x - 1; y++) { - if (ValVec[y] > ValVec[y + 1]) { - Type temp = ValVec[y]; - ValVec[y] = ValVec[y + 1]; - ValVec[y + 1] = temp; - } - } - } - - /* perform median filtration */ - long long index = static_cast(i) + N * static_cast(j); - if (dif == 0.0f) - out[index] = ValVec[midpoint]; - else { - /* perform dezingering */ - Type in_value = in[index]; - out[index] = - fabsf(in_value - ValVec[midpoint]) >= dif ? 
ValVec[midpoint] : in_value; - } -} \ No newline at end of file From cbc94eb1d52d6805c1b5bd87e31d704d8adc314a Mon Sep 17 00:00:00 2001 From: dkazanc Date: Tue, 12 Mar 2024 10:38:38 +0000 Subject: [PATCH 25/26] working median/outlier2d version --- httomolibgpu/misc/corr.py | 48 ++++++++++++++++++++---------- httomolibgpu/prep/normalize.py | 3 +- tests/test_misc/test_corr.py | 33 ++++++++++++++++---- tests/test_recon/test_algorithm.py | 5 ++-- 4 files changed, 63 insertions(+), 26 deletions(-) diff --git a/httomolibgpu/misc/corr.py b/httomolibgpu/misc/corr.py index 702a99f2..040cc4e6 100644 --- a/httomolibgpu/misc/corr.py +++ b/httomolibgpu/misc/corr.py @@ -27,6 +27,7 @@ from typing import Tuple import numpy as np import nvtx +from cupy import float32 from cucim.skimage.filters import median from cucim.skimage.morphology import disk @@ -38,12 +39,13 @@ "remove_outlier", ] + @nvtx.annotate() def median_filter( data: cp.ndarray, kernel_size: int = 3, axis: int = 0, - dif: float = 0.0, + dif: float = 0.0, ) -> cp.ndarray: """ Apply 2D or 3D median or dezinger (when dif>0) filter to a 3D array. @@ -90,19 +92,20 @@ def median_filter( dz, dy, dx = data.shape output = cp.empty(data.shape, dtype=input_type, order="C") - if axis == 0: + if axis == 0: for j in range(dz): - median(data[j, :, :],footprint=disk(kernel_size // 2), out=output[j, :, :]) + median(data[j, :, :], footprint=disk(kernel_size // 2), out=output[j, :, :]) elif axis == 1: for j in range(dy): - median(data[:, j, :],footprint=disk(kernel_size // 2), out=output[:, j, :]) + median(data[:, j, :], footprint=disk(kernel_size // 2), out=output[:, j, :]) elif axis == 2: for j in range(dx): - median(data[:, :, j],footprint=disk(kernel_size // 2), out=output[:, :, j]) + median(data[:, :, j], footprint=disk(kernel_size // 2), out=output[:, :, j]) else: + # 3d median or dezinger kernel_args = "median_general_kernel3d<{0}, {1}>".format( "float" if input_type == "float32" else "unsigned short", kernel_size - ) + ) block_x = 128 # setting grid/block parameters block_dims = (block_x, 1, 1) @@ -112,27 +115,36 @@ def median_filter( grid_dims = (grid_x, grid_y, grid_z) params = (data, output, dif, dz, dy, dx) - median_module = load_cuda_module("median_kernel", name_expressions=[kernel_args]) + median_module = load_cuda_module( + "median_kernel", name_expressions=[kernel_args] + ) median_filt = median_module.get_function(kernel_args) median_filt(grid_dims, block_dims, params) - if axis is not None: - # perform dezingering + + if axis is not None and dif > 0: + # 2d dezingering enabled kernel_name = "thresholding" kernel = r""" float dif_curr = abs(float(data) - float(output)); if (dif_curr < dif) { output = data; - } + } """ - print("boo") + thresholding_kernel = cp.ElementwiseKernel( + "T data, raw float32 dif", + "T output", + kernel, + kernel_name, + options=("-std=c++11",), + no_return=True, + ) + thresholding_kernel(data, float32(dif), output) return output + def remove_outlier( - data: cp.ndarray, - kernel_size: int = 3, - axis: int = 0, - dif: float = 0.1 + data: cp.ndarray, kernel_size: int = 3, axis: int = 0, dif: float = 0.1 ) -> cp.ndarray: """ Selectively applies 3D median filter to a 3D array to remove outliers. Also called a dezinger. @@ -157,6 +169,10 @@ def remove_outlier( Raises ------ ValueError - If the input array is not three dimensional. + Threshold value (dif) must be positive and nonzero. 
""" + + if dif <= 0.0: + raise ValueError("Threshold value (dif) must be positive and nonzero.") + return median_filter(data=data, kernel_size=kernel_size, axis=axis, dif=dif) diff --git a/httomolibgpu/prep/normalize.py b/httomolibgpu/prep/normalize.py index 2c5b87f1..0287e07e 100644 --- a/httomolibgpu/prep/normalize.py +++ b/httomolibgpu/prep/normalize.py @@ -28,6 +28,7 @@ __all__ = ["normalize"] + @nvtx.annotate() def normalize( data: cp.ndarray, @@ -94,7 +95,7 @@ def normalize( kernel += "out = v;\n" normalisation_kernel = cp.ElementwiseKernel( - "T data, U flats, V darks, raw float32 cutoff", + "T data, U flats, U darks, raw float32 cutoff", "float32 out", kernel, kernel_name, diff --git a/tests/test_misc/test_corr.py b/tests/test_misc/test_corr.py index e7321870..acf273e3 100644 --- a/tests/test_misc/test_corr.py +++ b/tests/test_misc/test_corr.py @@ -19,14 +19,18 @@ def test_median_filter3d_vs_scipy_on_arange(ensure_clean_memory): mat = np.arange(4 * 5 * 6).reshape(4, 5, 6) assert_equal( scipy.ndimage.median_filter(np.float32(mat), size=3), - median_filter(cp.asarray(mat, dtype=cp.float32), kernel_size=3, axis=None).get(), + median_filter( + cp.asarray(mat, dtype=cp.float32), kernel_size=3, axis=None + ).get(), ) def test_median_filter3d_vs_scipy(host_data, ensure_clean_memory): assert_equal( scipy.ndimage.median_filter(np.float32(host_data), size=3), - median_filter(cp.asarray(host_data, dtype=cp.float32), kernel_size=3, axis=None).get(), + median_filter( + cp.asarray(host_data, dtype=cp.float32), kernel_size=3, axis=None + ).get(), ) @@ -39,7 +43,7 @@ def test_median_filter3d_benchmark( median_filter, cp.asarray(host_data, dtype=cp.float32), kernel_size=kernel_size, - axis=None + axis=None, ) @@ -84,17 +88,21 @@ def test_median_filter3d(data): assert filtered_data.flags.c_contiguous assert ( - median_filter(data.astype(cp.float32), kernel_size=5, axis=None, dif=1.5).get().dtype + median_filter(data.astype(cp.float32), kernel_size=5, axis=None, dif=1.5) + .get() + .dtype == np.float32 ) + @pytest.mark.parametrize("axis", [0, 1, 2]) def test_median_filter2d_axes(data, axis): filtered_data = median_filter(data, kernel_size=3, axis=axis).get() assert filtered_data.ndim == 3 assert filtered_data.dtype == np.uint16 - assert filtered_data.flags.c_contiguous + assert filtered_data.flags.c_contiguous + def test_median_filter2d(data): filtered_data = median_filter(data, kernel_size=3, axis=0).get() @@ -106,7 +114,7 @@ def test_median_filter2d(data): assert_allclose(np.min(filtered_data), 80) assert filtered_data.dtype == np.uint16 - assert filtered_data.flags.c_contiguous + assert filtered_data.flags.c_contiguous @pytest.mark.perf @@ -146,3 +154,16 @@ def test_remove_outlier3d(data): == np.float32 ) assert filtered_data.flags.c_contiguous + + +def test_remove_outlier2d(data): + filtered_data = remove_outlier(data, kernel_size=3, axis=0, dif=1.5).get() + + assert filtered_data.ndim == 3 + assert_allclose(np.mean(filtered_data), 808.861578504, rtol=eps) + assert_allclose(np.mean(filtered_data, axis=(1, 2)).sum(), 145595.0841308) + assert_allclose(np.max(filtered_data), 1080) + assert_allclose(np.min(filtered_data), 80) + + assert filtered_data.dtype == np.uint16 + assert filtered_data.flags.c_contiguous diff --git a/tests/test_recon/test_algorithm.py b/tests/test_recon/test_algorithm.py index ddd846af..0359a050 100644 --- a/tests/test_recon/test_algorithm.py +++ b/tests/test_recon/test_algorithm.py @@ -51,7 +51,7 @@ def test_reconstruct_FBP_3(data, flats, darks, ensure_clean_memory): 
normalize_cupy(data, flats, darks, cutoff=20.5, minus_log=False), np.linspace(5.0 * np.pi / 360.0, 180.0 * np.pi / 360.0, data.shape[0]), 79, # center - 1.1, # filter_freq_cutoff + 1.1, # filter_freq_cutoff 210, # recon_size 0.9, # recon_mask_radius ) @@ -106,8 +106,7 @@ def test_FBP_performance(ensure_clean_memory): data = cp.asarray(data_host, dtype=np.float32) angles = np.linspace(0.0 * np.pi / 180.0, 180.0 * np.pi / 180.0, data.shape[0]) cor = 79.5 - filter_freq_cutoff=1.1 - + filter_freq_cutoff = 1.1 # cold run first FBP(data, angles, cor, filter_freq_cutoff) From e9b086152f65f89111b704ef1f29fa2ba8bd7a0d Mon Sep 17 00:00:00 2001 From: Daniil Kazantsev Date: Wed, 17 Apr 2024 14:25:35 +0100 Subject: [PATCH 26/26] making log in the normalisation true by default --- httomolibgpu/prep/normalize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/httomolibgpu/prep/normalize.py b/httomolibgpu/prep/normalize.py index 0287e07e..54d09010 100644 --- a/httomolibgpu/prep/normalize.py +++ b/httomolibgpu/prep/normalize.py @@ -35,7 +35,7 @@ def normalize( flats: cp.ndarray, darks: cp.ndarray, cutoff: float = 10.0, - minus_log: bool = False, + minus_log: bool = True, nonnegativity: bool = False, remove_nans: bool = False, ) -> cp.ndarray:
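With this change a plain call now returns -log-transformed data; a before/after sketch with assumed uint16 fixtures:

    import cupy as cp
    from httomolibgpu.prep.normalize import normalize

    # assumed raw projections and reference fields: (angles, det_y, det_x)
    data = cp.random.randint(50, 1000, size=(180, 128, 160)).astype(cp.uint16)
    flats = cp.random.randint(900, 1000, size=(20, 128, 160)).astype(cp.uint16)
    darks = cp.random.randint(0, 50, size=(20, 128, 160)).astype(cp.uint16)

    attenuation = normalize(data, flats, darks)                    # minus_log=True is now the default
    transmission = normalize(data, flats, darks, minus_log=False)  # restores the previous behaviour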