diff --git a/CHANGELOG.md b/CHANGELOG.md
index 106b7c3..38f8053 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,7 +4,18 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
-## [2.0.0] (05/DD/2025)
+## [dev] (MM/DD/YYYY)
+
+### Added
+
+### Changed
+* Dropped support for `scipy.fftpack` interface [gh-185](https://github.com/IntelPython/mkl_fft/pull/185)
+* Dropped support for `overwrite_x` parameter in `mkl_fft` [gh-185](https://github.com/IntelPython/mkl_fft/pull/185)
+
+### Fixed
+* Fixed a bug for N-D FFTs when both `s` and `out` are given [gh-185](https://github.com/IntelPython/mkl_fft/pull/185)
+
+## [2.0.0] (06/DD/2025)
 
 ### Added
 * Added Hermitian FFT functions to SciPy interface `mkl_fft.interfaces.scipy_fft`: `hfft`, `ihfft`, `hfftn`, `ihfftn`, `hfft2`, and `ihfft2` [gh-161](https://github.com/IntelPython/mkl_fft/pull/161)
@@ -17,8 +28,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 * SciPy interface `mkl_fft.interfaces.scipy_fft` uses the same function from SciPy for handling `s` and `axes` for N-D FFTs [gh-181](https://github.com/IntelPython/mkl_fft/pull/181)
 
 ### Fixed
-* Fixed an issue for calling `mkl_fft.interfaces.numpy.fftn` with an empty axes [gh-139](https://github.com/IntelPython/mkl_fft/pull/139)
-* Fixed an issue for calling `mkl_fft.interfaces.numpy.fftn` with a zero-size array [gh-139](https://github.com/IntelPython/mkl_fft/pull/139)
+* Fixed a bug in `mkl_fft.interfaces.numpy.fftn` when an empty tuple is passed for `axes` [gh-139](https://github.com/IntelPython/mkl_fft/pull/139)
+* Fixed a bug for a case when a zero-size array is passed to `mkl_fft.interfaces.numpy.fftn` [gh-139](https://github.com/IntelPython/mkl_fft/pull/139)
 * Fixed inconsistency of input and output arrays dtype for `irfft` function [gh-180](https://github.com/IntelPython/mkl_fft/pull/180)
 * Fixed issues with `set_workers` function in SciPy interface `mkl_fft.interfaces.scipy_fft` [gh-183](https://github.com/IntelPython/mkl_fft/pull/183)
 
diff --git a/README.md b/README.md
index 8f865fd..e7637db 100644
--- a/README.md
+++ b/README.md
@@ -52,11 +52,11 @@ While using the interfaces module is the recommended way to leverage `mk_fft`, o
 
 ### complex-to-complex (c2c) transforms:
 
-`fft(x, n=None, axis=-1, overwrite_x=False, fwd_scale=1.0, out=None)` - 1D FFT, similar to `scipy.fft.fft`
+`fft(x, n=None, axis=-1, fwd_scale=1.0, out=None)` - 1D FFT, similar to `numpy.fft.fft`
 
-`fft2(x, s=None, axes=(-2, -1), overwrite_x=False, fwd_scale=1.0, out=None)` - 2D FFT, similar to `scipy.fft.fft2`
+`fft2(x, s=None, axes=(-2, -1), fwd_scale=1.0, out=None)` - 2D FFT, similar to `numpy.fft.fft2`
 
-`fftn(x, s=None, axes=None, overwrite_x=False, fwd_scale=1.0, out=None)` - ND FFT, similar to `scipy.fft.fftn`
+`fftn(x, s=None, axes=None, fwd_scale=1.0, out=None)` - ND FFT, similar to `numpy.fft.fftn`
 
 and similar inverse FFT (`ifft*`) functions.
 
diff --git a/mkl_fft/__init__.py b/mkl_fft/__init__.py
index 5cfada7..04586ea 100644
--- a/mkl_fft/__init__.py
+++ b/mkl_fft/__init__.py
@@ -39,7 +39,6 @@
     rfft2,
     rfftn,
 )
-from ._pydfti import irfftpack, rfftpack  # pylint: disable=no-name-in-module
 from ._version import __version__
 
 import mkl_fft.interfaces  # isort: skip
@@ -51,8 +50,6 @@
     "ifft2",
     "fftn",
     "ifftn",
-    "rfftpack",
-    "irfftpack",
     "rfft",
     "irfft",
     "rfft2",
diff --git a/mkl_fft/_fft_utils.py b/mkl_fft/_fft_utils.py
index 8a6189e..160d64b 100644
--- a/mkl_fft/_fft_utils.py
+++ b/mkl_fft/_fft_utils.py
@@ -262,23 +262,43 @@ def _iter_fftnd(
     axes=None,
     out=None,
     direction=+1,
-    overwrite_x=False,
-    scale_function=lambda n, ind: 1.0,
+    scale_function=lambda ind: 1.0,
 ):
     a = np.asarray(a)
     s, axes = _init_nd_shape_and_axes(a, s, axes)
-    ovwr = overwrite_x
-    for ii in reversed(range(len(axes))):
+
+    # Combine the two, but in reverse, to end with the first axis given.
+    axes_and_s = list(zip(axes, s))[::-1]
+    # We try to use in-place calculations where possible, which is
+    # everywhere except when the size changes after the first FFT.
+    size_changes = [axis for axis, n in axes_and_s[1:] if a.shape[axis] != n]
+
+    # If there are any size changes, we cannot use out
+    res = None if size_changes else out
+    for ind, (axis, n) in enumerate(axes_and_s):
+        if axis in size_changes:
+            if axis == size_changes[-1]:
+                # Last size change, so any output should now be OK
+                # (an error will be raised if not), and if no output is
+                # required, we want a freshly allocated array of the right size.
+                res = out
+            elif res is not None and n < res.shape[axis]:
+                # For an intermediate step where we return fewer elements, we
+                # can use a smaller view of the previous array.
+                res = res[(slice(None),) * axis + (slice(n),)]
+            else:
+                # If we need more elements, we cannot use res.
+                res = None
         a = _c2c_fft1d_impl(
             a,
-            n=s[ii],
-            axis=axes[ii],
-            overwrite_x=ovwr,
+            n=n,
+            axis=axis,
             direction=direction,
-            fsc=scale_function(s[ii], ii),
-            out=out,
+            fsc=scale_function(ind),
+            out=res,
         )
-        ovwr = True
+        # Default output for next iteration.
+        res = a
     return a
 
 
@@ -360,7 +380,6 @@ def _c2c_fftnd_impl(
     x,
     s=None,
     axes=None,
-    overwrite_x=False,
     direction=+1,
     fsc=1.0,
     out=None,
@@ -385,7 +404,6 @@ def _c2c_fftnd_impl(
     if _direct:
         return _direct_fftnd(
             x,
-            overwrite_x=overwrite_x,
             direction=direction,
             fsc=fsc,
             out=out,
@@ -403,11 +421,7 @@ def _c2c_fftnd_impl(
                 x,
                 axes,
                 _direct_fftnd,
-                {
-                    "overwrite_x": overwrite_x,
-                    "direction": direction,
-                    "fsc": fsc,
-                },
+                {"direction": direction, "fsc": fsc},
                 res,
             )
         else:
@@ -418,8 +432,7 @@ def _c2c_fftnd_impl(
                 axes=axes,
                 out=out,
                 direction=direction,
-                overwrite_x=overwrite_x,
-                scale_function=lambda n, i: fsc if i == 0 else 1.0,
+                scale_function=lambda i: fsc if i == 0 else 1.0,
             )
 
 
@@ -427,38 +440,59 @@ def _r2c_fftnd_impl(x, s=None, axes=None, fsc=1.0, out=None):
     a = np.asarray(x)
     no_trim = (s is None) and (axes is None)
     s, axes = _cook_nd_args(a, s, axes)
+    axes = [ax + a.ndim if ax < 0 else ax for ax in axes]
     la = axes[-1]
+
     # trim array, so that rfft avoids doing unnecessary computations
     if not no_trim:
         a = _trim_array(a, s, axes)
+
+    # last axis is not included since we calculate r2c FFT separately
+    # and not in the loop
+    axes_and_s = list(zip(axes, s))[-2::-1]
+    size_changes = [axis for axis, n in axes_and_s if a.shape[axis] != n]
+    res = None if size_changes else out
+
     # r2c along last axis
-    a = _r2c_fft1d_impl(a, n=s[-1], axis=la, fsc=fsc, out=out)
+    a = _r2c_fft1d_impl(a, n=s[-1], axis=la, fsc=fsc, out=res)
+    res = a
     if len(s) > 1:
-        if not no_trim:
-            ss = list(s)
-            ss[-1] = a.shape[la]
-            a = _pad_array(a, tuple(ss), axes)
+
         len_axes = len(axes)
         if len(set(axes)) == len_axes and len_axes == a.ndim and len_axes > 2:
+            if not no_trim:
+                ss = list(s)
+                ss[-1] = a.shape[la]
+                a = _pad_array(a, tuple(ss), axes)
             # a series of ND c2c FFTs along last axis
             ss, aa = _remove_axis(s, axes, -1)
-            ind = [
-                slice(None, None, 1),
-            ] * len(s)
+            ind = [slice(None, None, 1)] * len(s)
             for ii in range(a.shape[la]):
                 ind[la] = ii
                 tind = tuple(ind)
                 a_inp = a[tind]
-                res = out[tind] if out is not None else None
-                a_res = _c2c_fftnd_impl(
-                    a_inp, s=ss, axes=aa, overwrite_x=True, direction=1, out=res
-                )
-                if a_res is not a_inp:
-                    a[tind] = a_res  # copy in place
+                res = out[tind] if out is not None else a_inp
+                _ = _c2c_fftnd_impl(a_inp, s=ss, axes=aa, direction=1, out=res)
+            if out is not None:
+                a = out
         else:
+            # another size_changes check is needed if there are repeated axes
+            # of last axis, since since FFT changes the shape along last axis
+            size_changes = [
+                axis for axis, n in axes_and_s if a.shape[axis] != n
+            ]
+
             # a series of 1D c2c FFTs along all axes except last
-            for ii in range(len(axes) - 2, -1, -1):
-                a = _c2c_fft1d_impl(a, s[ii], axes[ii], overwrite_x=True)
+            for axis, n in axes_and_s:
+                if axis in size_changes:
+                    if axis == size_changes[-1]:
+                        res = out
+                    elif res is not None and n < res.shape[axis]:
+                        res = res[(slice(None),) * axis + (slice(n),)]
+                    else:
+                        res = None
+                a = _c2c_fft1d_impl(a, n, axis, out=res)
+                res = a
     return a
 
 
@@ -466,49 +500,54 @@ def _c2r_fftnd_impl(x, s=None, axes=None, fsc=1.0, out=None):
     a = np.asarray(x)
     no_trim = (s is None) and (axes is None)
     s, axes = _cook_nd_args(a, s, axes, invreal=True)
+    axes = [ax + a.ndim if ax < 0 else ax for ax in axes]
     la = axes[-1]
     if not no_trim:
         a = _trim_array(a, s, axes)
     if len(s) > 1:
-        if not no_trim:
-            a = _pad_array(a, s, axes)
-        ovr_x = True if _datacopied(a, x) else False
         len_axes = len(axes)
         if len(set(axes)) == len_axes and len_axes == a.ndim and len_axes > 2:
+            if not no_trim:
+                a = _pad_array(a, s, axes)
             # a series of ND c2c FFTs along last axis
             # due to need to write into a, we must copy
-            if not ovr_x:
-                a = a.copy()
-                ovr_x = True
+            a = a if _datacopied(a, x) else a.copy()
             if not np.issubdtype(a.dtype, np.complexfloating):
                 # complex output will be copied to input, copy is needed
                 if a.dtype == np.float32:
                     a = a.astype(np.complex64)
                 else:
                     a = a.astype(np.complex128)
-                ovr_x = True
             ss, aa = _remove_axis(s, axes, -1)
-            ind = [
-                slice(None, None, 1),
-            ] * len(s)
+            ind = [slice(None, None, 1)] * len(s)
             for ii in range(a.shape[la]):
                 ind[la] = ii
                 tind = tuple(ind)
                 a_inp = a[tind]
                 # out has real dtype and cannot be used in intermediate steps
-                a_res = _c2c_fftnd_impl(
-                    a_inp, s=ss, axes=aa, overwrite_x=True, direction=-1
+                # ss and aa are reversed since np.irfftn uses forward order but
+                # np.ifftn uses reverse order see numpy-gh-28950
+                _ = _c2c_fftnd_impl(
+                    a_inp, s=ss[::-1], axes=aa[::-1], out=a_inp, direction=-1
                 )
-                if a_res is not a_inp:
-                    a[tind] = a_res  # copy in place
         else:
             # a series of 1D c2c FFTs along all axes except last
-            for ii in range(len(axes) - 1):
-                # out has real dtype and cannot be used in intermediate steps
-                a = _c2c_fft1d_impl(
-                    a, s[ii], axes[ii], overwrite_x=ovr_x, direction=-1
-                )
-                ovr_x = True
+            # forward order, see numpy-gh-28950
+            axes_and_s = list(zip(axes, s))[:-1]
+            size_changes = [
+                axis for axis, n in axes_and_s[1:] if a.shape[axis] != n
+            ]
+            # out has real dtype cannot be used for intermediate steps
+            res = None
+            for axis, n in axes_and_s:
+                if axis in size_changes:
+                    if res is not None and n < res.shape[axis]:
+                        # pylint: disable=unsubscriptable-object
+                        res = res[(slice(None),) * axis + (slice(n),)]
+                    else:
+                        res = None
+                a = _c2c_fft1d_impl(a, n, axis, out=res, direction=-1)
+                res = a
     # c2r along last axis
     a = _c2r_fft1d_impl(a, n=s[-1], axis=la, fsc=fsc, out=out)
     return a
diff --git a/mkl_fft/_mkl_fft.py b/mkl_fft/_mkl_fft.py
index 92d0d53..b15c3d5 100644
--- a/mkl_fft/_mkl_fft.py
+++ b/mkl_fft/_mkl_fft.py
@@ -45,85 +45,57 @@
 ]
 
 
-def fft(x, n=None, axis=-1, out=None, overwrite_x=False, fwd_scale=1.0):
+def fft(x, n=None, axis=-1, fwd_scale=1.0, out=None):
     return _c2c_fft1d_impl(
-        x,
-        n=n,
-        axis=axis,
-        out=out,
-        overwrite_x=overwrite_x,
-        direction=+1,
-        fsc=fwd_scale,
+        x, n=n, axis=axis, out=out, direction=+1, fsc=fwd_scale
     )
 
 
-def ifft(x, n=None, axis=-1, out=None, overwrite_x=False, fwd_scale=1.0):
+def ifft(x, n=None, axis=-1, fwd_scale=1.0, out=None):
     return _c2c_fft1d_impl(
-        x,
-        n=n,
-        axis=axis,
-        out=out,
-        overwrite_x=overwrite_x,
-        direction=-1,
-        fsc=fwd_scale,
+        x, n=n, axis=axis, out=out, direction=-1, fsc=fwd_scale
     )
 
 
-def fft2(x, s=None, axes=(-2, -1), out=None, overwrite_x=False, fwd_scale=1.0):
-    return fftn(
-        x, s=s, axes=axes, out=out, overwrite_x=overwrite_x, fwd_scale=fwd_scale
-    )
+def fft2(x, s=None, axes=(-2, -1), fwd_scale=1.0, out=None):
+    return fftn(x, s=s, axes=axes, out=out, fwd_scale=fwd_scale)
 
 
-def ifft2(x, s=None, axes=(-2, -1), out=None, overwrite_x=False, fwd_scale=1.0):
-    return ifftn(
-        x, s=s, axes=axes, out=out, overwrite_x=overwrite_x, fwd_scale=fwd_scale
-    )
+def ifft2(x, s=None, axes=(-2, -1), fwd_scale=1.0, out=None):
+    return ifftn(x, s=s, axes=axes, out=out, fwd_scale=fwd_scale)
 
 
-def fftn(x, s=None, axes=None, out=None, overwrite_x=False, fwd_scale=1.0):
+def fftn(x, s=None, axes=None, fwd_scale=1.0, out=None):
     return _c2c_fftnd_impl(
-        x,
-        s=s,
-        axes=axes,
-        out=out,
-        overwrite_x=overwrite_x,
-        direction=+1,
-        fsc=fwd_scale,
+        x, s=s, axes=axes, out=out, direction=+1, fsc=fwd_scale
     )
 
 
-def ifftn(x, s=None, axes=None, out=None, overwrite_x=False, fwd_scale=1.0):
+def ifftn(x, s=None, axes=None, fwd_scale=1.0, out=None):
     return _c2c_fftnd_impl(
-        x,
-        s=s,
-        axes=axes,
-        out=out,
-        overwrite_x=overwrite_x,
-        direction=-1,
-        fsc=fwd_scale,
+        x, s=s, axes=axes, out=out, direction=-1, fsc=fwd_scale
     )
 
 
-def rfft(x, n=None, axis=-1, out=None, fwd_scale=1.0):
+def rfft(x, n=None, axis=-1, fwd_scale=1.0, out=None):
     return _r2c_fft1d_impl(x, n=n, axis=axis, out=out, fsc=fwd_scale)
 
 
-def irfft(x, n=None, axis=-1, out=None, fwd_scale=1.0):
+def irfft(x, n=None, axis=-1, fwd_scale=1.0, out=None):
     return _c2r_fft1d_impl(x, n=n, axis=axis, out=out, fsc=fwd_scale)
 
 
-def rfft2(x, s=None, axes=(-2, -1), out=None, fwd_scale=1.0):
+def rfft2(x, s=None, axes=(-2, -1), fwd_scale=1.0, out=None):
     return rfftn(x, s=s, axes=axes, out=out, fwd_scale=fwd_scale)
 
 
-def irfft2(x, s=None, axes=(-2, -1), out=None, fwd_scale=1.0):
+def irfft2(x, s=None, axes=(-2, -1), fwd_scale=1.0, out=None):
     return irfftn(x, s=s, axes=axes, out=out, fwd_scale=fwd_scale)
 
 
-def rfftn(x, s=None, axes=None, out=None, fwd_scale=1.0):
+def rfftn(x, s=None, axes=None, fwd_scale=1.0, out=None):
     return _r2c_fftnd_impl(x, s=s, axes=axes, out=out, fsc=fwd_scale)
 
 
-def irfftn(x, s=None, axes=None, out=None, fwd_scale=1.0):
+def irfftn(x, s=None, axes=None, fwd_scale=1.0, out=None):
     return _c2r_fftnd_impl(x, s=s, axes=axes, out=out, fsc=fwd_scale)
diff --git a/mkl_fft/_pydfti.pyx b/mkl_fft/_pydfti.pyx
index 20f6a94..d9158da 100644
--- a/mkl_fft/_pydfti.pyx
+++ b/mkl_fft/_pydfti.pyx
@@ -224,13 +224,10 @@ cdef cnp.ndarray _process_arguments(
     object x,
     object n,
     object axis,
-    object overwrite_x,
-    object direction,
     long *axis_,
     long *n_,
     int *in_place,
     int *xnd,
-    int *dir_,
     int realQ,
 ):
     """
@@ -240,13 +237,6 @@ cdef cnp.ndarray _process_arguments(
     cdef long n_max = 0
     cdef cnp.ndarray x_arr "xx_arrayObject"
 
-    if direction not in [-1, +1]:
-        raise ValueError("Direction of FFT should +1 or -1")
-    else:
-        dir_[0] = -1 if direction is -1 else +1
-
-    in_place[0] = 1 if overwrite_x else 0
-
     # convert x to ndarray, ensure that strides are multiples of itemsize
     x_arr = PyArray_CheckFromAny(
         x, NULL, 0, 0,
@@ -259,8 +249,8 @@ cdef cnp.ndarray _process_arguments(
     if (<void *> x_arr) is NULL:
         raise ValueError("An input argument x is not an array-like object")
 
-    if _datacopied(x_arr, x):
-        in_place[0] = 1  # a copy was made, so we can work in place.
+    # a copy was made, so we can work in place.
+    in_place[0] = 1 if _datacopied(x_arr, x) else 0
 
     xnd[0] = cnp.PyArray_NDIM(x_arr)  # tensor-rank of the array
 
@@ -379,21 +369,13 @@ def _validate_out_array(out, x, out_dtype, axis=None, n=None):
 # float/double inputs are not cast to complex, but are effectively
 # treated as complexes with zero imaginary parts.
 # All other types are cast to complex double.
-def _c2c_fft1d_impl(
-    x,
-    n=None,
-    axis=-1,
-    overwrite_x=False,
-    direction=+1,
-    double fsc=1.0,
-    out=None,
-):
+def _c2c_fft1d_impl(x, n=None, axis=-1, direction=+1, double fsc=1.0, out=None):
     """
     Uses MKL to perform 1D FFT on the input array x along the given axis.
     """
     cdef cnp.ndarray x_arr "x_arrayObject"
     cdef cnp.ndarray f_arr "f_arrayObject"
-    cdef int xnd, n_max = 0, in_place, dir_
+    cdef int xnd, n_max = 0, in_place
     cdef long n_, axis_
     cdef int x_type, f_type, status = 0
     cdef int ALL_HARMONICS = 1
@@ -401,9 +383,10 @@ def _c2c_fft1d_impl(
     cdef bytes py_error_msg
     cdef DftiCache *_cache
 
-    x_arr = _process_arguments(x, n, axis, overwrite_x, direction,
-                               &axis_, &n_, &in_place, &xnd, &dir_, 0)
+    if direction not in [-1, +1]:
+        raise ValueError("Direction of FFT should +1 or -1")
 
+    x_arr = _process_arguments(x, n, axis, &axis_, &n_, &in_place, &xnd, 0)
     x_type = cnp.PyArray_TYPE(x_arr)
 
     if out is not None:
@@ -439,7 +422,7 @@ def _c2c_fft1d_impl(
             _cache_capsule, capsule_name
         )
         if x_type is cnp.NPY_CDOUBLE:
-            if dir_ < 0:
+            if direction < 0:
                 status = cdouble_mkl_ifft1d_in(
                     x_arr, n_, <int> axis_, fsc, _cache
                 )
@@ -448,7 +431,7 @@ def _c2c_fft1d_impl(
                     x_arr, n_, <int> axis_, fsc, _cache
                 )
         elif x_type is cnp.NPY_CFLOAT:
-            if dir_ < 0:
+            if direction < 0:
                 status = cfloat_mkl_ifft1d_in(
                     x_arr, n_, <int> axis_, fsc, _cache
                 )
@@ -497,7 +480,7 @@ def _c2c_fft1d_impl(
         )
         if f_type is cnp.NPY_CDOUBLE:
             if x_type is cnp.NPY_DOUBLE:
-                if dir_ < 0:
+                if direction < 0:
                     status = double_cdouble_mkl_ifft1d_out(
                         x_arr,
                         n_,
@@ -518,7 +501,7 @@ def _c2c_fft1d_impl(
                         _cache,
                     )
             elif x_type is cnp.NPY_CDOUBLE:
-                if dir_ < 0:
+                if direction < 0:
                     status = cdouble_cdouble_mkl_ifft1d_out(
                         x_arr, n_, <int> axis_, f_arr, fsc, _cache
                     )
@@ -528,7 +511,7 @@ def _c2c_fft1d_impl(
                     )
         else:
             if x_type is cnp.NPY_FLOAT:
-                if dir_ < 0:
+                if direction < 0:
                     status = float_cfloat_mkl_ifft1d_out(
                         x_arr,
                         n_,
@@ -549,7 +532,7 @@ def _c2c_fft1d_impl(
                         _cache,
                     )
             elif x_type is cnp.NPY_CFLOAT:
-                if dir_ < 0:
+                if direction < 0:
                     status = cfloat_cfloat_mkl_ifft1d_out(
                         x_arr, n_, <int> axis_, f_arr, fsc, _cache
                     )
@@ -571,7 +554,7 @@ def _c2c_fft1d_impl(
 
 
 def _r2c_fft1d_impl(
-    x, n=None, axis=-1, overwrite_x=False, double fsc=1.0, out=None
+    x, n=None, axis=-1, double fsc=1.0, out=None
 ):
     """
     Uses MKL to perform 1D FFT on the real input array x along the given axis,
@@ -581,17 +564,15 @@ def _r2c_fft1d_impl(
     """
     cdef cnp.ndarray x_arr "x_arrayObject"
     cdef cnp.ndarray f_arr "f_arrayObject"
-    cdef int xnd, in_place, dir_
+    cdef int xnd, in_place
     cdef long n_, axis_
     cdef int x_type, f_type, status, requirement
     cdef int HALF_HARMONICS = 0  # give only positive index harmonics
-    cdef int direction = 1  # dummy, only used for the sake of arg-processing
     cdef char * c_error_msg = NULL
     cdef bytes py_error_msg
     cdef DftiCache *_cache
 
-    x_arr = _process_arguments(x, n, axis, overwrite_x, direction,
-                               &axis_, &n_, &in_place, &xnd, &dir_, 1)
+    x_arr = _process_arguments(x, n, axis, &axis_, &n_, &in_place, &xnd, 1)
 
     x_type = cnp.PyArray_TYPE(x_arr)
 
@@ -671,7 +652,7 @@ def _r2c_fft1d_impl(
 
 
 def _c2r_fft1d_impl(
-    x, n=None, axis=-1, overwrite_x=False, double fsc=1.0, out=None
+    x, n=None, axis=-1, double fsc=1.0, out=None
 ):
     """
     Uses MKL to perform 1D FFT on the real/complex input array x along the
@@ -681,10 +662,9 @@ def _c2r_fft1d_impl(
     """
     cdef cnp.ndarray x_arr "x_arrayObject"
     cdef cnp.ndarray f_arr "f_arrayObject"
-    cdef int xnd, in_place, dir_, int_n
+    cdef int xnd, in_place, int_n
     cdef long n_, axis_
     cdef int x_type, f_type, status
-    cdef int direction = 1  # dummy, only used for the sake of arg-processing
     cdef char * c_error_msg = NULL
     cdef bytes py_error_msg
     cdef DftiCache *_cache
@@ -692,8 +672,7 @@ def _c2r_fft1d_impl(
     int_n = _is_integral(n)
     # nn gives the number elements along axis of the input that we use
     nn = (n // 2 + 1) if int_n and n > 0 else n
-    x_arr = _process_arguments(x, nn, axis, overwrite_x, direction,
-                               &axis_, &n_, &in_place, &xnd, &dir_, 0)
+    x_arr = _process_arguments(x, nn, axis, &axis_, &n_, &in_place, &xnd, 0)
     n_ = 2*(n_ - 1)
     if int_n and (n % 2 == 1):
         n_ += 1
@@ -776,20 +755,16 @@ def _c2r_fft1d_impl(
 
 
 def _direct_fftnd(
-    x, overwrite_x=False, direction=+1, double fsc=1.0, out=None
+    x, direction=+1, double fsc=1.0, out=None
 ):
     """Perform n-dimensional FFT over all axes"""
     cdef int err
     cdef cnp.ndarray x_arr "xxnd_arrayObject"
     cdef cnp.ndarray f_arr "ffnd_arrayObject"
-    cdef int dir_, in_place, x_type, f_type
+    cdef int in_place, x_type, f_type
 
     if direction not in [-1, +1]:
         raise ValueError("Direction of FFT should +1 or -1")
-    else:
-        dir_ = -1 if direction is -1 else +1
-
-    in_place = 1 if overwrite_x else 0
 
     # convert x to ndarray, ensure that strides are multiples of itemsize
     x_arr = PyArray_CheckFromAny(
@@ -803,8 +778,8 @@ def _direct_fftnd(
     if <void *> x_arr is NULL:
         raise ValueError("An input argument x is not an array-like object")
 
-    if _datacopied(x_arr, x):
-        in_place = 1  # a copy was made, so we can work in place.
+    # a copy was made, so we can work in place.
+    in_place = 1 if _datacopied(x_arr, x) else 0
 
     x_type = cnp.PyArray_TYPE(x_arr)
     if (
@@ -834,12 +809,12 @@ def _direct_fftnd(
 
     if in_place:
         if x_type == cnp.NPY_CDOUBLE:
-            if dir_ == 1:
+            if direction == 1:
                 err = cdouble_cdouble_mkl_fftnd_in(x_arr, fsc)
             else:
                 err = cdouble_cdouble_mkl_ifftnd_in(x_arr, fsc)
         elif x_type == cnp.NPY_CFLOAT:
-            if dir_ == 1:
+            if direction == 1:
                 err = cfloat_cfloat_mkl_fftnd_in(x_arr, fsc)
             else:
                 err = cfloat_cfloat_mkl_ifftnd_in(x_arr, fsc)
@@ -866,22 +841,22 @@ def _direct_fftnd(
                 f_arr = _allocate_result(x_arr, -1, 0, f_type)
 
         if x_type == cnp.NPY_CDOUBLE:
-            if dir_ == 1:
+            if direction == 1:
                 err = cdouble_cdouble_mkl_fftnd_out(x_arr, f_arr, fsc)
             else:
                 err = cdouble_cdouble_mkl_ifftnd_out(x_arr, f_arr, fsc)
         elif x_type == cnp.NPY_CFLOAT:
-            if dir_ == 1:
+            if direction == 1:
                 err = cfloat_cfloat_mkl_fftnd_out(x_arr, f_arr, fsc)
             else:
                 err = cfloat_cfloat_mkl_ifftnd_out(x_arr, f_arr, fsc)
         elif x_type == cnp.NPY_DOUBLE:
-            if dir_ == 1:
+            if direction == 1:
                 err = double_cdouble_mkl_fftnd_out(x_arr, f_arr, fsc)
             else:
                 err = double_cdouble_mkl_ifftnd_out(x_arr, f_arr, fsc)
         elif x_type == cnp.NPY_FLOAT:
-            if dir_ == 1:
+            if direction == 1:
                 err = float_cfloat_mkl_fftnd_out(x_arr, f_arr, fsc)
             else:
                 err = float_cfloat_mkl_ifftnd_out(x_arr, f_arr, fsc)
@@ -893,253 +868,3 @@ def _direct_fftnd(
             return out
         else:
             return f_arr
-
-
-# ========================= deprecated functions ==============================
-cdef object _rc_to_rr(cnp.ndarray rc_arr, int n, int axis, int xnd, int x_type):
-    cdef object res
-    inp = <object>rc_arr
-
-    slice_ = [slice(None, None, None)] * xnd
-    sl_0 = list(slice_)
-    sl_0[axis] = 0
-
-    sl_1 = list(slice_)
-    sl_1[axis] = 1
-    if (inp.flags["C"] and inp.strides[axis] == inp.itemsize):
-        res = inp
-        res = res.view(
-            dtype=np.single if x_type == cnp.NPY_FLOAT else np.double
-        )
-        res[tuple(sl_1)] = res[tuple(sl_0)]
-
-        slice_[axis] = slice(1, n + 1, None)
-
-        return res[tuple(slice_)]
-    else:
-        res_shape = list(inp.shape)
-        res_shape[axis] = n
-        res = np.empty(
-            tuple(res_shape),
-            dtype = np.single if x_type == cnp.NPY_FLOAT else np.double,
-        )
-
-        res[tuple(sl_0)] = inp[tuple(sl_0)].real
-        sl_dst_real = list(slice_)
-        sl_dst_real[axis] = slice(1, None, 2)
-        sl_src_real = list(slice_)
-        sl_src_real[axis] = slice(1, None, None)
-        res[tuple(sl_dst_real)] = inp[tuple(sl_src_real)].real
-        sl_dst_imag = list(slice_)
-        sl_dst_imag[axis] = slice(2, None, 2)
-        sl_src_imag = list(slice_)
-        sl_src_imag[axis] = slice(
-            1, inp.shape[axis] if (n & 1) else inp.shape[axis] - 1, None
-        )
-        res[tuple(sl_dst_imag)] = inp[tuple(sl_src_imag)].imag
-
-        return res[tuple(slice_)]
-
-
-cdef object _rr_to_rc(cnp.ndarray rr_arr, int n, int axis, int xnd, int x_type):
-
-    inp = <object> rr_arr
-
-    rc_shape = list(inp.shape)
-    rc_shape[axis] = (n // 2 + 1)
-    rc_shape = tuple(rc_shape)
-
-    rc_dtype = np.cdouble if x_type == cnp.NPY_DOUBLE else np.csingle
-    rc = np.empty(rc_shape, dtype=rc_dtype, order="C")
-
-    slice_ = [slice(None, None, None)] * xnd
-    sl_src_real = list(slice_)
-    sl_src_imag = list(slice_)
-    sl_src_real[axis] = slice(1, n, 2)
-    sl_src_imag[axis] = slice(2, n, 2)
-
-    sl_dest_real = list(slice_)
-    sl_dest_real[axis] = slice(1, None, None)
-    sl_dest_imag = list(slice_)
-    sl_dest_imag[axis] = slice(1, (n+1)//2, None)
-
-    sl_0 = list(slice_)
-    sl_0[axis] = 0
-
-    rc_real = rc.real
-    rc_imag = rc.imag
-
-    rc_real[tuple(sl_dest_real)] = inp[tuple(sl_src_real)]
-    rc_imag[tuple(sl_dest_imag)] = inp[tuple(sl_src_imag)]
-    rc_real[tuple(sl_0)] = inp[tuple(sl_0)]
-    rc_imag[tuple(sl_0)] = 0
-    if (n & 1 == 0):
-        sl_last = list(slice_)
-        sl_last[axis] = -1
-        rc_imag[tuple(sl_last)] = 0
-
-    return rc
-
-
-def _rr_fft1d_impl(x, n=None, axis=-1, overwrite_x=False, double fsc=1.0):
-    """
-    Uses MKL to perform real packed 1D FFT on the input array x
-    along the given axis.
-
-    This done by using rfft and post-processing the result.
-    Thus overwrite_x is effectively discarded.
-
-    Functionally equivalent to scipy.fftpack.rfft
-    """
-    cdef cnp.ndarray x_arr "x_arrayObject"
-    cdef cnp.ndarray f_arr "f_arrayObject"
-    cdef int xnd, in_place, dir_
-    cdef long n_, axis_
-    cdef int HALF_HARMONICS = 0  # give only positive index harmonics
-    cdef int x_type, status, f_type
-    cdef char * c_error_msg = NULL
-    cdef bytes py_error_msg
-    cdef DftiCache *_cache
-
-    x_arr = _process_arguments(x, n, axis, overwrite_x, <object>(+1),
-                               &axis_, &n_, &in_place, &xnd, &dir_, 1)
-
-    x_type = cnp.PyArray_TYPE(x_arr)
-
-    if x_type is cnp.NPY_FLOAT or x_type is cnp.NPY_DOUBLE:
-        in_place = 0
-    elif x_type is cnp.NPY_CFLOAT or x_type is cnp.NPY_CDOUBLE:
-        raise TypeError("1st argument must be a real sequence")
-    else:
-        try:
-            x_arr = <cnp.ndarray> cnp.PyArray_FROM_OTF(
-                x_arr, cnp.NPY_DOUBLE,
-                cnp.NPY_ARRAY_BEHAVED | cnp.NPY_ARRAY_ENSURECOPY
-            )
-        except:
-            raise TypeError("1st argument must be a real sequence")
-        x_type = cnp.PyArray_TYPE(x_arr)
-        in_place = 0
-
-    f_type = cnp.NPY_CFLOAT if x_type is cnp.NPY_FLOAT else cnp.NPY_CDOUBLE
-    f_arr = _allocate_result(x_arr, n_ // 2 + 1, axis_, f_type)
-
-    _cache_capsule = _tls_dfti_cache_capsule()
-    _cache = <DftiCache *>cpython.pycapsule.PyCapsule_GetPointer(
-        _cache_capsule, capsule_name
-    )
-    if x_type is cnp.NPY_DOUBLE:
-        status = double_cdouble_mkl_fft1d_out(
-            x_arr, n_, <int> axis_, f_arr, HALF_HARMONICS, fsc, _cache
-        )
-    else:
-        status = float_cfloat_mkl_fft1d_out(
-            x_arr, n_, <int> axis_, f_arr, HALF_HARMONICS, fsc, _cache
-        )
-
-    if (status):
-        c_error_msg = mkl_dfti_error(status)
-        py_error_msg = c_error_msg
-        raise ValueError("Internal error occurred: {}".format(py_error_msg))
-
-    # post-process and return
-    return _rc_to_rr(f_arr, n_, axis_, xnd, x_type)
-
-
-def _rr_ifft1d_impl(x, n=None, axis=-1, overwrite_x=False, double fsc=1.0):
-    """
-    Uses MKL to perform real packed 1D FFT on the input array x along
-    the given axis.
-
-    This done by using rfft and post-processing the result.
-    Thus overwrite_x is effectively discarded.
-
-    Functionally equivalent to scipy.fftpack.irfft
-    """
-    cdef cnp.ndarray x_arr "x_arrayObject"
-    cdef cnp.ndarray f_arr "f_arrayObject"
-    cdef int xnd, in_place, dir_
-    cdef long n_, axis_
-    cdef int x_type, rc_type, status
-    cdef char * c_error_msg = NULL
-    cdef bytes py_error_msg
-    cdef DftiCache *_cache
-
-    x_arr = _process_arguments(x, n, axis, overwrite_x, <object>(-1),
-                               &axis_, &n_, &in_place, &xnd, &dir_, 1)
-
-    x_type = cnp.PyArray_TYPE(x_arr)
-
-    if x_type is cnp.NPY_FLOAT or x_type is cnp.NPY_DOUBLE:
-        pass
-    else:
-        # we must cast the input and allocate the output,
-        # so we cast to complex double and operate in place
-        try:
-            x_arr = <cnp.ndarray> cnp.PyArray_FROM_OTF(
-                x_arr, cnp.NPY_DOUBLE,
-                cnp.NPY_ARRAY_BEHAVED | cnp.NPY_ARRAY_ENSURECOPY
-            )
-        except:
-            raise ValueError(
-                "First argument should be a real "
-                "or a complex sequence of single or double precision"
-            )
-        x_type = cnp.PyArray_TYPE(x_arr)
-        in_place = 1
-
-    # need to convert this into complex array
-    rc_obj = _rr_to_rc(x_arr, n_, axis_, xnd, x_type)
-    rc_arr = <cnp.ndarray> rc_obj
-
-    rc_type = cnp.NPY_CFLOAT if x_type is cnp.NPY_FLOAT else cnp.NPY_CDOUBLE
-    in_place = False
-    if in_place:
-        f_arr = x_arr
-    else:
-        f_arr = _allocate_result(x_arr, n_, axis_, x_type)
-
-    # call out-of-place FFT
-    if rc_type is cnp.NPY_CFLOAT:
-        _cache_capsule = _tls_dfti_cache_capsule()
-        _cache = <DftiCache *>cpython.pycapsule.PyCapsule_GetPointer(
-            _cache_capsule, capsule_name
-        )
-        status = cfloat_float_mkl_irfft_out(
-            rc_arr, n_, <int> axis_, f_arr, fsc, _cache
-        )
-    elif rc_type is cnp.NPY_CDOUBLE:
-        _cache_capsule = _tls_dfti_cache_capsule()
-        _cache = <DftiCache *>cpython.pycapsule.PyCapsule_GetPointer(
-            _cache_capsule, capsule_name
-        )
-        status = cdouble_double_mkl_irfft_out(
-            rc_arr, n_, <int> axis_, f_arr, fsc, _cache
-        )
-    else:
-        raise ValueError(
-            "Internal mkl_fft error occurred: Unrecognized rc_type"
-        )
-
-    if (status):
-        c_error_msg = mkl_dfti_error(status)
-        py_error_msg = c_error_msg
-        raise ValueError(
-            "Internal error occurred: {}".format(str(py_error_msg))
-        )
-
-    return f_arr
-
-
-def rfftpack(x, n=None, axis=-1, overwrite_x=False, fwd_scale=1.0):
-    """Packed real-valued harmonics of FFT of a real sequence x"""
-    return _rr_fft1d_impl(
-        x, n=n, axis=axis, overwrite_x=overwrite_x, fsc=fwd_scale
-    )
-
-
-def irfftpack(x, n=None, axis=-1, overwrite_x=False, fwd_scale=1.0):
-    """IFFT of a real sequence, takes packed real-valued harmonics of FFT"""
-    return _rr_ifft1d_impl(
-        x, n=n, axis=axis, overwrite_x=overwrite_x, fsc=fwd_scale
-    )
diff --git a/mkl_fft/interfaces/_scipy_fft.py b/mkl_fft/interfaces/_scipy_fft.py
index 78b7dc3..0e796fb 100644
--- a/mkl_fft/interfaces/_scipy_fft.py
+++ b/mkl_fft/interfaces/_scipy_fft.py
@@ -203,6 +203,15 @@ def _init_nd_shape_and_axes(x, shape, axes, invreal=False):
     return tuple(shape), list(axes)
 
 
+def _use_input_as_out(x, overwrite_x):
+    """Check if the input can be used as output."""
+    if overwrite_x and np.issubdtype(x.dtype, np.complexfloating):
+        # pass input as out to overwrite it
+        return x
+    else:
+        return None
+
+
 def _validate_input(x):
     try:
         x = _supported_array_or_not_implemented(x)
@@ -223,12 +232,11 @@ def fft(
     """
     _check_plan(plan)
     x = _validate_input(x)
+    out = _use_input_as_out(x, overwrite_x)
     fsc = _compute_fwd_scale(norm, n, x.shape[axis])
 
     with _Workers(workers):
-        return mkl_fft.fft(
-            x, n=n, axis=axis, overwrite_x=overwrite_x, fwd_scale=fsc
-        )
+        return mkl_fft.fft(x, n=n, axis=axis, fwd_scale=fsc, out=out)
 
 
 def ifft(
@@ -242,12 +250,11 @@ def ifft(
     """
     _check_plan(plan)
     x = _validate_input(x)
+    out = _use_input_as_out(x, overwrite_x)
     fsc = _compute_fwd_scale(norm, n, x.shape[axis])
 
     with _Workers(workers):
-        return mkl_fft.ifft(
-            x, n=n, axis=axis, overwrite_x=overwrite_x, fwd_scale=fsc
-        )
+        return mkl_fft.ifft(x, n=n, axis=axis, fwd_scale=fsc, out=out)
 
 
 def fft2(
@@ -322,13 +329,12 @@ def fftn(
     """
     _check_plan(plan)
     x = _validate_input(x)
+    out = _use_input_as_out(x, overwrite_x)
     s, axes = _init_nd_shape_and_axes(x, s, axes)
     fsc = _compute_fwd_scale(norm, s, x.shape)
 
     with _Workers(workers):
-        return mkl_fft.fftn(
-            x, s=s, axes=axes, overwrite_x=overwrite_x, fwd_scale=fsc
-        )
+        return mkl_fft.fftn(x, s=s, axes=axes, fwd_scale=fsc, out=out)
 
 
 def ifftn(
@@ -349,13 +355,12 @@ def ifftn(
     """
     _check_plan(plan)
     x = _validate_input(x)
+    out = _use_input_as_out(x, overwrite_x)
     s, axes = _init_nd_shape_and_axes(x, s, axes)
     fsc = _compute_fwd_scale(norm, s, x.shape)
 
     with _Workers(workers):
-        return mkl_fft.ifftn(
-            x, s=s, axes=axes, overwrite_x=overwrite_x, fwd_scale=fsc
-        )
+        return mkl_fft.ifftn(x, s=s, axes=axes, fwd_scale=fsc, out=out)
 
 
 def rfft(
diff --git a/mkl_fft/interfaces/_scipy_fftpack.py b/mkl_fft/interfaces/_scipy_fftpack.py
deleted file mode 100644
index a4cd27d..0000000
--- a/mkl_fft/interfaces/_scipy_fftpack.py
+++ /dev/null
@@ -1,71 +0,0 @@
-#!/usr/bin/env python
-# Copyright (c) 2025, Intel Corporation
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-#     * Redistributions of source code must retain the above copyright notice,
-#       this list of conditions and the following disclaimer.
-#     * Redistributions in binary form must reproduce the above copyright
-#       notice, this list of conditions and the following disclaimer in the
-#       documentation and/or other materials provided with the distribution.
-#     * Neither the name of Intel Corporation nor the names of its contributors
-#       may be used to endorse or promote products derived from this software
-#       without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-import mkl_fft
-
-from ._float_utils import _upcast_float16_array
-
-__all__ = ["fft", "ifft", "fftn", "ifftn", "fft2", "ifft2", "rfft", "irfft"]
-
-
-def fft(a, n=None, axis=-1, overwrite_x=False):
-    x = _upcast_float16_array(a)
-    return mkl_fft.fft(x, n=n, axis=axis, overwrite_x=overwrite_x)
-
-
-def ifft(a, n=None, axis=-1, overwrite_x=False):
-    x = _upcast_float16_array(a)
-    return mkl_fft.ifft(x, n=n, axis=axis, overwrite_x=overwrite_x)
-
-
-def fftn(a, shape=None, axes=None, overwrite_x=False):
-    x = _upcast_float16_array(a)
-    return mkl_fft.fftn(x, s=shape, axes=axes, overwrite_x=overwrite_x)
-
-
-def ifftn(a, shape=None, axes=None, overwrite_x=False):
-    x = _upcast_float16_array(a)
-    return mkl_fft.ifftn(x, s=shape, axes=axes, overwrite_x=overwrite_x)
-
-
-def fft2(a, shape=None, axes=(-2, -1), overwrite_x=False):
-    x = _upcast_float16_array(a)
-    return mkl_fft.fftn(x, s=shape, axes=axes, overwrite_x=overwrite_x)
-
-
-def ifft2(a, shape=None, axes=(-2, -1), overwrite_x=False):
-    x = _upcast_float16_array(a)
-    return mkl_fft.ifftn(x, s=shape, axes=axes, overwrite_x=overwrite_x)
-
-
-def rfft(a, n=None, axis=-1, overwrite_x=False):
-    x = _upcast_float16_array(a)
-    return mkl_fft.rfftpack(x, n=n, axis=axis, overwrite_x=overwrite_x)
-
-
-def irfft(a, n=None, axis=-1, overwrite_x=False):
-    x = _upcast_float16_array(a)
-    return mkl_fft.irfftpack(x, n=n, axis=axis, overwrite_x=overwrite_x)
diff --git a/mkl_fft/tests/test_fft1d.py b/mkl_fft/tests/test_fft1d.py
index fbff2bd..7a1e0fd 100644
--- a/mkl_fft/tests/test_fft1d.py
+++ b/mkl_fft/tests/test_fft1d.py
@@ -124,18 +124,18 @@ def test_vector4(self):
     def test_vector5(self):
         """fft in-place is the same as fft out-of-place"""
         x = self.xz1.copy()[::-2]
-        f1 = mkl_fft.fft(x, overwrite_x=True)
+        f1 = mkl_fft.fft(x, out=x)
         f2 = mkl_fft.fft(self.xz1[::-2])
         assert_(np.allclose(f1, f2))
 
     def test_vector6(self):
         """fft in place"""
         x = self.xz1.copy()
-        f1 = mkl_fft.fft(x, overwrite_x=True)
+        f1 = mkl_fft.fft(x, out=x)
         assert_(not _datacopied(f1, x))  # this is in-place
 
         x = self.xz1.copy()
-        f1 = mkl_fft.fft(x[::-2], overwrite_x=True)
+        f1 = mkl_fft.fft(x[::-2], out=x[::-2])
         assert_(not np.allclose(x, self.xz1))  # this is also in-place
         assert_(np.allclose(x[-2::-2], self.xz1[-2::-2]))
         assert_(np.allclose(x[-1::-2], f1))
@@ -239,7 +239,7 @@ def test_matrix3(self):
     def test_matrix4(self):
         x = self.az2.copy()
         f1 = mkl_fft.fft(x[::3, ::-1])
-        f2 = mkl_fft.fft(x[::3, ::-1], overwrite_x=True)
+        f2 = mkl_fft.fft(x[::3, ::-1], out=x[::3, ::-1])
         assert_allclose(f1, f2)
 
     def test_matrix5(self):
@@ -352,55 +352,6 @@ def test_array6(self):
         assert_allclose(y1, y2, atol=2e-15)
 
 
-class Test_mklfft_rfftpack(TestCase):
-    def setUp(self):
-        rnd.seed(1234567)
-        self.v1 = rnd.randn(16)
-        self.m2 = rnd.randn(5, 7)
-        self.t3 = rnd.randn(5, 7, 11)
-
-    def test1(self):
-        x = self.v1.copy()
-        f1 = mkl_fft.rfftpack(x)
-        f2 = mkl_fft.irfftpack(f1)
-        assert_allclose(f2, x)
-
-    def test2(self):
-        x = self.v1.copy()
-        f1 = mkl_fft.irfftpack(x)
-        f2 = mkl_fft.rfftpack(f1)
-        assert_allclose(f2, x)
-
-    def test3(self):
-        for a in range(0, 2):
-            for ovwr_x in [True, False]:
-                for dt, atol in zip([np.float32, np.float64], [2e-7, 2e-15]):
-                    x = self.m2.copy().astype(dt)
-                    f1 = mkl_fft.rfftpack(x, axis=a, overwrite_x=ovwr_x)
-                    f2 = mkl_fft.irfftpack(f1, axis=a, overwrite_x=ovwr_x)
-                    assert_allclose(
-                        f2, self.m2.astype(dt), atol=atol, err_msg=(a, ovwr_x)
-                    )
-
-    def test4(self):
-        for a in range(0, 2):
-            for ovwr_x in [True, False]:
-                for dt, atol in zip([np.float32, np.float64], [2e-7, 2e-15]):
-                    x = self.m2.copy().astype(dt)
-                    f1 = mkl_fft.irfftpack(x, axis=a, overwrite_x=ovwr_x)
-                    f2 = mkl_fft.rfftpack(f1, axis=a, overwrite_x=ovwr_x)
-                    assert_allclose(f2, self.m2.astype(dt), atol=atol)
-
-    def test5(self):
-        for a in range(0, 3):
-            for ovwr_x in [True, False]:
-                for dt, atol in zip([np.float32, np.float64], [4e-7, 4e-15]):
-                    x = self.t3.copy().astype(dt)
-                    f1 = mkl_fft.irfftpack(x, axis=a, overwrite_x=ovwr_x)
-                    f2 = mkl_fft.rfftpack(f1, axis=a, overwrite_x=ovwr_x)
-                    assert_allclose(f2, self.t3.astype(dt), atol=atol)
-
-
 @requires_numpy_2
 @pytest.mark.parametrize("axis", [0, 1, 2])
 @pytest.mark.parametrize("func", ["fft", "ifft"])
diff --git a/mkl_fft/tests/test_fftnd.py b/mkl_fft/tests/test_fftnd.py
index ef54f1a..f50da97 100644
--- a/mkl_fft/tests/test_fftnd.py
+++ b/mkl_fft/tests/test_fftnd.py
@@ -283,10 +283,8 @@ def test_gh109():
     b_int = np.array([[5, 7, 6, 5], [4, 6, 4, 8], [9, 3, 7, 5]], dtype=np.int64)
     b = np.asarray(b_int, dtype=np.float32)
 
-    r1 = mkl_fft.fftn(b, s=None, axes=(0,), overwrite_x=False, fwd_scale=1 / 3)
-    r2 = mkl_fft.fftn(
-        b_int, s=None, axes=(0,), overwrite_x=False, fwd_scale=1 / 3
-    )
+    r1 = mkl_fft.fftn(b, s=None, axes=(0,), fwd_scale=1 / 3)
+    r2 = mkl_fft.fftn(b_int, s=None, axes=(0,), fwd_scale=1 / 3)
 
     rtol, atol = _get_rtol_atol(b)
     assert_allclose(r1, r2, rtol=rtol, atol=atol)
@@ -303,8 +301,29 @@ def test_s_axes(dtype, s, axes, func):
     else:
         x = np.random.random(shape)
 
-    r1 = getattr(mkl_fft, func)(x, s=s, axes=axes)
-    r2 = getattr(np.fft, func)(x, s=s, axes=axes)
+    r1 = getattr(np.fft, func)(x, s=s, axes=axes)
+    r2 = getattr(mkl_fft, func)(x, s=s, axes=axes)
+
+    rtol, atol = _get_rtol_atol(x)
+    assert_allclose(r1, r2, rtol=rtol, atol=atol)
+
+
+@requires_numpy_2
+@pytest.mark.parametrize("dtype", [complex, float])
+@pytest.mark.parametrize("s", [(15, 24, 10), [35, 25, 15], [25, 15, 5]])
+@pytest.mark.parametrize("axes", [(0, 1, 2), (-1, -2, -3), [1, 0, 2]])
+@pytest.mark.parametrize("func", ["fftn", "ifftn", "rfftn", "irfftn"])
+def test_s_axes_out(dtype, s, axes, func):
+    shape = (30, 20, 10)
+    if dtype is complex and func != "rfftn":
+        x = np.random.random(shape) + 1j * np.random.random(shape)
+    else:
+        x = np.random.random(shape)
+
+    r1 = getattr(np.fft, func)(x, s=s, axes=axes)
+    out = np.empty_like(r1)
+    r2 = getattr(mkl_fft, func)(x, s=s, axes=axes, out=out)
+    assert r2 is out
 
     rtol, atol = _get_rtol_atol(x)
     assert_allclose(r1, r2, rtol=rtol, atol=atol)
@@ -320,8 +339,8 @@ def test_repeated_axes(dtype, axes, func):
     else:
         x = np.random.random(shape)
 
-    r1 = getattr(mkl_fft, func)(x, axes=axes)
-    r2 = getattr(np.fft, func)(x, axes=axes)
+    r1 = getattr(np.fft, func)(x, axes=axes)
+    r2 = getattr(mkl_fft, func)(x, axes=axes)
 
     rtol, atol = _get_rtol_atol(x)
     assert_allclose(r1, r2, rtol=rtol, atol=atol)
@@ -340,4 +359,4 @@ def test_out_strided(axes, func):
     result = getattr(mkl_fft, func)(x, axes=axes, out=out)
     expected = getattr(np.fft, func)(x, axes=axes, out=out)
 
-    assert_allclose(result, expected)
+    assert_allclose(result, expected, strict=True)