diff --git a/.gitignore b/.gitignore index fd9d9e9..419f809 100644 --- a/.gitignore +++ b/.gitignore @@ -13,15 +13,13 @@ dist build wheelhouse -#Pytest +#Test .pytest_cache +*.pickle +.hypothesis/* #Pipenv .env* Pipfile* .venv* .pipenv-cache - -#Test outputs -*.pickle -.hypothesis/* diff --git a/.markdownlint.yaml b/.markdownlint.yaml index bf950d3..859154b 100644 --- a/.markdownlint.yaml +++ b/.markdownlint.yaml @@ -1,7 +1,9 @@ MD013: - line_length: 120 + line_length: 92 + code_block_line_length: 88 code_blocks: true tables: false + stern: true MD033: allowed_elements: - img diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 95f1e3d..740be91 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -108,7 +108,15 @@ repos: rev: v0.41.0 hooks: - id: markdownlint - args: ["--fix"] + # Ignore old internal README that will not be rendered as docs page + args: ["--fix", "--ignore", "internal/gtc2024/README.md"] + + - repo: https://github.com/sphinx-contrib/sphinx-lint + rev: v1.0.0 + hooks: + - id: sphinx-lint + args: ["--enable", "all", "--max-line-length", "92"] + exclude: ^(.*)generated/(.*)\.rst$ default_language_version: - python: python3 + python: python310 diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 7067c58..4a2f2ed 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,18 +1,21 @@ # Contributing -Thank you for your interest in contributing to `nvmath-python`! Based on the type of contribution, it will fall into -three categories: +Thank you for your interest in contributing to `nvmath-python`! Based on the type of +contribution, it will fall into three categories: 1. You want to report a bug, feature request, or documentation issue - - File an [issue](https://github.com/NVIDIA/nvmath-python/issues) describing what you encountered or what you want - to see changed. - - The NVIDIA team will evaluate the issues and triage them, scheduling them for a release. If you believe the issue - needs priority attention comment on the issue to notify the team. + - File an [issue](https://github.com/NVIDIA/nvmath-python/issues) describing what you + encountered or what you want to see changed. + - The NVIDIA team will evaluate the issues and triage them, scheduling them for a + release. If you believe the issue needs priority attention comment on the issue to + notify the team. 2. You want to implement a feature or bug-fix - - At this time we do not accept code contributions. However, we do plan to change and accept contributions in the - near future, although the timeline is TBD. Please let us know if you're interested in contributing (and in what - aspect) so that we can plan accordingly. + - At this time we do not accept code contributions. However, we do plan to change and + accept contributions in the near future, although the timeline is TBD. Please let us + know if you're interested in contributing (and in what aspect) so that we can plan + accordingly. 3. You want to share your nice work built upon `nvmath-python`: - - We would love to hear more about your work! Please share with us on [NVIDIA/nvmath-python GitHub - Discussions](https://github.com/NVIDIA/nvmath-python/discussions)! We also take any `nvmath-python`-related - questions on this forum. + - We would love to hear more about your work! Please share with us on + [NVIDIA/nvmath-python GitHub + Discussions](https://github.com/NVIDIA/nvmath-python/discussions)! We also take any + `nvmath-python`-related questions on this forum. 
diff --git a/DESCRIPTION.rst b/DESCRIPTION.rst index a3e1a40..36af58a 100644 --- a/DESCRIPTION.rst +++ b/DESCRIPTION.rst @@ -2,8 +2,10 @@ nvmath-python: NVIDIA Math Libraries for the Python Ecosystem ************************************************************* -`nvmath-python `_ brings the power and performance of NVIDIA math libraries to the Python ecosystem -with intuitive, pythonic APIs. The ultimate goal is to provide users full access to all of the available library features in a variety of execution spaces. +`nvmath-python `_ brings the power and +performance of NVIDIA math libraries to the Python ecosystem with intuitive, pythonic APIs. +The ultimate goal is to provide users full access to all of the available library features +in a variety of execution spaces. * `Documentation `_ * `Examples `_ @@ -15,4 +17,6 @@ Installation pip install nvmath-python[cu12] -Please refer to the `installation instructions `_ for different ways of installing nvmath-python, including building from source. +Please refer to the `installation instructions +`_ for different +ways of installing nvmath-python, including building from source. diff --git a/README.md b/README.md index 0092ee8..e1c9cef 100644 --- a/README.md +++ b/README.md @@ -1,41 +1,48 @@

(README header image: a green cube with three sides visible, with dots, a sine wave, and a grid on its three faces.)

# nvmath-python: NVIDIA Math Libraries for the Python Ecosystem -nvmath-python brings the power of the NVIDIA math libraries to the Python ecosystem. The package aims to provide -intuitive pythonic APIs that provide users full access to all the features offered by NVIDIA's libraries in a variety of -execution spaces. nvmath-python works seamlessly with existing Python array/tensor frameworks and focuses on providing +nvmath-python brings the power of the NVIDIA math libraries to the Python ecosystem. The +package aims to provide intuitive pythonic APIs that provide users full access to all the +features offered by NVIDIA's libraries in a variety of execution spaces. nvmath-python works +seamlessly with existing Python array/tensor frameworks and focuses on providing functionality that is missing from those frameworks. ## Some Examples -Using the nvmath-python API allows access to all parameters of the underlying NVIDIA cuBLASLt library. Some of these -parameters are unavailable in other wrappings of NVIDIA's C-API libraries. +Using the nvmath-python API allows access to all parameters of the underlying NVIDIA +cuBLASLt library. Some of these parameters are unavailable in other wrappings of NVIDIA's +C-API libraries. ```python import cupy as cp import nvmath -# Prepare sample input data. nvmath-python accepts input tensors from pytorch, cupy, and numpy. +# Prepare sample input data. nvmath-python accepts input tensors from pytorch, cupy, and +# numpy. m, n, k = 123, 456, 789 a = cp.random.rand(m, k).astype(cp.float32) b = cp.random.rand(k, n).astype(cp.float32) bias = cp.random.rand(m, 1).astype(cp.float32) -# Use the stateful Matmul object in order to perform multiple matrix multiplications without replanning. The nvmath API -# allows us to fine-tune our operations by, for example, selecting a mixed-precision compute type. +# Use the stateful Matmul object in order to perform multiple matrix multiplications +# without replanning. The nvmath API allows us to fine-tune our operations by, for +# example, selecting a mixed-precision compute type. mm = nvmath.linalg.advanced.Matmul( a, b, - options={"compute_type": nvmath.linalg.advanced.MatmulComputeType.COMPUTE_32F_FAST_16F}, + options={ + "compute_type": nvmath.linalg.advanced.MatmulComputeType.COMPUTE_32F_FAST_16F + }, ) -# Plan the matrix multiplication. Planning returns a sequence of algorithms that can be configured. We can also select -# epilog operations which are applied to the result of the multiplication without a separate function call. +# Plan the matrix multiplication. Planning returns a sequence of algorithms that can be +# configured. We can also select epilog operations which are applied to the result of +# the multiplication without a separate function call. mm.plan( epilog=nvmath.linalg.advanced.MatmulEpilog.BIAS, epilog_inputs={"bias": bias}, @@ -47,15 +54,16 @@ result = mm.execute() # Remember to free the Matmul object when finished or use it as a context manager mm.free() -# Synchronize the default stream, since by default the execution is non-blocking for GPU operands. +# Synchronize the default stream, since by default the execution is non-blocking for +# GPU operands. cp.cuda.get_current_stream().synchronize() print(f"Input types = {type(a), type(b)}, device = {a.device, b.device}") print(f"Result type = {type(result)}, device = {result.device}") ``` -nvmath-python exposes NVIDIA's device-side (Dx) APIs. This allows developers to call NVIDIA library functions inside -their custom device kernels. 
For example, a numba jit function can call cuFFT in order to implement FFT-based -convolution. +nvmath-python exposes NVIDIA's device-side (Dx) APIs. This allows developers to call NVIDIA +library functions inside their custom device kernels. For example, a numba jit function can +call cuFFT in order to implement FFT-based convolution. ```python import numpy as np @@ -63,7 +71,10 @@ from numba import cuda from nvmath.device import fft def random_complex(shape, real_dtype): - return np.random.randn(*shape).astype(real_dtype) + 1.j * np.random.randn(*shape).astype(real_dtype) + return ( + np.random.randn(*shape).astype(real_dtype) + + 1.j * np.random.randn(*shape).astype(real_dtype) + ) def main(): @@ -148,9 +159,9 @@ if __name__ == "__main__": main() ``` -nvmath-python provides the ability to write custom prologs and epilogs for FFT functions as a Python functions and -compiled them LTO-IR. For example, to have unitary scaling for an FFT, we can define an epilog which rescales the output -by 1/sqrt(N). +nvmath-python provides the ability to write custom prologs and epilogs for FFT functions as +a Python functions and compiled them LTO-IR. For example, to have unitary scaling for an +FFT, we can define an epilog which rescales the output by 1/sqrt(N). ```python import cupy as cp @@ -175,7 +186,8 @@ with cp.cuda.Device(): # Perform the forward FFT, applying the filter as a epilog... r = nvmath.fft.fft(a, axes=[-1], epilog={"ltoir": epilog}) -# Finally, we can test that the fused FFT run result matches the result of separate calls +# Finally, we can test that the fused FFT run result matches the result of separate +# calls s = cp.fft.fftn(a, axes=[-1], norm="ortho") assert cp.allclose(r, s) @@ -187,5 +199,6 @@ All files hosted in this repository are subject to the [Apache 2.0](./LICENSE) l ## Disclaimer -nvmath-python is in a Beta state. Beta products may not be fully functional, may contain errors or design flaws, and may -be changed at any time without notice. We appreciate your feedback to improve and iterate on our Beta products. +nvmath-python is in a Beta state. Beta products may not be fully functional, may contain +errors or design flaws, and may be changed at any time without notice. We appreciate your +feedback to improve and iterate on our Beta products. diff --git a/SECURITY.md b/SECURITY.md index 656375b..5e34562 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -1,28 +1,32 @@ # Security -NVIDIA is dedicated to the security and trust of our software products and services, including all source code -repositories managed through our organization. +NVIDIA is dedicated to the security and trust of our software products and services, +including all source code repositories managed through our organization. -If you need to report a security issue, please use the appropriate contact points outlined below. **Please do not report -security vulnerabilities through GitHub/GitLab.** +If you need to report a security issue, please use the appropriate contact points outlined +below. 
**Please do not report security vulnerabilities through GitHub/GitLab.** ## Reporting Potential Security Vulnerability in nvmath-python To report a potential security vulnerability in nvmath-python: -- Web: [Security Vulnerability Submission Form](https://www.nvidia.com/object/submit-security-vulnerability.html) +- Web: [Security Vulnerability Submission + Form](https://www.nvidia.com/object/submit-security-vulnerability.html) - E-Mail: - - We encourage you to use the following PGP key for secure email communication: [NVIDIA public PGP Key for communication](https://www.nvidia.com/en-us/security/pgp-key) + - We encourage you to use the following PGP key for secure email communication: [NVIDIA + public PGP Key for communication](https://www.nvidia.com/en-us/security/pgp-key) - Please include the following information: - Product/Driver name and version/branch that contains the vulnerability - Type of vulnerability (code execution, denial of service, buffer overflow, etc.) - Instructions to reproduce the vulnerability - Proof-of-concept or exploit code - - Potential impact of the vulnerability, including how an attacker could exploit the vulnerability + - Potential impact of the vulnerability, including how an attacker could exploit the + vulnerability -While NVIDIA currently does not have a bug bounty program, we do offer acknowledgement when an externally reported -security issue is addressed under our coordinated vulnerability disclosure policy. Please visit our [Product Security -Incident Response Team (PSIRT)](https://www.nvidia.com/en-us/security/psirt-policies/) policies page for more +While NVIDIA currently does not have a bug bounty program, we do offer acknowledgement when +an externally reported security issue is addressed under our coordinated vulnerability +disclosure policy. Please visit our [Product Security Incident Response Team +(PSIRT)](https://www.nvidia.com/en-us/security/psirt-policies/) policies page for more information. ## NVIDIA Product Security diff --git a/builder/utils.py b/builder/utils.py index 98ba9ae..7c123f5 100644 --- a/builder/utils.py +++ b/builder/utils.py @@ -93,10 +93,11 @@ def _prep_includes_libs_rpaths(self, lib_name): if lib_name is not None: ldflag = "-Wl,--disable-new-dtags" if lib_name == "nvpl": - # 1. the nvpl bindings land in site-packages/nvmath/bindings/nvpl/_internal/ - # as opposed to other packages that have their bindings in - # site-packages/nvmath/bindings/_internal/, so we need one extra `..` - # to get into `site-packages` and then the lib_name=nvpl is not in nvidia + # 1. the nvpl bindings land in + # site-packages/nvmath/bindings/nvpl/_internal/ as opposed to other + # packages that have their bindings in + # site-packages/nvmath/bindings/_internal/, so we need one extra `..` to + # get into `site-packages` and then the lib_name=nvpl is not in nvidia # dir but directly in the site-packages. # 2. 
mkl lib is placed directly in the python `lib` directory, not in # python{ver}/site-packages diff --git a/docs/Makefile b/docs/Makefile index 068d878..de6a9ce 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -3,7 +3,7 @@ SHELL=/bin/bash # You can set these variables from the command line or environment SPHINX_NVMATH_PYTHON_VER ?= $(shell [[ $$(< ../nvmath/_version.py) =~ __version__[^0-9.]*([0-9.]*) ]] && echo $${BASH_REMATCH[1]}) -SPHINXOPTS ?= -j auto +SPHINXOPTS ?= -W SPHINXBUILD ?= sphinx-build SOURCEDIR = sphinx BUILDDIR ?= _build/html @@ -27,4 +27,10 @@ html: Makefile cd $(BUILDDIR) && cp -a ${SPHINX_NVMATH_PYTHON_VER}/* latest/ cp $(BUILDDIR)/latest/objects.inv $(BUILDDIR) -.PHONY: help Makefile clean html + +linkcheck: Makefile + $(eval $@_BUILDDIR := $(shell mktemp -d)) + $(SPHINXBUILD) -b $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + rm -rf $($@_BUILDDIR) + +.PHONY: help Makefile clean html linkcheck diff --git a/docs/sphinx/README.md b/docs/README.md similarity index 100% rename from docs/sphinx/README.md rename to docs/README.md diff --git a/docs/sphinx/_static/switcher.json b/docs/sphinx/_static/switcher.json index dc23530..aabddac 100644 --- a/docs/sphinx/_static/switcher.json +++ b/docs/sphinx/_static/switcher.json @@ -3,6 +3,10 @@ "version": "latest", "url": "https://docs.nvidia.com/cuda/nvmath-python/latest" }, + { + "version": "0.2.1", + "url": "https://docs.nvidia.com/cuda/nvmath-python/0.2.1" + }, { "version": "0.2.0", "url": "https://docs.nvidia.com/cuda/nvmath-python/0.2.0" diff --git a/docs/sphinx/_templates/autosummary/namedtuple.rst b/docs/sphinx/_templates/autosummary/namedtuple.rst index 1568d7e..35bc36c 100644 --- a/docs/sphinx/_templates/autosummary/namedtuple.rst +++ b/docs/sphinx/_templates/autosummary/namedtuple.rst @@ -5,4 +5,5 @@ .. autoclass:: {{ objname }} :members: __new__ :special-members: __new__ - :exclude-members: count, index, __reduce__, __reduce_ex__, __repr__, __hash__, __str__, __getnewargs__ + :exclude-members: count, index, __reduce__, __reduce_ex__, __repr__, __hash__, + __str__, __getnewargs__ diff --git a/docs/sphinx/_templates/main.html b/docs/sphinx/_templates/main.html index e06fcb4..b249469 100644 --- a/docs/sphinx/_templates/main.html +++ b/docs/sphinx/_templates/main.html @@ -6,7 +6,7 @@ -

-    If this page does not refresh automatically, then please direct your browser to
+    If this page does not refresh automatically, direct your browser to
     our latest docs.

diff --git a/docs/sphinx/bindings/cublas.rst b/docs/sphinx/bindings/cublas.rst index a7a4477..2cdba83 100644 --- a/docs/sphinx/bindings/cublas.rst +++ b/docs/sphinx/bindings/cublas.rst @@ -3,7 +3,8 @@ cuBLAS (:mod:`nvmath.bindings.cublas`) ====================================== -For detailed documentation on the original C APIs, please refer to `cuBLAS documentation `_. +For detailed documentation on the original C APIs, refer to the `cuBLAS documentation +`_. Enums and constants ******************* diff --git a/docs/sphinx/bindings/cublasLt.rst b/docs/sphinx/bindings/cublasLt.rst index 47b4735..a0dc8c7 100644 --- a/docs/sphinx/bindings/cublasLt.rst +++ b/docs/sphinx/bindings/cublasLt.rst @@ -3,7 +3,8 @@ cuBLASLt (:mod:`nvmath.bindings.cublaslt`) ========================================== -For detailed documentation on the original C APIs, please refer to `cuBLASLt documentation `_. +For detailed documentation on the original C APIs, refer to the `cuBLASLt documentation +`_. Enums and constants ******************* diff --git a/docs/sphinx/bindings/cufft.rst b/docs/sphinx/bindings/cufft.rst index b726ac4..c606913 100644 --- a/docs/sphinx/bindings/cufft.rst +++ b/docs/sphinx/bindings/cufft.rst @@ -3,7 +3,8 @@ cuFFT (:mod:`nvmath.bindings.cufft`) ==================================== -For detailed documentation on the original C APIs, please refer to `cuFFT documentation `_. +For detailed documentation on the original C APIs, refer to the `cuFFT documentation +`_. Enums and constants ******************* diff --git a/docs/sphinx/bindings/curand.rst b/docs/sphinx/bindings/curand.rst index 5d60249..9a874e1 100644 --- a/docs/sphinx/bindings/curand.rst +++ b/docs/sphinx/bindings/curand.rst @@ -3,7 +3,8 @@ cuRAND (:mod:`nvmath.bindings.curand`) ====================================== -For detailed documentation on the original C APIs, please refer to `cuRAND documentation `_. +For detailed documentation on the original C APIs, refer to the `cuRAND documentation +`_. Enums and constants ******************* diff --git a/docs/sphinx/bindings/cusolver.rst b/docs/sphinx/bindings/cusolver.rst index c2ea353..8bbcb79 100644 --- a/docs/sphinx/bindings/cusolver.rst +++ b/docs/sphinx/bindings/cusolver.rst @@ -3,7 +3,8 @@ cuSOLVER (:mod:`nvmath.bindings.cusolver`) ========================================== -For detailed documentation on the original C APIs, please refer to `cuSOLVER documentation `_. +For detailed documentation on the original C APIs, refer to the `cuSOLVER documentation +`_. Enums and constants ******************* diff --git a/docs/sphinx/bindings/cusolverDn.rst b/docs/sphinx/bindings/cusolverDn.rst index 12247ff..393af02 100644 --- a/docs/sphinx/bindings/cusolverDn.rst +++ b/docs/sphinx/bindings/cusolverDn.rst @@ -3,7 +3,8 @@ cuSOLVERDn (:mod:`nvmath.bindings.cusolverDn`) ============================================== -For detailed documentation on the original C APIs, please refer to `cuSOLVERDn documentation `_. +For detailed documentation on the original C APIs, refer to the `cuSOLVERDn documentation +`_. Enums and constants ******************* diff --git a/docs/sphinx/bindings/cusparse.rst b/docs/sphinx/bindings/cusparse.rst index 4baf321..98873e2 100644 --- a/docs/sphinx/bindings/cusparse.rst +++ b/docs/sphinx/bindings/cusparse.rst @@ -3,7 +3,8 @@ cuSPARSE (:mod:`nvmath.bindings.cusparse`) ========================================== -For detailed documentation on the original C APIs, please refer to `cuSPARSE documentation `_. 
+For detailed documentation on the original C APIs, refer to the `cuSPARSE documentation +`_. Enums and constants ******************* diff --git a/docs/sphinx/bindings/index.rst b/docs/sphinx/bindings/index.rst index 6722994..d87532f 100644 --- a/docs/sphinx/bindings/index.rst +++ b/docs/sphinx/bindings/index.rst @@ -5,15 +5,21 @@ nvmath-python Bindings Overview ======== -.. warning:: All Python bindings documented in this section are *experimental* and subject to future changes. Use it at your own risk. +.. warning:: -Low-level Python bindings for C APIs from NVIDIA Math Libraries are exposed under the corresponding modules in :mod:`nvmath.bindings`. -To access the Python bindings, use the modules for the corresponding libraries. -Under the hood, nvmath-python handles the run-time linking to the libraries for you lazily. + All Python bindings documented in this section are *experimental* and subject to future + changes. Use it at your own risk. -The currently supported libraries along with the corresponding module names are listed as follows: +Low-level Python bindings for C APIs from NVIDIA Math Libraries are exposed under the +corresponding modules in :mod:`nvmath.bindings`. To access the Python bindings, use the +modules for the corresponding libraries. Under the hood, nvmath-python handles the run-time +linking to the libraries for you lazily. + +The currently supported libraries along with the corresponding module names are listed as +follows: .. module:: nvmath + :no-index: .. list-table:: :widths: 20 40 @@ -42,13 +48,15 @@ Support for more libraries will be added in the future. Naming & Calling Convention =========================== -Inside each of the modules, all public APIs of the corresponding NVIDIA Math library are exposed following the `PEP 8`_ style guide along with the following changes: +Inside each of the modules, all public APIs of the corresponding NVIDIA Math library are +exposed following the `PEP 8`_ style guide along with the following changes: * All library name prefixes are stripped * The function names are broken by words and follow the camel case * The first letter in each word in the enum names are capitalized * Each enum's name prefix is stripped from its values' names -* Whenever applicable, the outputs are stripped away from the function arguments and returned directly as Python objects +* Whenever applicable, the outputs are stripped away from the function arguments and + returned directly as Python objects * Pointers are passed as Python :class:`int` * Exceptions are raised instead of returning the C error code @@ -61,11 +69,14 @@ Below is a non-exhaustive list of examples of such C-to-Python mappings: - Enum type: ``cublasLtMatmulTile_t`` -> :class:`cublasLt.MatmulTile` - Enum type: ``cufftXtSubFormat`` -> :class:`cufft.XtSubFormat` - Enum value name: ``CUSOLVER_EIG_MODE_NOVECTOR`` -> :data:`cusolver.EigMode.NOVECTOR` -- Enum value name: ``CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED`` -> :data:`cusparse.Status.MATRIX_TYPE_NOT_SUPPORTED` -- Returns: The outputs of ``cusolverDnXpotrf_bufferSize`` are the workspace sizes on device and host, which are wrapped as a 2-tuple in the corresponding :func:`cusolverDn.xpotrf_buffer_size` Python API. 
+- Enum value name: ``CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED`` -> + :data:`cusparse.Status.MATRIX_TYPE_NOT_SUPPORTED` +- Returns: The outputs of ``cusolverDnXpotrf_bufferSize`` are the workspace sizes on device + and host, which are wrapped as a 2-tuple in the corresponding + :func:`cusolverDn.xpotrf_buffer_size` Python API. -There may be exceptions for the above rules, but they would be self-evident and will be properly documented. In the next section we discuss -pointer passing in Python. +There may be exceptions for the above rules, but they would be self-evident and will be +properly documented. In the next section we discuss pointer passing in Python. .. _PEP 8: https://www.python.org/dev/peps/pep-0008/ @@ -76,23 +87,26 @@ Memory management Pointer and data lifetime ......................... -Unlike in C/C++, Python does not provide low-level primitives to allocate/deallocate host memory (nor device memory). -In order to make the C APIs work with Python, it is important that memory management is properly done through Python proxy objects. -In nvmath-python, we ask users to address such needs using NumPy (for host memory) and CuPy (for device memory). +Unlike in C/C++, Python does not provide low-level primitives to allocate/deallocate host +memory (nor device memory). In order to make the C APIs work with Python, it is important +that memory management is properly done through Python proxy objects. In nvmath-python, we +ask users to address such needs using NumPy (for host memory) and CuPy (for device memory). .. note:: - It is also possible to use :class:`array.array` (plus :class:`memoryview` as needed) to manage host memory. However it is more - laborious compared to using :class:`numpy.ndarray`, especially when it comes to array manipulation and computation. + It is also possible to use :class:`array.array` (plus :class:`memoryview` as needed) to + manage host memory. However it is more laborious compared to using + :class:`numpy.ndarray`, especially when it comes to array manipulation and computation. .. note:: - It is also possible to use `CUDA Python`_ to manage device memory, but as of CUDA 11 there is no simple, pythonic way to - modify the contents stored on GPU, which requires custom kernels. CuPy is a lightweight, NumPy-compatible array library that - addresses this need. + It is also possible to use `CUDA Python`_ to manage device memory, but as of CUDA 11 + there is no simple, pythonic way to modify the contents stored on GPU, which requires + custom kernels. CuPy is a lightweight, NumPy-compatible array library that addresses + this need. -To pass data from Python to C, using pointer addresses (as Python :class:`int`) of various objects is required. We illustrate this using NumPy/CuPy arrays -as follows: +To pass data from Python to C, using pointer addresses (as Python :class:`int`) of various +objects is required. We illustrate this using NumPy/CuPy arrays as follows: .. code-block:: python @@ -119,16 +133,18 @@ as follows: my_func(..., buf.ptr, ...) # buf is automatically destroyed when going out of scope -Please be aware that the underlying assumption is that the arrays must be contiguous in memory (unless the C interface allows -for specifying the array strides). +The underlying assumption is that the arrays must be contiguous in +memory (unless the C interface allows for specifying the array strides). 
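
For instance, contiguity can be checked and enforced on the Python side before taking the
pointer address. The sketch below assumes NumPy host memory and reuses the ``my_func``
placeholder from the snippets above; it is an illustration only, not an nvmath-python API:

.. code-block:: python

    import numpy as np

    # A strided slice is generally not contiguous in memory.
    buf = np.arange(64, dtype=np.float64).reshape(8, 8)[:, ::2]

    # Make a contiguous copy before passing the raw pointer address to the C API.
    if not buf.flags.c_contiguous:
        buf = np.ascontiguousarray(buf)

    my_func(..., buf.ctypes.data, ...)  # pointer address passed as a Python int
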
-As a consequence, all C structs in NVIDIA Math libraries (including handles and descriptors) are *not exposed* -as Python classes; that is, they do not have their own types and are simply cast to plain Python :class:`int` for passing around. Any -downstream consumer should create a wrapper class to hold the pointer address if so desired. In other words, users have full -control (and responsibility) for managing the *pointer lifetime*. +As a consequence, all C structs in NVIDIA Math libraries (including handles and descriptors) +are *not exposed* as Python classes; that is, they do not have their own types and are +simply cast to plain Python :class:`int` for passing around. Any downstream consumer should +create a wrapper class to hold the pointer address if so desired. In other words, users have +full control (and responsibility) for managing the *pointer lifetime*. -However, in certain cases we are able to convert Python objects for users (if *readonly, host* arrays are needed) so as to alleviate -users' burden. For example, in functions that require a sequence or a nested sequence, the following operations are equivalent: +However, in certain cases we are able to convert Python objects for users (if *readonly, +host* arrays are needed) so as to alleviate users' burden. For example, in functions that +require a sequence or a nested sequence, the following operations are equivalent: .. code-block:: python @@ -140,14 +156,16 @@ users' burden. For example, in functions that require a sequence or a nested seq buf = [0, 1, 3, 5, 6] my_func(..., buf, ...) # the underlying data type is determined by the C API -which is particularly useful when users need to pass multiple sequences or nested sequences to C (ex: :func:`nvmath.bindings.cufft.plan_many`). +which is particularly useful when users need to pass multiple sequences or nested sequences +to C (ex: :func:`nvmath.bindings.cufft.plan_many`). .. note:: - Some functions require their arguments to be in the device memory. - You need to pass device memory (for example, `cupy.ndarray`) to such arguments. - nvmath-python does not validate the memory pointers passed and does not implicitly transfer the data. - Passing host memory where device memory is expected (and vice versa) results in undefined behavior. + Some functions require their arguments to be in the device memory. You need to pass + device memory (for example, :class:`cupy.ndarray`) to such arguments. nvmath-python + does not validate the memory pointers passed and does not implicitly transfer the data. + Passing host memory where device memory is expected (and vice versa) results in + undefined behavior. .. _CUDA Python: https://nvidia.github.io/cuda-python/index.html diff --git a/docs/sphinx/conf.py b/docs/sphinx/conf.py index 1c522ff..ad66638 100644 --- a/docs/sphinx/conf.py +++ b/docs/sphinx/conf.py @@ -58,8 +58,8 @@ # built documents. with open("../../nvmath/_version.py") as f: exec(f.read()) - nvmath_py_ver = __version__ - del __version__ + nvmath_py_ver = __version__ # noqa: F821 + del __version__ # noqa: F821 # The short X.Y version. 
version = nvmath_py_ver @@ -193,7 +193,8 @@ def autodoc_process_docstring(app, what, name, obj, options, lines): match_numba_dtype = re.search(r"nvmath.device.float(\d+)x(\d+)_type", name) if match_numba_dtype: lines.append( - f"A Numba compliant vector type object for float{match_numba_dtype.group(1)} with vector length {match_numba_dtype.group(2)} \n" + f"A Numba compliant vector type object for float{match_numba_dtype.group(1)} " + f"with vector length {match_numba_dtype.group(2)} \n" ) @@ -203,9 +204,9 @@ class PatchedEnumDocumenter(EnumDocumenter): """ def generate(self, *args, **kwargs): - output = super().generate(*args, **kwargs) + super().generate(*args, **kwargs) for i in range(1, len(self.directive.result)): - if self.directive.result[i - 1].startswith(" alias of"): + if self.directive.result[i - 1].startswith(" alias of"): # noqa: SIM102 if self.directive.result[i].startswith(" :Member Type:"): self.directive.result.insert(i, "", "") @@ -392,7 +393,7 @@ def linkcode_resolve(domain, info): if mod.__name__.split(".")[0] not in _top_modules: return None - # If it's wrapped (e.g., by `contextlib.contextmanager`), unwrap it + # If it's wrapped (by `contextlib.contextmanager` for example), unwrap it try: obj = inspect.unwrap(obj) except ValueError as e: diff --git a/docs/sphinx/device-apis/cublas.rst b/docs/sphinx/device-apis/cublas.rst new file mode 100644 index 0000000..5d3f833 --- /dev/null +++ b/docs/sphinx/device-apis/cublas.rst @@ -0,0 +1,37 @@ + +************************************ +cuBLASDx APIs (:mod:`nvmath.device`) +************************************ + +.. _device-api-cublas-overview: + +Overview +======== + +These APIs offer integration with the NVIDIA cuBLASDx library. +Detailed documentation of cuBLASDx can be found in the +`cuBLASDx documentation `_. + +.. note:: + + The :class:`~nvmath.device.matmul` device API in module + :mod:`nvmath.device` currently supports cuBLASDx 0.1.1, also available + as part of MathDx 24.04. + +.. _device-api-cublas-reference: + +API Reference +============= + +.. currentmodule:: nvmath.device + +.. autosummary:: + :toctree: generated/ + + matmul + BlasOptions + + :template: namedtuple.rst + + LeadingDimension + TransposeMode diff --git a/docs/sphinx/device-apis/cufft.rst b/docs/sphinx/device-apis/cufft.rst new file mode 100644 index 0000000..065ff0a --- /dev/null +++ b/docs/sphinx/device-apis/cufft.rst @@ -0,0 +1,34 @@ + +*********************************** +cuFFTDx APIs (:mod:`nvmath.device`) +*********************************** + +.. _device-api-cufft-overview: + +Overview +======== + +These APIs offer integration with the NVIDIA cuFFTDx library. +Detailed documentation of cuBLASDx can be found in the +`cuFFTDx documentation `_. + +.. note:: + + The :class:`~nvmath.device.fft` device APIs in module + :mod:`nvmath.device` currently support cuFFTDx 1.2.0, also available + as part of MathDx 24.04. All functionalities from the C++ library are supported with + the exception of cuFFTDx C++ APIs with a workspace argument, which are currently not + available in nvmath-python. + +.. _device-api-cufft-reference: + +API Reference +============= + +.. currentmodule:: nvmath.device + +.. 
autosummary:: + :toctree: generated/ + + fft + FFTOptions diff --git a/docs/sphinx/device-apis/curand.rst b/docs/sphinx/device-apis/curand.rst new file mode 100644 index 0000000..01c62d8 --- /dev/null +++ b/docs/sphinx/device-apis/curand.rst @@ -0,0 +1,100 @@ + +************************************************ +cuRAND Device APIs (:mod:`nvmath.device.random`) +************************************************ + +.. _device-api-curand-overview: + +Overview +======== + +This module provides access to the device APIs of NVIDIA cuRAND library, which allows +random number generation on the GPU. +Detailed documentation of cuRAND device APIs can be found in the +`cuRAND documentation +`_. + +.. _device-api-curand-reference: + +API Reference +============= + +.. module:: nvmath.device.random + +Utilities +^^^^^^^^^ + +.. currentmodule:: nvmath.device.random +.. autosummary:: + :toctree: generated/ + + Compile + +Bit Generator and State APIs (:mod:`nvmath.device.random`) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. currentmodule:: nvmath.device.random +.. autosummary:: + :toctree: generated/ + + init + rand + rand4 + + StatesMRG32k3a + StatesPhilox4_32_10 + StatesSobol32 + StatesSobol64 + StatesScrambledSobol32 + StatesScrambledSobol64 + StatesXORWOW + + +Distribution Sampling APIs (:mod:`nvmath.device.random`) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. currentmodule:: nvmath.device.random +.. autosummary:: + :toctree: generated/ + + normal + normal_double + normal2 + normal2_double + normal4 + log_normal + log_normal_double + log_normal2 + log_normal2_double + log_normal4 + poisson + poisson4 + uniform + uniform_double + uniform2_double + uniform4 + +Skip Ahead APIs (:mod:`nvmath.device.random`) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. currentmodule:: nvmath.device.random +.. autosummary:: + :toctree: generated/ + + skipahead + skipahead_sequence + skipahead_subsequence + +Helper Host APIs (:mod:`nvmath.device.random_helpers`) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. module:: nvmath.device.random_helpers +.. currentmodule:: nvmath.device.random_helpers +.. autosummary:: + :toctree: generated/ + + get_direction_vectors32 + get_direction_vectors64 + get_scramble_constants32 + get_scramble_constants64 + DirectionVectorSet diff --git a/docs/sphinx/device-apis/index.rst b/docs/sphinx/device-apis/index.rst index 968915a..d25a257 100644 --- a/docs/sphinx/device-apis/index.rst +++ b/docs/sphinx/device-apis/index.rst @@ -3,158 +3,38 @@ nvmath-python Device APIs ************************* -.. _device-api-overview: +.. module:: nvmath.device -Overview -======== +.. _device-api-overview: -The device module of nvmath-python :mod:`nvmath.device` offers integration with NVIDIA's high-performance computing libraries through device APIs for cuFFTDx, cuBLASDx, and cuRAND. -Detailed documentation for these libraries can be found at `cuFFTDx `_, `cuBLASDx `_, and -`cuRAND device APIs `_ respectively. +The device module of nvmath-python :mod:`nvmath.device` offers integration with NVIDIA's +high-performance computing libraries through device APIs for cuFFTDx, cuBLASDx, and cuRAND. +Detailed documentation for these libraries can be found at `cuFFTDx +`_, `cuBLASDx +`_, and `cuRAND device APIs +`_ respectively. 
Users may take advantage of the device module via the two approaches below: -- Numba Extensions: Users can access these device APIs via Numba by utilizing specific extensions that simplify the process of defining functions, - querying device traits, and calling device functions. -- Third-party JIT Compilers: The APIs are also available through low-level interfaces in other JIT compilers, - allowing advanced users to work directly with the raw device code. +- Numba Extensions: Users can access these device APIs via Numba by utilizing specific + extensions that simplify the process of defining functions, querying device traits, and + calling device functions. +- Third-party JIT Compilers: The APIs are also available through low-level interfaces in + other JIT compilers, allowing advanced users to work directly with the raw device code. .. note:: - The :class:`~nvmath.device.fft` and :class:`~nvmath.device.matmul` device APIs in module :mod:`nvmath.device` currently supports cuFFTDx 1.2.0 and cuBLASDx 0.1.1, also available as part of MathDx 24.04. - All functionalities from the C++ libraries are supported with the exception of cuFFTDx C++ APIs with a workspace argument, which are currently not available in nvmath-python. - - -.. _device-api-reference: - -API Reference -============= - -.. module:: nvmath.device - -Utility APIs (:mod:`nvmath.device`) ------------------------------------ - -.. autosummary:: - :toctree: generated/ - - current_device_lto - float16x2 - float16x4 - float32x2 - float64x2 - float16x2_type - float16x4_type - float32x2_type - float64x2_type - - :template: namedtuple.rst - - ISAVersion - Code - CodeType - ComputeCapability - CodeType - Symbol - Dim3 - -cuBLASDx APIs (:mod:`nvmath.device`) ------------------------------------- - -.. autosummary:: - :toctree: generated/ - - matmul - BlasOptions - - :template: namedtuple.rst - - LeadingDimension - TransposeMode - -cuFFTDx APIs (:mod:`nvmath.device`) ------------------------------------ - -.. autosummary:: - :toctree: generated/ - - fft - FFTOptions - -cuRAND Device APIs (:mod:`nvmath.device.random`) ------------------------------------------------- - -.. currentmodule:: nvmath.device.random -.. autosummary:: - :toctree: generated/ - - Compile - - -Bit Generator and State APIs (:mod:`nvmath.device.random`) -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -.. currentmodule:: nvmath.device.random -.. autosummary:: - :toctree: generated/ - - init - rand - rand4 - - StatesMRG32k3a - StatesPhilox4_32_10 - StatesSobol32 - StatesSobol64 - StatesScrambledSobol32 - StatesScrambledSobol64 - StatesXORWOW - - -Distribution Sampling APIs (:mod:`nvmath.device.random`) -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -.. currentmodule:: nvmath.device.random -.. autosummary:: - :toctree: generated/ - - normal - normal_double - normal2 - normal2_double - normal4 - log_normal - log_normal_double - log_normal2 - log_normal2_double - log_normal4 - poisson - poisson4 - uniform - uniform_double - uniform2_double - uniform4 - -Skip Ahead APIs (:mod:`nvmath.device.random`) -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -.. currentmodule:: nvmath.device.random -.. 
autosummary:: - :toctree: generated/ - - skipahead - skipahead_sequence - skipahead_subsequence - -Helper Host APIs (:mod:`nvmath.device.random_helpers`) -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + The :class:`~nvmath.device.fft` and :class:`~nvmath.device.matmul` device APIs in module + :mod:`nvmath.device` currently supports cuFFTDx 1.2.0 and cuBLASDx 0.1.1, also available + as part of MathDx 24.04. All functionalities from the C++ libraries are supported with + the exception of cuFFTDx C++ APIs with a workspace argument, which are currently not + available in nvmath-python. -.. currentmodule:: nvmath.device.random_helpers -.. autosummary:: - :toctree: generated/ +.. toctree:: + :caption: Contents + :maxdepth: 1 - get_direction_vectors32 - get_direction_vectors64 - get_scramble_constants32 - get_scramble_constants64 - DirectionVectorSet + Device API utilities + cuBLASDx + cuFFTDx + cuRAND Device APIs diff --git a/docs/sphinx/device-apis/utils.rst b/docs/sphinx/device-apis/utils.rst new file mode 100644 index 0000000..e250afd --- /dev/null +++ b/docs/sphinx/device-apis/utils.rst @@ -0,0 +1,40 @@ +******************************************* +Device API utilities (:mod:`nvmath.device`) +******************************************* + +.. _device-api-util-overview: + +Overview +======== + +nvmath-python provides the following device-side utilities. + +.. _device-api-util-reference: + +API Reference +============= + +.. currentmodule:: nvmath.device + +.. autosummary:: + :toctree: generated/ + + current_device_lto + float16x2 + float16x4 + float32x2 + float64x2 + float16x2_type + float16x4_type + float32x2_type + float64x2_type + + :template: namedtuple.rst + + ISAVersion + Code + CodeType + ComputeCapability + CodeType + Symbol + Dim3 diff --git a/docs/sphinx/fft/index.rst b/docs/sphinx/fft/index.rst index e3ebd44..d9f3414 100644 --- a/docs/sphinx/fft/index.rst +++ b/docs/sphinx/fft/index.rst @@ -7,10 +7,12 @@ Fast Fourier Transform Overview ======== -The Fast Fourier Transform (FFT) module :mod:`nvmath.fft` in nvmath-python leverages the NVIDIA cuFFT library and provides a powerful suite of APIs -that can be directly called from the host to efficiently perform discrete Fourier Transformations. Both stateless function-form APIs and stateful class-form APIs are -provided to support a spectrum of N-dimensional FFT operations. These include forward and inverse transformations, as well as complex-to-complex (C2C), complex-to-real (C2R), -and real-to-complex (R2C) transforms: +The Fast Fourier Transform (FFT) module :mod:`nvmath.fft` in nvmath-python leverages the +NVIDIA cuFFT library and provides a powerful suite of APIs that can be directly called from +the host to efficiently perform discrete Fourier Transformations. Both stateless +function-form APIs and stateful class-form APIs are provided to support a spectrum of +N-dimensional FFT operations. These include forward and inverse transformations, as well as +complex-to-complex (C2C), complex-to-real (C2R), and real-to-complex (R2C) transforms: - N-dimensional forward C2C FFT transform by :func:`nvmath.fft.fft`. - N-dimensional inverse C2C FFT transform by :func:`nvmath.fft.ifft`. @@ -18,17 +20,25 @@ and real-to-complex (R2C) transforms: - N-dimensional inverse C2R FFT transform by :func:`nvmath.fft.irfft`. - All types of N-dimensional FFT by stateful :class:`nvmath.fft.FFT`. 
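
A minimal sketch of the function-form API listed above, assuming a CuPy operand resident on
the GPU (the shape, dtype, and transform axis are arbitrary choices for illustration):

.. code-block:: python

    import cupy as cp
    import nvmath

    # A batch of complex-valued signals (complex128) on the GPU.
    a = cp.random.rand(64, 256) + 1j * cp.random.rand(64, 256)

    # N-D forward C2C FFT restricted to the last axis; the leading axis is batched.
    b = nvmath.fft.fft(a, axes=[-1])

    # The unnormalized forward transform matches the CuPy reference result.
    assert cp.allclose(b, cp.fft.fftn(a, axes=[-1]))
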
-Furthermore, the :class:`nvmath.fft.FFT` class includes utility APIs designed to help users cache FFT plans, facilitating the efficient execution of repeated calculations across various computational tasks -(see :meth:`~nvmath.fft.FFT.create_key`). +Furthermore, the :class:`nvmath.fft.FFT` class includes utility APIs designed to help users +cache FFT plans, facilitating the efficient execution of repeated calculations across +various computational tasks (see :meth:`~nvmath.fft.FFT.create_key`). -The FFT transforms performed on GPU can be fused with other operations using :ref:`FFT callbacks `. -This enables users to write custom functions in Python for pre or post-processing, while leveraging Just-In-Time (JIT) and Link-Time Optimization (LTO). +The FFT transforms performed on GPU can be fused with other operations using :ref:`FFT +callbacks `. This enables users to write custom functions in Python for pre or +post-processing, while leveraging Just-In-Time (JIT) and Link-Time Optimization (LTO). -Users can also choose :ref:`CPU execution ` to utilize all available computational resources. +Users can also choose :ref:`CPU execution ` to utilize all available +computational resources. .. note:: - The API :func:`~nvmath.fft.fft` and related function-form APIs perform **N-D FFT** operations, similar to :func:`numpy.fft.fftn`. There are no special 1-D (:func:`numpy.fft.fft`) or 2-D FFT (:func:`numpy.fft.fft2`) APIs. - This not only reduces the API surface, but also avoids the potential for incorrect use because the number of batch dimensions is :math:`N - 1` for :func:`numpy.fft.fft` and :math:`N - 2` for :func:`numpy.fft.fft2`, where :math:`N` is the operand dimension. + + The API :func:`~nvmath.fft.fft` and related function-form APIs perform **N-D FFT** + operations, similar to :func:`numpy.fft.fftn`. There are no special 1-D + (:func:`numpy.fft.fft`) or 2-D FFT (:func:`numpy.fft.fft2`) APIs. This not only reduces + the API surface, but also avoids the potential for incorrect use because the number of + batch dimensions is :math:`N - 1` for :func:`numpy.fft.fft` and :math:`N - 2` for + :func:`numpy.fft.fft2`, where :math:`N` is the operand dimension. .. _fft-callback: @@ -36,24 +46,31 @@ Users can also choose :ref:`CPU execution ` to utilize al FFT Callbacks ============= -User-defined functions can be `compiled to the LTO-IR format `_ and provided as epilog or prolog to the FFT operation, -allowing for Link-Time Optimization and fusing. This can be used to implement DFT-based convolutions or scale the FFT output, for example. +User-defined functions can be `compiled to the LTO-IR format +`_ and +provided as epilog or prolog to the FFT operation, allowing for Link-Time Optimization and +fusing. This can be used to implement DFT-based convolutions or scale the FFT output, for +example. -The FFT module comes with convenient helper functions :func:`nvmath.fft.compile_prolog` and :func:`nvmath.fft.compile_epilog` that -compile functions written in Python to LTO-IR format. Under the hood, the helpers rely on Numba as the compiler. -The compiled callbacks can be passed to functional or stateful FFT APIs as :class:`~nvmath.fft.DeviceCallable`. -Alternatively, users can compile the callbacks to LTO-IR format with a compiler of their choice and pass them as :class:`~nvmath.fft.DeviceCallable` to the FFT call. +The FFT module comes with convenient helper functions :func:`nvmath.fft.compile_prolog` and +:func:`nvmath.fft.compile_epilog` that compile functions written in Python to LTO-IR format. 
+Under the hood, the helpers rely on Numba as the compiler. The compiled callbacks can be +passed to functional or stateful FFT APIs as :class:`~nvmath.fft.DeviceCallable`. +Alternatively, users can compile the callbacks to LTO-IR format with a compiler of their +choice and pass them as :class:`~nvmath.fft.DeviceCallable` to the FFT call. -Examples illustrating use of prolog and epilog functions can be found in the `FFT examples directory `_. +Examples illustrating use of prolog and epilog functions can be found in the `FFT examples +directory `_. .. note:: + FFT Callbacks are not currently supported on Windows. Setting-up ---------- -The fastest way to start using cuFFT LTO with nvmath is to install it with device API dependencies. -Pip users should run the following command: +The fastest way to start using cuFFT LTO with nvmath is to install it with device API +dependencies. Pip users should run the following command: .. code-block:: bash @@ -65,21 +82,29 @@ Required dependencies For those who need to collect the required dependencies manually: -- LTO callbacks are supported by cuFFT 11.3 which is shipped with `CUDA Toolkit 12.6 Update 2 and newer `_. -- Using cuFFT LTO callbacks requires nvJitLink from the same CUDA toolkit or newer (within the same major CUDA release, e.g. 12). -- Compiling the callbacks with the :func:`nvmath.fft.compile_prolog` and :func:`nvmath.fft.compile_epilog` helpers requires Numba 0.59+ and nvcc/nvvm - from the same CUDA toolkit as nvJitLink or older (within the same major CUDA release). - The helpers require the target device to have compute capability 7.0 or higher. +- LTO callbacks are supported by cuFFT 11.3 which is shipped with `CUDA Toolkit 12.6 Update + 2 and newer `_. +- Using cuFFT LTO callbacks requires nvJitLink from the same CUDA toolkit or newer (within + the same major CUDA release, for example version 12). +- Compiling the callbacks with the :func:`nvmath.fft.compile_prolog` and + :func:`nvmath.fft.compile_epilog` helpers requires Numba 0.59+ and nvcc/nvvm from the same + CUDA toolkit as nvJitLink or older (within the same major CUDA release). The helpers + require the target device to have compute capability 7.0 or higher. -For further details, please refer to `cuFFT LTO documentation `_. +For further details, refer to the `cuFFT LTO documentation +`_. Older CTKs ^^^^^^^^^^ -Adventurous users who want to try callback functionality and cannot upgrade the CUDA Toolkit to 12.6U2, can download and install the older preview release `cuFFT LTO EA version 11.1.3.0 `_ from `here `_, which requires at least CUDA Toolkit 12.2. -When using LTO EA, setting environmental variables may be needed for nvmath to pick the desired cuFFT version. -Users should adjust the ``LD_PRELOAD`` variable, so that the right cuFFT shared library is used: +Adventurous users who want to try callback functionality and cannot upgrade the CUDA Toolkit +to 12.6U2, can download and install the older preview release `cuFFT LTO EA version 11.1.3.0 +`_ from `here `_, which +requires at least CUDA Toolkit 12.2. When using LTO EA, setting environmental variables may +be needed for nvmath to pick the desired cuFFT version. Users should adjust the +``LD_PRELOAD`` variable, so that the right cuFFT shared library is used: .. code-block:: bash @@ -91,19 +116,25 @@ Users should adjust the ``LD_PRELOAD`` variable, so that the right cuFFT shared Execution space =============== -FFT transforms can be executed either on NVIDIA GPU or CPU. 
By default, the execution space is selected based on the memory space of the operand passed to the FFT call, -but it can be explicitly controlled with :class:`~nvmath.fft.ExecutionCUDA` and :class:`~nvmath.fft.ExecutionCPU` passed as the ``execution`` option to the call (e.g. :class:`~nvmath.fft.FFT` or :func:`~nvmath.fft.fft`). +FFT transforms can be executed either on NVIDIA GPU or CPU. By default, the execution space +is selected based on the memory space of the operand passed to the FFT call, but it can be +explicitly controlled with :class:`~nvmath.fft.ExecutionCUDA` and +:class:`~nvmath.fft.ExecutionCPU` passed as the ``execution`` option to the call (for example +:class:`~nvmath.fft.FFT` or :func:`~nvmath.fft.fft`). .. note:: + CPU execution is not currently supported on Windows. Required dependencies --------------------- -With ARM CPUs, such as NVIDIA Grace, nvmath-python can utilize `NVPL (Nvidia Performance Libraries) `_ FFT to run the transform. -On x86_64 architecture, the `MKL library `_ can be used. +With ARM CPUs, such as NVIDIA Grace, nvmath-python can utilize `NVPL (Nvidia Performance +Libraries) `_ FFT to run the transform. On x86_64 +architecture, the `MKL library `_ can be used. -For pip users, the fastest way to get the required dependencies is to use ``'cu12'`` / ``'cu11'`` and ``'cpu'`` extras: +For pip users, the fastest way to get the required dependencies is to use ``'cu12'`` / +``'cu11'`` and ``'cpu'`` extras: .. code-block:: bash @@ -120,11 +151,11 @@ For pip users, the fastest way to get the required dependencies is to use ``'cu1 Custom CPU library ^^^^^^^^^^^^^^^^^^ -Other libraries that conform to FFTW3 API and ship single and double precision symbols in the -single ``so`` file can be used to back the CPU FFT execution. -Users who would like to use different library for CPU FFT, or point to a custom installation of -NVPL or MKL library, can do so by including the library path in ``LD_LIBRARY_PATH`` and specifying the -library name with ``NVMATH_FFT_CPU_LIBRARY``. For example: +Other libraries that conform to FFTW3 API and ship single and double precision symbols in +the single ``so`` file can be used to back the CPU FFT execution. Users who would like to +use different library for CPU FFT, or point to a custom installation of NVPL or MKL library, +can do so by including the library path in ``LD_LIBRARY_PATH`` and specifying the library +name with ``NVMATH_FFT_CPU_LIBRARY``. For example: .. code-block:: bash diff --git a/docs/sphinx/host-utils.rst b/docs/sphinx/host-utils.rst new file mode 100644 index 0000000..1654166 --- /dev/null +++ b/docs/sphinx/host-utils.rst @@ -0,0 +1,23 @@ +********************** +Host API Utilities +********************** + +.. _host-api-util-overview: + +Overview +======== + +nvmath-python provides host-side APIs for managing device-side memory. + +.. _host-api-util-reference: + +API Reference +============= + +.. module:: nvmath + +.. autosummary:: + :toctree: generated/ + + BaseCUDAMemoryManager + MemoryPointer diff --git a/docs/sphinx/index.rst b/docs/sphinx/index.rst index 36c45ea..0245cc7 100644 --- a/docs/sphinx/index.rst +++ b/docs/sphinx/index.rst @@ -6,21 +6,47 @@ nvmath-python: Unleashing the Full Capabilities of NVIDIA Math Libraries within Welcome to the nvmath-python documentation! -**nvmath-python** is a Python library to enable cutting edge performance, productivity, and interoperability within the Python computational ecosystem through NVIDIA's high-performance math libraries. 
+**nvmath-python** is a Python library to enable cutting edge performance, productivity, and +interoperability within the Python computational ecosystem through NVIDIA's high-performance +math libraries. -To quickly get started with *nvmath-python* installation, please refer to our :doc:`guide on Getting Started ` for instructions. +To quickly get started with *nvmath-python*, take a look at our +:doc:`Getting Started ` manual. +Refer to our :doc:`Installation Guide ` for detailed instructions on the various installation choices available. + +======== +Contents +======== + +.. toctree:: + :hidden: + + Home .. toctree:: - :caption: Contents + :caption: User Guide :maxdepth: 2 - Release Notes + Getting Started Overview - Getting Started + Installation + +.. toctree:: + :caption: API Reference + :maxdepth: 2 + + Host API Utilities Linear Algebra Fast Fourier Transform Device APIs Bindings + +.. toctree:: + :caption: Other Resources + :maxdepth: 1 + + Release Notes Code of Conduct Contributing License + GitHub Repository diff --git a/docs/sphinx/getting-started.rst b/docs/sphinx/installation.rst similarity index 66% rename from docs/sphinx/getting-started.rst rename to docs/sphinx/installation.rst index f8bf882..c9062d7 100644 --- a/docs/sphinx/getting-started.rst +++ b/docs/sphinx/installation.rst @@ -1,21 +1,28 @@ -Getting Started +Installation *************** Install nvmath-python ===================== -nvmath-python, like most modern Python packages, provides pre-built binaries (wheels and later conda packages) to the end users. -The full source code is hosted in the `NVIDIA/nvmath-python `_ repository. +nvmath-python, like most modern Python packages, provides pre-built binaries (wheels and +later conda packages) to the end users. The full source code is hosted in the +`NVIDIA/nvmath-python `_ repository. -In terms of CUDA Toolkit (CTK) choices, nvmath-python is designed and implemented to allow building and running against 1. ``pip``-wheel, 2. ``conda``, or 3. system installation of CTK. Having a full CTK installation at either build- or run- time is not necessary; just a small fraction as explained below is enough. +In terms of CUDA Toolkit (CTK) choices, nvmath-python is designed and implemented to allow +building and running against 1. ``pip``-wheel, 2. ``conda``, or 3. system installation of +CTK. Having a full CTK installation at either build- or run- time is not necessary; just a +small fraction as explained below is enough. -Host & device APIs (see :ref:`nvmath overview`) have different run-time dependencies and requirements. Even among -host APIs the needed underlying libraries are different (for example, :func:`~nvmath.fft.fft` on GPUs only needs cuFFT and not cuBLAS). Libraries -are loaded when only needed. Therefore, nvmath-python is designed to have most of its dependencies *optional*, but provides -convenient installation commands for users to quickly spin up a working Python environment. +Host & device APIs (see :ref:`nvmath overview`) have different run-time dependencies and +requirements. Even among host APIs the needed underlying libraries are different (for +example, :func:`~nvmath.fft.fft` on GPUs only needs cuFFT and not cuBLAS). Libraries are +loaded when only needed. Therefore, nvmath-python is designed to have most of its +dependencies *optional*, but provides convenient installation commands for users to quickly +spin up a working Python environment. -The :ref:`cheatsheet ` below captures nvmath-python's required/optional, build-/run- time dependencies. 
-Using the installation commands from the sections below should support most of your needs. +The :ref:`cheatsheet ` below captures nvmath-python's required/optional, +build-/run- time dependencies. Using the installation commands from the sections below +should support most of your needs. .. _install from pypi: @@ -23,7 +30,10 @@ Using the installation commands from the sections below should support most of y Install from PyPI ----------------- -The pre-built wheels can be ``pip``-installed from the public PyPI. There are several optional dependencies expressible in the standard "extras" bracket notation. The following assumes that **CTK components are also installed via pip** (so no extra step from users is needed; the dependencies are pulled via extras). +The pre-built wheels can be ``pip``-installed from the public PyPI. There are several +optional dependencies expressible in the standard "extras" bracket notation. The following +assumes that **CTK components are also installed via pip** (so no extra step from users is +needed; the dependencies are pulled via extras). .. list-table:: :widths: 25 50 @@ -57,7 +67,8 @@ The pre-built wheels can be ``pip``-installed from the public PyPI. There are se FFTW3 (non-guru) API. ``LD_LIBRARY_PATH`` should be set properly to include this library if it is not already in the PATH. -The options below are for adventurous users who want to manage most of the dependencies themselves. The following assumes that **system CTK is installed**. +The options below are for adventurous users who want to manage most of the dependencies +themselves. The following assumes that **system CTK is installed**. .. list-table:: :widths: 25 50 @@ -89,7 +100,8 @@ The options below are for adventurous users who want to manage most of the depen 2. For using :mod:`nvmath.device` APIs, ``CUDA_HOME`` (or ``CUDA_PATH``) should be set to point to the system CTK. -For system admins or expert users, ``pip install nvmath-python`` would be a bare minimal installation (very lightweight). This allows fully explicit control of all dependencies. +For system admins or expert users, ``pip install nvmath-python`` would be a bare minimal +installation (very lightweight). This allows fully explicit control of all dependencies. Install from conda @@ -111,7 +123,8 @@ Conda packages can be installed from the `conda-forge ` - Install nvmath-python along with all CUDA 12 optional dependencies (wheels for cuBLAS/cuFFT/... and CuPy) to support nvmath host APIs. - * - ``conda install -c conda-forge -c rapidsai nvmath-python-dx "pynvjitlink>=0.2" cuda-version=12`` + * - ``conda install -c conda-forge -c rapidsai nvmath-python-dx "pynvjitlink>=0.2" + cuda-version=12`` - Install nvmath-python along with all CUDA 12 optional dependencies (wheels for cuBLAS/cuFFT/..., CuPy, Numba, pynvjitlink, ...) to support nvmath host & device APIs (which @@ -140,14 +153,19 @@ Conda packages can be installed from the `conda-forge ` **Notes**: -- For expert users, ``conda install -c conda-forge nvmath-python=*=core*`` would be a bare minimal installation (very lightweight). This allows fully explicit control of all dependencies. -- If you installed ``conda`` from `miniforge `_, most likely the conda-forge channel is already set as the default, then the ``-c conda-forge`` part in the above instruction can be omitted. +- For expert users, ``conda install -c conda-forge nvmath-python=*=core*`` would be a bare + minimal installation (very lightweight). This allows fully explicit control of all + dependencies. 
+- If you installed ``conda`` from `miniforge `_, + most likely the conda-forge channel is already set as the default, then the ``-c + conda-forge`` part in the above instruction can be omitted. Build from source ----------------- -Once you clone the repository and go into the root directory, you can build the project from source. There are several ways to build it since we need some CUDA headers at build time. +Once you clone the repository and go into the root directory, you can build the project from +source. There are several ways to build it since we need some CUDA headers at build time. .. list-table:: :widths: 25 50 @@ -169,7 +187,8 @@ Once you clone the repository and go into the root directory, you can build the instead), build the project, and install it to the current user environment together with the run-time dependencies. One can use: - - conda: After installing CUDA 12 conda packages, set the environment variable ``CUDA_PATH`` + - conda: After installing CUDA 12 conda packages, set the environment variable + ``CUDA_PATH`` * linux-64: ``CUDA_PATH=$CONDA_PREFIX/targets/x86_64-linux/`` * linux-aarch64: ``CUDA_PATH=$CONDA_PREFIX/targets/sbsa-linux/`` @@ -179,14 +198,18 @@ Once you clone the repository and go into the root directory, you can build the **Notes**: -- If you add the "extras" notation after the dot ``.``, e.g., ``.[cu11]``, ``.[cu12,dx]``, ..., it has the same meaning - as explained in the :ref:`previous section `. -- If you don't want the run-time dependencies to be automatically handled, add ``--no-deps`` after the ``pip install`` - command above; in this case, however, it's your responsibility to make sure that all the run-time requirements are met. -- By replacing ``install`` by ``wheel``, a wheel can be built targeting the current OS and CPython version. -- If you want inplace/editable install, add the ``-e`` flag to the command above (before the dot ``.``). - This is suitable for local development with a system-installed CTK. However, our wheels rely on *non-editable builds* so that the RPATH - hack can kick in. DO NOT pass the ``-e`` flag when building wheels! +- If you add the "extras" notation after the dot ``.`` (for example ``.[cu11]``, ``.[cu12,dx]``, + ...), it has the same meaning as explained in the :ref:`previous section `. +- If you don't want the run-time dependencies to be automatically handled, add ``--no-deps`` + after the ``pip install`` command above; in this case, however, it's your responsibility + to make sure that all the run-time requirements are met. +- By replacing ``install`` by ``wheel``, a wheel can be built targeting the current OS and + CPython version. +- If you want inplace/editable install, add the ``-e`` flag to the command above (before the + dot ``.``). This is suitable for local development with a system-installed CTK. However, + our wheels rely on *non-editable builds* so that the RPATH hack can kick in. DO NOT pass + the ``-e`` flag when building wheels! - All optional run-time dependencies as listed below need to be manually installed. @@ -195,8 +218,8 @@ Once you clone the repository and go into the root directory, you can build the Cheatsheet ---------- -Below we provide a summary of requirements to support all nvmath-python functionalities. -A dependency is *required* unless stated otherwise. +Below we provide a summary of requirements to support all nvmath-python functionalities. A +dependency is *required* unless stated otherwise. .. 
list-table:: :widths: 25 25 25 25 25 @@ -334,14 +357,19 @@ nvmath-python is tested in the following environments: Run nvmath-python ================= -As mentioned earlier, nvmath-python can be run with all methods of CUDA installation, including wheels, conda packages, and system CTK. As a result, there is detection logic to discover shared libraries (for host APIs) and headers (for device APIs to do JIT compilation). +As mentioned earlier, nvmath-python can be run with all methods of CUDA installation, +including wheels, conda packages, and system CTK. As a result, there is detection logic to +discover shared libraries (for host APIs) and headers (for device APIs to do JIT +compilation). Shared libraries ---------------- - pip wheels: Will be auto-discovered if installed - conda packages: Will be auto-discovered if installed, after wheel -- system CTK: On Linux, the users needs to ensure the shared libraries are discoverable by the dynamic linker, say by setting ``LD_LIBRARY_PATH`` or updating system search paths to include the DSO locations. +- system CTK: On Linux, the users needs to ensure the shared libraries are discoverable by + the dynamic linker, say by setting ``LD_LIBRARY_PATH`` or updating system search paths to + include the DSO locations. Headers  @@ -351,13 +379,14 @@ This includes libraries such as CCCL and MathDx. - pip wheels: Will be auto-discovered if installed - conda packages: Will be auto-discovered if installed, after wheel -- system CTK: Need to set ``CUDA_HOME`` (or ``CUDA_PATH``) and ``MATHDX_HOME`` (for MathDx headers) +- system CTK: Need to set ``CUDA_HOME`` (or ``CUDA_PATH``) and ``MATHDX_HOME`` (for MathDx + headers) Host APIs --------- -This terminlogy is explained in the :ref:`host api section`. +This terminology is explained in the :ref:`host api section`. Examples ........ @@ -371,7 +400,9 @@ See the ``examples`` directory in the repo. Currently we have: Tests ..... -The ``requirements/pip/tests.txt`` file lists dependencies required for ``pip``-controlled environments to run tests. These requirements are installed via the main ``requirements/pip-dev-.txt`` files. +The ``requirements/pip/tests.txt`` file lists dependencies required for ``pip``-controlled +environments to run tests. These requirements are installed via the main +``requirements/pip-dev-.txt`` files. Running functionality tests @@ -426,11 +457,13 @@ Troubleshooting =============== For ``pip``-users, there are known limitations (many of which are nicely captured in the -`pypackaging community project `_) in Python packaging tools. -For a complex library such as nvmath-python that interacts with many native libraries, there are user-visible caveats. +`pypackaging community project `_) in Python packaging +tools. For a complex library such as nvmath-python that interacts with many native +libraries, there are user-visible caveats. -1. Be sure that there are no packages with both ``-cu11`` (for CUDA 11) and ``-cu12`` (for CUDA 12) suffices coexisting - in your Python environment. For example, this is a corrupted environment: +1. Be sure that there are no packages with both ``-cu11`` (for CUDA 11) and ``-cu12`` (for + CUDA 12) suffices coexisting in your Python environment. For example, this is a corrupted + environment: .. 
code-block:: bash @@ -443,30 +476,38 @@ For a complex library such as nvmath-python that interacts with many native libr setuptools 70.0.0 wheel 0.43.0 - Some times such conflicts could come from a dependency of the libraries that you use, so pay extra attention to what's - installed. -2. ``pip`` does not attempt to check if the installed packages can actually be run against the installed GPU driver (CUDA GPU - driver cannot be installed by ``pip``), so make sure your GPU driver is new enough to support the installed ``-cuXX`` - packages [2]_. The driver version can be checked by executing ``nvidia-smi`` and inspecting the ``Driver Version`` field on the - output table. -3. CuPy installed from ``pip`` currently (as of v13.3.0) only supports conda and system CTK, and not ``pip``-installed CUDA wheels. - nvmath-python can help CuPy use the CUDA libraries installed to ``site-packages`` (where wheels are installed to) if ``nvmath`` - is imported. As of beta 2 (v0.2.0) the libraries are "soft-loaded" (no error is raised if a library is not installed) when - ``import nvmath`` happens. This behavior may change in a future release. -4. Numba installed from ``pip`` currently (as of v0.60.0) only supports conda and system CTK, and not ``pip``-installed CUDA wheels. - nvmath-python can also help Numba use the CUDA compilers installed to ``site-packages`` if ``nvmath`` is imported. - Same as above, this behavior may change in a future release. - -In general, mixing-and-matching CTK packages from ``pip``, ``conda``, and the system is possible but can be very fragile, so -please understand what you're doing. -The nvmath-python internals are designed to work with everything installed either via ``pip``, ``conda``, or local system -(system CTK, including `tarball extractions `_, are the fallback solution in the detection logic), but mix-n-match makes the detection logic -impossible to get right. - -To help you perform an integrity check, the rule of thumb is that every single package should only come from one place (either -``pip``, or ``conda``, or local system). For example, if both ``nvidia-cufft-cu11`` (which is from ``pip``) and ``libcufft`` (from -``conda``) appear in the output of ``conda list``, something is almost certainly wrong. Below is the package name mapping between -``pip`` and ``conda``, with ``XX={11,12}`` denoting CUDA's major version: + Sometimes such conflicts could come from a dependency of the libraries that you use, so + pay extra attention to what's installed. +2. ``pip`` does not attempt to check if the installed packages can actually be run against + the installed GPU driver (CUDA GPU driver cannot be installed by ``pip``), so make sure + your GPU driver is new enough to support the installed ``-cuXX`` packages [2]_. The + driver version can be checked by executing ``nvidia-smi`` and inspecting the ``Driver + Version`` field on the output table. +3. CuPy installed from ``pip`` currently (as of v13.3.0) only supports conda and system CTK, + and not ``pip``-installed CUDA wheels. nvmath-python can help CuPy use the CUDA libraries + installed to ``site-packages`` (where wheels are installed to) if ``nvmath`` is imported. + From beta 2 (v0.2.0) onwards the libraries are "soft-loaded" (no error is raised if a library is + not installed) when ``import nvmath`` happens. This behavior may change in a future + release. +4. Numba installed from ``pip`` currently (as of v0.60.0) only supports conda and system + CTK, and not ``pip``-installed CUDA wheels. 
nvmath-python can also help Numba use the + CUDA compilers installed to ``site-packages`` if ``nvmath`` is imported. Same as above, + this behavior may change in a future release. + +In general, mixing-and-matching CTK packages from ``pip``, ``conda``, and the system is +possible but can be very fragile, so it's important to understand what you're doing. The nvmath-python +internals are designed to work with everything installed either via ``pip``, ``conda``, or +local system (system CTK, including `tarball extractions +`_, are the fallback solution in the detection logic), +but mix-n-match makes the detection logic impossible to get right. + +To help you perform an integrity check, the rule of thumb is that every single package +should only come from one place (either ``pip``, or ``conda``, or local system). For +example, if both ``nvidia-cufft-cu11`` (which is from ``pip``) and ``libcufft`` (from +``conda``) appear in the output of ``conda list``, something is almost certainly wrong. +Below is the package name mapping between ``pip`` and ``conda``, with ``XX={11,12}`` +denoting CUDA's major version: .. list-table:: :widths: 50 50 50 @@ -506,22 +547,32 @@ To help you perform an integrity check, the rule of thumb is that every single p - ``libcurand`` - ``cudatoolkit`` -Note that system packages (by design) do not show up in the output of ``conda list`` or ``pip list``. Linux users should check -the installation list from your distro package manager (``apt``, ``yum``, ``dnf``, ...). See also the `Linux Package Manager -Installation Guide `_ for -additional information. +Note that system packages (by design) do not show up in the output of ``conda list`` or +``pip list``. Linux users should check the installation list from your distro package +manager (``apt``, ``yum``, ``dnf``, ...). See also the `Linux Package Manager Installation +Guide `_ for additional information. -For more information with regard to the new CUDA 12+ package layout on conda-forge, see the `CUDA recipe README -`_. +For more information with regard to the new CUDA 12+ package layout on conda-forge, see the +`CUDA recipe README `_. .. rubric:: Footnotes .. [1] Windows support will be added in a future release. -.. [2] nvmath-python relies on `CUDA minor version compatibility `_. -.. [4] As of beta 2 (v0.2.0), CuPy is a required run-time dependency. In a future release it will be turned into an optional run-time dependency. -.. [5] For example, Hopper GPUs are supported starting CUDA 11.8, so they would not work with libraries from CUDA 11.7 or below. -.. [6] While we need some CUDA headers at build time, there is no limitation in the CUDA version seen at build time. -.. [7] These versions are not supported due to a known compiler bug; the ``[dx]`` extras already takes care of this. -.. [8] If CCCL is installed via ``pip`` manually it needs to be constrained with ``"nvidia-cuda-cccl-cu12>=12.4.127"`` due to a packaging issue; the ``[dx]`` extras already takes care of this. -.. [9] The library must ship FFTW3 symbols for single and double precision transforms in a single ``so`` file. +.. [2] nvmath-python relies on `CUDA minor version compatibility + `_. +.. [4] As of beta 2.1 (v0.2.1), CuPy is a required run-time dependency except for CPU-only + execution. In a future release it will be turned into an optional run-time dependency. +.. [5] For example, Hopper GPUs are supported starting CUDA 11.8, so they would not work + with libraries from CUDA 11.7 or below. +.. 
[6] While we need some CUDA headers at build time, there is no limitation in the CUDA + version seen at build time. +.. [7] These versions are not supported due to a known compiler bug; the ``[dx]`` extras + already takes care of this. +.. [8] If CCCL is installed via ``pip`` manually it needs to be constrained with + ``"nvidia-cuda-cccl-cu12>=12.4.127"`` due to a packaging issue; the ``[dx]`` extras + already takes care of this. +.. [9] The library must ship FFTW3 symbols for single and double precision transforms in a + single ``so`` file. diff --git a/docs/sphinx/linalg/index.rst b/docs/sphinx/linalg/index.rst index d6e2cc4..d005942 100644 --- a/docs/sphinx/linalg/index.rst +++ b/docs/sphinx/linalg/index.rst @@ -7,8 +7,9 @@ Linear Algebra Overview ======== -The Linear Algebra module :mod:`nvmath.linalg` in nvmath-python leverages various NVIDIA math libraries to support multiple linear algebra computations. -As of the initial Beta release, we offer the specialized matrix multiplication API based on the cuBLASLt library. +The Linear Algebra module :mod:`nvmath.linalg` in nvmath-python leverages various NVIDIA +math libraries to support multiple linear algebra computations. As of the initial Beta +release, we offer the specialized matrix multiplication API based on the cuBLASLt library. .. _linalg-api-reference: diff --git a/docs/sphinx/overview.rst b/docs/sphinx/overview.rst index 89ae59f..fe4dd0d 100644 --- a/docs/sphinx/overview.rst +++ b/docs/sphinx/overview.rst @@ -3,50 +3,76 @@ Overview ******** -The primary goal of nvmath-python is to bring the power of the NVIDIA math libraries to the Python ecosystem. The package aims to provide intuitive pythonic APIs that provide -users full access to all the features offered by our libraries in a variety of execution spaces. +The primary goal of nvmath-python is to bring the power of the NVIDIA math libraries to the +Python ecosystem. The package aims to provide intuitive pythonic APIs that provide users +full access to all the features offered by our libraries in a variety of execution spaces. -We hope to empower a wide range of Python users by providing easy access to high-performance core math operations such as FFT, dense and sparse linear algebra, and more. -This includes the following groups of users: +We hope to empower a wide range of Python users by providing easy access to high-performance +core math operations such as FFT, dense and sparse linear algebra, and more. This includes +the following groups of users: -1. **Practitioners**: Researchers and application programmers who require robust, high-performance mathematical tools. -2. **Library Package Developers**: Developers crafting libraries that rely on advanced mathematical operations. -3. **CUDA Kernel Authors**: Programmers who write CUDA kernels and need customized mathematical functionality. +1. **Practitioners**: Researchers and application programmers who require robust, + high-performance mathematical tools. +2. **Library Package Developers**: Developers crafting libraries that rely on advanced + mathematical operations. +3. **CUDA Kernel Authors**: Programmers who write CUDA kernels and need customized + mathematical functionality. The APIs provided by nvmath-python can be categorized into: -- **Host APIs**: Invoked from the host and executed in the chosen space (currently limited to single GPUs). +- **Host APIs**: Invoked from the host and executed in the chosen space (currently limited + to single GPUs). - **Device APIs**: Called directly from within CUDA kernels. 
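+
+As a minimal sketch of a host API call (assuming CuPy and the cuFFT run-time
+dependencies are installed; the operand ``a`` is purely illustrative):
+
+.. code-block:: python
+
+    import cupy as cp
+    import nvmath
+
+    # Create a GPU operand and transform it with a host API. The result is returned
+    # as a CuPy array, matching the package of the input operand.
+    a = cp.random.rand(1024) + 1j * cp.random.rand(1024)
+    b = nvmath.fft.fft(a)
+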
nvmath-python is dedicated to delivering the following key features and commitments: -1. **Logical Feature Parity**: While the pythonic API surface (the number of APIs and the complexity of each) is more concise compared to that of the C libraries, it provides access to their complete functionality. -2. **Consistent Design Patterns**: Uniform design across all modules to simplify user experience. -3. **Transparency and Explicitness**: Avoiding implicit, costly operations such as copying data across the same memory space, automatic type promotion, and alterations to the user environment or state (current device, current stream, etc.). - This allows users to perform the required conversion once for use in all subsequent operations instead of incurring hidden costs on each call. -4. **Clear, Actionable Error Messages**: Ensuring that errors are informative and helpful in resolving the problem. -5. **DRY Principle Compliance**: Automatically utilizing available information such as the current stream and memory pool to avoid redundant specification ("don't repeat yourself"). - -With nvmath-python, a few lines of code are sufficient to unlock the extensive performance capabilities of the NVIDIA math libraries. -Explore our sample Python codes and more detailed examples in the `examples directory on GitHub `_. +1. **Logical Feature Parity**: While the pythonic API surface (the number of APIs and the + complexity of each) is more concise compared to that of the C libraries, it provides + access to their complete functionality. +2. **Consistent Design Patterns**: Uniform design across all modules to simplify user + experience. +3. **Transparency and Explicitness**: Avoiding implicit, costly operations such as copying + data across the same memory space, automatic type promotion, and alterations to the user + environment or state (current device, current stream, etc.). This allows users to perform + the required conversion once for use in all subsequent operations instead of incurring + hidden costs on each call. +4. **Clear, Actionable Error Messages**: Ensuring that errors are informative and helpful in + resolving the problem. +5. **DRY Principle Compliance**: Automatically utilizing available information such as the + current stream and memory pool to avoid redundant specification ("don't repeat + yourself"). + +With nvmath-python, a few lines of code are sufficient to unlock the extensive performance +capabilities of the NVIDIA math libraries. Explore our sample Python codes and more detailed +examples in the `examples directory on GitHub +`_. Architecture ============ -nvmath-python is designed to support integration at any level desired by the user. This flexibility allows: +nvmath-python is designed to support integration at any level desired by the user. This +flexibility allows: -- Alice, a **Python package developer**, to utilize core math operations to compose into higher-level algorithms or adapt these operations into her preferred interfaces. -- Bob, an **application developer**, to use core operations directly from nvmath-python or indirectly through other libraries that leverage math-python. -- Carol, a **researcher**, to write kernels entirely in Python that call core math operations such as FFT. +- Alice, a **Python package developer**, to utilize core math operations to compose into + higher-level algorithms or adapt these operations into her preferred interfaces. 
+- Bob, an **application developer**, to use core operations directly from nvmath-python or + indirectly through other libraries that leverage math-python. +- Carol, a **researcher**, to write kernels entirely in Python that call core math + operations such as FFT. .. figure:: ./figures/nvmath-python.png :width: 1000px :align: center -Additionally, we offer :doc:`Python bindings ` that provide a 1:1 mapping with the C APIs. These bindings, which serve as wrappers with API signatures similar to their C counterparts, -are ideal for library developers looking to integrate the capabilities of the NVIDIA Math Libraries in a customized manner, in the event that the pythonic APIs don't meet their specific requirements. -Conversely, our high-level pythonic APIs deliver a fully integrated solution suitable for native Python users as well as library developers, encompassing both host and device APIs. -In the future, select host APIs will accept **callback functions written in Python** and compiled into supported formats such as LTO-IR, using compilers like `Numba`_. +Additionally, we offer :doc:`Python bindings ` that provide a 1:1 mapping +with the C APIs. These bindings, which serve as wrappers with API signatures similar to +their C counterparts, are ideal for library developers looking to integrate the capabilities +of the NVIDIA Math Libraries in a customized manner, in the event that the pythonic APIs +don't meet their specific requirements. Conversely, our high-level pythonic APIs deliver a +fully integrated solution suitable for native Python users as well as library developers, +encompassing both host and device APIs. In the future, select host APIs will accept +**callback functions written in Python** and compiled into supported formats such as LTO-IR, +using compilers like `Numba`_. .. _host api section: @@ -55,10 +81,14 @@ Host APIs .. _host apis: -nvmath-python provides a collection of APIs that can be directly invoked from the CPU (host). At present, these APIs encompass a selection of functionalities within the following categories: +nvmath-python provides a collection of APIs that can be directly invoked from the CPU +(host). At present, these APIs encompass a selection of functionalities within the following +categories: -- Fast Fourier Transform in :mod:`nvmath.fft`. Please refer to :doc:`Fast Fourier Transform ` for details. -- Linear Algebra in :mod:`nvmath.linalg`. Please refer to :doc:`Linear Algebra ` for details. +- Fast Fourier Transform in :mod:`nvmath.fft`. Refer to :doc:`Fast Fourier Transform + ` for details. +- Linear Algebra in :mod:`nvmath.linalg`. Refer to :doc:`Linear Algebra + ` for details. .. _host api interop: @@ -66,8 +96,9 @@ nvmath-python provides a collection of APIs that can be directly invoked from th Effortless Interoperability --------------------------- -All host APIs support input arrays/tensors from NumPy, CuPy, and PyTorch while returning output operands using the same package, thus offering effortless interoperability with these frameworks. -One example for the interoperability is shown below: +All host APIs support input arrays/tensors from NumPy, CuPy, and PyTorch while returning +output operands using the same package, thus offering effortless interoperability with these +frameworks. One example for the interoperability is shown below: .. 
code-block:: python @@ -88,33 +119,57 @@ One example for the interoperability is shown below: Stateless and Stateful APIs --------------------------- -The host APIs within nvmath-python can be generally categorized into two types: stateless function-form APIs and stateful class-form APIs. +The host APIs within nvmath-python can be generally categorized into two types: stateless +function-form APIs and stateful class-form APIs. -The function-form APIs, such as :func:`nvmath.fft.fft` and :func:`nvmath.linalg.advanced.matmul`, are designed to deliver quick, end-to-end results with a single function call. -These APIs are ideal for instances where a user needs to perform a single computation without the need for intermediate steps, customization of algorithm selection, or cost amortization of preparatory steps. -Conversely, the stateful class-form APIs, like :class:`nvmath.fft.FFT` and :class:`nvmath.linalg.advanced.Matmul`, offer a more comprehensive and flexible approach. -They not only encompass the functionality found in their function-form counterparts but also allow for amortization of one-time costs, potentially enhancing performance significantly. +The function-form APIs, such as :func:`nvmath.fft.fft` and +:func:`nvmath.linalg.advanced.matmul`, are designed to deliver quick, end-to-end results +with a single function call. These APIs are ideal for instances where a user needs to +perform a single computation without the need for intermediate steps, customization of +algorithm selection, or cost amortization of preparatory steps. Conversely, the stateful +class-form APIs, like :class:`nvmath.fft.FFT` and :class:`nvmath.linalg.advanced.Matmul`, +offer a more comprehensive and flexible approach. They not only encompass the functionality +found in their function-form counterparts but also allow for amortization of one-time costs, +potentially enhancing performance significantly. The design pattern for all stateful APIs in nvmath-python consists of several key phases: - - Problem Specification: This initial phase involves defining the operation and setting options that affect its execution. - It's designed to be as lightweight as possible, ensuring the problem is well-defined and supported by the current implementation. - - Preparation: Using FFT as an example, this phase includes a planning step to select the optimal algorithm for the defined FFT operation. - An optional autotuning operation, when available, also falls within the preparation phase. - The preparation phase is generally the most resource-intensive and may incorporate user-specified planning and autotuning options. - - Execution: This phase allows for repeated execution, where the operand can be either modified in-place or explicitly reset using the ``reset_operand``/``reset_operands`` method. - The costs associated with the first two phases are therefore amortized over these multiple executions. - - Resource Release: Users are advised to use stateful objects from within a context using the `with statement `_, - which automatically handles the release of internal resources upon exit. If the object is not used as a context manager using ``with``, it is necessary to explicitly call the ``free`` method to ensure all resources are properly released. + - Problem Specification: This initial phase involves defining the operation and setting + options that affect its execution. It's designed to be as lightweight as possible, + ensuring the problem is well-defined and supported by the current implementation. 
+ - Preparation: Using FFT as an example, this phase includes a planning step to select + the optimal algorithm for the defined FFT operation. An optional autotuning operation, + when available, also falls within the preparation phase. The preparation phase is + generally the most resource-intensive and may incorporate user-specified planning and + autotuning options. + - Execution: This phase allows for repeated execution, where the operand can be either + modified in-place or explicitly reset using the ``reset_operand``/``reset_operands`` + method. The costs associated with the first two phases are therefore amortized over + these multiple executions. + - Resource Release: Users are advised to use stateful objects from within a context + using the `with statement + `_, which + automatically handles the release of internal resources upon exit. If the object is + not used as a context manager using ``with``, it is necessary to explicitly call the + ``free`` method to ensure all resources are properly released. .. note:: - By design, nvmath-python does NOT cache plans with stateless function-form APIs. This is to enable library developers and others to use their own caching mechanisms with nvmath-python. - Therefore users should use the stateful object APIs for repeated use as well as benchmarking to avoid incurring repeated preparatory costs, or use a cached API - (see `caching.py `_ for an example implementation). + + By design, nvmath-python does NOT cache plans with stateless function-form APIs. This is + to enable library developers and others to use their own caching mechanisms with + nvmath-python. Therefore users should use the stateful object APIs for repeated use as + well as benchmarking to avoid incurring repeated preparatory costs, or use a cached API + (see `caching.py + `_ for an + example implementation). .. note:: - The decision to require explicit ``free`` calls for resource release is driven by the fact that Python's garbage collector can delay freeing object resources - when the object goes out of scope or its reference count drops to zero. For details, refer to the `__del__ method Python documentation `_. + + The decision to require explicit ``free`` calls for resource release is driven by the + fact that Python's garbage collector can delay freeing object resources when the object + goes out of scope or its reference count drops to zero. For details, refer to the + `__del__ method Python documentation + `_. .. _generic specialized: @@ -122,38 +177,56 @@ The design pattern for all stateful APIs in nvmath-python consists of several ke Generic and Specialized APIs ---------------------------- -Another way of categorizing the host APIs within nvmath-python is by splitting them into *generic* and *specialized* APIs, based on their flexibility and the scope of their functionality: +Another way of categorizing the host APIs within nvmath-python is by splitting them into +*generic* and *specialized* APIs, based on their flexibility and the scope of their +functionality: -- **Generic APIs** are designed to accommodate a broad range of operands and customization with these APIs is confined to options that are universally applicable across all supported operand types. - For instance, the generic matrix multiplication API can handle structured matrices (such as triangular and banded, in full or packed form) in addition to dense full matrices, but the available options are limited to those applicable to all these matrix types. 
+- **Generic APIs** are designed to accommodate a broad range of operands and customization + with these APIs is confined to options that are universally applicable across all + supported operand types. For instance, the generic matrix multiplication API can handle + structured matrices (such as triangular and banded, in full or packed form) in addition to + dense full matrices, but the available options are limited to those applicable to all + these matrix types. -- **Specialized APIs**, on the other hand, are tailored for specific types of operands, allowing for full customization that is available to this kind. - A prime example is the specialized matrix multiplication API for dense matrices, which provides numerous options specifically suited to dense matrices. +- **Specialized APIs**, on the other hand, are tailored for specific types of operands, + allowing for full customization that is available to this kind. A prime example is the + specialized matrix multiplication API for dense matrices, which provides numerous options + specifically suited to dense matrices. -It should be noted that the notion of generic and specialized APIs is orthogonal to the notion of stateful versus stateless APIs. -Currently, nvmath-python offers the specialized interface for dense matrix multiplication, in :class:`stateful ` and :func:`stateless ` forms. +It should be noted that the notion of generic and specialized APIs is orthogonal to the +notion of stateful versus stateless APIs. Currently, nvmath-python offers the specialized +interface for dense matrix multiplication, in :class:`stateful +` and :func:`stateless ` +forms. .. _high-level api logging: Full Logging Support -------------------- -nvmath-python provides integration with the Python standard library logger from the `logging module `_ -to offer full logging of the computational details at various level, e.g, debug, information, warning and error. -An example illustrating the use of the global Python logger is shown below: +nvmath-python provides integration with the Python standard library logger from the `logging +module `_ to offer full logging of the +computational details at various levels, for example debug, information, warning and error. An +example illustrating the use of the global Python logger is shown below: .. code-block:: python import logging # Turn on logging with level set to "debug" and use a custom format for the log - logging.basicConfig(level=logging.DEBUG, format='%(asctime)s %(levelname)-8s %(message)s', datefmt='%m-%d %H:%M:%S') + logging.basicConfig( + level=logging.DEBUG, + format='%(asctime)s %(levelname)-8s %(message)s', + datefmt='%m-%d %H:%M:%S' + ) # Call nvmath-python pythonic APIs out = nvmath.linalg.advanced.matmul(...) -Alternatively, for APIs that contain the ``options`` argument, users can set a custom logger by directly passing it inside a dictionary or as part of the corresponding ``Options`` object, -e.g., :attr:`nvmath.fft.FFTOptions.logger` for :func:`nvmath.fft.fft` and :class:`nvmath.fft.FFT`. An example based on FFT is shown below: +Alternatively, for APIs that contain the ``options`` argument, users can set a custom logger +by directly passing it inside a dictionary or as part of the corresponding ``Options`` +object, for example :attr:`nvmath.fft.FFTOptions.logger` for :func:`nvmath.fft.fft` and +:class:`nvmath.fft.FFT`. An example based on FFT is shown below: .. 
code-block:: python @@ -166,13 +239,20 @@ e.g., :attr:`nvmath.fft.FFTOptions.logger` for :func:`nvmath.fft.fft` and :class # Call nvmath-python pythonic APIs out = nvmath.fft.fft(..., options={'logger': logger}) -For the complete examples, please refer to `global logging example `_ -and `custom user logging example `_. +For the complete examples, refer to `global logging example +`_ +and `custom user logging example +`_. .. note:: + The Python logging is orthogonal to the logging provided by certain NVIDIA math libraries, - which encapsulates low level implementation details and can be activated via either specific environment variables (e.g., ``CUBLASLT_LOG_LEVEL`` for ``cuBLASLt``) - or programmatically through the Python bindings (e.g., :func:`nvmath.bindings.cusolverDn.logger_set_level` for ``cuSOLVER``). + which encapsulates low level implementation details and can be activated via either + specific environment variables (for example ``CUBLASLT_LOG_LEVEL`` for ``cuBLASLt``) or + programmatically through the Python bindings (for example + :func:`nvmath.bindings.cusolverDn.logger_set_level` for ``cuSOLVER``). .. _high-level call blocking: @@ -180,12 +260,18 @@ and `custom user logging example `_. +For examples on stream ordering, refer to `FFT with multiple streams +`_. .. _high-level memory management: Memory Management ----------------- -By default, the host APIs use the memory pool from the package that their operands belong to. This ensures that there is no contention for memory or spurious out-of-memory errors. -However the user also has the ability to provide their own memory allocator if they choose to do so. -In our pythonic APIs, we support an `EMM`_-like interface as proposed and supported by Numba for users to set their Python mempool. -Taking FFT as an example, users can set the option :attr:`nvmath.fft.FFTOptions.allocator` to a Python object complying with the :class:`nvmath.BaseCUDAMemoryManager` -protocol, and pass the options to the high-level APIs like :func:`nvmath.fft.fft` or :class:`nvmath.fft.FFT`. Temporary memory allocations will then be done through this interface. -Internally, we use the same interface to use CuPy or PyTorch's mempool depending on the operands. +By default, the host APIs use the memory pool from the package that their operands belong +to. This ensures that there is no contention for memory or spurious out-of-memory errors. +However the user also has the ability to provide their own memory allocator if they choose +to do so. In our pythonic APIs, we support an `EMM`_-like interface as proposed and +supported by Numba for users to set their Python mempool. Taking FFT as an example, users +can set the option :attr:`nvmath.fft.FFTOptions.allocator` to a Python object complying with +the :class:`nvmath.BaseCUDAMemoryManager` protocol, and pass the options to the high-level +APIs like :func:`nvmath.fft.fft` or :class:`nvmath.fft.FFT`. Temporary memory allocations +will then be done through this interface. Internally, we use the same interface to use CuPy +or PyTorch's mempool depending on the operands. .. note:: - nvmath's :class:`~nvmath.BaseCUDAMemoryManager` protocol is slightly different from Numba's EMM interface - (:class:`numba.cuda.BaseCUDAMemoryManager`), but duck typing with an existing EMM instance (not type!) at runtime - should be possible. 
+ nvmath's :class:`~nvmath.BaseCUDAMemoryManager` protocol is slightly different from + Numba's EMM interface (:class:`numba.cuda.BaseCUDAMemoryManager`), but duck typing with + an existing EMM instance (not type!) at runtime should be possible. .. _EMM: https://numba.readthedocs.io/en/stable/cuda/external-memory.html -.. _host-api-utility-reference: - -.. module:: nvmath - -Common Objects (:mod:`nvmath`) ------------------------------- - -.. autosummary:: - :toctree: generated/ - - BaseCUDAMemoryManager - MemoryPointer - .. _host api callback section: Host APIs with Callbacks @@ -247,32 +331,33 @@ Host APIs with Callbacks .. _host apis callback: -Certain host APIs (such as :func:`nvmath.fft.fft` and :meth:`nvmath.fft.FFT.plan`) allow the user to provide prolog or epilog functions *written in Python*, resulting in -a *fused kernel*. This improves performance by avoiding extra roundtrips to global memory and effectively increases the arithmetic intensity of the operation. +Certain host APIs (such as :func:`nvmath.fft.fft` and :meth:`nvmath.fft.FFT.plan`) allow the +user to provide prolog or epilog functions *written in Python*, resulting in a *fused +kernel*. This improves performance by avoiding extra roundtrips to global memory and +effectively increases the arithmetic intensity of the operation. .. code-block:: python - import cupy as cp - import nvmath - + import cupy as cp + import nvmath - # Create the data for the batched 1-D FFT. - B, N = 256, 1024 - a = cp.random.rand(B, N, dtype=cp.float64) + 1j * cp.random.rand(B, N, dtype=cp.float64) + # Create the data for the batched 1-D FFT. + B, N = 256, 1024 + a = cp.random.rand(B, N, dtype=cp.float64) + 1j * cp.random.rand(B, N, dtype=cp.float64) - # Compute the normalization factor. - scale = 1.0 / N + # Compute the normalization factor. + scale = 1.0 / N - # Define the epilog function for the FFT. - def rescale(data_out, offset, data, user_info, unused): - data_out[offset] = data * scale + # Define the epilog function for the FFT. + def rescale(data_out, offset, data, user_info, unused): + data_out[offset] = data * scale - # Compile the epilog to LTO-IR (in the context of the execution space). - with a.device: - epilog = nvmath.fft.compile_epilog(rescale, "complex128", "complex128") + # Compile the epilog to LTO-IR (in the context of the execution space). + with a.device: + epilog = nvmath.fft.compile_epilog(rescale, "complex128", "complex128") - # Perform the forward FFT, applying the filter as an epilog... - r = nvmath.fft.fft(a, axes=[-1], epilog={"ltoir": epilog}) + # Perform the forward FFT, applying the filter as an epilog... + r = nvmath.fft.fft(a, axes=[-1], epilog={"ltoir": epilog}) .. _device api section: @@ -281,12 +366,19 @@ Device APIs .. _device apis: -The :doc:`device APIs ` enable the user to call core mathematical operations in their Python CUDA kernels, resulting in a *fully fused kernel*. Fusion is essential -for performance in latency-dominated cases to reduce the number of kernel launches, and in memory-bound operations to avoid the extra roundtrip to global memory. +The :doc:`device APIs ` enable the user to call core mathematical +operations in their Python CUDA kernels, resulting in a *fully fused kernel*. Fusion is +essential for performance in latency-dominated cases to reduce the number of kernel +launches, and in memory-bound operations to avoid the extra roundtrip to global memory. 
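+
+As a minimal sketch, the device random-number APIs can be used from a Numba kernel as
+follows (mirroring the example in the Getting Started guide; the kernel and variable
+names are illustrative):
+
+.. code-block:: python
+
+    from numba import cuda
+    from nvmath.device import random
+
+    # Compile the device APIs once and link them into the kernels below.
+    compiled_apis = random.Compile()
+
+    threads, blocks = 64, 64
+    nthreads = blocks * threads
+
+    # Allocate one bit-generator state per thread and initialize it in a setup kernel.
+    states = random.StatesPhilox4_32_10(nthreads)
+
+    @cuda.jit(link=compiled_apis.files, extensions=compiled_apis.extension)
+    def setup(states):
+        i = cuda.grid(1)
+        random.init(1234, i, 0, states[i])
+
+    setup[blocks, threads](states)
+
+    # Sample values directly inside a user-written kernel.
+    @cuda.jit(link=compiled_apis.files, extensions=compiled_apis.extension)
+    def kernel(states):
+        i = cuda.grid(1)
+        values = random.normal2(states[i])
+
+    kernel[blocks, threads](states)
+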
-We currently offer support for calling FFT, matrix multiplication, and random number generation APIs in kernels written using `Numba`_, with plans to offer more core operations and support other compilers in the future. -The design of the device APIs closely mimics that of the C++ APIs from the corresponding NVIDIA Math Libraries (MathDx libraries `cuFFTDx `_ and `cuBLASDx `_ -for FFT and matrix multiplication, and `cuRAND device APIs `_ for random number generation). +We currently offer support for calling FFT, matrix multiplication, and random number +generation APIs in kernels written using `Numba`_, with plans to offer more core operations +and support other compilers in the future. The design of the device APIs closely mimics that +of the C++ APIs from the corresponding NVIDIA Math Libraries (MathDx libraries `cuFFTDx +`_ and `cuBLASDx +`_ for FFT and matrix multiplication, and +`cuRAND device APIs `_ +for random number generation). .. _commitment: @@ -294,19 +386,24 @@ Compatibility Policy ==================== nvmath-python is no different from any Python package, in that we would not succeed without -depending on, collaborating with, and evolving alongside the Python community. Given these considerations, we strive to meet the following commitments: +depending on, collaborating with, and evolving alongside the Python community. Given these +considerations, we strive to meet the following commitments: 1. For the :doc:`low-level Python bindings `, - * if the library to be bound is part of CUDA Toolkit, we support the library from the most recent two CUDA major versions (currently CUDA 11/12) + * if the library to be bound is part of CUDA Toolkit, we support the library from the + most recent two CUDA major versions (currently CUDA 11/12) * otherwise, we support the library within its major version Note that all bindings are currently *experimental*. -2. For the high-level pythonic APIs, we maintain backward compatibility to the greatest extent feasible. - When a breaking change is necessary, we issue a runtime warning to alert users of the upcoming changes in the next major release. - This practice ensures that breaking changes are clearly communicated and reserved for major version updates, allowing users to prepare and adapt without surprises. -3. We comply with `NEP-29`_ and support a community-defined set of core dependencies (CPython, NumPy, etc). +2. For the high-level pythonic APIs, we maintain backward compatibility to the greatest + extent feasible. When a breaking change is necessary, we issue a runtime warning to alert + users of the upcoming changes in the next major release. This practice ensures that + breaking changes are clearly communicated and reserved for major version updates, + allowing users to prepare and adapt without surprises. +3. We comply with `NEP-29`_ and support a community-defined set of core dependencies + (CPython, NumPy, etc). .. note:: The policy on backwards compatibility will apply starting with release ``1.0.0``. diff --git a/docs/sphinx/quickstart.rst b/docs/sphinx/quickstart.rst new file mode 100644 index 0000000..ee70b8b --- /dev/null +++ b/docs/sphinx/quickstart.rst @@ -0,0 +1,145 @@ +Getting Started +*************** + +nvmath-python brings the power of the NVIDIA math libraries to the Python ecosystem. +The package aims to provide intuitive pythonic APIs that provide users full access +to all the features offered by NVIDIA's libraries in a variety of execution spaces. 
+nvmath-python works seamlessly with existing Python array/tensor frameworks and focuses +on providing functionality that is missing from those frameworks. + +To learn more about the design of nvmath-python, visit our :doc:`Overview`. + +Installation +============ + +To quickly install nvmath-python just run the following command: + +.. code-block:: bash + + pip install nvmath-python[cu12,dx] + +For more details visit the :doc:`Installation Guide`. + +Examples +========= + +In the examples below, we quickly demonstrate the basic capabilities +of nvmath-python. You can find more examples in our +`GitHub repository `_. + +Matrix multiplication +--------------------- + +Using the nvmath-python API allows access to all parameters of the underlying +NVIDIA cuBLASLt library. +Some of these parameters are unavailable in other wrappings of NVIDIA's C-API libraries. + +.. doctest:: + + >>> import cupy as cp + >>> import nvmath + >>> + >>> m, n, k = 123, 456, 789 + >>> a = cp.random.rand(m, k).astype(cp.float32) + >>> b = cp.random.rand(k, n).astype(cp.float32) + >>> + >>> # Use the stateful nvmath.linalg.advanced.Matmul object in order to separate planning + >>> # from actual execution of matrix multiplication. nvmath-python allows you to fine-tune + >>> # your operations by, for example, selecting a mixed-precision compute type. + >>> options = { + ... "compute_type": nvmath.linalg.advanced.MatmulComputeType.COMPUTE_32F_FAST_16F + ... } + >>> with nvmath.linalg.advanced.Matmul(a, b, options=options) as mm: + ... algorithms = mm.plan() + ... result = mm.execute() + +To learn more about matrix multiplication in nvmath-python, have a look at +:py:class:`~nvmath.linalg.advanced.Matmul`. + +FFT with callback +----------------- + +User-defined functions can be `compiled to the LTO-IR format +`_ and +provided as epilog or prolog to the FFT operation, allowing for Link-Time Optimization and +fusing. + +This example shows how to perform a convolution by providing a Python callback function as +prolog to the IFFT operation. + +.. doctest:: + + >>> import cupy as cp + >>> import nvmath + >>> + >>> # Create the data for the batched 1-D FFT. + >>> B, N = 256, 1024 + >>> a = cp.random.rand(B, N, dtype=cp.float64) + 1j * cp.random.rand(B, N, dtype=cp.float64) + >>> + >>> # Create the data to use as filter. + >>> filter_data = cp.sin(a) + >>> + >>> # Define the prolog function for the inverse FFT. + >>> # A convolution corresponds to pointwise multiplication in the frequency domain. + >>> def convolve(data_in, offset, filter_data, unused): + ... # Note we are accessing `data_out` and `filter_data` with a single `offset` integer, + ... # even though the input and `filter_data` are 2D tensors (batches of samples). + ... # Care must be taken to assure that both arrays accessed here have the same memory + ... # layout. + ... return data_in[offset] * filter_data[offset] / N + >>> + >>> # Compile the prolog to LTO-IR. + >>> with cp.cuda.Device(): + ... prolog = nvmath.fft.compile_prolog(convolve, "complex128", "complex128") + >>> + >>> # Perform the forward FFT, followed by the inverse FFT, applying the filter as a prolog. + >>> r = nvmath.fft.fft(a, axes=[-1]) + >>> r = nvmath.fft.ifft(r, axes=[-1], prolog={ + ... "ltoir": prolog, + ... "data": filter_data.data.ptr + ... }) + +For further details, see the :ref:`FFT callbacks documentation `. + +Device APIs +----------- + +The device APIs of nvmath-python allow you to access the functionalities +of cuFFTDx, cuBLASDx, and cuRAND libraries in your kernels. 
+
+This example shows how to use cuRAND to sample single-precision values from a normal
+distribution.
+
+First, create the array of bit-generator states (one per thread).
+In this example, we'll use the
+:py:class:`Philox4_32_10` generator.
+
+.. doctest::
+
+    >>> from numba import cuda
+    >>> from nvmath.device import random
+    >>> compiled_apis = random.Compile()
+    >>>
+    >>> threads, blocks = 64, 64
+    >>> nthreads = blocks * threads
+    >>>
+    >>> states = random.StatesPhilox4_32_10(nthreads)
+    >>>
+    >>> # Next, define and launch a setup kernel, which will initialize the states using
+    >>> # the nvmath.device.random.init function.
+    >>> @cuda.jit(link=compiled_apis.files, extensions=compiled_apis.extension)
+    ... def setup(states):
+    ...     i = cuda.grid(1)
+    ...     random.init(1234, i, 0, states[i])
+    >>>
+    >>> setup[blocks, threads](states)
+    >>>
+    >>> # With your states array ready, you can use samplers such as
+    >>> # nvmath.device.random.normal2 to sample random values in your kernels.
+    >>> @cuda.jit(link=compiled_apis.files, extensions=compiled_apis.extension)
+    ... def kernel(states):
+    ...     i = cuda.grid(1)
+    ...     random_values = random.normal2(states[i])
+
+To learn more about this and other Device APIs,
+visit the documentation of :mod:`nvmath.device`.
diff --git a/docs/sphinx/release-notes.rst b/docs/sphinx/release-notes.rst
index 85c42b2..94852b0 100644
--- a/docs/sphinx/release-notes.rst
+++ b/docs/sphinx/release-notes.rst
@@ -1,22 +1,58 @@
 nvmath-python Release Notes
 ***************************
 
+nvmath-python v0.2.1
+====================
+
+Beta2 update 1 with improved diagnostics, testing enhancements, and bug fixes.
+
+* New tests for batched epilogs and autotuning with epilogs for the advanced matrix
+  multiplication APIs.
+* Added more hypothesis-based tests for host APIs.
+* Improved algorithm for detecting overlapping memory operands for certain sliced
+  tensors, thereby supporting such layouts for FFTs.
+* Added bindings for new APIs introduced in CTK versions 12.5 and 12.6.
+* Further coding style fixes toward meeting PEP8 recommendations.
+* Clarified batched semantics for matrix multiplication epilogs in the documentation.
+* Code snippets in API docstrings are now tested.
+
+Bugs Fixed
+----------
+
+* C2R FFT may fail with "illegal memory access" on sliced tensors.
+* Improved diagnostics to detect incompatible combinations of scale and compute types for
+  matrix multiplication, which previously may have resulted in incorrect results.
+* Matrix multiplication provided incorrect results when operand A is a vector (number of
+  dimensions=1).
+
+API Changes
+-----------
+
+* The ``last_axis_size`` option in :class:`nvmath.fft.FFTOptions` is now deprecated in
+  favor of ``last_axis_parity`` to better reflect its semantics.
+
+.. note::
+
+    Deprecated APIs will be removed in the next release.
+
 nvmath-python v0.2.0
 ====================
 
 Beta2 release.
 
-* CPU execution space support for FFT libraries that conform to FFTW3 API (e.g. MKL, NVPL).
+* CPU execution space support for FFT libraries that conform to FFTW3 API (for example
+  MKL, NVPL).
 * Support for prolog and epilog callback for FFT, written in Python.
 * New device APIs for random number generation.
 * Notebooks to illustrate use of advanced matrix multiplication APIs.
 * Introduced hypothesis-based tests for host APIs.
-* Reduced Python overhead in `execute` methods.
+* Reduced Python overhead in ``execute`` methods.
 
 Bugs Fixed
 ----------
 
-* Matrix multiplication may fail with "illegal memory access" for K=1 with DRELU and DGELU epilogs.
+* Matrix multiplication may fail with "illegal memory access" for K=1 with DRELU and DGELU + epilogs. Packaging --------- @@ -30,9 +60,10 @@ Known issues * When ``compute_type`` argument of :class:`nvmath.linalg.advanced.Matmul` is set to ``COMPUTE_16F``, an incompatible default for ``scale_type`` is chosen, resulting in - incorrect results for CTKs older than 12.6 and an error for CTK 12.6 and newer. - As a workaround we recommend setting both ``compute_type`` and ``scale_type`` in a - compatible manner according to `supported data types table `_. + incorrect results for CTKs older than 12.6 and an error for CTK 12.6 and newer. As a + workaround we recommend setting both ``compute_type`` and ``scale_type`` in a compatible + manner according to `supported data types table + `_. nvmath-python v0.1.0 ==================== @@ -47,10 +78,15 @@ The required and optional dependencies are summarized in the :ref:`cheatsheet `_ for more details. +* Many matrix multiplication epilogs require CTK 11.5+, and a few require CTK 11.8+. + Refer to `cuBLAS Release Notes + `_ + for more details. Disclaimer ========== -nvmath-python is in a Beta state. Beta products may not be fully functional, may contain errors or design flaws, and may be changed at any time without notice. We appreciate your feedback to improve and iterate on our Beta products. +nvmath-python is in a Beta state. Beta products may not be fully functional, may contain +errors or design flaws, and may be changed at any time without notice. We appreciate your +feedback to improve and iterate on our Beta products. diff --git a/examples/device/common_cupy.py b/examples/device/common_cupy.py index 13b0d2c..43eed9b 100644 --- a/examples/device/common_cupy.py +++ b/examples/device/common_cupy.py @@ -11,7 +11,7 @@ def time_cupy(fun, ncycles, *args): start.record(None) for _ in range(ncycles): - out = fun(*args) + out = fun(*args) # noqa: F841 stop.record(None) stop.synchronize() diff --git a/examples/device/common_numba.py b/examples/device/common_numba.py index 3e3db0a..4f323bd 100644 --- a/examples/device/common_numba.py +++ b/examples/device/common_numba.py @@ -51,7 +51,8 @@ def set_max_dynamic_shared_size_bytes(kernel, max_dynamic_smem_size, *args): ) -# matrix is always in C-order (cupy/numpy) but smem should always be in F-order (expected by cuBLASDx) +# matrix is always in C-order (cupy/numpy) but smem should always be in F-order (expected by +# cuBLASDx) @cuda.jit(inline="always") def load_to_shared_batched(matrix, smem, batch, dim, ld): start = cuda.threadIdx.x @@ -65,9 +66,7 @@ def load_to_shared_batched(matrix, smem, batch, dim, ld): @cuda.jit(inline="always") def load_to_shared(matrix, smem, dim, ld): - start = ( - cuda.threadIdx.x + cuda.threadIdx.y * cuda.blockDim.x + cuda.threadIdx.z * (cuda.blockDim.x * cuda.blockDim.y) - ) + start = cuda.threadIdx.x + cuda.threadIdx.y * cuda.blockDim.x + cuda.threadIdx.z * (cuda.blockDim.x * cuda.blockDim.y) step = cuda.blockDim.x * cuda.blockDim.y * cuda.blockDim.z stop = dim[0] * dim[1] for index in range(start, stop, step): @@ -78,9 +77,7 @@ def load_to_shared(matrix, smem, dim, ld): @cuda.jit(inline="always") def load_to_shared_2d(matrix, smem, dim): - start = ( - cuda.threadIdx.x + cuda.threadIdx.y * cuda.blockDim.x + cuda.threadIdx.z * (cuda.blockDim.x * cuda.blockDim.y) - ) + start = cuda.threadIdx.x + cuda.threadIdx.y * cuda.blockDim.x + cuda.threadIdx.z * (cuda.blockDim.x * cuda.blockDim.y) step = cuda.blockDim.x * cuda.blockDim.y * cuda.blockDim.z stop = dim[0] * dim[1] for index in 
range(start, stop, step): @@ -91,9 +88,7 @@ def load_to_shared_2d(matrix, smem, dim): @cuda.jit(inline="always") def load_to_shared_1d_float16x2(matrix, smem, dim, ld): - start = ( - cuda.threadIdx.x + cuda.threadIdx.y * cuda.blockDim.x + cuda.threadIdx.z * (cuda.blockDim.x * cuda.blockDim.y) - ) + start = cuda.threadIdx.x + cuda.threadIdx.y * cuda.blockDim.x + cuda.threadIdx.z * (cuda.blockDim.x * cuda.blockDim.y) step = cuda.blockDim.x * cuda.blockDim.y * cuda.blockDim.z stop = dim[0] * dim[1] for index in range(start, stop, step): @@ -117,9 +112,7 @@ def store_from_shared_batched(smem, matrix, batch, dim, ld): @cuda.jit(inline="always") def store_from_shared(smem, matrix, dim, ld): - start = ( - cuda.threadIdx.x + cuda.threadIdx.y * cuda.blockDim.x + cuda.threadIdx.z * (cuda.blockDim.x * cuda.blockDim.y) - ) + start = cuda.threadIdx.x + cuda.threadIdx.y * cuda.blockDim.x + cuda.threadIdx.z * (cuda.blockDim.x * cuda.blockDim.y) step = cuda.blockDim.x * cuda.blockDim.y * cuda.blockDim.z stop = dim[0] * dim[1] for index in range(start, stop, step): @@ -130,9 +123,7 @@ def store_from_shared(smem, matrix, dim, ld): @cuda.jit(inline="always") def store_from_shared_2d(smem, matrix, dim): - start = ( - cuda.threadIdx.x + cuda.threadIdx.y * cuda.blockDim.x + cuda.threadIdx.z * (cuda.blockDim.x * cuda.blockDim.y) - ) + start = cuda.threadIdx.x + cuda.threadIdx.y * cuda.blockDim.x + cuda.threadIdx.z * (cuda.blockDim.x * cuda.blockDim.y) step = cuda.blockDim.x * cuda.blockDim.y * cuda.blockDim.z stop = dim[0] * dim[1] for index in range(start, stop, step): @@ -143,9 +134,7 @@ def store_from_shared_2d(smem, matrix, dim): @cuda.jit(inline="always") def store_from_shared_1d_float16x2(smem, matrix, dim, ld): - start = ( - cuda.threadIdx.x + cuda.threadIdx.y * cuda.blockDim.x + cuda.threadIdx.z * (cuda.blockDim.x * cuda.blockDim.y) - ) + start = cuda.threadIdx.x + cuda.threadIdx.y * cuda.blockDim.x + cuda.threadIdx.z * (cuda.blockDim.x * cuda.blockDim.y) step = cuda.blockDim.x * cuda.blockDim.y * cuda.blockDim.z stop = dim[0] * dim[1] for index in range(start, stop, step): diff --git a/examples/device/cublasdx_blockdim_gemm_fp16.py b/examples/device/cublasdx_blockdim_gemm_fp16.py index e57e614..b620a99 100644 --- a/examples/device/cublasdx_blockdim_gemm_fp16.py +++ b/examples/device/cublasdx_blockdim_gemm_fp16.py @@ -44,7 +44,6 @@ def main(): value_type = MM.value_type a_size = MM.a_size b_size = MM.b_size - c_size = MM.c_size a_dim = MM.a_dim b_dim = MM.b_dim c_dim = MM.c_dim @@ -66,19 +65,15 @@ def f(a, b, c, alpha, beta, output): cuda.syncthreads() - if scenario == 0: - MM(alpha, smem_a, smem_b, beta, smem_c) - - elif scenario == 1: - MM(alpha, smem_a, smem_b, beta, smem_c) - - elif scenario == 2: - if cuda.threadIdx.y == 0: - MM(alpha, smem_a, smem_b, beta, smem_c) - - elif scenario == 3: - if cuda.threadIdx.z == 0: + match scenario: + case 0 | 1: MM(alpha, smem_a, smem_b, beta, smem_c) + case 2: + if cuda.threadIdx.y == 0: + MM(alpha, smem_a, smem_b, beta, smem_c) + case 3: + if cuda.threadIdx.z == 0: + MM(alpha, smem_a, smem_b, beta, smem_c) cuda.syncthreads() diff --git a/examples/device/cublasdx_fused_gemm_performance.py b/examples/device/cublasdx_fused_gemm_performance.py index 2b421cd..2f93af1 100644 --- a/examples/device/cublasdx_fused_gemm_performance.py +++ b/examples/device/cublasdx_fused_gemm_performance.py @@ -40,11 +40,9 @@ def main(): value_type = MM1.value_type a_size = MM1.a_size - b_size = MM1.b_size c_size = MM1.c_size d_size = MM2.b_size - f_size = MM2.c_size a_dim = MM1.a_dim 
b_dim = MM1.b_dim @@ -131,9 +129,7 @@ def kernel(alpha1, a, b, beta1, c, alpha2, d, beta2, f, output): alpha2 = 3 + 4j beta2 = 0 + 0j - set_max_dynamic_shared_size_bytes( - kernel, shared_memory_size, alpha1, a_d, b_d, beta1, c_d, alpha2, d_d, beta2, f_d, o_d - ) + set_max_dynamic_shared_size_bytes(kernel, shared_memory_size, alpha1, a_d, b_d, beta1, c_d, alpha2, d_d, beta2, f_d, o_d) time_ms = time_numba( kernel, 1, block_dim, shared_memory_size, 100, alpha1, a_d, b_d, beta1, c_d, alpha2, d_d, beta2, f_d, o_d diff --git a/examples/device/cublasdx_gemm_fft.py b/examples/device/cublasdx_gemm_fft.py index 662881a..fe4f819 100644 --- a/examples/device/cublasdx_gemm_fft.py +++ b/examples/device/cublasdx_gemm_fft.py @@ -45,7 +45,6 @@ def main(): a_size = MM.a_size b_size = MM.b_size - c_size = MM.c_size a_dim = MM.a_dim b_dim = MM.b_dim diff --git a/examples/device/cublasdx_gemm_fft_fp16.py b/examples/device/cublasdx_gemm_fft_fp16.py index 939b48b..b183549 100644 --- a/examples/device/cublasdx_gemm_fft_fp16.py +++ b/examples/device/cublasdx_gemm_fft_fp16.py @@ -51,14 +51,12 @@ def main(): a_size = MM.a_size b_size = MM.b_size - c_size = MM.c_size a_dim = MM.a_dim b_dim = MM.b_dim c_dim = MM.c_dim lda = MM.leading_dimension.a - ldb = MM.leading_dimension.b ldc = MM.leading_dimension.c shared_memory_size = max(MM.shared_memory_size, FFT.shared_memory_size) diff --git a/examples/device/cublasdx_gemm_fft_performance.py b/examples/device/cublasdx_gemm_fft_performance.py index 96c321b..9ae4de7 100644 --- a/examples/device/cublasdx_gemm_fft_performance.py +++ b/examples/device/cublasdx_gemm_fft_performance.py @@ -54,7 +54,6 @@ def main(): a_size = MM.a_size b_size = MM.b_size - c_size = MM.c_size a_dim = MM.a_dim b_dim = MM.b_dim @@ -126,9 +125,7 @@ def kernel(a, b, c, alpha, beta, output): def cp_kernel(a, b, c): cp_transform = lambda x: cp.multiply(x, x) - abc = cp.swapaxes(alpha * cp.einsum("bik,bkj->bij", cp_transform(a), b) + beta * c, 1, 2).reshape( - (batch_size, -1) - ) + abc = cp.swapaxes(alpha * cp.einsum("bik,bkj->bij", cp_transform(a), b) + beta * c, 1, 2).reshape((batch_size, -1)) return cp_transform(cp.fft.fft(abc, axis=-1)) data_ref = cp_kernel(a, b, c) diff --git a/examples/device/cublasdx_gemm_fusion.py b/examples/device/cublasdx_gemm_fusion.py index 21a25f3..7b7b0a4 100644 --- a/examples/device/cublasdx_gemm_fusion.py +++ b/examples/device/cublasdx_gemm_fusion.py @@ -44,11 +44,9 @@ def main(): ) a_size = MM1.a_size - b_size = MM1.b_size c_size = MM1.c_size d_size = MM2.b_size - f_size = MM2.c_size a_dim = MM1.a_dim b_dim = MM1.b_dim @@ -135,9 +133,7 @@ def kernel(alpha1, a, b, beta1, c, alpha2, d, beta2, f, output): alpha2 = 1.0 beta2 = 1.0 - set_max_dynamic_shared_size_bytes( - kernel, shared_memory_size, alpha1, a_d, b_d, beta1, c_d, alpha2, d_d, beta2, f_d, o_d - ) + set_max_dynamic_shared_size_bytes(kernel, shared_memory_size, alpha1, a_d, b_d, beta1, c_d, alpha2, d_d, beta2, f_d, o_d) kernel[1, block_dim, 0, shared_memory_size](alpha1, a_d, b_d, beta1, c_d, alpha2, d_d, beta2, f_d, o_d) cuda.synchronize() diff --git a/examples/device/cublasdx_simple_gemm_cfp16.py b/examples/device/cublasdx_simple_gemm_cfp16.py index ea56196..bcf9520 100644 --- a/examples/device/cublasdx_simple_gemm_cfp16.py +++ b/examples/device/cublasdx_simple_gemm_cfp16.py @@ -28,12 +28,10 @@ def main(): value_type = MM.value_type a_size = MM.a_size b_size = MM.b_size - c_size = MM.c_size a_dim = MM.a_dim b_dim = MM.b_dim c_dim = MM.c_dim block_dim = MM.block_dim - block_size = block_dim[0] ld = 
MM.leading_dimension lda, ldb, ldc = ld.a, ld.b, ld.c shared_memory_size = MM.shared_memory_size diff --git a/examples/device/cublasdx_simple_gemm_fp32.py b/examples/device/cublasdx_simple_gemm_fp32.py index 1125d50..18ee520 100644 --- a/examples/device/cublasdx_simple_gemm_fp32.py +++ b/examples/device/cublasdx_simple_gemm_fp32.py @@ -27,9 +27,6 @@ def main(): compiler="numba", ) - a_size = MM.a_size - b_size = MM.b_size - c_size = MM.c_size a_dim = MM.a_dim b_dim = MM.b_dim c_dim = MM.c_dim @@ -37,8 +34,8 @@ def main(): @cuda.jit(link=MM.files) def f(a, b, c, alpha, beta, output): - # cuBLASDx requires column-major arrays but cuda.shared.array creates row-major arrays (only) - # so we emulate a column-major array by flipping dimensions + # cuBLASDx requires column-major arrays but cuda.shared.array creates row-major + # arrays (only) so we emulate a column-major array by flipping dimensions smem_a = cuda.shared.array(shape=(a_dim[1], a_dim[0]), dtype=np.float32) smem_b = cuda.shared.array(shape=(b_dim[1], b_dim[0]), dtype=np.float32) smem_c = cuda.shared.array(shape=(c_dim[1], c_dim[0]), dtype=np.float32) diff --git a/examples/device/cublasdx_simple_gemm_leading_dimensions.py b/examples/device/cublasdx_simple_gemm_leading_dimensions.py index 4752a1b..ec20560 100644 --- a/examples/device/cublasdx_simple_gemm_leading_dimensions.py +++ b/examples/device/cublasdx_simple_gemm_leading_dimensions.py @@ -39,7 +39,6 @@ def main(): a_dim = MM_static_ld.a_dim b_dim = MM_static_ld.b_dim c_dim = MM_static_ld.c_dim - block_size = MM_static_ld.block_size shared_memory_size = MM_static_ld.shared_memory_size @cuda.jit(link=MM_static_ld.files) diff --git a/examples/device/cufftdx_autotuning.py b/examples/device/cufftdx_autotuning.py index a26a1d2..f7aba72 100644 --- a/examples/device/cufftdx_autotuning.py +++ b/examples/device/cufftdx_autotuning.py @@ -69,9 +69,7 @@ def f(input, output): data_test = output_d.copy_to_host() error = np.linalg.norm(data_test - data_ref) / np.linalg.norm(data_ref) assert error < 1e-5 - print( - f"Performance (elements_per_thread={elements_per_thread}, ffts_per_block={ffts_per_block}): {time_ms} [ms.]" - ) + print(f"Performance (elements_per_thread={elements_per_thread}, ffts_per_block={ffts_per_block}): {time_ms} [ms.]") if __name__ == "__main__": diff --git a/examples/device/cufftdx_block_fft_performance.py b/examples/device/cufftdx_block_fft_performance.py index 5bbee4c..894ca5b 100644 --- a/examples/device/cufftdx_block_fft_performance.py +++ b/examples/device/cufftdx_block_fft_performance.py @@ -74,7 +74,8 @@ def f(data, repeat): perf = fft_perf_GFlops(fft_size, batch_size, time_fft_ms) print( - f"#SMs {sms}\nBlocks per SM {blocks_per_sm}\nFFts per block {ffts_per_block}\nBatch size {batch_size}\nTime {time_fft_ms} ms\nPerf {perf} GFlop/s" + f"#SMs {sms}\nBlocks per SM {blocks_per_sm}\nFFts per block {ffts_per_block}\nBatch size {batch_size}" + f"\nTime {time_fft_ms} ms\nPerf {perf} GFlop/s" ) diff --git a/examples/device/cufftdx_convolution_r2c_c2r.py b/examples/device/cufftdx_convolution_r2c_c2r.py index d0f0a23..08c1008 100644 --- a/examples/device/cufftdx_convolution_r2c_c2r.py +++ b/examples/device/cufftdx_convolution_r2c_c2r.py @@ -55,7 +55,8 @@ def f(data): FFT_fwd(thread_data, shared_mem) - # After the first transform, the data is complex, so we have fft_size//2+1 complex elements per batch + # After the first transform, the data is complex, so we have fft_size//2+1 complex + # elements per batch index = cuda.threadIdx.x for i in range(elements_per_thread): if 
index < (fft_size // 2 + 1): @@ -64,7 +65,8 @@ def f(data): FFT_inv(thread_data, shared_mem) - # After the second transform, the data is real again, so we store fft_size real elements per batch + # After the second transform, the data is real again, so we store fft_size real + # elements per batch index = cuda.threadIdx.x for i in range(elements_per_thread): if index < fft_size: diff --git a/examples/device/cufftdx_convolution_r2c_c2r_packed_fold_optimized.py b/examples/device/cufftdx_convolution_r2c_c2r_packed_fold_optimized.py index 31d2847..6b5c329 100644 --- a/examples/device/cufftdx_convolution_r2c_c2r_packed_fold_optimized.py +++ b/examples/device/cufftdx_convolution_r2c_c2r_packed_fold_optimized.py @@ -32,7 +32,6 @@ def main(): ) complex_type = FFT_r2c.value_type - real_type = FFT_r2c.precision storage_size = FFT_r2c.storage_size shared_memory_size = FFT_r2c.shared_memory_size ffts_per_block = FFT_r2c.ffts_per_block @@ -73,7 +72,8 @@ def f(inout): for i in range(elements_per_thread): idx = i * stride + cuda.threadIdx.x if idx < size // 2: - # Fold optimized, so we load complex (ie 2 consecutive reals) instead of reals + # Fold optimized, so we load complex (ie 2 consecutive reals) instead of + # reals real_thread_data[2 * i + 0] = inout[global_fft_id, 2 * idx + 0] real_thread_data[2 * i + 1] = inout[global_fft_id, 2 * idx + 1] @@ -96,7 +96,8 @@ def f(inout): for i in range(elements_per_thread): idx = i * stride + cuda.threadIdx.x if idx < size // 2: - # Fold optimized, so we load complex (ie 2 consecutive reals) instead of reals + # Fold optimized, so we load complex (ie 2 consecutive reals) instead of + # reals inout[global_fft_id, 2 * idx + 0] = real_thread_data[2 * i + 0] inout[global_fft_id, 2 * idx + 1] = real_thread_data[2 * i + 1] diff --git a/examples/device/cufftdx_fft_2d_r2c_c2r.py b/examples/device/cufftdx_fft_2d_r2c_c2r.py index 953538d..716a7b2 100644 --- a/examples/device/cufftdx_fft_2d_r2c_c2r.py +++ b/examples/device/cufftdx_fft_2d_r2c_c2r.py @@ -23,7 +23,7 @@ def main(): fpb_x = 8 FFT_base = functools.partial(fft, precision=np.float32, execution="Block", compiler="numba") - # R2C along Y (fft_size_x batches, logical FFT size is fft_size_y, complex size is fft_size_y//2+1) + # R2C along Y (fft_size_x batches, logical FFT size is fft_size_y, complex size is fft_size_y//2+1) # noqa: W505 FFT_y_r2c = FFT_base(fft_type="r2c", size=fft_size_y, elements_per_thread=ept_y, ffts_per_block=fpb_y) # C2Cf along X (fft_size_y//2+1 batches, logical FFT size is fft_size_x) FFT_x_c2c_f = FFT_base( @@ -33,7 +33,7 @@ def main(): FFT_x_c2c_i = FFT_base( fft_type="c2c", direction="forward", size=fft_size_x, elements_per_thread=ept_x, ffts_per_block=fpb_x ) - # C2R along Y (fft_size_x batches, logical FFT size is fft_size_y, complex size is fft_size_y//2+1) + # C2R along Y (fft_size_x batches, logical FFT size is fft_size_y, complex size is fft_size_y//2+1) # noqa: W505 FFT_y_c2r = FFT_base(fft_type="c2r", size=fft_size_y, elements_per_thread=ept_y, ffts_per_block=fpb_y) complex_type = FFT_y_r2c.value_type diff --git a/examples/device/cufftdx_fft_2d_single_kernel.py b/examples/device/cufftdx_fft_2d_single_kernel.py index aee7db5..a50c167 100644 --- a/examples/device/cufftdx_fft_2d_single_kernel.py +++ b/examples/device/cufftdx_fft_2d_single_kernel.py @@ -97,7 +97,8 @@ def f(input, output): except cuda.cudadrv.driver.LinkerError as e: if str(e) == "libcudadevrt.a not found": print( - f"\n=== Numba linker error: {e}. 
Please use the System CTK option (see Getting Started in the documentation) to run this example. ===\n" + f"\n=== Numba linker error: {e}. Please use the System CTK option (see Installation in the documentation) " + "to run this example. ===\n" ) raise e cuda.synchronize() diff --git a/examples/device/cufftdx_fft_3d_box_single_block.py b/examples/device/cufftdx_fft_3d_box_single_block.py index c6fb287..2396742 100644 --- a/examples/device/cufftdx_fft_3d_box_single_block.py +++ b/examples/device/cufftdx_fft_3d_box_single_block.py @@ -53,7 +53,7 @@ def f(input, output): # threadIdx.x --> Z if tidy < fft_size_y and tidx < fft_size_z: for i in range(eptx): - # fast_copy(input, i * stride_x + tidy * stride_y + tidx * stride_z, thread_data, i) + # fast_copy(input, i * stride_x + tidy * stride_y + tidx * stride_z, thread_data, i) # noqa: W505 thread_data[i] = input[i, tidy, tidx] FFT_x(thread_data) @@ -118,7 +118,7 @@ def f(input, output): index += stride_x for i in range(eptx): - # fast_copy(thread_data, i, output, i * stride_x + tidy * stride_y + tidx * stride_z) + # fast_copy(thread_data, i, output, i * stride_x + tidy * stride_y + tidx * stride_z) # noqa: W505 output[i, tidy, tidx] = thread_data[i] input = random_complex((fft_size_x, fft_size_y, fft_size_z), real_dtype=np.float32) diff --git a/examples/device/cufftdx_fft_3d_cube_single_block.py b/examples/device/cufftdx_fft_3d_cube_single_block.py index ec0c599..8b51f5a 100644 --- a/examples/device/cufftdx_fft_3d_cube_single_block.py +++ b/examples/device/cufftdx_fft_3d_cube_single_block.py @@ -15,9 +15,7 @@ def main(): fft_size = 16 - FFT = fft( - fft_type="c2c", size=fft_size, direction="forward", precision=np.float32, execution="Thread", compiler="numba" - ) + FFT = fft(fft_type="c2c", size=fft_size, direction="forward", precision=np.float32, execution="Thread", compiler="numba") block_dim = Dim3(fft_size, fft_size, 1) grid_dim = Dim3(1, 1, 1) diff --git a/examples/device/cufftdx_helloworld.py b/examples/device/cufftdx_helloworld.py index e589bdf..ab5cc60 100644 --- a/examples/device/cufftdx_helloworld.py +++ b/examples/device/cufftdx_helloworld.py @@ -22,12 +22,11 @@ def main(): shared_memory_size = FFT.shared_memory_size files = FFT.files stride = FFT.stride - ept = FFT.elements_per_thread block_dim = FFT.block_dim ffts_per_block = FFT.ffts_per_block elements_per_thread = FFT.elements_per_thread - @cuda.jit(link=FFT.files) + @cuda.jit(link=files) def f(data): thread_data = cuda.local.array(shape=(storage_size,), dtype=value_type) diff --git a/examples/device/curand_cufftdx_block_fft.py b/examples/device/curand_cufftdx_block_fft.py index c12f543..2a5086f 100644 --- a/examples/device/curand_cufftdx_block_fft.py +++ b/examples/device/curand_cufftdx_block_fft.py @@ -3,8 +3,9 @@ # SPDX-License-Identifier: Apache-2.0 """ -This example is a slight modification of the block FFT example `cufftdx_block_fft.py`, and shows how to use RNG -device APIs (nvmath.device.random) within a kernel in conjunction with an FFT. +This example is a slight modification of the block FFT example `cufftdx_block_fft.py`, and +shows how to use RNG device APIs (nvmath.device.random) within a kernel in conjunction with +an FFT. 
""" import numpy as np @@ -64,7 +65,7 @@ def f(data, result, states): data = np.empty((ffts_per_block, size), dtype=np.complex64) data_d = cuda.to_device(data) result = np.empty((ffts_per_block, size), dtype=np.complex64) - result_d = cuda.to_device(data) + result_d = cuda.to_device(result) states = random.StatesXORWOW(block_dim.x * block_dim.y) setup_random[1, block_dim](states) diff --git a/examples/device/curand_philox_uniform4.py b/examples/device/curand_philox_uniform4.py index 7f37c32..4aa4edb 100644 --- a/examples/device/curand_philox_uniform4.py +++ b/examples/device/curand_philox_uniform4.py @@ -3,11 +3,12 @@ # SPDX-License-Identifier: Apache-2.0 """ -This example shows how to use the Philox4_32_10 bit generator to sample four single-precision values from a uniform -distribution in a single call. The values are wrapped into an object of type `uint32x4` (Numba vector type). +This example shows how to use the Philox4_32_10 bit generator to sample four +single-precision values from a uniform distribution in a single call. The values are wrapped +into an object of type `uint32x4` (Numba vector type). -Following recommended practice, the implementation is split into a state initialization kernel and a sample -generation kernel. +Following recommended practice, the implementation is split into a state initialization +kernel and a sample generation kernel. """ import numpy as np @@ -43,7 +44,8 @@ def count_upper_half(states, n, result): i = cuda.grid(1) count = 0 - # Count the number of samples that falls greater than 0.5, getting 4 values at a time. + # Count the number of samples that falls greater than 0.5, getting 4 values at a + # time. for sample in range(n // 4): v = random.uniform4(states[i]) a = v.x, v.y, v.z, v.w diff --git a/examples/device/curand_scrambled_sobol64.py b/examples/device/curand_scrambled_sobol64.py index d47fdbd..3d87023 100644 --- a/examples/device/curand_scrambled_sobol64.py +++ b/examples/device/curand_scrambled_sobol64.py @@ -3,14 +3,15 @@ # SPDX-License-Identifier: Apache-2.0 """ -This program uses the device CURAND API to calculate what proportion of quasi-random 3D points fall within a sphere -of radius 1, and to derive the volume of the sphere. +This program uses the device CURAND API to calculate what proportion of quasi-random 3D +points fall within a sphere of radius 1, and to derive the volume of the sphere. -In particular it uses 64 bit scrambled Sobol direction vectors returned by the host helper API `get_direction_vectors64` -to generate double-precision uniform samples. The host helper APIs can be accessed from the -`nvmath.device.random.random_helpers` module. +In particular it uses 64 bit scrambled Sobol direction vectors returned by the host helper +API `get_direction_vectors64` to generate double-precision uniform samples. The host helper +APIs can be accessed from the `nvmath.device.random.random_helpers` module. -See https://docs.nvidia.com/cuda/curand/device-api-overview.html#device-api-example for the corresponding C example. +See https://docs.nvidia.com/cuda/curand/device-api-overview.html#device-api-example for the +corresponding C example. """ import cffi @@ -69,8 +70,8 @@ def count_within_unit_sphere(states, n, result): result[id] += count - # The direction vectors and scramble constants are initialized on the host, using the helper - # functions in the `nvmath.device.random.random_helpers` module. 
+    # The direction vectors and scramble constants are initialized on the host, using the
+    # helper functions in the `nvmath.device.random.random_helpers` module.
     hostVectors = random.random_helpers.get_direction_vectors64(
         random.random_helpers.DirectionVectorSet.SCRAMBLED_DIRECTION_VECTORS_64_JOEKUO6,
         nthreads * ndim,
diff --git a/examples/device/curand_xorwow_uniform.py b/examples/device/curand_xorwow_uniform.py
index 17738e7..85e856f 100644
--- a/examples/device/curand_xorwow_uniform.py
+++ b/examples/device/curand_xorwow_uniform.py
@@ -3,10 +3,11 @@
 # SPDX-License-Identifier: Apache-2.0
 
 """
-This example shows how to use the XORWOW bit generator to sample double-precision values from a uniform distribution.
+This example shows how to use the XORWOW bit generator to sample double-precision values
+from a uniform distribution.
 
-Following recommended practice, the implementation is split into a state initialization kernel and a sample
-generation kernel.
+Following recommended practice, the implementation is split into a state initialization
+kernel and a sample generation kernel.
 """
 
 import numpy as np
diff --git a/examples/fft/caching.py b/examples/fft/caching.py
index 90c6f35..8a18e8a 100644
--- a/examples/fft/caching.py
+++ b/examples/fft/caching.py
@@ -36,21 +36,23 @@ def fft(
     cache: dict | None = None,
 ):
     """
-    A cached version of FFT, taking a cache argument in addition the the regular arguments for fft(). The stateful
-    objects are cached in the provided cache, and reused.
+    A cached version of FFT, taking a cache argument in addition to the regular arguments
+    for fft(). The stateful objects are cached in the provided cache, and reused.
 
     Args:
         cache: an object to use as the cache that satisfies `typing.Mapping` concept.
 
     Note:
-        User is responsible for explicitly free all resources stored in `cache` after no longer needed.
-        If a native `dict` object is used to store the cache, the resources can be released via:
+        The user is responsible for explicitly freeing all resources stored in `cache`
+        once they are no longer needed. If a native `dict` object is used to store the
+        cache, the resources can be released via:
 
         >>> for f in cache.values():
         >>>     f.free()
 
-        Alternatively, users may use the `FFTCache` class above.
-        Resources can be cleaned by a call the the `free` method or will be automatically released if used in a context manager.
+        Alternatively, users may use the `FFTCache` class above. Resources can be cleaned
+        by a call to the `free` method, or will be automatically released if the cache is
+        used in a context manager.
     """
     if cache is None:
         cache = {}
@@ -61,7 +63,8 @@ def fft(
 
     key = nvmath.fft.FFT.create_key(a, axes=axes, options=options, execution=execution, prolog=prolog, epilog=epilog)
 
-    # Get object from cache if it already exists, or create a new one and add it to the cache.
+    # Get object from cache if it already exists, or create a new one and add it to the
+    # cache.
     if (key, stream_ptr) in cache:
         logger.info("Cache HIT: using planned object.")
         # The planned object is already cached, so retrieve it.
diff --git a/examples/fft/example01_torch_complex32.py b/examples/fft/example01_torch_complex32.py
index 3aec710..ab19051 100644
--- a/examples/fft/example01_torch_complex32.py
+++ b/examples/fft/example01_torch_complex32.py
@@ -3,7 +3,8 @@
 # SPDX-License-Identifier: Apache-2.0
 
 """
-This example describes how to perform FFT on PyTorch tensors with low precision using function-form FFT APIs.
+This example describes how to perform FFT on PyTorch tensors with low precision using +function-form FFT APIs. """ import torch diff --git a/examples/fft/example03_options.py b/examples/fft/example03_options.py index a0c9932..70e3496 100644 --- a/examples/fft/example03_options.py +++ b/examples/fft/example03_options.py @@ -5,16 +5,17 @@ """ This example illustrates how to specify options to an FFT operation. -In this example, we will use a NumPy ndarray as input, and we will look at two equivalent ways of providing options to enforce natural layout for the output. +In this example, we will use a NumPy ndarray as input, and we will look at two equivalent +ways of providing options to enforce natural layout for the output. -The NumPy ndarrays reside in CPU memory. There are two ways to process CPU tensors with nvmath: -either use host library to process the tensor directly or copy the tensor to GPU memory -and process it with cuFFT. +The NumPy ndarrays reside in CPU memory. There are two ways to process CPU tensors with +nvmath: either use host library to process the tensor directly or copy the tensor to GPU +memory and process it with cuFFT. The default behaviour has changed in Beta2, the cpu arrays will default to processing with host library: NVPL (Nvidia Performance Libraries), MKL or any other FFTW3-compatible -library. In this example, we explicitly set ``execution="cuda"``, to copy the data on -GPU for processing with cuFFT. +library. In this example, we explicitly set ``execution="cuda"``, to copy the data on GPU +for processing with cuFFT. """ import numpy as np @@ -36,7 +37,8 @@ print(f"Does the FFT result shared the same layout as the input ? {b.strides == a.strides}") print(f"Input type = {type(a)}, FFT output type = {type(b)}") -# Alternative #2 for specifying options, using dict. The two alternatives are entirely equivalent. +# Alternative #2 for specifying options, using dict. The two alternatives are entirely +# equivalent. b = nvmath.fft.fft(a, axes=axes, options={"result_layout": "natural"}, execution="cuda") print(f"Does the FFT result shared the same layout as the input ? {b.strides == a.strides}") print(f"Input type = {type(a)}, FFT output type = {type(b)}") diff --git a/examples/fft/example03_options_cpu_execution.py b/examples/fft/example03_options_cpu_execution.py index 497ad28..0bffe74 100644 --- a/examples/fft/example03_options_cpu_execution.py +++ b/examples/fft/example03_options_cpu_execution.py @@ -5,13 +5,14 @@ """ This example illustrates how to specify options to an FFT operation. -In this example, we will use a NumPy ndarray as input, and we will look at two equivalent ways of providing: -- FFT options to enforce natural layout for the output, and -- execution options to change the number of CPU threads processing the FFT. +In this example, we will use a NumPy ndarray as input, and we will look at two equivalent +ways of providing: + - FFT options to enforce natural layout for the output, and + - execution options to change the number of CPU threads processing the FFT. -The NumPy ndarrays reside in CPU memory. There are two ways to process CPU tensors with nvmath: -either use host library to process the tensor directly or copy the tensor to GPU memory -and process it with cuFFT. +The NumPy ndarrays reside in CPU memory. There are two ways to process CPU tensors with +nvmath: either use host library to process the tensor directly or copy the tensor to GPU +memory and process it with cuFFT. 
The quickest way for pip users to use nvmath-python with the CPU execution space is to add cpu to the extras:``pip install nvmath-python[cu12,cpu]`, for example. @@ -42,7 +43,8 @@ print(f"Does the FFT result shared the same layout as the input ? {b.strides == a.strides}") print(f"Input type = {type(a)}, FFT output type = {type(b)}") -# Alternative #2 for specifying options, using dict. The two alternatives are entirely equivalent. +# Alternative #2 for specifying options, using dict. The two alternatives are entirely +# equivalent. b = nvmath.fft.fft( a, axes=axes, diff --git a/examples/fft/example04_logging_global.py b/examples/fft/example04_logging_global.py index 6097ce9..763a894 100644 --- a/examples/fft/example04_logging_global.py +++ b/examples/fft/example04_logging_global.py @@ -15,8 +15,8 @@ shape = 512, 512, 256 axes = 0, 1 -# Turn on logging. Here we use the global logger, set the level to "debug", and use a custom format for the log. -# Any of the features provided by the logging module can be used. +# Turn on logging. Here we use the global logger, set the level to "debug", and use a custom +# format for the log. Any of the features provided by the logging module can be used. logging.basicConfig(level=logging.DEBUG, format="%(asctime)s %(levelname)-8s %(message)s", datefmt="%m-%d %H:%M:%S") a = cp.random.rand(*shape, dtype=cp.float64) + 1j * cp.random.rand(*shape, dtype=cp.float64) diff --git a/examples/fft/example04_logging_user.py b/examples/fft/example04_logging_user.py index 203829b..02b1da6 100644 --- a/examples/fft/example04_logging_user.py +++ b/examples/fft/example04_logging_user.py @@ -28,7 +28,8 @@ formatter = logging.Formatter("%(asctime)s %(name)-12s %(levelname)-8s %(message)s", datefmt="%m-%d %H:%M:%S") handler.setFormatter(formatter) -# Associate handler with logger, resulting in a logger with the desired level, format, and console output. +# Associate handler with logger, resulting in a logger with the desired level, format, and +# console output. logger.addHandler(handler) @@ -42,8 +43,8 @@ print("---") -# Recall that the options can also be provided as a dict, so the following is an alternative, entirely -# equivalent way to specify options. +# Recall that the options can also be provided as a dict, so the following is an +# alternative, entirely equivalent way to specify options. b = nvmath.fft.fft(a, axes=axes, options={"logger": logger}) # Synchronize the default stream diff --git a/examples/fft/example05_stateful_inplace.py b/examples/fft/example05_stateful_inplace.py index 90152ef..633efe7 100644 --- a/examples/fft/example05_stateful_inplace.py +++ b/examples/fft/example05_stateful_inplace.py @@ -6,8 +6,11 @@ This example illustrates the use of inplace update of input operands in stateful FFT APIs. The input as well as the result from the FFT operations are CuPy ndarrays. + NOTE: The use of inplace updating input operands should be adopted with caution. -For the following cases, inplace updating the input operands will not affect the result operand: + +For the following cases, inplace updating the input operands will not affect the result +operand: - The input operand reside on CPU. - The input operand reside on GPU but the operation amounts to a C2R FFT. 
""" diff --git a/examples/fft/example05_stateful_reset.py b/examples/fft/example05_stateful_reset.py index b82b1b3..2a8a310 100644 --- a/examples/fft/example05_stateful_reset.py +++ b/examples/fft/example05_stateful_reset.py @@ -3,9 +3,11 @@ # SPDX-License-Identifier: Apache-2.0 """ -This example illustrates how to reuse the stateful API to perform FFT operations on operands with the same layout. +This example illustrates how to reuse the stateful API to perform FFT operations on operands +with the same layout. -In this example we will perform a forward and an inverse FFT operation to demonstrate how to recover the original input operand. +In this example we will perform a forward and an inverse FFT operation to demonstrate how to +recover the original input operand. """ import cupy as cp @@ -17,8 +19,8 @@ a = cp.ones(shape, dtype=cp.complex64) -# Create a stateful FFT object 'f'. -# Note here that we need to enforce natural layout in the result in order to reuse the FFT object +# Create a stateful FFT object 'f'. Note here that we need to enforce natural layout in the +# result in order to reuse the FFT object with nvmath.fft.FFT(a, axes=axes, options={"result_layout": "natural"}) as f: # Plan the FFT. f.plan() diff --git a/examples/fft/example06_r2c.py b/examples/fft/example06_r2c.py index 2168e77..bcaf983 100644 --- a/examples/fft/example06_r2c.py +++ b/examples/fft/example06_r2c.py @@ -3,7 +3,8 @@ # SPDX-License-Identifier: Apache-2.0 """ -Example showing how to perform N-dimensional real-to-complex (R2C) FFT computation with function-form FFT APIs. +Example showing how to perform N-dimensional real-to-complex (R2C) FFT computation with +function-form FFT APIs. """ import cupy as cp diff --git a/examples/fft/example07_c2r.py b/examples/fft/example07_c2r.py index 04c3441..ad286aa 100644 --- a/examples/fft/example07_c2r.py +++ b/examples/fft/example07_c2r.py @@ -3,7 +3,8 @@ # SPDX-License-Identifier: Apache-2.0 """ -Example showing how to perform N-dimensional complex-to-real (C2R) FFT computation with function-form FFT APIs. +Example showing how to perform N-dimensional complex-to-real (C2R) FFT computation with +function-form FFT APIs. """ import cupy as cp diff --git a/examples/fft/example07_c2r_odd.py b/examples/fft/example07_c2r_odd.py index 412014f..78a3984 100644 --- a/examples/fft/example07_c2r_odd.py +++ b/examples/fft/example07_c2r_odd.py @@ -3,7 +3,9 @@ # SPDX-License-Identifier: Apache-2.0 """ -This example illustrates the use of function-form FFT APIs to perform N-dimensional complex-to-real (C2R) FFT computation with the size of the last axis in the result being odd. +This example illustrates the use of function-form FFT APIs to perform N-dimensional +complex-to-real (C2R) FFT computation with the size of the last axis in the result being +odd. """ import cupy as cp @@ -19,7 +21,7 @@ a = nvmath.fft.rfft(t, axes=axes) # Complex-to-real FFT along (0,1), batched along axis=2. -b = nvmath.fft.irfft(a, axes=axes, options={"last_axis_size": "odd"}) +b = nvmath.fft.irfft(a, axes=axes, options={"last_axis_parity": "odd"}) # Synchronize the default stream cp.cuda.get_current_stream().synchronize() diff --git a/examples/fft/example08_cupy_inplace.py b/examples/fft/example08_cupy_inplace.py index 5ecc0b7..689619f 100644 --- a/examples/fft/example08_cupy_inplace.py +++ b/examples/fft/example08_cupy_inplace.py @@ -3,7 +3,8 @@ # SPDX-License-Identifier: Apache-2.0 """ -This example illustrates how to perform inplace FFT for CuPy ndarrays using function-form FFT APIs. 
+This example illustrates how to perform inplace FFT for CuPy ndarrays using function-form +FFT APIs. """ import cupy as cp diff --git a/examples/fft/example08_numpy_inplace.py b/examples/fft/example08_numpy_inplace.py index 827b1c6..9b48608 100644 --- a/examples/fft/example08_numpy_inplace.py +++ b/examples/fft/example08_numpy_inplace.py @@ -3,7 +3,8 @@ # SPDX-License-Identifier: Apache-2.0 """ -This example illustrates how to perform inplace FFT for NumPy ndarrays using function-form FFT APIs. +This example illustrates how to perform inplace FFT for NumPy ndarrays using function-form +FFT APIs. The NumPy ndarrays reside in CPU memory. There are two ways to process CPU tensors with nvmath: either use host library to process the tensor directly or copy the tensor to GPU @@ -11,8 +12,8 @@ The default behaviour has changed in Beta2, the cpu arrays will default to processing with host library: NVPL (Nvidia Performance Libraries), MKL or any other FFTW3-compatible -library. In this example, we explicitly set ``execution="cuda"``, to copy the data on -GPU for processing with cuFFT. +library. In this example, we explicitly set ``execution="cuda"``, to copy the data on GPU +for processing with cuFFT. """ import numpy as np diff --git a/examples/fft/example08_numpy_inplace_cpu_execution.py b/examples/fft/example08_numpy_inplace_cpu_execution.py index 49323f8..4bbd491 100644 --- a/examples/fft/example08_numpy_inplace_cpu_execution.py +++ b/examples/fft/example08_numpy_inplace_cpu_execution.py @@ -3,7 +3,8 @@ # SPDX-License-Identifier: Apache-2.0 """ -This example illustrates how to perform inplace FFT for NumPy ndarrays using function-form FFT APIs. +This example illustrates how to perform inplace FFT for NumPy ndarrays using function-form +FFT APIs. The NumPy ndarrays reside in CPU memory. There are two ways to process CPU tensors with nvmath: either use host library to process the tensor directly or copy the tensor to GPU @@ -16,8 +17,8 @@ The quickest way for pip users to use nvmath-python with the CPU execution space is to add cpu to the extras:``pip install nvmath-python[cu12,cpu]`, for example. -The input as well as the result from the FFT operations are NumPy ndarrays, resulting -in effortless interoperability between nvmath-python and NumPy. +The input as well as the result from the FFT operations are NumPy ndarrays, resulting in +effortless interoperability between nvmath-python and NumPy. """ import numpy as np diff --git a/examples/fft/example09_streams.py b/examples/fft/example09_streams.py index a3d257e..a13369e 100644 --- a/examples/fft/example09_streams.py +++ b/examples/fft/example09_streams.py @@ -15,7 +15,8 @@ a = cp.random.rand(*shape, dtype=cp.float64) + 1j * cp.random.rand(*shape, dtype=cp.float64) -# Create a CUDA stream to use for instantiating, planning, and first execution of a stateful FFT object 'f'. +# Create a CUDA stream to use for instantiating, planning, and first execution of a stateful +# FFT object 'f'. s1 = cp.cuda.Stream() # Create a stateful FFT object 'f' on stream s1. @@ -29,22 +30,22 @@ # Record an event on s1 for use later. e1 = s1.record() - # Create a new stream to on which the new operand c for the second execution will be filled. + # Create a new stream to on which the new operand c for the second execution will be + # filled. s2 = cp.cuda.Stream() # Fill c on s2. with s2: c = cp.random.rand(*shape, dtype=cp.float64) + 1j * cp.random.rand(*shape, dtype=cp.float64) - # In the following blocks, we will use stream s2 to perform subsequent operations. 
-    # Note that it's our responsibility as a user to ensure proper ordering,
-    # and we want to order `reset_operand` after event e1 corresponding to the execute() call above.
+    # In the following blocks, we will use stream s2 to perform subsequent operations. Note
+    # that it's our responsibility as a user to ensure proper ordering, and we want to order
+    # `reset_operand` after event e1 corresponding to the execute() call above.
     s2.wait_event(e1)
 
-    # Alternatively, if we want to use stream s1 for subsequent operations (s2 only for operand creation),
-    # we need to order `reset_operand` after the event for cupy.random.rand on s2, e.g:
-    # e2 = s2.record()
-    # s1.waite_event(e2)
+    # Alternatively, if we want to use stream s1 for subsequent operations (s2 only for
+    # operand creation), we need to order `reset_operand` after the event for
+    # cupy.random.rand on s2, e.g.: e2 = s2.record(); s1.wait_event(e2)
 
     # Set a new operand c on stream s2.
     f.reset_operand(c, stream=s2)
diff --git a/examples/fft/example11_resource_mgmt.py b/examples/fft/example11_resource_mgmt.py
index 5724c6d..40b90d9 100644
--- a/examples/fft/example11_resource_mgmt.py
+++ b/examples/fft/example11_resource_mgmt.py
@@ -3,12 +3,13 @@
 # SPDX-License-Identifier: Apache-2.0
 
 """
-This example shows how to manage memory resources used by stateful objects. This is useful when the FFT operation
-needs a lot of memory and calls to execution method on a stateful object are interleaved with calls to other
-operations (including another FFT) also requiring a lot of memory.
+This example shows how to manage memory resources used by stateful objects. This is useful
+when the FFT operation needs a lot of memory and calls to the execution method on a
+stateful object are interleaved with calls to other operations (including another FFT)
+also requiring a lot of memory.
 
-In this example, two FFT operations are performed in a loop in an interleaved manner.
-We assume that the available device memory is large enough for only one FFT at a time.
+In this example, two FFT operations are performed in a loop in an interleaved manner. We
+assume that the available device memory is large enough for only one FFT at a time.
 """
 
 import logging
@@ -35,20 +36,22 @@
 f2.plan()
 
 num_iter = 3
-# Use the FFT objects as context managers so that internal library resources are properly cleaned up.
+# Use the FFT objects as context managers so that internal library resources are properly
+# cleaned up.
 with f1, f2:
     for i in range(num_iter):
         print(f"Iteration {i}")
-        # Perform the first contraction, and request that the workspace be released at the end of the operation so that there is enough
-        # memory for the second one.
+        # Perform the first FFT, and request that the workspace be released at the
+        # end of the operation so that there is enough memory for the second one.
         r = f1.execute(release_workspace=True)
 
         # Update f1's operands for the next iteration.
         if i < num_iter - 1:
             a[:] = cp.random.rand(*shape, dtype=cp.float32) + 1j * cp.random.rand(*shape, dtype=cp.float32)
 
-        # Perform the second FFT, and request that the workspace be released at the end of the operation so that there is enough
-        # memory for the first FFT in the next iteration.
+        # Perform the second FFT, and request that the workspace be released at the end of
+        # the operation so that there is enough memory for the first FFT in the next
+        # iteration.
         r = f2.execute(release_workspace=True)
 
         # Update f2's operands for the next iteration.
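
Both FFT example edits above center on the stateful object lifecycle: create the FFT
object, plan it, execute (optionally releasing the workspace), and clean up via a context
manager. A condensed sketch of the interleaved pattern from `example11_resource_mgmt.py`
follows; the shapes, axes, and iteration count are illustrative only and are not taken
from the patch::

    import cupy as cp
    import nvmath

    shape = 256, 256, 64
    a = cp.random.rand(*shape, dtype=cp.float32) + 1j * cp.random.rand(*shape, dtype=cp.float32)
    b = cp.random.rand(*shape, dtype=cp.float32) + 1j * cp.random.rand(*shape, dtype=cp.float32)

    # Create and plan two stateful FFT objects up front.
    f1 = nvmath.fft.FFT(a, axes=(0, 1))
    f2 = nvmath.fft.FFT(b, axes=(0, 1))
    f1.plan()
    f2.plan()

    # Using the objects as context managers releases library resources on exit.
    # release_workspace=True frees each object's workspace after execute(), so the other
    # FFT has enough device memory to run in the next step.
    with f1, f2:
        for _ in range(3):
            r1 = f1.execute(release_workspace=True)
            r2 = f2.execute(release_workspace=True)

    # Execution is non-blocking for GPU operands, so synchronize before consuming results.
    cp.cuda.get_current_stream().synchronize()
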
diff --git a/examples/fft/example12_stateful_unsupported_fallback.py b/examples/fft/example12_stateful_unsupported_fallback.py index ead7234..632bbfc 100644 --- a/examples/fft/example12_stateful_unsupported_fallback.py +++ b/examples/fft/example12_stateful_unsupported_fallback.py @@ -3,7 +3,8 @@ # SPDX-License-Identifier: Apache-2.0 """ -Example showing the fallback path for an unsupported layout error using the class-form FFT APIs. +Example showing the fallback path for an unsupported layout error using the class-form FFT +APIs. """ import cupy as cp @@ -17,7 +18,8 @@ r = cp.fft.fftn(a, axes=axes) -# Create a stateful FFT object 'f'. Fallback to suggested layout since the original layout-axes combination is currently not supported. +# Create a stateful FFT object 'f'. Fallback to suggested layout since the original +# layout-axes combination is currently not supported. try: f = nvmath.fft.FFT(a, axes=axes) permutation = None diff --git a/examples/fft/example12_unsupported_fallback.py b/examples/fft/example12_unsupported_fallback.py index a947528..9fd7522 100644 --- a/examples/fft/example12_unsupported_fallback.py +++ b/examples/fft/example12_unsupported_fallback.py @@ -3,7 +3,8 @@ # SPDX-License-Identifier: Apache-2.0 """ -Example showing the fallback path for an unsupported layout error using the function-form FFT APIs. +Example showing the fallback path for an unsupported layout error using the function-form +FFT APIs. """ import cupy as cp @@ -18,7 +19,8 @@ r = cp.fft.fftn(a, axes=axes) try: - # Forward FFT along (0,2), batched along axis=1. This is not yet supported by the cuFFT C library. + # Forward FFT along (0,2), batched along axis=1. This is not yet supported by the cuFFT + # C library. b = nvmath.fft.fft(a, axes=axes) except nvmath.fft.UnsupportedLayoutError as e: # Permute the input, and copy. diff --git a/examples/fft/example13_cupy_mt_mgpu.py b/examples/fft/example13_cupy_mt_mgpu.py index 2cf0cf6..d6bc1e3 100644 --- a/examples/fft/example13_cupy_mt_mgpu.py +++ b/examples/fft/example13_cupy_mt_mgpu.py @@ -3,7 +3,8 @@ # SPDX-License-Identifier: Apache-2.0 """ -Example using a thread pool to launch multiple independent FFT operations in parallel on multiple GPUs. +Example using a thread pool to launch multiple independent FFT operations in parallel on +multiple GPUs. """ from functools import partial diff --git a/examples/fft/example13_numpy_mp_mgpu.py b/examples/fft/example13_numpy_mp_mgpu.py index 960aacf..c3cfefe 100644 --- a/examples/fft/example13_numpy_mp_mgpu.py +++ b/examples/fft/example13_numpy_mp_mgpu.py @@ -3,7 +3,8 @@ # SPDX-License-Identifier: Apache-2.0 """ -Example using a process pool to launch multiple independent FFT operations in parallel on multiple GPUs. +Example using a process pool to launch multiple independent FFT operations in parallel on +multiple GPUs. The NumPy ndarrays reside in CPU memory. There are two ways to process CPU tensors with nvmath: either use host library to process the tensor directly or copy the tensor to GPU @@ -11,8 +12,8 @@ The default behaviour has changed in Beta2, the cpu arrays will default to processing with host library: NVPL (Nvidia Performance Libraries), MKL or any other FFTW3-compatible -library. In this example, we explicitly set ``execution="cuda"``, to copy the data on -GPU for processing with cuFFT. +library. In this example, we explicitly set ``execution="cuda"``, to copy the data on GPU +for processing with cuFFT. 
""" import multiprocessing diff --git a/examples/fft/example14_caching.py b/examples/fft/example14_caching.py index 030327c..cb33745 100644 --- a/examples/fft/example14_caching.py +++ b/examples/fft/example14_caching.py @@ -3,7 +3,8 @@ # SPDX-License-Identifier: Apache-2.0 """ -Example showing how to reuse cached objects for repeated computation on the same problem specification with different operands. +Example showing how to reuse cached objects for repeated computation on the same problem +specification with different operands. The cached implementation is provided in `caching.py`. """ diff --git a/examples/fft/example15_cupy_nd_fft_benchmark.py b/examples/fft/example15_cupy_nd_fft_benchmark.py index a451787..adcb257 100644 --- a/examples/fft/example15_cupy_nd_fft_benchmark.py +++ b/examples/fft/example15_cupy_nd_fft_benchmark.py @@ -3,7 +3,8 @@ # SPDX-License-Identifier: Apache-2.0 """ -Example on benchmarking an N-D FFT implementation as a composition of the 1D, 2D, or 3D batched FFTs. +Example on benchmarking an N-D FFT implementation as a composition of the 1D, 2D, or 3D +batched FFTs. The basic reference implementation is provided in `fftn1.py`. """ diff --git a/examples/fft/example16_cupy_nd_fft_benchmark.py b/examples/fft/example16_cupy_nd_fft_benchmark.py index 9570ec0..104eccc 100644 --- a/examples/fft/example16_cupy_nd_fft_benchmark.py +++ b/examples/fft/example16_cupy_nd_fft_benchmark.py @@ -3,9 +3,11 @@ # SPDX-License-Identifier: Apache-2.0 """ -Example on benchmarking an N-D FFT implementation as a composition of the 1D, 2D, or 3D batched FFTs. +Example on benchmarking an N-D FFT implementation as a composition of the 1D, 2D, or 3D +batched FFTs. -The reference implementation is provided in `fftn2.py` with optimizations to reduce the number of copies compared with `fftn1.py`. +The reference implementation is provided in `fftn2.py` with optimizations to reduce the +number of copies compared with `fftn1.py`. 
""" import functools diff --git a/examples/fft/example17_trunc.py b/examples/fft/example17_trunc.py index 0ad71cc..45ed02a 100644 --- a/examples/fft/example17_trunc.py +++ b/examples/fft/example17_trunc.py @@ -23,20 +23,14 @@ r = truncated_fft(a, axes=axes, extents=extents) nvtime = benchmark(truncated_fft, args=(a,), kwargs={"axes": axes, "extents": extents}, n_repeat=10) -print( - f"{len(axes)}-D FFT for axes={axes} and extents={extents} based on nvmath-python (non-cached version):\n{nvtime}\n" -) +print(f"{len(axes)}-D FFT for axes={axes} and extents={extents} based on nvmath-python (non-cached version):\n{nvtime}\n") with FFTCache() as cache: cached_fft = functools.partial(cached_fft, cache=cache) r = truncated_fft(a, axes=axes, extents=extents, engine=cached_fft) - nvtime = benchmark( - truncated_fft, args=(a,), kwargs={"axes": axes, "extents": extents, "engine": cached_fft}, n_repeat=10 - ) - print( - f"{len(axes)}-D FFT for axes={axes} and extents={extents} based on nvmath-python (cached version):\n{nvtime}\n" - ) + nvtime = benchmark(truncated_fft, args=(a,), kwargs={"axes": axes, "extents": extents, "engine": cached_fft}, n_repeat=10) + print(f"{len(axes)}-D FFT for axes={axes} and extents={extents} based on nvmath-python (cached version):\n{nvtime}\n") cptime = benchmark(cp.fft.fftn, args=(a,), kwargs={"axes": axes, "s": extents}, n_repeat=10) print(f"{len(axes)}-D FFT for axes={axes} and extents={extents} based on CuPy:\n{cptime}\n") diff --git a/examples/fft/example18_5D_trunc.py b/examples/fft/example18_5D_trunc.py index 2b6571d..0989465 100644 --- a/examples/fft/example18_5D_trunc.py +++ b/examples/fft/example18_5D_trunc.py @@ -25,9 +25,7 @@ r = truncated_fft(a, axes=axes, extents=extents, engine=fftn) nvtime = benchmark(truncated_fft, args=(a,), kwargs={"axes": axes, "extents": extents, "engine": fftn}, n_repeat=10) -print( - f"{len(axes)}-D FFT for axes={axes} and extents={extents} based on nvmath-python (non-caching engine):\n{nvtime}\n" -) +print(f"{len(axes)}-D FFT for axes={axes} and extents={extents} based on nvmath-python (non-caching engine):\n{nvtime}\n") # Create a cached FFTN version to use in truncated FFT to create truncated FFTN. with FFTCache() as cache: @@ -36,12 +34,8 @@ r = truncated_fft(a, axes=axes, extents=extents, engine=cached_fftn) - nvtime = benchmark( - truncated_fft, args=(a,), kwargs={"axes": axes, "extents": extents, "engine": cached_fftn}, n_repeat=10 - ) - print( - f"{len(axes)}-D FFT for axes={axes} and extents={extents} based on nvmath-python (caching engine):\n{nvtime}\n" - ) + nvtime = benchmark(truncated_fft, args=(a,), kwargs={"axes": axes, "extents": extents, "engine": cached_fftn}, n_repeat=10) + print(f"{len(axes)}-D FFT for axes={axes} and extents={extents} based on nvmath-python (caching engine):\n{nvtime}\n") cptime = benchmark(cp.fft.fftn, args=(a,), kwargs={"axes": axes, "s": extents}, n_repeat=10) print(f"{len(axes)}-D FFT for axes={axes} and extents={extents} based on CuPy:\n{cptime}\n") diff --git a/examples/fft/example19_convolution_epilog_callback.py b/examples/fft/example19_convolution_epilog_callback.py index a10d805..d5a95d2 100644 --- a/examples/fft/example19_convolution_epilog_callback.py +++ b/examples/fft/example19_convolution_epilog_callback.py @@ -28,20 +28,21 @@ # Define the epilog function for the inverse FFT. def convolve(data_out, offset, data, filter_data, unused): """ - A convolution corresponds to pointwise multiplication in the frequency domain. We also scale by the FFT size N here. 
+ A convolution corresponds to pointwise multiplication in the frequency domain. We also + scale by the FFT size N here. """ # Note we are accessing `data_out` and `filter_data` with a single `offset` integer, - # even though the output and `filter_data` are 2D tensors (batches of samples). - # Care must be taken to assure that both arrays accessed here have the same memory layout. + # even though the output and `filter_data` are 2D tensors (batches of samples). Care + # must be taken to assure that both arrays accessed here have the same memory layout. # For a reference, see the `example19_convolution_callback_memory_layout` example. data_out[offset] = data * filter_data[offset] / N -# Compile the epilog to LTO-IR. -# In a system with GPUs that have different compute capability, the `compute_capability` option must be specified to the -# `compile_prolog` or `compile_epilog` helpers. Alternatively, the epilog can be compiled in the context of the device -# where the FFT to which the epilog is provided is executed. In this case we use the current device context, where the -# operands have been created. +# Compile the epilog to LTO-IR. In a system with GPUs that have different compute +# capability, the `compute_capability` option must be specified to the `compile_prolog` or +# `compile_epilog` helpers. Alternatively, the epilog can be compiled in the context of the +# device where the FFT to which the epilog is provided is executed. In this case we use the +# current device context, where the operands have been created. with cp.cuda.Device(): epilog = nvmath.fft.compile_epilog(convolve, "complex128", "complex128") diff --git a/examples/fft/example19_convolution_memory_layout_callback.py b/examples/fft/example19_convolution_memory_layout_callback.py index 135e148..fa251b9 100644 --- a/examples/fft/example19_convolution_memory_layout_callback.py +++ b/examples/fft/example19_convolution_memory_layout_callback.py @@ -43,11 +43,11 @@ def convolve(data_out, offset, element, filter_data, unused): data_out[offset] = element * filter_data[offset] / (N1 * N2) -# Compile the epilog to LTO-IR. -# In a system with GPUs that have different compute capability, the `compute_capability` option must be specified to the -# `compile_prolog` or `compile_epilog` helpers. Alternatively, the epilog can be compiled in the context of the device -# where the FFT to which the epilog is provided is executed. In this case we use the current device context, where the -# operands have been created. +# Compile the epilog to LTO-IR. In a system with GPUs that have different compute +# capability, the `compute_capability` option must be specified to the `compile_prolog` or +# `compile_epilog` helpers. Alternatively, the epilog can be compiled in the context of the +# device where the FFT to which the epilog is provided is executed. In this case we use the +# current device context, where the operands have been created. with cp.cuda.Device(): epilog = nvmath.fft.compile_epilog(convolve, "complex128", "complex128") diff --git a/examples/fft/example19_convolution_prolog_callback.py b/examples/fft/example19_convolution_prolog_callback.py index 18ad96c..8decd42 100644 --- a/examples/fft/example19_convolution_prolog_callback.py +++ b/examples/fft/example19_convolution_prolog_callback.py @@ -28,20 +28,21 @@ # Define the prolog function for the inverse FFT. def convolve(data_in, offset, filter_data, unused): """ - A convolution corresponds to pointwise multiplication in the frequency domain. We also scale by the FFT size N here. 
+ A convolution corresponds to pointwise multiplication in the frequency domain. We also + scale by the FFT size N here. """ # Note we are accessing `data_out` and `filter_data` with a single `offset` integer, - # even though the input and `filter_data` are 2D tensors (batches of samples). - # Care must be taken to assure that both arrays accessed here have the same memory layout. - # For a reference, see the `example19_convolution_callback_memory_layout` example. + # even though the input and `filter_data` are 2D tensors (batches of samples). Care must + # be taken to assure that both arrays accessed here have the same memory layout. For a + # reference, see the `example19_convolution_callback_memory_layout` example. return data_in[offset] * filter_data[offset] / N -# Compile the prolog to LTO-IR. -# In a system with GPUs that have different compute capability, the `compute_capability` option must be specified to the -# `compile_prolog` or `compile_epilog` helpers. Alternatively, the prolog can be compiled in the context of the device -# where the FFT to which the prolog is provided is executed. In this case we use the current device context, where the -# operands have been created. +# Compile the prolog to LTO-IR. In a system with GPUs that have different compute +# capability, the `compute_capability` option must be specified to the `compile_prolog` or +# `compile_epilog` helpers. Alternatively, the prolog can be compiled in the context of the +# device where the FFT to which the prolog is provided is executed. In this case we use the +# current device context, where the operands have been created. with cp.cuda.Device(): prolog = nvmath.fft.compile_prolog(convolve, "complex128", "complex128") diff --git a/examples/fft/fftn1.py b/examples/fft/fftn1.py index da7f895..8a034c2 100644 --- a/examples/fft/fftn1.py +++ b/examples/fft/fftn1.py @@ -11,10 +11,12 @@ def fftn(a, *, axes=None, direction=None, options=None, prolog=None, epilog=None, stream=None, engine=nvmath.fft.fft): """ - Perform an N-D FFT as a composition of the 1D, 2D, or 3D batched FFTs supported by cuFFT. This is version 1. + Perform an N-D FFT as a composition of the 1D, 2D, or 3D batched FFTs supported by + cuFFT. This is version 1. Args: - engine: a callable to execute the FFT operation. The engine can be `fft` from the nvmath.fft package, or `caching.fft` from the examples. + engine: a callable to execute the FFT operation. The engine can be `fft` from the + nvmath.fft package, or `caching.fft` from the examples. 
""" rank = a.ndim diff --git a/examples/fft/fftn2.py b/examples/fft/fftn2.py index ebe8744..298943a 100644 --- a/examples/fft/fftn2.py +++ b/examples/fft/fftn2.py @@ -2,7 +2,8 @@ # # SPDX-License-Identifier: Apache-2.0 -"""N-D FFT as a composition of the 1D, 2D or 3D batched FFTs with the number of copies minimized.""" +"""N-D FFT as a composition of the 1D, 2D or 3D batched FFTs with the number of copies +minimized.""" __all__ = ["fftn"] @@ -25,11 +26,7 @@ def upto_three_contiguous_axes(ordered_axes, ordered_all_axes): right, e, f = right + 1, e - 1, f - 1 d = max(left, right) - if ( - d == 0 - or (d < 3 and min(left, right) == 0 and len(ordered_axes) > d) - or (d < 3 and d + 1 < len(ordered_axes) - d) - ): + if d == 0 or (d < 3 and min(left, right) == 0 and len(ordered_axes) > d) or (d < 3 and d + 1 < len(ordered_axes) - d): return True, slice(None, -4, -1), slice(-4, None, -1) if left > right: @@ -44,10 +41,12 @@ def upto_three_contiguous_axes(ordered_axes, ordered_all_axes): def fftn(a, *, axes=None, direction=None, options=None, prolog=None, epilog=None, stream=None, engine=nvmath.fft.fft): """ - Perform an N-D FFT as a composition of the 1D, 2D, or 3D batched FFTs supported by cuFFT, minimizing the number of copies needed. This is version 2. + Perform an N-D FFT as a composition of the 1D, 2D, or 3D batched FFTs supported by + cuFFT, minimizing the number of copies needed. This is version 2. Args: - engine: a callable to execute the FFT operation. The engine can be `fft` from the nvmath.fft package, or `caching.fft` from the examples. + engine: a callable to execute the FFT operation. The engine can be `fft` from the + nvmath.fft package, or `caching.fft` from the examples. """ rank = a.ndim diff --git a/examples/fft/truncation.py b/examples/fft/truncation.py index 27c981d..8aeb93d 100644 --- a/examples/fft/truncation.py +++ b/examples/fft/truncation.py @@ -53,13 +53,16 @@ def fft( engine=nvmath.fft.fft, ): """ - This version supports truncation and padding of the operand, to match the functionality of NumPy FFT. + This version supports truncation and padding of the operand, to match the functionality + of NumPy FFT. Args: - extents: An array specifying the truncated or padded extents for the FFT axes. If not specified, the extents of the operand dimensions corresponding - to the FFT axes will be used. - engine: a callable to execute the FFT operation. The engine can be `fft` from the nvmath.fft package, or `caching.fft`, `fftn1.fftn`, `fftn2.fftn` - etc. from the examples. + extents: An array specifying the truncated or padded extents for the FFT axes. If + not specified, the extents of the operand dimensions corresponding to the FFT + axes will be used. + engine: a callable to execute the FFT operation. The engine can be `fft` from the + nvmath.fft package, or `caching.fft`, `fftn1.fftn`, `fftn2.fftn` etc. from the + examples. """ if extents is None: return engine(a, axes=axes, direction=direction, options=options, prolog=prolog, epilog=epilog, stream=stream) @@ -78,7 +81,8 @@ def fft( shape = a.shape if all(shape[axes[i]] == extents[i] for i in range(num_axes)): - # No need to pad or truncate if the transform axes extents already match the extents. + # No need to pad or truncate if the transform axes extents already match the + # extents. return engine(a, axes=axes, direction=direction, options=options, prolog=prolog, epilog=epilog, stream=stream) if all(extents[i] < shape[axes[i]] for i in range(num_axes)): # All axes truncated. 
diff --git a/examples/linalg/advanced/matmul/example01_cupy.py b/examples/linalg/advanced/matmul/example01_cupy.py index b6b86e7..25a4fa0 100644 --- a/examples/linalg/advanced/matmul/example01_cupy.py +++ b/examples/linalg/advanced/matmul/example01_cupy.py @@ -5,8 +5,9 @@ """ This example demonstrates basic matrix multiplication of CuPy arrays. -nvmath-python supports multiple frameworks. The result of each operation is a tensor of the same -framework that was used to pass the inputs. It is also located on the same device as the inputs. +nvmath-python supports multiple frameworks. The result of each operation is a tensor of the +same framework that was used to pass the inputs. It is also located on the same device as +the inputs. """ import cupy as cp @@ -21,7 +22,8 @@ # Perform the multiplication. result = nvmath.linalg.advanced.matmul(a, b) -# Synchronize the default stream, since by default the execution is non-blocking for GPU operands. +# Synchronize the default stream, since by default the execution is non-blocking for GPU +# operands. cp.cuda.get_current_stream().synchronize() # Check if the result is cupy array as well. diff --git a/examples/linalg/advanced/matmul/example01_cupy_complex64.py b/examples/linalg/advanced/matmul/example01_cupy_complex64.py index 5ac8e3d..c52ff53 100644 --- a/examples/linalg/advanced/matmul/example01_cupy_complex64.py +++ b/examples/linalg/advanced/matmul/example01_cupy_complex64.py @@ -5,8 +5,9 @@ """ This example demonstrates basic matrix multiplication using CuPy arrays. -nvmath-python supports multiple frameworks. The result of each operation is a tensor of the same -framework that was used to pass the inputs. It is also located on the same device as the inputs. +nvmath-python supports multiple frameworks. The result of each operation is a tensor of the +same framework that was used to pass the inputs. It is also located on the same device as +the inputs. """ import cupy as cp @@ -21,7 +22,8 @@ # Perform the multiplication. result = nvmath.linalg.advanced.matmul(a, b) -# Synchronize the default stream, since by default the execution is non-blocking for GPU operands. +# Synchronize the default stream, since by default the execution is non-blocking for GPU +# operands. cp.cuda.get_current_stream().synchronize() # Check if the result is cupy array as well. diff --git a/examples/linalg/advanced/matmul/example01_numpy.py b/examples/linalg/advanced/matmul/example01_numpy.py index 27c4a9d..258b744 100644 --- a/examples/linalg/advanced/matmul/example01_numpy.py +++ b/examples/linalg/advanced/matmul/example01_numpy.py @@ -5,8 +5,9 @@ """ This example demonstrates basic matrix multiplication of numpy arrays. -nvmath-python supports multiple frameworks. The result of each operation is a tensor of the same -framework that was used to pass the inputs. It is also located on the CPU like the inputs. +nvmath-python supports multiple frameworks. The result of each operation is a tensor of the +same framework that was used to pass the inputs. It is also located on the CPU like the +inputs. """ import numpy as np diff --git a/examples/linalg/advanced/matmul/example01_torch.py b/examples/linalg/advanced/matmul/example01_torch.py index 6b44183..d6151cc 100644 --- a/examples/linalg/advanced/matmul/example01_torch.py +++ b/examples/linalg/advanced/matmul/example01_torch.py @@ -5,8 +5,9 @@ """ This example demonstrates basic matrix multiplication of torch tensors. -nvmath-python supports multiple frameworks. 
The result of each operation is a tensor of the same -framework that was used to pass the inputs. It is also located on the same device as the inputs. +nvmath-python supports multiple frameworks. The result of each operation is a tensor of the +same framework that was used to pass the inputs. It is also located on the same device as +the inputs. """ import torch @@ -34,7 +35,8 @@ print("\nRunning the multiplication on GPU tensors...") result = nvmath.linalg.advanced.matmul(a_gpu, b_gpu) -# Synchronize the default stream, since by default the execution is non-blocking for GPU operands. +# Synchronize the default stream, since by default the execution is non-blocking for GPU +# operands. torch.cuda.default_stream().synchronize() print(f"Inputs were of types {type(a_gpu)} and {type(b_gpu)} and the result is of type {type(result)}.") diff --git a/examples/linalg/advanced/matmul/example02_options.py b/examples/linalg/advanced/matmul/example02_options.py index 218def5..395a595 100644 --- a/examples/linalg/advanced/matmul/example02_options.py +++ b/examples/linalg/advanced/matmul/example02_options.py @@ -5,7 +5,8 @@ """ This example illustrates how to specify options to a matrix multiplication operation. -In this example, we will use NumPy ndarrays as input, and we will look at two equivalent ways to specify the compute type. +In this example, we will use NumPy ndarrays as input, and we will look at two equivalent +ways to specify the compute type. """ import numpy as np @@ -17,8 +18,9 @@ a = np.random.rand(m, k).astype(np.float32) b = np.random.rand(k, n).astype(np.float32) -# Here we'd like to use COMPUTE_32F_FAST_TF32 for the compute type, and we show two alternatives for doing so. -# Tip: use help(nvmath.linalg.advanced.MatmulComputeType) to see available compute types. +# Here we'd like to use COMPUTE_32F_FAST_TF32 for the compute type, and we show two +# alternatives for doing so. Tip: use help(nvmath.linalg.advanced.MatmulComputeType) to see +# available compute types. compute_type = nvmath.linalg.advanced.MatmulComputeType.COMPUTE_32F_FAST_TF32 # Alternative #1 for specifying options, using a dataclass. @@ -26,7 +28,8 @@ options = nvmath.linalg.advanced.MatmulOptions(compute_type=compute_type) result = nvmath.linalg.advanced.matmul(a, b, options=options) -# Alternative #2 for specifying options, using dict. The two alternatives are entirely equivalent. +# Alternative #2 for specifying options, using dict. The two alternatives are entirely +# equivalent. result = nvmath.linalg.advanced.matmul(a, b, options={"compute_type": compute_type}) # No synchronization is needed for CPU tensors, since the execution always blocks. diff --git a/examples/linalg/advanced/matmul/example03_logging_global.py b/examples/linalg/advanced/matmul/example03_logging_global.py index d139e8b..c7d995d 100644 --- a/examples/linalg/advanced/matmul/example03_logging_global.py +++ b/examples/linalg/advanced/matmul/example03_logging_global.py @@ -10,7 +10,8 @@ import nvmath -# Turn on logging. Here we use the global logger, set the level to "debug", and use a custom format for the log. +# Turn on logging. Here we use the global logger, set the level to "debug", and use a custom +# format for the log. import logging logging.basicConfig(level=logging.DEBUG, format="%(asctime)s %(levelname)-8s %(message)s", datefmt="%m-%d %H:%M:%S") @@ -24,5 +25,6 @@ # Perform the GEMM. result = nvmath.linalg.advanced.matmul(a, b, alpha=alpha) -# Synchronize the default stream, since by default the execution is non-blocking for GPU operands. 
+# Synchronize the default stream, since by default the execution is non-blocking for GPU +# operands. cp.cuda.get_current_stream().synchronize() diff --git a/examples/linalg/advanced/matmul/example03_logging_user.py b/examples/linalg/advanced/matmul/example03_logging_user.py index 8136a0b..7a30b3b 100644 --- a/examples/linalg/advanced/matmul/example03_logging_user.py +++ b/examples/linalg/advanced/matmul/example03_logging_user.py @@ -25,7 +25,8 @@ formatter = logging.Formatter("%(asctime)s %(name)-12s %(levelname)-8s %(message)s", datefmt="%m-%d %H:%M:%S") handler.setFormatter(formatter) -# Associate handler with logger, resulting in a logger with the desired level, format, and console output. +# Associate handler with logger, resulting in a logger with the desired level, format, and +# console output. logger.addHandler(handler) # Prepare sample input data. @@ -41,9 +42,10 @@ print("---") -# Recall that the options can also be provided as a dict, so the following is an alternative, entirely -# equivalent way to specify options. +# Recall that the options can also be provided as a dict, so the following is an +# alternative, entirely equivalent way to specify options. result = nvmath.linalg.advanced.matmul(a, b, alpha=alpha, options={"logger": logger}) -# Synchronize the default stream, since by default the execution is non-blocking for GPU operands. +# Synchronize the default stream, since by default the execution is non-blocking for GPU +# operands. cp.cuda.get_current_stream().synchronize() diff --git a/examples/linalg/advanced/matmul/example04_stateful_cupy.py b/examples/linalg/advanced/matmul/example04_stateful_cupy.py index a43b7a1..a39ec0b 100644 --- a/examples/linalg/advanced/matmul/example04_stateful_cupy.py +++ b/examples/linalg/advanced/matmul/example04_stateful_cupy.py @@ -3,7 +3,8 @@ # SPDX-License-Identifier: Apache-2.0 """ -This example illustrates the use of stateful matrix multiplication objects. Stateful objects amortize the cost of preparation across multiple executions. +This example illustrates the use of stateful matrix multiplication objects. Stateful objects +amortize the cost of preparation across multiple executions. The inputs as well as the result are CuPy ndarrays. """ @@ -19,13 +20,15 @@ # Use the stateful object as a context manager to automatically release resources. with nvmath.linalg.advanced.Matmul(a, b) as mm: - # Plan the matrix multiplication. Planning returns a sequence of algorithms that can be configured as we'll see in a later example. + # Plan the matrix multiplication. Planning returns a sequence of algorithms that can be + # configured as we'll see in a later example. mm.plan() # Execute the matrix multiplication. result = mm.execute() - # Synchronize the default stream, since by default the execution is non-blocking for GPU operands. + # Synchronize the default stream, since by default the execution is non-blocking for GPU + # operands. cp.cuda.get_current_stream().synchronize() print(f"Input types = {type(a), type(b)}, device = {a.device, b.device}") print(f"Result type = {type(result)}, device = {result.device}") diff --git a/examples/linalg/advanced/matmul/example04_stateful_torch.py b/examples/linalg/advanced/matmul/example04_stateful_torch.py index 4b2944a..87e6917 100644 --- a/examples/linalg/advanced/matmul/example04_stateful_torch.py +++ b/examples/linalg/advanced/matmul/example04_stateful_torch.py @@ -3,7 +3,8 @@ # SPDX-License-Identifier: Apache-2.0 """ -This example illustrates the use of stateful matrix multiplication objects. 
Stateful objects amortize the cost of preparation across multiple executions. +This example illustrates the use of stateful matrix multiplication objects. Stateful objects +amortize the cost of preparation across multiple executions. The inputs as well as the result are PyTorch tensors on the GPU. """ @@ -20,13 +21,15 @@ # Use the stateful object as a context manager to automatically release resources. with nvmath.linalg.advanced.Matmul(a, b) as mm: - # Plan the matrix multiplication. Planning returns a sequence of algorithms that can be configured, as we'll see in a later example. + # Plan the matrix multiplication. Planning returns a sequence of algorithms that can be + # configured, as we'll see in a later example. mm.plan() # Execute the matrix multiplication. result = mm.execute() - # Synchronize the default stream, since by default the execution is non-blocking for GPU operands. + # Synchronize the default stream, since by default the execution is non-blocking for GPU + # operands. torch.cuda.default_stream().synchronize() print(f"Input types = {type(a), type(b)}, device = {a.device, b.device}") print(f"Result type = {type(result)}, device = {result.device}") diff --git a/examples/linalg/advanced/matmul/example04_stateful_torch_cpu.py b/examples/linalg/advanced/matmul/example04_stateful_torch_cpu.py index 11639b5..3ad4aa0 100644 --- a/examples/linalg/advanced/matmul/example04_stateful_torch_cpu.py +++ b/examples/linalg/advanced/matmul/example04_stateful_torch_cpu.py @@ -3,7 +3,8 @@ # SPDX-License-Identifier: Apache-2.0 """ -This example illustrates the use of stateful matrix multiplication objects. Stateful objects amortize the cost of preparation across multiple executions. +This example illustrates the use of stateful matrix multiplication objects. Stateful objects +amortize the cost of preparation across multiple executions. The inputs as well as the result are PyTorch tensors on the CPU. """ @@ -19,7 +20,8 @@ # Use the stateful object as a context manager to automatically release resources. with nvmath.linalg.advanced.Matmul(a, b) as mm: - # Plan the matrix multiplication. Planning returns a sequence of algorithms that can be configured, as we'll see in a later example. + # Plan the matrix multiplication. Planning returns a sequence of algorithms that can be + # configured, as we'll see in a later example. mm.plan() # Execute the matrix multiplication. diff --git a/examples/linalg/advanced/matmul/example05_stateful_inplace.py b/examples/linalg/advanced/matmul/example05_stateful_inplace.py index 8740847..5d66faa 100644 --- a/examples/linalg/advanced/matmul/example05_stateful_inplace.py +++ b/examples/linalg/advanced/matmul/example05_stateful_inplace.py @@ -3,11 +3,14 @@ # SPDX-License-Identifier: Apache-2.0 """ -This example illustrates the use of inplace update of input operands in stateful matrix multiplication APIs. +This example illustrates the use of inplace update of input operands in stateful matrix +multiplication APIs. The inputs as well as the result are CuPy ndarrays. -NOTE: The operands should be updated inplace only when they are in a memory space that is accessible from the execution space. -In this case, the operands reside on the GPU while the execution also happens on the GPU. + +NOTE: The operands should be updated inplace only when they are in a memory space that is +accessible from the execution space. In this case, the operands reside on the GPU while the +execution also happens on the GPU. 
""" import cupy as cp @@ -26,7 +29,8 @@ # Use the stateful object as a context manager to automatically release resources. with nvmath.linalg.advanced.Matmul(a, b) as mm: - # Plan the matrix multiplication. Planning returns a sequence of algorithms that can be configured as we'll see in a later example. + # Plan the matrix multiplication. Planning returns a sequence of algorithms that can be + # configured as we'll see in a later example. mm.plan() # Execute the matrix multiplication. @@ -39,7 +43,8 @@ # Execute the new matrix multiplication. result = mm.execute() - # Synchronize the default stream, since by default the execution is non-blocking for GPU operands. + # Synchronize the default stream, since by default the execution is non-blocking for GPU + # operands. cp.cuda.get_current_stream().synchronize() print(f"Input types = {type(a), type(b)}, device = {a.device, b.device}") print(f"Result type = {type(result)}, device = {result.device}") diff --git a/examples/linalg/advanced/matmul/example05_stateful_reset.py b/examples/linalg/advanced/matmul/example05_stateful_reset.py index ee67d1e..3f838ee 100644 --- a/examples/linalg/advanced/matmul/example05_stateful_reset.py +++ b/examples/linalg/advanced/matmul/example05_stateful_reset.py @@ -3,9 +3,10 @@ # SPDX-License-Identifier: Apache-2.0 """ -This example illustrates how to reset operands in stateful matrix multiplication APIs, and reuse the object for multiple executions. This is -needed when the memory space of the operands is not accessible from the execution space, or if it's desired to bind new (compatible) operands -to the stateful object. +This example illustrates how to reset operands in stateful matrix multiplication APIs, and +reuse the object for multiple executions. This is needed when the memory space of the +operands is not accessible from the execution space, or if it's desired to bind new +(compatible) operands to the stateful object. The inputs as well as the result are NumPy ndarrays. """ @@ -27,7 +28,8 @@ # Use the stateful object as a context manager to automatically release resources. with nvmath.linalg.advanced.Matmul(a, b) as mm: - # Plan the matrix multiplication. Planning returns a sequence of algorithms that can be configured as we'll see in a later example. + # Plan the matrix multiplication. Planning returns a sequence of algorithms that can be + # configured as we'll see in a later example. mm.plan() # Execute the matrix multiplication. diff --git a/examples/linalg/advanced/matmul/example06_gemm.py b/examples/linalg/advanced/matmul/example06_gemm.py index 32fc1bb..de60b9a 100644 --- a/examples/linalg/advanced/matmul/example06_gemm.py +++ b/examples/linalg/advanced/matmul/example06_gemm.py @@ -25,5 +25,6 @@ # Perform the GEMM. result = nvmath.linalg.advanced.matmul(a, b, c=c, alpha=alpha, beta=beta) -# Synchronize the default stream, since by default the execution is non-blocking for GPU operands. +# Synchronize the default stream, since by default the execution is non-blocking for GPU +# operands. cp.cuda.get_current_stream().synchronize() diff --git a/examples/linalg/advanced/matmul/example07_batched_a.py b/examples/linalg/advanced/matmul/example07_batched_a.py index a6626ac..6404851 100644 --- a/examples/linalg/advanced/matmul/example07_batched_a.py +++ b/examples/linalg/advanced/matmul/example07_batched_a.py @@ -5,9 +5,9 @@ """ This example demonstrates nvmath's capability to execute batched multiplications. 
-Executing multiple multiplications together (in a batch) yields better performance than executing -them separately. nvmath supports broadcasting, so if one of the inputs is batched and the other one -is not, it will be broadcasted to match the batch size. +Executing multiple multiplications together (in a batch) yields better performance than +executing them separately. nvmath supports broadcasting, so if one of the inputs is batched +and the other one is not, it will be broadcasted to match the batch size. In this example we will multiply each of our `a` matrices with the same `b` matrix. """ @@ -26,7 +26,8 @@ # Execute the multiplication. result = nvmath.linalg.advanced.matmul(a_batch, b) -# Synchronize the default stream, since by default the execution is non-blocking for GPU operands. +# Synchronize the default stream, since by default the execution is non-blocking for GPU +# operands. cp.cuda.get_current_stream().synchronize() print(f"Input types = {type(a_batch), type(b)}, device = {a_batch.device, b.device}") print(f"Result type = {type(result)}, device = {result.device}") diff --git a/examples/linalg/advanced/matmul/example07_batched_a_b.py b/examples/linalg/advanced/matmul/example07_batched_a_b.py index 09a4887..176cd02 100644 --- a/examples/linalg/advanced/matmul/example07_batched_a_b.py +++ b/examples/linalg/advanced/matmul/example07_batched_a_b.py @@ -5,8 +5,8 @@ """ This example demonstrates nvmath-python's capability to execute batched multiplications. -Executing multiple multiplications together (in a batch) yields better performance than executing -them separately. +Executing multiple multiplications together (in a batch) yields better performance than +executing them separately. In this example we will multiply each of our `a` matrices with the corresponding `b` matrix. """ @@ -30,7 +30,8 @@ # Execute the multiplication. result = nvmath.linalg.advanced.matmul(a_batch, b_batch) -# Synchronize the default stream, since by default the execution is non-blocking for GPU operands. +# Synchronize the default stream, since by default the execution is non-blocking for GPU +# operands. cp.cuda.get_current_stream().synchronize() print(f"Input types = {type(a_batch), type(b_batch)}, device = {a_batch.device, b_batch.device}") print(f"Result type = {type(result)}, device = {result.device}") diff --git a/examples/linalg/advanced/matmul/example08_batched_a_bcast_c.py b/examples/linalg/advanced/matmul/example08_batched_a_bcast_c.py index 9710d99..ff63ca5 100644 --- a/examples/linalg/advanced/matmul/example08_batched_a_bcast_c.py +++ b/examples/linalg/advanced/matmul/example08_batched_a_bcast_c.py @@ -5,13 +5,13 @@ """ This example demonstrates nvmath's capability to execute batched multiplications. -Executing multiple multiplications together (in a batch) yields better performance than executing -them separately. nvmath supports broadcasting, so if one of the inputs is batched and the other one -is not, it will be broadcasted to match the batch size. +Executing multiple multiplications together (in a batch) yields better performance than +executing them separately. nvmath supports broadcasting, so if one of the inputs is batched +and the other one is not, it will be broadcasted to match the batch size. -In this example we will multiply each of our `a` matrices with the same `b` matrix, and add the same -`c` vector to the result. Since `c` is a vector, it's also broadcast across the columns of the each -result in the batch. 
+In this example we will multiply each of our `a` matrices with the same `b` matrix, and add +the same `c` vector to the result. Since `c` is a vector, it's also broadcast across the +columns of the each result in the batch. """ import cupy as cp @@ -30,7 +30,8 @@ # Execute the multiplication. result = nvmath.linalg.advanced.matmul(a_batch, b, c, beta=beta) -# Synchronize the default stream, since by default the execution is non-blocking for GPU operands. +# Synchronize the default stream, since by default the execution is non-blocking for GPU +# operands. cp.cuda.get_current_stream().synchronize() print(f"Input types = {type(a_batch), type(b)}, device = {a_batch.device, b.device}") print(f"Result type = {type(result)}, device = {result.device}") diff --git a/examples/linalg/advanced/matmul/example08_batched_a_c.py b/examples/linalg/advanced/matmul/example08_batched_a_c.py index 6144439..9307946 100644 --- a/examples/linalg/advanced/matmul/example08_batched_a_c.py +++ b/examples/linalg/advanced/matmul/example08_batched_a_c.py @@ -5,13 +5,13 @@ """ This example demonstrates nvmath's capability to execute batched multiplications. -Executing multiple multiplications together (in a batch) yields better performance than executing -them separately. nvmath supports broadcasting, so if one of the inputs is batched and the other one -is not, it will be broadcasted to match the batch size. +Executing multiple multiplications together (in a batch) yields better performance than +executing them separately. nvmath supports broadcasting, so if one of the inputs is batched +and the other one is not, it will be broadcasted to match the batch size. -In this example we will multiply each of our `a` matrices with the same `b` matrix, and add each of -the `c` (m, 1) matrices to the result. Since each `c` is a column matrix, it's also broadcast across -the columns of each result in the batch. +In this example we will multiply each of our `a` matrices with the same `b` matrix, and add +each of the `c` (m, 1) matrices to the result. Since each `c` is a column matrix, it's also +broadcast across the columns of each result in the batch. """ import cupy as cp @@ -30,7 +30,8 @@ # Execute the multiplication. result = nvmath.linalg.advanced.matmul(a_batch, b, c, beta=beta) -# Synchronize the default stream, since by default the execution is non-blocking for GPU operands. +# Synchronize the default stream, since by default the execution is non-blocking for GPU +# operands. cp.cuda.get_current_stream().synchronize() print(f"Input types = {type(a_batch), type(b)}, device = {a_batch.device, b.device}") print(f"Result type = {type(result)}, device = {result.device}") diff --git a/examples/linalg/advanced/matmul/example09_epilog_bias.py b/examples/linalg/advanced/matmul/example09_epilog_bias.py index 8cdfc7d..e3cab2a 100644 --- a/examples/linalg/advanced/matmul/example09_epilog_bias.py +++ b/examples/linalg/advanced/matmul/example09_epilog_bias.py @@ -5,8 +5,8 @@ """ This example demonstrates usage of epilogs. -Epilogs allow you to execute extra computations after the matrix multiplication in a single fused kernel. -In this example we'll use the BIAS epilog, which adds bias to the result. +Epilogs allow you to execute extra computations after the matrix multiplication in a single +fused kernel. In this example we'll use the BIAS epilog, which adds bias to the result. 
""" import cupy as cp @@ -23,8 +23,7 @@ epilog = nvmath.linalg.advanced.MatmulEpilog.BIAS result = nvmath.linalg.advanced.matmul(a, b, epilog=epilog, epilog_inputs={"bias": bias}) -# Synchronize the default stream, since by default the execution is non-blocking for GPU operands. +# Synchronize the default stream, since by default the execution is non-blocking for GPU +# operands. cp.cuda.get_current_stream().synchronize() -print( - f"Inputs were of types {type(a)} and {type(b)}, the bias type is {type(bias)}, and the result is of type {type(result)}." -) +print(f"Inputs were of types {type(a)} and {type(b)}, the bias type is {type(bias)}, and the result is of type {type(result)}.") diff --git a/examples/linalg/advanced/matmul/example09_epilog_gelu_bias.py b/examples/linalg/advanced/matmul/example09_epilog_gelu_bias.py index 392c83a..fd356cf 100644 --- a/examples/linalg/advanced/matmul/example09_epilog_gelu_bias.py +++ b/examples/linalg/advanced/matmul/example09_epilog_gelu_bias.py @@ -5,8 +5,9 @@ """ This example demonstrates usage of epilogs. -Epilogs allow you to execute extra computations after the matrix multiplication in a single fused kernel. -In this example we'll use the GELU_BIAS epilog, which adds bias to the result and applies the GELU function. +Epilogs allow you to execute extra computations after the matrix multiplication in a single +fused kernel. In this example we'll use the GELU_BIAS epilog, which adds bias to the result +and applies the GELU function. """ import cupy as cp @@ -23,8 +24,7 @@ epilog = nvmath.linalg.advanced.MatmulEpilog.GELU_BIAS result = nvmath.linalg.advanced.matmul(a, b, epilog=epilog, epilog_inputs={"bias": bias}) -# Synchronize the default stream, since by default the execution is non-blocking for GPU operands. +# Synchronize the default stream, since by default the execution is non-blocking for GPU +# operands. cp.cuda.get_current_stream().synchronize() -print( - f"Inputs were of types {type(a)} and {type(b)}, the bias type is {type(bias)}, and the result is of type {type(result)}." -) +print(f"Inputs were of types {type(a)} and {type(b)}, the bias type is {type(bias)}, and the result is of type {type(result)}.") diff --git a/examples/linalg/advanced/matmul/example10_epilog_dgelu.py b/examples/linalg/advanced/matmul/example10_epilog_dgelu.py index 2d44e29..5d2a2f0 100644 --- a/examples/linalg/advanced/matmul/example10_epilog_dgelu.py +++ b/examples/linalg/advanced/matmul/example10_epilog_dgelu.py @@ -3,14 +3,16 @@ # SPDX-License-Identifier: Apache-2.0 """ -This example demonstrates usage of epilogs that use auxiliary output generated from a previous matmul operation. +This example demonstrates usage of epilogs that use auxiliary output generated from a +previous matmul operation. -In this example we'll use the GELU_AUX epilog, which generates an extra output "gelu_aux". The auxiliary output -in this case represent the input to the GELU function. In general, auxiliary output should be considered opaque, -and is meant to be generated by one matmul operation and used in another with a compatible epilog. +In this example we'll use the GELU_AUX epilog, which generates an extra output "gelu_aux". +The auxiliary output in this case represent the input to the GELU function. In general, +auxiliary output should be considered opaque, and is meant to be generated by one matmul +operation and used in another with a compatible epilog. 
-Here we generate the auxiliary output in a forward pass using GELU, and provide it as epilog input in the corresponding -backward pass using the DGELU epilog. +Here we generate the auxiliary output in a forward pass using GELU, and provide it as epilog +input in the corresponding backward pass using the DGELU epilog. """ import cupy as cp @@ -30,8 +32,10 @@ epilog = nvmath.linalg.advanced.MatmulEpilog.DGELU result = nvmath.linalg.advanced.matmul(a, b, epilog=epilog, epilog_inputs=auxiliary) -# Synchronize the default stream, since by default the execution is non-blocking for GPU operands. +# Synchronize the default stream, since by default the execution is non-blocking for GPU +# operands. cp.cuda.get_current_stream().synchronize() print( - f"Inputs were of types {type(a)} and {type(b)}, and the result type is {type(result)}, and the auxiliary output is of type {type(auxiliary)}." + f"Inputs were of types {type(a)} and {type(b)}, and the result type is {type(result)}, " + f"and the auxiliary output is of type {type(auxiliary)}." ) diff --git a/examples/linalg/advanced/matmul/example10_epilog_drelu.py b/examples/linalg/advanced/matmul/example10_epilog_drelu.py index 4413f8d..c86f8c0 100644 --- a/examples/linalg/advanced/matmul/example10_epilog_drelu.py +++ b/examples/linalg/advanced/matmul/example10_epilog_drelu.py @@ -3,15 +3,16 @@ # SPDX-License-Identifier: Apache-2.0 """ -This example demonstrates usage of epilogs that use auxiliary output generated from a previous matmul operation. +This example demonstrates usage of epilogs that use auxiliary output generated from a +previous matmul operation. -In this example we'll use the RELU_AUX epilog, which generates an extra output "relu_aux". The auxiliary output -in this case represent bitflags marking where the input to the RELU function is positive. In general, auxiliary -output should be considered opaque, and is meant to be generated by one matmul operation and used in another with -a compatible epilog. +In this example we'll use the RELU_AUX epilog, which generates an extra output "relu_aux". +The auxiliary output in this case represent bitflags marking where the input to the RELU +function is positive. In general, auxiliary output should be considered opaque, and is meant +to be generated by one matmul operation and used in another with a compatible epilog. -Here we generate the auxiliary output in a forward pass using RELU, and provide it as epilog input in the corresponding -backward pass using the DRELU epilog. +Here we generate the auxiliary output in a forward pass using RELU, and provide it as epilog +input in the corresponding backward pass using the DRELU epilog. """ import cupy as cp @@ -31,8 +32,10 @@ epilog = nvmath.linalg.advanced.MatmulEpilog.DRELU result = nvmath.linalg.advanced.matmul(a, b, epilog=epilog, epilog_inputs=auxiliary) -# Synchronize the default stream, since by default the execution is non-blocking for GPU operands. +# Synchronize the default stream, since by default the execution is non-blocking for GPU +# operands. cp.cuda.get_current_stream().synchronize() print( - f"Inputs were of types {type(a)} and {type(b)}, and the result type is {type(result)}, and the auxiliary output is of type {type(auxiliary)}." + f"Inputs were of types {type(a)} and {type(b)}, and the result type is {type(result)}, " + f"and the auxiliary output is of type {type(auxiliary)}." 
)
diff --git a/examples/linalg/advanced/matmul/example10_epilog_relu_aux.py b/examples/linalg/advanced/matmul/example10_epilog_relu_aux.py
index 5643ff4..3e8879f 100644
--- a/examples/linalg/advanced/matmul/example10_epilog_relu_aux.py
+++ b/examples/linalg/advanced/matmul/example10_epilog_relu_aux.py
@@ -5,9 +5,10 @@
 """
 This example demonstrates usage of epilogs that generate auxiliary output.
 
-Epilogs allow you to execute extra computations after the matrix multiplication in a single fused kernel.
-In this example we'll use the RELU_AUX epilog, which generates an extra output "relu_aux". We will see
-in a later example how to use the auxiliary output as input to other epilogs like DRELU.
+Epilogs allow you to execute extra computations after the matrix multiplication in a single
+fused kernel. In this example we'll use the RELU_AUX epilog, which generates an extra output
+"relu_aux". We will see in a later example how to use the auxiliary output as input to other
+epilogs like DRELU.
 """
 
 import cupy as cp
@@ -23,8 +24,10 @@
 epilog = nvmath.linalg.advanced.MatmulEpilog.RELU_AUX
 result, auxiliary = nvmath.linalg.advanced.matmul(a, b, epilog=epilog)
 
-# Synchronize the default stream, since by default the execution is non-blocking for GPU operands.
+# Synchronize the default stream, since by default the execution is non-blocking for GPU
+# operands.
 cp.cuda.get_current_stream().synchronize()
 
 print(
-    f"Inputs were of types {type(a)} and {type(b)}, and the result type is {type(result)}, and the auxiliary output is of type {type(auxiliary)}."
+    f"Inputs were of types {type(a)} and {type(b)}, and the result type is {type(result)}, "
+    f"and the auxiliary output is of type {type(auxiliary)}."
 )
diff --git a/examples/linalg/advanced/matmul/example11_epilog_drelu_bgrad.py b/examples/linalg/advanced/matmul/example11_epilog_drelu_bgrad.py
index 6b032f1..d002697 100644
--- a/examples/linalg/advanced/matmul/example11_epilog_drelu_bgrad.py
+++ b/examples/linalg/advanced/matmul/example11_epilog_drelu_bgrad.py
@@ -3,16 +3,18 @@
 # SPDX-License-Identifier: Apache-2.0
 
 """
-This example demonstrates usage of epilogs that use auxiliary output generated from a previous matmul operation.
-
-In this example we'll use the RELU_AUX_BIAS_BIAS epilog, which generates an extra output "relu_aux". The auxiliary output
-in this case represent bitflags marking where the input to the RELU function is positive. In general, auxiliary
-output should be considered opaque, and is meant to be generated by one matmul operation and used in another with
-a compatible epilog.
-
-Here we generate the auxiliary output in a forward pass using RELU, and provide it as epilog input in the corresponding
-backward pass using the DRELU_BGRAD epilog. This epilog also generates an auxiliary output corresponding to the bias
-gradient.
+This example demonstrates usage of epilogs that use auxiliary output generated from a
+previous matmul operation.
+
+In this example we'll use the RELU_AUX_BIAS epilog, which generates an extra output
+"relu_aux". The auxiliary output in this case represents bitflags marking where the input to
+the RELU function is positive. In general, auxiliary output should be considered opaque, and
+is meant to be generated by one matmul operation and used in another with a compatible
+epilog.
+
+Here we generate the auxiliary output in a forward pass using RELU, and provide it as epilog
+input in the corresponding backward pass using the DRELU_BGRAD epilog.
This epilog also +generates an auxiliary output corresponding to the bias gradient. """ import cupy as cp @@ -34,13 +36,16 @@ epilog = nvmath.linalg.advanced.MatmulEpilog.RELU_AUX_BIAS result, auxiliary = nvmath.linalg.advanced.matmul(a, b, epilog=epilog, epilog_inputs={"bias": bias}) -# In the backward pass using DRELU_BGRAD epilog, provide the auxiliary output "relu_aux" from the previous matmul as epilog inputs. -# The auxiliary output "auxiliary" in the current matmul is a dict containing the bias gradient with the key "bgrad". +# In the backward pass using DRELU_BGRAD epilog, provide the auxiliary output "relu_aux" +# from the previous matmul as epilog inputs. The auxiliary output "auxiliary" in the current +# matmul is a dict containing the bias gradient with the key "bgrad". epilog = nvmath.linalg.advanced.MatmulEpilog.DRELU_BGRAD result, auxiliary = nvmath.linalg.advanced.matmul(a, b, epilog=epilog, epilog_inputs=auxiliary) -# Synchronize the default stream, since by default the execution is non-blocking for GPU operands. +# Synchronize the default stream, since by default the execution is non-blocking for GPU +# operands. cp.cuda.get_current_stream().synchronize() print( - f"Inputs were of types {type(a)} and {type(b)}, and the result type is {type(result)}, and the auxiliary output is of type {type(auxiliary)}." + f"Inputs were of types {type(a)} and {type(b)}, and the result type is {type(result)}, " + f"and the auxiliary output is of type {type(auxiliary)}." ) diff --git a/examples/linalg/advanced/matmul/example12_epilog_bgrada.py b/examples/linalg/advanced/matmul/example12_epilog_bgrada.py index 5a33de4..c13ed63 100644 --- a/examples/linalg/advanced/matmul/example12_epilog_bgrada.py +++ b/examples/linalg/advanced/matmul/example12_epilog_bgrada.py @@ -5,9 +5,9 @@ """ This example demonstrates usage of epilogs. -Epilogs allow you to execute extra computations after the matrix multiplication in a single fused kernel. -In this example we'll use the BGRADA epilog, which generates an extra output "bgrada" corresponding to the -reduction of the A matrix. +Epilogs allow you to execute extra computations after the matrix multiplication in a single +fused kernel. In this example we'll use the BGRADA epilog, which generates an extra output +"bgrada" corresponding to the reduction of the A matrix. """ import cupy as cp @@ -19,13 +19,15 @@ a = cp.random.rand(k, m).T # Currently, it's required that 'a' is in column-major layout. b = cp.random.rand(k, n) -# Perform the multiplication with BGRADA epilog. -# The auxiliary output "auxiliary" is a dict containing the bias gradient with the key "bgrada". +# Perform the multiplication with BGRADA epilog. The auxiliary output "auxiliary" is a dict +# containing the bias gradient with the key "bgrada". epilog = nvmath.linalg.advanced.MatmulEpilog.BGRADA result, auxiliary = nvmath.linalg.advanced.matmul(a, b, epilog=epilog) -# Synchronize the default stream, since by default the execution is non-blocking for GPU operands. +# Synchronize the default stream, since by default the execution is non-blocking for GPU +# operands. cp.cuda.get_current_stream().synchronize() print( - f"Inputs were of types {type(a)} and {type(b)}, and the result type is {type(result)}, and the auxiliary output is of type {type(auxiliary)}." + f"Inputs were of types {type(a)} and {type(b)}, and the result type is {type(result)}, " + f"and the auxiliary output is of type {type(auxiliary)}." 
) diff --git a/examples/linalg/advanced/matmul/example12_epilog_bgradb.py b/examples/linalg/advanced/matmul/example12_epilog_bgradb.py index b5464fb..51e1b41 100644 --- a/examples/linalg/advanced/matmul/example12_epilog_bgradb.py +++ b/examples/linalg/advanced/matmul/example12_epilog_bgradb.py @@ -5,9 +5,9 @@ """ This example demonstrates usage of epilogs. -Epilogs allow you to execute extra computations after the matrix multiplication in a single fused kernel. -In this example we'll use the BGRADB epilog, which generates an extra output "bgradb" corresponding to the -reduction of the B matrix. +Epilogs allow you to execute extra computations after the matrix multiplication in a single +fused kernel. In this example we'll use the BGRADB epilog, which generates an extra output +"bgradb" corresponding to the reduction of the B matrix. """ import cupy as cp @@ -19,13 +19,15 @@ a = cp.random.rand(m, k) b = cp.random.rand(k, n) -# Perform the multiplication with BGRADB epilog. -# The auxiliary output "auxiliary" is a dict containing the bias gradient with the key "bgradb". +# Perform the multiplication with BGRADB epilog. The auxiliary output "auxiliary" is a dict +# containing the bias gradient with the key "bgradb". epilog = nvmath.linalg.advanced.MatmulEpilog.BGRADB result, auxiliary = nvmath.linalg.advanced.matmul(a, b, epilog=epilog) -# Synchronize the default stream, since by default the execution is non-blocking for GPU operands. +# Synchronize the default stream, since by default the execution is non-blocking for GPU +# operands. cp.cuda.get_current_stream().synchronize() print( - f"Inputs were of types {type(a)} and {type(b)}, and the result type is {type(result)}, and the auxiliary output is of type {type(auxiliary)}." + f"Inputs were of types {type(a)} and {type(b)}, and the result type is {type(result)}, " + f"and the auxiliary output is of type {type(auxiliary)}." ) diff --git a/examples/linalg/advanced/matmul/example13_epilog_stateful_reset.py b/examples/linalg/advanced/matmul/example13_epilog_stateful_reset.py index 9562068..87cf5bd 100644 --- a/examples/linalg/advanced/matmul/example13_epilog_stateful_reset.py +++ b/examples/linalg/advanced/matmul/example13_epilog_stateful_reset.py @@ -3,8 +3,9 @@ # SPDX-License-Identifier: Apache-2.0 """ -This example illustrates how to reset operands and epilog inputs in stateful matrix multiplication APIs, and reuse the object -for multiple executions. This is needed when the memory space of the operands is not accessible from the execution space, or if +This example illustrates how to reset operands and epilog inputs in stateful matrix +multiplication APIs, and reuse the object for multiple executions. This is needed when the +memory space of the operands is not accessible from the execution space, or if it's desired to bind new (compatible) operands to the stateful object. The inputs as well as the result are NumPy ndarrays. diff --git a/examples/linalg/advanced/matmul/example14_autotune.py b/examples/linalg/advanced/matmul/example14_autotune.py index b0e3f5a..b027f98 100644 --- a/examples/linalg/advanced/matmul/example14_autotune.py +++ b/examples/linalg/advanced/matmul/example14_autotune.py @@ -29,15 +29,16 @@ epilog = nvmath.linalg.advanced.MatmulEpilog.RELU_BIAS mm.plan(epilog=epilog, epilog_inputs={"bias": bias}) - # Run the autotuning. It will benchmark the algorithms found during planning and reorder them - # according to their actual performance. See the logs section "autotuning phase" to see what - # happens under the hood. 
+ # Run the autotuning. It will benchmark the algorithms found during planning and reorder + # them according to their actual performance. See the logs section "autotuning phase" to + # see what happens under the hood. mm.autotune(iterations=5) # Execute the multiplication. result = mm.execute() - # Synchronize the default stream, since by default the execution is non-blocking for GPU operands. + # Synchronize the default stream, since by default the execution is non-blocking for GPU + # operands. cp.cuda.get_current_stream().synchronize() print(f"Input types = {type(a), type(b)}, device = {a.device, b.device}") print(f"Result type = {type(result)}, device = {result.device}") diff --git a/examples/linalg/advanced/matmul/example15_manual_tuning.py b/examples/linalg/advanced/matmul/example15_manual_tuning.py index 64d45b9..8593864 100644 --- a/examples/linalg/advanced/matmul/example15_manual_tuning.py +++ b/examples/linalg/advanced/matmul/example15_manual_tuning.py @@ -5,8 +5,9 @@ """ This example demonstrates the possibility to tweak algorithm's configuration manually. -You are free to modify algorithm configuration as long as it's consistent with its capabilities. -As an alternative to manual fine-tuning, you might want to try autotuning - see `autotune` example. +You are free to modify algorithm configuration as long as it's consistent with its +capabilities. As an alternative to manual fine-tuning, you might want to try autotuning - +see `autotune` example. """ import nvmath @@ -29,14 +30,16 @@ best = mm.algorithms[0] print(best.capabilities) - # Modify the tiling configuration of the algorithm. Note that the valid tile configuration depends on - # the hardware, and not all combinations of the configuration are supported, so we leave it as an exercise. + # Modify the tiling configuration of the algorithm. Note that the valid tile + # configuration depends on the hardware, and not all combinations of the configuration + # are supported, so we leave it as an exercise. best.tile = best.tile print(f"Modified the tile to be {best.tile}") # Execute the multiplication. result = mm.execute() - # Synchronize the default stream, since by default the execution is non-blocking for GPU operands. + # Synchronize the default stream, since by default the execution is non-blocking for GPU + # operands. cp.cuda.get_current_stream().synchronize() print(f"Input types = {type(a), type(b)}, device = {a.device, b.device}") print(f"Result type = {type(result)}, device = {result.device}") diff --git a/examples/linalg/advanced/matmul/example16_reuse_algorithms.py b/examples/linalg/advanced/matmul/example16_reuse_algorithms.py index f036bfe..6009d9e 100644 --- a/examples/linalg/advanced/matmul/example16_reuse_algorithms.py +++ b/examples/linalg/advanced/matmul/example16_reuse_algorithms.py @@ -3,10 +3,11 @@ # SPDX-License-Identifier: Apache-2.0 """ -This example show how to save algorithms from a planned and possibly autotuned matrix multiplication object. +This example show how to save algorithms from a planned and possibly autotuned matrix +multiplication object. -The saved algorithms can be provided later for another compatible matrix multiplication operation, thereby avoiding -the cost of planning and autotuning. +The saved algorithms can be provided later for another compatible matrix multiplication +operation, thereby avoiding the cost of planning and autotuning. """ import os @@ -18,7 +19,7 @@ # Tip: turn logging on to get information on performance improvement from autotuning. 
# import logging -# logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)-8s %(message)s", datefmt="%m-%d %H:%M:%S") +# logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)-8s %(message)s", datefmt="%m-%d %H:%M:%S") # noqa: W505,E501 # Prepare sample input data m, n, k = 2048, 4096, 1024 @@ -27,9 +28,9 @@ bias = cp.random.rand(m, 1) pickle_file = f"algorithms_{m}_{n}_{k}_f64_relu_bias.pickle" -# In the first pass, we will plan and autotune the matrix multiplication. Autotuning reorders the -# algorithms based on measured performance from fastest to slowest, and we will pickle the ordered -# algorithms. +# In the first pass, we will plan and autotune the matrix multiplication. Autotuning +# reorders the algorithms based on measured performance from fastest to slowest, and we will +# pickle the ordered algorithms. print("= Phase 1: Plan, autotune, and save the optimal algorithm sequence. =") with nvmath.linalg.advanced.Matmul(a, b) as mm: epilog = nvmath.linalg.advanced.MatmulEpilog.RELU_BIAS @@ -53,8 +54,8 @@ algorithms = pickle.load(f) print(f"Loaded optimized algorithms from '{pickle_file}'.") -# In the second pass, we will provide the loaded algorithms to plan() to bypass -# planning and autotuning costs, since we already know the optimal algorithm(s) for this case. +# In the second pass, we will provide the loaded algorithms to plan() to bypass planning and +# autotuning costs, since we already know the optimal algorithm(s) for this case. with nvmath.linalg.advanced.Matmul(a, b) as mm: epilog = nvmath.linalg.advanced.MatmulEpilog.RELU_BIAS @@ -67,7 +68,8 @@ result = mm.execute() print("Executed the matrix multiplication using the provided algorithms.") - # Synchronize the default stream, since by default the execution is non-blocking for GPU operands. + # Synchronize the default stream, since by default the execution is non-blocking for GPU + # operands. cp.cuda.get_current_stream().synchronize() # Remove the pickle file. 
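The example16_reuse_algorithms.py hunks above show the save-and-reuse flow in pieces; the
condensed sketch below puts the whole round trip in one place. It uses only calls that
already appear in these examples (`plan`, `autotune`, `execute`, and the `algorithms`
attribute), and it assumes, as the example's prose states, that the pickled sequence can be
passed back to `plan()` (written here as an `algorithms=` argument).

```python
import pickle

import cupy as cp
import nvmath

m, n, k = 2048, 4096, 1024
a = cp.random.rand(m, k)
b = cp.random.rand(k, n)

# Phase 1: plan, autotune, and persist the algorithms, ordered fastest to slowest.
with nvmath.linalg.advanced.Matmul(a, b) as mm:
    mm.plan()
    mm.autotune(iterations=5)
    with open("algorithms.pickle", "wb") as f:
        pickle.dump(mm.algorithms, f)
    result = mm.execute()
    # The execution is non-blocking for GPU operands, so synchronize before leaving.
    cp.cuda.get_current_stream().synchronize()

# Phase 2: reuse the saved algorithms to bypass planning and autotuning costs.
with open("algorithms.pickle", "rb") as f:
    algorithms = pickle.load(f)

with nvmath.linalg.advanced.Matmul(a, b) as mm:
    mm.plan(algorithms=algorithms)
    result = mm.execute()
    cp.cuda.get_current_stream().synchronize()
```

Reusing saved algorithms only makes sense when the second multiplication is compatible with
the first (same shapes, dtypes, and epilog), which is why the example encodes m, n, k, the
dtype, and the epilog in the pickle file name.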
diff --git a/notebooks/matmul/01_introduction.ipynb b/notebooks/matmul/01_introduction.ipynb index 32cdab6..ba6a3f0 100644 --- a/notebooks/matmul/01_introduction.ipynb +++ b/notebooks/matmul/01_introduction.ipynb @@ -5,7 +5,7 @@ "id": "88073684-ba4e-42eb-9d9e-f7541473ce4f", "metadata": {}, "source": [ - "Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES\n", + "Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES\n", "\n", "SPDX-License-Identifier: BSD-3-Clause" ] diff --git a/notebooks/matmul/02_epilogs.ipynb b/notebooks/matmul/02_epilogs.ipynb index 6341317..3c4d100 100644 --- a/notebooks/matmul/02_epilogs.ipynb +++ b/notebooks/matmul/02_epilogs.ipynb @@ -5,7 +5,7 @@ "id": "fb0742d2-785b-4def-9939-9309b0f5c3e7", "metadata": {}, "source": [ - "Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES\n", + "Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES\n", "\n", "SPDX-License-Identifier: BSD-3-Clause" ] diff --git a/notebooks/matmul/03_backpropagation.ipynb b/notebooks/matmul/03_backpropagation.ipynb index 0d1f04a..2801b56 100644 --- a/notebooks/matmul/03_backpropagation.ipynb +++ b/notebooks/matmul/03_backpropagation.ipynb @@ -5,8 +5,8 @@ "id": "7853d721-ee63-4177-a01a-0c07f835814d", "metadata": {}, "source": [ - "Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES\r\n", - "\r\n", + "Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES\n", + "\n", "SPDX-License-Identifier: BSD-3-Clause" ] }, @@ -169,7 +169,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "id": "7fdffb17-e580-42fb-b7d1-4df909f3061f", "metadata": {}, "outputs": [], @@ -190,7 +190,8 @@ " raise NotImplementedError()\n", "\n", " def backward(self, grad):\n", - " # return grad_weights_hidden, grad_bias_hidden, grad_weights_output, grad_bias_output\n", + " # return grad_weights_hidden, grad_bias_hidden, grad_weights_output,\n", + " # grad_bias_output\n", " raise NotImplementedError()" ] }, @@ -360,7 +361,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "id": "47077a54-d988-49d6-98c1-1c3b706450e3", "metadata": {}, "outputs": [], @@ -403,9 +404,9 @@ " self.backward_mm_weights_output.reset_operands(a=grad.T, b=self.y)\n", " grad_weights_output = self.backward_mm_weights_output.execute()\n", "\n", - " # Gradient of ReLU.\n", - " # DRELU_BGRAD epilog applies the mask provided in epilog_inputs[\"relu_aux\"] to the result\n", - " # and also puts bias gradient (sum of the result along the second axis) in epilog_outputs[\"drelu_bgrad\"].\n", + " # Gradient of ReLU. DRELU_BGRAD epilog applies the mask provided in\n", + " # epilog_inputs[\"relu_aux\"] to the result and also puts bias gradient (sum of the\n", + " # result along the second axis) in epilog_outputs[\"drelu_bgrad\"].\n", " if self.backward_mm_masked is None:\n", " self.backward_mm_masked = Matmul(self.weights_output.T, grad.T)\n", " self.backward_mm_masked.plan(epilog=Epilog.DRELU_BGRAD, epilog_inputs={\"relu_aux\": self.relu_mask})\n", diff --git a/nvmath/_internal/enum_utils.py b/nvmath/_internal/enum_utils.py index faff8c4..03a9eee 100644 --- a/nvmath/_internal/enum_utils.py +++ b/nvmath/_internal/enum_utils.py @@ -3,7 +3,8 @@ # SPDX-License-Identifier: Apache-2.0 """ -Factories for create options dataclasses, as well as utilities to add docstring to enum classes. +Factories for create options dataclasses, as well as utilities to add docstring to enum +classes. """ import dataclasses @@ -27,8 +28,12 @@ def create_options_class_from_enum( Args: options_class_name: Name of the dataclass that will be created. 
+ enum_class: The IntEnum class that contains the options for the dataclass. - get_attr_dtype: A callable that takes in an enum value as the argument and returns the size in bytes of the cuTensorNet. + + get_attr_dtype: A callable that takes in an enum value as the argument and returns + the size in bytes of the cuTensorNet. + filter_re: A re definition that defines the match named 'option_name'. """ if r"(?P" not in filter_re: @@ -119,7 +124,8 @@ def transformer(m): def determine_enum_prefix(enum_class, chomp): """ - This function assumes that the convention used to translate C enumerators to Python enum names holds. + This function assumes that the convention used to translate C enumerators to Python enum + names holds. """ prefix = enum_class.__module__.split(".")[-1].upper() diff --git a/nvmath/_internal/formatters.py b/nvmath/_internal/formatters.py index afd2945..f1b785a 100644 --- a/nvmath/_internal/formatters.py +++ b/nvmath/_internal/formatters.py @@ -73,7 +73,8 @@ def __str__(self): def array2string(array_like): """ - String representation of an array-like object with possible truncation of "interior" values to limit string size. + String representation of an array-like object with possible truncation of "interior" + values to limit string size. The NumPy function "set_printoptions" can be used to control the display of the array. """ diff --git a/nvmath/_internal/layout.py b/nvmath/_internal/layout.py new file mode 100644 index 0000000..fa497ef --- /dev/null +++ b/nvmath/_internal/layout.py @@ -0,0 +1,58 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + +""" +A collection of (internal use) helper functions for shape/stride +validation and manipulation. +""" + +from collections.abc import Sequence + + +def is_contiguous_layout(sorted_shape: Sequence[int], sorted_strides: Sequence[int]) -> bool: + return all(sorted_shape[s - 1] * sorted_strides[s - 1] == sorted_strides[s] for s in range(1, len(sorted_strides))) + + +def is_contiguous_in_memory(shape: Sequence[int], strides: Sequence[int]) -> bool: + """ + Check if the provided (shape, strides) result in a contiguous memory layout. + """ + sorted_strides, sorted_shape = zip(*sorted(zip(strides, shape, strict=True)), strict=True) + return is_contiguous_layout(sorted_shape, sorted_strides) + + +def is_contiguous_and_dense(shape: Sequence[int], strides: Sequence[int]) -> bool: + """ + Check if the provided (shape, strides) result in a contiguous memory layout + with no extra stride in least strided dimension. + """ + sorted_strides, sorted_shape = zip(*sorted(zip(strides, shape, strict=True)), strict=True) + if len(sorted_strides) > 0 and sorted_strides[0] != 1: + return False + return is_contiguous_layout(sorted_shape, sorted_strides) + + +def is_overlapping_layout(shape: Sequence[int], strides: Sequence[int]) -> bool: + """ + For a tensor `t`, if `not is_overlapping_layout(t.shape, t.strides)`, + there are no two different valid nd-indices `idxs` such that + `t[idxs_0]` and `t[idxs_1]` map to the same offset in the memory. + Checks that the strides: + 1. are positive + 2. any n - 1 extents maximal offset is smaller than the stride + of the n-th extent. + The check should return False for contiguous + or contiguous and sliced tensors. 
+ """ + sorted_strides, sorted_shape = zip(*sorted(zip(strides, shape, strict=True)), strict=True) + cur_max_offset = 0 + for s in range(1, len(sorted_strides)): + stride = sorted_strides[s - 1] + extent = sorted_shape[s - 1] + if stride <= 0: + return True + cur_max_offset += stride * (extent - 1) + if cur_max_offset >= sorted_strides[s]: + return True + return False diff --git a/nvmath/_internal/mem_limit.py b/nvmath/_internal/mem_limit.py index fab718b..69d6b5b 100644 --- a/nvmath/_internal/mem_limit.py +++ b/nvmath/_internal/mem_limit.py @@ -16,15 +16,16 @@ re.IGNORECASE, ) MEM_LIMIT_DOC = """The {kind} must be specified in one of the following forms: - (1) A number (int or float). If the number is a float between 0 and 1 inclusive, the {kind} is interpreted as a fraction of the - total device memory; otherwise it is interpreted as the number of bytes of memory, with float value being cast to int. + (1) A number (int or float). If the number is a float between 0 and 1 inclusive, the + {kind} is interpreted as a fraction of the total device memory; otherwise it is + interpreted as the number of bytes of memory, with float value being cast to int. Examples: 0.75, 50E6, 50000000, ... (2) A string containing a positive value followed by B, kB, MB, or GB for powers of 1000. Examples: "0.05 GB", "50 MB", "50000000 B" ... (3) A string containing a positive value followed by kiB, MiB, or GiB for powers of 1024. Examples: "0.05 GiB", "51.2 MiB", "53687091 B" ... (4) A string with value in the range [0, 100] followed by a % symbol. - Examples: "26%", "82%", ... + Examples: "26%","82%", ... Whitespace between values and units is optional. diff --git a/nvmath/_internal/package_ifc.py b/nvmath/_internal/package_ifc.py index 32a0d7c..855af2b 100644 --- a/nvmath/_internal/package_ifc.py +++ b/nvmath/_internal/package_ifc.py @@ -10,7 +10,7 @@ from abc import ABC, abstractmethod from dataclasses import dataclass -from contextlib import nullcontext +from contextlib import nullcontext, AbstractContextManager from typing import Any @@ -84,7 +84,7 @@ class StreamHolder: ptr (int): The address of the underlying ``cudaStream_t`` object. """ - ctx: Any = nullcontext() + ctx: AbstractContextManager[Any] = nullcontext() device_id: int = -2 obj: Any = None package: str = "" diff --git a/nvmath/_internal/package_wrapper.py b/nvmath/_internal/package_wrapper.py index 79a6a04..af22414 100644 --- a/nvmath/_internal/package_wrapper.py +++ b/nvmath/_internal/package_wrapper.py @@ -17,5 +17,5 @@ from .package_ifc_torch import TorchPackage PACKAGE["torch"] = TorchPackage -except ImportError as e: +except ImportError: pass diff --git a/nvmath/_internal/tensor_ifc.py b/nvmath/_internal/tensor_ifc.py index 27b18aa..767e7a4 100644 --- a/nvmath/_internal/tensor_ifc.py +++ b/nvmath/_internal/tensor_ifc.py @@ -6,7 +6,11 @@ Interface to seamlessly use tensors (or ndarray-like objects) from different libraries. """ +from __future__ import annotations # allows typehint of class methods to return the self class + from abc import ABC, abstractmethod +from collections.abc import Sequence +import typing from . 
import typemaps @@ -41,7 +45,7 @@ def dtype(self): @classmethod @abstractmethod - def empty(cls, shape, **context): + def empty(cls, shape: Sequence[int], **context: typing.Any) -> Tensor: raise NotImplementedError @abstractmethod @@ -50,12 +54,12 @@ def numpy(self, stream_holder): @property @abstractmethod - def shape(self): + def shape(self) -> Sequence[int]: raise NotImplementedError @property @abstractmethod - def strides(self): + def strides(self) -> Sequence[int]: raise NotImplementedError @abstractmethod @@ -69,7 +73,8 @@ def copy_(self, src, stream_holder=None): @staticmethod def create_name_dtype_map(conversion_function, exception_type): """ - Create a map between CUDA data type names and the corresponding package dtypes for supported data types. + Create a map between CUDA data type names and the corresponding package dtypes for + supported data types. """ names = typemaps.NAME_TO_DATA_TYPE.keys() name_to_dtype = dict() diff --git a/nvmath/_internal/tensor_ifc_cupy.py b/nvmath/_internal/tensor_ifc_cupy.py index 8c3a550..c8a8f98 100644 --- a/nvmath/_internal/tensor_ifc_cupy.py +++ b/nvmath/_internal/tensor_ifc_cupy.py @@ -27,9 +27,7 @@ class CupyTensor(Tensor): name = "cupy" module = cupy - name_to_dtype = Tensor.create_name_dtype_map( - conversion_function=lambda name: np.dtype(name), exception_type=TypeError - ) + name_to_dtype = Tensor.create_name_dtype_map(conversion_function=lambda name: np.dtype(name), exception_type=TypeError) def __init__(self, tensor): super().__init__(tensor) @@ -72,6 +70,9 @@ def numpy(self, stream_holder=StreamHolder()): def empty(cls, shape, *, dtype="float32", device_id=None, strides=None): """ Create an empty tensor of the specified shape and data type. + + Note, that the strides, if specified, MUST correspond to a dense (possibly permuted) + tensor, otherwise the created tensor may be corrupted. """ dtype = CupyTensor.name_to_dtype[dtype] @@ -132,19 +133,21 @@ def n2c_copy_(self, src, stream_holder): Inplace copy of src (copy the data from src into self). 
The src must by numpy ndarray """ + stream = stream_holder.obj try: - stream = stream_holder.obj self.tensor.set(src, stream=stream) - if stream is not None: - stream.synchronize() except RuntimeError as e: # If self is a strided tensor (neither c nor f layout) # cupy refuses to copy from numpy array if "set to non-contiguous array" not in str(e): raise else: - src_gpu = cupy.asarray(src) - self.c2c_copy_(src_gpu, stream_holder) + with stream_holder.ctx: + src_gpu = cupy.asarray(src) + cupy.copyto(self.tensor, src_gpu) + # cupy/cupy#7820 + if stream is not None: + stream.synchronize() def copy_(self, src, stream_holder=StreamHolder()): """ diff --git a/nvmath/_internal/tensor_ifc_numpy.py b/nvmath/_internal/tensor_ifc_numpy.py index 1d93098..c945234 100644 --- a/nvmath/_internal/tensor_ifc_numpy.py +++ b/nvmath/_internal/tensor_ifc_numpy.py @@ -26,9 +26,7 @@ class NumpyTensor(Tensor): name = "numpy" module = numpy - name_to_dtype = Tensor.create_name_dtype_map( - conversion_function=lambda name: numpy.dtype(name), exception_type=TypeError - ) + name_to_dtype = Tensor.create_name_dtype_map(conversion_function=lambda name: numpy.dtype(name), exception_type=TypeError) def __init__(self, tensor): super().__init__(tensor) @@ -70,9 +68,7 @@ def empty(cls, shape, *, dtype="float32", strides=None, device_id=None): dtype = NumpyTensor.name_to_dtype[dtype] # when strides is not None, it should be of unit counts not bytes return cls( - cls.module.ndarray( - shape, dtype=dtype, strides=(tuple(s * dtype.itemsize for s in strides) if strides else None) - ) + cls.module.ndarray(shape, dtype=dtype, strides=(tuple(s * dtype.itemsize for s in strides) if strides else None)) ) def to(self, device="cpu", stream_holder=StreamHolder()): @@ -92,23 +88,48 @@ def to(self, device="cpu", stream_holder=StreamHolder()): return tensor_device + def n2n_copy_(self, src): + """ + Inplace copy of src (copy the data from src into self). + The src must by numpy ndarray + """ + numpy.copyto(self.tensor, src) + + def c2n_copy_(self, src, stream_holder): + """ + Inplace copy of src (copy the data from src into self). + The src must by cupy ndarray + """ + stream = stream_holder.obj + try: + with stream: + src.get(stream=stream, out=self.tensor) + except RuntimeError as e: + # If self is a strided tensor (neither c nor f layout) + # cupy refuses to copy to numpy array + if "copying to non-contiguous ndarray" not in str(e): + raise + else: + # we cannot simply use blocking=True, as it is + # not supported by older cupy releases (<13) + src_cpu = cupy.asnumpy(src, stream=stream) + self.n2n_copy_(src_cpu) + # cupy/cupy#7820 + if stream is not None: + stream.synchronize() + def copy_(self, src, stream_holder=StreamHolder()): package = utils.infer_object_package(src) # Handle NumPy <=> CuPy CPU-GPU ndarray asymmetry. if package == "cupy": - stream = stream_holder.obj - with stream: - out = src.get(stream=stream, out=self.tensor) - # cupy/cupy#7820 - if stream is not None: - stream.synchronize() - - return out + self.c2n_copy_(src, stream_holder) elif package == "numpy": - numpy.copyto(self.tensor, src) + self.n2n_copy_(src) else: raise NotImplementedError + return self.tensor + def istensor(self): """ Check if the object is ndarray-like. 
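The `c2n_copy_` fallback added above mirrors a general CuPy/NumPy pattern: `ndarray.get(out=...)`
refuses non-contiguous host destinations, so the copy is staged through `cupy.asnumpy` and
`numpy.copyto`. A minimal standalone sketch of that pattern (illustrative only, not the
package's internal API; the helper name and sample arrays are chosen here):

import cupy
import numpy

def copy_to_host(dst: numpy.ndarray, src: cupy.ndarray, stream=None):
    """Copy a CuPy array into a (possibly non-contiguous) NumPy array."""
    try:
        # Fast path: direct device-to-host copy into the destination buffer.
        src.get(stream=stream, out=dst)
    except RuntimeError as e:
        if "copying to non-contiguous ndarray" not in str(e):
            raise
        # Fallback: materialize a contiguous host copy, then scatter into the strided view.
        numpy.copyto(dst, cupy.asnumpy(src, stream=stream))
    if stream is not None:
        stream.synchronize()  # see cupy/cupy#7820

# Usage: a strided (non-contiguous) host destination view.
host = numpy.zeros((4, 4), dtype=numpy.float32)[:, ::2]
device = cupy.arange(8, dtype=cupy.float32).reshape(4, 2)
copy_to_host(host, device)
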
diff --git a/nvmath/_internal/tensor_ifc_torch.py b/nvmath/_internal/tensor_ifc_torch.py index 68f954c..23f8119 100644 --- a/nvmath/_internal/tensor_ifc_torch.py +++ b/nvmath/_internal/tensor_ifc_torch.py @@ -62,7 +62,12 @@ def numpy(self, stream_holder=StreamHolder()): @classmethod def empty(cls, shape, *, dtype="float32", device_id=None, strides=None): """ - Create an empty tensor of the specified shape and data type on the specified device (None, 'cpu', or device id). + Create an empty tensor of the specified shape and data type on the specified device + (None, 'cpu', or device id). + + Note, that the strides, if specified, should correspond to a dense + (possibly permuted) tensor and MUST NOT overlap. + Otherwise, the behaviour is not defined. """ dtype = TorchTensor.name_to_dtype[dtype] if strides: @@ -81,7 +86,7 @@ def to(self, device="cpu", stream_holder=StreamHolder()): if not (device == "cpu" or isinstance(device, int)): raise ValueError(f"The device must be specified as an integer or 'cpu', not '{device}'.") - non_blocking = False if device == "cpu" else True + non_blocking = device != "cpu" with stream_holder.ctx: tensor_device = self.tensor.to(device=device, non_blocking=non_blocking) diff --git a/nvmath/_internal/tensor_wrapper.py b/nvmath/_internal/tensor_wrapper.py index e1fc27a..58367ab 100644 --- a/nvmath/_internal/tensor_wrapper.py +++ b/nvmath/_internal/tensor_wrapper.py @@ -27,7 +27,7 @@ _TENSOR_TYPES["torch"] = TorchTensor torch_asarray = functools.partial(torch.as_tensor, device="cuda") -except ImportError as e: +except ImportError: torch = None # type: ignore torch_asarray = None # type: ignore @@ -69,9 +69,7 @@ def check_valid_package(native_operands): operands_pkg = [infer_tensor_package(o) for o in native_operands] checks = [p in _SUPPORTED_PACKAGES for p in operands_pkg] if not all(checks): - unknown = [ - f"{location}: {operands_pkg[location]}" for location, predicate in enumerate(checks) if predicate is False - ] + unknown = [f"{location}: {operands_pkg[location]}" for location, predicate in enumerate(checks) if predicate is False] unknown = formatters.array2string(unknown) message = f"""The operands should be ndarray-like objects from one of {_SUPPORTED_PACKAGES} packages. The unsupported operands as a sequence of "position: package" is: \n{unknown}""" diff --git a/nvmath/_internal/utils.py b/nvmath/_internal/utils.py index bd66552..59ee5d0 100644 --- a/nvmath/_internal/utils.py +++ b/nvmath/_internal/utils.py @@ -24,6 +24,8 @@ from . import package_wrapper from . import tensor_wrapper from .package_ifc import StreamHolder +from .tensor_ifc import Tensor +from .layout import is_contiguous_and_dense def infer_object_package(obj): @@ -127,8 +129,8 @@ def _raise_invalid_one_of_options(clss, options, options_description, *, cls_key def _create_stream_ctx_ptr_cupy_stream(package_ifc, stream): """ - Utility function to create a stream context as a "package-native" object, get stream pointer as well as - create a cupy stream object. + Utility function to create a stream context as a "package-native" object, get stream + pointer as well as create a cupy stream object. """ stream_ctx = package_ifc.to_stream_context(stream) stream_ptr = package_ifc.to_stream_pointer(stream) @@ -142,17 +144,20 @@ def device_ctx(new_device_id): """ Semantics: - 1. The device context manager makes the specified device current from the point of entry until the point of exit. + 1. The device context manager makes the specified device current from the point of entry + until the point of exit. 
- 2. When the context manager exits, the current device is reset to what it was when the context manager was entered. + 2. When the context manager exits, the current device is reset to what it was when the + context manager was entered. - 3. Any explicit setting of the device within the context manager (using cupy.cuda.Device().use(), torch.cuda.set_device(), - etc) will overrule the device set by the context manager from that point onwards till the context manager exits. In - other words, the context manager provides a local device scope and the current device can be explicitly reset for the - remainder of that scope. + 3. Any explicit setting of the device within the context manager (using + cupy.cuda.Device().use(), torch.cuda.set_device(), etc) will overrule the device set + by the context manager from that point onwards till the context manager exits. In + other words, the context manager provides a local device scope and the current device + can be explicitly reset for the remainder of that scope. - Corollary: if any library function resets the device globally and this is an undesired side-effect, such functions must be - called from within the device context manager. + Corollary: if any library function resets the device globally and this is an undesired + side-effect, such functions must be called from within the device context manager. Device context managers can be arbitrarily nested. """ @@ -201,12 +206,14 @@ def cached_get_or_create_stream(device_id, stream, op_package): def get_or_create_stream(device_id, stream, op_package): """ - Create a stream object from a stream pointer or extract the stream pointer from a stream object, or - use the current stream. + Create a stream object from a stream pointer or extract the stream pointer from a stream + object, or use the current stream. Args: device_id: The device ID. + stream: A stream object, stream pointer, or None. + op_package: The package the tensor network operands belong to. Returns: @@ -277,33 +284,37 @@ def get_operands_data(operands): return op_data -def create_empty_tensor(cls, extents, dtype, device_id, stream_holder, strides=None): +def create_empty_tensor( + cls: Tensor, + extents: Sequence[int], + dtype: type, + device_id: int | None, + stream_holder: StreamHolder, + verify_strides: bool, + strides: Sequence[int] | None = None, +) -> Tensor: """ - Create a wrapped tensor of the same type as (the wrapped) cls on the specified device having the - specified extents and dtype. + Create a wrapped tensor of the same type as (the wrapped) cls on the specified device + having the specified extents and dtype. - The tensor is created within a stream context to allow for asynchronous memory allocators like - CuPy's MemoryAsyncPool. + The tensor is created within a stream context to allow for asynchronous memory + allocators like CuPy's MemoryAsyncPool. + + Note, the function assumes the `strides` are dense (possibly permuted). + Otherwise, the behaviour is framework specific and tensor creation may fail + or created tensor may be corrupted. Set `verify_strides` to True to check + the layout and drop the strides if the layout is not dense. 
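+
+    Illustration of the check used for `verify_strides` (extents/strides chosen here):
+
+        >>> is_contiguous_and_dense((2, 3), (3, 1))  # dense C order: strides kept
+        True
+        >>> is_contiguous_and_dense((2, 3), (4, 1))  # padded rows: strides dropped
+        False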
""" ctx = stream_holder.ctx if device_id is not None else contextlib.nullcontext() # if device id is none the stream holder must be too assert device_id is not None or stream_holder is None + if strides is not None and verify_strides and not is_contiguous_and_dense(extents, strides): + strides = None with ctx: tensor = cls.empty(extents, dtype=dtype, device_id=device_id, strides=strides) return tensor -def create_output_tensor(cls, extents, device_id, stream_holder, data_type, strides=None): - """ - Create output tensor. This operation is ordered through events and is safe to use with asynchronous memory pools. - """ - with device_ctx(device_id): - output = create_empty_tensor(cls, extents, data_type, device_id, stream_holder, strides) - output_event = stream_holder.obj.record() - - return output, output_event - - def get_operands_device_id(operands): """ Return the id (ordinal) of the device the operands are on, or None if it is on the CPU. @@ -334,21 +345,17 @@ def get_operands_package(operands): package = infer_object_package(operands[0].tensor) if not all(infer_object_package(operand.tensor) == package for operand in operands): packages = {infer_object_package(operand.tensor) for operand in operands} - raise TypeError( - f"All tensors in the network must be from the same library package. Packages found = {packages}." - ) + raise TypeError(f"All tensors in the network must be from the same library package. Packages found = {packages}.") return package def check_operands_match(orig_operands, new_operands, attribute, description): """ - Check if the specified attribute matches between the corresponding new and old operands, and raise an exception if it - doesn't. + Check if the specified attribute matches between the corresponding new and old operands, + and raise an exception if it doesn't. """ if isinstance(orig_operands, Sequence): - checks = [ - getattr(o, attribute) == getattr(n, attribute) for o, n in zip(orig_operands, new_operands, strict=True) - ] + checks = [getattr(o, attribute) == getattr(n, attribute) for o, n in zip(orig_operands, new_operands, strict=True)] if not all(checks): mismatch = [ @@ -357,7 +364,9 @@ def check_operands_match(orig_operands, new_operands, attribute, description): if predicate is False ] mismatch = formatters.array2string(mismatch) - message = f"""The {description} of each new operand must match the {description} of the corresponding original operand. + message = f"""\ +The {description} of each new operand must match the {description} of the corresponding original operand. + The mismatch in {description} as a sequence of "position: original {description} => new {description}" is: \n{mismatch}""" raise ValueError(message) else: @@ -369,8 +378,8 @@ def check_operands_match(orig_operands, new_operands, attribute, description): def check_attribute_match(orig_attribute, new_attribute, description): """ - Check if the specified attribute matches between the corresponding new and old operands, and raise an exception if it - doesn't. + Check if the specified attribute matches between the corresponding new and old operands, + and raise an exception if it doesn't. """ check = orig_attribute == new_attribute if not check: @@ -381,7 +390,8 @@ def check_attribute_match(orig_attribute, new_attribute, description): # Unused since cuQuantum 22.11 def check_alignments_match(orig_alignments, new_alignments): """ - Check if alignment matches between the corresponding new and old operands, and raise an exception if it doesn't. 
+ Check if alignment matches between the corresponding new and old operands, and raise an + exception if it doesn't. """ checks = [o == n for o, n in zip(orig_alignments, new_alignments, strict=True)] @@ -392,7 +402,9 @@ def check_alignments_match(orig_alignments, new_alignments): if predicate is False ] mismatch = formatters.array2string(mismatch) - message = f"""The data alignment of each new operand must match the data alignment of the corresponding original operand. + message = f"""\ +The data alignment of each new operand must match the data alignment of the corresponding original operand. + The mismatch in data alignment as a sequence of "position: original alignment => new alignment" is: \n{mismatch}""" raise ValueError(message) @@ -415,9 +427,7 @@ def check_tensor_qualifiers(qualifiers, dtype, num_inputs): message = prolog + f" The shape of the ndarray is {qualifiers.shape}." raise ValueError(message) elif len(qualifiers) != num_inputs: - message = ( - prolog + f" The length of the ndarray is {len(qualifiers)}, while the expected length is {num_inputs}." - ) + message = prolog + f" The length of the ndarray is {len(qualifiers)}, while the expected length is {num_inputs}." raise ValueError(message) return qualifiers @@ -440,7 +450,8 @@ def check_autotune_params(iterations): def get_ptr_from_memory_pointer(mem_ptr): """ - Access the value associated with one of the attributes 'device_ptr', 'device_pointer', 'ptr'. + Access the value associated with one of the attributes 'device_ptr', 'device_pointer', + 'ptr'. """ attributes = ("device_ptr", "device_pointer", "ptr") for attr in attributes: @@ -481,7 +492,8 @@ def _validate(self, value): def check_and_set_options(required: MutableMapping[str, Value], provided: MutableMapping[str, object]): """ - Update each option specified in 'required' by getting the value from 'provided' if it exists or using a default. + Update each option specified in 'required' by getting the value from 'provided' if it + exists or using a default. """ for option, value in required.items(): try: @@ -496,20 +508,22 @@ def check_and_set_options(required: MutableMapping[str, Value], provided: Mutabl @contextlib.contextmanager def cuda_call_ctx(stream_holder, blocking=True, timing=True): """ - A simple context manager that provides (non-)blocking behavior depending on the `blocking` parameter for CUDA calls. - The call is timed only for blocking behavior when timing is requested. + A simple context manager that provides (non-)blocking behavior depending on the + `blocking` parameter for CUDA calls. The call is timed only for blocking behavior when + timing is requested. - An `end` event is recorded after the CUDA call for use in establishing stream ordering for non-blocking calls. This - event is returned together with a `Value` object that stores the elapsed time if the call is blocking and timing is - requested, or None otherwise. + An `end` event is recorded after the CUDA call for use in establishing stream ordering + for non-blocking calls. This event is returned together with a `Value` object that + stores the elapsed time if the call is blocking and timing is requested, or None + otherwise. 
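+
+    A sketch of the intended call pattern (illustrative; the local names are chosen here)::
+
+        with cuda_call_ctx(stream_holder, blocking=True, timing=True) as (end, elapsed):
+            ...  # issue the CUDA call on stream_holder.obj
+        # `end` orders later work; `elapsed` holds the time for blocking, timed calls.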
""" stream = stream_holder.obj if blocking: - start = cp.cuda.Event(disable_timing=False if timing else True) + start = cp.cuda.Event(disable_timing=not timing) stream.record(start) - end = cp.cuda.Event(disable_timing=False if timing and blocking else True) + end = cp.cuda.Event(disable_timing=not (timing and blocking)) time = Value(None, validator=lambda v: True) yield end, time @@ -545,15 +559,18 @@ def atomic( handler: Callable[[typing.Any, Exception | None], bool] | Callable[[Exception | None], bool], method: bool = False ) -> Callable: """ - A decorator that provides "succeed or roll-back" semantics. A typical use for this is to release partial resources if an - exception occurs. + A decorator that provides "succeed or roll-back" semantics. A typical use for this is to + release partial resources if an exception occurs. Args: - handler: A function to call when an exception occurs. The handler takes a single argument, which is the exception - object, and returns a boolean stating whether the same exception should be reraised. We assume that this function - does not raise an exception. - method: Specify if the wrapped function as well as the exception handler are methods bound to the same object - (method = True) or they are free functions (method = False). + handler: A function to call when an exception occurs. The handler takes a single + argument, which is the exception object, and returns a boolean stating whether + the same exception should be reraised. We assume that this function does not + raise an exception. + + method: Specify if the wrapped function as well as the exception handler are methods + bound to the same object (method = True) or they are free functions (method = + False). Returns: Callable: A decorator that creates the wrapping. @@ -567,8 +584,8 @@ def outer(wrapped_function): @functools.wraps(wrapped_function) def inner(*args, **kwargs): """ - Call the wrapped function and return the result. If an exception occurs, then call the exception handler and - reraise the exception. + Call the wrapped function and return the result. If an exception occurs, then + call the exception handler and reraise the exception. """ try: result = wrapped_function(*args, **kwargs) @@ -593,8 +610,9 @@ def precondition(checker: Callable[..., None], what: str = "") -> Callable: A decorator that adds checks to ensure any preconditions are met. Args: - checker: The function to call to check whether the preconditions are met. It has the same signature as the wrapped - function with the addition of the keyword argument `what`. + checker: The function to call to check whether the preconditions are met. It has the + same signature as the wrapped function with the addition of the keyword argument + `what`. what: A string that is passed in to `checker` to provide context information. Returns: @@ -643,13 +661,24 @@ def get_mpi_comm_pointer(comm): COMMON_SHARED_DOC_MAP = { - "operand": "A tensor (ndarray-like object). The currently supported types are :class:`numpy.ndarray`, :class:`cupy.ndarray`, and :class:`torch.Tensor`.", - "stream": "Provide the CUDA stream to use for executing the operation. Acceptable inputs include ``cudaStream_t`` (as Python :class:`int`), :class:`cupy.cuda.Stream`, " - "and :class:`torch.cuda.Stream`. 
If a stream is not provided, the current stream from the operand package will be used.", - "release_workspace": "A value of `True` specifies that the stateful object should release workspace memory back to the package memory pool on function return, " - "while a value of `False` specifies that the object should retain the memory. " - "This option may be set to `True` if the application performs other operations that consume a lot of memory between successive calls to the (same or different) :meth:`execute` API, " - "but incurs a small overhead due to obtaining and releasing workspace memory from and to the package memory pool on every call. The default is `False`.", + "operand": """\ +A tensor (ndarray-like object). The currently supported types are :class:`numpy.ndarray`, +:class:`cupy.ndarray`, and :class:`torch.Tensor`.""".replace("\n", " "), + # + "stream": """\ +Provide the CUDA stream to use for executing the operation. Acceptable inputs include +``cudaStream_t`` (as Python :class:`int`), :class:`cupy.cuda.Stream`, and +:class:`torch.cuda.Stream`. If a stream is not provided, the current stream from the operand +package will be used.""".replace("\n", " "), + # + "release_workspace": """\ +A value of `True` specifies that the stateful object should release workspace memory back to +the package memory pool on function return, while a value of `False` specifies that the +object should retain the memory. This option may be set to `True` if the application +performs other operations that consume a lot of memory between successive calls to the (same +or different) :meth:`execute` API, but incurs a small overhead due to obtaining and +releasing workspace memory from and to the package memory pool on every call. The default is +`False`.""".replace("\n", " "), } diff --git a/nvmath/_utils.py b/nvmath/_utils.py index 5f8cdb7..8c0531b 100644 --- a/nvmath/_utils.py +++ b/nvmath/_utils.py @@ -135,9 +135,7 @@ def force_loading_nvrtc(cu_ver): # This absolute path will always be correct regardless of the package source nvrtc_path = win32api.GetModuleFileNameW(_nvrtc_obj[0]._handle) dso_dir = os.path.dirname(nvrtc_path) - dso_path = os.path.join( - dso_dir, [f for f in os.listdir(dso_dir) if re.match("^nvrtc-builtins.*.dll$", f)][0] - ) + dso_path = os.path.join(dso_dir, [f for f in os.listdir(dso_dir) if re.match("^nvrtc-builtins.*.dll$", f)][0]) _nvrtc_obj.append(ctypes.CDLL(dso_path)) break else: diff --git a/nvmath/_version.py b/nvmath/_version.py index b8b382a..506a64f 100644 --- a/nvmath/_version.py +++ b/nvmath/_version.py @@ -2,4 +2,4 @@ # # SPDX-License-Identifier: Apache-2.0 -__version__ = "0.2.0" +__version__ = "0.2.1" diff --git a/nvmath/bindings/_internal/cublas.pxd b/nvmath/bindings/_internal/cublas.pxd index 863172d..00d1cf0 100644 --- a/nvmath/bindings/_internal/cublas.pxd +++ b/nvmath/bindings/_internal/cublas.pxd @@ -511,3 +511,9 @@ cdef cublasStatus_t _cublasSdgmm_64(cublasHandle_t handle, cublasSideMode_t mode cdef cublasStatus_t _cublasDdgmm_64(cublasHandle_t handle, cublasSideMode_t mode, int64_t m, int64_t n, const double* A, int64_t lda, const double* x, int64_t incx, double* C, int64_t ldc) except* nogil cdef cublasStatus_t _cublasCdgmm_64(cublasHandle_t handle, cublasSideMode_t mode, int64_t m, int64_t n, const cuComplex* A, int64_t lda, const cuComplex* x, int64_t incx, cuComplex* C, int64_t ldc) except* nogil cdef cublasStatus_t _cublasZdgmm_64(cublasHandle_t handle, cublasSideMode_t mode, int64_t m, int64_t n, const cuDoubleComplex* A, int64_t lda, const cuDoubleComplex* 
x, int64_t incx, cuDoubleComplex* C, int64_t ldc) except* nogil +cdef cublasStatus_t _cublasSgemmGroupedBatched(cublasHandle_t handle, const cublasOperation_t transa_array[], const cublasOperation_t transb_array[], const int m_array[], const int n_array[], const int k_array[], const float alpha_array[], const float* const Aarray[], const int lda_array[], const float* const Barray[], const int ldb_array[], const float beta_array[], float* const Carray[], const int ldc_array[], int group_count, const int group_size[]) except* nogil +cdef cublasStatus_t _cublasSgemmGroupedBatched_64(cublasHandle_t handle, const cublasOperation_t transa_array[], const cublasOperation_t transb_array[], const int64_t m_array[], const int64_t n_array[], const int64_t k_array[], const float alpha_array[], const float* const Aarray[], const int64_t lda_array[], const float* const Barray[], const int64_t ldb_array[], const float beta_array[], float* const Carray[], const int64_t ldc_array[], int64_t group_count, const int64_t group_size[]) except* nogil +cdef cublasStatus_t _cublasDgemmGroupedBatched(cublasHandle_t handle, const cublasOperation_t transa_array[], const cublasOperation_t transb_array[], const int m_array[], const int n_array[], const int k_array[], const double alpha_array[], const double* const Aarray[], const int lda_array[], const double* const Barray[], const int ldb_array[], const double beta_array[], double* const Carray[], const int ldc_array[], int group_count, const int group_size[]) except* nogil +cdef cublasStatus_t _cublasDgemmGroupedBatched_64(cublasHandle_t handle, const cublasOperation_t transa_array[], const cublasOperation_t transb_array[], const int64_t m_array[], const int64_t n_array[], const int64_t k_array[], const double alpha_array[], const double* const Aarray[], const int64_t lda_array[], const double* const Barray[], const int64_t ldb_array[], const double beta_array[], double* const Carray[], const int64_t ldc_array[], int64_t group_count, const int64_t group_size[]) except* nogil +cdef cublasStatus_t _cublasGemmGroupedBatchedEx(cublasHandle_t handle, const cublasOperation_t transa_array[], const cublasOperation_t transb_array[], const int m_array[], const int n_array[], const int k_array[], const void* alpha_array, const void* const Aarray[], cudaDataType_t Atype, const int lda_array[], const void* const Barray[], cudaDataType_t Btype, const int ldb_array[], const void* beta_array, void* const Carray[], cudaDataType_t Ctype, const int ldc_array[], int group_count, const int group_size[], cublasComputeType_t computeType) except* nogil +cdef cublasStatus_t _cublasGemmGroupedBatchedEx_64(cublasHandle_t handle, const cublasOperation_t transa_array[], const cublasOperation_t transb_array[], const int64_t m_array[], const int64_t n_array[], const int64_t k_array[], const void* alpha_array, const void* const Aarray[], cudaDataType_t Atype, const int64_t lda_array[], const void* const Barray[], cudaDataType_t Btype, const int64_t ldb_array[], const void* beta_array, void* const Carray[], cudaDataType_t Ctype, const int64_t ldc_array[], int64_t group_count, const int64_t group_size[], cublasComputeType_t computeType) except* nogil diff --git a/nvmath/bindings/_internal/cublas_linux.pyx b/nvmath/bindings/_internal/cublas_linux.pyx index e69112f..1bf35f4 100644 --- a/nvmath/bindings/_internal/cublas_linux.pyx +++ b/nvmath/bindings/_internal/cublas_linux.pyx @@ -535,6 +535,12 @@ cdef void* __cublasSdgmm_64 = NULL cdef void* __cublasDdgmm_64 = NULL cdef void* __cublasCdgmm_64 = NULL 
cdef void* __cublasZdgmm_64 = NULL +cdef void* __cublasSgemmGroupedBatched = NULL +cdef void* __cublasSgemmGroupedBatched_64 = NULL +cdef void* __cublasDgemmGroupedBatched = NULL +cdef void* __cublasDgemmGroupedBatched_64 = NULL +cdef void* __cublasGemmGroupedBatchedEx = NULL +cdef void* __cublasGemmGroupedBatchedEx_64 = NULL cdef void* load_library(const int driver_ver) except* with gil: @@ -4063,6 +4069,48 @@ cdef int _check_or_init_cublas() except -1 nogil: handle = load_library(driver_ver) __cublasZdgmm_64 = dlsym(handle, 'cublasZdgmm_64') + global __cublasSgemmGroupedBatched + __cublasSgemmGroupedBatched = dlsym(RTLD_DEFAULT, 'cublasSgemmGroupedBatched') + if __cublasSgemmGroupedBatched == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasSgemmGroupedBatched = dlsym(handle, 'cublasSgemmGroupedBatched') + + global __cublasSgemmGroupedBatched_64 + __cublasSgemmGroupedBatched_64 = dlsym(RTLD_DEFAULT, 'cublasSgemmGroupedBatched_64') + if __cublasSgemmGroupedBatched_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasSgemmGroupedBatched_64 = dlsym(handle, 'cublasSgemmGroupedBatched_64') + + global __cublasDgemmGroupedBatched + __cublasDgemmGroupedBatched = dlsym(RTLD_DEFAULT, 'cublasDgemmGroupedBatched') + if __cublasDgemmGroupedBatched == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasDgemmGroupedBatched = dlsym(handle, 'cublasDgemmGroupedBatched') + + global __cublasDgemmGroupedBatched_64 + __cublasDgemmGroupedBatched_64 = dlsym(RTLD_DEFAULT, 'cublasDgemmGroupedBatched_64') + if __cublasDgemmGroupedBatched_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasDgemmGroupedBatched_64 = dlsym(handle, 'cublasDgemmGroupedBatched_64') + + global __cublasGemmGroupedBatchedEx + __cublasGemmGroupedBatchedEx = dlsym(RTLD_DEFAULT, 'cublasGemmGroupedBatchedEx') + if __cublasGemmGroupedBatchedEx == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasGemmGroupedBatchedEx = dlsym(handle, 'cublasGemmGroupedBatchedEx') + + global __cublasGemmGroupedBatchedEx_64 + __cublasGemmGroupedBatchedEx_64 = dlsym(RTLD_DEFAULT, 'cublasGemmGroupedBatchedEx_64') + if __cublasGemmGroupedBatchedEx_64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasGemmGroupedBatchedEx_64 = dlsym(handle, 'cublasGemmGroupedBatchedEx_64') + __py_cublas_init = True return 0 @@ -5572,6 +5620,24 @@ cpdef dict _inspect_function_pointers(): global __cublasZdgmm_64 data["__cublasZdgmm_64"] = __cublasZdgmm_64 + global __cublasSgemmGroupedBatched + data["__cublasSgemmGroupedBatched"] = __cublasSgemmGroupedBatched + + global __cublasSgemmGroupedBatched_64 + data["__cublasSgemmGroupedBatched_64"] = __cublasSgemmGroupedBatched_64 + + global __cublasDgemmGroupedBatched + data["__cublasDgemmGroupedBatched"] = __cublasDgemmGroupedBatched + + global __cublasDgemmGroupedBatched_64 + data["__cublasDgemmGroupedBatched_64"] = __cublasDgemmGroupedBatched_64 + + global __cublasGemmGroupedBatchedEx + data["__cublasGemmGroupedBatchedEx"] = __cublasGemmGroupedBatchedEx + + global __cublasGemmGroupedBatchedEx_64 + data["__cublasGemmGroupedBatchedEx_64"] = __cublasGemmGroupedBatchedEx_64 + func_ptrs = data return data @@ -10565,3 +10631,63 @@ cdef cublasStatus_t _cublasZdgmm_64(cublasHandle_t handle, cublasSideMode_t mode raise FunctionNotFoundError("function cublasZdgmm_64 is not found") return (__cublasZdgmm_64)( handle, mode, m, n, A, lda, x, incx, C, ldc) + + +cdef cublasStatus_t 
_cublasSgemmGroupedBatched(cublasHandle_t handle, const cublasOperation_t transa_array[], const cublasOperation_t transb_array[], const int m_array[], const int n_array[], const int k_array[], const float alpha_array[], const float* const Aarray[], const int lda_array[], const float* const Barray[], const int ldb_array[], const float beta_array[], float* const Carray[], const int ldc_array[], int group_count, const int group_size[]) except* nogil: + global __cublasSgemmGroupedBatched + _check_or_init_cublas() + if __cublasSgemmGroupedBatched == NULL: + with gil: + raise FunctionNotFoundError("function cublasSgemmGroupedBatched is not found") + return (__cublasSgemmGroupedBatched)( + handle, transa_array, transb_array, m_array, n_array, k_array, alpha_array, Aarray, lda_array, Barray, ldb_array, beta_array, Carray, ldc_array, group_count, group_size) + + +cdef cublasStatus_t _cublasSgemmGroupedBatched_64(cublasHandle_t handle, const cublasOperation_t transa_array[], const cublasOperation_t transb_array[], const int64_t m_array[], const int64_t n_array[], const int64_t k_array[], const float alpha_array[], const float* const Aarray[], const int64_t lda_array[], const float* const Barray[], const int64_t ldb_array[], const float beta_array[], float* const Carray[], const int64_t ldc_array[], int64_t group_count, const int64_t group_size[]) except* nogil: + global __cublasSgemmGroupedBatched_64 + _check_or_init_cublas() + if __cublasSgemmGroupedBatched_64 == NULL: + with gil: + raise FunctionNotFoundError("function cublasSgemmGroupedBatched_64 is not found") + return (__cublasSgemmGroupedBatched_64)( + handle, transa_array, transb_array, m_array, n_array, k_array, alpha_array, Aarray, lda_array, Barray, ldb_array, beta_array, Carray, ldc_array, group_count, group_size) + + +cdef cublasStatus_t _cublasDgemmGroupedBatched(cublasHandle_t handle, const cublasOperation_t transa_array[], const cublasOperation_t transb_array[], const int m_array[], const int n_array[], const int k_array[], const double alpha_array[], const double* const Aarray[], const int lda_array[], const double* const Barray[], const int ldb_array[], const double beta_array[], double* const Carray[], const int ldc_array[], int group_count, const int group_size[]) except* nogil: + global __cublasDgemmGroupedBatched + _check_or_init_cublas() + if __cublasDgemmGroupedBatched == NULL: + with gil: + raise FunctionNotFoundError("function cublasDgemmGroupedBatched is not found") + return (__cublasDgemmGroupedBatched)( + handle, transa_array, transb_array, m_array, n_array, k_array, alpha_array, Aarray, lda_array, Barray, ldb_array, beta_array, Carray, ldc_array, group_count, group_size) + + +cdef cublasStatus_t _cublasDgemmGroupedBatched_64(cublasHandle_t handle, const cublasOperation_t transa_array[], const cublasOperation_t transb_array[], const int64_t m_array[], const int64_t n_array[], const int64_t k_array[], const double alpha_array[], const double* const Aarray[], const int64_t lda_array[], const double* const Barray[], const int64_t ldb_array[], const double beta_array[], double* const Carray[], const int64_t ldc_array[], int64_t group_count, const int64_t group_size[]) except* nogil: + global __cublasDgemmGroupedBatched_64 + _check_or_init_cublas() + if __cublasDgemmGroupedBatched_64 == NULL: + with gil: + raise FunctionNotFoundError("function cublasDgemmGroupedBatched_64 is not found") + return (__cublasDgemmGroupedBatched_64)( + handle, transa_array, transb_array, m_array, n_array, k_array, alpha_array, Aarray, lda_array, 
Barray, ldb_array, beta_array, Carray, ldc_array, group_count, group_size) + + +cdef cublasStatus_t _cublasGemmGroupedBatchedEx(cublasHandle_t handle, const cublasOperation_t transa_array[], const cublasOperation_t transb_array[], const int m_array[], const int n_array[], const int k_array[], const void* alpha_array, const void* const Aarray[], cudaDataType_t Atype, const int lda_array[], const void* const Barray[], cudaDataType_t Btype, const int ldb_array[], const void* beta_array, void* const Carray[], cudaDataType_t Ctype, const int ldc_array[], int group_count, const int group_size[], cublasComputeType_t computeType) except* nogil: + global __cublasGemmGroupedBatchedEx + _check_or_init_cublas() + if __cublasGemmGroupedBatchedEx == NULL: + with gil: + raise FunctionNotFoundError("function cublasGemmGroupedBatchedEx is not found") + return (__cublasGemmGroupedBatchedEx)( + handle, transa_array, transb_array, m_array, n_array, k_array, alpha_array, Aarray, Atype, lda_array, Barray, Btype, ldb_array, beta_array, Carray, Ctype, ldc_array, group_count, group_size, computeType) + + +cdef cublasStatus_t _cublasGemmGroupedBatchedEx_64(cublasHandle_t handle, const cublasOperation_t transa_array[], const cublasOperation_t transb_array[], const int64_t m_array[], const int64_t n_array[], const int64_t k_array[], const void* alpha_array, const void* const Aarray[], cudaDataType_t Atype, const int64_t lda_array[], const void* const Barray[], cudaDataType_t Btype, const int64_t ldb_array[], const void* beta_array, void* const Carray[], cudaDataType_t Ctype, const int64_t ldc_array[], int64_t group_count, const int64_t group_size[], cublasComputeType_t computeType) except* nogil: + global __cublasGemmGroupedBatchedEx_64 + _check_or_init_cublas() + if __cublasGemmGroupedBatchedEx_64 == NULL: + with gil: + raise FunctionNotFoundError("function cublasGemmGroupedBatchedEx_64 is not found") + return (__cublasGemmGroupedBatchedEx_64)( + handle, transa_array, transb_array, m_array, n_array, k_array, alpha_array, Aarray, Atype, lda_array, Barray, Btype, ldb_array, beta_array, Carray, Ctype, ldc_array, group_count, group_size, computeType) diff --git a/nvmath/bindings/_internal/cublas_windows.pyx b/nvmath/bindings/_internal/cublas_windows.pyx index 6038958..46e2a23 100644 --- a/nvmath/bindings/_internal/cublas_windows.pyx +++ b/nvmath/bindings/_internal/cublas_windows.pyx @@ -524,6 +524,12 @@ cdef void* __cublasSdgmm_64 = NULL cdef void* __cublasDdgmm_64 = NULL cdef void* __cublasCdgmm_64 = NULL cdef void* __cublasZdgmm_64 = NULL +cdef void* __cublasSgemmGroupedBatched = NULL +cdef void* __cublasSgemmGroupedBatched_64 = NULL +cdef void* __cublasDgemmGroupedBatched = NULL +cdef void* __cublasDgemmGroupedBatched_64 = NULL +cdef void* __cublasGemmGroupedBatchedEx = NULL +cdef void* __cublasGemmGroupedBatchedEx_64 = NULL cdef inline list get_site_packages(): @@ -3589,6 +3595,42 @@ cdef int _check_or_init_cublas() except -1 nogil: except: pass + global __cublasSgemmGroupedBatched + try: + __cublasSgemmGroupedBatched = win32api.GetProcAddress(handle, 'cublasSgemmGroupedBatched') + except: + pass + + global __cublasSgemmGroupedBatched_64 + try: + __cublasSgemmGroupedBatched_64 = win32api.GetProcAddress(handle, 'cublasSgemmGroupedBatched_64') + except: + pass + + global __cublasDgemmGroupedBatched + try: + __cublasDgemmGroupedBatched = win32api.GetProcAddress(handle, 'cublasDgemmGroupedBatched') + except: + pass + + global __cublasDgemmGroupedBatched_64 + try: + __cublasDgemmGroupedBatched_64 = 
win32api.GetProcAddress(handle, 'cublasDgemmGroupedBatched_64') + except: + pass + + global __cublasGemmGroupedBatchedEx + try: + __cublasGemmGroupedBatchedEx = win32api.GetProcAddress(handle, 'cublasGemmGroupedBatchedEx') + except: + pass + + global __cublasGemmGroupedBatchedEx_64 + try: + __cublasGemmGroupedBatchedEx_64 = win32api.GetProcAddress(handle, 'cublasGemmGroupedBatchedEx_64') + except: + pass + __py_cublas_init = True return 0 @@ -5098,6 +5140,24 @@ cpdef dict _inspect_function_pointers(): global __cublasZdgmm_64 data["__cublasZdgmm_64"] = __cublasZdgmm_64 + global __cublasSgemmGroupedBatched + data["__cublasSgemmGroupedBatched"] = __cublasSgemmGroupedBatched + + global __cublasSgemmGroupedBatched_64 + data["__cublasSgemmGroupedBatched_64"] = __cublasSgemmGroupedBatched_64 + + global __cublasDgemmGroupedBatched + data["__cublasDgemmGroupedBatched"] = __cublasDgemmGroupedBatched + + global __cublasDgemmGroupedBatched_64 + data["__cublasDgemmGroupedBatched_64"] = __cublasDgemmGroupedBatched_64 + + global __cublasGemmGroupedBatchedEx + data["__cublasGemmGroupedBatchedEx"] = __cublasGemmGroupedBatchedEx + + global __cublasGemmGroupedBatchedEx_64 + data["__cublasGemmGroupedBatchedEx_64"] = __cublasGemmGroupedBatchedEx_64 + func_ptrs = data return data @@ -10091,3 +10151,63 @@ cdef cublasStatus_t _cublasZdgmm_64(cublasHandle_t handle, cublasSideMode_t mode raise FunctionNotFoundError("function cublasZdgmm_64 is not found") return (__cublasZdgmm_64)( handle, mode, m, n, A, lda, x, incx, C, ldc) + + +cdef cublasStatus_t _cublasSgemmGroupedBatched(cublasHandle_t handle, const cublasOperation_t transa_array[], const cublasOperation_t transb_array[], const int m_array[], const int n_array[], const int k_array[], const float alpha_array[], const float* const Aarray[], const int lda_array[], const float* const Barray[], const int ldb_array[], const float beta_array[], float* const Carray[], const int ldc_array[], int group_count, const int group_size[]) except* nogil: + global __cublasSgemmGroupedBatched + _check_or_init_cublas() + if __cublasSgemmGroupedBatched == NULL: + with gil: + raise FunctionNotFoundError("function cublasSgemmGroupedBatched is not found") + return (__cublasSgemmGroupedBatched)( + handle, transa_array, transb_array, m_array, n_array, k_array, alpha_array, Aarray, lda_array, Barray, ldb_array, beta_array, Carray, ldc_array, group_count, group_size) + + +cdef cublasStatus_t _cublasSgemmGroupedBatched_64(cublasHandle_t handle, const cublasOperation_t transa_array[], const cublasOperation_t transb_array[], const int64_t m_array[], const int64_t n_array[], const int64_t k_array[], const float alpha_array[], const float* const Aarray[], const int64_t lda_array[], const float* const Barray[], const int64_t ldb_array[], const float beta_array[], float* const Carray[], const int64_t ldc_array[], int64_t group_count, const int64_t group_size[]) except* nogil: + global __cublasSgemmGroupedBatched_64 + _check_or_init_cublas() + if __cublasSgemmGroupedBatched_64 == NULL: + with gil: + raise FunctionNotFoundError("function cublasSgemmGroupedBatched_64 is not found") + return (__cublasSgemmGroupedBatched_64)( + handle, transa_array, transb_array, m_array, n_array, k_array, alpha_array, Aarray, lda_array, Barray, ldb_array, beta_array, Carray, ldc_array, group_count, group_size) + + +cdef cublasStatus_t _cublasDgemmGroupedBatched(cublasHandle_t handle, const cublasOperation_t transa_array[], const cublasOperation_t transb_array[], const int m_array[], const int n_array[], const int 
k_array[], const double alpha_array[], const double* const Aarray[], const int lda_array[], const double* const Barray[], const int ldb_array[], const double beta_array[], double* const Carray[], const int ldc_array[], int group_count, const int group_size[]) except* nogil: + global __cublasDgemmGroupedBatched + _check_or_init_cublas() + if __cublasDgemmGroupedBatched == NULL: + with gil: + raise FunctionNotFoundError("function cublasDgemmGroupedBatched is not found") + return (__cublasDgemmGroupedBatched)( + handle, transa_array, transb_array, m_array, n_array, k_array, alpha_array, Aarray, lda_array, Barray, ldb_array, beta_array, Carray, ldc_array, group_count, group_size) + + +cdef cublasStatus_t _cublasDgemmGroupedBatched_64(cublasHandle_t handle, const cublasOperation_t transa_array[], const cublasOperation_t transb_array[], const int64_t m_array[], const int64_t n_array[], const int64_t k_array[], const double alpha_array[], const double* const Aarray[], const int64_t lda_array[], const double* const Barray[], const int64_t ldb_array[], const double beta_array[], double* const Carray[], const int64_t ldc_array[], int64_t group_count, const int64_t group_size[]) except* nogil: + global __cublasDgemmGroupedBatched_64 + _check_or_init_cublas() + if __cublasDgemmGroupedBatched_64 == NULL: + with gil: + raise FunctionNotFoundError("function cublasDgemmGroupedBatched_64 is not found") + return (__cublasDgemmGroupedBatched_64)( + handle, transa_array, transb_array, m_array, n_array, k_array, alpha_array, Aarray, lda_array, Barray, ldb_array, beta_array, Carray, ldc_array, group_count, group_size) + + +cdef cublasStatus_t _cublasGemmGroupedBatchedEx(cublasHandle_t handle, const cublasOperation_t transa_array[], const cublasOperation_t transb_array[], const int m_array[], const int n_array[], const int k_array[], const void* alpha_array, const void* const Aarray[], cudaDataType_t Atype, const int lda_array[], const void* const Barray[], cudaDataType_t Btype, const int ldb_array[], const void* beta_array, void* const Carray[], cudaDataType_t Ctype, const int ldc_array[], int group_count, const int group_size[], cublasComputeType_t computeType) except* nogil: + global __cublasGemmGroupedBatchedEx + _check_or_init_cublas() + if __cublasGemmGroupedBatchedEx == NULL: + with gil: + raise FunctionNotFoundError("function cublasGemmGroupedBatchedEx is not found") + return (__cublasGemmGroupedBatchedEx)( + handle, transa_array, transb_array, m_array, n_array, k_array, alpha_array, Aarray, Atype, lda_array, Barray, Btype, ldb_array, beta_array, Carray, Ctype, ldc_array, group_count, group_size, computeType) + + +cdef cublasStatus_t _cublasGemmGroupedBatchedEx_64(cublasHandle_t handle, const cublasOperation_t transa_array[], const cublasOperation_t transb_array[], const int64_t m_array[], const int64_t n_array[], const int64_t k_array[], const void* alpha_array, const void* const Aarray[], cudaDataType_t Atype, const int64_t lda_array[], const void* const Barray[], cudaDataType_t Btype, const int64_t ldb_array[], const void* beta_array, void* const Carray[], cudaDataType_t Ctype, const int64_t ldc_array[], int64_t group_count, const int64_t group_size[], cublasComputeType_t computeType) except* nogil: + global __cublasGemmGroupedBatchedEx_64 + _check_or_init_cublas() + if __cublasGemmGroupedBatchedEx_64 == NULL: + with gil: + raise FunctionNotFoundError("function cublasGemmGroupedBatchedEx_64 is not found") + return (__cublasGemmGroupedBatchedEx_64)( + handle, transa_array, transb_array, m_array, 
n_array, k_array, alpha_array, Aarray, Atype, lda_array, Barray, Btype, ldb_array, beta_array, Carray, Ctype, ldc_array, group_count, group_size, computeType) diff --git a/nvmath/bindings/_internal/cufft.pxd b/nvmath/bindings/_internal/cufft.pxd index 5b013bf..c3f319d 100644 --- a/nvmath/bindings/_internal/cufft.pxd +++ b/nvmath/bindings/_internal/cufft.pxd @@ -64,3 +64,6 @@ cdef cufftResult _cufftXtExecDescriptor(cufftHandle plan, cudaLibXtDesc* input, cdef cufftResult _cufftXtSetWorkAreaPolicy(cufftHandle plan, cufftXtWorkAreaPolicy policy, size_t* workSize) except* nogil cdef cufftResult _cufftXtSetJITCallback(cufftHandle plan, const void* lto_callback_fatbin, size_t lto_callback_fatbin_size, cufftXtCallbackType type, void** caller_info) except* nogil cdef cufftResult _cufftXtSetSubformatDefault(cufftHandle plan, cufftXtSubFormat subformat_forward, cufftXtSubFormat subformat_inverse) except* nogil +cdef cufftResult _cufftSetPlanPropertyInt64(cufftHandle plan, cufftProperty property, const long long int inputValueInt) except* nogil +cdef cufftResult _cufftGetPlanPropertyInt64(cufftHandle plan, cufftProperty property, long long int* returnPtrValue) except* nogil +cdef cufftResult _cufftResetPlanProperty(cufftHandle plan, cufftProperty property) except* nogil diff --git a/nvmath/bindings/_internal/cufft_linux.pyx b/nvmath/bindings/_internal/cufft_linux.pyx index beaa8c9..7d358a9 100644 --- a/nvmath/bindings/_internal/cufft_linux.pyx +++ b/nvmath/bindings/_internal/cufft_linux.pyx @@ -90,6 +90,9 @@ cdef void* __cufftXtExecDescriptor = NULL cdef void* __cufftXtSetWorkAreaPolicy = NULL cdef void* __cufftXtSetJITCallback = NULL cdef void* __cufftXtSetSubformatDefault = NULL +cdef void* __cufftSetPlanPropertyInt64 = NULL +cdef void* __cufftGetPlanPropertyInt64 = NULL +cdef void* __cufftResetPlanProperty = NULL cdef void* load_library(const int driver_ver) except* with gil: @@ -503,6 +506,27 @@ cdef int _check_or_init_cufft() except -1 nogil: handle = load_library(driver_ver) __cufftXtSetSubformatDefault = dlsym(handle, 'cufftXtSetSubformatDefault') + global __cufftSetPlanPropertyInt64 + __cufftSetPlanPropertyInt64 = dlsym(RTLD_DEFAULT, 'cufftSetPlanPropertyInt64') + if __cufftSetPlanPropertyInt64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cufftSetPlanPropertyInt64 = dlsym(handle, 'cufftSetPlanPropertyInt64') + + global __cufftGetPlanPropertyInt64 + __cufftGetPlanPropertyInt64 = dlsym(RTLD_DEFAULT, 'cufftGetPlanPropertyInt64') + if __cufftGetPlanPropertyInt64 == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cufftGetPlanPropertyInt64 = dlsym(handle, 'cufftGetPlanPropertyInt64') + + global __cufftResetPlanProperty + __cufftResetPlanProperty = dlsym(RTLD_DEFAULT, 'cufftResetPlanProperty') + if __cufftResetPlanProperty == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cufftResetPlanProperty = dlsym(handle, 'cufftResetPlanProperty') + __py_cufft_init = True return 0 @@ -677,6 +701,15 @@ cpdef dict _inspect_function_pointers(): global __cufftXtSetSubformatDefault data["__cufftXtSetSubformatDefault"] = __cufftXtSetSubformatDefault + global __cufftSetPlanPropertyInt64 + data["__cufftSetPlanPropertyInt64"] = __cufftSetPlanPropertyInt64 + + global __cufftGetPlanPropertyInt64 + data["__cufftGetPlanPropertyInt64"] = __cufftGetPlanPropertyInt64 + + global __cufftResetPlanProperty + data["__cufftResetPlanProperty"] = __cufftResetPlanProperty + func_ptrs = data return data @@ -1220,3 +1253,33 @@ cdef cufftResult 
_cufftXtSetSubformatDefault(cufftHandle plan, cufftXtSubFormat raise FunctionNotFoundError("function cufftXtSetSubformatDefault is not found") return (__cufftXtSetSubformatDefault)( plan, subformat_forward, subformat_inverse) + + +cdef cufftResult _cufftSetPlanPropertyInt64(cufftHandle plan, cufftProperty property, const long long int inputValueInt) except* nogil: + global __cufftSetPlanPropertyInt64 + _check_or_init_cufft() + if __cufftSetPlanPropertyInt64 == NULL: + with gil: + raise FunctionNotFoundError("function cufftSetPlanPropertyInt64 is not found") + return (__cufftSetPlanPropertyInt64)( + plan, property, inputValueInt) + + +cdef cufftResult _cufftGetPlanPropertyInt64(cufftHandle plan, cufftProperty property, long long int* returnPtrValue) except* nogil: + global __cufftGetPlanPropertyInt64 + _check_or_init_cufft() + if __cufftGetPlanPropertyInt64 == NULL: + with gil: + raise FunctionNotFoundError("function cufftGetPlanPropertyInt64 is not found") + return (__cufftGetPlanPropertyInt64)( + plan, property, returnPtrValue) + + +cdef cufftResult _cufftResetPlanProperty(cufftHandle plan, cufftProperty property) except* nogil: + global __cufftResetPlanProperty + _check_or_init_cufft() + if __cufftResetPlanProperty == NULL: + with gil: + raise FunctionNotFoundError("function cufftResetPlanProperty is not found") + return (__cufftResetPlanProperty)( + plan, property) diff --git a/nvmath/bindings/_internal/cufft_windows.pyx b/nvmath/bindings/_internal/cufft_windows.pyx index 1c37ff1..d732860 100644 --- a/nvmath/bindings/_internal/cufft_windows.pyx +++ b/nvmath/bindings/_internal/cufft_windows.pyx @@ -79,6 +79,9 @@ cdef void* __cufftXtExecDescriptor = NULL cdef void* __cufftXtSetWorkAreaPolicy = NULL cdef void* __cufftXtSetJITCallback = NULL cdef void* __cufftXtSetSubformatDefault = NULL +cdef void* __cufftSetPlanPropertyInt64 = NULL +cdef void* __cufftGetPlanPropertyInt64 = NULL +cdef void* __cufftResetPlanProperty = NULL cdef inline list get_site_packages(): @@ -474,6 +477,24 @@ cdef int _check_or_init_cufft() except -1 nogil: except: pass + global __cufftSetPlanPropertyInt64 + try: + __cufftSetPlanPropertyInt64 = win32api.GetProcAddress(handle, 'cufftSetPlanPropertyInt64') + except: + pass + + global __cufftGetPlanPropertyInt64 + try: + __cufftGetPlanPropertyInt64 = win32api.GetProcAddress(handle, 'cufftGetPlanPropertyInt64') + except: + pass + + global __cufftResetPlanProperty + try: + __cufftResetPlanProperty = win32api.GetProcAddress(handle, 'cufftResetPlanProperty') + except: + pass + __py_cufft_init = True return 0 @@ -648,6 +669,15 @@ cpdef dict _inspect_function_pointers(): global __cufftXtSetSubformatDefault data["__cufftXtSetSubformatDefault"] = __cufftXtSetSubformatDefault + global __cufftSetPlanPropertyInt64 + data["__cufftSetPlanPropertyInt64"] = __cufftSetPlanPropertyInt64 + + global __cufftGetPlanPropertyInt64 + data["__cufftGetPlanPropertyInt64"] = __cufftGetPlanPropertyInt64 + + global __cufftResetPlanProperty + data["__cufftResetPlanProperty"] = __cufftResetPlanProperty + func_ptrs = data return data @@ -1191,3 +1221,33 @@ cdef cufftResult _cufftXtSetSubformatDefault(cufftHandle plan, cufftXtSubFormat raise FunctionNotFoundError("function cufftXtSetSubformatDefault is not found") return (__cufftXtSetSubformatDefault)( plan, subformat_forward, subformat_inverse) + + +cdef cufftResult _cufftSetPlanPropertyInt64(cufftHandle plan, cufftProperty property, const long long int inputValueInt) except* nogil: + global __cufftSetPlanPropertyInt64 + _check_or_init_cufft() + if 
__cufftSetPlanPropertyInt64 == NULL: + with gil: + raise FunctionNotFoundError("function cufftSetPlanPropertyInt64 is not found") + return (__cufftSetPlanPropertyInt64)( + plan, property, inputValueInt) + + +cdef cufftResult _cufftGetPlanPropertyInt64(cufftHandle plan, cufftProperty property, long long int* returnPtrValue) except* nogil: + global __cufftGetPlanPropertyInt64 + _check_or_init_cufft() + if __cufftGetPlanPropertyInt64 == NULL: + with gil: + raise FunctionNotFoundError("function cufftGetPlanPropertyInt64 is not found") + return (__cufftGetPlanPropertyInt64)( + plan, property, returnPtrValue) + + +cdef cufftResult _cufftResetPlanProperty(cufftHandle plan, cufftProperty property) except* nogil: + global __cufftResetPlanProperty + _check_or_init_cufft() + if __cufftResetPlanProperty == NULL: + with gil: + raise FunctionNotFoundError("function cufftResetPlanProperty is not found") + return (__cufftResetPlanProperty)( + plan, property) diff --git a/nvmath/bindings/_internal/cusolverDn.pxd b/nvmath/bindings/_internal/cusolverDn.pxd index 3ce18f3..49fe8ca 100644 --- a/nvmath/bindings/_internal/cusolverDn.pxd +++ b/nvmath/bindings/_internal/cusolverDn.pxd @@ -378,3 +378,9 @@ cdef cusolverStatus_t _cusolverDnLoggerSetMask(int mask) except* nogil cdef cusolverStatus_t _cusolverDnLoggerForceDisable() except* nogil cdef cusolverStatus_t _cusolverDnSetDeterministicMode(cusolverDnHandle_t handle, cusolverDeterministicMode_t mode) except* nogil cdef cusolverStatus_t _cusolverDnGetDeterministicMode(cusolverDnHandle_t handle, cusolverDeterministicMode_t* mode) except* nogil +cdef cusolverStatus_t _cusolverDnXlarft_bufferSize(cusolverDnHandle_t handle, cusolverDnParams_t params, cusolverDirectMode_t direct, cusolverStorevMode_t storev, int64_t n, int64_t k, cudaDataType dataTypeV, const void* V, int64_t ldv, cudaDataType dataTypeTau, const void* tau, cudaDataType dataTypeT, void* T, int64_t ldt, cudaDataType computeType, size_t* workspaceInBytesOnDevice, size_t* workspaceInBytesOnHost) except* nogil +cdef cusolverStatus_t _cusolverDnXlarft(cusolverDnHandle_t handle, cusolverDnParams_t params, cusolverDirectMode_t direct, cusolverStorevMode_t storev, int64_t n, int64_t k, cudaDataType dataTypeV, const void* V, int64_t ldv, cudaDataType dataTypeTau, const void* tau, cudaDataType dataTypeT, void* T, int64_t ldt, cudaDataType computeType, void* bufferOnDevice, size_t workspaceInBytesOnDevice, void* bufferOnHost, size_t workspaceInBytesOnHost) except* nogil +cdef cusolverStatus_t _cusolverDnXsyevBatched_bufferSize(cusolverDnHandle_t handle, cusolverDnParams_t params, cusolverEigMode_t jobz, cublasFillMode_t uplo, int64_t n, cudaDataType dataTypeA, const void* A, int64_t lda, cudaDataType dataTypeW, const void* W, cudaDataType computeType, size_t* workspaceInBytesOnDevice, size_t* workspaceInBytesOnHost, int64_t batchSize) except* nogil +cdef cusolverStatus_t _cusolverDnXsyevBatched(cusolverDnHandle_t handle, cusolverDnParams_t params, cusolverEigMode_t jobz, cublasFillMode_t uplo, int64_t n, cudaDataType dataTypeA, void* A, int64_t lda, cudaDataType dataTypeW, void* W, cudaDataType computeType, void* bufferOnDevice, size_t workspaceInBytesOnDevice, void* bufferOnHost, size_t workspaceInBytesOnHost, int* info, int64_t batchSize) except* nogil +cdef cusolverStatus_t _cusolverDnXgeev_bufferSize(cusolverDnHandle_t handle, cusolverDnParams_t params, cusolverEigMode_t jobvl, cusolverEigMode_t jobvr, int64_t n, cudaDataType dataTypeA, const void* A, int64_t lda, cudaDataType dataTypeW, const void* W, 
cudaDataType dataTypeVL, const void* VL, int64_t ldvl, cudaDataType dataTypeVR, const void* VR, int64_t ldvr, cudaDataType computeType, size_t* workspaceInBytesOnDevice, size_t* workspaceInBytesOnHost) except* nogil +cdef cusolverStatus_t _cusolverDnXgeev(cusolverDnHandle_t handle, cusolverDnParams_t params, cusolverEigMode_t jobvl, cusolverEigMode_t jobvr, int64_t n, cudaDataType dataTypeA, void* A, int64_t lda, cudaDataType dataTypeW, void* W, cudaDataType dataTypeVL, void* VL, int64_t ldvl, cudaDataType dataTypeVR, void* VR, int64_t ldvr, cudaDataType computeType, void* bufferOnDevice, size_t workspaceInBytesOnDevice, void* bufferOnHost, size_t workspaceInBytesOnHost, int* info) except* nogil diff --git a/nvmath/bindings/_internal/cusolverDn_linux.pyx b/nvmath/bindings/_internal/cusolverDn_linux.pyx index 15f836f..c99ca26 100644 --- a/nvmath/bindings/_internal/cusolverDn_linux.pyx +++ b/nvmath/bindings/_internal/cusolverDn_linux.pyx @@ -404,6 +404,12 @@ cdef void* __cusolverDnLoggerSetMask = NULL cdef void* __cusolverDnLoggerForceDisable = NULL cdef void* __cusolverDnSetDeterministicMode = NULL cdef void* __cusolverDnGetDeterministicMode = NULL +cdef void* __cusolverDnXlarft_bufferSize = NULL +cdef void* __cusolverDnXlarft = NULL +cdef void* __cusolverDnXsyevBatched_bufferSize = NULL +cdef void* __cusolverDnXsyevBatched = NULL +cdef void* __cusolverDnXgeev_bufferSize = NULL +cdef void* __cusolverDnXgeev = NULL cdef void* load_library(const int driver_ver) except* with gil: @@ -3015,6 +3021,48 @@ cdef int _check_or_init_cusolverDn() except -1 nogil: handle = load_library(driver_ver) __cusolverDnGetDeterministicMode = dlsym(handle, 'cusolverDnGetDeterministicMode') + global __cusolverDnXlarft_bufferSize + __cusolverDnXlarft_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnXlarft_bufferSize') + if __cusolverDnXlarft_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnXlarft_bufferSize = dlsym(handle, 'cusolverDnXlarft_bufferSize') + + global __cusolverDnXlarft + __cusolverDnXlarft = dlsym(RTLD_DEFAULT, 'cusolverDnXlarft') + if __cusolverDnXlarft == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnXlarft = dlsym(handle, 'cusolverDnXlarft') + + global __cusolverDnXsyevBatched_bufferSize + __cusolverDnXsyevBatched_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnXsyevBatched_bufferSize') + if __cusolverDnXsyevBatched_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnXsyevBatched_bufferSize = dlsym(handle, 'cusolverDnXsyevBatched_bufferSize') + + global __cusolverDnXsyevBatched + __cusolverDnXsyevBatched = dlsym(RTLD_DEFAULT, 'cusolverDnXsyevBatched') + if __cusolverDnXsyevBatched == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnXsyevBatched = dlsym(handle, 'cusolverDnXsyevBatched') + + global __cusolverDnXgeev_bufferSize + __cusolverDnXgeev_bufferSize = dlsym(RTLD_DEFAULT, 'cusolverDnXgeev_bufferSize') + if __cusolverDnXgeev_bufferSize == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnXgeev_bufferSize = dlsym(handle, 'cusolverDnXgeev_bufferSize') + + global __cusolverDnXgeev + __cusolverDnXgeev = dlsym(RTLD_DEFAULT, 'cusolverDnXgeev') + if __cusolverDnXgeev == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusolverDnXgeev = dlsym(handle, 'cusolverDnXgeev') + __py_cusolverDn_init = True return 0 @@ -4131,6 +4179,24 @@ cpdef dict _inspect_function_pointers(): global __cusolverDnGetDeterministicMode 
data["__cusolverDnGetDeterministicMode"] = __cusolverDnGetDeterministicMode + global __cusolverDnXlarft_bufferSize + data["__cusolverDnXlarft_bufferSize"] = __cusolverDnXlarft_bufferSize + + global __cusolverDnXlarft + data["__cusolverDnXlarft"] = __cusolverDnXlarft + + global __cusolverDnXsyevBatched_bufferSize + data["__cusolverDnXsyevBatched_bufferSize"] = __cusolverDnXsyevBatched_bufferSize + + global __cusolverDnXsyevBatched + data["__cusolverDnXsyevBatched"] = __cusolverDnXsyevBatched + + global __cusolverDnXgeev_bufferSize + data["__cusolverDnXgeev_bufferSize"] = __cusolverDnXgeev_bufferSize + + global __cusolverDnXgeev + data["__cusolverDnXgeev"] = __cusolverDnXgeev + func_ptrs = data return data @@ -7814,3 +7880,63 @@ cdef cusolverStatus_t _cusolverDnGetDeterministicMode(cusolverDnHandle_t handle, raise FunctionNotFoundError("function cusolverDnGetDeterministicMode is not found") return (__cusolverDnGetDeterministicMode)( handle, mode) + + +cdef cusolverStatus_t _cusolverDnXlarft_bufferSize(cusolverDnHandle_t handle, cusolverDnParams_t params, cusolverDirectMode_t direct, cusolverStorevMode_t storev, int64_t n, int64_t k, cudaDataType dataTypeV, const void* V, int64_t ldv, cudaDataType dataTypeTau, const void* tau, cudaDataType dataTypeT, void* T, int64_t ldt, cudaDataType computeType, size_t* workspaceInBytesOnDevice, size_t* workspaceInBytesOnHost) except* nogil: + global __cusolverDnXlarft_bufferSize + _check_or_init_cusolverDn() + if __cusolverDnXlarft_bufferSize == NULL: + with gil: + raise FunctionNotFoundError("function cusolverDnXlarft_bufferSize is not found") + return (__cusolverDnXlarft_bufferSize)( + handle, params, direct, storev, n, k, dataTypeV, V, ldv, dataTypeTau, tau, dataTypeT, T, ldt, computeType, workspaceInBytesOnDevice, workspaceInBytesOnHost) + + +cdef cusolverStatus_t _cusolverDnXlarft(cusolverDnHandle_t handle, cusolverDnParams_t params, cusolverDirectMode_t direct, cusolverStorevMode_t storev, int64_t n, int64_t k, cudaDataType dataTypeV, const void* V, int64_t ldv, cudaDataType dataTypeTau, const void* tau, cudaDataType dataTypeT, void* T, int64_t ldt, cudaDataType computeType, void* bufferOnDevice, size_t workspaceInBytesOnDevice, void* bufferOnHost, size_t workspaceInBytesOnHost) except* nogil: + global __cusolverDnXlarft + _check_or_init_cusolverDn() + if __cusolverDnXlarft == NULL: + with gil: + raise FunctionNotFoundError("function cusolverDnXlarft is not found") + return (__cusolverDnXlarft)( + handle, params, direct, storev, n, k, dataTypeV, V, ldv, dataTypeTau, tau, dataTypeT, T, ldt, computeType, bufferOnDevice, workspaceInBytesOnDevice, bufferOnHost, workspaceInBytesOnHost) + + +cdef cusolverStatus_t _cusolverDnXsyevBatched_bufferSize(cusolverDnHandle_t handle, cusolverDnParams_t params, cusolverEigMode_t jobz, cublasFillMode_t uplo, int64_t n, cudaDataType dataTypeA, const void* A, int64_t lda, cudaDataType dataTypeW, const void* W, cudaDataType computeType, size_t* workspaceInBytesOnDevice, size_t* workspaceInBytesOnHost, int64_t batchSize) except* nogil: + global __cusolverDnXsyevBatched_bufferSize + _check_or_init_cusolverDn() + if __cusolverDnXsyevBatched_bufferSize == NULL: + with gil: + raise FunctionNotFoundError("function cusolverDnXsyevBatched_bufferSize is not found") + return (__cusolverDnXsyevBatched_bufferSize)( + handle, params, jobz, uplo, n, dataTypeA, A, lda, dataTypeW, W, computeType, workspaceInBytesOnDevice, workspaceInBytesOnHost, batchSize) + + +cdef cusolverStatus_t _cusolverDnXsyevBatched(cusolverDnHandle_t handle, 
cusolverDnParams_t params, cusolverEigMode_t jobz, cublasFillMode_t uplo, int64_t n, cudaDataType dataTypeA, void* A, int64_t lda, cudaDataType dataTypeW, void* W, cudaDataType computeType, void* bufferOnDevice, size_t workspaceInBytesOnDevice, void* bufferOnHost, size_t workspaceInBytesOnHost, int* info, int64_t batchSize) except* nogil: + global __cusolverDnXsyevBatched + _check_or_init_cusolverDn() + if __cusolverDnXsyevBatched == NULL: + with gil: + raise FunctionNotFoundError("function cusolverDnXsyevBatched is not found") + return (__cusolverDnXsyevBatched)( + handle, params, jobz, uplo, n, dataTypeA, A, lda, dataTypeW, W, computeType, bufferOnDevice, workspaceInBytesOnDevice, bufferOnHost, workspaceInBytesOnHost, info, batchSize) + + +cdef cusolverStatus_t _cusolverDnXgeev_bufferSize(cusolverDnHandle_t handle, cusolverDnParams_t params, cusolverEigMode_t jobvl, cusolverEigMode_t jobvr, int64_t n, cudaDataType dataTypeA, const void* A, int64_t lda, cudaDataType dataTypeW, const void* W, cudaDataType dataTypeVL, const void* VL, int64_t ldvl, cudaDataType dataTypeVR, const void* VR, int64_t ldvr, cudaDataType computeType, size_t* workspaceInBytesOnDevice, size_t* workspaceInBytesOnHost) except* nogil: + global __cusolverDnXgeev_bufferSize + _check_or_init_cusolverDn() + if __cusolverDnXgeev_bufferSize == NULL: + with gil: + raise FunctionNotFoundError("function cusolverDnXgeev_bufferSize is not found") + return (__cusolverDnXgeev_bufferSize)( + handle, params, jobvl, jobvr, n, dataTypeA, A, lda, dataTypeW, W, dataTypeVL, VL, ldvl, dataTypeVR, VR, ldvr, computeType, workspaceInBytesOnDevice, workspaceInBytesOnHost) + + +cdef cusolverStatus_t _cusolverDnXgeev(cusolverDnHandle_t handle, cusolverDnParams_t params, cusolverEigMode_t jobvl, cusolverEigMode_t jobvr, int64_t n, cudaDataType dataTypeA, void* A, int64_t lda, cudaDataType dataTypeW, void* W, cudaDataType dataTypeVL, void* VL, int64_t ldvl, cudaDataType dataTypeVR, void* VR, int64_t ldvr, cudaDataType computeType, void* bufferOnDevice, size_t workspaceInBytesOnDevice, void* bufferOnHost, size_t workspaceInBytesOnHost, int* info) except* nogil: + global __cusolverDnXgeev + _check_or_init_cusolverDn() + if __cusolverDnXgeev == NULL: + with gil: + raise FunctionNotFoundError("function cusolverDnXgeev is not found") + return (__cusolverDnXgeev)( + handle, params, jobvl, jobvr, n, dataTypeA, A, lda, dataTypeW, W, dataTypeVL, VL, ldvl, dataTypeVR, VR, ldvr, computeType, bufferOnDevice, workspaceInBytesOnDevice, bufferOnHost, workspaceInBytesOnHost, info) diff --git a/nvmath/bindings/_internal/cusolverDn_windows.pyx b/nvmath/bindings/_internal/cusolverDn_windows.pyx index 02115ac..f705fa6 100644 --- a/nvmath/bindings/_internal/cusolverDn_windows.pyx +++ b/nvmath/bindings/_internal/cusolverDn_windows.pyx @@ -395,6 +395,12 @@ cdef void* __cusolverDnLoggerSetMask = NULL cdef void* __cusolverDnLoggerForceDisable = NULL cdef void* __cusolverDnSetDeterministicMode = NULL cdef void* __cusolverDnGetDeterministicMode = NULL +cdef void* __cusolverDnXlarft_bufferSize = NULL +cdef void* __cusolverDnXlarft = NULL +cdef void* __cusolverDnXsyevBatched_bufferSize = NULL +cdef void* __cusolverDnXsyevBatched = NULL +cdef void* __cusolverDnXgeev_bufferSize = NULL +cdef void* __cusolverDnXgeev = NULL cdef inline list get_site_packages(): @@ -2679,6 +2685,42 @@ cdef int _check_or_init_cusolverDn() except -1 nogil: except: pass + global __cusolverDnXlarft_bufferSize + try: + __cusolverDnXlarft_bufferSize = win32api.GetProcAddress(handle, 
'cusolverDnXlarft_bufferSize') + except: + pass + + global __cusolverDnXlarft + try: + __cusolverDnXlarft = win32api.GetProcAddress(handle, 'cusolverDnXlarft') + except: + pass + + global __cusolverDnXsyevBatched_bufferSize + try: + __cusolverDnXsyevBatched_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnXsyevBatched_bufferSize') + except: + pass + + global __cusolverDnXsyevBatched + try: + __cusolverDnXsyevBatched = win32api.GetProcAddress(handle, 'cusolverDnXsyevBatched') + except: + pass + + global __cusolverDnXgeev_bufferSize + try: + __cusolverDnXgeev_bufferSize = win32api.GetProcAddress(handle, 'cusolverDnXgeev_bufferSize') + except: + pass + + global __cusolverDnXgeev + try: + __cusolverDnXgeev = win32api.GetProcAddress(handle, 'cusolverDnXgeev') + except: + pass + __py_cusolverDn_init = True return 0 @@ -3795,6 +3837,24 @@ cpdef dict _inspect_function_pointers(): global __cusolverDnGetDeterministicMode data["__cusolverDnGetDeterministicMode"] = __cusolverDnGetDeterministicMode + global __cusolverDnXlarft_bufferSize + data["__cusolverDnXlarft_bufferSize"] = __cusolverDnXlarft_bufferSize + + global __cusolverDnXlarft + data["__cusolverDnXlarft"] = __cusolverDnXlarft + + global __cusolverDnXsyevBatched_bufferSize + data["__cusolverDnXsyevBatched_bufferSize"] = __cusolverDnXsyevBatched_bufferSize + + global __cusolverDnXsyevBatched + data["__cusolverDnXsyevBatched"] = __cusolverDnXsyevBatched + + global __cusolverDnXgeev_bufferSize + data["__cusolverDnXgeev_bufferSize"] = __cusolverDnXgeev_bufferSize + + global __cusolverDnXgeev + data["__cusolverDnXgeev"] = __cusolverDnXgeev + func_ptrs = data return data @@ -7478,3 +7538,63 @@ cdef cusolverStatus_t _cusolverDnGetDeterministicMode(cusolverDnHandle_t handle, raise FunctionNotFoundError("function cusolverDnGetDeterministicMode is not found") return (__cusolverDnGetDeterministicMode)( handle, mode) + + +cdef cusolverStatus_t _cusolverDnXlarft_bufferSize(cusolverDnHandle_t handle, cusolverDnParams_t params, cusolverDirectMode_t direct, cusolverStorevMode_t storev, int64_t n, int64_t k, cudaDataType dataTypeV, const void* V, int64_t ldv, cudaDataType dataTypeTau, const void* tau, cudaDataType dataTypeT, void* T, int64_t ldt, cudaDataType computeType, size_t* workspaceInBytesOnDevice, size_t* workspaceInBytesOnHost) except* nogil: + global __cusolverDnXlarft_bufferSize + _check_or_init_cusolverDn() + if __cusolverDnXlarft_bufferSize == NULL: + with gil: + raise FunctionNotFoundError("function cusolverDnXlarft_bufferSize is not found") + return (__cusolverDnXlarft_bufferSize)( + handle, params, direct, storev, n, k, dataTypeV, V, ldv, dataTypeTau, tau, dataTypeT, T, ldt, computeType, workspaceInBytesOnDevice, workspaceInBytesOnHost) + + +cdef cusolverStatus_t _cusolverDnXlarft(cusolverDnHandle_t handle, cusolverDnParams_t params, cusolverDirectMode_t direct, cusolverStorevMode_t storev, int64_t n, int64_t k, cudaDataType dataTypeV, const void* V, int64_t ldv, cudaDataType dataTypeTau, const void* tau, cudaDataType dataTypeT, void* T, int64_t ldt, cudaDataType computeType, void* bufferOnDevice, size_t workspaceInBytesOnDevice, void* bufferOnHost, size_t workspaceInBytesOnHost) except* nogil: + global __cusolverDnXlarft + _check_or_init_cusolverDn() + if __cusolverDnXlarft == NULL: + with gil: + raise FunctionNotFoundError("function cusolverDnXlarft is not found") + return (__cusolverDnXlarft)( + handle, params, direct, storev, n, k, dataTypeV, V, ldv, dataTypeTau, tau, dataTypeT, T, ldt, computeType, bufferOnDevice, 
workspaceInBytesOnDevice, bufferOnHost, workspaceInBytesOnHost) + + +cdef cusolverStatus_t _cusolverDnXsyevBatched_bufferSize(cusolverDnHandle_t handle, cusolverDnParams_t params, cusolverEigMode_t jobz, cublasFillMode_t uplo, int64_t n, cudaDataType dataTypeA, const void* A, int64_t lda, cudaDataType dataTypeW, const void* W, cudaDataType computeType, size_t* workspaceInBytesOnDevice, size_t* workspaceInBytesOnHost, int64_t batchSize) except* nogil: + global __cusolverDnXsyevBatched_bufferSize + _check_or_init_cusolverDn() + if __cusolverDnXsyevBatched_bufferSize == NULL: + with gil: + raise FunctionNotFoundError("function cusolverDnXsyevBatched_bufferSize is not found") + return (__cusolverDnXsyevBatched_bufferSize)( + handle, params, jobz, uplo, n, dataTypeA, A, lda, dataTypeW, W, computeType, workspaceInBytesOnDevice, workspaceInBytesOnHost, batchSize) + + +cdef cusolverStatus_t _cusolverDnXsyevBatched(cusolverDnHandle_t handle, cusolverDnParams_t params, cusolverEigMode_t jobz, cublasFillMode_t uplo, int64_t n, cudaDataType dataTypeA, void* A, int64_t lda, cudaDataType dataTypeW, void* W, cudaDataType computeType, void* bufferOnDevice, size_t workspaceInBytesOnDevice, void* bufferOnHost, size_t workspaceInBytesOnHost, int* info, int64_t batchSize) except* nogil: + global __cusolverDnXsyevBatched + _check_or_init_cusolverDn() + if __cusolverDnXsyevBatched == NULL: + with gil: + raise FunctionNotFoundError("function cusolverDnXsyevBatched is not found") + return (__cusolverDnXsyevBatched)( + handle, params, jobz, uplo, n, dataTypeA, A, lda, dataTypeW, W, computeType, bufferOnDevice, workspaceInBytesOnDevice, bufferOnHost, workspaceInBytesOnHost, info, batchSize) + + +cdef cusolverStatus_t _cusolverDnXgeev_bufferSize(cusolverDnHandle_t handle, cusolverDnParams_t params, cusolverEigMode_t jobvl, cusolverEigMode_t jobvr, int64_t n, cudaDataType dataTypeA, const void* A, int64_t lda, cudaDataType dataTypeW, const void* W, cudaDataType dataTypeVL, const void* VL, int64_t ldvl, cudaDataType dataTypeVR, const void* VR, int64_t ldvr, cudaDataType computeType, size_t* workspaceInBytesOnDevice, size_t* workspaceInBytesOnHost) except* nogil: + global __cusolverDnXgeev_bufferSize + _check_or_init_cusolverDn() + if __cusolverDnXgeev_bufferSize == NULL: + with gil: + raise FunctionNotFoundError("function cusolverDnXgeev_bufferSize is not found") + return (__cusolverDnXgeev_bufferSize)( + handle, params, jobvl, jobvr, n, dataTypeA, A, lda, dataTypeW, W, dataTypeVL, VL, ldvl, dataTypeVR, VR, ldvr, computeType, workspaceInBytesOnDevice, workspaceInBytesOnHost) + + +cdef cusolverStatus_t _cusolverDnXgeev(cusolverDnHandle_t handle, cusolverDnParams_t params, cusolverEigMode_t jobvl, cusolverEigMode_t jobvr, int64_t n, cudaDataType dataTypeA, void* A, int64_t lda, cudaDataType dataTypeW, void* W, cudaDataType dataTypeVL, void* VL, int64_t ldvl, cudaDataType dataTypeVR, void* VR, int64_t ldvr, cudaDataType computeType, void* bufferOnDevice, size_t workspaceInBytesOnDevice, void* bufferOnHost, size_t workspaceInBytesOnHost, int* info) except* nogil: + global __cusolverDnXgeev + _check_or_init_cusolverDn() + if __cusolverDnXgeev == NULL: + with gil: + raise FunctionNotFoundError("function cusolverDnXgeev is not found") + return (__cusolverDnXgeev)( + handle, params, jobvl, jobvr, n, dataTypeA, A, lda, dataTypeW, W, dataTypeVL, VL, ldvl, dataTypeVR, VR, ldvr, computeType, bufferOnDevice, workspaceInBytesOnDevice, bufferOnHost, workspaceInBytesOnHost, info) diff --git a/nvmath/bindings/_internal/cusparse.pxd 
b/nvmath/bindings/_internal/cusparse.pxd index a98fb42..8f8025a 100644 --- a/nvmath/bindings/_internal/cusparse.pxd +++ b/nvmath/bindings/_internal/cusparse.pxd @@ -268,3 +268,4 @@ cdef cusparseStatus_t _cusparseCreateSlicedEll(cusparseSpMatDescr_t* spMatDescr, cdef cusparseStatus_t _cusparseCreateConstSlicedEll(cusparseConstSpMatDescr_t* spMatDescr, int64_t rows, int64_t cols, int64_t nnz, int64_t sellValuesSize, int64_t sliceSize, const void* sellSliceOffsets, const void* sellColInd, const void* sellValues, cusparseIndexType_t sellSliceOffsetsType, cusparseIndexType_t sellColIndType, cusparseIndexBase_t idxBase, cudaDataType valueType) except* nogil cdef cusparseStatus_t _cusparseSpSV_updateMatrix(cusparseHandle_t handle, cusparseSpSVDescr_t spsvDescr, void* newValues, cusparseSpSVUpdate_t updatePart) except* nogil cdef cusparseStatus_t _cusparseSpMV_preprocess(cusparseHandle_t handle, cusparseOperation_t opA, const void* alpha, cusparseConstSpMatDescr_t matA, cusparseConstDnVecDescr_t vecX, const void* beta, cusparseDnVecDescr_t vecY, cudaDataType computeType, cusparseSpMVAlg_t alg, void* externalBuffer) except* nogil +cdef cusparseStatus_t _cusparseSpSM_updateMatrix(cusparseHandle_t handle, cusparseSpSMDescr_t spsmDescr, void* newValues, cusparseSpSMUpdate_t updatePart) except* nogil diff --git a/nvmath/bindings/_internal/cusparse_linux.pyx b/nvmath/bindings/_internal/cusparse_linux.pyx index 7ef7ed1..41e25c8 100644 --- a/nvmath/bindings/_internal/cusparse_linux.pyx +++ b/nvmath/bindings/_internal/cusparse_linux.pyx @@ -292,6 +292,7 @@ cdef void* __cusparseCreateSlicedEll = NULL cdef void* __cusparseCreateConstSlicedEll = NULL cdef void* __cusparseSpSV_updateMatrix = NULL cdef void* __cusparseSpMV_preprocess = NULL +cdef void* __cusparseSpSM_updateMatrix = NULL cdef void* load_library(const int driver_ver) except* with gil: @@ -2119,6 +2120,13 @@ cdef int _check_or_init_cusparse() except -1 nogil: handle = load_library(driver_ver) __cusparseSpMV_preprocess = dlsym(handle, 'cusparseSpMV_preprocess') + global __cusparseSpSM_updateMatrix + __cusparseSpSM_updateMatrix = dlsym(RTLD_DEFAULT, 'cusparseSpSM_updateMatrix') + if __cusparseSpSM_updateMatrix == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cusparseSpSM_updateMatrix = dlsym(handle, 'cusparseSpSM_updateMatrix') + __py_cusparse_init = True return 0 @@ -2899,6 +2907,9 @@ cpdef dict _inspect_function_pointers(): global __cusparseSpMV_preprocess data["__cusparseSpMV_preprocess"] = __cusparseSpMV_preprocess + global __cusparseSpSM_updateMatrix + data["__cusparseSpSM_updateMatrix"] = __cusparseSpSM_updateMatrix + func_ptrs = data return data @@ -5462,3 +5473,13 @@ cdef cusparseStatus_t _cusparseSpMV_preprocess(cusparseHandle_t handle, cusparse raise FunctionNotFoundError("function cusparseSpMV_preprocess is not found") return (__cusparseSpMV_preprocess)( handle, opA, alpha, matA, vecX, beta, vecY, computeType, alg, externalBuffer) + + +cdef cusparseStatus_t _cusparseSpSM_updateMatrix(cusparseHandle_t handle, cusparseSpSMDescr_t spsmDescr, void* newValues, cusparseSpSMUpdate_t updatePart) except* nogil: + global __cusparseSpSM_updateMatrix + _check_or_init_cusparse() + if __cusparseSpSM_updateMatrix == NULL: + with gil: + raise FunctionNotFoundError("function cusparseSpSM_updateMatrix is not found") + return (__cusparseSpSM_updateMatrix)( + handle, spsmDescr, newValues, updatePart) diff --git a/nvmath/bindings/_internal/cusparse_windows.pyx b/nvmath/bindings/_internal/cusparse_windows.pyx index 306d941..2601c03 100644 
--- a/nvmath/bindings/_internal/cusparse_windows.pyx +++ b/nvmath/bindings/_internal/cusparse_windows.pyx @@ -281,6 +281,7 @@ cdef void* __cusparseCreateSlicedEll = NULL cdef void* __cusparseCreateConstSlicedEll = NULL cdef void* __cusparseSpSV_updateMatrix = NULL cdef void* __cusparseSpMV_preprocess = NULL +cdef void* __cusparseSpSM_updateMatrix = NULL cdef inline list get_site_packages(): @@ -1891,6 +1892,12 @@ cdef int _check_or_init_cusparse() except -1 nogil: except: pass + global __cusparseSpSM_updateMatrix + try: + __cusparseSpSM_updateMatrix = win32api.GetProcAddress(handle, 'cusparseSpSM_updateMatrix') + except: + pass + __py_cusparse_init = True return 0 @@ -2671,6 +2678,9 @@ cpdef dict _inspect_function_pointers(): global __cusparseSpMV_preprocess data["__cusparseSpMV_preprocess"] = __cusparseSpMV_preprocess + global __cusparseSpSM_updateMatrix + data["__cusparseSpSM_updateMatrix"] = __cusparseSpSM_updateMatrix + func_ptrs = data return data @@ -5234,3 +5244,13 @@ cdef cusparseStatus_t _cusparseSpMV_preprocess(cusparseHandle_t handle, cusparse raise FunctionNotFoundError("function cusparseSpMV_preprocess is not found") return (__cusparseSpMV_preprocess)( handle, opA, alpha, matA, vecX, beta, vecY, computeType, alg, externalBuffer) + + +cdef cusparseStatus_t _cusparseSpSM_updateMatrix(cusparseHandle_t handle, cusparseSpSMDescr_t spsmDescr, void* newValues, cusparseSpSMUpdate_t updatePart) except* nogil: + global __cusparseSpSM_updateMatrix + _check_or_init_cusparse() + if __cusparseSpSM_updateMatrix == NULL: + with gil: + raise FunctionNotFoundError("function cusparseSpSM_updateMatrix is not found") + return (__cusparseSpSM_updateMatrix)( + handle, spsmDescr, newValues, updatePart) diff --git a/nvmath/bindings/_internal/utils.pxd b/nvmath/bindings/_internal/utils.pxd index 5facaef..e3b82e9 100644 --- a/nvmath/bindings/_internal/utils.pxd +++ b/nvmath/bindings/_internal/utils.pxd @@ -147,6 +147,8 @@ ctypedef fused ResT: int int32_t int64_t + float + double ctypedef fused PtrT: diff --git a/nvmath/bindings/cublas.pxd b/nvmath/bindings/cublas.pxd index eacb164..906294d 100644 --- a/nvmath/bindings/cublas.pxd +++ b/nvmath/bindings/cublas.pxd @@ -45,7 +45,7 @@ cpdef intptr_t create() except? 0 cpdef destroy(intptr_t handle) cpdef int get_version(intptr_t handle) except? -1 cpdef int get_property(int type) except? -1 -cpdef size_t get_cudart_version() +cpdef size_t get_cudart_version() except? 0 cpdef set_workspace(intptr_t handle, intptr_t workspace, size_t workspace_size_in_bytes) cpdef set_stream(intptr_t handle, intptr_t stream_id) cpdef intptr_t get_stream(intptr_t handle) except? 
0 @@ -537,3 +537,9 @@ cpdef sdgmm_64(intptr_t handle, int mode, int64_t m, int64_t n, intptr_t a, int6 cpdef ddgmm_64(intptr_t handle, int mode, int64_t m, int64_t n, intptr_t a, int64_t lda, intptr_t x, int64_t incx, intptr_t c, int64_t ldc) cpdef cdgmm_64(intptr_t handle, int mode, int64_t m, int64_t n, intptr_t a, int64_t lda, intptr_t x, int64_t incx, intptr_t c, int64_t ldc) cpdef zdgmm_64(intptr_t handle, int mode, int64_t m, int64_t n, intptr_t a, int64_t lda, intptr_t x, int64_t incx, intptr_t c, int64_t ldc) +cpdef sgemm_grouped_batched(intptr_t handle, transa_array, transb_array, m_array, n_array, k_array, alpha_array, intptr_t aarray, lda_array, intptr_t barray, ldb_array, beta_array, intptr_t carray, ldc_array, int group_count, group_size) +cpdef sgemm_grouped_batched_64(intptr_t handle, transa_array, transb_array, m_array, n_array, k_array, alpha_array, intptr_t aarray, lda_array, intptr_t barray, ldb_array, beta_array, intptr_t carray, ldc_array, int64_t group_count, group_size) +cpdef dgemm_grouped_batched(intptr_t handle, transa_array, transb_array, m_array, n_array, k_array, alpha_array, intptr_t aarray, lda_array, intptr_t barray, ldb_array, beta_array, intptr_t carray, ldc_array, int group_count, group_size) +cpdef dgemm_grouped_batched_64(intptr_t handle, transa_array, transb_array, m_array, n_array, k_array, alpha_array, intptr_t aarray, lda_array, intptr_t barray, ldb_array, beta_array, intptr_t carray, ldc_array, int64_t group_count, group_size) +cpdef gemm_grouped_batched_ex(intptr_t handle, transa_array, transb_array, m_array, n_array, k_array, intptr_t alpha_array, intptr_t aarray, int atype, lda_array, intptr_t barray, int btype, ldb_array, intptr_t beta_array, intptr_t carray, int ctype, ldc_array, int group_count, group_size, int compute_type) +cpdef gemm_grouped_batched_ex_64(intptr_t handle, transa_array, transb_array, m_array, n_array, k_array, intptr_t alpha_array, intptr_t aarray, int atype, lda_array, intptr_t barray, int btype, ldb_array, intptr_t beta_array, intptr_t carray, int ctype, ldc_array, int64_t group_count, group_size, int compute_type) diff --git a/nvmath/bindings/cublas.pyx b/nvmath/bindings/cublas.pyx index 3c73c34..726a2eb 100644 --- a/nvmath/bindings/cublas.pyx +++ b/nvmath/bindings/cublas.pyx @@ -5,6 +5,10 @@ # This code was automatically generated across versions from 11.0.3 to 12.6.2. Do not modify it directly. cimport cython # NOQA +from libcpp.vector cimport vector + +from ._internal.utils cimport (get_resource_ptr, get_resource_ptrs, nullable_unique_ptr, + get_buffer_pointer,) from enum import IntEnum as _IntEnum @@ -196,7 +200,7 @@ cpdef int get_property(int type) except? -1: return value -cpdef size_t get_cudart_version(): +cpdef size_t get_cudart_version() except? 
0: """See `cublasGetCudartVersion`.""" return cublasGetCudartVersion() @@ -3650,3 +3654,169 @@ cpdef zdgmm_64(intptr_t handle, int mode, int64_t m, int64_t n, intptr_t a, int6 with nogil: status = cublasZdgmm_64(handle, <_SideMode>mode, m, n, a, lda, x, incx, c, ldc) check_status(status) + + +cpdef sgemm_grouped_batched(intptr_t handle, transa_array, transb_array, m_array, n_array, k_array, alpha_array, intptr_t aarray, lda_array, intptr_t barray, ldb_array, beta_array, intptr_t carray, ldc_array, int group_count, group_size): + """See `cublasSgemmGroupedBatched`.""" + cdef nullable_unique_ptr[ vector[int] ] _transa_array_ + get_resource_ptr[int](_transa_array_, transa_array, NULL) + cdef nullable_unique_ptr[ vector[int] ] _transb_array_ + get_resource_ptr[int](_transb_array_, transb_array, NULL) + cdef nullable_unique_ptr[ vector[int] ] _m_array_ + get_resource_ptr[int](_m_array_, m_array, NULL) + cdef nullable_unique_ptr[ vector[int] ] _n_array_ + get_resource_ptr[int](_n_array_, n_array, NULL) + cdef nullable_unique_ptr[ vector[int] ] _k_array_ + get_resource_ptr[int](_k_array_, k_array, NULL) + cdef nullable_unique_ptr[ vector[float] ] _alpha_array_ + get_resource_ptr[float](_alpha_array_, alpha_array, NULL) + cdef nullable_unique_ptr[ vector[int] ] _lda_array_ + get_resource_ptr[int](_lda_array_, lda_array, NULL) + cdef nullable_unique_ptr[ vector[int] ] _ldb_array_ + get_resource_ptr[int](_ldb_array_, ldb_array, NULL) + cdef nullable_unique_ptr[ vector[float] ] _beta_array_ + get_resource_ptr[float](_beta_array_, beta_array, NULL) + cdef nullable_unique_ptr[ vector[int] ] _ldc_array_ + get_resource_ptr[int](_ldc_array_, ldc_array, NULL) + cdef nullable_unique_ptr[ vector[int] ] _group_size_ + get_resource_ptr[int](_group_size_, group_size, NULL) + with nogil: + status = cublasSgemmGroupedBatched(handle, (_transa_array_.data()), (_transb_array_.data()), (_m_array_.data()), (_n_array_.data()), (_k_array_.data()), (_alpha_array_.data()), aarray, (_lda_array_.data()), barray, (_ldb_array_.data()), (_beta_array_.data()), carray, (_ldc_array_.data()), group_count, (_group_size_.data())) + check_status(status) + + +cpdef sgemm_grouped_batched_64(intptr_t handle, transa_array, transb_array, m_array, n_array, k_array, alpha_array, intptr_t aarray, lda_array, intptr_t barray, ldb_array, beta_array, intptr_t carray, ldc_array, int64_t group_count, group_size): + """See `cublasSgemmGroupedBatched_64`.""" + cdef nullable_unique_ptr[ vector[int] ] _transa_array_ + get_resource_ptr[int](_transa_array_, transa_array, NULL) + cdef nullable_unique_ptr[ vector[int] ] _transb_array_ + get_resource_ptr[int](_transb_array_, transb_array, NULL) + cdef nullable_unique_ptr[ vector[int64_t] ] _m_array_ + get_resource_ptr[int64_t](_m_array_, m_array, NULL) + cdef nullable_unique_ptr[ vector[int64_t] ] _n_array_ + get_resource_ptr[int64_t](_n_array_, n_array, NULL) + cdef nullable_unique_ptr[ vector[int64_t] ] _k_array_ + get_resource_ptr[int64_t](_k_array_, k_array, NULL) + cdef nullable_unique_ptr[ vector[float] ] _alpha_array_ + get_resource_ptr[float](_alpha_array_, alpha_array, NULL) + cdef nullable_unique_ptr[ vector[int64_t] ] _lda_array_ + get_resource_ptr[int64_t](_lda_array_, lda_array, NULL) + cdef nullable_unique_ptr[ vector[int64_t] ] _ldb_array_ + get_resource_ptr[int64_t](_ldb_array_, ldb_array, NULL) + cdef nullable_unique_ptr[ vector[float] ] _beta_array_ + get_resource_ptr[float](_beta_array_, beta_array, NULL) + cdef nullable_unique_ptr[ vector[int64_t] ] _ldc_array_ + 
get_resource_ptr[int64_t](_ldc_array_, ldc_array, NULL) + cdef nullable_unique_ptr[ vector[int64_t] ] _group_size_ + get_resource_ptr[int64_t](_group_size_, group_size, NULL) + with nogil: + status = cublasSgemmGroupedBatched_64(handle, (_transa_array_.data()), (_transb_array_.data()), (_m_array_.data()), (_n_array_.data()), (_k_array_.data()), (_alpha_array_.data()), aarray, (_lda_array_.data()), barray, (_ldb_array_.data()), (_beta_array_.data()), carray, (_ldc_array_.data()), group_count, (_group_size_.data())) + check_status(status) + + +cpdef dgemm_grouped_batched(intptr_t handle, transa_array, transb_array, m_array, n_array, k_array, alpha_array, intptr_t aarray, lda_array, intptr_t barray, ldb_array, beta_array, intptr_t carray, ldc_array, int group_count, group_size): + """See `cublasDgemmGroupedBatched`.""" + cdef nullable_unique_ptr[ vector[int] ] _transa_array_ + get_resource_ptr[int](_transa_array_, transa_array, NULL) + cdef nullable_unique_ptr[ vector[int] ] _transb_array_ + get_resource_ptr[int](_transb_array_, transb_array, NULL) + cdef nullable_unique_ptr[ vector[int] ] _m_array_ + get_resource_ptr[int](_m_array_, m_array, NULL) + cdef nullable_unique_ptr[ vector[int] ] _n_array_ + get_resource_ptr[int](_n_array_, n_array, NULL) + cdef nullable_unique_ptr[ vector[int] ] _k_array_ + get_resource_ptr[int](_k_array_, k_array, NULL) + cdef nullable_unique_ptr[ vector[double] ] _alpha_array_ + get_resource_ptr[double](_alpha_array_, alpha_array, NULL) + cdef nullable_unique_ptr[ vector[int] ] _lda_array_ + get_resource_ptr[int](_lda_array_, lda_array, NULL) + cdef nullable_unique_ptr[ vector[int] ] _ldb_array_ + get_resource_ptr[int](_ldb_array_, ldb_array, NULL) + cdef nullable_unique_ptr[ vector[double] ] _beta_array_ + get_resource_ptr[double](_beta_array_, beta_array, NULL) + cdef nullable_unique_ptr[ vector[int] ] _ldc_array_ + get_resource_ptr[int](_ldc_array_, ldc_array, NULL) + cdef nullable_unique_ptr[ vector[int] ] _group_size_ + get_resource_ptr[int](_group_size_, group_size, NULL) + with nogil: + status = cublasDgemmGroupedBatched(handle, (_transa_array_.data()), (_transb_array_.data()), (_m_array_.data()), (_n_array_.data()), (_k_array_.data()), (_alpha_array_.data()), aarray, (_lda_array_.data()), barray, (_ldb_array_.data()), (_beta_array_.data()), carray, (_ldc_array_.data()), group_count, (_group_size_.data())) + check_status(status) + + +cpdef dgemm_grouped_batched_64(intptr_t handle, transa_array, transb_array, m_array, n_array, k_array, alpha_array, intptr_t aarray, lda_array, intptr_t barray, ldb_array, beta_array, intptr_t carray, ldc_array, int64_t group_count, group_size): + """See `cublasDgemmGroupedBatched_64`.""" + cdef nullable_unique_ptr[ vector[int] ] _transa_array_ + get_resource_ptr[int](_transa_array_, transa_array, NULL) + cdef nullable_unique_ptr[ vector[int] ] _transb_array_ + get_resource_ptr[int](_transb_array_, transb_array, NULL) + cdef nullable_unique_ptr[ vector[int64_t] ] _m_array_ + get_resource_ptr[int64_t](_m_array_, m_array, NULL) + cdef nullable_unique_ptr[ vector[int64_t] ] _n_array_ + get_resource_ptr[int64_t](_n_array_, n_array, NULL) + cdef nullable_unique_ptr[ vector[int64_t] ] _k_array_ + get_resource_ptr[int64_t](_k_array_, k_array, NULL) + cdef nullable_unique_ptr[ vector[double] ] _alpha_array_ + get_resource_ptr[double](_alpha_array_, alpha_array, NULL) + cdef nullable_unique_ptr[ vector[int64_t] ] _lda_array_ + get_resource_ptr[int64_t](_lda_array_, lda_array, NULL) + cdef nullable_unique_ptr[ vector[int64_t] ] 
_ldb_array_ + get_resource_ptr[int64_t](_ldb_array_, ldb_array, NULL) + cdef nullable_unique_ptr[ vector[double] ] _beta_array_ + get_resource_ptr[double](_beta_array_, beta_array, NULL) + cdef nullable_unique_ptr[ vector[int64_t] ] _ldc_array_ + get_resource_ptr[int64_t](_ldc_array_, ldc_array, NULL) + cdef nullable_unique_ptr[ vector[int64_t] ] _group_size_ + get_resource_ptr[int64_t](_group_size_, group_size, NULL) + with nogil: + status = cublasDgemmGroupedBatched_64(handle, (_transa_array_.data()), (_transb_array_.data()), (_m_array_.data()), (_n_array_.data()), (_k_array_.data()), (_alpha_array_.data()), aarray, (_lda_array_.data()), barray, (_ldb_array_.data()), (_beta_array_.data()), carray, (_ldc_array_.data()), group_count, (_group_size_.data())) + check_status(status) + + +cpdef gemm_grouped_batched_ex(intptr_t handle, transa_array, transb_array, m_array, n_array, k_array, intptr_t alpha_array, intptr_t aarray, int atype, lda_array, intptr_t barray, int btype, ldb_array, intptr_t beta_array, intptr_t carray, int ctype, ldc_array, int group_count, group_size, int compute_type): + """See `cublasGemmGroupedBatchedEx`.""" + cdef nullable_unique_ptr[ vector[int] ] _transa_array_ + get_resource_ptr[int](_transa_array_, transa_array, NULL) + cdef nullable_unique_ptr[ vector[int] ] _transb_array_ + get_resource_ptr[int](_transb_array_, transb_array, NULL) + cdef nullable_unique_ptr[ vector[int] ] _m_array_ + get_resource_ptr[int](_m_array_, m_array, NULL) + cdef nullable_unique_ptr[ vector[int] ] _n_array_ + get_resource_ptr[int](_n_array_, n_array, NULL) + cdef nullable_unique_ptr[ vector[int] ] _k_array_ + get_resource_ptr[int](_k_array_, k_array, NULL) + cdef nullable_unique_ptr[ vector[int] ] _lda_array_ + get_resource_ptr[int](_lda_array_, lda_array, NULL) + cdef nullable_unique_ptr[ vector[int] ] _ldb_array_ + get_resource_ptr[int](_ldb_array_, ldb_array, NULL) + cdef nullable_unique_ptr[ vector[int] ] _ldc_array_ + get_resource_ptr[int](_ldc_array_, ldc_array, NULL) + cdef nullable_unique_ptr[ vector[int] ] _group_size_ + get_resource_ptr[int](_group_size_, group_size, NULL) + with nogil: + status = cublasGemmGroupedBatchedEx(handle, (_transa_array_.data()), (_transb_array_.data()), (_m_array_.data()), (_n_array_.data()), (_k_array_.data()), alpha_array, aarray, atype, (_lda_array_.data()), barray, btype, (_ldb_array_.data()), beta_array, carray, ctype, (_ldc_array_.data()), group_count, (_group_size_.data()), <_ComputeType>compute_type) + check_status(status) + + +cpdef gemm_grouped_batched_ex_64(intptr_t handle, transa_array, transb_array, m_array, n_array, k_array, intptr_t alpha_array, intptr_t aarray, int atype, lda_array, intptr_t barray, int btype, ldb_array, intptr_t beta_array, intptr_t carray, int ctype, ldc_array, int64_t group_count, group_size, int compute_type): + """See `cublasGemmGroupedBatchedEx_64`.""" + cdef nullable_unique_ptr[ vector[int] ] _transa_array_ + get_resource_ptr[int](_transa_array_, transa_array, NULL) + cdef nullable_unique_ptr[ vector[int] ] _transb_array_ + get_resource_ptr[int](_transb_array_, transb_array, NULL) + cdef nullable_unique_ptr[ vector[int64_t] ] _m_array_ + get_resource_ptr[int64_t](_m_array_, m_array, NULL) + cdef nullable_unique_ptr[ vector[int64_t] ] _n_array_ + get_resource_ptr[int64_t](_n_array_, n_array, NULL) + cdef nullable_unique_ptr[ vector[int64_t] ] _k_array_ + get_resource_ptr[int64_t](_k_array_, k_array, NULL) + cdef nullable_unique_ptr[ vector[int64_t] ] _lda_array_ + get_resource_ptr[int64_t](_lda_array_, 
lda_array, NULL) + cdef nullable_unique_ptr[ vector[int64_t] ] _ldb_array_ + get_resource_ptr[int64_t](_ldb_array_, ldb_array, NULL) + cdef nullable_unique_ptr[ vector[int64_t] ] _ldc_array_ + get_resource_ptr[int64_t](_ldc_array_, ldc_array, NULL) + cdef nullable_unique_ptr[ vector[int64_t] ] _group_size_ + get_resource_ptr[int64_t](_group_size_, group_size, NULL) + with nogil: + status = cublasGemmGroupedBatchedEx_64(handle, (_transa_array_.data()), (_transb_array_.data()), (_m_array_.data()), (_n_array_.data()), (_k_array_.data()), alpha_array, aarray, atype, (_lda_array_.data()), barray, btype, (_ldb_array_.data()), beta_array, carray, ctype, (_ldc_array_.data()), group_count, (_group_size_.data()), <_ComputeType>compute_type) + check_status(status) diff --git a/nvmath/bindings/cublasLt.pxd b/nvmath/bindings/cublasLt.pxd index 24122a3..f1e47e2 100644 --- a/nvmath/bindings/cublasLt.pxd +++ b/nvmath/bindings/cublasLt.pxd @@ -53,8 +53,8 @@ ctypedef cublasLtMatmulInnerShape_t _MatmulInnerShape cpdef intptr_t create() except? 0 cpdef destroy(intptr_t light_handle) -cpdef size_t get_version() -cpdef size_t get_cudart_version() +cpdef size_t get_version() except? 0 +cpdef size_t get_cudart_version() except? 0 cpdef int get_property(int type) except? -1 cpdef matmul(intptr_t light_handle, intptr_t compute_desc, intptr_t alpha, intptr_t a, intptr_t adesc, intptr_t b, intptr_t bdesc, intptr_t beta, intptr_t c, intptr_t cdesc, intptr_t d, intptr_t ddesc, intptr_t algo, intptr_t workspace, size_t workspace_size_in_bytes, intptr_t stream) cpdef matrix_transform(intptr_t light_handle, intptr_t transform_desc, intptr_t alpha, intptr_t a, intptr_t adesc, intptr_t beta, intptr_t b, intptr_t bdesc, intptr_t c, intptr_t cdesc, intptr_t stream) diff --git a/nvmath/bindings/cublasLt.pyx b/nvmath/bindings/cublasLt.pyx index 38aeeea..2f5f392 100644 --- a/nvmath/bindings/cublasLt.pyx +++ b/nvmath/bindings/cublasLt.pyx @@ -1252,12 +1252,12 @@ cpdef destroy(intptr_t light_handle): check_status(status) -cpdef size_t get_version(): +cpdef size_t get_version() except? 0: """See `cublasLtGetVersion`.""" return cublasLtGetVersion() -cpdef size_t get_cudart_version(): +cpdef size_t get_cudart_version() except? 0: """See `cublasLtGetCudartVersion`.""" return cublasLtGetCudartVersion() @@ -1524,6 +1524,16 @@ cpdef get_matmul_preference_attribute_dtype(int attr): .. note:: This API has no C counterpart and is a convenient helper for allocating memory for :func:`matmul_preference_get_attribute`, :func:`matmul_preference_set_attribute`. """ + if attr == CUBLASLT_MATMUL_PREF_MATH_MODE_MASK: + raise ValueError('The value has been deprecated and removed. Please use corresponding value from `MatmulNumericalImplFlags`') + if attr == CUBLASLT_MATMUL_PREF_GAUSSIAN_MODE_MASK: + raise ValueError('The value has been deprecated and removed. Please use corresponding value from `MatmulNumericalImplFlags`') + if attr == CUBLASLT_MATMUL_PREF_POINTER_MODE_MASK: + raise ValueError('The value has been deprecated and removed. Please use corresponding value from `MatmulPreferenceAttribute`') + if attr == CUBLASLT_MATMUL_PREF_EPILOGUE_MASK: + raise ValueError('The value has been deprecated and removed. Please use corresponding value from `MatmulPreferenceAttribute`') + if attr == CUBLASLT_MATMUL_PREF_SM_COUNT_TARGET: + raise ValueError('The value has been deprecated and removed. 
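
The grouped-batched GEMM wrappers added above (`sgemm_grouped_batched`, `dgemm_grouped_batched`, `gemm_grouped_batched_ex`, and their `_64` variants) take Python sequences for the per-group parameter arrays (converted through `get_resource_ptr`, which is why `ResT` gains `float`/`double` in `utils.pxd`) and raw device pointers for the A/B/C pointer arrays. The sketch below shows one plausible way to drive `cublas.sgemm_grouped_batched` from CuPy; the CuPy-based setup, the `OP_N = 0` constant, and the column-major layout choices are illustrative assumptions, not part of this change.

```python
# Illustrative sketch: grouped-batched SGEMM via the new cublas.sgemm_grouped_batched
# wrapper, with CuPy providing the device buffers and pointer arrays.
import cupy as cp
from nvmath.bindings import cublas

OP_N = 0  # CUBLAS_OP_N (no transpose)

handle = cublas.create()
try:
    # Two groups: group 0 runs two 4x4 GEMMs, group 1 runs three 8x8 GEMMs.
    group_sizes = [2, 3]
    dims = [4, 8]  # square problems, so m = n = k = lda = ldb = ldc per group

    a_mats, b_mats, c_mats = [], [], []
    for n, count in zip(dims, group_sizes):
        for _ in range(count):
            # cuBLAS expects column-major storage, hence the Fortran-ordered arrays.
            a_mats.append(cp.asfortranarray(cp.random.rand(n, n, dtype=cp.float32)))
            b_mats.append(cp.asfortranarray(cp.random.rand(n, n, dtype=cp.float32)))
            c_mats.append(cp.zeros((n, n), dtype=cp.float32, order="F"))

    # Device arrays of device pointers (float** in the C API), passed as intptr_t.
    a_ptrs = cp.array([m.data.ptr for m in a_mats], dtype=cp.uint64)
    b_ptrs = cp.array([m.data.ptr for m in b_mats], dtype=cp.uint64)
    c_ptrs = cp.array([m.data.ptr for m in c_mats], dtype=cp.uint64)

    cublas.sgemm_grouped_batched(
        handle,
        [OP_N, OP_N], [OP_N, OP_N],   # transa_array, transb_array (one entry per group)
        dims, dims, dims,             # m_array, n_array, k_array
        [1.0, 1.0],                   # alpha_array (one entry per group)
        a_ptrs.data.ptr, dims,        # Aarray, lda_array
        b_ptrs.data.ptr, dims,        # Barray, ldb_array
        [0.0, 0.0],                   # beta_array
        c_ptrs.data.ptr, dims,        # Carray, ldc_array
        len(group_sizes), group_sizes)
    cp.cuda.runtime.deviceSynchronize()
finally:
    cublas.destroy(handle)
```

Note that in `gemm_grouped_batched_ex` / `gemm_grouped_batched_ex_64` the `alpha_array` and `beta_array` parameters are raw host pointers (`intptr_t`) rather than Python sequences, mirroring the `void*` signature of `cublasGemmGroupedBatchedEx`.
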
Please use corresponding value from `MatmulPreferenceAttribute`') return matmul_preference_attribute_sizes[attr] ########################################################################### diff --git a/nvmath/bindings/cufft.pxd b/nvmath/bindings/cufft.pxd index a09bd45..e24af44 100644 --- a/nvmath/bindings/cufft.pxd +++ b/nvmath/bindings/cufft.pxd @@ -109,3 +109,6 @@ cpdef xt_exec_descriptor(int plan, intptr_t input, intptr_t output, int directio cpdef xt_set_work_area_policy(int plan, int policy, intptr_t work_size) cpdef xt_set_jit_callback(int plan, lto_callback_fatbin, size_t lto_callback_fatbin_size, int type, caller_info) cpdef xt_set_subformat_default(int plan, int subformat_forward, int subformat_inverse) +cpdef set_plan_property_int64(int plan, int property, long long int input_value_int) +cpdef long long int get_plan_property_int64(int plan, int property) except? -1 +cpdef reset_plan_property(int plan, int property) diff --git a/nvmath/bindings/cufft.pyx b/nvmath/bindings/cufft.pyx index 0a4de7c..884bc21 100644 --- a/nvmath/bindings/cufft.pyx +++ b/nvmath/bindings/cufft.pyx @@ -99,8 +99,8 @@ class XtCallbackType(_IntEnum): class Property(_IntEnum): """See `cufftProperty`.""" - NVFFT_PLAN_INT64_PATIENT_JIT = NVFFT_PLAN_PROPERTY_INT64_PATIENT_JIT - NVFFT_PLAN_INT64_MAX_NUM_HOST_THREADS = NVFFT_PLAN_PROPERTY_INT64_MAX_NUM_HOST_THREADS + PATIENT_JIT = NVFFT_PLAN_PROPERTY_INT64_PATIENT_JIT + MAX_NUM_HOST_THREADS = NVFFT_PLAN_PROPERTY_INT64_MAX_NUM_HOST_THREADS ############################################################################### @@ -607,3 +607,26 @@ cpdef xt_set_subformat_default(int plan, int subformat_forward, int subformat_in with nogil: status = cufftXtSetSubformatDefault(plan, <_XtSubFormat>subformat_forward, <_XtSubFormat>subformat_inverse) check_status(status) + + +cpdef set_plan_property_int64(int plan, int property, long long int input_value_int): + """See `cufftSetPlanPropertyInt64`.""" + with nogil: + status = cufftSetPlanPropertyInt64(plan, <_Property>property, input_value_int) + check_status(status) + + +cpdef long long int get_plan_property_int64(int plan, int property) except? 
-1: + """See `cufftGetPlanPropertyInt64`.""" + cdef long long int return_ptr_value + with nogil: + status = cufftGetPlanPropertyInt64(plan, <_Property>property, &return_ptr_value) + check_status(status) + return return_ptr_value + + +cpdef reset_plan_property(int plan, int property): + """See `cufftResetPlanProperty`.""" + with nogil: + status = cufftResetPlanProperty(plan, <_Property>property) + check_status(status) diff --git a/nvmath/bindings/cusolverDn.pxd b/nvmath/bindings/cusolverDn.pxd index e97c201..fc68205 100644 --- a/nvmath/bindings/cusolverDn.pxd +++ b/nvmath/bindings/cusolverDn.pxd @@ -402,3 +402,9 @@ cpdef logger_set_mask(int mask) cpdef logger_force_disable() cpdef set_deterministic_mode(intptr_t handle, int mode) cpdef int get_deterministic_mode(intptr_t handle) except * +cpdef tuple xlarft_buffer_size(intptr_t handle, intptr_t params, cusolverDirectMode_t direct, cusolverStorevMode_t storev, int64_t n, int64_t k, int data_type_v, intptr_t v, int64_t ldv, int data_type_tau, intptr_t tau, int data_type_t, intptr_t t, int64_t ldt, int compute_type) +cpdef xlarft(intptr_t handle, intptr_t params, cusolverDirectMode_t direct, cusolverStorevMode_t storev, int64_t n, int64_t k, int data_type_v, intptr_t v, int64_t ldv, int data_type_tau, intptr_t tau, int data_type_t, intptr_t t, int64_t ldt, int compute_type, intptr_t buffer_on_device, size_t workspace_in_bytes_on_device, intptr_t buffer_on_host, size_t workspace_in_bytes_on_host) +cpdef tuple xsyev_batched_buffer_size(intptr_t handle, intptr_t params, int jobz, int uplo, int64_t n, int data_type_a, intptr_t a, int64_t lda, int data_type_w, intptr_t w, int compute_type, int64_t batch_size) +cpdef xsyev_batched(intptr_t handle, intptr_t params, int jobz, int uplo, int64_t n, int data_type_a, intptr_t a, int64_t lda, int data_type_w, intptr_t w, int compute_type, intptr_t buffer_on_device, size_t workspace_in_bytes_on_device, intptr_t buffer_on_host, size_t workspace_in_bytes_on_host, intptr_t info, int64_t batch_size) +cpdef tuple xgeev_buffer_size(intptr_t handle, intptr_t params, int jobvl, int jobvr, int64_t n, int data_type_a, intptr_t a, int64_t lda, int data_type_w, intptr_t w, int data_type_vl, intptr_t vl, int64_t ldvl, int data_type_vr, intptr_t vr, int64_t ldvr, int compute_type) +cpdef xgeev(intptr_t handle, intptr_t params, int jobvl, int jobvr, int64_t n, int data_type_a, intptr_t a, int64_t lda, int data_type_w, intptr_t w, int data_type_vl, intptr_t vl, int64_t ldvl, int data_type_vr, intptr_t vr, int64_t ldvr, int compute_type, intptr_t buffer_on_device, size_t workspace_in_bytes_on_device, intptr_t buffer_on_host, size_t workspace_in_bytes_on_host, intptr_t info) diff --git a/nvmath/bindings/cusolverDn.pyx b/nvmath/bindings/cusolverDn.pyx index 6b2f157..9c1b5f6 100644 --- a/nvmath/bindings/cusolverDn.pyx +++ b/nvmath/bindings/cusolverDn.pyx @@ -3033,3 +3033,54 @@ cpdef int get_deterministic_mode(intptr_t handle) except *: status = cusolverDnGetDeterministicMode(handle, &mode) check_status(status) return mode + + +cpdef tuple xlarft_buffer_size(intptr_t handle, intptr_t params, cusolverDirectMode_t direct, cusolverStorevMode_t storev, int64_t n, int64_t k, int data_type_v, intptr_t v, int64_t ldv, int data_type_tau, intptr_t tau, int data_type_t, intptr_t t, int64_t ldt, int compute_type): + """See `cusolverDnXlarft_bufferSize`.""" + cdef size_t workspace_in_bytes_on_device + cdef size_t workspace_in_bytes_on_host + with nogil: + status = cusolverDnXlarft_bufferSize(handle, params, direct, storev, n, k, 
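
Returning to the cuFFT changes above: the `Property` enum members are renamed to `PATIENT_JIT` and `MAX_NUM_HOST_THREADS`, and `set_plan_property_int64` / `get_plan_property_int64` / `reset_plan_property` wrap the new `cufftSetPlanPropertyInt64` family. A minimal round-trip sketch follows; the `cufft.create()` / `cufft.destroy()` calls are assumed to be the existing wrappers for `cufftCreate` / `cufftDestroy` under the bindings' usual naming convention.

```python
# Illustrative sketch: round-tripping the new int64 plan properties on a cuFFT plan.
from nvmath.bindings import cufft

plan = cufft.create()  # assumed existing wrapper for cufftCreate
try:
    # Enable "patient" JIT planning (per the property name) and cap planning threads.
    cufft.set_plan_property_int64(plan, cufft.Property.PATIENT_JIT, 1)
    cufft.set_plan_property_int64(plan, cufft.Property.MAX_NUM_HOST_THREADS, 4)

    value = cufft.get_plan_property_int64(plan, cufft.Property.PATIENT_JIT)

    # Restore the library default for a single property.
    cufft.reset_plan_property(plan, cufft.Property.PATIENT_JIT)
finally:
    cufft.destroy(plan)  # assumed existing wrapper for cufftDestroy
```

The enum rename is source-breaking for any code that referenced the old `NVFFT_PLAN_INT64_*` member names.
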
data_type_v, v, ldv, data_type_tau, tau, data_type_t, t, ldt, compute_type, &workspace_in_bytes_on_device, &workspace_in_bytes_on_host) + check_status(status) + return (workspace_in_bytes_on_device, workspace_in_bytes_on_host) + + +cpdef xlarft(intptr_t handle, intptr_t params, cusolverDirectMode_t direct, cusolverStorevMode_t storev, int64_t n, int64_t k, int data_type_v, intptr_t v, int64_t ldv, int data_type_tau, intptr_t tau, int data_type_t, intptr_t t, int64_t ldt, int compute_type, intptr_t buffer_on_device, size_t workspace_in_bytes_on_device, intptr_t buffer_on_host, size_t workspace_in_bytes_on_host): + """See `cusolverDnXlarft`.""" + with nogil: + status = cusolverDnXlarft(handle, params, direct, storev, n, k, data_type_v, v, ldv, data_type_tau, tau, data_type_t, t, ldt, compute_type, buffer_on_device, workspace_in_bytes_on_device, buffer_on_host, workspace_in_bytes_on_host) + check_status(status) + + +cpdef tuple xsyev_batched_buffer_size(intptr_t handle, intptr_t params, int jobz, int uplo, int64_t n, int data_type_a, intptr_t a, int64_t lda, int data_type_w, intptr_t w, int compute_type, int64_t batch_size): + """See `cusolverDnXsyevBatched_bufferSize`.""" + cdef size_t workspace_in_bytes_on_device + cdef size_t workspace_in_bytes_on_host + with nogil: + status = cusolverDnXsyevBatched_bufferSize(handle, params, jobz, uplo, n, data_type_a, a, lda, data_type_w, w, compute_type, &workspace_in_bytes_on_device, &workspace_in_bytes_on_host, batch_size) + check_status(status) + return (workspace_in_bytes_on_device, workspace_in_bytes_on_host) + + +cpdef xsyev_batched(intptr_t handle, intptr_t params, int jobz, int uplo, int64_t n, int data_type_a, intptr_t a, int64_t lda, int data_type_w, intptr_t w, int compute_type, intptr_t buffer_on_device, size_t workspace_in_bytes_on_device, intptr_t buffer_on_host, size_t workspace_in_bytes_on_host, intptr_t info, int64_t batch_size): + """See `cusolverDnXsyevBatched`.""" + with nogil: + status = cusolverDnXsyevBatched(handle, params, jobz, uplo, n, data_type_a, a, lda, data_type_w, w, compute_type, buffer_on_device, workspace_in_bytes_on_device, buffer_on_host, workspace_in_bytes_on_host, info, batch_size) + check_status(status) + + +cpdef tuple xgeev_buffer_size(intptr_t handle, intptr_t params, int jobvl, int jobvr, int64_t n, int data_type_a, intptr_t a, int64_t lda, int data_type_w, intptr_t w, int data_type_vl, intptr_t vl, int64_t ldvl, int data_type_vr, intptr_t vr, int64_t ldvr, int compute_type): + """See `cusolverDnXgeev_bufferSize`.""" + cdef size_t workspace_in_bytes_on_device + cdef size_t workspace_in_bytes_on_host + with nogil: + status = cusolverDnXgeev_bufferSize(handle, params, jobvl, jobvr, n, data_type_a, a, lda, data_type_w, w, data_type_vl, vl, ldvl, data_type_vr, vr, ldvr, compute_type, &workspace_in_bytes_on_device, &workspace_in_bytes_on_host) + check_status(status) + return (workspace_in_bytes_on_device, workspace_in_bytes_on_host) + + +cpdef xgeev(intptr_t handle, intptr_t params, int jobvl, int jobvr, int64_t n, int data_type_a, intptr_t a, int64_t lda, int data_type_w, intptr_t w, int data_type_vl, intptr_t vl, int64_t ldvl, int data_type_vr, intptr_t vr, int64_t ldvr, int compute_type, intptr_t buffer_on_device, size_t workspace_in_bytes_on_device, intptr_t buffer_on_host, size_t workspace_in_bytes_on_host, intptr_t info): + """See `cusolverDnXgeev`.""" + with nogil: + status = cusolverDnXgeev(handle, params, jobvl, jobvr, n, data_type_a, a, lda, data_type_w, w, data_type_vl, vl, ldvl, data_type_vr, vr, ldvr, 
compute_type, buffer_on_device, workspace_in_bytes_on_device, buffer_on_host, workspace_in_bytes_on_host, info) + check_status(status) diff --git a/nvmath/bindings/cusparse.pxd b/nvmath/bindings/cusparse.pxd index e7efb11..3df3661 100644 --- a/nvmath/bindings/cusparse.pxd +++ b/nvmath/bindings/cusparse.pxd @@ -86,13 +86,13 @@ cpdef set_pointer_mode(intptr_t handle, int mode) cpdef intptr_t create_mat_descr() except? 0 cpdef destroy_mat_descr(intptr_t descr_a) cpdef set_mat_type(intptr_t descr_a, int type) -cpdef int get_mat_type(intptr_t descr_a) +cpdef int get_mat_type(intptr_t descr_a) except? -1 cpdef set_mat_fill_mode(intptr_t descr_a, int fill_mode) -cpdef int get_mat_fill_mode(intptr_t descr_a) +cpdef int get_mat_fill_mode(intptr_t descr_a) except? -1 cpdef set_mat_diag_type(intptr_t descr_a, int diag_type) -cpdef int get_mat_diag_type(intptr_t descr_a) +cpdef int get_mat_diag_type(intptr_t descr_a) except? -1 cpdef set_mat_index_base(intptr_t descr_a, int base) -cpdef int get_mat_index_base(intptr_t descr_a) +cpdef int get_mat_index_base(intptr_t descr_a) except? -1 cpdef sgemvi(intptr_t handle, int trans_a, int m, int n, intptr_t alpha, intptr_t a, int lda, int nnz, intptr_t x_val, intptr_t x_ind, intptr_t beta, intptr_t y, int idx_base, intptr_t p_buffer) cpdef int sgemvi_buffer_size(intptr_t handle, int trans_a, int m, int n, int nnz) except? -1 cpdef dgemvi(intptr_t handle, int trans_a, int m, int n, intptr_t alpha, intptr_t a, int lda, int nnz, intptr_t x_val, intptr_t x_ind, intptr_t beta, intptr_t y, int idx_base, intptr_t p_buffer) @@ -326,3 +326,4 @@ cpdef intptr_t create_sliced_ell(int64_t rows, int64_t cols, int64_t nnz, int64_ cpdef intptr_t create_const_sliced_ell(int64_t rows, int64_t cols, int64_t nnz, int64_t sell_values_size, int64_t slice_size, intptr_t sell_slice_offsets, intptr_t sell_col_ind, intptr_t sell_values, int sell_slice_offsets_type, int sell_col_ind_type, int idx_base, int value_type) except? 0 cpdef sp_sv_update_matrix(intptr_t handle, intptr_t spsv_descr, intptr_t new_values, int update_part) cpdef sp_mv_preprocess(intptr_t handle, int op_a, intptr_t alpha, intptr_t mat_a, intptr_t vec_x, intptr_t beta, intptr_t vec_y, int compute_type, int alg, intptr_t external_buffer) +cpdef sp_sm_update_matrix(intptr_t handle, intptr_t spsm_descr, intptr_t new_values, int update_part) diff --git a/nvmath/bindings/cusparse.pyx b/nvmath/bindings/cusparse.pyx index a9b7b05..c66bfbb 100644 --- a/nvmath/bindings/cusparse.pyx +++ b/nvmath/bindings/cusparse.pyx @@ -324,9 +324,9 @@ cpdef set_mat_type(intptr_t descr_a, int type): check_status(status) -cpdef int get_mat_type(intptr_t descr_a): +cpdef int get_mat_type(intptr_t descr_a) except? -1: """See `cusparseGetMatType`.""" - return cusparseGetMatType(descr_a) + return cusparseGetMatType(descr_a) cpdef set_mat_fill_mode(intptr_t descr_a, int fill_mode): @@ -336,9 +336,9 @@ cpdef set_mat_fill_mode(intptr_t descr_a, int fill_mode): check_status(status) -cpdef int get_mat_fill_mode(intptr_t descr_a): +cpdef int get_mat_fill_mode(intptr_t descr_a) except? -1: """See `cusparseGetMatFillMode`.""" - return cusparseGetMatFillMode(descr_a) + return cusparseGetMatFillMode(descr_a) cpdef set_mat_diag_type(intptr_t descr_a, int diag_type): @@ -348,9 +348,9 @@ cpdef set_mat_diag_type(intptr_t descr_a, int diag_type): check_status(status) -cpdef int get_mat_diag_type(intptr_t descr_a): +cpdef int get_mat_diag_type(intptr_t descr_a) except? 
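
Among the cuSOLVER additions above, `xsyev_batched_buffer_size` / `xsyev_batched` expose the batched symmetric eigensolver `cusolverDnXsyevBatched`. A sketch of one way to call them with CuPy-managed workspaces is shown below; the handle/params creation helpers (e.g. `cusolverDn.create()` and `cusolverDn.create_params()`), the raw `cudaDataType` / `cusolverEigMode_t` / `cublasFillMode_t` integer values, and the batch-sized `info` array are assumptions made for illustration.

```python
# Illustrative sketch: batched FP64 symmetric eigensolve through the new
# cusolverDn.xsyev_batched* wrappers, with CuPy-managed device/host workspaces.
import cupy as cp
import numpy as np
from nvmath.bindings import cusolverDn


def syev_batched_f64(handle, params, a):
    """Eigen-decompose a batch of symmetric float64 matrices.

    `handle` and `params` are assumed to come from the cusolverDnCreate /
    cusolverDnCreateParams wrappers; `a` is a CuPy array of shape (batch, n, n),
    symmetric, and is overwritten with the eigenvectors (column-major within
    each matrix, as cuSOLVER expects).
    """
    batch, n, _ = a.shape
    w = cp.empty((batch, n), dtype=cp.float64)   # eigenvalues, n per matrix
    info = cp.zeros(batch, dtype=cp.int32)       # per-matrix status (assumed batch-sized)

    CUDA_R_64F = 1        # assumed cudaDataType value for double precision
    EIG_MODE_VECTOR = 1   # assumed CUSOLVER_EIG_MODE_VECTOR
    FILL_MODE_LOWER = 0   # assumed CUBLAS_FILL_MODE_LOWER

    dev_bytes, host_bytes = cusolverDn.xsyev_batched_buffer_size(
        handle, params, EIG_MODE_VECTOR, FILL_MODE_LOWER, n,
        CUDA_R_64F, a.data.ptr, n, CUDA_R_64F, w.data.ptr, CUDA_R_64F, batch)

    dev_buf = cp.empty(dev_bytes, dtype=cp.uint8)
    host_buf = np.empty(host_bytes, dtype=np.uint8)

    cusolverDn.xsyev_batched(
        handle, params, EIG_MODE_VECTOR, FILL_MODE_LOWER, n,
        CUDA_R_64F, a.data.ptr, n, CUDA_R_64F, w.data.ptr, CUDA_R_64F,
        dev_buf.data.ptr, dev_bytes, host_buf.ctypes.data, host_bytes,
        info.data.ptr, batch)
    cp.cuda.runtime.deviceSynchronize()
    return w, info
```
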
-1: """See `cusparseGetMatDiagType`.""" - return cusparseGetMatDiagType(descr_a) + return cusparseGetMatDiagType(descr_a) cpdef set_mat_index_base(intptr_t descr_a, int base): @@ -360,9 +360,9 @@ cpdef set_mat_index_base(intptr_t descr_a, int base): check_status(status) -cpdef int get_mat_index_base(intptr_t descr_a): +cpdef int get_mat_index_base(intptr_t descr_a) except? -1: """See `cusparseGetMatIndexBase`.""" - return cusparseGetMatIndexBase(descr_a) + return cusparseGetMatIndexBase(descr_a) cpdef sgemvi(intptr_t handle, int trans_a, int m, int n, intptr_t alpha, intptr_t a, int lda, int nnz, intptr_t x_val, intptr_t x_ind, intptr_t beta, intptr_t y, int idx_base, intptr_t p_buffer): @@ -2351,3 +2351,10 @@ cpdef sp_mv_preprocess(intptr_t handle, int op_a, intptr_t alpha, intptr_t mat_a with nogil: status = cusparseSpMV_preprocess(handle, <_Operation>op_a, alpha, mat_a, vec_x, beta, vec_y, compute_type, <_SpMVAlg>alg, external_buffer) check_status(status) + + +cpdef sp_sm_update_matrix(intptr_t handle, intptr_t spsm_descr, intptr_t new_values, int update_part): + """See `cusparseSpSM_updateMatrix`.""" + with nogil: + status = cusparseSpSM_updateMatrix(handle, spsm_descr, new_values, <_SpSMUpdate>update_part) + check_status(status) diff --git a/nvmath/bindings/cycublas.pxd b/nvmath/bindings/cycublas.pxd index 717fa19..cc984a1 100644 --- a/nvmath/bindings/cycublas.pxd +++ b/nvmath/bindings/cycublas.pxd @@ -648,3 +648,9 @@ cdef cublasStatus_t cublasSdgmm_64(cublasHandle_t handle, cublasSideMode_t mode, cdef cublasStatus_t cublasDdgmm_64(cublasHandle_t handle, cublasSideMode_t mode, int64_t m, int64_t n, const double* A, int64_t lda, const double* x, int64_t incx, double* C, int64_t ldc) except* nogil cdef cublasStatus_t cublasCdgmm_64(cublasHandle_t handle, cublasSideMode_t mode, int64_t m, int64_t n, const cuComplex* A, int64_t lda, const cuComplex* x, int64_t incx, cuComplex* C, int64_t ldc) except* nogil cdef cublasStatus_t cublasZdgmm_64(cublasHandle_t handle, cublasSideMode_t mode, int64_t m, int64_t n, const cuDoubleComplex* A, int64_t lda, const cuDoubleComplex* x, int64_t incx, cuDoubleComplex* C, int64_t ldc) except* nogil +cdef cublasStatus_t cublasSgemmGroupedBatched(cublasHandle_t handle, const cublasOperation_t transa_array[], const cublasOperation_t transb_array[], const int m_array[], const int n_array[], const int k_array[], const float alpha_array[], const float* const Aarray[], const int lda_array[], const float* const Barray[], const int ldb_array[], const float beta_array[], float* const Carray[], const int ldc_array[], int group_count, const int group_size[]) except* nogil +cdef cublasStatus_t cublasSgemmGroupedBatched_64(cublasHandle_t handle, const cublasOperation_t transa_array[], const cublasOperation_t transb_array[], const int64_t m_array[], const int64_t n_array[], const int64_t k_array[], const float alpha_array[], const float* const Aarray[], const int64_t lda_array[], const float* const Barray[], const int64_t ldb_array[], const float beta_array[], float* const Carray[], const int64_t ldc_array[], int64_t group_count, const int64_t group_size[]) except* nogil +cdef cublasStatus_t cublasDgemmGroupedBatched(cublasHandle_t handle, const cublasOperation_t transa_array[], const cublasOperation_t transb_array[], const int m_array[], const int n_array[], const int k_array[], const double alpha_array[], const double* const Aarray[], const int lda_array[], const double* const Barray[], const int ldb_array[], const double beta_array[], double* const Carray[], const int 
ldc_array[], int group_count, const int group_size[]) except* nogil +cdef cublasStatus_t cublasDgemmGroupedBatched_64(cublasHandle_t handle, const cublasOperation_t transa_array[], const cublasOperation_t transb_array[], const int64_t m_array[], const int64_t n_array[], const int64_t k_array[], const double alpha_array[], const double* const Aarray[], const int64_t lda_array[], const double* const Barray[], const int64_t ldb_array[], const double beta_array[], double* const Carray[], const int64_t ldc_array[], int64_t group_count, const int64_t group_size[]) except* nogil +cdef cublasStatus_t cublasGemmGroupedBatchedEx(cublasHandle_t handle, const cublasOperation_t transa_array[], const cublasOperation_t transb_array[], const int m_array[], const int n_array[], const int k_array[], const void* alpha_array, const void* const Aarray[], cudaDataType_t Atype, const int lda_array[], const void* const Barray[], cudaDataType_t Btype, const int ldb_array[], const void* beta_array, void* const Carray[], cudaDataType_t Ctype, const int ldc_array[], int group_count, const int group_size[], cublasComputeType_t computeType) except* nogil +cdef cublasStatus_t cublasGemmGroupedBatchedEx_64(cublasHandle_t handle, const cublasOperation_t transa_array[], const cublasOperation_t transb_array[], const int64_t m_array[], const int64_t n_array[], const int64_t k_array[], const void* alpha_array, const void* const Aarray[], cudaDataType_t Atype, const int64_t lda_array[], const void* const Barray[], cudaDataType_t Btype, const int64_t ldb_array[], const void* beta_array, void* const Carray[], cudaDataType_t Ctype, const int64_t ldc_array[], int64_t group_count, const int64_t group_size[], cublasComputeType_t computeType) except* nogil diff --git a/nvmath/bindings/cycublas.pyx b/nvmath/bindings/cycublas.pyx index 3d658dc..242dae3 100644 --- a/nvmath/bindings/cycublas.pyx +++ b/nvmath/bindings/cycublas.pyx @@ -2001,3 +2001,27 @@ cdef cublasStatus_t cublasCdgmm_64(cublasHandle_t handle, cublasSideMode_t mode, cdef cublasStatus_t cublasZdgmm_64(cublasHandle_t handle, cublasSideMode_t mode, int64_t m, int64_t n, const cuDoubleComplex* A, int64_t lda, const cuDoubleComplex* x, int64_t incx, cuDoubleComplex* C, int64_t ldc) except* nogil: return _cublas._cublasZdgmm_64(handle, mode, m, n, A, lda, x, incx, C, ldc) + + +cdef cublasStatus_t cublasSgemmGroupedBatched(cublasHandle_t handle, const cublasOperation_t transa_array[], const cublasOperation_t transb_array[], const int m_array[], const int n_array[], const int k_array[], const float alpha_array[], const float* const Aarray[], const int lda_array[], const float* const Barray[], const int ldb_array[], const float beta_array[], float* const Carray[], const int ldc_array[], int group_count, const int group_size[]) except* nogil: + return _cublas._cublasSgemmGroupedBatched(handle, transa_array, transb_array, m_array, n_array, k_array, alpha_array, Aarray, lda_array, Barray, ldb_array, beta_array, Carray, ldc_array, group_count, group_size) + + +cdef cublasStatus_t cublasSgemmGroupedBatched_64(cublasHandle_t handle, const cublasOperation_t transa_array[], const cublasOperation_t transb_array[], const int64_t m_array[], const int64_t n_array[], const int64_t k_array[], const float alpha_array[], const float* const Aarray[], const int64_t lda_array[], const float* const Barray[], const int64_t ldb_array[], const float beta_array[], float* const Carray[], const int64_t ldc_array[], int64_t group_count, const int64_t group_size[]) except* nogil: + return 
_cublas._cublasSgemmGroupedBatched_64(handle, transa_array, transb_array, m_array, n_array, k_array, alpha_array, Aarray, lda_array, Barray, ldb_array, beta_array, Carray, ldc_array, group_count, group_size) + + +cdef cublasStatus_t cublasDgemmGroupedBatched(cublasHandle_t handle, const cublasOperation_t transa_array[], const cublasOperation_t transb_array[], const int m_array[], const int n_array[], const int k_array[], const double alpha_array[], const double* const Aarray[], const int lda_array[], const double* const Barray[], const int ldb_array[], const double beta_array[], double* const Carray[], const int ldc_array[], int group_count, const int group_size[]) except* nogil: + return _cublas._cublasDgemmGroupedBatched(handle, transa_array, transb_array, m_array, n_array, k_array, alpha_array, Aarray, lda_array, Barray, ldb_array, beta_array, Carray, ldc_array, group_count, group_size) + + +cdef cublasStatus_t cublasDgemmGroupedBatched_64(cublasHandle_t handle, const cublasOperation_t transa_array[], const cublasOperation_t transb_array[], const int64_t m_array[], const int64_t n_array[], const int64_t k_array[], const double alpha_array[], const double* const Aarray[], const int64_t lda_array[], const double* const Barray[], const int64_t ldb_array[], const double beta_array[], double* const Carray[], const int64_t ldc_array[], int64_t group_count, const int64_t group_size[]) except* nogil: + return _cublas._cublasDgemmGroupedBatched_64(handle, transa_array, transb_array, m_array, n_array, k_array, alpha_array, Aarray, lda_array, Barray, ldb_array, beta_array, Carray, ldc_array, group_count, group_size) + + +cdef cublasStatus_t cublasGemmGroupedBatchedEx(cublasHandle_t handle, const cublasOperation_t transa_array[], const cublasOperation_t transb_array[], const int m_array[], const int n_array[], const int k_array[], const void* alpha_array, const void* const Aarray[], cudaDataType_t Atype, const int lda_array[], const void* const Barray[], cudaDataType_t Btype, const int ldb_array[], const void* beta_array, void* const Carray[], cudaDataType_t Ctype, const int ldc_array[], int group_count, const int group_size[], cublasComputeType_t computeType) except* nogil: + return _cublas._cublasGemmGroupedBatchedEx(handle, transa_array, transb_array, m_array, n_array, k_array, alpha_array, Aarray, Atype, lda_array, Barray, Btype, ldb_array, beta_array, Carray, Ctype, ldc_array, group_count, group_size, computeType) + + +cdef cublasStatus_t cublasGemmGroupedBatchedEx_64(cublasHandle_t handle, const cublasOperation_t transa_array[], const cublasOperation_t transb_array[], const int64_t m_array[], const int64_t n_array[], const int64_t k_array[], const void* alpha_array, const void* const Aarray[], cudaDataType_t Atype, const int64_t lda_array[], const void* const Barray[], cudaDataType_t Btype, const int64_t ldb_array[], const void* beta_array, void* const Carray[], cudaDataType_t Ctype, const int64_t ldc_array[], int64_t group_count, const int64_t group_size[], cublasComputeType_t computeType) except* nogil: + return _cublas._cublasGemmGroupedBatchedEx_64(handle, transa_array, transb_array, m_array, n_array, k_array, alpha_array, Aarray, Atype, lda_array, Barray, Btype, ldb_array, beta_array, Carray, Ctype, ldc_array, group_count, group_size, computeType) diff --git a/nvmath/bindings/cycufft.pxd b/nvmath/bindings/cycufft.pxd index afde866..94b53c4 100644 --- a/nvmath/bindings/cycufft.pxd +++ b/nvmath/bindings/cycufft.pxd @@ -316,3 +316,6 @@ cdef cufftResult cufftXtExecDescriptor(cufftHandle 
plan, cudaLibXtDesc* input, c cdef cufftResult cufftXtSetWorkAreaPolicy(cufftHandle plan, cufftXtWorkAreaPolicy policy, size_t* workSize) except* nogil cdef cufftResult cufftXtSetJITCallback(cufftHandle plan, const void* lto_callback_fatbin, size_t lto_callback_fatbin_size, cufftXtCallbackType type, void** caller_info) except* nogil cdef cufftResult cufftXtSetSubformatDefault(cufftHandle plan, cufftXtSubFormat subformat_forward, cufftXtSubFormat subformat_inverse) except* nogil +cdef cufftResult cufftSetPlanPropertyInt64(cufftHandle plan, cufftProperty property, const long long int inputValueInt) except* nogil +cdef cufftResult cufftGetPlanPropertyInt64(cufftHandle plan, cufftProperty property, long long int* returnPtrValue) except* nogil +cdef cufftResult cufftResetPlanProperty(cufftHandle plan, cufftProperty property) except* nogil diff --git a/nvmath/bindings/cycufft.pyx b/nvmath/bindings/cycufft.pyx index 704f4a3..561a30a 100644 --- a/nvmath/bindings/cycufft.pyx +++ b/nvmath/bindings/cycufft.pyx @@ -221,3 +221,15 @@ cdef cufftResult cufftXtSetJITCallback(cufftHandle plan, const void* lto_callbac cdef cufftResult cufftXtSetSubformatDefault(cufftHandle plan, cufftXtSubFormat subformat_forward, cufftXtSubFormat subformat_inverse) except* nogil: return _cufft._cufftXtSetSubformatDefault(plan, subformat_forward, subformat_inverse) + + +cdef cufftResult cufftSetPlanPropertyInt64(cufftHandle plan, cufftProperty property, const long long int inputValueInt) except* nogil: + return _cufft._cufftSetPlanPropertyInt64(plan, property, inputValueInt) + + +cdef cufftResult cufftGetPlanPropertyInt64(cufftHandle plan, cufftProperty property, long long int* returnPtrValue) except* nogil: + return _cufft._cufftGetPlanPropertyInt64(plan, property, returnPtrValue) + + +cdef cufftResult cufftResetPlanProperty(cufftHandle plan, cufftProperty property) except* nogil: + return _cufft._cufftResetPlanProperty(plan, property) diff --git a/nvmath/bindings/cycusolverDn.pxd b/nvmath/bindings/cycusolverDn.pxd index efd4b12..2c94637 100644 --- a/nvmath/bindings/cycusolverDn.pxd +++ b/nvmath/bindings/cycusolverDn.pxd @@ -406,3 +406,9 @@ cdef cusolverStatus_t cusolverDnLoggerSetMask(int mask) except* nogil cdef cusolverStatus_t cusolverDnLoggerForceDisable() except* nogil cdef cusolverStatus_t cusolverDnSetDeterministicMode(cusolverDnHandle_t handle, cusolverDeterministicMode_t mode) except* nogil cdef cusolverStatus_t cusolverDnGetDeterministicMode(cusolverDnHandle_t handle, cusolverDeterministicMode_t* mode) except* nogil +cdef cusolverStatus_t cusolverDnXlarft_bufferSize(cusolverDnHandle_t handle, cusolverDnParams_t params, cusolverDirectMode_t direct, cusolverStorevMode_t storev, int64_t n, int64_t k, cudaDataType dataTypeV, const void* V, int64_t ldv, cudaDataType dataTypeTau, const void* tau, cudaDataType dataTypeT, void* T, int64_t ldt, cudaDataType computeType, size_t* workspaceInBytesOnDevice, size_t* workspaceInBytesOnHost) except* nogil +cdef cusolverStatus_t cusolverDnXlarft(cusolverDnHandle_t handle, cusolverDnParams_t params, cusolverDirectMode_t direct, cusolverStorevMode_t storev, int64_t n, int64_t k, cudaDataType dataTypeV, const void* V, int64_t ldv, cudaDataType dataTypeTau, const void* tau, cudaDataType dataTypeT, void* T, int64_t ldt, cudaDataType computeType, void* bufferOnDevice, size_t workspaceInBytesOnDevice, void* bufferOnHost, size_t workspaceInBytesOnHost) except* nogil +cdef cusolverStatus_t cusolverDnXsyevBatched_bufferSize(cusolverDnHandle_t handle, cusolverDnParams_t params, 
cusolverEigMode_t jobz, cublasFillMode_t uplo, int64_t n, cudaDataType dataTypeA, const void* A, int64_t lda, cudaDataType dataTypeW, const void* W, cudaDataType computeType, size_t* workspaceInBytesOnDevice, size_t* workspaceInBytesOnHost, int64_t batchSize) except* nogil +cdef cusolverStatus_t cusolverDnXsyevBatched(cusolverDnHandle_t handle, cusolverDnParams_t params, cusolverEigMode_t jobz, cublasFillMode_t uplo, int64_t n, cudaDataType dataTypeA, void* A, int64_t lda, cudaDataType dataTypeW, void* W, cudaDataType computeType, void* bufferOnDevice, size_t workspaceInBytesOnDevice, void* bufferOnHost, size_t workspaceInBytesOnHost, int* info, int64_t batchSize) except* nogil +cdef cusolverStatus_t cusolverDnXgeev_bufferSize(cusolverDnHandle_t handle, cusolverDnParams_t params, cusolverEigMode_t jobvl, cusolverEigMode_t jobvr, int64_t n, cudaDataType dataTypeA, const void* A, int64_t lda, cudaDataType dataTypeW, const void* W, cudaDataType dataTypeVL, const void* VL, int64_t ldvl, cudaDataType dataTypeVR, const void* VR, int64_t ldvr, cudaDataType computeType, size_t* workspaceInBytesOnDevice, size_t* workspaceInBytesOnHost) except* nogil +cdef cusolverStatus_t cusolverDnXgeev(cusolverDnHandle_t handle, cusolverDnParams_t params, cusolverEigMode_t jobvl, cusolverEigMode_t jobvr, int64_t n, cudaDataType dataTypeA, void* A, int64_t lda, cudaDataType dataTypeW, void* W, cudaDataType dataTypeVL, void* VL, int64_t ldvl, cudaDataType dataTypeVR, void* VR, int64_t ldvr, cudaDataType computeType, void* bufferOnDevice, size_t workspaceInBytesOnDevice, void* bufferOnHost, size_t workspaceInBytesOnHost, int* info) except* nogil diff --git a/nvmath/bindings/cycusolverDn.pyx b/nvmath/bindings/cycusolverDn.pyx index aeecbc6..3d03f3c 100644 --- a/nvmath/bindings/cycusolverDn.pyx +++ b/nvmath/bindings/cycusolverDn.pyx @@ -1477,3 +1477,27 @@ cdef cusolverStatus_t cusolverDnSetDeterministicMode(cusolverDnHandle_t handle, cdef cusolverStatus_t cusolverDnGetDeterministicMode(cusolverDnHandle_t handle, cusolverDeterministicMode_t* mode) except* nogil: return _cusolverDn._cusolverDnGetDeterministicMode(handle, mode) + + +cdef cusolverStatus_t cusolverDnXlarft_bufferSize(cusolverDnHandle_t handle, cusolverDnParams_t params, cusolverDirectMode_t direct, cusolverStorevMode_t storev, int64_t n, int64_t k, cudaDataType dataTypeV, const void* V, int64_t ldv, cudaDataType dataTypeTau, const void* tau, cudaDataType dataTypeT, void* T, int64_t ldt, cudaDataType computeType, size_t* workspaceInBytesOnDevice, size_t* workspaceInBytesOnHost) except* nogil: + return _cusolverDn._cusolverDnXlarft_bufferSize(handle, params, direct, storev, n, k, dataTypeV, V, ldv, dataTypeTau, tau, dataTypeT, T, ldt, computeType, workspaceInBytesOnDevice, workspaceInBytesOnHost) + + +cdef cusolverStatus_t cusolverDnXlarft(cusolverDnHandle_t handle, cusolverDnParams_t params, cusolverDirectMode_t direct, cusolverStorevMode_t storev, int64_t n, int64_t k, cudaDataType dataTypeV, const void* V, int64_t ldv, cudaDataType dataTypeTau, const void* tau, cudaDataType dataTypeT, void* T, int64_t ldt, cudaDataType computeType, void* bufferOnDevice, size_t workspaceInBytesOnDevice, void* bufferOnHost, size_t workspaceInBytesOnHost) except* nogil: + return _cusolverDn._cusolverDnXlarft(handle, params, direct, storev, n, k, dataTypeV, V, ldv, dataTypeTau, tau, dataTypeT, T, ldt, computeType, bufferOnDevice, workspaceInBytesOnDevice, bufferOnHost, workspaceInBytesOnHost) + + +cdef cusolverStatus_t cusolverDnXsyevBatched_bufferSize(cusolverDnHandle_t 
handle, cusolverDnParams_t params, cusolverEigMode_t jobz, cublasFillMode_t uplo, int64_t n, cudaDataType dataTypeA, const void* A, int64_t lda, cudaDataType dataTypeW, const void* W, cudaDataType computeType, size_t* workspaceInBytesOnDevice, size_t* workspaceInBytesOnHost, int64_t batchSize) except* nogil: + return _cusolverDn._cusolverDnXsyevBatched_bufferSize(handle, params, jobz, uplo, n, dataTypeA, A, lda, dataTypeW, W, computeType, workspaceInBytesOnDevice, workspaceInBytesOnHost, batchSize) + + +cdef cusolverStatus_t cusolverDnXsyevBatched(cusolverDnHandle_t handle, cusolverDnParams_t params, cusolverEigMode_t jobz, cublasFillMode_t uplo, int64_t n, cudaDataType dataTypeA, void* A, int64_t lda, cudaDataType dataTypeW, void* W, cudaDataType computeType, void* bufferOnDevice, size_t workspaceInBytesOnDevice, void* bufferOnHost, size_t workspaceInBytesOnHost, int* info, int64_t batchSize) except* nogil: + return _cusolverDn._cusolverDnXsyevBatched(handle, params, jobz, uplo, n, dataTypeA, A, lda, dataTypeW, W, computeType, bufferOnDevice, workspaceInBytesOnDevice, bufferOnHost, workspaceInBytesOnHost, info, batchSize) + + +cdef cusolverStatus_t cusolverDnXgeev_bufferSize(cusolverDnHandle_t handle, cusolverDnParams_t params, cusolverEigMode_t jobvl, cusolverEigMode_t jobvr, int64_t n, cudaDataType dataTypeA, const void* A, int64_t lda, cudaDataType dataTypeW, const void* W, cudaDataType dataTypeVL, const void* VL, int64_t ldvl, cudaDataType dataTypeVR, const void* VR, int64_t ldvr, cudaDataType computeType, size_t* workspaceInBytesOnDevice, size_t* workspaceInBytesOnHost) except* nogil: + return _cusolverDn._cusolverDnXgeev_bufferSize(handle, params, jobvl, jobvr, n, dataTypeA, A, lda, dataTypeW, W, dataTypeVL, VL, ldvl, dataTypeVR, VR, ldvr, computeType, workspaceInBytesOnDevice, workspaceInBytesOnHost) + + +cdef cusolverStatus_t cusolverDnXgeev(cusolverDnHandle_t handle, cusolverDnParams_t params, cusolverEigMode_t jobvl, cusolverEigMode_t jobvr, int64_t n, cudaDataType dataTypeA, void* A, int64_t lda, cudaDataType dataTypeW, void* W, cudaDataType dataTypeVL, void* VL, int64_t ldvl, cudaDataType dataTypeVR, void* VR, int64_t ldvr, cudaDataType computeType, void* bufferOnDevice, size_t workspaceInBytesOnDevice, void* bufferOnHost, size_t workspaceInBytesOnHost, int* info) except* nogil: + return _cusolverDn._cusolverDnXgeev(handle, params, jobvl, jobvr, n, dataTypeA, A, lda, dataTypeW, W, dataTypeVL, VL, ldvl, dataTypeVR, VR, ldvr, computeType, bufferOnDevice, workspaceInBytesOnDevice, bufferOnHost, workspaceInBytesOnHost, info) diff --git a/nvmath/bindings/cycusparse.pxd b/nvmath/bindings/cycusparse.pxd index c975a2c..de2f01b 100644 --- a/nvmath/bindings/cycusparse.pxd +++ b/nvmath/bindings/cycusparse.pxd @@ -468,3 +468,4 @@ cdef cusparseStatus_t cusparseCreateSlicedEll(cusparseSpMatDescr_t* spMatDescr, cdef cusparseStatus_t cusparseCreateConstSlicedEll(cusparseConstSpMatDescr_t* spMatDescr, int64_t rows, int64_t cols, int64_t nnz, int64_t sellValuesSize, int64_t sliceSize, const void* sellSliceOffsets, const void* sellColInd, const void* sellValues, cusparseIndexType_t sellSliceOffsetsType, cusparseIndexType_t sellColIndType, cusparseIndexBase_t idxBase, cudaDataType valueType) except* nogil cdef cusparseStatus_t cusparseSpSV_updateMatrix(cusparseHandle_t handle, cusparseSpSVDescr_t spsvDescr, void* newValues, cusparseSpSVUpdate_t updatePart) except* nogil cdef cusparseStatus_t cusparseSpMV_preprocess(cusparseHandle_t handle, cusparseOperation_t opA, const void* alpha, 
cusparseConstSpMatDescr_t matA, cusparseConstDnVecDescr_t vecX, const void* beta, cusparseDnVecDescr_t vecY, cudaDataType computeType, cusparseSpMVAlg_t alg, void* externalBuffer) except* nogil +cdef cusparseStatus_t cusparseSpSM_updateMatrix(cusparseHandle_t handle, cusparseSpSMDescr_t spsmDescr, void* newValues, cusparseSpSMUpdate_t updatePart) except* nogil diff --git a/nvmath/bindings/cycusparse.pyx b/nvmath/bindings/cycusparse.pyx index 800e8a8..b59feb7 100644 --- a/nvmath/bindings/cycusparse.pyx +++ b/nvmath/bindings/cycusparse.pyx @@ -1029,3 +1029,7 @@ cdef cusparseStatus_t cusparseSpSV_updateMatrix(cusparseHandle_t handle, cuspars cdef cusparseStatus_t cusparseSpMV_preprocess(cusparseHandle_t handle, cusparseOperation_t opA, const void* alpha, cusparseConstSpMatDescr_t matA, cusparseConstDnVecDescr_t vecX, const void* beta, cusparseDnVecDescr_t vecY, cudaDataType computeType, cusparseSpMVAlg_t alg, void* externalBuffer) except* nogil: return _cusparse._cusparseSpMV_preprocess(handle, opA, alpha, matA, vecX, beta, vecY, computeType, alg, externalBuffer) + + +cdef cusparseStatus_t cusparseSpSM_updateMatrix(cusparseHandle_t handle, cusparseSpSMDescr_t spsmDescr, void* newValues, cusparseSpSMUpdate_t updatePart) except* nogil: + return _cusparse._cusparseSpSM_updateMatrix(handle, spsmDescr, newValues, updatePart) diff --git a/nvmath/bindings/nvpl/cyfft.pxd b/nvmath/bindings/nvpl/cyfft.pxd index 31d1746..c227697 100644 --- a/nvmath/bindings/nvpl/cyfft.pxd +++ b/nvmath/bindings/nvpl/cyfft.pxd @@ -32,6 +32,9 @@ cdef extern from *: #define FFTW_DESTROY_INPUT 0x08 #define FFTW_PRESERVE_INPUT 0x0C #define FFTW_UNALIGNED 0x10 + + typedef double fftw_complex[2] __attribute__ ((aligned (16))); + typedef float fftwf_complex[2] __attribute__ ((aligned (8))); """ cdef const int FFTW_FORWARD @@ -47,9 +50,9 @@ cdef extern from *: cdef const int FFTW_PRESERVE_INPUT cdef const int FFTW_UNALIGNED + ctypedef double fftw_complex[2] + ctypedef float fftwf_complex[2] -ctypedef double fftw_complex[2]; -ctypedef float fftwf_complex[2]; ctypedef void* fftw_plan 'fftw_plan' ctypedef void* fftwf_plan 'fftwf_plan' diff --git a/nvmath/bindings/nvpl/fft.pxd b/nvmath/bindings/nvpl/fft.pxd index 1da331e..449da11 100644 --- a/nvmath/bindings/nvpl/fft.pxd +++ b/nvmath/bindings/nvpl/fft.pxd @@ -79,22 +79,22 @@ cpdef int get_version() except? -1 cpdef intptr_t plan_many_c2c_double(int rank, n, int batch, intptr_t in_, inembed, int istride, int idist, intptr_t out, onembed, int ostride, int odist, int sign, unsigned flags) except? 0 cpdef intptr_t plan_many_r2c_double(int rank, n, int batch, intptr_t in_, inembed, int istride, int idist, intptr_t out, onembed, int ostride, int odist, unsigned flags) except? 0 cpdef intptr_t plan_many_c2r_double(int rank, n, int batch, intptr_t in_, inembed, int istride, int idist, intptr_t out, onembed, int ostride, int odist, unsigned flags) except? 
0 -cpdef void execute_c2c_double(intptr_t plan, intptr_t idata, intptr_t odata) -cpdef void execute_r2c_double(intptr_t plan, intptr_t idata, intptr_t odata) -cpdef void execute_c2r_double(intptr_t plan, intptr_t idata, intptr_t odata) +cpdef void execute_c2c_double(intptr_t plan, intptr_t idata, intptr_t odata) except* +cpdef void execute_r2c_double(intptr_t plan, intptr_t idata, intptr_t odata) except* +cpdef void execute_c2r_double(intptr_t plan, intptr_t idata, intptr_t odata) except* cpdef intptr_t plan_many_c2c_float(int rank, n, int batch, intptr_t in_, inembed, int istride, int idist, intptr_t out, onembed, int ostride, int odist, int sign, unsigned flags) except? 0 cpdef intptr_t plan_many_r2c_float(int rank, n, int batch, intptr_t in_, inembed, int istride, int idist, intptr_t out, onembed, int ostride, int odist, unsigned flags) except? 0 cpdef intptr_t plan_many_c2r_float(int rank, n, int batch, intptr_t in_, inembed, int istride, int idist, intptr_t out, onembed, int ostride, int odist, unsigned flags) except? 0 -cpdef void execute_c2c_float(intptr_t plan, intptr_t idata, intptr_t odata) -cpdef void execute_r2c_float(intptr_t plan, intptr_t idata, intptr_t odata) -cpdef void execute_c2r_float(intptr_t plan, intptr_t idata, intptr_t odata) -cpdef int init_threads_double() -cpdef int init_threads_float() -cpdef void plan_with_nthreads_double(int nthreads) -cpdef void plan_with_nthreads_float(int nthreads) -cpdef int planner_nthreads_double() -cpdef int planner_nthreads_float() -cpdef void cleanup_threads_double() -cpdef void cleanup_threads_float() -cpdef void destroy_plan_double(intptr_t plan) -cpdef void destroy_plan_float(intptr_t plan) +cpdef void execute_c2c_float(intptr_t plan, intptr_t idata, intptr_t odata) except* +cpdef void execute_r2c_float(intptr_t plan, intptr_t idata, intptr_t odata) except* +cpdef void execute_c2r_float(intptr_t plan, intptr_t idata, intptr_t odata) except* +cpdef int init_threads_double() except 0 +cpdef int init_threads_float() except 0 +cpdef void plan_with_nthreads_double(int nthreads) except* +cpdef void plan_with_nthreads_float(int nthreads) except* +cpdef int planner_nthreads_double() except? 0 +cpdef int planner_nthreads_float() except? 0 +cpdef void cleanup_threads_double() except* +cpdef void cleanup_threads_float() except* +cpdef void destroy_plan_double(intptr_t plan) except* +cpdef void destroy_plan_float(intptr_t plan) except* diff --git a/nvmath/bindings/nvpl/fft.pyx b/nvmath/bindings/nvpl/fft.pyx index 1333356..01d84ac 100644 --- a/nvmath/bindings/nvpl/fft.pyx +++ b/nvmath/bindings/nvpl/fft.pyx @@ -54,7 +54,7 @@ class FFTWError(Exception): pass -class FFTWUnaliged(FFTWError): +class FFTWUnaligned(FFTWError): pass @@ -64,6 +64,21 @@ cdef inline check_plan(intptr_t plan): raise FFTWError("Planning failed") +@cython.profile(False) +cdef inline check_nthreads(int nthreads): + if nthreads <= 0: + raise FFTWError( + f"The number of threads available for the plan execution " + f"was reported to be {nthreads}, expected a positive integer." 
+ ) + + +@cython.profile(False) +cdef inline check_init_threads(intptr_t nthreads): + if nthreads == 0: + raise FFTWError(f"Initialization of FFT threading failed") + + @cython.profile(False) cdef inline intptr_t get_ptr_alignment(intptr_t ptr): return ptr & (~(ptr - 1)) @@ -72,14 +87,14 @@ cdef inline intptr_t get_ptr_alignment(intptr_t ptr): @cython.profile(False) cdef inline check_alignment(intptr_t in_ptr, intptr_t out_ptr, int alignment): if in_ptr != 0 and get_ptr_alignment(in_ptr) < alignment: - raise FFTWUnaliged( + raise FFTWUnaligned( f"The input tensor's underlying memory pointer must be " f"aligned to at least {alignment} bytes. " f"The address {in_ptr} is not aligned enough." ) if out_ptr != 0 and get_ptr_alignment(out_ptr) < alignment: - raise FFTWUnaliged( + raise FFTWUnaligned( f"The output tensor's underlying memory pointer must be " f"aligned to at least {alignment} bytes. " f"The address {out_ptr} is not aligned enough." @@ -330,17 +345,17 @@ cpdef intptr_t plan_many_c2r_double(int rank, n, int batch, intptr_t in_, inembe return ret -cpdef void execute_c2c_double(intptr_t plan, intptr_t idata, intptr_t odata): +cpdef void execute_c2c_double(intptr_t plan, intptr_t idata, intptr_t odata) except*: """See `fftw_execute_dft`.""" fftw_execute_dft(plan, idata, odata) -cpdef void execute_r2c_double(intptr_t plan, intptr_t idata, intptr_t odata): +cpdef void execute_r2c_double(intptr_t plan, intptr_t idata, intptr_t odata) except*: """See `fftw_execute_dft_r2c`.""" fftw_execute_dft_r2c(plan, idata, odata) -cpdef void execute_c2r_double(intptr_t plan, intptr_t idata, intptr_t odata): +cpdef void execute_c2r_double(intptr_t plan, intptr_t idata, intptr_t odata) except*: """See `fftw_execute_dft_c2r`.""" fftw_execute_dft_c2r(plan, idata, odata) @@ -390,66 +405,82 @@ cpdef intptr_t plan_many_c2r_float(int rank, n, int batch, intptr_t in_, inembed return ret -cpdef void execute_c2c_float(intptr_t plan, intptr_t idata, intptr_t odata): +cpdef void execute_c2c_float(intptr_t plan, intptr_t idata, intptr_t odata) except*: """See `fftwf_execute_dft`.""" fftwf_execute_dft(plan, idata, odata) -cpdef void execute_r2c_float(intptr_t plan, intptr_t idata, intptr_t odata): +cpdef void execute_r2c_float(intptr_t plan, intptr_t idata, intptr_t odata) except*: """See `fftwf_execute_dft_r2c`.""" fftwf_execute_dft_r2c(plan, idata, odata) -cpdef void execute_c2r_float(intptr_t plan, intptr_t idata, intptr_t odata): +cpdef void execute_c2r_float(intptr_t plan, intptr_t idata, intptr_t odata) except*: """See `fftwf_execute_dft_c2r`.""" fftwf_execute_dft_c2r(plan, idata, odata) -cpdef int init_threads_double(): +cpdef int init_threads_double() except 0: """See `fftw_init_threads`.""" - return fftw_init_threads() + cdef intptr_t ret + with nogil: + ret = fftw_init_threads() + check_init_threads(ret) + return ret -cpdef int init_threads_float(): +cpdef int init_threads_float() except 0: """See `fftwf_init_threads`.""" - return fftwf_init_threads() + cdef intptr_t ret + with nogil: + ret = fftwf_init_threads() + check_init_threads(ret) + return ret -cpdef void plan_with_nthreads_double(int nthreads): +cpdef void plan_with_nthreads_double(int nthreads) except*: """See `fftw_plan_with_nthreads`.""" fftw_plan_with_nthreads(nthreads) -cpdef void plan_with_nthreads_float(int nthreads): +cpdef void plan_with_nthreads_float(int nthreads) except*: """See `fftwf_plan_with_nthreads`.""" fftwf_plan_with_nthreads(nthreads) -cpdef int planner_nthreads_double(): +cpdef int planner_nthreads_double() except? 
0: """See `fftw_planner_nthreads`.""" - return fftw_planner_nthreads() + cdef intptr_t ret + with nogil: + ret = fftw_planner_nthreads() + check_nthreads(ret) + return ret -cpdef int planner_nthreads_float(): +cpdef int planner_nthreads_float() except? 0: """See `fftwf_planner_nthreads`.""" - return fftwf_planner_nthreads() + cdef intptr_t ret + with nogil: + ret = fftwf_planner_nthreads() + check_nthreads(ret) + return ret -cpdef void cleanup_threads_double(): +cpdef void cleanup_threads_double() except*: """See `fftw_cleanup_threads`.""" fftw_cleanup_threads() -cpdef void cleanup_threads_float(): +cpdef void cleanup_threads_float() except*: """See `fftwf_cleanup_threads`.""" fftwf_cleanup_threads() -cpdef void destroy_plan_double(intptr_t plan): +cpdef void destroy_plan_double(intptr_t plan) except*: """See `fftw_destroy_plan`.""" fftw_destroy_plan(plan) -cpdef void destroy_plan_float(intptr_t plan): +cpdef void destroy_plan_float(intptr_t plan) except*: """See `fftwf_destroy_plan`.""" fftwf_destroy_plan(plan) diff --git a/nvmath/device/common.py b/nvmath/device/common.py index fb6e464..53dd12a 100644 --- a/nvmath/device/common.py +++ b/nvmath/device/common.py @@ -10,8 +10,13 @@ SHARED_DEVICE_DOCSTRINGS = { "compiler": "A string to specify the compiler for the device code, currently supports ``None`` (default) and ``'Numba'``", - "precision": "The computation precision specified as a numpy float dtype, currently supports ``numpy.float16``, ``numpy.float32`` and ``numpy.float64``.", + # + "precision": """\ +The computation precision specified as a numpy float dtype, currently supports ``numpy.float16``, ``numpy.float32`` and +``numpy.float64``.""".replace("\n", " "), + # "code_type": "The target GPU code and compute-capability.", + # "execution": "A string specifying the execution method, can be ``'Block'`` or ``'Thread'``.", } diff --git a/nvmath/device/common_cuda.py b/nvmath/device/common_cuda.py index 0dbd7a1..adf9b25 100644 --- a/nvmath/device/common_cuda.py +++ b/nvmath/device/common_cuda.py @@ -126,9 +126,11 @@ def get_default_code_type(): def current_device_lto(): """ - A helper function to get the default code type for link time optimization (LTO) on the current device. + A helper function to get the default code type for link time optimization (LTO) on the + current device. Returns: - A :class:`CodeType` object representing the default LTO code type for the current device. + A :class:`CodeType` object representing the default LTO code type for the current + device. """ return get_default_code_type() diff --git a/nvmath/device/common_mathdx.py b/nvmath/device/common_mathdx.py index 7a55371..12c95d0 100644 --- a/nvmath/device/common_mathdx.py +++ b/nvmath/device/common_mathdx.py @@ -75,8 +75,7 @@ def check_cuda_home(): os.path.join(conda_include, "cuda/std/type_traits") ): CUDA_HOME = (os.path.join(conda_include, ".."),) - # TODO: verify conda case. - CURAND_HOME = os.path.join(CUDA_HOME, "include") + CURAND_HOME = os.path.join(CUDA_HOME[0], "include") return # Try local @@ -91,9 +90,8 @@ def check_cuda_home(): ) elif CUDA_PATH is not None and CUDA_HOME is None: CUDA_HOME = CUDA_PATH - elif CUDA_PATH is not None and CUDA_HOME is not None: - if CUDA_HOME != CUDA_PATH: - warnings.warn("Both CUDA_HOME and CUDA_PATH are set but not consistent. " "Ignoring CUDA_PATH...") + elif CUDA_PATH is not None and CUDA_HOME is not None and CUDA_HOME != CUDA_PATH: + warnings.warn("Both CUDA_HOME and CUDA_PATH are set but not consistent. 
Ignoring CUDA_PATH...") CUDA_HOME = (CUDA_HOME,) CURAND_HOME = os.path.join(CUDA_HOME[0], "include") diff --git a/nvmath/device/common_numba.py b/nvmath/device/common_numba.py index 627ca6a..b3f1287 100644 --- a/nvmath/device/common_numba.py +++ b/nvmath/device/common_numba.py @@ -51,9 +51,9 @@ def make_dx_codegen_one_arg(context, builder, type_, arg): else: val = arg return (intTy, val) - # Floats and Complex are passed by reference (pointer) - # This is because some CUDA C++ types, such as __half2 are non-trivial, and those must be passed - # by reference. For consistency we pass everything by reference. + # Floats and Complex are passed by reference (pointer) This is because some CUDA C++ + # types, such as __half2 are non-trivial, and those must be passed by reference. For + # consistency we pass everything by reference. elif type_ in [float16x2_type, float16x4_type, float32x2_type, float64x2_type] or isinstance( # noqa: UP038 type_, (types.Float, types.Complex) ): diff --git a/nvmath/device/cublasdx.py b/nvmath/device/cublasdx.py index 931cc84..cc9e68c 100644 --- a/nvmath/device/cublasdx.py +++ b/nvmath/device/cublasdx.py @@ -27,16 +27,32 @@ CUBLASDX_DOCSTRING = SHARED_DEVICE_DOCSTRINGS.copy() CUBLASDX_DOCSTRING.update( { - "size": "A sequence of integers denoting the three dimensions ``(m, n, k)`` for the matrix multiplication problem.", - "data_type": "The data type of the input matrices, can be either ``'real'`` or ``'complex'``.", - "block_size": "The total block size, optional. If not provided or set to ``'suggested'``, " - "will be set to a suggested value for 1D block dim. ", - "block_dim": "The block dimension for launching the CUDA kernel, optional. If not provided or set to ``'suggested'``, " - "will be set to a suggested value. Can't not be used when `block_size` is explicitly specified.", - "leading_dimension": "The leading dimensions for the input matrices, optional. If not provided, will be set to match the matrix row/column dimension. " - "Alternatively, if provided as ``'suggested'``, will be set to a suggested value for optimal performance. ", - "transpose_mode": "The transpose mode for all input matrices. If not provided, no transposition by default.", - "function": "A string specifying the name of the function. Currently supports ``'MM'`` (default) for matrix multiplication.", + "size": """\ +A sequence of integers denoting the three dimensions ``(m, n, k)`` for the matrix multiplication +problem.""".replace("\n", " "), + # + "data_type": """\ +The data type of the input matrices, can be either ``'real'`` or ``'complex'``.""".replace("\n", " "), + # + "block_size": """\ +The total block size, optional. If not provided or set to ``'suggested'``, will be set to a suggested value for 1D block +dim. """.replace("\n", " "), + # + "block_dim": """\ +The block dimension for launching the CUDA kernel, optional. If not provided or set to ``'suggested'``, will be set to a +suggested value. Can't not be used when `block_size` is explicitly specified.""".replace("\n", " "), + # + "leading_dimension": """\ +The leading dimensions for the input matrices, optional. If not provided, will be set to match the matrix row/column +dimension. Alternatively, if provided as ``'suggested'``, will be set to a suggested value for optimal performance. +""".replace("\n", " "), + # + "transpose_mode": """\ +The transpose mode for all input matrices. If not provided, no transposition by default.""".replace("\n", " "), + # + "function": """\ +A string specifying the name of the function. 
Currently supports ``'MM'`` (default) for matrix +multiplication.""".replace("\n", " "), } ) @@ -48,25 +64,36 @@ @docstring_decorator(CUBLASDX_DOCSTRING, skip_missing=False) class BlasOptions: """ - A class that encapsulates a partial BLAS device function. - A partial device function can be queried for available or optimal values for the some knobs (such as `leading_dimension` or `block_dim`). - It does not contain a compiled, ready-to-use, device function until finalized using :meth:`create`. + A class that encapsulates a partial BLAS device function. A partial device function can + be queried for available or optimal values for the some knobs (such as + `leading_dimension` or `block_dim`). It does not contain a compiled, ready-to-use, + device function until finalized using :meth:`create`. Args: size: {size} + precision: {precision} + data_type: {data_type} + code_type (CodeType): {code_type} + block_size (int): {block_size} + block_dim (Dim3): {block_dim} + leading_dimension (LeadingDimension): {leading_dimension} + transpose_mode (TransposeMode): {transpose_mode} + function (str): {function} + execution (str): {execution} See Also: The attributes of this class provide a 1:1 mapping with the CUDA C++ cuBLASDx APIs. - For further details, please refer to `cuBLASDx documentation `_. + For further details, please refer to `cuBLASDx documentation + `_. """ def __init__( @@ -88,20 +115,20 @@ def __init__( code_type = CodeType(code_type[0], ComputeCapability(*code_type[1])) if code_type.cc.major < 7: raise RuntimeError( - "Minimal compute capability 7.0 is required by cuBLASDx, got " - f"{code_type.cc.major}.{code_type.cc.minor}" + "Minimal compute capability 7.0 is required by cuBLASDx, got " f"{code_type.cc.major}.{code_type.cc.minor}" ) if len(transpose_mode) != 2: raise ValueError( - f"transpose_mode should be an instance of TransposeMode or a 2-tuple ; got transpose_mode = {transpose_mode}" + "transpose_mode should be an instance of TransposeMode or a 2-tuple ; " f"got transpose_mode = {transpose_mode}" ) transpose_mode = TransposeMode(*transpose_mode) if isinstance(leading_dimension, tuple): if len(leading_dimension) != 3: raise ValueError( - f"leading_dimension should be a 3-tuple, an instance of LeadingDimension, 'suggested' or None ; got leading_dimension = {leading_dimension}" + "leading_dimension should be a 3-tuple, an instance of LeadingDimension, 'suggested' or None ; " + f"got leading_dimension = {leading_dimension}" ) else: leading_dimension = LeadingDimension(*leading_dimension) @@ -118,14 +145,13 @@ def __init__( block_dim = "suggested" else: block_dim = Dim3(block_size, 1, 1) - if block_dim is not None: - if isinstance(block_dim, tuple): - if len(block_dim) != 3: - raise ValueError( - f"block_dim should be a 3-tuple, an instance of Dim3, 'suggested' or None ; got block_dim = {block_dim}" - ) - else: - block_dim = Dim3(*block_dim) + if block_dim is not None and isinstance(block_dim, tuple): + if len(block_dim) != 3: + raise ValueError( + f"block_dim should be a 3-tuple, an instance of Dim3, 'suggested' or None ; got block_dim = {block_dim}" + ) + else: + block_dim = Dim3(*block_dim) validate( size=size, @@ -423,9 +449,7 @@ def __init__(self, **kwargs): super().__init__(**kwargs) # Add Numba logic - self._codegened = codegen( - {"value_type": self.value_type, "symbols": self._symbols, "execution": self.execution}, self - ) + self._codegened = codegen({"value_type": self.value_type, "symbols": self._symbols, "execution": self.execution}, self) @property def value_type(self): 
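The docstring hunks in this file all follow one pattern: each fragment is written as a wrapped triple-quoted string, presumably to keep source lines within the project's line-length limit, and is then collapsed back to a single line with .replace("\n", " ") before docstring_decorator substitutes it into the class and function docstrings. Below is a minimal, self-contained Python sketch of that pattern; simple_docstring_decorator is a hypothetical stand-in for the project's docstring_decorator helper (included only so the sketch runs on its own), and the fragment text is copied from the precision entry earlier in this diff.

# Sketch of the wrapped-fragment docstring pattern used in these hunks.
# The backslash after the opening quotes avoids a leading newline; the
# replace("\n", " ") call turns the wrapped source text into a one-line fragment.
PRECISION_DOC = """\
The computation precision specified as a numpy float dtype, currently supports
``numpy.float16``, ``numpy.float32`` and ``numpy.float64``.""".replace("\n", " ")


def simple_docstring_decorator(fragments):
    # Hypothetical stand-in for the project's docstring_decorator: fill the
    # {placeholders} in the decorated object's docstring from the fragment dict.
    def wrap(func):
        func.__doc__ = func.__doc__.format(**fragments)
        return func
    return wrap


@simple_docstring_decorator({"precision": PRECISION_DOC})
def matmul_like(precision=None):
    """
    Args:
        precision: {precision}
    """


print(matmul_like.__doc__)  # {precision} is now a single, pre-wrapped line of text.
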
@@ -446,24 +470,36 @@ def __call__(self, *args): @docstring_decorator(CUBLASDX_DOCSTRING, skip_missing=False) def matmul(*, compiler=None, **kwargs): """ - Create an :class:`BlasOptions` object that encapsulates a compiled and ready-to-use device function for matrix multiplication. + Create an :class:`BlasOptions` object that encapsulates a compiled and ready-to-use + device function for matrix multiplication. Args: size: {size} + precision: {precision} + data_type: {data_type} + compiler: {compiler} + code_type (CodeType): {code_type} + block_size (int): {block_size} + block_dim (Dim3): {block_dim} + leading_dimension (LeadingDimension): {leading_dimension} + transpose_mode (TransposeMode): {transpose_mode} + function (str): {function} + execution (str): {execution} See Also: - The attributes of :class:`BlasOptions` provide a 1:1 mapping with the CUDA C++ cuBLASDx APIs. - For further details, please refer to `cuBLASDx documentation `_. + The attributes of :class:`BlasOptions` provide a 1:1 mapping with the CUDA C++ + cuBLASDx APIs. For further details, please refer to `cuBLASDx documentation + `_. Examples: @@ -473,17 +509,27 @@ def matmul(*, compiler=None, **kwargs): >>> m, n, k = 32, 16, 64 >>> block_size = 256 - Use :func:`nvmath.device.matmul` to create the compiled matrix multiplication object: + Use :func:`nvmath.device.matmul` to create the compiled matrix multiplication + object: - >>> MM = matmul(size=(m, n, k), precision=np.float32, data_type='real', transpose_mode=('non_transposed', 'transposed'), - ... execution='Block', block_size=block_size, compiler='numba') + >>> MM = matmul( + ... size=(m, n, k), + ... precision=np.float32, + ... data_type="real", + ... transpose_mode=("non_transposed", "transposed"), + ... execution="Block", + ... block_size=block_size, + ... compiler="numba", + ... ) - Pass `link=MM.files` to the `numba.cuda.jit` decorator when defining your kernel to link with the compiled code. + Pass ``link=MM.files`` to the :func:`numba.cuda.jit` decorator when defining your + kernel to link with the compiled code. - cuBLASDx works on shared memory arrays. - It requires column-major (F order) arrays but `cuda.shared.array` creates row-major (C order) arrays only. - You can emulate a column-major array by flipping dimensions. - With your shared memory arrays ready and filled with actual data, you can run the matrix multiplication by calling `MM` + cuBLASDx works on shared memory arrays. It requires column-major (F order) arrays + but :class:`cuda.shared.array` creates row-major (C order) arrays only. You can + emulate a column-major array by flipping dimensions. With your shared memory arrays + ready and filled with actual data, you can run the matrix multiplication by calling + `MM` >>> a_dim, b_dim, c_dim = MM.a_dim, MM.b_dim, MM.c_dim >>> @cuda.jit(link=MM.files) @@ -492,13 +538,14 @@ def matmul(*, compiler=None, **kwargs): ... b = cuda.shared.array(shape=(b_dim[1], b_dim[0]), dtype=np.float32) ... c = cuda.shared.array(shape=(c_dim[1], c_dim[0]), dtype=np.float32) ... # TODO: Populate the arrays with actual data. - ... alpha, beta = 1., 0. + ... alpha, beta = 1.0, 0.0 ... MM(alpha, a, b, beta, c) ... cuda.syncthreads() ... # TODO: Copy the result (c) from the shared memory >>> f[1, block_size]() - Further examples can be found in the `nvmath/examples/device `_ directory. + Further examples can be found in the `nvmath/examples/device + `_ directory. 
""" check_in("compiler", compiler, [None, "numba"]) if compiler is None: diff --git a/nvmath/device/cublasdx_backend.py b/nvmath/device/cublasdx_backend.py index 54fc010..1cc27dd 100644 --- a/nvmath/device/cublasdx_backend.py +++ b/nvmath/device/cublasdx_backend.py @@ -14,13 +14,19 @@ class LeadingDimension(namedtuple("LeadingDimension", ["a", "b", "c"])): - r""" - A namedtuple class that encapsulates the three leading dimensions in matrix multiplication :math:`C = \alpha Op(A) Op(B) + \beta C`. + """ + A namedtuple class that encapsulates the three leading dimensions in matrix + multiplication :math:`C = \\alpha Op(A) Op(B) + \\beta C`. Attributes: - a (int): The leading dimension of two-dimensional array used to store the matrix ``A``. - b (int): The leading dimension of two-dimensional array used to store the matrix ``B``. - c (int): The leading dimension of two-dimensional array used to store the matrix ``C``. + a (int): The leading dimension of two-dimensional array used to store the matrix + ``A``. + + b (int): The leading dimension of two-dimensional array used to store the matrix + ``B``. + + c (int): The leading dimension of two-dimensional array used to store the matrix + ``C``. """ pass @@ -28,11 +34,15 @@ class LeadingDimension(namedtuple("LeadingDimension", ["a", "b", "c"])): class TransposeMode(namedtuple("TransposeMode", ["a", "b"])): """ - A namedtuple class that encapsulates the transpose mode for input matrices ``A`` and ``B`` in matrix multiplication. + A namedtuple class that encapsulates the transpose mode for input matrices ``A`` and + ``B`` in matrix multiplication. Attributes: - a: The operation that needs to be performed with input matrix ``A``, currently supports ``'non_transposed'``, ``'transposed'`` and ``'conj_transposed'``. - b: The operation that needs to be performed with input matrix ``B``, currently supports ``'non_transposed'``, ``'transposed'`` and ``'conj_transposed'``. + a: The operation that needs to be performed with input matrix ``A``, currently + supports ``'non_transposed'``, ``'transposed'`` and ``'conj_transposed'``. + + b: The operation that needs to be performed with input matrix ``B``, currently + supports ``'non_transposed'``, ``'transposed'`` and ``'conj_transposed'``. """ pass @@ -53,11 +63,10 @@ def validate(size, data_type, precision, execution, transpose_mode, block_dim, c check_in("transpose_mode.b", transpose_mode.b, allowed_values) else: raise ValueError( - f"transpose_mode should be an instance of {TransposeMode} or a 2-tuple, and individual fields should be one of {allowed_values}. Instead got transpose_mode = {transpose_mode}" + f"transpose_mode should be an instance of {TransposeMode} or a 2-tuple, and individual fields " + f"should be one of {allowed_values}. 
Instead got transpose_mode = {transpose_mode}" ) - if block_dim is None: - pass - elif block_dim == "suggested": + if block_dim in (None, "suggested"): pass elif isinstance(block_dim, Dim3): prod = block_dim[0] * block_dim[1] * block_dim[2] @@ -69,21 +78,16 @@ def validate(size, data_type, precision, execution, transpose_mode, block_dim, c raise ValueError(f"block_dim should be None, a Dim3 instance or 'suggested'; got block_dim = {block_dim}") if code_type is not None: check_code_type(code_type) - if leading_dimension is None: - pass - elif leading_dimension == "suggested": - pass - elif isinstance(leading_dimension, LeadingDimension): + if leading_dimension in (None, "suggested") or isinstance(leading_dimension, LeadingDimension): pass else: raise ValueError( - f"leading_dimension should be None, a LeadingDimension instance or 'suggested'; got leading_dimension = {leading_dimension}" + f"leading_dimension should be None, a LeadingDimension instance or 'suggested'; " + f"got leading_dimension = {leading_dimension}" ) -def generate_MM( - size, precision, data_type, function, transpose_mode, code_type, block_dim, execution, leading_dimension -): +def generate_MM(size, precision, data_type, function, transpose_mode, code_type, block_dim, execution, leading_dimension): if block_dim is not None: block_dim = f"+ BlockDim<{ block_dim[0] }, { block_dim[1] }, { block_dim[2] }>()" else: @@ -95,9 +99,7 @@ def generate_MM( sm = "" if leading_dimension is not None: - leading_dimension = ( - f"+ LeadingDimension<{ leading_dimension.a }, { leading_dimension.b }, { leading_dimension.c }>()" - ) + leading_dimension = f"+ LeadingDimension<{ leading_dimension.a }, { leading_dimension.b }, { leading_dimension.c }>()" else: leading_dimension = "" @@ -135,9 +137,7 @@ def generate_MM( return cpp, hash -def generate_block( - size, precision, data_type, function, transpose_mode, code_type, block_dim, execution, leading_dimension -): +def generate_block(size, precision, data_type, function, transpose_mode, code_type, block_dim, execution, leading_dimension): MM, name = generate_MM( size=size, precision=precision, @@ -222,9 +222,7 @@ def generate_block( return {"cpp": cpp, "names": {"smem_basic": api_name_basic, "smem_ldabc": api_name_ldabc}} -def generate_block_ld( - size, precision, data_type, function, transpose_mode, code_type, block_dim, leading_dimension, execution -): +def generate_block_ld(size, precision, data_type, function, transpose_mode, code_type, block_dim, leading_dimension, execution): MM_str, _ = generate_MM( size=size, precision=precision, diff --git a/nvmath/device/cublasdx_numba.py b/nvmath/device/cublasdx_numba.py index 9af8d81..66b925c 100644 --- a/nvmath/device/cublasdx_numba.py +++ b/nvmath/device/cublasdx_numba.py @@ -28,12 +28,13 @@ def make_codegen(kind, value_type, symbol, a_type, b_type, c_type): if kind == "smem_basic": # smem_basic APIs take the 3 input arrays and alpha/beta as argument # lda, ldb and ldc are based on the underlying Dx type - # (void) ( (value_type*)alpha, (value_type*)a, (value_type*)b, (value_type*)beta, (value_type*)c ) + # (void) ( (value_type*)alpha, (value_type*)a, (value_type*)b, (value_type*)beta, (value_type*)c ) # noqa: W505 return signature(return_type, value_type, a_type, b_type, value_type, c_type), make_function_call(symbol) elif kind == "smem_ldabc": - # smem_ldabc APIs take the 3 input arrays, alpha/beta as argument and lda, ldb and ldc + # smem_ldabc APIs take the 3 input arrays, alpha/beta as argument and lda, ldb and + # ldc # (void) ( 
(value_type*)alpha, (value_type*)a, (unsigned)lda, # (value_type*)b, (unsigned)ldb, # (value_type*)beta, (value_type*)c, (unsigned)ldc ) diff --git a/nvmath/device/cufftdx.py b/nvmath/device/cufftdx.py index 95ff75e..264ff03 100644 --- a/nvmath/device/cufftdx.py +++ b/nvmath/device/cufftdx.py @@ -30,14 +30,26 @@ FFTDX_DOCSTRING = SHARED_DEVICE_DOCSTRINGS.copy() FFTDX_DOCSTRING.update( { - "size": "The size of the FFT to calculate.", - "fft_type": "A string specifying the type of FFT operation, can be ``'c2c'``, ``'c2r'`` or ``'r2c'``.", - "direction": "A string specifying the direction of FFT, can be ``'forward'`` or ``'inverse'``. If not provided, " - "will be ``'forward'`` if complex-to-real FFT is specified and ``'inverse'`` if real-to-complex FFT is specified.", - "ffts_per_block": "The number of FFTs calculated per CUDA block, optional. The default is 1. Alternatively, if provided as ``'suggested'``, " - "will be set to a suggested value", - "elements_per_thread": "The number of elements per thread, optional. The default is 1. Alternatively, if provided as ``'suggested'``, will be set to a suggested value. ", - "real_fft_options": "A dictionary specifying the options for real FFT operation, optional.", + "size": """\ +The size of the FFT to calculate.""".replace("\n", " "), + # + "fft_type": """\ +A string specifying the type of FFT operation, can be ``'c2c'``, ``'c2r'`` or ``'r2c'``.""".replace("\n", " "), + # + "direction": """\ +A string specifying the direction of FFT, can be ``'forward'`` or ``'inverse'``. If not provided, will be ``'forward'`` +if complex-to-real FFT is specified and ``'inverse'`` if real-to-complex FFT is specified.""".replace("\n", " "), + # + "ffts_per_block": """\ +The number of FFTs calculated per CUDA block, optional. The default is 1. Alternatively, if provided as ``'suggested'`` +will be set to a suggested value""".replace("\n", " "), + # + "elements_per_thread": """\ +The number of elements per thread, optional. The default is 1. Alternatively, if provided as ``'suggested'``, will be +set to a suggested value. """.replace("\n", " "), + # + "real_fft_options": """\ +A dictionary specifying the options for real FFT operation, optional.""".replace("\n", " "), } ) @@ -45,30 +57,43 @@ @docstring_decorator(FFTDX_DOCSTRING, skip_missing=False) class FFTOptions: """ - A class that encapsulates a partial FFT device function. - A partial device function can be queried for available or optimal values for the some knobs (such as `leading_dimension` or `block_dim`). - It does not contain a compiled, ready-to-use, device function until finalized using :meth:`create`. + A class that encapsulates a partial FFT device function. A partial device function can + be queried for available or optimal values for the some knobs (such as + `leading_dimension` or `block_dim`). It does not contain a compiled, ready-to-use, + device function until finalized using :meth:`create`. Args: size (int): {size} + precision (str): {precision} + fft_type (str): {fft_type} + code_type (CodeType): {code_type} + execution (str): {execution} + direction (str): {direction} + ffts_per_block (int): {ffts_per_block} + elements_per_thread (int): {elements_per_thread} - real_fft_options (dict): {real_fft_options} User may specify the following options in the dictionary: - - ``'complex_layout'``, currently supports ``'natural'``, ``'packed'``, and ``'full'``. 
+ real_fft_options (dict): {real_fft_options} User may specify the following options + in the dictionary: + + - ``'complex_layout'``, currently supports ``'natural'``, ``'packed'``, and + ``'full'``. - ``'real_mode'``, currently supports ``'normal'`` and ``'folded``. Note: - The class is not meant to used directly with its constructor. Users are instead advised to use :func:`fft` create the object. + The class is not meant to used directly with its constructor. Users are instead + advised to use :func:`fft` create the object. See Also: The attributes of this class provide a 1:1 mapping with the CUDA C++ cuFFTDx APIs. - For further details, please refer to `cuFFTDx documentation `_. + For further details, please refer to `cuFFTDx documentation + `_. """ def __init__( @@ -89,8 +114,7 @@ def __init__( code_type = CodeType(code_type[0], ComputeCapability(*code_type[1])) if code_type.cc.major < 7: raise RuntimeError( - "Minimal compute capability 7.0 is required by cuFFTDx, got " - f"{code_type.cc.major}.{code_type.cc.minor}" + f"Minimal compute capability 7.0 is required by cuFFTDx, got {code_type.cc.major}.{code_type.cc.minor}" ) # @@ -180,9 +204,7 @@ def code_type(self): def valid(self, *knobs): if not (set(knobs) <= {"ffts_per_block", "elements_per_thread"}): - raise ValueError( - f"Unsupported knob. Only valid knobs are ffts_per_block and elements_per_thread but got {knobs}" - ) + raise ValueError(f"Unsupported knob. Only valid knobs are ffts_per_block and elements_per_thread but got {knobs}") constraints = { "fft_type": self.fft_type, @@ -201,9 +223,7 @@ def valid(self, *knobs): global CUFFTDX_DATABASE if CUFFTDX_DATABASE is None: - CUFFTDX_DATABASE = cuFFTDxDatabase.create( - os.path.join(MATHDX_HOME, "include/cufftdx/include/database/records/") - ) + CUFFTDX_DATABASE = cuFFTDxDatabase.create(os.path.join(MATHDX_HOME, "include/cufftdx/include/database/records/")) return CUFFTDX_DATABASE.query(knobs, constraints) @@ -417,29 +437,43 @@ def extensions(self): @docstring_decorator(FFTDX_DOCSTRING, skip_missing=False) def fft(*, compiler=None, **kwargs): """ - Create an :class:`FFTOptions` object that encapsulates a compiled and ready-to-use FFT device function. + Create an :class:`FFTOptions` object that encapsulates a compiled and ready-to-use FFT + device function. Args: size (int): {size} + precision (str): {precision} + fft_type (str): {fft_type} + compiler (str): {compiler} + code_type (CodeType): {code_type}. Optional if compiler is specified as ``'Numba'``. + execution (str): {execution} + direction (str): {direction} + ffts_per_block (int): {ffts_per_block} + elements_per_thread (int): {elements_per_thread} - real_fft_options (dict): {real_fft_options} User may specify the following options in the dictionary: - - ``'complex_layout'``, currently supports ``'natural'``, ``'packed'``, and ``'full'``. + real_fft_options (dict): {real_fft_options} User may specify the following options + in the dictionary: + + - ``'complex_layout'``, currently supports ``'natural'``, ``'packed'``, and + ``'full'``. - ``'real_mode'``, currently supports ``'normal'`` and ``'folded'``. See Also: - The attributes of :class:`FFTOptions` provide a 1:1 mapping with the CUDA C++ cuFFTDx APIs. - For further details, please refer to `cuFFTDx documentation `_. + The attributes of :class:`FFTOptions` provide a 1:1 mapping with the CUDA C++ + cuFFTDx APIs. For further details, please refer to `cuFFTDx documentation + `_. Examples: - Examples can be found in the `nvmath/examples/device `_ directory. 
+ Examples can be found in the `nvmath/examples/device + `_ directory. """ check_in("compiler", compiler, [None, "numba"]) if compiler is None: diff --git a/nvmath/device/cufftdx_backend.py b/nvmath/device/cufftdx_backend.py index 9bb13f6..f4e04e6 100644 --- a/nvmath/device/cufftdx_backend.py +++ b/nvmath/device/cufftdx_backend.py @@ -8,9 +8,7 @@ from .types import REAL_NP_TYPES -def validate( - size, precision, fft_type, execution, direction, ffts_per_block, elements_per_thread, real_fft_options, code_type -): +def validate(size, precision, fft_type, execution, direction, ffts_per_block, elements_per_thread, real_fft_options, code_type): if size <= 0: raise ValueError(f"size must be > 0. Got {size}") check_in("precision", precision, REAL_NP_TYPES) @@ -18,32 +16,27 @@ def validate( check_in("execution", execution, ["Block", "Thread"]) if direction is not None: check_in("direction", direction, ["forward", "inverse"]) - if ffts_per_block is None: - pass - elif ffts_per_block == "suggested": + if ffts_per_block in (None, "suggested"): pass else: if ffts_per_block <= 0: raise ValueError( f"ffts_per_block must be None, 'suggested' or a positive integer ; got ffts_per_block = {ffts_per_block}" ) - if elements_per_thread is None: - pass - elif elements_per_thread == "suggested": + if elements_per_thread in (None, "suggested"): pass else: if elements_per_thread <= 0: raise ValueError( - f"elements_per_thread must be None, 'suggested' or a positive integer ; got elements_per_thread = {elements_per_thread}" + f"elements_per_thread must be None, 'suggested' or a positive integer ; " + f"got elements_per_thread = {elements_per_thread}" ) if real_fft_options is None: pass else: check_contains(real_fft_options, "complex_layout") check_contains(real_fft_options, "real_mode") - check_in( - "real_fft_options['complex_layout']", real_fft_options["complex_layout"], ["natural", "packed", "full"] - ) + check_in("real_fft_options['complex_layout']", real_fft_options["complex_layout"], ["natural", "packed", "full"]) check_in("real_fft_options['real_mode']", real_fft_options["real_mode"], ["normal", "folded"]) check_code_type(code_type) @@ -108,9 +101,7 @@ def generate_FFT( return cpp, name -def generate_block( - size, precision, fft_type, direction, code_type, ffts_per_block, elements_per_thread, real_fft_options -): +def generate_block(size, precision, fft_type, direction, code_type, ffts_per_block, elements_per_thread, real_fft_options): FFT, name = generate_FFT( size=size, precision=precision, diff --git a/nvmath/device/cufftdx_db.py b/nvmath/device/cufftdx_db.py index 77d532c..8857a59 100644 --- a/nvmath/device/cufftdx_db.py +++ b/nvmath/device/cufftdx_db.py @@ -9,9 +9,7 @@ from abc import ABC, abstractmethod from .common_cuda import ComputeCapability -Record = namedtuple( - "Record", ["arch", "fft_type", "precision", "direction", "size", "elements_per_thread", "ffts_per_block"] -) +Record = namedtuple("Record", ["arch", "fft_type", "precision", "direction", "size", "elements_per_thread", "ffts_per_block"]) def _update(dict, **kwargs): @@ -25,13 +23,14 @@ def _update(dict, **kwargs): # -# Those transformations take care of mapping a set of constraints (e.g. arch=800, fft_type=r2c, size=32) -# from the "frontend API" to the "database". 
-# This is necessary because the mapping is not bijective: multiple APIs map to the same implementation -# For instance (fft_type=C2C, size=32) and (fft_type=R2C, size=32) both map to (fft_type=C2C, size=32) under the hood -# Those mapper take care of +# Those transformations take care of mapping a set of constraints (e.g. arch=800, +# fft_type=r2c, size=32) from the "frontend API" to the "database". This is necessary +# because the mapping is not bijective: multiple APIs map to the same implementation For +# instance (fft_type=C2C, size=32) and (fft_type=R2C, size=32) both map to (fft_type=C2C, +# size=32) under the hood Those mapper take care of # [1] Doing the forward mapping (frontend -> db), which is injective - the `fwd` methods -# [2] Undoing the mapping (which is doable because we keep track of the forward mapping internally) - the `inv` methods +# [2] Undoing the mapping (which is doable because we keep track of the forward mapping +# internally) - the `inv` methods # class Mapper(ABC): @staticmethod @@ -87,7 +86,8 @@ def inv(self, **kwargs): return _update(kwargs, execution="Block") -# Fwd: If fft_type=(R2C|C2R) and real_fft_options[real_mode]=folded, set fft_type=C2C and size/=2 and elements_per_thread/=2 (if set) and drop real_fft_options +# Fwd: If fft_type=(R2C|C2R) and real_fft_options[real_mode]=folded, set fft_type=C2C and +# size/=2 and elements_per_thread/=2 (if set) and drop real_fft_options # Inv: Restore the original R2C/C2R. If elements_per_thread was set, restore it class FoldedToC2C(Mapper): # This should be the complement of R2CC2RToC2C.required diff --git a/nvmath/device/cufftdx_numba.py b/nvmath/device/cufftdx_numba.py index cdf25c5..9c85741 100644 --- a/nvmath/device/cufftdx_numba.py +++ b/nvmath/device/cufftdx_numba.py @@ -3,9 +3,8 @@ # SPDX-License-Identifier: Apache-2.0 from numba import types -from numba.core.extending import make_attribute_wrapper, register_model, typeof_impl, intrinsic, overload, models from numba.core.typing import signature -from numba.extending import typeof_impl, models, register_model, make_attribute_wrapper +from numba.extending import typeof_impl, models, register_model, make_attribute_wrapper, intrinsic, overload from .common_numba import NUMBA_FE_TYPES_TO_NUMBA_IR, make_function_call from .common import check_in from .cufftdx_workspace import Workspace @@ -57,7 +56,8 @@ def prepare_args(self, ty, val, **kwargs): # Lowering down to function call to cuFFTDx # io = 'thread' or 'smem' # execution = 'Thread' or 'Block' -# value_type = real or complex numpy type of the input/output thread-private and shared memory +# value_type = real or complex numpy type of the input/output thread-private and shared +# memory # symbols = name of the function, as a string # requires_workspace whether we require workspace, or not, as a bool. 
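For illustration, a standalone sketch of the fwd/inv mapping idea described in the ``cufftdx_db.py`` comment earlier in this hunk, specialized to the folded real-FFT rule. This is not the library's ``Mapper`` API; it only mirrors the behavior the comment describes:

    def fwd(constraints: dict) -> dict:
        # Frontend -> database: a folded R2C/C2R query becomes a half-size C2C query.
        c = dict(constraints)
        real_opts = c.get("real_fft_options") or {}
        if c.get("fft_type") in ("R2C", "C2R") and real_opts.get("real_mode") == "folded":
            c["fft_type"] = "C2C"
            c["size"] //= 2
            if c.get("elements_per_thread"):
                c["elements_per_thread"] //= 2
            c.pop("real_fft_options", None)
        return c

    def inv(original: dict, mapped: dict) -> dict:
        # Database -> frontend: restore the user-facing fft_type, size, and related knobs.
        restored = dict(mapped)
        for key in ("fft_type", "size", "elements_per_thread", "real_fft_options"):
            if key in original:
                restored[key] = original[key]
        return restored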
# @@ -78,10 +78,10 @@ def make_codegen(io, execution, value_type, symbol, requires_workspace): return signature(return_type, array_type), make_function_call(symbol) # Block() APIs have four variants - # (void) ( (value_type*)thread array, (value_type*)shared memory array ) - # (void) ( (value_type*)shared memory array ) - # (void) ( (value_type*)thread array, (value_type*)shared memory array, workspace ) - # (void) ( (value_type*)shared memory array, workspace ) + # (void) ( (value_type*)thread array, (value_type*)shared memory array ) # noqa: W505 + # (void) ( (value_type*)shared memory array ) # noqa: W505 + # (void) ( (value_type*)thread array, (value_type*)shared memory array, workspace ) # noqa: W505 + # (void) ( (value_type*)shared memory array, workspace ) # noqa: W505 # In all cases we pass 3 arguments, with appropriate nullptrs or 0's where needed elif execution == "Block" and io == "thread": diff --git a/nvmath/device/patch.py b/nvmath/device/patch.py index 9e1eac7..733fcbd 100644 --- a/nvmath/device/patch.py +++ b/nvmath/device/patch.py @@ -9,6 +9,7 @@ # import os +import functools import numba from numba.cuda.cudadrv import libs @@ -70,9 +71,7 @@ def patch_codegen(): required_numba_ver = (0, 60) numba_ver = numba.version_info.short if numba_ver != required_numba_ver: - raise RuntimeError( - f"numba version {required_numba_ver} is required, but got {numba.__version__} (aka {numba_ver})" - ) + raise RuntimeError(f"numba version {required_numba_ver} is required, but got {numba.__version__} (aka {numba_ver})") # Add new LTO-IR linker to Numba (from pynvjitlink) pynvjitlink.patch.patch_numba_linker(lto=True) diff --git a/nvmath/device/random.py b/nvmath/device/random.py index 0d92010..7892faa 100644 --- a/nvmath/device/random.py +++ b/nvmath/device/random.py @@ -22,8 +22,8 @@ _INIT_DOC = """init(..., state) Initialize the RNG state. - The arguments depend upon the selected bit generator - (see the overloads of `curand_init` in `cuRAND docs `_). + The arguments depend upon the selected bit generator (see the overloads of `curand_init` + in `cuRAND docs `_). Example: @@ -37,17 +37,19 @@ >>> blocks = 64 >>> nthreads = blocks * threads - Let us show how to use `init` with :class:`nvmath.device.random.StatesPhilox4_32_10` states. - The same applies to :class:`nvmath.device.random.StatesMRG32k3a` and + Let us show how to use `init` with :class:`nvmath.device.random.StatesPhilox4_32_10` + states. The same applies to :class:`nvmath.device.random.StatesMRG32k3a` and :class:`nvmath.device.random.StatesXORWOW`. - First, create an array of states (one per thread) using :class:`nvmath.device.random.StatesPhilox4_32_10` - constructor. + First, create an array of states (one per thread) using + :class:`nvmath.device.random.StatesPhilox4_32_10` constructor. >>> states = random.StatesPhilox4_32_10(nthreads) - Define a kernel to initialize the states. Each thread will initialize one element of `states`. - For the `Philox4_32_10 `_ generator, the `init` arguments are: `seed`, `subsequence`, `offset`. + Define a kernel to initialize the states. Each thread will initialize one element of + `states`. For the `Philox4_32_10 + `_ + generator, the `init` arguments are: `seed`, `subsequence`, `offset`. >>> @cuda.jit(link=compiled_apis.files, extensions=compiled_apis.extension) ... def setup(states): @@ -58,20 +60,24 @@ >>> setup[blocks, threads](states) - Now, you can use the `states` array to generate random numbers using the random samplers available. 
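A sketch of the natural follow-up to the ``setup`` kernel above: drawing samples inside another kernel. The sampler name ``uniform`` is an assumption here; the module generates one wrapper per cuRAND distribution, so substitute the sampler you need:

    >>> import numpy as np
    >>> out = cuda.device_array(nthreads, dtype=np.float32)
    >>> @cuda.jit(link=compiled_apis.files, extensions=compiled_apis.extension)
    ... def draw(states, out):
    ...     i = cuda.grid(1)
    ...     # One state per thread; each call advances that thread's RNG state.
    ...     out[i] = random.uniform(states[i])
    >>> draw[blocks, threads](states, out)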
+ Now, you can use the `states` array to generate random numbers using the random + samplers available. - For Sobol' family of quasirandom number generators, initialization is a bit more complex as it - requires preparing a set of *direction vectors* and *scramble constants*. In this example, we - will setup :class:`nvmath.device.random.StatesScrambledSobol64` states. + For Sobol' family of quasirandom number generators, initialization is a bit more + complex as it requires preparing a set of *direction vectors* and *scramble + constants*. In this example, we will setup + :class:`nvmath.device.random.StatesScrambledSobol64` states. - Direction vectors can be obtained with :func:`nvmath.device.random_helpers.get_direction_vectors64`: + Direction vectors can be obtained with + :func:`nvmath.device.random_helpers.get_direction_vectors64`: >>> from nvmath.device import random_helpers >>> hostVectors = random_helpers.get_direction_vectors64( ... random.random_helpers.DirectionVectorSet.SCRAMBLED_DIRECTION_VECTORS_64_JOEKUO6, nthreads) >>> sobolDirectionVectors = cuda.to_device(hostVectors) - To get scramble constants, use :func:`nvmath.device.random_helpers.get_scramble_constants64`: + To get scramble constants, use + :func:`nvmath.device.random_helpers.get_scramble_constants64`: >>> hostScrambleConstants = random_helpers.get_scramble_constants64(nthreads) >>> sobolScrambleConstants = cuda.to_device(hostScrambleConstants) @@ -201,7 +207,6 @@ def _wrap_sampler(new_name, *, extra_arguments_list: list | None = None, extra_a # Additional arguments for sampler, if any, other than 'state'. num_extra_arguments = len(extra_arguments_list) extra_arguments_str = "" if num_extra_arguments == 0 else ", {}".format(", ".join(extra_arguments_list)) - extra_arguments_str += ")" # Infer data needed for the description template from the name. m = re.match(r"([a-z_]+)([24])?(_([\w]+))?", new_name) @@ -318,18 +323,14 @@ def _create_symbols(): extra_arguments_list = ["mean", "stddev"] extra_arguments_doc = ["The mean value", "The standard deviation"] for sampler in _SAMPLERS_LOG_NORMAL: - function = _wrap_sampler( - sampler, extra_arguments_list=extra_arguments_list, extra_arguments_doc=extra_arguments_doc - ) + function = _wrap_sampler(sampler, extra_arguments_list=extra_arguments_list, extra_arguments_doc=extra_arguments_doc) setattr(random_module, sampler, function) # The poisson distribution requires an extra argument. extra_arguments_list = ["Lambda"] extra_arguments_doc = ["The parameter characterizing the Poisson distribution"] for sampler in _SAMPLERS_POISSON: - function = _wrap_sampler( - sampler, extra_arguments_list=extra_arguments_list, extra_arguments_doc=extra_arguments_doc - ) + function = _wrap_sampler(sampler, extra_arguments_list=extra_arguments_list, extra_arguments_doc=extra_arguments_doc) setattr(random_module, sampler, function) @@ -362,12 +363,13 @@ class Compile: """ Compile the random device APIs with the specified compute capability. - The ``files`` and ``extension`` attributes should be used as the arguments for `numba.cuda.jit` - decorator in Numba kernels which use random device APIs. + The ``files`` and ``extension`` attributes should be used as the arguments for + :py:func:`numba.cuda.jit` decorator in Numba kernels which use random device APIs. Args: - cc: (optional) the compute capability specified as an object of type `nvmath.device.ComputeCapability`. If - not specified, the default compute capability will be used. 
+ cc: (optional) the compute capability specified as an object of type + :py:class:`nvmath.device.ComputeCapability`. If not specified, the default + compute capability will be used. Example: >>> from numba import cuda @@ -385,7 +387,8 @@ def __init__(self, cc: nvmath.device.ComputeCapability | None = None): cc = nvmath.device.common_cuda.get_default_code_type().cc elif not isinstance(cc, nvmath.device.ComputeCapability): raise ValueError( - f"The specified compute capability {cc} is not valid. It must be an object of type `nvmath.device.ComputeCapability`." + f"The specified compute capability {cc} is not valid. " + "It must be an object of type :py:class:`nvmath.device.ComputeCapability`." ) self.cc = cc diff --git a/nvmath/device/random_states.py b/nvmath/device/random_states.py index 08266b7..a544149 100644 --- a/nvmath/device/random_states.py +++ b/nvmath/device/random_states.py @@ -20,7 +20,7 @@ import os import logging -from numba import types, config, cuda +from numba import config xorwow_dtype = np.dtype( [ @@ -173,7 +173,8 @@ def prepare_args(self, ty, val, **kwargs): "curandStateSobol64", "curandStateScrambledSobol64", "curandStatePhilox4_32_10", - # "curandStateMtgp32", # Require additional type parsing of mtgp32_params_fast and mtgp32_kernel_params + # Require additional type parsing of mtgp32_params_fast and mtgp32_kernel_params + # "curandStateMtgp32", ] numpy_curand_states = [] diff --git a/nvmath/device/vector_types_numba.py b/nvmath/device/vector_types_numba.py index 3ef1d5c..1426ced 100644 --- a/nvmath/device/vector_types_numba.py +++ b/nvmath/device/vector_types_numba.py @@ -80,11 +80,11 @@ def __name__(self): @type_callable(vector) def type(context): def typer(x, y=None, z=None, w=None): - if vector_length == 2 and isinstance(x, types.Complex) and y is None and z is None and w is None: - return vector_type - elif vector_length == 2 and all(isinstance(v, types.Float) for v in [x, y]) and z is None and w is None: - return vector_type - elif vector_length == 4 and all(isinstance(v, types.Float) for v in [x, y, z, w]): + if ( + (vector_length == 2 and isinstance(x, types.Complex) and y is None and z is None and w is None) + or (vector_length == 2 and all(isinstance(v, types.Float) for v in [x, y]) and z is None and w is None) + or (vector_length == 4 and all(isinstance(v, types.Float) for v in [x, y, z, w])) + ): return vector_type return typer diff --git a/nvmath/fft/_configuration.py b/nvmath/fft/_configuration.py index e19c617..7c583a6 100644 --- a/nvmath/fft/_configuration.py +++ b/nvmath/fft/_configuration.py @@ -8,6 +8,7 @@ from enum import IntEnum from logging import Logger from typing import Literal +import warnings from nvmath._internal.mem_limit import MEM_LIMIT_RE_PCT, MEM_LIMIT_RE_VAL, MEM_LIMIT_DOC from nvmath.memory import BaseCUDAMemoryManager @@ -15,14 +16,17 @@ @dataclass class ExecutionCUDA: - """A data class for providing GPU execution options to the :class:`FFT` object and the family of wrapper functions :func:`fft`, - :func:`ifft`, :func:`rfft`, and :func:`irfft`. + """ + A data class for providing GPU execution options to the :class:`FFT` object and the + family of wrapper functions :func:`fft`, :func:`ifft`, :func:`rfft`, and :func:`irfft`. Attributes: - device_id: CUDA device ordinal (used if the operand resides on the CPU). Device 0 will be used if not specified. + device_id: CUDA device ordinal (used if the operand resides on the CPU). Device 0 + will be used if not specified. 
See Also: - :class:`ExecutionCPU`, :class:`FFT`, :func:`fft`, :func:`ifft`, :func:`rfft`, and :func:`irfft`. + :class:`ExecutionCPU`, :class:`FFT`, :func:`fft`, :func:`ifft`, :func:`rfft`, and + :func:`irfft`. """ name: Literal["cuda"] = field(default="cuda", init=False) @@ -31,15 +35,18 @@ class ExecutionCUDA: @dataclass class ExecutionCPU: - """A data class for providing CPU execution options to the :class:`FFT` object and the family of wrapper functions :func:`fft`, - :func:`ifft`, :func:`rfft`, and :func:`irfft`. + """ + A data class for providing CPU execution options to the :class:`FFT` object and the + family of wrapper functions :func:`fft`, :func:`ifft`, :func:`rfft`, and :func:`irfft`. Attributes: num_threads: The number of CPU threads used to execute the plan. - If not specified, defaults to the number of CPU cores available to the process. + If not specified, defaults to the number of CPU cores available to the + process. See Also: - :class:`ExecutionCUDA`, :class:`FFT`, :func:`fft`, :func:`ifft`, :func:`rfft`, and :func:`irfft`. + :class:`ExecutionCUDA`, :class:`FFT`, :func:`fft`, :func:`ifft`, :func:`rfft`, and + :func:`irfft`. """ name: Literal["cpu"] = field(default="cpu", init=False) @@ -48,47 +55,75 @@ class ExecutionCPU: @dataclass class FFTOptions: - """A data class for providing options to the :class:`FFT` object and the family of wrapper functions :func:`fft`, - :func:`ifft`, :func:`rfft`, and :func:`irfft`. + """ + A data class for providing options to the :class:`FFT` object and the family of wrapper + functions :func:`fft`, :func:`ifft`, :func:`rfft`, and :func:`irfft`. Attributes: - fft_type: The type of FFT to perform, available options include ``'C2C'``, ``'C2R'``, and ``'R2C'``. The default is ``'C2C'`` - for complex input and ``'R2C'`` for real input. - inplace: Specify if the operation is in-place (`True` or `False`). The operand is overwritten by the result if - ``inplace`` is `True`. The default is `False`. - last_axis_size: For complex-to-real FFT (corresponding to ``fft_type='C2R'``), specify whether the size of the - last axis in the result should be even or odd. The even size is calculated as :math:`2 * (m - 1)`, where :math:`m` is the - the size of the last axis of the operand, and the odd size is calculated as :math:`2 * (m - 1) + 1`. The specified - value should be either ``'even'`` or ``'odd'``, with the default being ``'even'``. - result_layout: The layout to use for the result, either ``'natural'`` or ``'optimized'``. For the ``'natural'`` option, the - result layout is the same as that of the operand. The default is ``'optimized'``, which generally provides much better - performance and should be used if the user doesn't care about the result layout matching the operand layout. However in rare cases, - depending on the device type, shape and strides of the operand, and the FFT dimensions, the ``'natural'`` layout may perform better. - This option is ignored if ``inplace`` is specified to be True. + fft_type: The type of FFT to perform, available options include ``'C2C'``, + ``'C2R'``, and ``'R2C'``. The default is ``'C2C'`` for complex input and + ``'R2C'`` for real input. + + inplace: Specify if the operation is in-place (`True` or `False`). The operand is + overwritten by the result if ``inplace`` is `True`. The default is `False`. + + last_axis_parity: For complex-to-real FFT (corresponding to ``fft_type='C2R'``), + specify whether the size of the last axis in the result should be even or odd. 
+            The even size is calculated as :math:`2 * (m - 1)`, where :math:`m` is the
+            size of the last axis of the operand, and the odd size is calculated as :math:`2
+            * (m - 1) + 1`. The specified value should be either ``'even'`` or ``'odd'``,
+            with the default being ``'even'``.
+
+        last_axis_size: See :attr:`last_axis_parity`.
+
+            .. deprecated:: 0.2.1
+                :attr:`last_axis_size` will be removed in 0.3.0. Use
+                :attr:`last_axis_parity` instead.
+
+        result_layout: The layout to use for the result, either ``'natural'`` or
+            ``'optimized'``. For the ``'natural'`` option, the result layout is the same as
+            that of the operand. The default is ``'optimized'``, which generally provides
+            much better performance and should be used if the user doesn't care about the
+            result layout matching the operand layout. However in rare cases, depending on
+            the device type, shape and strides of the operand, and the FFT dimensions, the
+            ``'natural'`` layout may perform better. This option is ignored if ``inplace``
+            is specified to be True.
+
        device_id: CUDA device ordinal (used if the operand resides on the CPU). Device 0
            will be used if not specified.

            .. deprecated:: 0.2.0
-                The ``device_id`` should be specified as ``execution`` option, see :attr:`ExecutionCUDA.device_id`.
-
-        logger (logging.Logger): Python Logger object. The root logger will be used if a logger object is not provided.
-        blocking: A flag specifying the behavior of the execution functions and methods, such as :func:`fft` and :meth:`FFT.execute`.
-            When ``blocking`` is `True`, the execution methods do not return until the operation is complete. When ``blocking`` is
-            ``"auto"``, the methods return immediately when the input tensor is on the GPU and ``execution`` is set to 'CUDA'.
-            The execution methods always block when the input tensor is on the CPU or ``execution`` is specified to 'CPU' to ensure
-            that the user doesn't inadvertently use the result before it becomes available. The default is ``"auto"``.
-        allocator: An object that supports the :class:`BaseCUDAMemoryManager` protocol, used to draw device memory. If an
-            allocator is not provided, a memory allocator from the library package will be used
-            (:func:`torch.cuda.caching_allocator_alloc` for PyTorch operands, :func:`cupy.cuda.alloc` otherwise).
-            Currently, CPU execution does not allow specifying a memory pool.
+                The ``device_id`` should be specified as an ``execution`` option,
+                see :attr:`ExecutionCUDA.device_id`.
+
+
+        logger (logging.Logger): Python Logger object. The root logger will be used if a
+            logger object is not provided.
+
+        blocking: A flag specifying the behavior of the execution functions and methods,
+            such as :func:`fft` and :meth:`FFT.execute`. When ``blocking`` is `True`, the
+            execution methods do not return until the operation is complete. When
+            ``blocking`` is ``"auto"``, the methods return immediately when the input tensor
+            is on the GPU and ``execution`` is set to 'CUDA'. The execution methods always
+            block when the input tensor is on the CPU or ``execution`` is specified to 'CPU'
+            to ensure that the user doesn't inadvertently use the result before it becomes
+            available. The default is ``"auto"``.
+
+        allocator: An object that supports the :class:`BaseCUDAMemoryManager` protocol, used
+            to draw device memory. If an allocator is not provided, a memory allocator from
+            the library package will be used (:func:`torch.cuda.caching_allocator_alloc` for
+            PyTorch operands, :func:`cupy.cuda.alloc` otherwise). Currently, CPU execution
+            does not allow specifying a memory pool.

    See Also:
-        :class:`ExecutionCUDA`, :class:`ExecutionCPU`, :class:`FFT`, :func:`fft`, :func:`ifft`, :func:`rfft`, and :func:`irfft`.
+        :class:`ExecutionCUDA`, :class:`ExecutionCPU`, :class:`FFT`, :func:`fft`,
+        :func:`ifft`, :func:`rfft`, and :func:`irfft`.
    """

    fft_type: Literal["C2C", "C2R", "R2C"] | None = None
    inplace: bool = False
-    last_axis_size: Literal["even", "odd"] | None = "even"
+    last_axis_parity: Literal["even", "odd"] | None = "even"
+    last_axis_size: None = None
    result_layout: Literal["natural", "optimized"] | None = "optimized"
    device_id: int | None = None
    logger: Logger | None = None
@@ -103,9 +138,17 @@ def __post_init__(self):
        if not isinstance(self.inplace, bool):
            raise ValueError("The value specified for 'inplace' must be of type bool (True or False).")

-        valid_last_axis_sizes = ["even", "odd"]
-        if self.last_axis_size not in valid_last_axis_sizes:
-            raise ValueError(f"The value specified for 'last_axis_size' must be one of {valid_last_axis_sizes}.")
+        # TODO: Remove in version 0.3.0
+        if self.last_axis_size is not None:
+            warnings.warn(
+                "FFTOptions.last_axis_size is deprecated and will be removed in version 0.3.0. "
+                "Use FFTOptions.last_axis_parity instead.",
+                DeprecationWarning,
+            )
+            self.last_axis_parity = self.last_axis_size
+        valid_last_axis_parity = ["even", "odd"]
+        if self.last_axis_parity not in valid_last_axis_parity:
+            raise ValueError(f"The value specified for 'last_axis_parity' must be one of {valid_last_axis_parity}.")

        valid_result_layout_options = ["natural", "optimized"]
        if self.result_layout not in valid_result_layout_options:
@@ -120,12 +163,16 @@ class DeviceCallable:
    """A data class capturing LTO-IR callables.

    Attributes:
-        ltoir: A device-callable function in LTO-IR format, which can be provided as as either as a :class:`bytes` object
-            or as a pointer to the LTO-IR as Python :class:`int`.
-        size: The size of the LTO-IR callable. If not specified and a :class:`bytes` object is passed for ``ltoir``, the size is
-            calculated from it. If a pointer is provided for the LTO-IR, `size` must be specified.
-        data: A device pointer to user data used in the callback. The default is None, which means a null pointer will
-            be used in the callback.
+        ltoir: A device-callable function in LTO-IR format, which can be provided
+            either as a :class:`bytes` object or as a pointer to the LTO-IR as a Python
+            :class:`int`.
+
+        size: The size of the LTO-IR callable. If not specified and a :class:`bytes` object
+            is passed for ``ltoir``, the size is calculated from it. If a pointer is
+            provided for the LTO-IR, `size` must be specified.
+
+        data: A device pointer to user data used in the callback. The default is None,
+            which means a null pointer will be used in the callback.

    See Also:
        :meth:`FFT.plan`, :func:`fft`, :func:`ifft`, :func:`rfft`, and :func:`irfft`.
@@ -140,7 +187,8 @@ def __post_init__(self):
            return
        if not isinstance(self.ltoir, int | bytes):
            raise ValueError(
-                "The LTO-IR code must be provided as a bytes object or as a Python int representing the pointer to the LTO-IR code."
+                "The LTO-IR code must be provided as a bytes object or as a Python int "
+                "representing the pointer to the LTO-IR code."
) if isinstance(self.ltoir, int) and self.size is None: raise ValueError( diff --git a/nvmath/fft/_exec_utils.py b/nvmath/fft/_exec_utils.py index 4f9f74c..dfcc05c 100644 --- a/nvmath/fft/_exec_utils.py +++ b/nvmath/fft/_exec_utils.py @@ -33,9 +33,7 @@ def _check_init_cufft(): try: import cupy except ImportError as e: - raise RuntimeError( - "Currently, the FFT CUDA execution requires cupy. " "Please make sure cupy is installed." - ) from e + raise RuntimeError("Currently, the FFT CUDA execution requires cupy. Please make sure cupy is installed.") from e IS_EXEC_GPU_AVAILABLE = True diff --git a/nvmath/fft/_helpers.py b/nvmath/fft/_helpers.py index 0e22c7b..9a0c4c9 100644 --- a/nvmath/fft/_helpers.py +++ b/nvmath/fft/_helpers.py @@ -53,7 +53,8 @@ def compile_to(function, sig, name, *, compute_capability=None, representation=" else: if not isinstance(compute_capability, str): raise ValueError( - f"The compute capability must be specified as a string ('80', '89', ...). The provided value {compute_capability} is invalid." + f"The compute capability must be specified as a string ('80', '89', ...). " + f"The provided value {compute_capability} is invalid." ) compute_capability = tuple(int(c) for c in compute_capability) @@ -100,7 +101,8 @@ def _compile( if element_dtype not in VALID_DTYPES: raise ValueError( - f"The specified operand data type '{element_dtype}' is not currently supported. It must be one of {VALID_DTYPES.keys()}." + f"The specified operand data type '{element_dtype}' is not currently supported. " + f"It must be one of {VALID_DTYPES.keys()}." ) data_type = getattr(numba.cuda.types.types, element_dtype) @@ -111,7 +113,8 @@ def _compile( info_type = user_info_dtype else: raise ValueError( - f"The specified user information data type '{user_info_dtype}' is not supported. It must be a Numba custom type or one of {VALID_DTYPES.keys()}." + f"The specified user information data type '{user_info_dtype}' is not supported. " + f"It must be a Numba custom type or one of {VALID_DTYPES.keys()}." ) dataptr_type = numba.types.CPointer(data_type) @@ -126,9 +129,7 @@ def _compile( else: insert = TYPE_MAP[element_dtype] return_type = numba.types.void - signature = numba.core.typing.signature( - return_type, dataptr_type, offset_type, data_type, infoptr_type, smemptr_type - ) + signature = numba.core.typing.signature(return_type, dataptr_type, offset_type, data_type, infoptr_type, smemptr_type) if name is None: snippet = VALID_DTYPES[element_dtype] @@ -169,50 +170,63 @@ def _compile( compile_prolog.__doc__ = """ compile_prolog(prolog_fn, element_dtype, user_info_dtype, *, compute_capability=None) - Compile a Python function to LTO-IR to provide as a prolog function for :func:`~nvmath.fft.fft` and :meth:`~nvmath.fft.FFT.plan`. + Compile a Python function to LTO-IR to provide as a prolog function for + :func:`~nvmath.fft.fft` and :meth:`~nvmath.fft.FFT.plan`. Args: - prolog_fn: The prolog function to be compiled to LTO-IR. It must have the signature: ``prolog_fn(data_in, offset, user_info, reserved_for_future_use)``, and - it essentially returns transformed ``data_in`` at ``offset``. + prolog_fn: The prolog function to be compiled to LTO-IR. It must have the signature: + ``prolog_fn(data_in, offset, user_info, reserved_for_future_use)``, and it + essentially returns transformed ``data_in`` at ``offset``. + element_dtype: {element_dtype} + user_info_dtype: {user_info_dtype} + compute_capability: {compute_capability} Returns: The function compiled to LTO-IR as `bytes` object. 
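By analogy with the epilog example shown a little further below, a prolog can be compiled and attached the same way; a sketch, with a hypothetical ``scale_input`` callback:

    >>> import cupy as cp
    >>> import nvmath
    >>> a = cp.random.rand(128) + 1j * cp.random.rand(128)
    >>> def scale_input(data_in, offset, user_info, unused):
    ...     return data_in[offset] * 0.5
    >>> with cp.cuda.Device():
    ...     prolog = nvmath.fft.compile_prolog(scale_input, "complex128", "complex128")
    >>> r = nvmath.fft.fft(a, prolog={"ltoir": prolog})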
See Also: - :func:`~nvmath.fft.fft`, :meth:`~nvmath.fft.FFT.plan`, :meth:`~nvmath.fft.compile_epilog`. + :func:`~nvmath.fft.fft`, :meth:`~nvmath.fft.FFT.plan`, + :meth:`~nvmath.fft.compile_epilog`. Notes: - - The user must ensure that the specified argument types meet the requirements listed above. + - The user must ensure that the specified argument types meet the requirements + listed above. """.format(**SHARED_FFT_HELPER_DOCUMENTATION) compile_prolog.__name__ = "compile_prolog" compile_epilog.__doc__ = """ compile_epilog(epilog_fn, element_dtype, user_info_dtype, *, compute_capability=None) - Compile a Python function to LTO-IR to provide as an epilog function for :func:`~nvmath.fft.fft` and :meth:`~nvmath.fft.FFT.plan`. + Compile a Python function to LTO-IR to provide as an epilog function for + :func:`~nvmath.fft.fft` and :meth:`~nvmath.fft.FFT.plan`. Args: - epilog_fn: The epilog function to be compiled to LTO-IR. It must have the signature: ``epilog_fn(data_out, offset, data, user_info, reserved_for_future_use)``, and + epilog_fn: The epilog function to be compiled to LTO-IR. It must have the signature: + ``epilog_fn(data_out, offset, data, user_info, reserved_for_future_use)``, and it essentially stores transformed ``data`` into ``data_out`` at ``offset``. + element_dtype: {element_dtype} + user_info_dtype: {user_info_dtype} + compute_capability: {compute_capability} Returns: The function compiled to LTO-IR as `bytes` object. See Also: - :func:`~nvmath.fft.fft`, :meth:`~nvmath.fft.FFT.plan`, :meth:`~nvmath.fft.compile_prolog`. + :func:`~nvmath.fft.fft`, :meth:`~nvmath.fft.FFT.plan`, + :meth:`~nvmath.fft.compile_prolog`. Examples: - The cuFFT library expects the end user to manage scaling of the outputs, so in order to replicate the ``norm`` - option found in `other Python FFT libraries - `_ we can define an epilog which - performs the scaling. + The cuFFT library expects the end user to manage scaling of the outputs, so in order + to replicate the ``norm`` option found in `other Python FFT libraries + `_ we can + define an epilog which performs the scaling. >>> import cupy as cp >>> import nvmath @@ -230,15 +244,17 @@ def _compile( Define the epilog function for the FFT. >>> def rescale(data_out, offset, data, user_info, unused): - >>> data_out[offset] = data * norm_factor + ... data_out[offset] = data * norm_factor - Compile the epilog to LTO-IR. In a system with GPUs that have different compute capability, the - `compute_capability` option must be specified to the `compile_prolog` or `compile_epilog` helpers. - Alternatively, the epilog can be compiled in the context of the device where the FFT to which the epilog is - provided is executed. In this case we use the current device context, where the operands have been created. + Compile the epilog to LTO-IR. In a system with GPUs that have different compute + capability, the `compute_capability` option must be specified to the + `compile_prolog` or `compile_epilog` helpers. Alternatively, the epilog can be + compiled in the context of the device where the FFT to which the epilog is provided + is executed. In this case we use the current device context, where the operands have + been created. >>> with cp.cuda.Device(): - >>> epilog = nvmath.fft.compile_epilog(rescale, "complex128", "complex128") + ... epilog = nvmath.fft.compile_epilog(rescale, "complex128", "complex128") Perform the forward FFT, applying the rescaling as a epilog. 
@@ -250,6 +266,7 @@ def _compile( >>> assert cp.allclose(r, s) Notes: - - The user must ensure that the specified argument types meet the requirements listed above. + - The user must ensure that the specified argument types meet the requirements + listed above. """.format(**SHARED_FFT_HELPER_DOCUMENTATION) compile_prolog.__name__ = "compile_prolog" diff --git a/nvmath/fft/fft.py b/nvmath/fft/fft.py index 6cd00d1..8af5838 100644 --- a/nvmath/fft/fft.py +++ b/nvmath/fft/fft.py @@ -4,13 +4,16 @@ __all__ = ["FFT", "fft", "ifft", "rfft", "irfft", "UnsupportedLayoutError"] +from typing import Literal +from collections.abc import Sequence from dataclasses import dataclass, astuple as data_cls_astuple import enum import functools import logging import math import operator -from collections.abc import Sequence + +from ._configuration import ExecutionCPU, ExecutionCUDA try: import cupy as cp @@ -40,6 +43,8 @@ FFTW_SUPPORTED_COMPLEX, ) from nvmath._internal import utils +from nvmath._internal.tensor_ifc import Tensor +from nvmath._internal.layout import is_contiguous_layout, is_contiguous_in_memory, is_overlapping_layout class UnsupportedLayoutError(Exception): @@ -48,9 +53,13 @@ class UnsupportedLayoutError(Exception): Args: message: The error message. - permutation: The permutation needed to convert the input layout to a supported layout to the FFT operation. The same - permutation needs to be applied to the result to obtain the axis sequence corresponding to the non-permuted input. - axes: The dimensions along which the FFT is performed corresponding to the permuted operand layout. + + permutation: The permutation needed to convert the input layout to a supported + layout to the FFT operation. The same permutation needs to be applied to the + result to obtain the axis sequence corresponding to the non-permuted input. + + axes: The dimensions along which the FFT is performed corresponding to the permuted + operand layout. """ def __init__(self, message, permutation, axes): @@ -106,25 +115,39 @@ class CBStoreType(enum.IntEnum): SHARED_FFT_DOCUMENTATION = utils.COMMON_SHARED_DOC_MAP.copy() SHARED_FFT_DOCUMENTATION.update( { - "axes": "The dimensions along which the FFT is performed. ``axes[-1]`` is the 'last transformed' axis for rffts. Currently, it is required that the axes are contiguous " - "and include the first or the last dimension. Only up to 3D FFTs are supported.", - "options": "Specify options for the FFT as a :class:`FFTOptions` object. " - "Alternatively, a `dict` containing the parameters for the ``FFTOptions`` constructor can also be provided. " - "If not specified, the value will be set to the default-constructed ``FFTOptions`` object.", - "execution": "Specify execution space options for the FFT as a :class:`ExecutionCUDA` or :class:`ExecutionCPU` object. " - "Alternatively, a string ('cuda' or 'cpu'), or a `dict` with the 'name' key set to 'cpu' or 'cuda' " - "and optional parameters relevant to the given execution space. " - "If not specified, the execution space will be selected to match operand's storage (in GPU or host memory), " - "and the corresponding :class:`ExecutionCUDA` or :class:`ExecutionCPU` object will be default-constructed.", - "prolog": "Provide device-callable function in LTO-IR format to use as load-callback as an object of type :class:`DeviceCallable`. " - "Alternatively, a `dict` containing the parameters for the ``DeviceCallable`` constructor can also be provided. The default is no prolog. 
" - "Currently, callbacks are supported only with CUDA execution.", - "epilog": "Provide device-callable function in LTO-IR format to use as store-callback as an object of type :class:`DeviceCallable`. " - "Alternatively, a `dict` containing the parameters for the ``DeviceCallable`` constructor can also be provided. The default is no epilog. " - "Currently, callbacks are supported only with CUDA execution.", - "direction": "Specify whether forward or inverse FFT is performed (:class:`FFTDirection` object, or as a string from ['forward', 'inverse'], " - "or as an int from [-1, 1] denoting forward and inverse directions respectively).", - "fft_key": "A tuple as the key to represent the input FFT problem.", + "axes": """\ +The dimensions along which the FFT is performed. ``axes[-1]`` is the 'last transformed' axis for rffts. Currently, it is +required that the axes are contiguous and include the first or the last dimension. Only up to 3D FFTs are +supported.""".replace("\n", " "), + # + "options": """\ +Specify options for the FFT as a :class:`FFTOptions` object. Alternatively, a `dict` containing the parameters for the +``FFTOptions`` constructor can also be provided. If not specified, the value will be set to the default-constructed +``FFTOptions`` object.""".replace("\n", " "), + # + "execution": """\ +Specify execution space options for the FFT as a :class:`ExecutionCUDA` or :class:`ExecutionCPU` object. Alternatively, +a string ('cuda' or 'cpu'), or a `dict` with the 'name' key set to 'cpu' or 'cuda' and optional parameters relevant to +the given execution space. If not specified, the execution space will be selected to match operand's storage (in GPU or +host memory), and the corresponding :class:`ExecutionCUDA` or :class:`ExecutionCPU` object will be +default-constructed.""".replace("\n", " "), + # + "prolog": """\ +Provide device-callable function in LTO-IR format to use as load-callback as an object of type :class:`DeviceCallable`. +Alternatively, a `dict` containing the parameters for the ``DeviceCallable`` constructor can also be provided. The +default is no prolog. Currently, callbacks are supported only with CUDA execution.""".replace("\n", " "), + # + "epilog": """\ +Provide device-callable function in LTO-IR format to use as store-callback as an object of type :class:`DeviceCallable`. +Alternatively, a `dict` containing the parameters for the ``DeviceCallable`` constructor can also be provided. The +default is no epilog. Currently, callbacks are supported only with CUDA execution.""".replace("\n", " "), + # + "direction": """\ +Specify whether forward or inverse FFT is performed (:class:`FFTDirection` object, or as a string from ['forward', +'inverse'], "or as an int from [-1, 1] denoting forward and inverse directions respectively).""".replace("\n", " "), + # + "fft_key": """\ +A tuple as the key to represent the input FFT problem.""".replace("\n", " "), } ) @@ -181,7 +204,8 @@ def _get_fft_result_and_compute_types(dtype, fft_abstract_type): def _get_fft_default_direction(fft_abstract_type): """ - Return the default FFT direction (as object of type configuration.FFTDirection) based on the FFT type. + Return the default FFT direction (as object of type configuration.FFTDirection) based on + the FFT type. 
""" if fft_abstract_type in ["C2C", "R2C"]: return _configuration.FFTDirection.FORWARD @@ -195,12 +219,21 @@ def _get_size(shape): return functools.reduce(operator.mul, shape, 1) -def _get_last_axis_id_and_size(axes, operand_shape, fft_abstract_type, last_axis_size): +def _get_last_axis_id_and_size( + axes: Sequence[int], + operand_shape: Sequence[int], + fft_abstract_type: Literal["C2C", "C2R", "R2C"], + last_axis_parity: Literal["even", "odd"], +) -> tuple[int, int]: """ - axes = The user-specified or default FFT axes. - operand_shape = The input operand shape. - fft_abstract_type = The "abstract" type of the FFT ('C2C', 'C2R', 'R2C'). - last_axis_size = For 'C2R' FFTs, specify whether the last axis size is even or odd. + Args: + axes: The user-specified or default FFT axes. + + operand_shape: The input operand shape. + + fft_abstract_type: The "abstract" type of the FFT ('C2C', 'C2R', 'R2C'). + + last_axis_parity: For 'C2R' FFTs, specify whether the last axis size is even or odd. Returns the last axis ID and the corresponding axis size required for the result. """ @@ -210,9 +243,9 @@ def _get_last_axis_id_and_size(axes, operand_shape, fft_abstract_type, last_axis return last_axis_id, operand_shape[last_axis_id] if fft_abstract_type == "C2R": - if last_axis_size == "even": + if last_axis_parity == "even": return last_axis_id, 2 * (operand_shape[last_axis_id] - 1) - elif last_axis_size == "odd": + elif last_axis_parity == "odd": return last_axis_id, 2 * operand_shape[last_axis_id] - 1 else: raise AssertionError("Unreachable.") @@ -221,21 +254,12 @@ def _get_last_axis_id_and_size(axes, operand_shape, fft_abstract_type, last_axis return last_axis_id, operand_shape[last_axis_id] // 2 + 1 -def _contiguous_layout(sorted_shape, sorted_strides): - return all(sorted_shape[s - 1] * sorted_strides[s - 1] == sorted_strides[s] for s in range(1, len(sorted_strides))) - - -def contiguous_in_memory(shape, strides): - """ - Check if the provided (shape, strides) result in a contiguous memory layout. - """ - sorted_strides, sorted_shape = zip(*sorted(zip(strides, shape, strict=True)), strict=True) - return _contiguous_layout(sorted_shape, sorted_strides) - - -def overlapping_layout(shape, strides): - sorted_strides, sorted_shape = zip(*sorted(zip(strides, shape, strict=True)), strict=True) - return any(sorted_shape[s - 1] * sorted_strides[s - 1] > sorted_strides[s] for s in range(1, len(sorted_strides))) +def check_inplace_overlapping_layout(operand: Tensor): + if is_overlapping_layout(operand.shape, operand.strides): + raise ValueError( + f"In-place transform is not supported because the tensor with shape " + f"{operand.shape} and strides {operand.strides} overlaps in memory." + ) def check_embedding_possible(strides, presorted=False): @@ -254,17 +278,17 @@ def check_batch_tileable(sorted_batch_shape, sorted_batch_strides): """ Check if FFT layout is tileable across the specified batch layout. 
""" - return _contiguous_layout(sorted_batch_shape, sorted_batch_strides) + return is_contiguous_layout(sorted_batch_shape, sorted_batch_strides) def check_contiguous_layout(axes, strides, shape): if not axes: return True sorted_batch_strides, sorted_batch_shape = zip(*sorted((strides[a], shape[a]) for a in axes), strict=True) - return _contiguous_layout(sorted_batch_shape, sorted_batch_strides) + return is_contiguous_layout(sorted_batch_shape, sorted_batch_strides) -def calculate_embedding_shape(shape, strides): +def calculate_embedding_shape(shape: Sequence[int], strides: Sequence[int]): """ Calculate the embedding shape for the given shape and strides. """ @@ -272,9 +296,7 @@ def calculate_embedding_shape(shape, strides): # The shape is used to resolve cases like (1, 2, 1) : (2, 1, 1) in CuTe notation. ordered_strides, _, order = zip(*sorted(zip(strides, shape, range(n), strict=True)), strict=True) - ordered_shape = [ordered_strides[i] // ordered_strides[i - 1] for i in range(1, len(ordered_strides))] + [ - shape[order[-1]] - ] + ordered_shape = [ordered_strides[i] // ordered_strides[i - 1] for i in range(1, len(ordered_strides))] + [shape[order[-1]]] embedding_shape = [0] * n for o in range(n): @@ -314,7 +336,10 @@ def unsupported_layout_exception(operand_dim, axes, message, logger): fft_dim = len(axes) axes = tuple(range(operand_dim - fft_dim, operand_dim)) - message = f"To convert to a supported layout, create a transposed view using transpose{permutation} and copy the view into a new tensor, using view.copy() for instance, and use axes={axes}." + message = ( + f"To convert to a supported layout, create a transposed view using transpose{permutation} and copy the " + f"view into a new tensor, using view.copy() for instance, and use axes={axes}." + ) logger.error(message) raise UnsupportedLayoutError(message, permutation, axes) @@ -329,35 +354,47 @@ def get_null_logger(name): def get_fft_plan_traits( - operand_shape, - operand_strides, + operand_shape: Sequence[int], + operand_strides: Sequence[int], operand_dtype, - axes, - execution, + axes: Sequence[int], + execution: ExecutionCUDA | ExecutionCPU, *, - fft_abstract_type="C2C", - last_axis_size="even", - result_layout="optimized", - logger=None, -): + fft_abstract_type: Literal["C2C", "C2R", "R2C"] = "C2C", + last_axis_parity: Literal["even", "odd"] = "even", + result_layout: Literal["optimized", "natural"] = "optimized", + logger: logging.Logger | None = None, +) -> PlanTraits: """ - Extract the FFT shape from the operand shape, compute the ordered axes so that the data is C-contiguous in memory, and compute the result shape and strides. + Extract the FFT shape from the operand shape, compute the ordered axes so that the data + is C-contiguous in memory, and compute the result shape and strides. + + Args: + operand_shape: The operand shape + + operand_strides: The operand strides + + axes: The axes over which the FFT is performed. For R2C and C2R transforms, the size + of the last axis in `axes` will change. + + execution: The execution options, an instance of either ExecutionCUDA or + ExecutionCPU class. + + fft_abstract_type: The "abstract" type of the FFT ('C2C', 'C2R', 'R2C'). - operand_shape = The operand shape - operand_strides = The operand strides - axes = The axes over which the FFT is performed. For R2C and C2R transforms, the size of the last axis in `axes` will change. - execution = The execution options, an instance of either ExecutionCUDA or ExecutionCPU class. 
- fft_abstract_type = The "abstract" type of the FFT ('C2C', 'C2R', 'R2C'). - last_axis_size = For 'C2R' FFTs, specify whether the last axis size is even or odd. + last_axis_parity: For 'C2R' FFTs, specify whether the last axis size is even or odd. The data needed for creating a cuFFT plan is returned in the following order: - (result_shape, result_strides), ordered_axes, ordered_fft_in_shape, ordered_fft_out_shape, (istride, idistance), (ostride, odistance) + (result_shape, result_strides), ordered_axes, ordered_fft_in_shape, + ordered_fft_out_shape, (istride, idistance), (ostride, odistance) """ logger = logger if logger is not None else get_null_logger("get_fft_plan_traits_null") if len(axes) > 3: raise ValueError( - f"Only up to 3D FFTs are currently supported. You can use the 'axes' option to specify up to three axes along which to perform the FFT. The current number of dimensions is {len(axes)} corresponding to the axes {axes}." + "Only up to 3D FFTs are currently supported. You can use the 'axes' option to specify up to three axes " + f"along which to perform the FFT. The current number of dimensions is {len(axes)} corresponding to the " + f"axes {axes}." ) # Check for duplicate axis IDs. @@ -369,18 +406,23 @@ def get_fft_plan_traits( # Check if an embedding is possible for the provided operand layout. if not check_embedding_possible(operand_strides): - message = f"The operand layout corresponding to shape = {operand_shape} and strides = {operand_strides} is not currently supported because it does not have a suitable embedding dimension." + message = ( + f"The operand layout corresponding to shape = {operand_shape} and strides = {operand_strides} is " + "not currently supported because it does not have a suitable embedding dimension." + ) unsupported_layout_exception(operand_dim, axes, message, logger) # Compute the embedding shape for the operand. operand_embedding_shape, axis_order = calculate_embedding_shape(operand_shape, operand_strides) logger.debug(f"The operand embedding shape = {operand_embedding_shape}.") - # The first or the last *ordered* axis must be present in the specified axes to be able to use the "advanced" layout. + # The first or the last *ordered* axis must be present in the specified axes to be able + # to use the "advanced" layout. first, last = axis_order[-1], axis_order[0] if first not in axes and last not in axes: raise ValueError( - f"The first ({first}) or the last ({last}) tensor axis in stride order {axis_order} must be present in the specified FFT axes {axes}." + f"The first ({first}) or the last ({last}) tensor axis in stride order {axis_order} must be present in the " + f"specified FFT axes {axes}." ) # Compute the embedding input shape for the FFT. @@ -388,22 +430,27 @@ def get_fft_plan_traits( # Compute the input shape for the FFT. fft_in_shape, fft_in_strides = zip(*[(operand_shape[a], operand_strides[a]) for a in axes], strict=True) - if not contiguous_in_memory(fft_in_embedding_shape, fft_in_strides): - message = f"The FFT axes {axes} cannot be reordered so that the data is contiguous in memory for operand shape = {operand_shape} and operand strides = {operand_strides}." + if not is_contiguous_in_memory(fft_in_embedding_shape, fft_in_strides): + message = ( + f"The FFT axes {axes} cannot be reordered so that the data is contiguous in memory for " + f"operand shape = {operand_shape} and operand strides = {operand_strides}." 
+        )
        unsupported_layout_exception(operand_dim, axes, message, logger)

-    # Reorder the FFT axes and input shape so that they are contiguous or separated by constant stride in memory.
+    # Reorder the FFT axes and input shape so that they are contiguous or separated by
+    # constant stride in memory.
    quadruple = sorted(
        zip(fft_in_strides, fft_in_shape, fft_in_embedding_shape, axes, strict=True), key=lambda v: v[:2], reverse=True
    )
-    ordered_in_strides, ordered_fft_in_shape, ordered_fft_in_embedding_shape, ordered_axes = zip(
-        *quadruple, strict=True
-    )
+    ordered_in_strides, ordered_fft_in_shape, ordered_fft_in_embedding_shape, ordered_axes = zip(*quadruple, strict=True)

    # Check if R2C and C2R can be supported without copying.
    if fft_abstract_type in ["R2C", "C2R"] and ordered_axes[-1] != axes[-1]:
-        message = f"The last FFT axis specified ({axes[-1]}) must have the smallest stride of all the FFT axes' strides {fft_in_strides} for FFT type '{fft_abstract_type}'."
+        message = (
+            f"The last FFT axis specified ({axes[-1]}) must have the smallest stride of all the FFT axes' "
+            f"strides {fft_in_strides} for FFT type '{fft_abstract_type}'."
+        )
        unsupported_layout_exception(operand_dim, axes, message, logger)

    # Input FFT size and batch size.
@@ -413,10 +460,11 @@ def get_fft_plan_traits(
    fft_batch_size = _get_size(operand_shape) // fft_in_size

    # Output FFT (ordered) shape and size.
-    last_axis_id, last_axis_size = _get_last_axis_id_and_size(axes, operand_shape, fft_abstract_type, last_axis_size)
+    last_axis_id, last_axis_size = _get_last_axis_id_and_size(axes, operand_shape, fft_abstract_type, last_axis_parity)
    if last_axis_size == 0:
        raise ValueError(
-            f"The size of the last FFT axis in the result for FFT type '{fft_abstract_type}' is 0 for operand shape = {operand_shape} and axes = {axes}. To fix this, provide 'last_axis_size' = 'odd' to the FFT options."
+            f"The size of the last FFT axis in the result for FFT type '{fft_abstract_type}' is 0 for operand shape = "
+            f"{operand_shape} and axes = {axes}. To fix this, provide 'last_axis_parity' = 'odd' to the FFT options."
        )
    ordered_fft_out_shape = list(ordered_fft_in_shape)
    index = ordered_axes.index(last_axis_id)
@@ -424,29 +472,36 @@ def get_fft_plan_traits(
    fft_out_size = _get_size(ordered_fft_out_shape)

    # Check that batch dimensions are tileable, as required by the "advanced" layout.
-    sorted_batch_shape, sorted_batch_strides = list(), list()
+    sorted_batch_shape: Sequence[int] = []
+    sorted_batch_strides: Sequence[int] = []
    if batch_axes:
        sorted_batch_strides, sorted_batch_shape = zip(
            *sorted((operand_strides[a], operand_shape[a]) for a in batch_axes), strict=True
        )
        if not check_embedding_possible(sorted_batch_strides, presorted=True):
            raise ValueError(
-                f"The operand layout corresponding to shape = {operand_shape} and strides = {operand_strides} together with the specified axes = {axes} is currently not supported because it is not tileable."
+                f"The operand layout corresponding to shape = {operand_shape} and strides = {operand_strides} "
+                f"together with the specified axes = {axes} is currently not supported because it is not tileable."
) logger.debug(f"The sorted batch shape is {sorted_batch_shape}.") logger.debug(f"The sorted batch strides are {sorted_batch_strides}.") if not check_batch_tileable(sorted_batch_shape, sorted_batch_strides): - message = f"The operand layout corresponding to shape = {operand_shape} and strides = {operand_strides} together with the specified axes = {axes} is currently not supported because it is not tileable." + message = ( + f"The operand layout corresponding to shape = {operand_shape} and strides = {operand_strides} " + f"together with the specified axes = {axes} is currently not supported because it is not tileable." + ) unsupported_layout_exception(operand_dim, axes, message, logger) logger.debug( - f"The operand layout corresponding to shape = {operand_shape} and strides = {operand_strides} together with the specified axes = {axes} IS tileable." + f"The operand layout corresponding to shape = {operand_shape} and strides = {operand_strides} together with " + f"the specified axes = {axes} IS tileable." ) # The result tensor has updated shape for R2C and C2R transforms. result_shape = list(operand_shape) result_shape[last_axis_id] = last_axis_size - # The result tensor layout is either natural or chosen for optimal cuFFT performance, based on the operand layout and user-provided option. + # The result tensor layout is either natural or chosen for optimal cuFFT performance, + # based on the operand layout and user-provided option. # We can keep the input's layout (i.e. operand's extents order of increasing strides) # without performance hit, if the samples do not interleave. @@ -457,20 +512,27 @@ def get_fft_plan_traits( if not is_sample_interleaved or result_layout == "natural": # Natural (== operand) layout. axis_order = axis_order_in_memory(operand_shape, operand_strides) result_strides = calculate_strides(result_shape, axis_order) - # If the resulting output operand is not tilable, keeping the original layout is not possible. - # If `not is_sample_interleaved` the batch must be tilable, - # because the min batch stride is bigger than max fft stride + # If the resulting output operand is not tilable, keeping the original layout is not + # possible. If `not is_sample_interleaved` the batch must be tilable, because the + # min batch stride is bigger than max fft stride if is_sample_interleaved: if not check_contiguous_layout(batch_axes, result_strides, result_shape): - message = f"The operand layout corresponding to shape = {operand_shape} and strides = {operand_strides} together with the specified axes = {axes} is currently not supported with result_layout='natural', because the output batch would not be tileable." + message = ( + f"The operand layout corresponding to shape = {operand_shape} and strides = {operand_strides} " + f"together with the specified axes = {axes} is currently not supported with " + "result_layout='natural', because the output batch would not be tileable." + ) unsupported_layout_exception(operand_dim, axes, message, logger) if not check_contiguous_layout(axes, result_strides, result_shape): - message = f"The operand layout corresponding to shape = {operand_shape} and strides = {operand_strides} together with the specified axes = {axes} is currently not supported with result_layout='natural', because the output sample would be non-contiguous." 
+ message = ( + f"The operand layout corresponding to shape = {operand_shape} and strides = {operand_strides} " + f"together with the specified axes = {axes} is currently not supported with " + "result_layout='natural', because the output sample would be non-contiguous." + ) unsupported_layout_exception(operand_dim, axes, message, logger) else: # Optimized layout. axis_order = tuple( - list(reversed(ordered_axes)) - + sorted((a for a in batch_axes), key=lambda v: (operand_strides[v], operand_shape[v])) + list(reversed(ordered_axes)) + sorted((a for a in batch_axes), key=lambda v: (operand_strides[v], operand_shape[v])) ) result_strides = calculate_strides(result_shape, axis_order) logger.debug(f"The result layout is '{result_layout}' with the result_strides {result_strides}.") @@ -518,10 +580,11 @@ def get_fft_plan_traits( f"The FFT of sample size 1 and half-precision type ({operand_dtype}) " f"of size 1 is not supported by the installed cuFFT version. " ) - # There is a bug that leads to invalid memory access (CTK 12.1) for one-element, - # strided C2C complex32 tensors (either in the input or output) or results in - # CUFFT_INVALID_SIZE (CTK 12.3). This workaround relies on the fact that the - # [i|o]stride effectively does not matter in a one-element sample. + # There is a bug that leads to invalid memory access (CTK 12.1) for + # one-element, strided C2C complex32 tensors (either in the input or output) + # or results in CUFFT_INVALID_SIZE (CTK 12.3). This workaround relies on the + # fact that the [i|o]stride effectively does not matter in a one-element + # sample. elif fft_abstract_type == "C2C": istride = ostride = 1 @@ -563,16 +626,15 @@ def _copy_operand_perhaps( return operand, None else: # For C2R, we need to take a copy to avoid input being overwritten - logger.info( - "For C2R FFT with input operand on GPU, the input is copied to " "avoid being overwritten by cuFFT." - ) + logger.info("For C2R FFT with input operand on GPU, the input is copied to " "avoid being overwritten by cuFFT.") operand_copy = utils.create_empty_tensor( operand.__class__, operand.shape, operand.dtype, device_id, stream_holder, - operand.strides, + verify_strides=True, + strides=operand.strides, ) operand_copy.copy_(operand.tensor, stream_holder=stream_holder) # We don't need to keep the operand backup, because C2R precludes `inplace=True` @@ -595,31 +657,23 @@ def _copy_operand_perhaps( return internal_operand, operand -def create_xt_plan_args( - *, plan_traits=None, fft_abstract_type=None, operand_data_type=None, operand_layout=None, inplace=None -): +def create_xt_plan_args(*, plan_traits=None, fft_abstract_type=None, operand_data_type=None, inplace=None): """ - Create the arguments to xt_make_plan_many() except for the handle. This is also used for computing the FFT key. + Create the arguments to xt_make_plan_many() except for the handle. This is also used for + computing the FFT key. """ assert plan_traits is not None, "Internal error." assert fft_abstract_type is not None, "Internal error." assert operand_data_type is not None, "Internal error." assert inplace is not None, "Internal error." - assert operand_layout is not None, "Internal error." result_data_type, compute_data_type = _get_fft_result_and_compute_types(operand_data_type, fft_abstract_type) # The input shape to the plan should be the logical FFT shape. 
- ordered_plan_shape = ( - plan_traits.ordered_fft_out_shape if fft_abstract_type == "C2R" else plan_traits.ordered_fft_in_shape - ) + ordered_plan_shape = plan_traits.ordered_fft_out_shape if fft_abstract_type == "C2R" else plan_traits.ordered_fft_in_shape # Handle in-place transforms. if inplace: - if overlapping_layout(operand_layout.shape, operand_layout.strides): - raise ValueError( - f"In-place transform is not supported because the tensor with shape {operand_layout.shape} and strides {operand_layout.strides} overlaps in memory." - ) ordered_fft_out_shape, ostride, odistance = ( plan_traits.ordered_fft_in_embedding_shape, plan_traits.istride, @@ -650,11 +704,11 @@ def create_xt_plan_args( def fftw_plan_args(xt_plan_args, operand_ptr, result_ptr, fft_abstract_type, direction): """ - Create the arguments for fftw API based on the args created by create_xt_plan_args and pointers - to the input and the output tensors. - Note, that while the pointers to the data are required in planning, different pointers may be passed - to the same plan in subsequent execute call - (assuming dtype, memory layout, alignment, and inplace properties do not change). + Create the arguments for fftw API based on the args created by create_xt_plan_args and + pointers to the input and the output tensors. Note that while the pointers to the data + are required in planning, different pointers may be passed to the same plan in + subsequent execute calls (assuming dtype, memory layout, alignment, and inplace + properties do not change). """ ( rank, @@ -735,15 +789,14 @@ def setup_options(operand, options, execution): return _cross_setup_execution_and_options(options, execution) -def create_fft_key( - operand, *, axes=None, options=None, execution=None, inplace=None, prolog=None, epilog=None, plan_args=None -): +def create_fft_key(operand, *, axes=None, options=None, execution=None, inplace=None, prolog=None, epilog=None, plan_args=None): """ - This key is not designed to be serialized and used on a different machine. It is meant for runtime use only. - We use a specific inplace argument instead of taking it from options, because self.inplace != self.options.inplace - for CPU tensors for efficiency. + This key is not designed to be serialized and used on a different machine. It is meant + for runtime use only. We use a specific inplace argument instead of taking it from + options, because self.inplace != self.options.inplace for CPU tensors for efficiency. - It is the user's responsibility to augment this key with the stream in case they use stream-ordered memory pools. + It is the user's responsibility to augment this key with the stream in case they use + stream-ordered memory pools.
""" if plan_args is None: operand = tensor_wrapper.wrap_operand(operand) @@ -760,39 +813,40 @@ def create_fft_key( axes, execution, fft_abstract_type=fft_abstract_type, - last_axis_size=options.last_axis_size, + last_axis_parity=options.last_axis_parity, result_layout=options.result_layout, logger=None, ) - # Inplace is always True when execution space is different than the operand's memory space - # (as the operand needs to be copied once anyway) + # Inplace is always True when execution space is different than the operand's memory + # space (as the operand needs to be copied once anyway) if inplace is None: memory_space = "cpu" if operand.device_id is None else "cuda" execution_space = execution.name assert execution.name in ("cpu", "cuda") inplace = memory_space != execution_space or options.inplace + if inplace: + check_inplace_overlapping_layout(operand) + # Get the arguments to xt_make_plan_many. plan_args = create_xt_plan_args( plan_traits=plan_traits, fft_abstract_type=fft_abstract_type, operand_data_type=operand.dtype, - operand_layout=TensorLayout(shape=operand.shape, strides=operand.strides), inplace=inplace, ) # Prolog and epilog, if used. if prolog is not None or epilog is not None: - get_data = ( - lambda device_callable: None if device_callable is None else (device_callable.ltoir, device_callable.data) - ) + get_data = lambda device_callable: None if device_callable is None else (device_callable.ltoir, device_callable.data) callable_data = get_data(prolog), get_data(epilog) else: callable_data = None - # The key is based on plan arguments, callback data (a callable object of type DeviceCallback or None) and the - # execution options (in "normalized" form of ("cpu"/"cuda", *execution_options)). + # The key is based on plan arguments, callback data (a callable object of type + # DeviceCallback or None) and the execution options (in "normalized" form of + # ("cpu"/"cuda", *execution_options)). return plan_args, callable_data, data_cls_astuple(execution) @@ -890,30 +944,47 @@ class FFT: """ FFT(operand, *, axes=None, options=None, execution=None, stream=None) - Create a stateful object that encapsulates the specified FFT computations and required resources. - This object ensures the validity of resources during use and releases them when they are no longer needed to prevent misuse. + Create a stateful object that encapsulates the specified FFT computations and required + resources. This object ensures the validity of resources during use and releases them + when they are no longer needed to prevent misuse. - This object encompasses all functionalities of function-form APIs :func:`fft`, :func:`ifft`, :func:`rfft`, and :func:`irfft`, which are convenience wrappers around it. - The stateful object also allows for the amortization of preparatory costs when the same FFT operation is to be performed on multiple operands with the same problem specification (see :meth:`reset_operand` and :meth:`create_key` for more details). + This object encompasses all functionalities of function-form APIs :func:`fft`, + :func:`ifft`, :func:`rfft`, and :func:`irfft`, which are convenience wrappers around it. + The stateful object also allows for the amortization of preparatory costs when the same + FFT operation is to be performed on multiple operands with the same problem + specification (see :meth:`reset_operand` and :meth:`create_key` for more details). Using the stateful object typically involves the following steps: - 1. 
**Problem Specification**: Initialize the object with a defined operation and options. - 2. **Preparation**: Use :meth:`plan` to determine the best algorithmic implementation for this specific FFT operation. - 3. **Execution**: Perform the FFT computation with :meth:`execute`, which can be either forward or inverse FFT transformation. - 4. **Resource Management**: Ensure all resources are released either by explicitly calling :meth:`free` or by managing the stateful object within a context manager. + 1. **Problem Specification**: Initialize the object with a defined operation and + options. + 2. **Preparation**: Use :meth:`plan` to determine the best algorithmic implementation + for this specific FFT operation. + 3. **Execution**: Perform the FFT computation with :meth:`execute`, which can be either + forward or inverse FFT transformation. + 4. **Resource Management**: Ensure all resources are released either by explicitly + calling :meth:`free` or by managing the stateful object within a context manager. - Detailed information on each step described above can be obtained by passing in a :class:`logging.Logger` object - to :class:`FFTOptions` or by setting the appropriate options in the root logger object, which is used by default: + Detailed information on each step described above can be obtained by passing in a + :class:`logging.Logger` object to :class:`FFTOptions` or by setting the appropriate + options in the root logger object, which is used by default: >>> import logging - >>> logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)-8s %(message)s', datefmt='%m-%d %H:%M:%S') + >>> logging.basicConfig( + ... level=logging.INFO, + ... format="%(asctime)s %(levelname)-8s %(message)s", + ... datefmt="%m-%d %H:%M:%S", + ... ) Args: operand: {operand} + axes: {axes} + options: {options} + execution: {execution} + stream: {stream} See Also: @@ -929,7 +1000,8 @@ class FFT: >>> shape = 128, 128, 128 >>> a = cp.random.rand(*shape) + 1j * cp.random.rand(*shape) - We will define a 2-D C2C FFT operation along the first two dimensions, batched along the last dimension: + We will define a 2-D C2C FFT operation along the first two dimensions, batched along + the last dimension: >>> axes = 0, 1 @@ -937,24 +1009,30 @@ class FFT: >>> f = nvmath.fft.FFT(a, axes=axes) - Options can be provided above to control the behavior of the operation using the `options` argument (see :class:`FFTOptions`). - Similarly, the execution space (CUDA or CPU) and execution options can be passed using the `execution` argument (see :class:`ExecutionCUDA`, :class:`ExecutionCPU`). + Options can be provided above to control the behavior of the operation using the + `options` argument (see :class:`FFTOptions`). Similarly, the execution space (CUDA + or CPU) and execution options can be passed using the `execution` argument (see + :class:`ExecutionCUDA`, :class:`ExecutionCPU`). - Next, plan the FFT. Load and/or store callback functions can be provided to :meth:`plan` using the `prolog` and `epilog` option: + Next, plan the FFT. Load and/or store callback functions can be provided to + :meth:`plan` using the `prolog` and `epilog` option: >>> f.plan() - Now execute the FFT, and obtain the result `r1` as a CuPy ndarray. The transform will be performed on GPU, - because ``execution`` was not explicitly specified and ``a`` resides in GPU memory. + Now execute the FFT, and obtain the result `r1` as a CuPy ndarray. 
The transform + will be performed on GPU, because ``execution`` was not explicitly specified and + ``a`` resides in GPU memory. >>> r1 = f.execute() - Finally, free the FFT object's resources. To avoid this explicit call, it's recommended to use the FFT object as - a context manager as shown below, if possible. + Finally, free the FFT object's resources. To avoid this explicit call, it's + recommended to use the FFT object as a context manager as shown below, if possible. >>> f.free() - Note that all :class:`FFT` methods execute on the current stream by default. Alternatively, the `stream` argument can be used to run a method on a specified stream. + Note that all :class:`FFT` methods execute on the current stream by default. + Alternatively, the `stream` argument can be used to run a method on a specified + stream. Let's now look at the same problem with NumPy ndarrays on the CPU. @@ -964,32 +1042,37 @@ class FFT: >>> shape = 128, 128, 128 >>> a = np.random.rand(*shape) + 1j * np.random.rand(*shape) - Create an FFT object encapsulating the problem specification described earlier and use it as a context manager. + Create an FFT object encapsulating the problem specification described earlier and + use it as a context manager. >>> with nvmath.fft.FFT(a, axes=axes) as f: - ... f.plan() + ... f.plan() ... - ... # Execute the FFT to get the first result. - ... r1 = f.execute() + ... # Execute the FFT to get the first result. + ... r1 = f.execute() All the resources used by the object are released at the end of the block. - The operation was performed on the CPU because ``a`` resides in host memory. - With ``execution`` specified to 'cuda', the NumPy array would be temporarily copied to device memory - and transformed on the GPU: + The operation was performed on the CPU because ``a`` resides in host memory. With + ``execution`` specified to 'cuda', the NumPy array would be temporarily copied to + device memory and transformed on the GPU: >>> with nvmath.fft.FFT(a, axes=axes, execution="cuda") as f: - ... f.plan() + ... f.plan() ... - ... # Execute the FFT to get the first result. - ... r1 = f.execute() + ... # Execute the FFT to get the first result. + ... r1 = f.execute() - Further examples can be found in the `nvmath/examples/fft `_ directory. + Further examples can be found in the `nvmath/examples/fft + `_ directory. Notes: - - The input must be Hermitian-symmetric when :attr:`FFTOptions.fft_type` is ``'C2R'``, otherwise the result is undefined. As a specific example, if the input for a C2R FFT was generated using an R2C FFT with an odd last axis size, - then :attr:`FFTOptions.last_axis_size` must be set to `odd` to recover the original signal. + - The input must be Hermitian-symmetric when :attr:`FFTOptions.fft_type` is + ``'C2R'``, otherwise the result is undefined. As a specific example, if the input + for a C2R FFT was generated using an R2C FFT with an odd last axis size, then + :attr:`FFTOptions.last_axis_size` must be set to `odd` to recover the original + signal. """ def __init__(self, operand, *, axes=None, options=None, execution=None, stream=None): @@ -998,13 +1081,12 @@ def __init__(self, operand, *, axes=None, options=None, execution=None, stream=N self.options = options self.execution_options = execution - # Capture operand layout for consistency checks when resetting operands. 
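A short R2C/C2R round trip matching the note above (a sketch assuming CuPy; the last axis is even so no last-axis-size option is needed, and the transforms follow cuFFT's unnormalized convention):

import cupy as cp
import nvmath

x = cp.random.rand(8, 6)          # real operand with an even last axis
y = nvmath.fft.rfft(x)            # complex, last axis becomes 6 // 2 + 1 == 4
z = nvmath.fft.irfft(y)           # real again, shape (8, 6)
# z equals x up to the FFT normalization convention (the transforms are unnormalized).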
- self.operand_layout = TensorLayout(shape=operand.shape, strides=operand.strides) self.operand_dim = len(operand.shape) if not axes and self.operand_dim > 3: raise ValueError( - f"The tensor is {self.operand_dim}-D and FFTs in number of dimensions > 3 is not supported. The FFT axes need to be specified using the 'axes' option." + f"The tensor is {self.operand_dim}-D and FFTs in number of dimensions > 3 is not supported. The FFT " + "axes need to be specified using the 'axes' option." ) if self.operand_dim == 0: @@ -1013,9 +1095,7 @@ def __init__(self, operand, *, axes=None, options=None, execution=None, stream=N self.operand_data_type = operand.dtype self.fft_abstract_type = _get_default_fft_abstract_type(self.operand_data_type, options.fft_type) - self.result_data_type, self.compute_data_type = _get_fft_result_and_compute_types( - operand.dtype, self.fft_abstract_type - ) + self.result_data_type, self.compute_data_type = _get_fft_result_and_compute_types(operand.dtype, self.fft_abstract_type) self.logger = options.logger if options.logger is not None else logging.getLogger() self.logger.info(f"The FFT type is {self.fft_abstract_type}.") @@ -1076,7 +1156,8 @@ def __init__(self, operand, *, axes=None, options=None, execution=None, stream=N self.inplace = self.options.inplace if self.inplace and self.fft_abstract_type != "C2C": raise ValueError( - f"The in-place option (FFTOptions.inplace=True) is only supported for complex-to-complex FFT. The FFT type is '{self.fft_abstract_type}'." + f"The in-place option (FFTOptions.inplace=True) is only supported for complex-to-complex FFT. " + f"The FFT type is '{self.fft_abstract_type}'." ) # Copy the operand to execution_space's device if needed. @@ -1091,6 +1172,9 @@ def __init__(self, operand, *, axes=None, options=None, execution=None, stream=N self.logger, ) operand = self.operand + # Capture operand layout for consistency checks when resetting operands. + self.operand_layout = TensorLayout(shape=operand.shape, strides=operand.strides) + self._preallocated_result = None if self.options.inplace: # Don't use self.inplace here, because we always set it to True for CPU tensors. @@ -1106,22 +1190,21 @@ def __init__(self, operand, *, axes=None, options=None, execution=None, stream=N self.axes, self.execution_options, fft_abstract_type=self.fft_abstract_type, - last_axis_size=self.options.last_axis_size, + last_axis_parity=self.options.last_axis_parity, result_layout=self.options.result_layout, logger=self.logger, ) self.logger.info( - f"The operand data type = {self.operand_data_type}, shape = {self.operand_layout.shape}, and strides = {self.operand_layout.strides}." + f"The operand data type = {self.operand_data_type}, shape = {self.operand_layout.shape}, and " + f"strides = {self.operand_layout.strides}." ) result_data_type, result_shape, result_strides = ( (self.operand_data_type, self.operand_layout.shape, self.operand_layout.strides) if self.inplace else (self.result_data_type, self.plan_traits.result_shape, self.plan_traits.result_strides) ) - self.logger.info( - f"The result data type = {result_data_type}, shape = {result_shape}, and strides = {result_strides}." 
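The dimension check above implies that operands with more than three dimensions always need explicit axes; a minimal sketch, assuming CuPy:

import cupy as cp
import nvmath

x = cp.random.rand(2, 3, 64, 64) + 1j * cp.random.rand(2, 3, 64, 64)
# nvmath.fft.fft(x)                  # would raise: 4-D operand with no 'axes' specified
r = nvmath.fft.fft(x, axes=(2, 3))   # batched 2-D C2C FFT over the last two axes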
- ) + self.logger.info(f"The result data type = {result_data_type}, shape = {result_shape}, and strides = {result_strides}.") self.logger.info(f"The FFT batch size is {self.plan_traits.fft_batch_size}.") ordered_fft_out_shape, ostride, odistance = ( @@ -1130,11 +1213,11 @@ def __init__(self, operand, *, axes=None, options=None, execution=None, stream=N else (self.plan_traits.ordered_fft_out_shape, self.plan_traits.ostride, self.plan_traits.odistance) ) self.logger.debug( - f"The plan ordered axes = {self.plan_traits.ordered_axes}, ordered input shape = {self.plan_traits.ordered_fft_in_shape}, ordered input embedding shape = {self.plan_traits.ordered_fft_in_embedding_shape}, ordered output shape = {ordered_fft_out_shape}." - ) - self.logger.debug( - f"The plan input stride is {self.plan_traits.istride} with distance {self.plan_traits.idistance}." + f"The plan ordered axes = {self.plan_traits.ordered_axes}, ordered input shape = " + f"{self.plan_traits.ordered_fft_in_shape}, ordered input embedding shape = " + f"{self.plan_traits.ordered_fft_in_embedding_shape}, ordered output shape = {ordered_fft_out_shape}." ) + self.logger.debug(f"The plan input stride is {self.plan_traits.istride} with distance {self.plan_traits.idistance}.") self.logger.debug(f"The plan output stride is {ostride} with distance {odistance}.") # The result's package and device. @@ -1216,26 +1299,37 @@ def get_key(self, *, prolog=None, epilog=None): @staticmethod def create_key(operand, *, axes=None, options=None, execution=None, prolog=None, epilog=None): """ - Create a key as a compact representation of the FFT problem specification based on the given operand, axes and the FFT options. - Note that different combinations of operand layout, axes and options can potentially correspond to the same underlying problem specification (key). - Users may reuse the FFT objects when different input problems map to an identical key. + Create a key as a compact representation of the FFT problem specification based on + the given operand, axes and the FFT options. Note that different combinations of + operand layout, axes and options can potentially correspond to the same underlying + problem specification (key). Users may reuse the FFT objects when different input + problems map to an identical key. Args: operand: {operand} + axes: {axes} + options: {options} + execution: {execution} + prolog: {prolog} + epilog: {epilog} Returns: {fft_key} Notes: - - Users may take advantage of this method to create cached version of :func:`fft` based on the stateful object APIs - (see `caching.py `_ for an example implementation). - - This key is meant for runtime use only and not designed to be serialized or used on a different machine. - - It is the user's responsibility to augment this key with the stream in case they use stream-ordered memory pools. + - Users may take advantage of this method to create cached version of + :func:`fft` based on the stateful object APIs (see `caching.py + `_ + for an example implementation). + - This key is meant for runtime use only and not designed to be serialized or + used on a different machine. + - It is the user's responsibility to augment this key with the stream in case + they use stream-ordered memory pools. 
""" return create_fft_key(operand, axes=axes, options=options, execution=execution, prolog=prolog, epilog=epilog) @@ -1282,7 +1376,8 @@ def _allocate_result_operand(self, exec_stream_holder, log_debug): if log_debug: self.logger.debug("Beginning output (empty) tensor creation...") self.logger.debug( - f"The output tensor shape = {self.plan_traits.result_shape} with strides = {self.plan_traits.result_strides} and data type '{self.result_data_type}'." + f"The output tensor shape = {self.plan_traits.result_shape} with strides = " + f"{self.plan_traits.result_strides} and data type '{self.result_data_type}'." ) result = utils.create_empty_tensor( self.result_class, @@ -1290,7 +1385,8 @@ def _allocate_result_operand(self, exec_stream_holder, log_debug): self.result_data_type, self.device_id, exec_stream_holder, - self.plan_traits.result_strides, + verify_strides=False, # the strides are computed so that they are contiguous + strides=self.plan_traits.result_strides, ) if log_debug: self.logger.debug("The output (empty) tensor has been created.") @@ -1307,7 +1403,7 @@ def _get_validate_direction(self, direction): raise ValueError( f"The specified direction {direction.name} is not compatible with the FFT type '{self.fft_abstract_type}'." ) - elif self.fft_abstract_type == "R2C": + elif self.fft_abstract_type == "R2C": # noqa: SIM102 if direction != _configuration.FFTDirection.FORWARD: raise ValueError( f"The specified direction {direction.name} is not compatible with the FFT type '{self.fft_abstract_type}'." @@ -1321,10 +1417,14 @@ def plan(self, *, prolog=None, epilog=None, stream=None, direction=None): Args: prolog: {prolog} + epilog: {epilog} + stream: {stream} - direction: If specified, the same direction must be passed to subsequent :meth:`execute` calls. - It may be used as a hint to optimize C2C planning for CPU FFT calls. + + direction: If specified, the same direction must be passed to subsequent + :meth:`execute` calls. It may be used as a hint to optimize C2C planning for + CPU FFT calls. """ if self.fft_planned: @@ -1346,20 +1446,23 @@ def plan(self, *, prolog=None, epilog=None, stream=None, direction=None): prolog = utils.check_or_create_options(_configuration.DeviceCallable, prolog, "prolog", keep_none=True) epilog = utils.check_or_create_options(_configuration.DeviceCallable, epilog, "epilog", keep_none=True) _check_prolog_epilog_traits(prolog, epilog, self.plan_traits, self.operand, self.fft_abstract_type) - set_prolog_and_epilog( - self.handle, prolog, epilog, self.operand_data_type, self.result_data_type, self.logger - ) + set_prolog_and_epilog(self.handle, prolog, epilog, self.operand_data_type, self.result_data_type, self.logger) # Get all the arguments to xt_make_plan_many except for the first (the handle). + if self.inplace: + check_inplace_overlapping_layout(self.operand) + if self.operand_backup is not None: + check_inplace_overlapping_layout(self.operand_backup) + plan_args = create_xt_plan_args( plan_traits=self.plan_traits, fft_abstract_type=self.fft_abstract_type, operand_data_type=self.operand_data_type, - operand_layout=self.operand_layout, inplace=self.inplace, ) - # Keep track of original key (sans callback) for resetting operands. Pass in plan args to avoid recomputation. + # Keep track of original key (sans callback) for resetting operands. Pass in plan + # args to avoid recomputation. 
self.orig_key = create_fft_key( self.operand.tensor, axes=self.axes, @@ -1370,7 +1473,8 @@ def plan(self, *, prolog=None, epilog=None, stream=None, direction=None): self.logger.debug(f"The FFT key (sans callback) is {self.orig_key}.") self.logger.debug( - f"The operand CUDA type is {NAME_TO_DATA_TYPE[self.operand_data_type].name}, and the result CUDA type is {NAME_TO_DATA_TYPE[self.result_data_type].name}." + f"The operand CUDA type is {NAME_TO_DATA_TYPE[self.operand_data_type].name}, and the result CUDA type is " + f"{NAME_TO_DATA_TYPE[self.result_data_type].name}." ) self.logger.debug(f"The CUDA type used for compute is {NAME_TO_DATA_TYPE[self.compute_data_type].name}.") timing = bool(self.logger and self.logger.handlers) @@ -1382,13 +1486,13 @@ def plan(self, *, prolog=None, epilog=None, stream=None, direction=None): if self.inplace: result_ptr = self.operand.data_ptr else: - # FFTW3 API requires passing pointers to the input and output during planning. - # Passing different pointers to (properly strided and aligned) data in - # subsequent execute calls is supported, but it is not clear what planning - # is allowed to do with the provided pointers. - # For one, planning can compare the two pointers for equality to decide if - # it is inplace or out-of-place operation. - # To avoid subtle issues, just preallocate the result tensor earlier. + # FFTW3 API requires passing pointers to the input and output during + # planning. Passing different pointers to (properly strided and aligned) + # data in subsequent execute calls is supported, but it is not clear what + # planning is allowed to do with the provided pointers. For one, planning + # can compare the two pointers for equality to decide if it is inplace or + # out-of-place operation. To avoid subtle issues, just preallocate the + # result tensor earlier. self._preallocated_result = self._allocate_result_operand(None, True) result_ptr = self._preallocated_result.data_ptr precision, *plan_args = fftw_plan_args( @@ -1425,17 +1529,23 @@ def plan(self, *, prolog=None, epilog=None, stream=None, direction=None): @utils.precondition(_check_valid_fft) def reset_operand(self, operand=None, *, stream=None): """ - Reset the operand held by this :class:`FFT` instance. This method has two use cases: (1) it can be used to provide a new operand for execution, - and (2) it can be used to release the internal reference to the previous operand and potentially make its memory available for - other use by passing ``operand=None``. + Reset the operand held by this :class:`FFT` instance. This method has two use cases: + (1) it can be used to provide a new operand for execution + (2) it can be used to release the internal reference to the previous operand and + potentially make its memory available for other use by passing + ``operand=None``. Args: - operand: A tensor (ndarray-like object) compatible with the previous one or `None` (default). - A value of `None` will release the internal reference to the previous operand and user is expected to set a new operand before again calling :meth:`execute`. - The new operand is considered compatible if all the following properties match with the previous one: - - - The problem specification key for the new operand. Generally the keys will match if the operand shares the same layout (shape, strides and data type). - The keys may still match for certain operands with different layout, see :meth:`create_key` for details. + operand: A tensor (ndarray-like object) compatible with the previous one or + `None` (default). 
A value of `None` will release the internal reference to + the previous operand and user is expected to set a new operand before again + calling :meth:`execute`. The new operand is considered compatible if all the + following properties match with the previous one: + + - The problem specification key for the new operand. Generally the keys + will match if the operand shares the same layout (shape, strides and + data type). The keys may still match for certain operands with + different layout, see :meth:`create_key` for details. - The package that the new operand belongs to. - The memory space of the new operand (CPU or GPU). - The device that new operand belongs to if it is on GPU. @@ -1456,28 +1566,35 @@ def reset_operand(self, operand=None, *, stream=None): >>> axes = 0, 1 >>> with nvmath.fft.FFT(a, axes=axes) as f: - ... # Plan the FFT - ... f.plan() + ... # Plan the FFT + ... f.plan() ... - ... # Execute the FFT to get the first result. - ... r1 = f.execute() + ... # Execute the FFT to get the first result. + ... r1 = f.execute() ... - ... # Reset the operand to a new CuPy ndarray. - ... b = cp.random.rand(*shape) + 1j * cp.random.rand(*shape) - ... f.reset_operand(b) + ... # Reset the operand to a new CuPy ndarray. + ... b = cp.random.rand(*shape) + 1j * cp.random.rand(*shape) + ... f.reset_operand(b) ... - ... # Execute to get the new result corresponding to the updated operand. - ... r2 = f.execute() + ... # Execute to get the new result corresponding to the updated operand. + ... r2 = f.execute() - With :meth:`reset_operand`, minimal overhead is achieved as problem specification and planning are only performed once. + With :meth:`reset_operand`, minimal overhead is achieved as problem + specification and planning are only performed once. - For the particular example above, explicitly calling :meth:`reset_operand` is equivalent to updating the operand in-place, i.e, replacing ``f.reset_operand(b)`` with ``a[:]=b``. - Note that updating the operand in-place should be adopted with caution as it can only yield the expected result and incur no additional copies under the additional constraints below: + For the particular example above, explicitly calling :meth:`reset_operand` is + equivalent to updating the operand in-place, i.e, replacing + ``f.reset_operand(b)`` with ``a[:]=b``. Note that updating the operand in-place + should be adopted with caution as it can only yield the expected result and + incur no additional copies under the additional constraints below: - The operation is not a complex-to-real (C2R) FFT. - - The operand's memory matches the FFT execution space. More precisely, the operand memory space should be accessible from the execution space (CPU or CUDA). + - The operand's memory matches the FFT execution space. More precisely, the + operand memory space should be accessible from the execution space (CPU or + CUDA). - For more details, please refer to `inplace update example `_. + For more details, please refer to `inplace update example + `_. """ if operand is None: @@ -1500,7 +1617,8 @@ def reset_operand(self, operand=None, *, stream=None): exec_stream_holder, operand_stream_holder = self._get_or_create_stream_maybe(stream) self.logger.info( - f"The specified stream for reset_operand() is {(exec_stream_holder or operand_stream_holder) and (exec_stream_holder or operand_stream_holder).obj}." + "The specified stream for reset_operand() is " + f"{(exec_stream_holder or operand_stream_holder) and (exec_stream_holder or operand_stream_holder).obj}." 
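Continuing the reset_operand example above, the two alternatives described in this docstring look as follows (sketch only; the in-place update is subject to the constraints listed):

# Alternative to f.reset_operand(b): update the original operand in place (C2C only,
# operand memory accessible from the execution space).
a[:] = b
r2 = f.execute()

# Or release the internal reference between uses and provide a new operand later.
f.reset_operand(None)
f.reset_operand(b)
r2 = f.execute()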
) # In principle, we could support memory_space change, @@ -1535,12 +1653,15 @@ def device_str(device_id): self.logger.debug(f"The FFT key corresponding to the original operand is: {self.orig_key}.") if new_key is None: self.logger.debug( - f"The FFT key for the new operand cannot be computed since the layout (shape = {operand.shape}, strides = {operand.strides}) and axes = {self.axes} combination is unsupported." + "The FFT key for the new operand cannot be computed since the layout " + f"(shape = {operand.shape}, strides = {operand.strides}) and axes = {self.axes} combination " + "is unsupported." ) else: self.logger.debug(f"The FFT key corresponding to the new operand is: {new_key}.") raise ValueError( - "The new operand's traits (data type, shape, or strides) are incompatible with that of the original operand." + "The new operand's traits (data type, shape, or strides) are incompatible with that of the " + "original operand." ) if self.execution_space == "cuda": @@ -1561,9 +1682,7 @@ def device_str(device_id): # Update operand layout and plan traits. self.operand_layout = TensorLayout(shape=operand.shape, strides=operand.strides) - self.logger.info( - f"The reset operand shape = {self.operand_layout.shape}, and strides = {self.operand_layout.strides}." - ) + self.logger.info(f"The reset operand shape = {self.operand_layout.shape}, and strides = {self.operand_layout.strides}.") self.plan_traits = get_fft_plan_traits( operand.shape, @@ -1572,7 +1691,7 @@ def device_str(device_id): self.axes, self.execution_options, fft_abstract_type=self.fft_abstract_type, - last_axis_size=self.options.last_axis_size, + last_axis_parity=self.options.last_axis_parity, result_layout=self.options.result_layout, logger=self.logger, ) @@ -1618,7 +1737,8 @@ def _check_valid_operand(self, *args, **kwargs): what = kwargs["what"] if self.operand is None: raise RuntimeError( - f"{what} cannot be performed if the input operand has been set to None. Use reset_operand() to set the desired input before using performing the {what.lower()}." + f"{what} cannot be performed if the input operand has been set to None. Use reset_operand() to set the " + f"desired input before using performing the {what.lower()}." ) def _free_workspace_memory(self, exception: Exception | None = None) -> bool: @@ -1659,12 +1779,14 @@ def _allocate_workspace_memory(self, stream_holder): self.workspace_stream = stream_holder.obj self.logger.debug( - f"Finished allocating device workspace of size {formatters.MemoryStr(self.workspace_size)} in the context of stream {self.workspace_stream}." + f"Finished allocating device workspace of size {formatters.MemoryStr(self.workspace_size)} in the context " + f"of stream {self.workspace_stream}." ) def _allocate_workspace_memory_perhaps(self, stream_holder): """ - Allocate workspace memory using the specified allocator, if it hasn't already been done. + Allocate workspace memory using the specified allocator, if it hasn't already been + done. """ if self.execution_space != "cuda" or self.workspace_ptr is not None: return @@ -1679,7 +1801,8 @@ def _free_workspace_memory_perhaps(self, release_workspace): if not release_workspace: return - # Establish ordering wrt the computation and free workspace if it's more than the specified cache limit. + # Establish ordering wrt the computation and free workspace if it's more than the + # specified cache limit. 
if self.last_compute_event is not None: self.workspace_stream.wait_event(self.last_compute_event) self.logger.debug("Established ordering with respect to the computation before releasing the workspace.") @@ -1691,11 +1814,13 @@ def _free_workspace_memory_perhaps(self, release_workspace): def _release_workspace_memory_perhaps(self, exception: Exception | None = None) -> bool: """ - Free workspace memory if it was allocated in this call (self._workspace_allocated_here == True) when an exception occurs. + Free workspace memory if it was allocated in this call + (self._workspace_allocated_here == True) when an exception occurs. """ release_workspace = self._workspace_allocated_here self.logger.debug( - f"[_release_workspace_memory_perhaps] The release_workspace flag is set to {release_workspace} based upon the value of 'workspace_allocated_here'." + f"[_release_workspace_memory_perhaps] The release_workspace flag is set to {release_workspace} based upon " + "the value of 'workspace_allocated_here'." ) self._free_workspace_memory_perhaps(release_workspace) return True @@ -1710,12 +1835,15 @@ def execute(self, direction=None, stream=None, release_workspace=False): Args: direction: {direction} + stream: {stream} + release_workspace: {release_workspace} Returns: - The transformed operand, which remains on the same device and utilizes the same package as the input operand. - The data type and shape of the transformed operand depend on the type of input operand: + The transformed operand, which remains on the same device and utilizes the same + package as the input operand. The data type and shape of the transformed operand + depend on the type of input operand: - For C2C FFT, the data type and shape remain identical to the input. - For R2C and C2R FFT, both data type and shape differ from the input. @@ -1769,11 +1897,13 @@ def execute(self, direction=None, stream=None, release_workspace=False): if log_info and elapsed.data is not None: self.logger.info(f"The FFT calculation took {elapsed.data:.3f} ms to complete.") - # Establish ordering wrt the computation and free workspace if it's more than the specified cache limit. + # Establish ordering wrt the computation and free workspace if it's more than the + # specified cache limit. self._free_workspace_memory_perhaps(release_workspace) - # reset workspace allocation tracking to False at the end of the methods where workspace memory is potentially allocated. - # This is necessary to prevent any exceptions raised before method entry from using stale tracking values. + # reset workspace allocation tracking to False at the end of the methods where + # workspace memory is potentially allocated. This is necessary to prevent any + # exceptions raised before method entry from using stale tracking values. self._workspace_allocated_here = False # Return the result. @@ -1795,16 +1925,17 @@ def execute(self, direction=None, stream=None, release_workspace=False): def free(self): """Free FFT resources. - It is recommended that the :class:`FFT` object be used within a context, but if it is not possible then this - method must be called explicitly to ensure that the FFT resources (especially internal library objects) are - properly cleaned up. + It is recommended that the :class:`FFT` object be used within a context, but if it + is not possible then this method must be called explicitly to ensure that the FFT + resources (especially internal library objects) are properly cleaned up. 
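A brief sketch of the execute() options documented above, assuming a CuPy operand and that string directions are accepted as shown elsewhere in these docstrings:

import cupy as cp
import nvmath

a = cp.random.rand(64, 64, 8) + 1j * cp.random.rand(64, 64, 8)
with nvmath.fft.FFT(a, axes=(0, 1)) as f:
    f.plan()
    r1 = f.execute()                    # default direction (forward for a C2C transform)
    r2 = f.execute(direction="inverse", release_workspace=True)  # free workspace if the object will sit idle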
""" if not self.valid_state: return try: - # Future operations on the workspace stream should be ordered after the computation. + # Future operations on the workspace stream should be ordered after the + # computation. if self.last_compute_event is not None: self.workspace_stream.wait_event(self.last_compute_event) @@ -1840,20 +1971,27 @@ def _fft( check_dtype=None, ): r""" - fft(operand, axes=None, direction=None, options=None, execution=None, prolog=None, epilog=None, stream=None) + fft(operand, axes=None, direction=None, options=None, execution=None, prolog=None, + epilog=None, stream=None) Perform an N-D *complex-to-complex* (C2C) FFT on the provided complex operand. Args: operand: {operand} + axes: {axes} + options: {options} + prolog: {prolog} + epilog: {epilog} + stream: {stream} Returns: - A transformed operand that retains the same data type and shape as the input. It remains on the same device and uses the same package as the input operand. + A transformed operand that retains the same data type and shape as the input. It + remains on the same device and uses the same package as the input operand. See Also: :func:`ifft`, :func:`irfft`, :func:`rfft`, :class:`FFT` @@ -1866,51 +2004,63 @@ def _fft( Create a 3-D complex128 ndarray on the GPU: >>> shape = 256, 256, 256 - >>> a = cp.random.rand(*shape, dtype=cp.float64) + 1j * cp.random.rand(*shape, dtype=cp.float64) + >>> a = cp.random.rand(*shape, dtype=cp.float64) + 1j * cp.random.rand( + ... *shape, dtype=cp.float64 + ... ) - Perform a 3-D C2C FFT using :func:`fft`. The result `r` is also a CuPy complex128 ndarray: + Perform a 3-D C2C FFT using :func:`fft`. The result `r` is also a CuPy complex128 + ndarray: >>> r = nvmath.fft.fft(a) - User may also perform FFT along a subset of dimensions, e.g, 2-D C2C FFT along the first two dimensions, batched along the last dimension: + User may also perform FFT along a subset of dimensions, e.g, 2-D C2C FFT along the + first two dimensions, batched along the last dimension: >>> axes = 0, 1 >>> r = nvmath.fft.fft(a, axes=axes) - For C2C type FFT operation, the output can be directly computed inplace thus overwriting the input operand. This can be specified using options to the FFT: + For C2C type FFT operation, the output can be directly computed inplace thus + overwriting the input operand. This can be specified using options to the FFT: >>> o = nvmath.fft.FFTOptions(inplace=True) >>> r = nvmath.fft.fft(a, options=o) >>> r is a + True See :class:`FFTOptions` for the complete list of available options. - The package current stream is used by default, but a stream can be explicitly provided to the FFT operation. This can be done if the - FFT operand is computed on a different stream, for example: + The package current stream is used by default, but a stream can be explicitly + provided to the FFT operation. This can be done if the FFT operand is computed on a + different stream, for example: >>> s = cp.cuda.Stream() >>> with s: - ... a = cp.random.rand(*shape) + 1j * cp.random.rand(*shape) - >>> nvmath.fft.fft(a, stream=s) + ... a = cp.random.rand(*shape) + 1j * cp.random.rand(*shape) + >>> r = nvmath.fft.fft(a, stream=s) - The operation above runs on stream `s` and is ordered with respect to the input computation. + The operation above runs on stream `s` and is ordered with respect to the input + computation. Create a NumPy ndarray on the CPU. 
>>> import numpy as np >>> b = np.random.rand(*shape) + 1j * np.random.rand(*shape) - Provide the NumPy ndarray to :func:`fft`, with the result also being a NumPy ndarray: + Provide the NumPy ndarray to :func:`fft`, with the result also being a NumPy + ndarray: >>> r = nvmath.fft.fft(b) Notes: - - This function only takes complex operand for C2C transformation. If the user wishes to perform full FFT transformation on real input, - please cast the input to the corresponding complex data type. - - This function is a convenience wrapper around :class:`FFT` and and is specifically meant for *single* use. - The same computation can be performed with the stateful API using the default `direction` argument in :meth:`FFT.execute`. - - Further examples can be found in the `nvmath/examples/fft `_ directory. + - This function only takes complex operand for C2C transformation. If the user + wishes to perform full FFT transformation on real input, please cast the input to + the corresponding complex data type. + - This function is a convenience wrapper around :class:`FFT` and and is specifically + meant for *single* use. The same computation can be performed with the stateful + API using the default `direction` argument in :meth:`FFT.execute`. + + Further examples can be found in the `nvmath/examples/fft + `_ directory. """ if check_dtype is not None: assert check_dtype in {"real", "complex"}, "internal error" @@ -1929,9 +2079,7 @@ def _fft( # Forward C2C FFT Function. -fft = functools.wraps(_fft)( - functools.partial(_fft, direction=_configuration.FFTDirection.FORWARD, check_dtype="complex") -) +fft = functools.wraps(_fft)(functools.partial(_fft, direction=_configuration.FFTDirection.FORWARD, check_dtype="complex")) fft.__doc__ = fft.__doc__.format(**SHARED_FFT_DOCUMENTATION) # type: ignore fft.__name__ = "fft" @@ -1940,21 +2088,28 @@ def _fft( @utils.docstring_decorator(SHARED_FFT_DOCUMENTATION, skip_missing=False) def rfft(operand, *, axes=None, options=None, execution=None, prolog=None, epilog=None, stream=None): r""" - rfft(operand, axes=None, options=None, execution=None, prolog=None, epilog=None, stream=None) + rfft(operand, axes=None, options=None, execution=None, prolog=None, epilog=None, + stream=None) Perform an N-D *real-to-complex* (R2C) FFT on the provided real operand. Args: operand: {operand} + axes: {axes} + options: {options} + prolog: {prolog} + epilog: {epilog} + stream: {stream} Returns: - A complex tensor that remains on the same device and belongs to the same package as the input operand. The extent of the last - transformed axis in the result will be ``operand.shape[axes[-1]] // 2 + 1``. + A complex tensor that remains on the same device and belongs to the same package as + the input operand. The extent of the last transformed axis in the result will be + ``operand.shape[axes[-1]] // 2 + 1``. See Also: @@ -1963,9 +2118,7 @@ def rfft(operand, *, axes=None, options=None, execution=None, prolog=None, epilo wrapped_operand = tensor_wrapper.wrap_operand(operand) # check if input operand if real type if "complex" in wrapped_operand.dtype: - raise RuntimeError( - f"rfft expects a real input, but got {wrapped_operand.dtype}. Please use fft for complex input." - ) + raise RuntimeError(f"rfft expects a real input, but got {wrapped_operand.dtype}. Please use fft for complex input.") return _fft( operand, @@ -1980,33 +2133,42 @@ def rfft(operand, *, axes=None, options=None, execution=None, prolog=None, epilo # Inverse C2C/R2C FFT Function. 
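To make the R2C extent rule and the cast-to-complex note above concrete (sketch, NumPy input as in the surrounding examples):

import numpy as np
import nvmath

x = np.random.rand(8, 10)
y = nvmath.fft.rfft(x)                        # R2C: the last transformed axis becomes 10 // 2 + 1 == 6
c = nvmath.fft.fft(x.astype(np.complex128))   # full C2C on real data: cast explicitly first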
-ifft = functools.wraps(_fft)( - functools.partial(_fft, direction=_configuration.FFTDirection.INVERSE, check_dtype="complex") -) +ifft = functools.wraps(_fft)(functools.partial(_fft, direction=_configuration.FFTDirection.INVERSE, check_dtype="complex")) ifft.__doc__ = """ - ifft(operand, axes=None, options=None, execution=None, prolog=None, epilog=None, stream=None) + ifft(operand, axes=None, options=None, execution=None, prolog=None, epilog=None, + stream=None) - Perform an N-D *complex-to-complex* (C2C) inverse FFT on the provided complex operand. The direction is implicitly inverse. + Perform an N-D *complex-to-complex* (C2C) inverse FFT on the provided complex operand. + The direction is implicitly inverse. Args: operand: {operand} + axes: {axes} + options: {options} + prolog: {prolog} + epilog: {epilog} + stream: {stream} Returns: - A transformed operand that retains the same data type and shape as the input. It remains on the same device and uses the same package as the input operand. + A transformed operand that retains the same data type and shape as the input. It + remains on the same device and uses the same package as the input operand. See Also: :func:`fft`, :func:`irfft`, :class:`FFT`. Notes: - - This function only takes complex operand for C2C transformation. If users wishes to perform full FFT transformation on real input, - please cast the input to the corresponding complex data type. - - This function is a convenience wrapper around :class:`FFT` and and is specifically meant for *single* use. - The same computation can be performed with the stateful API by passing the argument ``direction='inverse'`` when calling :meth:`FFT.execute`. + - This function only takes a complex operand for C2C transformation. If the user wishes + to perform full FFT transformation on real input, please cast the input to the + corresponding complex data type. + - This function is a convenience wrapper around :class:`FFT` and is specifically + meant for *single* use. The same computation can be performed with the stateful + API by passing the argument ``direction='inverse'`` when calling + :meth:`FFT.execute`. """.format(**SHARED_FFT_DOCUMENTATION) ifft.__name__ = "ifft" @@ -2015,22 +2177,31 @@ def rfft(operand, *, axes=None, options=None, execution=None, prolog=None, epilo @utils.docstring_decorator(SHARED_FFT_DOCUMENTATION, skip_missing=False) def irfft(x, *, axes=None, options=None, execution=None, prolog=None, epilog=None, stream=None): """ - irfft(operand, axes=None, options=None, execution=None, prolog=None, epilog=None, stream=None) + irfft(operand, axes=None, options=None, execution=None, prolog=None, epilog=None, + stream=None) - Perform an N-D *complex-to-real* (C2R) FFT on the provided complex operand. The direction is implicitly inverse. + Perform an N-D *complex-to-real* (C2R) FFT on the provided complex operand. The + direction is implicitly inverse. Args: operand: {operand} + axes: {axes} + options: {options} + prolog: {prolog} + epilog: {epilog} + stream: {stream} Returns: - A real tensor that remains on the same device and belongs to the same package as the input operand. The extent of the last - transformed axis in the result will be ``(operand.shape[axes[-1]] - 1) * 2`` if :attr:`FFTOptions.last_axis_size` is ``even``, or - ``operand.shape[axes[-1]] * 2 - 1`` if :attr:`FFTOptions.last_axis_size` is ``odd``. + A real tensor that remains on the same device and belongs to the same package as the + input operand.
The extent of the last transformed axis in the result will be + ``(operand.shape[axes[-1]] - 1) * 2`` if :attr:`FFTOptions.last_axis_size` is + ``even``, or ``operand.shape[axes[-1]] * 2 - 1`` if + :attr:`FFTOptions.last_axis_size` is ``odd``. See Also: :func:`fft`, :func:`ifft`, :class:`FFT`. @@ -2045,22 +2216,35 @@ def irfft(x, *, axes=None, options=None, execution=None, prolog=None, epilog=Non >>> shape = 512, 768, 256 >>> a = nvmath.fft.rfft(cp.random.rand(*shape, dtype=cp.float64)) - Perform a 3-D C2R FFT using the :func:`irfft` wrapper. The result `r` is a CuPy float64 ndarray: + Perform a 3-D C2R FFT using the :func:`irfft` wrapper. The result `r` is a CuPy + float64 ndarray: >>> r = nvmath.fft.irfft(a) >>> r.dtype + dtype('float64') Notes: - - This function performs an inverse C2R N-D FFT, which is similar to `irfftn` but different from `irfft` in various numerical packages. - - This function is a convenience wrapper around :class:`FFT` and and is specifically meant for *single* use. - The same computation can be performed with the stateful API by setting :attr:`FFTOptions.fft_type` to ``'C2R'`` and passing the argument ``direction='inverse'`` when calling :meth:`FFT.execute`. - - **The input to this function must be Hermitian-symmetric, otherwise the result is undefined.** While the symmetry requirement is partially captured by the different extents in the last transformed - dimension between the input and result, there are additional `constraints `_. As - a specific example, 1-D transforms require the first element (and the last element, if the extent is even) of the input to be purely real-valued. - In addition, if the input to `irfft` was generated using an R2C FFT with an odd last axis size, :attr:`FFTOptions.last_axis_size` must be set to ``odd`` to recover the original signal. - - For more details, please refer to `C2R example `_ - and `odd C2R example `_. + - This function performs an inverse C2R N-D FFT, which is similar to `irfftn` but + different from `irfft` in various numerical packages. + - This function is a convenience wrapper around :class:`FFT` and and is specifically + meant for *single* use. The same computation can be performed with the stateful + API by setting :attr:`FFTOptions.fft_type` to ``'C2R'`` and passing the argument + ``direction='inverse'`` when calling :meth:`FFT.execute`. + - **The input to this function must be Hermitian-symmetric, otherwise the result is + undefined.** While the symmetry requirement is partially captured by the different + extents in the last transformed dimension between the input and result, there are + additional `constraints + `_. As a specific + example, 1-D transforms require the first element (and the last element, if the + extent is even) of the input to be purely real-valued. In addition, if the input + to `irfft` was generated using an R2C FFT with an odd last axis size, + :attr:`FFTOptions.last_axis_size` must be set to ``odd`` to recover the original + signal. + - For more details, please refer to `C2R example + `_ + and `odd C2R example + `_. 
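Worked numbers for the extent formulas above: a complex extent of 5 along the last transformed axis is consistent with two different real extents, which is why the parity must be supplied to recover an odd-length signal.

n = 5                      # complex extent along the last transformed axis
assert (n - 1) * 2 == 8    # real extent assumed by the default ('even') setting
assert n * 2 - 1 == 9      # real extent recovered when the option is set to 'odd'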
""" options = utils.check_or_create_options(_configuration.FFTOptions, options, "FFT options") options.fft_type = "C2R" diff --git a/nvmath/linalg/_internal/enum_to_tuples.py b/nvmath/linalg/_internal/enum_to_tuples.py index 619e174..7ec8360 100644 --- a/nvmath/linalg/_internal/enum_to_tuples.py +++ b/nvmath/linalg/_internal/enum_to_tuples.py @@ -33,7 +33,8 @@ def integer_or_string(value): def create_valid_tuples_from_enum(enum, prefix, *, expr=r"(?:(\d+)x(\d+|\w+)(?:x(\d+))?|(AUTO|UNDEFINED))"): """ - Create a sequence of tuples representing the allowed combinations for the given enumeration. + Create a sequence of tuples representing the allowed combinations for the given + enumeration. """ combinations = list() @@ -56,12 +57,8 @@ def create_valid_tuples_from_enum(enum, prefix, *, expr=r"(?:(\d+)x(\d+|\w+)(?:x return tuple(combinations), value_to_enumerator, enumerator_to_value -CLUSTER_SHAPES, CLUSTER_SHAPE_TO_ENUM, ENUM_TO_CLUSTER_SHAPE = create_valid_tuples_from_enum( - cublaslt.ClusterShape, "SHAPE_" -) +CLUSTER_SHAPES, CLUSTER_SHAPE_TO_ENUM, ENUM_TO_CLUSTER_SHAPE = create_valid_tuples_from_enum(cublaslt.ClusterShape, "SHAPE_") -MATMUL_STAGES, MATMUL_STAGE_TO_ENUM, ENUM_TO_MATMUL_STAGE = create_valid_tuples_from_enum( - cublaslt.MatmulStages, "STAGES_" -) +MATMUL_STAGES, MATMUL_STAGE_TO_ENUM, ENUM_TO_MATMUL_STAGE = create_valid_tuples_from_enum(cublaslt.MatmulStages, "STAGES_") MATMUL_TILES, MATMUL_TILE_TO_ENUM, ENUM_TO_MATMUL_TILE = create_valid_tuples_from_enum(cublaslt.MatmulTile, "TILE_") diff --git a/nvmath/linalg/_internal/epilog_protocol.py b/nvmath/linalg/_internal/epilog_protocol.py index 173f37a..3e284f6 100644 --- a/nvmath/linalg/_internal/epilog_protocol.py +++ b/nvmath/linalg/_internal/epilog_protocol.py @@ -31,7 +31,8 @@ @runtime_checkable class EpilogInputHandler(Protocol): """ - Protocol for epilog handler input validation and setting the appropriate MM descriptor attributes. + Protocol for epilog handler input validation and setting the appropriate MM descriptor + attributes. """ @property @@ -46,7 +47,8 @@ def name(self): @abstractmethod def order(self): """ - The result order that is needed by this epilog (cublaslt.Order or None, if no restriction on order). + The result order that is needed by this epilog (cublaslt.Order or None, if no + restriction on order). """ raise NotImplementedError @@ -67,8 +69,8 @@ def update(self, mm_desc_ifc, epilog_input): Update the provided epilog input. Args: - mm_desc_ifc: The MM descriptor to update, provided as a MatmulDescInterface object. - epilog_input: The epilog input to validate. + mm_desc_ifc: The MM descriptor to update, provided as a MatmulDescInterface + object. epilog_input: The epilog input to validate. """ raise NotImplementedError @@ -77,14 +79,16 @@ def update(self, mm_desc_ifc, epilog_input): @runtime_checkable class EpilogOutputHandler(Protocol): """ - Protocol for epilog handler output validation and setting the appropriate MM descriptor attributes. + Protocol for epilog handler output validation and setting the appropriate MM descriptor + attributes. """ @property @abstractmethod def name(self): """ - The name of the epilog output that is handled (relu_aux, gelu_aux, bgrad, bragda, bgradb, ...). + The name of the epilog output that is handled (relu_aux, gelu_aux, bgrad, bragda, + bgradb, ...). """ raise NotImplementedError @@ -92,7 +96,8 @@ def name(self): @abstractmethod def order(self): """ - The result order that is needed by this epilog (cublaslt.Order or None, if no restriction on order). 
+ The result order that is needed by this epilog (cublaslt.Order or None, if no + restriction on order). """ raise NotImplementedError @@ -109,7 +114,8 @@ def update(self, mm_desc_ifc): Update all the attributes for this epilog, except the pointer. Args: - mm_desc_ifc: The MM descriptor to update, provided as a MatmulDescInterface object. + mm_desc_ifc: The MM descriptor to update, provided as a MatmulDescInterface + object. """ raise NotImplementedError @@ -119,7 +125,9 @@ def update_ptr(self, mm_desc_ifc, ptr): Set the pointer for this epilog. Args: - mm_desc_ifc: The MM descriptor to update, provided as a MatmulDescInterface object. + mm_desc_ifc: The MM descriptor to update, provided as a MatmulDescInterface + object. + ptr: The pointer to set. """ @@ -157,9 +165,8 @@ def order(self): return cublaslt.Order.COL def validate(self, bias_tensor): - batch_rank = len(self.mm_traits.batch_shape) - - # The bias_tensor must be of rank 1, or rank 2 with (M, 1) or batched versions of the latter (..., M, 1) consistent with the C operand. + # The bias_tensor must be of rank 1, or rank 2 with (M, 1) or batched versions of + # the latter (..., M, 1) consistent with the C operand. bias_shape = list(bias_tensor.shape) bias_strides = list(bias_tensor.strides) @@ -182,31 +189,39 @@ def validate(self, bias_tensor): if Nb != 1: raise ValueError(f"The N dimension of the bias vector ({Nb}) must be equal to 1.") - # Check if the bias_tensor batch shape and axis order match that of the MM, and it's tileable. + # Check if the bias_tensor batch shape and axis order match that of the MM, and it's + # tileable. if len(bias_batch_shape) > 0 and bias_batch_shape != mm_traits.batch_shape: raise ValueError( - f"The batch dimensions of the bias {bias_batch_shape} must match with that of the matrix multiplication definition {mm_traits.batch_shape}." + f"The batch dimensions of the bias {bias_batch_shape} must match with that of the matrix multiplication " + f"definition {mm_traits.batch_shape}." ) if len(bias_batch_shape) > 0: if self.version < 11703: raise ValueError( - f"Batch dimensions are not supported for the bias tensor in cuBLASLt version < 11703 (you have version {self.version})." + f"Batch dimensions are not supported for the bias tensor in cuBLASLt version < 11703 " + f"(you have version {self.version})." ) bias_batch_axis_order = axis_order_in_memory(bias_batch_strides) if bias_batch_axis_order != mm_traits.batch_axis_order: raise ValueError( - f"The batch axis order of the bias {bias_batch_axis_order} must match with that of the other operands {mm_traits.batch_axis_order}." + f"The batch axis order of the bias {bias_batch_axis_order} must match with that " + f"of the other operands {mm_traits.batch_axis_order}." ) if not check_batch_tileable(bias_batch_shape, bias_batch_strides): - message = f"The batch layout for bias corresponding to shape = {bias_batch_shape} and strides = {bias_batch_strides} is currently not supported because it is not tileable." + message = ( + f"The batch layout for bias corresponding to shape = {bias_batch_shape} " + f"and strides = {bias_batch_strides} is currently not supported because it is not tileable." + ) raise ValueError(message) if bias_mm_strides[0] != 1: raise ValueError( - f"The stride of the bias {bias_strides} must be 1 along the dimension {len(bias_strides) - 2}, which corresponds to the M dimension." + f"The stride of the bias {bias_strides} must be 1 along the dimension {len(bias_strides) - 2}, " + f"which corresponds to the M dimension." 
) self.batch_offset = min(bias_batch_strides) if bias_batch_strides else 0 # bias broadcast @@ -220,7 +235,8 @@ def update(self, mm_desc_ifc, bias_tensor): if bias_tensor.dtype != self.d_dtype_name: if self.version < 111103: raise ValueError( - f"The bias tensor dtype '{bias_tensor.dtype}' must be the same as the result dtype '{self.d_dtype_name}' in cuBLASLt version < 111103 (you have {self.version})." + f"The bias tensor dtype '{bias_tensor.dtype}' must be the same as the result dtype " + f"'{self.d_dtype_name}' in cuBLASLt version < 111103 (you have {self.version})." ) mm_desc_ifc.bias_data_type = typemaps.NAME_TO_DATA_TYPE[bias_tensor.dtype] @@ -231,7 +247,8 @@ def round_up(m, base): def relu_aux_mm_shape(m, n): """ - Return the RELU auxiliary bitmask matrix shape when stored as uint8 and M is padded to 128-bit/16-byte multiples. + Return the RELU auxiliary bitmask matrix shape when stored as uint8 and M is padded to + 128-bit/16-byte multiples. """ # Store bitflag mask using int8 dtype, padded to (128//8 ==) 16 bytes. m = round_up(math.ceil(m / 8), base=16) @@ -307,9 +324,7 @@ def __init__(self, logger, mm_traits, enumerator, d_dtype_name): batch_len = len(mm_traits.batch_axis_order) self.aux_shape = mm_traits.batch_shape + [m, n] - aux_axis_order = [batch_len, batch_len + 1] + list( - mm_traits.batch_axis_order - ) # Column order for the GELU inputs. + aux_axis_order = [batch_len, batch_len + 1] + list(mm_traits.batch_axis_order) # Column order for the GELU inputs. self.aux_strides = calculate_strides(self.aux_shape, aux_axis_order) self.aux_dtype_name = d_dtype_name @@ -332,7 +347,8 @@ def update(self, mm_desc_ifc): mm_desc_ifc.epilogue_aux_ld = self.aux_ld # Set the aux batch offset. mm_desc_ifc.epilogue_aux_batch_stride = self.aux_batch_offset - # The aux data type is by default the data type of the result for all the cases we support. + # The aux data type is by default the data type of the result for all the cases we + # support. assert self.aux_dtype_name == self.d_dtype_name, "Internal error." def update_ptr(self, mm_desc_ifc, ptr): @@ -364,9 +380,7 @@ def __init__(self, logger, mm_traits, enumerator, d_dtype_name): shape = shape + [1] self.bgrad_shape = mm_traits.batch_shape + self.bgrad_shape + [1] - bgrad_axis_order = [batch_len + a for a in range(len(shape))] + list( - mm_traits.batch_axis_order - ) # Column order. + bgrad_axis_order = [batch_len + a for a in range(len(shape))] + list(mm_traits.batch_axis_order) # Column order. self.bgrad_strides = calculate_strides(self.bgrad_shape, bgrad_axis_order) self.d_dtype_name = d_dtype_name @@ -380,8 +394,9 @@ def name(self): @property def order(self): """ - For BGRADA or BGRADB, there is no layout constraint but DRELU_BGRAD and DGELU_BGRAD require COL. However currently - use COL for BGRADB as well to workaround a library issue. + For BGRADA or BGRADB, there is no layout constraint but DRELU_BGRAD and DGELU_BGRAD + require COL. However currently use COL for BGRADB as well to workaround a library + issue. """ if self.enumerator in [Epilog.DRELU_BGRAD, Epilog.DGELU_BGRAD, Epilog.BGRADB]: return cublaslt.Order.COL @@ -393,7 +408,8 @@ def attributes(self): def update(self, mm_desc_ifc): # Set the bgrad batch offset. mm_desc_ifc.bias_batch_stride = self.bgrad_batch_offset - # The bgrad data type is by default the data type of the result for all the cases we support. + # The bgrad data type is by default the data type of the result for all the cases we + # support. assert self.bgrad_dtype_name == self.d_dtype_name, "Internal error." 
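The bit-mask padding implemented by relu_aux_mm_shape above can be sanity-checked with a small standalone sketch. The round_up body below is a plausible reimplementation (its definition is not shown in this hunk), and the sample sizes are illustrative only.

import math


def round_up(m, base):
    # Round m up to the nearest multiple of base.
    return base * math.ceil(m / base)


def relu_aux_mm_shape(m, n):
    # 8 bit-flags per uint8 byte; pad M to 16 bytes (128 bits), as described above.
    return round_up(math.ceil(m / 8), base=16), n


# For M = 100: ceil(100 / 8) = 13 bytes, padded up to 16 -> auxiliary shape (16, N).
assert relu_aux_mm_shape(100, 64) == (16, 64)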
def update_ptr(self, mm_desc_ifc, ptr): @@ -432,16 +448,16 @@ def order(self): return cublaslt.Order.COL def validate(self, relu_aux_tensor): - batch_rank = len(self.mm_traits.batch_shape) - - # The relu_aux_tensor must be of rank 2 or its batched version of the latter (..., M, N). + # The relu_aux_tensor must be of rank 2 or its batched version of the latter (..., + # M, N). relu_aux_shape = list(relu_aux_tensor.shape) relu_aux_strides = list(relu_aux_tensor.strides) # The dtype must be uint8. if relu_aux_tensor.dtype != "uint8": raise ValueError( - f"The dtype of the RELU auxiliary input for epilog {self.enumerator.name} must be 'uint8'. The epilog input's dtype is '{relu_aux_tensor.dtype}'." + f"The dtype of the RELU auxiliary input for epilog {self.enumerator.name} must be 'uint8'. " + f"The epilog input's dtype is '{relu_aux_tensor.dtype}'." ) mm_traits = self.mm_traits @@ -450,43 +466,48 @@ def validate(self, relu_aux_tensor): relu_aux_batch_shape, relu_aux_mm_shape = relu_aux_shape[:-2], relu_aux_shape[-2:] relu_aux_batch_strides, relu_aux_mm_strides = relu_aux_strides[:-2], relu_aux_strides[-2:] - # The MM shape must match, the MM must be in col order, and the batch order must match. + # The MM shape must match, the MM must be in col order, and the batch order must + # match. Ma, Na = relu_aux_mm_shape if Ma != self.mm_m or Na != self.mm_n: raise ValueError( - f"The auxiliary epilog input for epilog {self.enumerator.name} must have the MM shape (..., {self.mm_m}, {self.mm_n}). The epilog input's MM shape is (..., {Ma}, {Na})." - ) - - # Check if the relu_aux_tensor batch shape and axis order match that of the MM, and it's tileable. - if len(relu_aux_batch_shape) > 0 and relu_aux_batch_shape != mm_traits.batch_shape: - raise ValueError( - f"The batch dimensions of the RELU auxiliary input {relu_aux_batch_shape} must match with that of the matrix multiplication definition {mm_traits.batch_shape}." + f"The auxiliary epilog input for epilog {self.enumerator.name} must have the MM shape " + f"(..., {self.mm_m}, {self.mm_n}). The epilog input's MM shape is (..., {Ma}, {Na})." ) + # Check if the relu_aux_tensor batch shape and axis order match that of the MM, and + # it's tileable. if len(relu_aux_batch_shape) > 0: - relu_aux_batch_axis_order = axis_order_in_memory(relu_aux_batch_strides) - if relu_aux_batch_axis_order != mm_traits.batch_axis_order: + if relu_aux_batch_shape != mm_traits.batch_shape: raise ValueError( - f"The batch axis order of the RELU auxiliary input {relu_aux_batch_axis_order} must match with that of the other operands {mm_traits.batch_axis_order}." + f"The batch dimensions of the RELU auxiliary input {relu_aux_batch_shape} must match with that " + f"of the matrix multiplication definition {mm_traits.batch_shape}." + ) + + if (relu_aux_batch_axis_order := axis_order_in_memory(relu_aux_batch_strides)) != mm_traits.batch_axis_order: + raise ValueError( + f"The batch axis order of the RELU auxiliary input {relu_aux_batch_axis_order} must match with that " + f"of the other operands {mm_traits.batch_axis_order}." ) - if len(relu_aux_batch_shape) > 0: if not check_batch_tileable(relu_aux_batch_shape, relu_aux_batch_strides): - message = f"The batch layout for RELU auxiliary input corresponding to shape = {relu_aux_batch_shape} and strides = {relu_aux_batch_strides} is currently not supported because it is not tileable." 
+ message = ( + f"The batch layout for RELU auxiliary input corresponding to shape = {relu_aux_batch_shape} " + f"and strides = {relu_aux_batch_strides} is currently not supported because it is not tileable." + ) raise ValueError(message) if relu_aux_mm_strides[0] != 1: raise ValueError( - f"The stride of the RELU auxiliary input {relu_aux_strides} must be 1 along the dimension {len(relu_aux_strides) - 2}, which corresponds to the M dimension." + f"The stride of the RELU auxiliary input {relu_aux_strides} must be 1 " + f"along the dimension {len(relu_aux_strides) - 2}, which corresponds to the M dimension." ) # Convert from bits to elements. self.batch_offset = min(relu_aux_batch_strides) * 8 if relu_aux_batch_strides else 0 # relu_aux broadcast if self.batch_offset > 0: - assert ( - self.batch_offset > 0 and self.batch_offset >= self.mm_m * 8 * self.mm_n - ), "Tensor data must not overlap." + assert self.batch_offset > 0 and self.batch_offset >= self.mm_m * 8 * self.mm_n, "Tensor data must not overlap." def update(self, mm_desc_ifc, relu_aux_tensor): # Set the epilog aux pointer. @@ -530,16 +551,16 @@ def order(self): return cublaslt.Order.COL def validate(self, gelu_aux_tensor): - batch_rank = len(self.mm_traits.batch_shape) - - # The gelu_aux_tensor must be of rank 2 or its batched version of the latter (..., M, N). + # The gelu_aux_tensor must be of rank 2 or its batched version of the latter (..., + # M, N). gelu_aux_shape = list(gelu_aux_tensor.shape) gelu_aux_strides = list(gelu_aux_tensor.strides) # The dtype must be the same as that of D. if gelu_aux_tensor.dtype != self.d_dtype_name: raise ValueError( - f"The dtype of the GELU auxiliary input for epilog {self.enumerator.name} must be '{self.d_dtype_name}'. The epilog input's dtype is '{gelu_aux_tensor.dtype}'." + f"The dtype of the GELU auxiliary input for epilog {self.enumerator.name} must be '{self.d_dtype_name}'. " + f"The epilog input's dtype is '{gelu_aux_tensor.dtype}'." ) mm_traits = self.mm_traits @@ -548,34 +569,41 @@ def validate(self, gelu_aux_tensor): gelu_aux_batch_shape, gelu_aux_mm_shape = gelu_aux_shape[:-2], gelu_aux_shape[-2:] gelu_aux_batch_strides, gelu_aux_mm_strides = gelu_aux_strides[:-2], gelu_aux_strides[-2:] - # The MM shape must match, the MM must be in col order, and the batch order must match. + # The MM shape must match, the MM must be in col order, and the batch order must + # match. Ma, Na = gelu_aux_mm_shape if Ma != self.mm_m or Na != self.mm_n: raise ValueError( - f"The auxiliary epilog input for epilog {self.enumerator.name} must have the MM shape (..., {self.mm_m}, {self.mm_n}). The epilog input's MM shape is (..., {Ma}, {Na})." - ) - - # Check if the gelu_aux_tensor batch shape and axis order match that of the MM, and it's tileable. - if len(gelu_aux_batch_shape) > 0 and gelu_aux_batch_shape != mm_traits.batch_shape: - raise ValueError( - f"The batch dimensions of the GELU auxiliary input {gelu_aux_batch_shape} must match with that of the matrix multiplication definition {mm_traits.batch_shape}." + f"The auxiliary epilog input for epilog {self.enumerator.name} must have " + f"the MM shape (..., {self.mm_m}, {self.mm_n}). The epilog input's MM shape is (..., {Ma}, {Na})." ) + # Check if the gelu_aux_tensor batch shape and axis order match that of the MM, and + # it's tileable. 
if len(gelu_aux_batch_shape) > 0: - gelu_aux_batch_axis_order = axis_order_in_memory(gelu_aux_batch_strides) - if gelu_aux_batch_axis_order != mm_traits.batch_axis_order: + if gelu_aux_batch_shape != mm_traits.batch_shape: raise ValueError( - f"The batch axis order of the GELU auxiliary input {gelu_aux_batch_axis_order} must match with that of the other operands {mm_traits.batch_axis_order}." + f"The batch dimensions of the GELU auxiliary input {gelu_aux_batch_shape} must match with that " + f"of the matrix multiplication definition {mm_traits.batch_shape}." + ) + + if (gelu_aux_batch_axis_order := axis_order_in_memory(gelu_aux_batch_strides)) != mm_traits.batch_axis_order: + raise ValueError( + f"The batch axis order of the GELU auxiliary input {gelu_aux_batch_axis_order} " + f"must match with that of the other operands {mm_traits.batch_axis_order}." ) - if len(gelu_aux_batch_shape) > 0: if not check_batch_tileable(gelu_aux_batch_shape, gelu_aux_batch_strides): - message = f"The batch layout for GELU auxiliary input corresponding to shape = {gelu_aux_batch_shape} and strides = {gelu_aux_batch_strides} is currently not supported because it is not tileable." + message = ( + f"The batch layout for GELU auxiliary input corresponding to shape = {gelu_aux_batch_shape} and " + f"strides = {gelu_aux_batch_strides} is currently not supported because it is not tileable." + ) raise ValueError(message) if gelu_aux_mm_strides[0] != 1: raise ValueError( - f"The stride of the GELU auxiliary input {gelu_aux_strides} must be 1 along the dimension {len(gelu_aux_strides) - 2}, which corresponds to the M dimension." + f"The stride of the GELU auxiliary input {gelu_aux_strides} must be 1 " + f"along the dimension {len(gelu_aux_strides) - 2}, which corresponds to the M dimension." ) self.batch_offset = min(gelu_aux_batch_strides) if gelu_aux_batch_strides else 0 # gelu_aux broadcast @@ -594,7 +622,8 @@ def update(self, mm_desc_ifc, gelu_aux_tensor): if gelu_aux_tensor.dtype != self.d_dtype_name: if self.version < 111103: raise ValueError( - f"The GELU auxiliary tensor dtype '{gelu_aux_tensor.dtype}' must be the same as the result dtype '{self.d_dtype_name}' in cuBLASLt version < 111103 (you have {self.version})." + f"The GELU auxiliary tensor dtype '{gelu_aux_tensor.dtype}' must be the same as " + f"the result dtype '{self.d_dtype_name}' in cuBLASLt version < 111103 (you have {self.version})." ) mm_desc_ifc.epilogue_aux_data_type = typemaps.NAME_TO_DATA_TYPE[gelu_aux_tensor.dtype] @@ -642,8 +671,8 @@ def update(self, mm_desc_ifc, gelu_aux_tensor): # 11301, 11.2.0 supports ^ # 11401, 11.2.2 supports ^ # 11402, 11.3.0 supports ^ - # Even though some *BIAS epilogs appear in earlier headers, because CUBLASLT_MATMUL_DESC_BIAS_BATCH_STRIDE is not - # defined we cannot use them. + # Even though some *BIAS epilogs appear in earlier headers, because + # CUBLASLT_MATMUL_DESC_BIAS_BATCH_STRIDE is not defined we cannot use them. Epilog.GELU: {"cublaslt": 11501, "ctk": "11.3.1"}, Epilog.BIAS: {"cublaslt": 11501, "ctk": "11.3.1"}, Epilog.RELU_AUX: {"cublaslt": 11501, "ctk": "11.3.1"}, @@ -660,8 +689,8 @@ def update(self, mm_desc_ifc, gelu_aux_tensor): # 11801, 11.6.0 supports ^ # 11801, 11.6.1 supports ^ # 110902, 11.6.2 supports ^ - # Even though BGRAD[A,B] and D[R,G]ELU appear in earlier headers, our unit tests mark them as unsupported before - # 111103 for in order to avoid API changes. 
+ # Even though BGRAD[A,B] and D[R,G]ELU appear in earlier headers, our unit tests mark + # them as unsupported before 111103 for in order to avoid API changes. Epilog.DRELU: {"cublaslt": 111103, "ctk": "11.8.0"}, Epilog.DGELU: {"cublaslt": 111103, "ctk": "11.8.0"}, Epilog.BGRADA: {"cublaslt": 111103, "ctk": "11.8.0"}, @@ -670,3 +699,13 @@ def update(self, mm_desc_ifc, gelu_aux_tensor): Epilog.DGELU_BGRAD: {"cublaslt": 111103, "ctk": "11.8.0"}, # 111103, 11.8.0 supports ^ } + +BATCHED_EPILOG_MINIMUM_VERSIONS_MAP = EPILOG_MINIMUM_VERSIONS_MAP.copy() +BATCHED_EPILOG_MINIMUM_VERSIONS_MAP.update( + { + Epilog.RELU_AUX: {"cublaslt": 11703, "ctk": "11.5"}, + Epilog.GELU_AUX: {"cublaslt": 11703, "ctk": "11.5"}, + Epilog.RELU_AUX_BIAS: {"cublaslt": 11703, "ctk": "11.5"}, + Epilog.GELU_AUX_BIAS: {"cublaslt": 11703, "ctk": "11.5"}, + } +) diff --git a/nvmath/linalg/_internal/matmul_desc_ifc.py b/nvmath/linalg/_internal/matmul_desc_ifc.py index a4cb772..5a3bb39 100644 --- a/nvmath/linalg/_internal/matmul_desc_ifc.py +++ b/nvmath/linalg/_internal/matmul_desc_ifc.py @@ -36,9 +36,7 @@ def _get_attribute_ctype(name): return np.ctypeslib.as_ctypes_type(cublaslt.get_matmul_desc_attribute_dtype(DescEnum[name])) -DESC_ENUM_SCALAR_ATTR_INFO = { - name: (DescEnum[name].value, _get_attribute_ctype(name)) for name in DESC_ENUM_SCALAR_ATTR -} +DESC_ENUM_SCALAR_ATTR_INFO = {name: (DescEnum[name].value, _get_attribute_ctype(name)) for name in DESC_ENUM_SCALAR_ATTR} class MatmulDescInterface: diff --git a/nvmath/linalg/_internal/matmul_pref_ifc.py b/nvmath/linalg/_internal/matmul_pref_ifc.py index 87f0d3f..fbea322 100644 --- a/nvmath/linalg/_internal/matmul_pref_ifc.py +++ b/nvmath/linalg/_internal/matmul_pref_ifc.py @@ -3,7 +3,8 @@ # SPDX-License-Identifier: Apache-2.0 """ -Interface class to encapsulate low-level calls to get and set matmul plan preference attributes. +Interface class to encapsulate low-level calls to get and set matmul plan preference +attributes. """ __all__ = ["MatmulPreferenceInterface"] diff --git a/nvmath/linalg/_internal/typemaps.py b/nvmath/linalg/_internal/typemaps.py index a97a98a..ea17b11 100644 --- a/nvmath/linalg/_internal/typemaps.py +++ b/nvmath/linalg/_internal/typemaps.py @@ -15,13 +15,15 @@ def create_default_scale_type_map(): """ - Map the data type name to the corresponding CUDA data type that's appropriate for default scale. + Map the data type name to the corresponding CUDA data type that's appropriate for + default scale. """ dt = cudaDataType scale_type_map = dict() - # scale_type_map['float8'] = dt.CUDA_R_32F # both CUDA_R_8F_E4M3 and CUDA_R_8F_E5M2 -> CUDA_R_32F + # scale_type_map['float8'] = dt.CUDA_R_32F # both CUDA_R_8F_E4M3 and CUDA_R_8F_E5M2 -> + # CUDA_R_32F scale_type_map["bfloat16"] = dt.CUDA_R_32F scale_type_map["float16"] = dt.CUDA_R_32F scale_type_map["float32"] = dt.CUDA_R_32F @@ -33,16 +35,62 @@ def create_default_scale_type_map(): return scale_type_map -def create_compute_type_map(): +def create_compute_type_to_scale_type_map(is_complex): """ - Map the data type name to the corresponding CUDA data type that's appropriate for default scale. + Map the compute type to the corresponding CUDA data type that's appropriate for + default scale. 
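The BATCHED_EPILOG_MINIMUM_VERSIONS_MAP added above only overrides the entries whose batched support arrived later (cuBLASLt 11703, CTK 11.5). A minimal sketch of how such a gate might be applied, using toy dictionaries keyed by strings and a hypothetical check_epilog_version helper rather than the library's actual tables or API:

# Toy stand-ins for EPILOG_MINIMUM_VERSIONS_MAP and its batched override; only one entry shown.
MINIMUM_VERSIONS = {"RELU_AUX": {"cublaslt": 11501, "ctk": "11.3.1"}}
BATCHED_MINIMUM_VERSIONS = {**MINIMUM_VERSIONS, "RELU_AUX": {"cublaslt": 11703, "ctk": "11.5"}}


def check_epilog_version(epilog_name, cublaslt_version, batched):
    # Consult the batched table only when the matrix multiplication is batched.
    table = BATCHED_MINIMUM_VERSIONS if batched else MINIMUM_VERSIONS
    required = table[epilog_name]["cublaslt"]
    if cublaslt_version < required:
        raise ValueError(
            f"Epilog {epilog_name} needs cuBLASLt >= {required}; found {cublaslt_version} (batched={batched})."
        )


check_epilog_version("RELU_AUX", 11600, batched=False)  # passes: 11600 >= 11501
# check_epilog_version("RELU_AUX", 11600, batched=True)  # would raise: batched support needs 11703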
""" dt = cudaDataType ct = cublas.ComputeType + scale_type_map = dict() + scale_type_map[ct.COMPUTE_16F] = dt.CUDA_R_16F + scale_type_map[ct.COMPUTE_16F_PEDANTIC] = dt.CUDA_R_16F + + f32 = dt.CUDA_C_32F if is_complex else dt.CUDA_R_32F + f64 = dt.CUDA_C_64F if is_complex else dt.CUDA_R_64F + + scale_type_map[ct.COMPUTE_32F] = f32 + scale_type_map[ct.COMPUTE_32F_PEDANTIC] = f32 + scale_type_map[ct.COMPUTE_32F_FAST_16F] = f32 + scale_type_map[ct.COMPUTE_32F_FAST_16BF] = f32 + scale_type_map[ct.COMPUTE_32F_FAST_TF32] = f32 + scale_type_map[ct.COMPUTE_64F] = f64 + scale_type_map[ct.COMPUTE_64F_PEDANTIC] = f64 + + return scale_type_map + + +def create_scale_type_to_compute_type_map(): + """ + Map the scale type to the corresponding compute type that's an appropriate default. + """ + + dt = cudaDataType + ct = cublas.ComputeType + + compute_type_map = dict() + compute_type_map[dt.CUDA_R_16F] = ct.COMPUTE_16F + compute_type_map[dt.CUDA_R_16BF] = ct.COMPUTE_32F + compute_type_map[dt.CUDA_R_32F] = ct.COMPUTE_32F + compute_type_map[dt.CUDA_C_32F] = ct.COMPUTE_32F + compute_type_map[dt.CUDA_R_64F] = ct.COMPUTE_64F + compute_type_map[dt.CUDA_C_64F] = ct.COMPUTE_64F + return compute_type_map + + +def create_compute_type_map(): + """ + Map the data type name to the corresponding CUDA data type that's appropriate for + default scale. + """ + + ct = cublas.ComputeType + compute_type_map = dict() - # compute_type_map['float8'] = ct.COMPUTE_32F # both CUDA_R_8F_E4M3 and CUDA_R_8F_E5M2 -> CUBLAS_COMPUTE_32F + # compute_type_map['float8'] = ct.COMPUTE_32F + # both CUDA_R_8F_E4M3 and CUDA_R_8F_E5M2 -> CUBLAS_COMPUTE_32F compute_type_map["bfloat16"] = ct.COMPUTE_32F compute_type_map["float16"] = ct.COMPUTE_32F compute_type_map["float32"] = ct.COMPUTE_32F @@ -61,3 +109,8 @@ def create_compute_type_map(): NAME_TO_DEFAULT_SCALE_TYPE = create_default_scale_type_map() NAME_TO_DEFAULT_COMPUTE_TYPE = create_compute_type_map() +COMPUTE_TYPE_TO_DEFAULT_SCALE_TYPE = { + "real": create_compute_type_to_scale_type_map(is_complex=False), + "complex": create_compute_type_to_scale_type_map(is_complex=True), +} +SCALE_TYPE_TO_DEFAULT_COMPUTE_TYPE = create_scale_type_to_compute_type_map() diff --git a/nvmath/linalg/_internal/utils.py b/nvmath/linalg/_internal/utils.py index 5376213..fe47e29 100644 --- a/nvmath/linalg/_internal/utils.py +++ b/nvmath/linalg/_internal/utils.py @@ -41,7 +41,8 @@ def destroy_handle(handle: int): def get_handle(device_id: int) -> int: """ - Retrieve the BLAS library handle for the specified device. If one doesn't exist, create, cache, and return the handle. + Retrieve the BLAS library handle for the specified device. If one doesn't exist, create, + cache, and return the handle. """ return HANDLES.setdefault(device_id, create_handle(device_id)) diff --git a/nvmath/linalg/advanced/_algorithmmod.py b/nvmath/linalg/advanced/_algorithmmod.py index c3ca17e..c7db5db 100644 --- a/nvmath/linalg/advanced/_algorithmmod.py +++ b/nvmath/linalg/advanced/_algorithmmod.py @@ -34,7 +34,8 @@ def __init__(self, algorithm): @property def capabilities(self): """ - Return the capabilities of this algorithm as a :class:`nvmath.linalg.advanced.AlgorithmCapabilities` dataclass. + Return the capabilities of this algorithm as a + :class:`nvmath.linalg.advanced.AlgorithmCapabilities` dataclass. 
""" names = [field.name for field in dataclasses.fields(AlgorithmCapabilities)] _capabilities = dict() @@ -73,7 +74,9 @@ def stages(self, stages): @property def split_k(self): - "The number of split-k steps (see MatmulAlgoConfigAttribute.SPLITK_NUM). This can be set only if `splitk_support` is 1 in the algorithm capabilities." + """The number of split-k steps (see MatmulAlgoConfigAttribute.SPLITK_NUM). + + This can be set only if `splitk_support` is 1 in the algorithm capabilities.""" return self.config_ifc.splitk_num @split_k.setter @@ -83,7 +86,9 @@ def split_k(self, number): @property def reduction_scheme(self): """The reduction scheme used (see MatmulAlgoConfigAttribute.REDUCTION_SCHEME). - The value provided must be consistent with the `reduction_scheme_mask` in the algorithm capabilities.""" + + The value provided must be consistent with the `reduction_scheme_mask` in the + algorithm capabilities.""" return self.config_ifc.reduction_scheme @reduction_scheme.setter @@ -92,7 +97,9 @@ def reduction_scheme(self, scheme_id): @property def cta_swizzling(self): - "A flag indicating CTA swizzling (see MatmulAlgoConfigAttribute.CTA_SWIZZLING). This can be set only if `cta_swizzling` is 1 in the algorithm capabilities." + """A flag indicating CTA swizzling (see MatmulAlgoConfigAttribute.CTA_SWIZZLING). + + This can be set only if `cta_swizzling` is 1 in the algorithm capabilities.""" return self.config_ifc.cta_swizzling @cta_swizzling.setter @@ -101,8 +108,11 @@ def cta_swizzling(self, flag: bool): @property def custom_option(self): - """A value indicating the custom option (see MatmulAlgoConfigAttribute.CUSTOM_OPTION). The value provided must be - less than `custom_option_max` in the algorithm capabilities.""" + """A value indicating the custom option (see + MatmulAlgoConfigAttribute.CUSTOM_OPTION). + + The value provided must be less than `custom_option_max` in the algorithm + capabilities.""" return self.config_ifc.custom_option @custom_option.setter @@ -111,7 +121,8 @@ def custom_option(self, value: int): @property def inner_shape(self): - """A value indicating the inner shape (see MatmulAlgoConfigAttribute.INNER_SHAPE_ID).""" + """A value indicating the inner shape (see + MatmulAlgoConfigAttribute.INNER_SHAPE_ID).""" return self.config_ifc.inner_shape_id @inner_shape.setter @@ -120,8 +131,11 @@ def inner_shape(self, shape): @property def cluster_shape(self): - """A tuple representing the cluster shape (see MatmulAlgoConfigAttribute.CLUSTER_SHAPE_ID). - The value provided must be one of the `cluster_shape_ids` in the algorithm capabilities.""" + """A tuple representing the cluster shape (see + MatmulAlgoConfigAttribute.CLUSTER_SHAPE_ID). + + The value provided must be one of the `cluster_shape_ids` in the algorithm + capabilities.""" return self.config_ifc.cluster_shape_id @cluster_shape.setter diff --git a/nvmath/linalg/advanced/_configuration.py b/nvmath/linalg/advanced/_configuration.py index c6f6f9c..cd5298d 100644 --- a/nvmath/linalg/advanced/_configuration.py +++ b/nvmath/linalg/advanced/_configuration.py @@ -34,27 +34,47 @@ @dataclasses.dataclass class MatmulOptions: - """A data class for providing options to the :class:`Matmul` object and the wrapper function :func:`matmul`. + """A data class for providing options to the :class:`Matmul` object and the wrapper + function :func:`matmul`. Attributes: - compute_type (nvmath.linalg.ComputeType): CUDA compute type. A suitable compute type will be selected if not specified. - scale_type (nvmath.CudaDataType): CUDA data type. 
A suitable data type consistent with the compute type will be - selected if not specified. - sm_count_target (int) : The number of SMs to use for execution. The default is 0, corresponding to all available SMs. - fast_accumulation (bool) : Enable or disable FP8 fast accumulation mode. The default is False (disabled). - device_id: CUDA device ordinal (used if the MM operands reside on the CPU). Device 0 will be used if not specified. - handle: Linear algebra library handle. A handle will be created if one is not provided. - logger (logging.Logger): Python Logger object. The root logger will be used if a logger object is not provided. - memory_limit: Maximum memory available to the MM operation. It can be specified as a value (with optional suffix like - K[iB], M[iB], G[iB]) or as a percentage. The default is 80% of the device memory. - blocking: A flag specifying the behavior of the execution functions and methods, such as :func:`matmul` and :meth:`Matmul.execute`. - When ``blocking`` is `True`, the execution methods do not return until the operation is complete. When ``blocking`` is - ``"auto"``, the methods return immediately when the inputs are on the GPU. The execution methods always block - when the operands are on the CPU to ensure that the user doesn't inadvertently use the result before it becomes + compute_type (nvmath.linalg.ComputeType): CUDA compute type. A suitable compute type + will be selected if not specified. + + scale_type (nvmath.CudaDataType): CUDA data type. A suitable data type consistent + with the compute type will be selected if not specified. + + sm_count_target (int) : The number of SMs to use for execution. The default is 0, + corresponding to all available SMs. + + fast_accumulation (bool) : Enable or disable FP8 fast accumulation mode. The default + is False (disabled). + + device_id: CUDA device ordinal (used if the MM operands reside on the CPU). Device 0 + will be used if not specified. + + handle: Linear algebra library handle. A handle will be created if one is not + provided. + + logger (logging.Logger): Python Logger object. The root logger will be used if a + logger object is not provided. + + memory_limit: Maximum memory available to the MM operation. It can be specified as a + value (with optional suffix like K[iB], M[iB], G[iB]) or as a percentage. The + default is 80% of the device memory. + + blocking: A flag specifying the behavior of the execution functions and methods, + such as :func:`matmul` and :meth:`Matmul.execute`. When ``blocking`` is `True`, + the execution methods do not return until the operation is complete. When + ``blocking`` is ``"auto"``, the methods return immediately when the inputs are + on the GPU. The execution methods always block when the operands are on the CPU + to ensure that the user doesn't inadvertently use the result before it becomes available. The default is ``"auto"``. - allocator: An object that supports the :class:`BaseCUDAMemoryManager` protocol, used to draw device memory. If an - allocator is not provided, a memory allocator from the library package will be used - (:func:`torch.cuda.caching_allocator_alloc` for PyTorch operands, :func:`cupy.cuda.alloc` otherwise). + + allocator: An object that supports the :class:`BaseCUDAMemoryManager` protocol, used + to draw device memory. If an allocator is not provided, a memory allocator from + the library package will be used (:func:`torch.cuda.caching_allocator_alloc` for + PyTorch operands, :func:`cupy.cuda.alloc` otherwise). 
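A brief construction sketch for the options documented above. The values are illustrative, the "80%" string form for memory_limit is an assumption based on the description, and the compute-type enumerator name is taken from the cublas.ComputeType values listed in the typemaps hunk. The resulting object would be passed via the options argument of Matmul or matmul:

import logging
import nvmath

options = nvmath.linalg.advanced.MatmulOptions(
    compute_type=nvmath.linalg.ComputeType.COMPUTE_32F_FAST_TF32,  # enumerator per the typemaps hunk
    memory_limit="80%",   # percentage form is an assumption based on the description above
    blocking="auto",      # the documented default
    logger=logging.getLogger("nvmath"),
)
# mm = nvmath.linalg.advanced.Matmul(a, b, options=options)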
See Also: :class:`Matmul`, :func:`matmul` @@ -139,16 +159,26 @@ class MatmulNumericalImplFlags(IntEnum): @dataclasses.dataclass class MatmulPlanPreferences: - """A data class for providing options to the :meth:`Matmul.plan` method and the wrapper function :func:`matmul`. + """A data class for providing options to the :meth:`Matmul.plan` method and the + wrapper function :func:`matmul`. Attributes: - reduction_scheme_mask (object of type :class:`linalg.advanced.MatmulReductionScheme`) : Enumerators from :class:`linalg.advanced.MatmulReductionScheme` - combined with bitwise operator ``|``. The default is all reduction schemes. - max_waves_count (float) : The maximum wave count. Selecting a value greater than 0. will exclude algorithms with - device utilization greater than specified. The default is 0. - numerical_impl_mask (object of type :class:`linalg.advanced.MatmulNumericalImplFlags`) : Enumerators from :class:`linalg.advanced.MatmulNumericalImplFlags` - combined with bitwise operator ``|``. The default is all numerical implementation flag choices. - limit (int) : The number of algorithms to consider. If not specified, a suitable default will be chosen. + reduction_scheme_mask (object of type + :class:`linalg.advanced.MatmulReductionScheme`) : Enumerators from + :class:`linalg.advanced.MatmulReductionScheme` combined with bitwise operator + ``|``. The default is all reduction schemes. + + max_waves_count (float) : The maximum wave count. Selecting a value greater than 0. + will exclude algorithms with device utilization greater than specified. The + default is 0. + + numerical_impl_mask (object of type + :class:`linalg.advanced.MatmulNumericalImplFlags`) : Enumerators from + :class:`linalg.advanced.MatmulNumericalImplFlags` combined with bitwise operator + ``|``. The default is all numerical implementation flag choices. + + limit (int) : The number of algorithms to consider. If not specified, a suitable + default will be chosen. See Also: :meth:`Matmul.plan`, :func:`matmul` diff --git a/nvmath/linalg/advanced/matmulmod.py b/nvmath/linalg/advanced/matmulmod.py index 24bf1f6..05324bf 100644 --- a/nvmath/linalg/advanced/matmulmod.py +++ b/nvmath/linalg/advanced/matmulmod.py @@ -32,7 +32,12 @@ from nvmath._internal import utils from nvmath.linalg._internal import matmul_desc_ifc, matmul_pref_ifc, matrix_layout_ifc -from nvmath.linalg._internal.typemaps import NAME_TO_DEFAULT_SCALE_TYPE, NAME_TO_DEFAULT_COMPUTE_TYPE +from nvmath.linalg._internal.typemaps import ( + NAME_TO_DEFAULT_SCALE_TYPE, + NAME_TO_DEFAULT_COMPUTE_TYPE, + COMPUTE_TYPE_TO_DEFAULT_SCALE_TYPE, + SCALE_TYPE_TO_DEFAULT_COMPUTE_TYPE, +) from nvmath.linalg._internal.utils import ( axis_order_in_memory, calculate_strides, @@ -46,7 +51,9 @@ EPILOG_INPUT_HANDLERS_MAP, EPILOG_OUTPUT_HANDLERS_MAP, EPILOG_MINIMUM_VERSIONS_MAP, + BATCHED_EPILOG_MINIMUM_VERSIONS_MAP, ) +from nvmath._utils import CudaDataType MatmulComputeType = cublas.ComputeType @@ -98,10 +105,17 @@ def get_mm_layout(self, transpose=False): if not transpose: return *self.mm_shape, self.ld, self.order - # Use of transpose is supported only for A and B for two specific use cases till the C library directly supports these use cases: - # 1. When A or B has the conjugate qualifier, we transpose it internally and then use conjugate transpose in the MM (A @ B.conj() == A @ B.T.H). - # 2. When the epilog is BGRADB, we transpose B internally and use transpose in the MM since this epilog requires B to be transposed (A @ B == A @ B.T.T). 
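Returning to MatmulPlanPreferences above, a brief construction sketch under similar caveats (illustrative values, and the assumption that the class is reachable as nvmath.linalg.advanced.MatmulPlanPreferences, as the cross-references suggest); the object would be passed via the preferences argument of Matmul.plan or matmul:

import nvmath

# Illustrative values: consider at most 8 candidate algorithms and exclude algorithms
# whose device utilization exceeds 1.5 waves (the default of 0. disables this filter).
prefs = nvmath.linalg.advanced.MatmulPlanPreferences(limit=8, max_waves_count=1.5)
# algorithms = mm.plan(preferences=prefs, epilog=epilog)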
- # This requires that the layout order be ROW or COL (no special layouts such as structured or hierarchical). + # Use of transpose is supported only for A and B for two specific use cases till the + # C library directly supports these use cases: + # + # 1. When A or B has the conjugate qualifier, we transpose it internally and then + # use conjugate transpose in the MM (A @ B.conj() == A @ B.T.H). + # + # 2. When the epilog is BGRADB, we transpose B internally and use transpose in the + # MM since this epilog requires B to be transposed (A @ B == A @ B.T.T). + # + # This requires that the layout order be ROW or COL (no special layouts such as + # structured or hierarchical). assert self.mm_shape is not None and self.mm_strides is not None, "Internal Error." assert self.ld != 0, "Internal Error." @@ -127,27 +141,54 @@ class ResultTraits: result_strides: Sequence[int] -def get_matrix_layout_traits(mm_shape, mm_strides, batch_strides, col_bcast): +def get_matrix_layout_traits(mm_shape, mm_strides, batch_strides, col_bcast, ordering=None, orientation=None): + """ + The 'ordering' option specifies the layout order, if it's not None, as in the case of + the D matrix whose layout is determined by the other operands' layout. It is required if + the matrix is degenerate (a vector or scalar: len(mm_shape) < 2). + + The 'orientation' option (ROW or COL) is required to infer the correct leading dimension + for degenerate matrices (a vector or scalar: len(mm_shape) < 2). + """ if len(mm_shape) < 2: # The result D can be a scalar or vector. + assert ordering is not None, "Internal Error: 'ordering' must be specified for degenerate matrices." + assert orientation is not None, "Internal Error: 'orientation' must be specified for degenerate matrices." batch_offset = min(batch_strides) if batch_strides else 0 - order = cublaslt.Order.COL - ld = max(mm_shape[0], mm_strides[0]) if len(mm_shape) == 1 else 1 + order = ordering + if len(mm_shape) < 1: + ld = 1 + elif order != orientation: + # For a ROW vector in COL order, the LD should be 1 since we promote the row + # vector to a matrix as (1, M). Similarly, for a COL vector in ROW order, the LD + # should be 1 as well since we promote the column vector to a matrix as (M, 1). + ld = 1 + else: + ld = max(mm_strides[0], mm_shape[0]) return order, ld, batch_offset M, N = mm_shape - # Important: start with the first dimension so that cases like (M, 1) : (1, 1) or (1, M) : (1, 1) in CuTe notation map to COL. - if mm_strides[0] == 1: - order = cublaslt.Order.COL - elif mm_strides[1] == 1: - order = cublaslt.Order.ROW + if ordering is not None: + order = ordering + message = f"Internal Error: incompatible ordering '{ordering}' and strides {mm_strides}" + if order == cublaslt.Order.ROW: + assert mm_strides[0] >= mm_strides[1] and mm_strides[1] == 1, message + else: + assert mm_strides[1] >= mm_strides[0] and mm_strides[0] == 1, message else: - if M == 1: + # Important: start with the first dimension so that cases like (M, 1) : (1, 1) or + # (1, M) : (1, 1) in CuTe notation map to COL. + if mm_strides[0] == 1: order = cublaslt.Order.COL - elif N == 1: + elif mm_strides[1] == 1: order = cublaslt.Order.ROW else: - raise ValueError("Unsupported layout.") + if M == 1: + order = cublaslt.Order.COL + elif N == 1: + order = cublaslt.Order.ROW + else: + raise ValueError("Unsupported layout.") # We need to handle broadcast dimensions with zero-stride for the c matrix. 
if col_bcast and N == 1: @@ -155,8 +196,9 @@ def get_matrix_layout_traits(mm_shape, mm_strides, batch_strides, col_bcast): else: ld = max(M, mm_strides[1]) if order == cublaslt.Order.COL else max(N, mm_strides[0]) - # Batch dimensions should be contiguous in memory, which we have already checked. - # The batch_offset should be based on the lowest stride in the batch dimension to account for embedded matrices. + # Batch dimensions should be contiguous in memory, which we have already checked. The + # batch_offset should be based on the lowest stride in the batch dimension to account + # for embedded matrices. batch_offset = min(batch_strides) if batch_strides else 0 return order, ld, batch_offset @@ -168,16 +210,19 @@ def get_mm_traits(a_layout, b_layout, c_layout, logger): 1. Check MM compatibility (K): a. First pad A and/or B MM dimensions if 1-D according to NumPy convention. - b. The padding is used to determine M, N, and K but should not appear in the output dimensions. + b. The padding is used to determine M, N, and K but should not appear in the output + dimensions. c. If both A and B are N-D, the dimensions must match. 2. Check batch dimensions: - a. One of A or B can have missing batch extents, in which case it is broadcast, otherwise + a. One of A or B can have missing batch extents, in which case it is broadcast, + otherwise b. A and B must have the same batch ordering. c. In addition, the batch dimensions must be tileable (contiguous in memory). Then check C: - C can be None. If C is passed in, it must be a vector or matrix. Batching rule is the same as above. + C can be None. If C is passed in, it must be a vector or matrix. Batching rule is the + same as above. """ a_shape, a_strides = list(a_layout.shape), list(a_layout.strides) b_shape, b_strides = list(b_layout.shape), list(b_layout.strides) @@ -187,7 +232,6 @@ def get_mm_traits(a_layout, b_layout, c_layout, logger): a_batch_strides, a_mm_strides = a_strides[:-2], a_strides[-2:] b_batch_strides, b_mm_strides = b_strides[:-2], b_strides[-2:] - d_mm_shape = [] if len(a_mm_shape) == 1: s, d = a_mm_shape[0], a_mm_strides[0] @@ -218,20 +262,26 @@ def get_mm_traits(a_layout, b_layout, c_layout, logger): batch_shape, batch_axis_order = [], () if len(a_batch_shape) > 0: if not check_batch_tileable(a_batch_shape, a_batch_strides): - message = f"The batch layout for A corresponding to shape = {a_batch_shape} and strides = {a_batch_strides} is currently not supported because it is not tileable." + message = ( + f"The batch layout for A corresponding to shape = {a_batch_shape} and strides = {a_batch_strides} " + "is currently not supported because it is not tileable." + ) raise ValueError(message) logger.debug( - f"The batch layout for A corresponding to shape = {a_batch_shape} and strides = {a_batch_strides} IS tileable." + f"The batch layout for A corresponding to shape = {a_batch_shape} and strides = {a_batch_strides} IS " "tileable." ) batch_shape = a_batch_shape batch_axis_order = a_batch_axis_order = axis_order_in_memory(a_batch_strides) if len(b_batch_shape) > 0: if not check_batch_tileable(b_batch_shape, b_batch_strides): - message = f"The batch layout for B corresponding to shape = {b_batch_shape} and strides = {b_batch_strides} is currently not supported because it is not tileable." + message = ( + f"The batch layout for B corresponding to shape = {b_batch_shape} and strides = {b_batch_strides} " + "is currently not supported because it is not tileable." 
+ ) raise ValueError(message) logger.debug( - f"The batch layout for B corresponding to shape = {b_batch_shape} and strides = {b_batch_strides} IS tileable." + f"The batch layout for B corresponding to shape = {b_batch_shape} and strides = {b_batch_strides} IS " "tileable." ) batch_shape = b_batch_shape batch_axis_order = b_batch_axis_order = axis_order_in_memory(b_batch_strides) @@ -240,9 +290,7 @@ def get_mm_traits(a_layout, b_layout, c_layout, logger): if a_batch_shape != b_batch_shape: raise ValueError(f"The batch dimensions of operands A {a_batch_shape} and B {b_batch_shape} must match.") if a_batch_axis_order != b_batch_axis_order: - raise ValueError( - f"The batch order of operands A {a_batch_axis_order} and B {b_batch_axis_order} must match." - ) + raise ValueError(f"The batch order of operands A {a_batch_axis_order} and B {b_batch_axis_order} must match.") logger.debug(f"The batch shape is {batch_shape} with batch axis order {batch_axis_order}.") @@ -258,9 +306,7 @@ def get_mm_traits(a_layout, b_layout, c_layout, logger): mm_shape=a_mm_shape, mm_strides=a_mm_strides, ) - logger.debug( - f"The layout order for operand A is {a_order.name}, with LD {a_ld}, and batch offset {a_batch_offset}." - ) + logger.debug(f"The layout order for operand A is {a_order.name}, with LD {a_ld}, and batch offset {a_batch_offset}.") b_order, b_ld, b_batch_offset = get_matrix_layout_traits(b_mm_shape, b_mm_strides, b_batch_strides, col_bcast=False) b_layout_traits = LayoutTraits( @@ -271,16 +317,16 @@ def get_mm_traits(a_layout, b_layout, c_layout, logger): mm_shape=b_mm_shape, mm_strides=b_mm_strides, ) - logger.debug( - f"The layout order for operand B is {b_order.name}, with LD {b_ld}, and batch offset {b_batch_offset}." - ) + logger.debug(f"The layout order for operand B is {b_order.name}, with LD {b_ld}, and batch offset {b_batch_offset}.") # Process matrix c, if provided. c_layout_traits = None if c_layout is not None: - # C can be a vector of dimension M, which is broadcast. - # C can be a matrix of dimension (M, N) or (M, 1), broadcast in the latter case and has to have contiguous strides. - # C can be batched matrices of dimension (..., M, N) or (..., M, 1), broadcast in the latter case and has to have contiguous strides. + # 1. C can be a vector of dimension M, which is broadcast. + # 2. C can be a matrix of dimension (M, N) or (M, 1), broadcast in the latter case + # and has to have contiguous strides. + # 3. C can be batched matrices of dimension (..., M, N) or (..., M, 1), broadcast in + # the latter case and has to have contiguous strides. c_shape, c_strides = list(c_layout.shape), list(c_layout.strides) c_batch_shape, c_mm_shape = c_shape[:-2], c_shape[-2:] @@ -298,31 +344,29 @@ def get_mm_traits(a_layout, b_layout, c_layout, logger): if Nc != 1 and Nc != N0: raise ValueError(f"The N dimension of the C matrix ({Nc}) must match the N dimension of B.") - if len(c_batch_shape) > 0 and c_batch_shape != batch_shape: - raise ValueError( - f"The batch dimension of operand C {c_batch_shape} must match with that of the other operands {batch_shape}." - ) if len(c_batch_shape) > 0: - c_batch_axis_order = axis_order_in_memory(c_batch_strides) - if c_batch_axis_order != batch_axis_order: + if c_batch_shape != batch_shape: raise ValueError( - f"The batch axis order of operand C {c_batch_axis_order} must match with that of the other operands {batch_axis_order}." + f"The batch dimension of operand C {c_batch_shape} must match with that of the other operands " + f"{batch_shape}." 
+ ) + + if (c_batch_axis_order := axis_order_in_memory(c_batch_strides)) != batch_axis_order: + raise ValueError( + f"The batch axis order of operand C {c_batch_axis_order} must match with that of the other " + f"operands {batch_axis_order}." ) - if len(c_batch_shape) > 0: if not check_batch_tileable(c_batch_shape, c_batch_strides): - message = f"The batch layout for C corresponding to shape = {c_batch_shape} and strides = {c_batch_strides} is currently not supported because it is not tileable." + message = ( + f"The batch layout for C corresponding to shape = {c_batch_shape} and strides = " + f"{c_batch_strides} is currently not supported because it is not tileable." + ) raise ValueError(message) - c_order, c_ld, c_batch_offset = get_matrix_layout_traits( - c_mm_shape, c_mm_strides, c_batch_strides, col_bcast=True - ) - c_layout_traits = LayoutTraits( - order=c_order, ld=c_ld, batch_offset=c_batch_offset, is_conjugate=c_layout.is_conjugate - ) - logger.debug( - f"The layout order for operand C is {c_order.name}, with LD {c_ld}, and batch offset {c_batch_offset}." - ) + c_order, c_ld, c_batch_offset = get_matrix_layout_traits(c_mm_shape, c_mm_strides, c_batch_strides, col_bcast=True) + c_layout_traits = LayoutTraits(order=c_order, ld=c_ld, batch_offset=c_batch_offset, is_conjugate=c_layout.is_conjugate) + logger.debug(f"The layout order for operand C is {c_order.name}, with LD {c_ld}, and batch offset {c_batch_offset}.") return MMTraits( M=M0, @@ -343,11 +387,12 @@ def get_result_traits(mm_traits, epilog_ordering, logger): epilog_ordering = value of type cublaslt.Order or None. The result layout is determined from: - - the ordering of operand c, if it is provided, or - - the epilog requirement, if it exists, or - - the ordering of operand a. + - the ordering of operand c, if it is provided, or + - the epilog requirement, if it exists, or + - the ordering of operand a. - The result batch dimensions must have the same extents and axis order as the inputs. The MM layout can be C or F. + The result batch dimensions must have the same extents and axis order as the inputs. The + MM layout can be C or F. """ # The result shape is the batch shape + d_mm_shape. result_shape = mm_traits.batch_shape + mm_traits.d_mm_shape @@ -371,18 +416,32 @@ def get_result_traits(mm_traits, epilog_ordering, logger): # Calculate the result strides. result_strides = calculate_strides(result_shape, result_axis_order) + # For degenerate matrices, we need to specify the result orientation. + result_orientation = None + if len(mm_traits.d_mm_shape) < 2: + if mm_traits.M == 1: + result_orientation = cublaslt.Order.ROW + elif mm_traits.N == 1: + result_orientation = cublaslt.Order.COL + # The result's traits. d_batch_strides, d_mm_strides = ( result_strides[: len(mm_traits.batch_shape)], result_strides[len(mm_traits.batch_shape) :], ) d_order, d_ld, d_batch_offset = get_matrix_layout_traits( - mm_traits.d_mm_shape, d_mm_strides, d_batch_strides, col_bcast=False + mm_traits.d_mm_shape, + d_mm_strides, + d_batch_strides, + col_bcast=False, + ordering=result_ordering, + orientation=result_orientation, ) + assert ( + d_order == result_ordering + ), f"Internal Error: d_order = {d_order.name}, result_ordering = {result_ordering.name}, mm_traits = {mm_traits}." d_layout_traits = LayoutTraits(order=d_order, ld=d_ld, batch_offset=d_batch_offset, is_conjugate=False) - logger.debug( - f"The layout order for operand D is {d_order.name}, with LD {d_ld}, and batch offset {d_batch_offset}." 
- ) + logger.debug(f"The layout order for operand D is {d_order.name}, with LD {d_ld}, and batch offset {d_batch_offset}.") return ResultTraits(result_shape=result_shape, result_strides=result_strides, d_layout_traits=d_layout_traits) @@ -390,50 +449,99 @@ def get_result_traits(mm_traits, epilog_ordering, logger): SHARED_MM_DOCUMENTATION = utils.COMMON_SHARED_DOC_MAP.copy() SHARED_MM_DOCUMENTATION.update( { - "a": "A tensor representing the first operand to the matrix multiplication (see `Semantics`). The currently supported types are " - ":class:`numpy.ndarray`, :class:`cupy.ndarray`, and :class:`torch.Tensor`.", - "b": "A tensor representing the second operand to the matrix multiplication (see `Semantics`). The currently supported types are " - ":class:`numpy.ndarray`, :class:`cupy.ndarray`, and :class:`torch.Tensor`.", - "c": "(Optional) A tensor representing the operand to add to the matrix multiplication result (see `Semantics`). The currently supported " - "types are :class:`numpy.ndarray`, :class:`cupy.ndarray`, and :class:`torch.Tensor`.", - "alpha": "The scale factor for the matrix multiplication term as a real or complex number. The default is :math:`1.0`.", - "beta": "The scale factor for the matrix addition term as a real or complex number. A value for `beta` must be provided if operand `c` is specified.", - "algorithms": "A sequence of :class:`Algorithm` objects that can be directly provided to bypass planning. The algorithm objects must " - "be compatible with the matrix multiplication. A typical use for this option is to provide algorithms serialized (pickled) " - "from a previously planned and autotuned matrix multiplication.", - "epilog": r"Specify an epilog :math:`F` as an object of type :class:`MatmulEpilog` to apply to the result of the matrix " - r"multiplication: :math:`F(\alpha A @ B + \beta C`). The default is no epilog. " - r"See `cuBLASLt documentation `_ for the list of available epilogs.", - "epilog_inputs": "Specify the additional inputs needed for the selected epilog as a dictionary, where the key is the epilog " - "input name and the value is the epilog input. The epilog input must be a tensor with the same package and in the same " - "memory space as the operands (see the constructor for more information on the operands). If the required epilog inputs " - "are not provided, an exception is raised that lists the required epilog inputs. " - "Some epilog inputs are generated by other epilogs. For example, the epilog input for :class:`MatmulEpilog.DRELU` is generated by matrix " - "multiplication with the same operands using :class:`MatmulEpilog.RELU_AUX`. ", - "qualifiers": "If desired, specify the matrix qualifiers as a :class:`numpy.ndarray` of :class:`~nvmath.linalg.advanced.matrix_qualifiers_dtype` objects " - "of length 3 corresponding to the operands `a`, `b`, and `c`.", - "options": "Specify options for the matrix multiplication as a :class:`~nvmath.linalg.advanced.MatmulOptions` object. Alternatively, a `dict` containing the parameters for the " - "``MatmulOptions`` constructor can also be provided. If not specified, the value will be set to the default-constructed ``MatmulOptions`` object.", - "preferences": "This parameter specifies the preferences for planning as a :class:`MatmulPlanPreferences` object. Alternatively, a " - "dictionary containing the parameters for the ``MatmulPlanPreferences`` constructor can also be provided. 
If not specified, the " - "value will be set to the default-constructed ``MatmulPlanPreferences`` object.", - "result": "The result of the specified matrix multiplication (epilog applied), which remains on the same device and belong to the same package as the input operands. If an epilog " - "(like :attr:`nvmath.linalg.advanced.MatmulEpilog.RELU_AUX`) that results in extra output is used, a tuple is returned with the first element being the matrix multiplication result (epilog applied) and the second " - "element being the auxiliary output provided by the selected epilog as a `dict`.", - "semantics": """The semantics of the matrix multiplication follows :func:`numpy.matmul` semantics, with some restrictions on broadcasting. In addition, the - semantics for the fused matrix addition are described below: - - * If arguments `a` and `b` are matrices, they are multiplied according to the rules of matrix multiplication. - * If argument `a` is 1-D, it is promoted to a matrix by prefixing ``1`` to its dimensions. After matrix multiplication, the prefixed ``1`` - is removed from the result's dimensions. - * If argument `b` is 1-D, it is promoted to a matrix by appending ``1`` to its dimensions. After matrix multiplication, the appended ``1`` - is removed from the result's dimensions. - * If `a` or `b` is N-D (N > 2), then the operand is treated as a batch of matrices. If both `a` and `b` are N-D, their batch dimensions - must match. If exactly one of `a` or `b` is N-D, the other operand is broadcast. - * The operand for the matrix addition `c` may be a vector of length M, a matrix of shape (M, 1) or (M, N), or batched versions of the - latter (..., M, 1) or (..., M, N). Here M and N are the dimensions of the result of the matrix multiplication. If a vector is provided - or N = 1, the columns of `c` are broadcast for the addition. If batch dimensions are not present, `c` is broadcast across batches as - needed.""", + "a": """\ +A tensor representing the first operand to the matrix multiplication (see `Semantics`). The currently supported types +are :class:`numpy.ndarray`, :class:`cupy.ndarray`, and :class:`torch.Tensor`.""".replace("\n", " "), + # + "b": """\ +A tensor representing the second operand to the matrix multiplication (see `Semantics`). The currently supported types +are :class:`numpy.ndarray`, :class:`cupy.ndarray`, and :class:`torch.Tensor`.""".replace("\n", " "), + # + "c": """\ +(Optional) A tensor representing the operand to add to the matrix multiplication result (see `Semantics`). The currently +supported types are :class:`numpy.ndarray`, :class:`cupy.ndarray`, and :class:`torch.Tensor`.""".replace("\n", " "), + # + "c_admonitions": """ + .. note:: + The broadcasting behavior of a 1-D (vector) `c` deviates from the + equivalent NumPy expression. With nvmath-python, `c` is internally + promoted to shape (M, 1) in order to broadcast with ``a @ b``; this matches the + behavior of cuBLASLt. With NumPy, a 1-D `c` behaves as if it has shape + (1, N) in the expression ``a @ b + c``. + + .. deprecated:: 0.2.1 + In order to avoid broadcasting behavior ambiguity, nvmath-python will no longer + accept a 1-D (vector) `c` starting in version 0.3.0. Use a singleton + dimension to convert your input array to 2-D. +""", + # + "alpha": """\ +The scale factor for the matrix multiplication term as a real or complex number. The default is +:math:`1.0`.""".replace("\n", " "), + # + "beta": """\ +The scale factor for the matrix addition term as a real or complex number. 
A value for `beta` must be provided if +operand `c` is specified.""".replace("\n", " "), + # + "algorithms": """\ +A sequence of :class:`Algorithm` objects that can be directly provided to bypass planning. The algorithm objects must be +compatible with the matrix multiplication. A typical use for this option is to provide algorithms serialized (pickled) +from a previously planned and autotuned matrix multiplication.""".replace("\n", " "), + # + "epilog": """\ +Specify an epilog :math:`F` as an object of type :class:`MatmulEpilog` to apply to the result of the matrix +multiplication: :math:`F(\\alpha A @ B + \\beta C`). The default is no epilog. See `cuBLASLt documentation +`_ for the list of available epilogs.""".replace("\n", " "), + # + "epilog_inputs": """\ +Specify the additional inputs needed for the selected epilog as a dictionary, where the key is the epilog input name and +the value is the epilog input. The epilog input must be a tensor with the same package and in the same memory space as +the operands (see the constructor for more information on the operands). If the required epilog inputs are not provided, +an exception is raised that lists the required epilog inputs. Some epilog inputs are generated by other epilogs. For +example, the epilog input for :class:`MatmulEpilog.DRELU` is generated by matrix multiplication with the same operands +using :class:`MatmulEpilog.RELU_AUX`. """.replace("\n", " "), + # + "qualifiers": """\ +If desired, specify the matrix qualifiers as a :class:`numpy.ndarray` of +:class:`~nvmath.linalg.advanced.matrix_qualifiers_dtype` objects of length 3 corresponding to the operands `a`, `b`, and +`c`.""".replace("\n", " "), + # + "options": """\ +Specify options for the matrix multiplication as a :class:`~nvmath.linalg.advanced.MatmulOptions` object. Alternatively, +a `dict` containing the parameters for the ``MatmulOptions`` constructor can also be provided. If not specified, the +value will be set to the default-constructed ``MatmulOptions`` object.""".replace("\n", " "), + # + "preferences": """\ +This parameter specifies the preferences for planning as a :class:`MatmulPlanPreferences` object. Alternatively, a +dictionary containing the parameters for the :class:`MatmulPlanPreferences` constructor can also be provided. If not +specified, the value will be set to the default-constructed :class:`MatmulPlanPreferences` object. +""".replace("\n", " "), + # + "result": """\ +The result of the specified matrix multiplication (epilog applied), which remains on the same device and belong to the +same package as the input operands. If an epilog (like :attr:`nvmath.linalg.advanced.MatmulEpilog.RELU_AUX`) that +results in extra output is used, a tuple is returned with the first element being the matrix multiplication result +(epilog applied) and the second element being the auxiliary output provided by the selected epilog as a +`dict`.""".replace("\n", " "), + # + "semantics": """\ + The semantics of the matrix multiplication follows :func:`numpy.matmul` semantics, with some restrictions on + broadcasting. In addition, the semantics for the fused matrix addition are described below: + + * If arguments `a` and `b` are matrices, they are multiplied according to the rules of matrix multiplication. + * If argument `a` is 1-D, it is promoted to a matrix by prefixing ``1`` to its dimensions. After matrix + multiplication, the prefixed ``1`` is removed from the result's dimensions. 
+ * If argument `b` is 1-D, it is promoted to a matrix by appending ``1`` to its dimensions. After matrix + multiplication, the appended ``1`` is removed from the result's dimensions. + * If `a` or `b` is N-D (N > 2), then the operand is treated as a batch of matrices. If both `a` and `b` are N-D, + their batch dimensions must match. If exactly one of `a` or `b` is N-D, the other operand is broadcast. + * The operand for the matrix addition `c` may be a vector of length M, a matrix of shape (M, 1) or (M, N), or + batched versions of the latter (..., M, 1) or (..., M, N). Here M and N are the dimensions of the result of + the matrix multiplication. If a vector is provided or N = 1, the columns of `c` are broadcast for the + addition. If batch dimensions are not present, `c` is broadcast across batches as needed. + * Similarly, when operating on a batch, auxiliary outputs are 3-D for all epilogs. Therefore, epilogs that return 1-D + vectors of length N in non-batched mode return 3-D matrices of size (batch, N, 1) in batched mode. +""".strip(), } ) @@ -444,48 +552,64 @@ class InvalidMatmulState(Exception): def _check_extents(shape: tuple, name: str): if any(e <= 0 for e in shape): - message = ( - f"The specified extents {shape} for operand {name} are not valid. The extents must be strictly positive. " - ) + message = f"The specified extents {shape} for operand {name} are not valid. The extents must be strictly positive. " raise ValueError(message) @utils.docstring_decorator(SHARED_MM_DOCUMENTATION, skip_missing=False) class Matmul: - r""" - Matmul(a, b, /, c=None, *, alpha=None, beta=None, qualifiers=None, options=None, stream=None) - - Create a stateful object encapsulating the specified matrix multiplication computation :math:`\alpha a @ b + \beta c` and the required - resources to perform the operation. A stateful object can be used to amortize the cost of preparation (planning in the case of matrix - multiplication) across multiple executions (also see the :ref:`Stateful APIs ` section). - - The function-form API :func:`matmul` is a convenient alternative to using stateful objects for *single* use (the user needs to - perform just one matrix multiplication, for example), in which case there is no possibility of amortizing preparatory costs. The + """ + Create a stateful object encapsulating the specified matrix multiplication computation + :math:`\\alpha a @ b + \\beta c` and the required resources to perform the operation. A + stateful object can be used to amortize the cost of preparation (planning in the case of + matrix multiplication) across multiple executions (also see the :ref:`Stateful APIs + ` section). + + The function-form API :func:`matmul` is a convenient alternative to using stateful + objects for *single* use (the user needs to perform just one matrix multiplication, for + example), in which case there is no possibility of amortizing preparatory costs. The function-form APIs are just convenience wrappers around the stateful object APIs. Using the stateful object typically involves the following steps: - 1. **Problem Specification**: Initialize the object with a defined operation and options. - 2. **Preparation**: Use :meth:`plan` to determine the best algorithmic implementation for this specific matrix multiplication operation. + 1. **Problem Specification**: Initialize the object with a defined operation and + options. + 2. **Preparation**: Use :meth:`plan` to determine the best algorithmic implementation + for this specific matrix multiplication operation. 3. 
**Execution**: Perform the matrix multiplication computation with :meth:`execute`. - 4. **Resource Management**: Ensure all resources are released either by explicitly calling :meth:`free` or by managing the stateful object within a context manager. + 4. **Resource Management**: Ensure all resources are released either by explicitly + calling :meth:`free` or by managing the stateful object within a context manager. - Detailed information on what's happening in the various phases described above can be obtained by passing in a :class:`logging.Logger` object - to :class:`MatmulOptions` or by setting the appropriate options in the root logger object, which is used by default: + Detailed information on what's happening in the various phases described above can be + obtained by passing in a :class:`logging.Logger` object to :class:`MatmulOptions` or by + setting the appropriate options in the root logger object, which is used by default: >>> import logging - >>> logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)-8s %(message)s', datefmt='%m-%d %H:%M:%S') + >>> logging.basicConfig( + ... level=logging.INFO, + ... format="%(asctime)s %(levelname)-8s %(message)s", + ... datefmt="%m-%d %H:%M:%S", + ... ) - A user can select the desired logging level and, in general, take advantage of all of the functionality offered by the Python `logging` module. + A user can select the desired logging level and, in general, take advantage of all of + the functionality offered by the Python `logging` module. Args: a: {a} + b: {b} + c: {c} + {c_admonitions} + alpha: {alpha} + beta: {beta} + qualifiers: {qualifiers} + options: {options} + stream: {stream} Semantics: @@ -505,31 +629,40 @@ class Matmul: >>> a = np.random.rand(M, K) >>> b = np.random.rand(K, N) - We will define a matrix multiplication operation followed by a RELU epilog function using the specialized matrix multiplication interface. + We will define a matrix multiplication operation followed by a RELU epilog function + using the specialized matrix multiplication interface. Create a Matmul object encapsulating the problem specification above: >>> mm = nvmath.linalg.advanced.Matmul(a, b) - Options can be provided above to control the behavior of the operation using the `options` argument (see :class:`MatmulOptions`). + Options can be provided above to control the behavior of the operation using the + `options` argument (see :class:`MatmulOptions`). - Next, plan the operation. The epilog is specified, and optionally, preferences can be specified for planning: + Next, plan the operation. The epilog is specified, and optionally, preferences can + be specified for planning: >>> epilog = nvmath.linalg.advanced.MatmulEpilog.RELU - >>> mm.plan(epilog=epilog) + >>> algorithms = mm.plan(epilog=epilog) - Certain epilog choices (like :attr:`nvmath.linalg.advanced.MatmulEpilog.BIAS`) require additional input provided using the `epilog_inputs` argument to :meth:`plan`. + Certain epilog choices (like :attr:`nvmath.linalg.advanced.MatmulEpilog.BIAS`) + require additional input provided using the `epilog_inputs` argument to + :meth:`plan`. - Now execute the matrix multiplication, and obtain the result `r1` as a NumPy ndarray. + Now execute the matrix multiplication, and obtain the result `r1` as a NumPy + ndarray. >>> r1 = mm.execute() - Finally, free the object's resources. To avoid having to explicitly making this call, it's recommended to use the Matmul object as - a context manager as shown below, if possible. 
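Relatedly, the fused-addition broadcasting semantics described earlier (where the operand `c` may be a vector of length M whose columns are broadcast over the (M, N) result, and `beta` must be supplied whenever `c` is given) can be exercised directly through the function-form API. The following is a minimal sketch only; the shapes are chosen arbitrarily and are not taken from the library's documentation:

>>> import numpy as np
>>> import nvmath
>>> M, N, K = 64, 32, 16
>>> a = np.random.rand(M, K)
>>> b = np.random.rand(K, N)
>>> c = np.random.rand(M)  # length-M vector, broadcast along the columns of the (M, N) result
>>> r = nvmath.linalg.advanced.matmul(a, b, c, beta=1.0)
>>> r.shape
(64, 32)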
+ Finally, free the object's resources. To avoid having to explicitly making this + call, it's recommended to use the Matmul object as a context manager as shown below, + if possible. >>> mm.free() - Note that all :class:`Matmul` methods execute on the current stream by default. Alternatively, the `stream` argument can be used to run a method on a specified stream. + Note that all :class:`Matmul` methods execute on the current stream by default. + Alternatively, the `stream` argument can be used to run a method on a specified + stream. Let's now look at the same problem with CuPy ndarrays on the GPU. @@ -539,25 +672,29 @@ class Matmul: >>> a = cp.random.rand(M, K) >>> b = cp.random.rand(K, N) - Create an Matmul object encapsulating the problem specification described earlier and use it as a context manager. + Create an Matmul object encapsulating the problem specification described earlier + and use it as a context manager. >>> with nvmath.linalg.advanced.Matmul(a, b) as mm: - ... mm.plan(epilog=epilog) + ... algorithms = mm.plan(epilog=epilog) ... - ... # Execute the operation to get the first result. - ... r1 = mm.execute() + ... # Execute the operation to get the first result. + ... r1 = mm.execute() ... - ... # Update operands A and B in-place (see reset_operands() for an alternative). - ... a[:] = cp.random.rand(M, K) - ... b[:] = cp.random.rand(K, N) + ... # Update operands A and B in-place (see reset_operands() for an + ... # alternative). + ... a[:] = cp.random.rand(M, K) + ... b[:] = cp.random.rand(K, N) ... - ... # Execute the operation to get the new result. - ... r2 = mm.execute() + ... # Execute the operation to get the new result. + ... r2 = mm.execute() All the resources used by the object are released at the end of the block. - Further examples can be found in the `nvmath/examples/linalg/advanced/matmul `_ directory. + Further examples can be found in the `nvmath/examples/linalg/advanced/matmul + `_ + directory. """ def __init__(self, a, b, /, c=None, *, alpha=None, beta=None, qualifiers=None, options=None, stream=None): @@ -566,7 +703,8 @@ def __init__(self, a, b, /, c=None, *, alpha=None, beta=None, qualifiers=None, o self.logger = options.logger if options.logger is not None else logging.getLogger() - # The matrix multiplication has two required operands 'a' and 'b', and one optional operand 'c'. + # The matrix multiplication has two required operands 'a' and 'b', and one optional + # operand 'c'. a = tensor_wrapper.wrap_operand(a) b = tensor_wrapper.wrap_operand(b) self.logger.info("= SPECIFICATION PHASE =") @@ -607,7 +745,8 @@ def __init__(self, a, b, /, c=None, *, alpha=None, beta=None, qualifiers=None, o self.memory_space = "cpu" self.device_id = options.device_id self.logger.info( - f"The input operands' memory space is {self.memory_space}, and the execution space is on device {self.device_id}." + f"The input operands' memory space is {self.memory_space}, and the execution space is on device " + f"{self.device_id}." ) # Allocate device memory (in stream context) if needed. @@ -619,12 +758,11 @@ def __init__(self, a, b, /, c=None, *, alpha=None, beta=None, qualifiers=None, o self.operands = tensor_wrapper.to(self.operands, self.device_id, stream_holder) # Set qualifiers. 
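The qualifiers handling just below expects a length-3 :class:`numpy.ndarray` of ``matrix_qualifiers_dtype``. As a hedged sketch (the ``is_conjugate`` field name is taken from the validation code that follows, and complex operands are used because conjugation only applies to complex data types), such an array might be constructed and passed like this:

>>> import numpy as np
>>> import nvmath
>>> M, N, K = 256, 256, 256
>>> a = np.random.rand(M, K) + 1j * np.random.rand(M, K)
>>> b = np.random.rand(K, N) + 1j * np.random.rand(K, N)
>>> qualifiers = np.zeros((3,), dtype=nvmath.linalg.advanced.matrix_qualifiers_dtype)
>>> qualifiers[0]["is_conjugate"] = True  # request conjugation of operand A
>>> with nvmath.linalg.advanced.Matmul(a, b, qualifiers=qualifiers) as mm:
...     algorithms = mm.plan()
...     r = mm.execute()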
- self.qualifiers = ( - qualifiers if qualifiers is not None else np.zeros((3,), dtype=_configuration.matrix_qualifiers_dtype) - ) + self.qualifiers = qualifiers if qualifiers is not None else np.zeros((3,), dtype=_configuration.matrix_qualifiers_dtype) if self.qualifiers.dtype != _configuration.matrix_qualifiers_dtype: raise ValueError( - "The qualifiers must be specified as a NumPy array of length 3 corresponding to the operands A, B, and C of type 'matrix_qualifiers_dtype'." + "The qualifiers must be specified as a NumPy array of length 3 corresponding to the operands A, B, and " + "C of type 'matrix_qualifiers_dtype'." ) if self.qualifiers[2]["is_conjugate"]: raise ValueError("The conjugate flag is currently not supported for operand C.") @@ -667,18 +805,6 @@ def __init__(self, a, b, /, c=None, *, alpha=None, beta=None, qualifiers=None, o else: self.handle = get_handle(self.device_id) - # Determine the scale type. - if options.scale_type is None: - self.scale_type = NAME_TO_DEFAULT_SCALE_TYPE[self.ab_dtype_name] - self.scale_type_name = typemaps.DATA_TYPE_TO_NAME[self.scale_type] - else: - self.scale_type = options.scale_type - if self.scale_type not in typemaps.DATA_TYPE_TO_NAME: - message = f"Unsupported scale type. The data type '{self.scale_type}' is currently not supported." - raise ValueError(message) - self.scale_type_name = typemaps.DATA_TYPE_TO_NAME[self.scale_type] - self.logger.info(f"The scale type is '{self.scale_type_name}'.") - # Determine the data types for a and b. self.a_dtype = typemaps.NAME_TO_DATA_TYPE[a.dtype] self.b_dtype = typemaps.NAME_TO_DATA_TYPE[b.dtype] @@ -695,25 +821,76 @@ def __init__(self, a, b, /, c=None, *, alpha=None, beta=None, qualifiers=None, o self.d_dtype_name = typemaps.DATA_TYPE_TO_NAME[self.d_dtype] self.logger.info(f"The data type for the result D is '{self.d_dtype_name}'.") + def assert_valid_compute_type(compute_type): + if compute_type not in cublas.ComputeType: + message = f"Unsupported compute type. The compute type '{compute_type}' is currently not supported." + raise ValueError(message) + + # Determine the scale type. + if options.scale_type is None: + if options.compute_type is not None: + assert_valid_compute_type(options.compute_type) + if "complex" in self.ab_dtype_name: + scale_type_map = COMPUTE_TYPE_TO_DEFAULT_SCALE_TYPE["complex"] + else: + scale_type_map = COMPUTE_TYPE_TO_DEFAULT_SCALE_TYPE["real"] + self.scale_type = scale_type_map[options.compute_type] + else: + self.scale_type = NAME_TO_DEFAULT_SCALE_TYPE[self.ab_dtype_name] + self.scale_type_name = typemaps.DATA_TYPE_TO_NAME[self.scale_type] + else: + self.scale_type = options.scale_type + if self.scale_type not in typemaps.DATA_TYPE_TO_NAME: + message = f"Unsupported scale type. The data type '{self.scale_type}' is currently not supported." + raise ValueError(message) + self.scale_type_name = typemaps.DATA_TYPE_TO_NAME[self.scale_type] + self.logger.info(f"The scale type is '{self.scale_type_name}'.") + # Determine the compute type. - self.compute_type = ( - options.compute_type - if options.compute_type is not None - else NAME_TO_DEFAULT_COMPUTE_TYPE[self.ab_dtype_name] - ) - if self.compute_type not in cublas.ComputeType: - message = f"Unsupported compute type. The compute type '{self.compute_type}' is currently not supported." 
- raise ValueError(message) + if options.compute_type is None: + if options.scale_type is not None: + self.compute_type = SCALE_TYPE_TO_DEFAULT_COMPUTE_TYPE[options.scale_type] + else: + self.compute_type = NAME_TO_DEFAULT_COMPUTE_TYPE[self.ab_dtype_name] + else: + self.compute_type = options.compute_type + assert_valid_compute_type(self.compute_type) self.logger.info(f"The compute type is {self.compute_type.name}.") + def is_supported(dtype, compute_type, scale_type): + ct = cublas.ComputeType + st = CudaDataType + if compute_type in (ct.COMPUTE_16F, ct.COMPUTE_16F_PEDANTIC): + return scale_type == st.CUDA_R_16F and dtype == "float16" + elif compute_type in (ct.COMPUTE_32F, ct.COMPUTE_32F_PEDANTIC): + if scale_type == st.CUDA_R_32F: + return dtype in ("float32", "bfloat16", "float16") + elif scale_type == st.CUDA_C_32F: + return dtype == "complex64" + elif compute_type in (ct.COMPUTE_32F_FAST_16F, ct.COMPUTE_32F_FAST_16BF, ct.COMPUTE_32F_FAST_TF32): + if scale_type == st.CUDA_R_32F: + return dtype == "float32" + if scale_type == st.CUDA_C_32F: + return dtype == "complex64" + elif compute_type in (ct.COMPUTE_64F, ct.COMPUTE_64F_PEDANTIC): + if scale_type == st.CUDA_R_64F: + return dtype == "float64" + if scale_type == st.CUDA_C_64F: + return dtype == "complex128" + return False + + if not is_supported(self.ab_dtype_name, self.compute_type, self.scale_type): + raise ValueError( + f"Selected scale_type={repr(self.scale_type)} compute_type={repr(self.compute_type)} " + + f"are not supported for data type {self.ab_dtype_name}" + ) + # Set alpha and beta. self.alpha = np.zeros((1,), dtype=self.scale_type_name) try: self.alpha[0] = alpha if alpha is not None else 1 except (ValueError, TypeError) as e: - raise ValueError( - f"The value provided for alpha {alpha} is not convertible to dtype '{self.alpha.dtype}'." - ) from e + raise ValueError(f"The value provided for alpha {alpha} is not convertible to dtype '{self.alpha.dtype}'.") from e self.beta = np.zeros((1,), dtype=self.scale_type_name) if beta is not None and self.num_operands == 2: @@ -721,9 +898,7 @@ def __init__(self, a, b, /, c=None, *, alpha=None, beta=None, qualifiers=None, o try: self.beta[0] = beta if beta is not None and self.num_operands == 3 else 0 except (ValueError, TypeError) as e: - raise ValueError( - f"The value provided for beta {beta} is not convertible to dtype '{self.beta.dtype}'." - ) from e + raise ValueError(f"The value provided for beta {beta} is not convertible to dtype '{self.beta.dtype}'.") from e # Capture operand extents and strides for consistency check when resetting operands. self.operand_extents = tuple(o.shape for o in self.operands) @@ -738,10 +913,12 @@ def __init__(self, a, b, /, c=None, *, alpha=None, beta=None, qualifiers=None, o self.mm_traits = get_mm_traits(a_layout, b_layout, c_layout, self.logger) self.result_traits = None # Wait till planning to determine this based on the epilog. self.logger.info( - f"The matrix multiplication attributes are M = {self.mm_traits.M}, N = {self.mm_traits.N}, and K = {self.mm_traits.K}." + f"The matrix multiplication attributes are M = {self.mm_traits.M}, N = {self.mm_traits.N}, and " + f"K = {self.mm_traits.K}." ) self.logger.info( - f"The batch count is {self.mm_traits.batch_count}, and the batch shape is {self.mm_traits.batch_shape} with batch axis order {self.mm_traits.batch_axis_order}." + f"The batch count is {self.mm_traits.batch_count}, and the batch shape is {self.mm_traits.batch_shape} " + f"with batch axis order {self.mm_traits.batch_axis_order}." 
) # Create and set the operation descriptor. @@ -825,7 +1002,8 @@ def _check_valid_operands(self, *args, **kwargs): what = kwargs["what"] if self.operands is None: raise RuntimeError( - f"{what} cannot be performed if the operands have been set to None. Use reset_operands() to set the desired input before using performing the {what.lower()}." + f"{what} cannot be performed if the operands have been set to None. Use reset_operands() to set the " + f"desired input before using performing the {what.lower()}." ) def _free_plan_resources(self, exception: Exception | None = None) -> bool: @@ -874,9 +1052,9 @@ def _free_workspace_memory(self, exception: Exception | None = None) -> bool: def _reset_workspace_allocation_tracking(self): """ - Reset workspace allocation tracking attributes to False at the end of the methods where workspace memory is - potentially allocated. This is necessary to prevent any exceptions raised before method entry from using - stale tracking values. + Reset workspace allocation tracking attributes to False at the end of the methods + where workspace memory is potentially allocated. This is necessary to prevent any + exceptions raised before method entry from using stale tracking values. """ self.workspace_allocated_here = False @@ -921,6 +1099,7 @@ def _allocate_workspace_memory(self, stream_holder): with utils.device_ctx(self.device_id), stream_holder.ctx: try: self.workspace_ptr = self.allocator.memalloc(self.workspace_size) + self.workspace_allocated_here = True except TypeError as e: message = ( "The method 'memalloc' in the allocator object must conform to the interface in the " @@ -931,12 +1110,14 @@ def _allocate_workspace_memory(self, stream_holder): self.workspace_allocated_size = self.workspace_size self.workspace_stream = stream_holder.obj self.logger.debug( - f"Finished allocating device workspace of size {formatters.MemoryStr(self.workspace_size)} in the context of stream {self.workspace_stream}." + f"Finished allocating device workspace of size {formatters.MemoryStr(self.workspace_size)} in the context " + f"of stream {self.workspace_stream}." ) def _allocate_workspace_memory_perhaps(self, stream_holder): """ - Allocate workspace memory using the specified allocator, if it hasn't already been done. + Allocate workspace memory using the specified allocator, if it hasn't already been + done. """ if self.workspace_ptr is not None and self.workspace_allocated_size >= self.workspace_size: @@ -952,7 +1133,8 @@ def applicable_algorithm_ids(self, limit=8): limit: The maximum number of applicable algorithm IDs that is desired Returns: - A sequence of algorithm IDs that are applicable to this matrix multiplication problem specification, in random order. + A sequence of algorithm IDs that are applicable to this matrix multiplication + problem specification, in random order. """ ... algo_ids = cublaslt.matmul_algo_get_ids( @@ -977,66 +1159,89 @@ def plan( Args: preferences: {preferences} + algorithms: {algorithms} + epilog: {epilog} + epilog_inputs: {epilog_inputs} + stream: {stream} Returns: - A sequence of :class:`nvmath.linalg.advanced.Algorithm` objects that are applicable to this matrix multiplication problem - specification, heuristically ordered from fastest to slowest. + A sequence of :class:`nvmath.linalg.advanced.Algorithm` objects that are + applicable to this matrix multiplication problem specification, heuristically + ordered from fastest to slowest. Notes: - Epilogs that have ``BIAS`` in their name need an epilog input with the key ``'bias'``. 
- Epilogs that have ``DRELU`` need an epilog input with the key ``'relu_aux'``, which is produced in a "forward pass" epilog like ``RELU_AUX`` or ``RELU_AUX_BIAS``. - Similarly, epilogs with ``DGELU`` in their name require an epilog input with the key ``'gelu_aux'``, produced in the corresponding forward pass operation. + Epilogs that have ``BIAS`` in their name need an epilog input with the key + ``'bias'``. Epilogs that have ``DRELU`` need an epilog input with the key + ``'relu_aux'``, which is produced in a "forward pass" epilog like ``RELU_AUX`` + or ``RELU_AUX_BIAS``. Similarly, epilogs with ``DGELU`` in their name require an + epilog input with the key ``'gelu_aux'``, produced in the corresponding forward + pass operation. Examples: >>> import numpy as np >>> import nvmath - Create two 3-D float64 ndarrays on the CPU representing batched matrices, along with a bias vector: + Create two 3-D float64 ndarrays on the CPU representing batched matrices, along + with a bias vector: >>> batch = 32 >>> M, N, K = 1024, 1024, 1024 >>> a = np.random.rand(batch, M, K) >>> b = np.random.rand(batch, K, N) - >>> bias = np.random.rand(M) # The bias vector will be broadcast along the columns, as well as along the batch dimension. + >>> # The bias vector will be broadcast along the columns, as well as along the + >>> # batch dimension. + >>> bias = np.random.rand(M) - We will define a matrix multiplication operation followed by a :attr:`nvmath.linalg.advanced.MatmulEpilog.RELU_BIAS` epilog function. + We will define a matrix multiplication operation followed by a + :attr:`nvmath.linalg.advanced.MatmulEpilog.RELU_BIAS` epilog function. >>> with nvmath.linalg.advanced.Matmul(a, b) as mm: - ... - ... # Plan the operation with RELU_BIAS epilog and corresponding epilog input. + ... # Plan the operation with RELU_BIAS epilog and corresponding epilog + ... # input. ... p = nvmath.linalg.advanced.MatmulPlanPreferences(limit=8) ... epilog = nvmath.linalg.advanced.MatmulEpilog.RELU_BIAS - ... epilog_inputs = {{'bias': bias}} - ... mm.plan(preferences=p, epilog=epilog, epilog_inputs=epilog_inputs) # The preferences can also be provided as a dict: {{'limit': 8}} + ... epilog_inputs = {{"bias": bias}} + ... # The preferences can also be provided as a dict: {{'limit': 8}} + ... algorithms = mm.plan( + ... preferences=p, + ... epilog=epilog, + ... epilog_inputs=epilog_inputs, + ... ) ... - ... # Execute the matrix multiplication, and obtain the result `r` as a NumPy ndarray. + ... # Execute the matrix multiplication, and obtain the result `r` as a + ... # NumPy ndarray. ... r = mm.execute() - Some epilogs like :attr:`nvmath.linalg.advanced.MatmulEpilog.RELU_AUX` produce auxiliary output. + Some epilogs like :attr:`nvmath.linalg.advanced.MatmulEpilog.RELU_AUX` produce + auxiliary output. >>> with nvmath.linalg.advanced.Matmul(a, b) as mm: - ... ... # Plan the operation with RELU_AUX epilog> ... epilog = nvmath.linalg.advanced.MatmulEpilog.RELU_AUX - ... mm.plan(epilog=epilog) + ... algorithms = mm.plan(epilog=epilog) ... - ... # Execute the matrix multiplication, and obtain the result `r` along with the auxiliary output. + ... # Execute the matrix multiplication, and obtain the result `r` along + ... # with the auxiliary output. ... r, auxiliary = mm.execute() - The auxiliary output is a Python `dict` with the names of each auxiliary output as keys. + The auxiliary output is a Python `dict` with the names of each auxiliary output + as keys. 
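To make the auxiliary-output flow concrete, here is a hedged sketch (not taken from the library's documentation) in which the auxiliary output of a ``RELU_AUX`` matrix multiplication is fed back as the ``'relu_aux'`` epilog input of a ``DRELU`` matrix multiplication on the same operands; it assumes that the keys of the auxiliary `dict` match the required epilog input names:

>>> with nvmath.linalg.advanced.Matmul(a, b) as mm:
...     algorithms = mm.plan(epilog=nvmath.linalg.advanced.MatmulEpilog.RELU_AUX)
...     r_forward, aux = mm.execute()
>>> with nvmath.linalg.advanced.Matmul(a, b) as mm:
...     algorithms = mm.plan(
...         epilog=nvmath.linalg.advanced.MatmulEpilog.DRELU,
...         epilog_inputs=aux,
...     )
...     r_backward = mm.execute()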
- Further examples can be found in the `nvmath/examples/linalg/advanced/matmul `_ directory. + Further examples can be found in the `nvmath/examples/linalg/advanced/matmul + `_ + directory. """ self.logger.info("= PLANNING PHASE =") # Clear epilog operands, since different epilogs can be provided in different calls. - # We don't need to worry about ordering, since it's the user's responsibility to order calls that accept a stream argument. - # This applies to CPU operands as well, even though we move them to the GPU, since the execution is blocking. + # We don't need to worry about ordering, since it's the user's responsibility to + # order calls that accept a stream argument. This applies to CPU operands as well, + # even though we move them to the GPU, since the execution is blocking. self.epilog_operands = dict() # Clear operands in case of repeated planning. self.epilog_input_name_to_handler = dict() # Clear input name to handler map as well, self.epilog_inputs_traits = dict() # ... and the input traits as well. @@ -1048,13 +1253,11 @@ def plan( # Base FLOP count. self.flop_count = 2 * mm_traits.M * mm_traits.N * mm_traits.K - self.logger.info( - f"The base matrix multiplication FLOP count is {formatters.FLOPSStr(self.flop_count, 'FLOP')}." - ) + self.logger.info(f"The base matrix multiplication FLOP count is {formatters.FLOPSStr(self.flop_count, 'FLOP')}.") if epilog is None and epilog_inputs is not None: self.logger.warning( - f"Matmul: The provided epilog inputs {epilog_inputs.keys()} are ignored since an epilog is not specified." + f"Matmul: The provided epilog inputs {epilog_inputs.keys()} are ignored since an epilog is not " "specified." ) self.epilog = epilog @@ -1064,11 +1267,22 @@ def plan( self.logger.info(f"The specified epilog is {epilog.name}.") epilog_minimum_versions = EPILOG_MINIMUM_VERSIONS_MAP[epilog] + batched_epilog_minimum_versions = BATCHED_EPILOG_MINIMUM_VERSIONS_MAP[epilog] version = cublaslt.get_version() if version < epilog_minimum_versions["cublaslt"]: - message = f"The epilog {epilog.name} requires cublaslt >= {epilog_minimum_versions['cublaslt']}; you have version {version}. Update to CUDA Toolkit >= {epilog_minimum_versions['ctk']}." + message = ( + f"The epilog {epilog.name} requires cublaslt >= {epilog_minimum_versions['cublaslt']}; " + f"you have version {version}. Update to CUDA Toolkit >= {epilog_minimum_versions['ctk']}." + ) raise ValueError(message) + if len(mm_traits.batch_shape) > 0 and version < batched_epilog_minimum_versions["cublaslt"]: + message = ( + f"The epilog {epilog.name} supports batching in " + f"cublaslt >= {batched_epilog_minimum_versions['cublaslt']}; " + f"you have version {version}. Update to CUDA Toolkit >= {epilog_minimum_versions['ctk']}." + ) + raise ValueError(message) if ( self.mm_traits.c_layout_traits is not None and self.mm_traits.c_layout_traits.order == cublaslt.Order.ROW @@ -1088,7 +1302,8 @@ def plan( for handler_type in epilog_input_handler_types ] - # Check if the epilog requires a specific result layout, and if the requirement is consistent for all the handlers. + # Check if the epilog requires a specific result layout, and if the + # requirement is consistent for all the handlers. epilog_input_handlers_ordering = {h.order for h in epilog_input_handlers} assert len(epilog_input_handlers_ordering) == 1, "Internal error." 
epilog_ordering = epilog_input_handlers_ordering.pop() @@ -1103,7 +1318,8 @@ def plan( if required_epilog_input_names != set(epilog_inputs.keys()): raise ValueError( - f"The epilog {epilog.name} requires the following input tensors: {required_epilog_input_names}. The provided tensor names are: {epilog_inputs.keys()}" + f"The epilog {epilog.name} requires the following input tensors: " + f"{required_epilog_input_names}. The provided tensor names are: {epilog_inputs.keys()}" ) # Wrap epilog inputs. Take a copy of the user-provided dict. @@ -1111,20 +1327,20 @@ def plan( for name in epilog_inputs: epilog_inputs[name] = tensor_wrapper.wrap_operand(epilog_inputs[name]) - # Check if epilog inputs all belong to the same package, which is the same as the package of the MM operands. + # Check if epilog inputs all belong to the same package, which is the same + # as the package of the MM operands. epilog_package = utils.get_operands_package(list(epilog_inputs.values())) - epilog_package = ( - "cupy" if epilog_package == "numpy" else epilog_package - ) # Handle the NumPy <=> CuPy asymmetry. + epilog_package = "cupy" if epilog_package == "numpy" else epilog_package # Handle the NumPy <=> CuPy asymmetry. if self.package != epilog_package: message = f"Library package mismatch for epilog: '{self.package}' => '{epilog_package}'" raise TypeError(message) - # Check if all epilog inputs all are on the same device, which is the device of the operands. + # Check if all epilog inputs all are on the same device, which is the device + # of the operands. device_id = utils.get_operands_device_id(list(epilog_inputs.values())) if device_id is not None and self.device_id != device_id: raise ValueError( - f"The epilog inputs must be on the same device ({device_id}) as the operands ({self.device_id})." + f"The epilog inputs must be on the same device ({device_id}) as the operands " f"({self.device_id})." ) # Move epilog inputs to the GPU, if needed. @@ -1136,16 +1352,19 @@ def plan( for e in required_epilog_input_names: self.epilog_operands[e] = epilog_inputs[e] - # First validate all epilog inputs. Use the GPU tensors in case metadata has changed. + # First validate all epilog inputs. Use the GPU tensors in case metadata has + # changed. for handler in epilog_input_handlers: handler.validate(epilog_inputs[handler.name]) - # Finally, update the MM descriptor. Note that we pass in self.epilog_operands (which are on the GPU). + # Finally, update the MM descriptor. Note that we pass in + # self.epilog_operands (which are on the GPU). for handler in epilog_input_handlers: handler.update(self.mm_desc_ifc, self.epilog_operands[handler.name]) self.epilog_input_name_to_handler[handler.name] = handler - # Capture the epilog operands traits for consistency checks when resetting operands. + # Capture the epilog operands traits for consistency checks when resetting + # operands. self.epilog_inputs_traits = { name: EpilogInputTraits( dtype=self.epilog_operands[name].dtype, @@ -1162,7 +1381,8 @@ def plan( for handler_type in epilog_output_handler_types ] - # Check if the epilog requires a specific result layout, and if the requirement is consistent for all the handlers. + # Check if the epilog requires a specific result layout, and if the + # requirement is consistent for all the handlers. epilog_output_handlers_ordering = {h.order for h in epilog_output_handlers} assert len(epilog_output_handlers_ordering) == 1, "Internal error." 
op_epilog_ordering = epilog_output_handlers_ordering.pop() @@ -1175,13 +1395,16 @@ def plan( for handler in epilog_output_handlers: handler.update(self.mm_desc_ifc) - # Set the epilog. At this point, we're sure that the epilog inputs, if any, are valid and have been set. + # Set the epilog. At this point, we're sure that the epilog inputs, if any, are + # valid and have been set. self.mm_desc_ifc.epilogue = epilog # Fill the result traits, now that we know the epilog. self.result_traits = result_traits = get_result_traits(mm_traits, epilog_ordering, self.logger) self.logger.info( - f"The layout order for the result D is {self.result_traits.d_layout_traits.order.name}, with LD {self.result_traits.d_layout_traits.ld}, and batch offset {self.result_traits.d_layout_traits.batch_offset}." + f"The layout order for the result D is {self.result_traits.d_layout_traits.order.name}, with LD " + f"{self.result_traits.d_layout_traits.ld}, and batch offset " + f"{self.result_traits.d_layout_traits.batch_offset}." ) preferences = utils.check_or_create_options( @@ -1194,25 +1417,29 @@ def plan( self.mm_desc_ifc.transa = cublas.Operation.C transpose = True self.logger.debug( - "To conjugate A, the operand A will be internally transposed and the matrix multiplication will be performed with OP_C for operand A." + "To conjugate A, the operand A will be internally transposed and the matrix multiplication will be " + "performed with OP_C for operand A." ) m, n, ld, a_order = mm_traits.a_layout_traits.get_mm_layout(transpose=transpose) self.a_layout_ptr = cublaslt.matrix_layout_create(self.a_dtype, rows=m, cols=n, ld=ld) self.logger.debug(f"Layout for A: rows = {m}, cols = {n}, ld = {ld}.") - # Internally transpose operand B if required (conjugate flag, or epilog is BGRADB) and create layout. + # Internally transpose operand B if required (conjugate flag, or epilog is BGRADB) + # and create layout. transpose = False if mm_traits.b_layout_traits.is_conjugate and "complex" in self.ab_dtype_name: self.mm_desc_ifc.transb = cublas.Operation.C transpose = True self.logger.debug( - "To conjugate B, the operand B will be internally transposed and the matrix multiplication will be performed with OP_C for operand B." + "To conjugate B, the operand B will be internally transposed and the matrix multiplication will be " + "performed with OP_C for operand B." ) elif epilog == _configuration.MatmulEpilog.BGRADB: self.mm_desc_ifc.transb = cublas.Operation.T transpose = True self.logger.debug( - "For BGRADB epilog, the operand B will be internally transposed and the matrix multiplication will be performed with OP_T for operand B." + "For BGRADB epilog, the operand B will be internally transposed and the matrix multiplication will be " + "performed with OP_T for operand B." ) m, n, ld, b_order = mm_traits.b_layout_traits.get_mm_layout(transpose=transpose) self.b_layout_ptr = cublaslt.matrix_layout_create(self.b_dtype, rows=m, cols=n, ld=ld) @@ -1261,7 +1488,8 @@ def plan( if self.preference_ptr is None: self.preference_ptr = cublaslt.matmul_preference_create() else: - # We need to create a new preferences object to avoid preferences being set in a cumulative manner if plan() is called multiple times. + # We need to create a new preferences object to avoid preferences being set in a + # cumulative manner if plan() is called multiple times. 
cublaslt.matmul_preference_destroy(self.preference_ptr) self.preference_ptr = cublaslt.matmul_preference_create() @@ -1282,9 +1510,7 @@ def plan( if self.num_operands == 3: c_ptr = self.operands[2].data_ptr preference_ifc.min_alignment_c_bytes = min(256, pointer_aligned_to(c_ptr)) - self.logger.debug( - f"The minimum alignment for operand C is {preference_ifc.min_alignment_c_bytes} bytes." - ) + self.logger.debug(f"The minimum alignment for operand C is {preference_ifc.min_alignment_c_bytes} bytes.") # The result alignment should be 256 bytes. self.logger.debug("The minimum alignment for the result D is the default 256 bytes.") @@ -1328,11 +1554,13 @@ def plan( if algorithms is None: self.logger.info( - f"The plan found {num_algorithms} suitable algorithms within the requested limit of {limit} algorithms, with a workspace requirement of {formatters.MemoryStr(self.workspace_size)}." + f"The plan found {num_algorithms} suitable algorithms within the requested limit of {limit} " + f"algorithms, with a workspace requirement of {formatters.MemoryStr(self.workspace_size)}." ) else: self.logger.info( - f"The plan is using {num_algorithms} algorithm passed through the algorithms argument, with a workspace requirement of {formatters.MemoryStr(self.workspace_size)}." + f"The plan is using {num_algorithms} algorithm passed through the algorithms argument, with a " + f"workspace requirement of {formatters.MemoryStr(self.workspace_size)}." ) self.mm_planned = True @@ -1344,10 +1572,12 @@ def plan( @property def algorithms(self): """ - After planning using :meth:`plan()`, get the sequence of algorithm objects to inquire their capabilities, configure them, or serialize them for later use. + After planning using :meth:`plan()`, get the sequence of algorithm objects to + inquire their capabilities, configure them, or serialize them for later use. Returns: - A sequence of :class:`nvmath.linalg.advanced.Algorithm` objects that are applicable to this matrix multiplication problem specification. + A sequence of :class:`nvmath.linalg.advanced.Algorithm` objects that are + applicable to this matrix multiplication problem specification. """ return self.algorithm_objects @@ -1366,7 +1596,8 @@ def _check_and_set_operand( strides=None, ): """ - Check to make sure that the provided operand is consistent with the one it's updating, and update it. + Check to make sure that the provided operand is consistent with the one it's + updating, and update it. """ assert (operand_index is None) ^ (epilog_name is None), "Internal Error." @@ -1377,11 +1608,13 @@ def _check_and_set_operand( package = utils.infer_object_package(operand.tensor) # Conjugate flag of the provided operands must match the original qualifiers - if operand_index is not None and package == "torch" and self.lazy_conjugation: - if self.qualifiers[operand_index]["is_conjugate"] != operand.tensor.is_conj(): - raise ValueError( - f"The provided operand {operand_name} has different conjugate flag than the original operand" - ) + if ( + operand_index is not None + and package == "torch" + and self.lazy_conjugation + and self.qualifiers[operand_index]["is_conjugate"] != operand.tensor.is_conj() + ): + raise ValueError(f"The provided operand {operand_name} has different conjugate flag than the original operand") device_id = operand.device_id if device_id is None: @@ -1405,7 +1638,8 @@ def _check_and_set_operand( # Update the epilog pointer, since we're starting afresh. 
self.epilog_input_name_to_handler[epilog_name].update(mm_desc_ifc, o) else: - # In-place copy to existing device pointer because the new operand is on the CPU. + # In-place copy to existing device pointer because the new operand is on the + # CPU. tensor_wrapper.copy_([operand], [o], stream_holder) else: if self.package != package: @@ -1437,12 +1671,17 @@ def reset_operands(self, a=None, b=None, c=None, *, alpha=None, beta=None, epilo """ Reset the operands held by this :class:`Matmul` instance. - This method has two use cases: (1) it can be used to provide new operands for execution when the original operands are on the CPU, - or (2) it can be used to release the internal reference to the previous operands and make their memory available for other use by - passing ``None`` for *all* arguments. In this case, this method must be called again to provide the desired operands before another - call to execution APIs like :meth:`autotune` or :meth:`execute`. + This method has two use cases: + (1) it can be used to provide new operands for execution when the original + operands are on the CPU + (2) it can be used to release the internal reference to the previous operands + and make their memory available for other use by passing ``None`` for *all* + arguments. In this case, this method must be called again to provide the + desired operands before another call to execution APIs like :meth:`autotune` + or :meth:`execute`. - This method is not needed when the operands reside on the GPU and in-place operations are used to update the operand values. + This method is not needed when the operands reside on the GPU and in-place + operations are used to update the operand values. This method will perform various checks on the new operands to make sure: @@ -1452,11 +1691,18 @@ def reset_operands(self, a=None, b=None, c=None, *, alpha=None, beta=None, epilo Args: a: {a} + b: {b} + c: {c} + {c_admonitions} + alpha: {alpha} + beta: {beta} + epilog_inputs: {epilog_inputs} + stream: {stream} Examples: @@ -1473,30 +1719,37 @@ def reset_operands(self, a=None, b=None, c=None, *, alpha=None, beta=None, epilo Create an matrix multiplication object as a context manager >>> with nvmath.linalg.advanced.Matmul(a, b) as mm: - ... # Plan the operation. - ... mm.plan() + ... # Plan the operation. + ... algorithms = mm.plan() ... - ... # Execute the MM to get the first result. - ... r1 = mm.execute() + ... # Execute the MM to get the first result. + ... r1 = mm.execute() ... - ... # Reset the operands to new CuPy ndarrays. - ... c = cp.random.rand(M, K) - ... d = cp.random.rand(K, N) - ... mm.reset_operands(c, d) + ... # Reset the operands to new CuPy ndarrays. + ... c = cp.random.rand(M, K) + ... d = cp.random.rand(K, N) + ... mm.reset_operands(c, d) ... - ... # Execute to get the new result corresponding to the updated operands. - ... r2 = mm.execute() + ... # Execute to get the new result corresponding to the updated operands. + ... r2 = mm.execute() - Note that if only a subset of operands are reset, the operands that are not reset hold their original values. + Note that if only a subset of operands are reset, the operands that are not + reset hold their original values. - With :meth:`reset_operands`, minimal overhead is achieved as problem specification and planning are only performed once. + With :meth:`reset_operands`, minimal overhead is achieved as problem + specification and planning are only performed once. 
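For use case (1) above, where the original operands reside on the CPU and therefore cannot be updated in place from the execution space, a minimal sketch with NumPy operands might look as follows (shapes chosen arbitrarily):

>>> import numpy as np
>>> M, N, K = 1024, 1024, 1024
>>> a = np.random.rand(M, K)
>>> b = np.random.rand(K, N)
>>> with nvmath.linalg.advanced.Matmul(a, b) as mm:
...     algorithms = mm.plan()
...     r1 = mm.execute()
...     # Provide new CPU operands; specification and planning are not repeated.
...     mm.reset_operands(np.random.rand(M, K), np.random.rand(K, N))
...     r2 = mm.execute()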
- For the particular example above, explicitly calling :meth:`reset_operands` is equivalent to updating the operands in-place, i.e, replacing ``mm.reset_operand(c, d)`` with ``a[:]=c`` and ``b[:]=d``. - Note that updating the operand in-place should be adopted with caution as it can only yield the expected result under the additional constraint below: + For the particular example above, explicitly calling :meth:`reset_operands` is + equivalent to updating the operands in-place, i.e, replacing + ``mm.reset_operand(c, d)`` with ``a[:]=c`` and ``b[:]=d``. Note that updating + the operand in-place should be adopted with caution as it can only yield the + expected result under the additional constraint below: - - The operand is on the GPU (more precisely, the operand memory space should be accessible from the execution space). + - The operand is on the GPU (more precisely, the operand memory space should + be accessible from the execution space). - For more details, please refer to `inplace update example `_. + For more details, please refer to `inplace update example + `_. """ if c is not None and self.num_operands == 2: @@ -1510,7 +1763,8 @@ def reset_operands(self, a=None, b=None, c=None, *, alpha=None, beta=None, epilo self.logger.info("The operands have been reset to None.") return - # If the operands have been reset to None, then all required operands (a, b, c, and epilog_inputs need to be provided). + # If the operands have been reset to None, then all required operands (a, b, c, and + # epilog_inputs need to be provided). if self.operands is None: if a is None or b is None or (c is None and self.num_operands == 3): op_names = "A, B" @@ -1525,7 +1779,8 @@ def reset_operands(self, a=None, b=None, c=None, *, alpha=None, beta=None, epilo # Check that all required epilog inputs names are provided. if epilog_names != epilog_inputs.keys(): raise ValueError( - f"The epilog inputs {epilog_names} are required. The provided epilog input names are {epilog_inputs.keys()}." + f"The epilog inputs {epilog_names} are required. The provided epilog input names are " + f"{epilog_inputs.keys()}." ) self.operands = [None] * self.num_operands self.epilog_operands = {name: None for name in epilog_names} @@ -1546,9 +1801,7 @@ def reset_operands(self, a=None, b=None, c=None, *, alpha=None, beta=None, epilo # Update beta. if beta is not None: if self.num_operands == 2: - self.logger.warning( - f"Matmul: The provided beta value {beta} is ignored since operand C is not specified." - ) + self.logger.warning(f"Matmul: The provided beta value {beta} is ignored since operand C is not specified.") else: try: self.beta[0] = beta @@ -1625,13 +1878,18 @@ def autotune( self, iterations=3, prune=None, release_workspace=False, stream=None ): # Prune means keep top N of the algorithms only. """ - Autotune the matrix multiplication to order the algorithms from the fastest measured execution time to the slowest. Once autotuned, - the optimally-ordered algorithm sequence can be accessed using :py:attr:`algorithms`. + Autotune the matrix multiplication to order the algorithms from the fastest measured + execution time to the slowest. Once autotuned, the optimally-ordered algorithm + sequence can be accessed using :py:attr:`algorithms`. Args: iterations: The number of autotuning iterations to perform. - prune: An integer N, specifying the top N fastest algorithms to retain after autotuning. The default is to retain all algorithms. + + prune: An integer N, specifying the top N fastest algorithms to retain after + autotuning. 
The default is to retain all algorithms. + release_workspace: {release_workspace} + stream: {stream} """ self.logger.info("= AUTOTUNING PHASE =") @@ -1660,7 +1918,13 @@ def autotune( name = handler.name shape, strides, dtype_name = handler.attributes() epilog_outputs[name] = aux = utils.create_empty_tensor( - self.result_class, shape, dtype_name, self.device_id, stream_holder, strides + self.result_class, + shape, + dtype_name, + self.device_id, + stream_holder, + verify_strides=False, + strides=strides, ) # Update the data pointer in the MM descriptor. @@ -1673,7 +1937,8 @@ def autotune( self.d_dtype_name, self.device_id, stream_holder, - self.result_traits.result_strides, + verify_strides=False, + strides=self.result_traits.result_strides, ) result_ptr = result.data_ptr @@ -1726,7 +1991,8 @@ def execute_matmul(algorithm_ptr): # Get the sort order based on the GPU times. sorted_gpu_times, sort_order = zip(*sorted(zip(gpu_times, range(num_algorithms), strict=True)), strict=True) - # Reorder the algorithms buffer and algorithm objects according to the sort order, and prune it. + # Reorder the algorithms buffer and algorithm objects according to the sort order, + # and prune it. sort_order = sort_order[:limit] self.algorithms_buffer = self.algorithms_buffer[list(sort_order)] self.algorithm_objects = tuple(self.algorithm_objects[i] for i in sort_order) @@ -1740,15 +2006,18 @@ def execute_matmul(algorithm_ptr): orig_flop_rate = self.flop_count / gpu_times[0] * 1000 if sort_order[0] != 0: self.logger.info( - f"Autotuning found that the algorithm originally ranked {sort_order[0]} is the best out of the {num_algorithms} in the plan, and moved it to rank 0." + f"Autotuning found that the algorithm originally ranked {sort_order[0]} is the best out of the " + f"{num_algorithms} in the plan, and moved it to rank 0." ) new_flop_rate = self.flop_count / sorted_gpu_times[0] * 1000 self.logger.info( - f"Autotuning has improved performance from {formatters.FLOPSStr(orig_flop_rate, 'FLOP/s')} to {formatters.FLOPSStr(new_flop_rate, 'FLOP/s')}." + f"Autotuning has improved performance from {formatters.FLOPSStr(orig_flop_rate, 'FLOP/s')} to " + f"{formatters.FLOPSStr(new_flop_rate, 'FLOP/s')}." ) else: self.logger.info( - f"Autotuning found that the algorithm ranked best by the plan heuristics remains the best out of the {num_algorithms} algorithms in the plan." + f"Autotuning found that the algorithm ranked best by the plan heuristics remains the best out of the " + f"{num_algorithms} algorithms in the plan." ) self.logger.info(f"The best performance remains at {formatters.FLOPSStr(orig_flop_rate, 'FLOP/s')}.") @@ -1764,9 +2033,12 @@ def execute(self, *, algorithm=None, release_workspace=False, stream=None): Execute a prepared (planned and possibly autotuned) matrix multiplication. Args: - algorithm: (Experimental) An algorithm chosen from the sequence returned by :meth:`plan` or :py:attr:`algorithms`. By default, - the first algorithm in the sequence is used. + algorithm: (Experimental) An algorithm chosen from the sequence returned by + :meth:`plan` or :py:attr:`algorithms`. By default, the first algorithm in + the sequence is used. 
+ release_workspace: {release_workspace} + stream: {stream} Returns: @@ -1790,11 +2062,15 @@ def execute(self, *, algorithm=None, release_workspace=False, stream=None): shape, strides, dtype_name = handler.attributes() if log_debug: self.logger.debug(f"Beginning auxiliary output tensor '{name}' creation...") - self.logger.debug( - f"The '{name}' tensor shape = {shape} with strides = {strides} and data type '{dtype_name}'." - ) + self.logger.debug(f"The '{name}' tensor shape = {shape} with strides = {strides} and data type '{dtype_name}'.") self.epilog_outputs[name] = aux = utils.create_empty_tensor( - self.result_class, shape, dtype_name, self.device_id, stream_holder, strides + self.result_class, + shape, + dtype_name, + self.device_id, + stream_holder, + verify_strides=False, + strides=strides, ) if log_debug: self.logger.debug(f"The auxiliary output tensor '{name}' has been created.") @@ -1806,7 +2082,8 @@ def execute(self, *, algorithm=None, release_workspace=False, stream=None): if log_debug: self.logger.debug("Beginning output (empty) tensor creation...") self.logger.debug( - f"The output tensor shape = {self.result_traits.result_shape} with strides = {self.result_traits.result_strides} and data type '{self.d_dtype_name}'." + f"The output tensor shape = {self.result_traits.result_shape} with strides = " + f"{self.result_traits.result_strides} and data type '{self.d_dtype_name}'." ) self.result = utils.create_empty_tensor( self.result_class, @@ -1814,7 +2091,8 @@ def execute(self, *, algorithm=None, release_workspace=False, stream=None): self.d_dtype_name, self.device_id, stream_holder, - self.result_traits.result_strides, + verify_strides=False, + strides=self.result_traits.result_strides, ) if log_debug: self.logger.debug("The output (empty) tensor has been created.") @@ -1824,7 +2102,8 @@ def execute(self, *, algorithm=None, release_workspace=False, stream=None): algorithm_struct = self.algorithms_buffer[0]["algo"] if log_info: self.logger.info( - f"The highest ranked algorithm in the plan (algorithm id = {self.algorithm_objects[0].algorithm_id}) will be used." + "The highest ranked algorithm in the plan (algorithm id = " + f"{self.algorithm_objects[0].algorithm_id}) will be used." ) else: if algorithm not in self.algorithm_objects: @@ -1874,9 +2153,7 @@ def execute(self, *, algorithm=None, release_workspace=False, stream=None): if self.memory_space == "cpu": out = self.result.to("cpu", stream_holder=stream_holder) # Copy auxiliary output to CPU. - aux = { - name: self.epilog_outputs[name].to("cpu", stream_holder=stream_holder) for name in self.epilog_outputs - } + aux = {name: self.epilog_outputs[name].to("cpu", stream_holder=stream_holder) for name in self.epilog_outputs} else: out = self.result.tensor # Return the unwrapped epilog output tensor(s). @@ -1895,16 +2172,18 @@ def execute(self, *, algorithm=None, release_workspace=False, stream=None): def free(self): """Free Matmul resources. - It is recommended that the :class:`Matmul` object be used within a context, but if it is not possible then this - method must be called explicitly to ensure that the matrix multiplication resources (especially internal library objects) are - properly cleaned up. + It is recommended that the :class:`Matmul` object be used within a context, but if + it is not possible then this method must be called explicitly to ensure that the + matrix multiplication resources (especially internal library objects) are properly + cleaned up. 
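When a context manager cannot be used, the explicit-cleanup pattern below is one way to guarantee that :meth:`free` runs. This is a sketch only, with default options assumed and operands ``a`` and ``b`` as in the earlier examples; the :meth:`autotune` call is optional and simply reorders the planned algorithms by measured execution time:

>>> mm = nvmath.linalg.advanced.Matmul(a, b)
>>> try:
...     algorithms = mm.plan()
...     mm.autotune(iterations=5)  # optional: rank algorithms by measured time
...     r = mm.execute()
... finally:
...     mm.free()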
""" if not self.valid_state: return try: - # Future operations on the workspace stream should be ordered after the computation. + # Future operations on the workspace stream should be ordered after the + # computation. if self.last_compute_event is not None: self.workspace_stream.wait_event(self.last_compute_event) @@ -1941,34 +2220,55 @@ def matmul( algorithm=None, stream=None, ): - r""" - Perform the specified matrix multiplication computation :math:`F(\alpha a @ b + \beta c)`, where :math:`F` is the epilog. This function-form is a wrapper around the - stateful :class:`Matmul` object APIs and is meant for *single* use (the user needs to perform just one matrix multiplication, for - example), in which case there is no possibility of amortizing preparatory costs. + """ + Perform the specified matrix multiplication computation :math:`F(\\alpha a @ b + \\beta + c)`, where :math:`F` is the epilog. This function-form is a wrapper around the stateful + :class:`Matmul` object APIs and is meant for *single* use (the user needs to perform + just one matrix multiplication, for example), in which case there is no possibility of + amortizing preparatory costs. - Detailed information on what's happening within this function can be obtained by passing in a :class:`logging.Logger` object - to :class:`MatmulOptions` or by setting the appropriate options in the root logger object, which is used by default: + Detailed information on what's happening within this function can be obtained by passing + in a :class:`logging.Logger` object to :class:`MatmulOptions` or by setting the + appropriate options in the root logger object, which is used by default: >>> import logging - >>> logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)-8s %(message)s', datefmt='%m-%d %H:%M:%S') + >>> logging.basicConfig( + ... level=logging.INFO, + ... format="%(asctime)s %(levelname)-8s %(message)s", + ... datefmt="%m-%d %H:%M:%S", + ... ) - A user can select the desired logging level and, in general, take advantage of all of the functionality offered by the Python `logging` module. + A user can select the desired logging level and, in general, take advantage of all of + the functionality offered by the Python `logging` module. Args: a: {a} + b: {b} + c: {c} + {c_admonitions} + alpha: {alpha} - beta: {beta} - from a previously planned and autotuned matrix multiplication. + + beta: {beta} from a previously planned and autotuned matrix multiplication. + epilog: {epilog} + epilog_inputs: {epilog_inputs} + qualifiers: {qualifiers} + options: {options} + preferences: {preferences} - algorithm: An object of type :class:`Algorithm` objects can be directly provided to bypass planning, if desired. The algorithm object must - be compatible with the matrix multiplication. A typical use for this option is to provide an algorithm that has been serialized - (pickled) from a previously planned and autotuned matrix multiplication. + + algorithm: An object of type :class:`Algorithm` objects can be directly provided to + bypass planning, if desired. The algorithm object must be compatible with the + matrix multiplication. A typical use for this option is to provide an algorithm + that has been serialized (pickled) from a previously planned and autotuned + matrix multiplication. 
+ stream: {stream} Returns: @@ -1978,7 +2278,8 @@ def matmul( {semantics} See Also: - :class:`Matmul`, :class:`MatmulOptions`, :class:`MatmulEpilog`, :class:`MatmulPlanPreferences` + :class:`Matmul`, :class:`MatmulOptions`, :class:`MatmulEpilog`, + :class:`MatmulPlanPreferences` Examples: @@ -1992,11 +2293,13 @@ def matmul( >>> b = cp.random.rand(K, N, dtype=cp.float32) >>> c = cp.random.rand(M, N, dtype=cp.float32) - Perform the operation :math:`\alpha A @ B + \beta C` using :func:`matmul`. The result `r` is also a CuPy float64 ndarray: + Perform the operation :math:`\\alpha A @ B + \\beta C` using :func:`matmul`. The + result `r` is also a CuPy float64 ndarray: >>> r = nvmath.linalg.advanced.matmul(a, b, c, alpha=1.23, beta=0.74) - An epilog can be used as well. Here we perform :math:`RELU(\alpha A @ B + \beta C)`: + An epilog can be used as well. Here we perform + :math:`RELU(\\alpha A @ B + \\beta C)`: >>> epilog = nvmath.linalg.advanced.MatmulEpilog.RELU >>> r = nvmath.linalg.advanced.matmul(a, b, c, alpha=1.23, beta=0.74, epilog=epilog) @@ -2009,16 +2312,18 @@ def matmul( See `MatmulOptions` for the complete list of available options. - The package current stream is used by default, but a stream can be explicitly provided to the Matmul operation. This can be done if the - operands are computed on a different stream, for example: + The package current stream is used by default, but a stream can be explicitly + provided to the Matmul operation. This can be done if the operands are computed on a + different stream, for example: >>> s = cp.cuda.Stream() >>> with s: - ... a = cp.random.rand(M, K) - ... b = cp.random.rand(K, N) - >>> nvmath.linalg.advanced.matmul(a, b, stream=s) + ... a = cp.random.rand(M, K) + ... b = cp.random.rand(K, N) + >>> r = nvmath.linalg.advanced.matmul(a, b, stream=s) - The operation above runs on stream `s` and is ordered with respect to the input computation. + The operation above runs on stream `s` and is ordered with respect to the input + computation. Create NumPy ndarrays on the CPU. @@ -2026,14 +2331,18 @@ def matmul( >>> a = np.random.rand(M, K) >>> b = np.random.rand(K, N) - Provide the NumPy ndarrays to :func:`matmul`, with the result also being a NumPy ndarray: + Provide the NumPy ndarrays to :func:`matmul`, with the result also being a NumPy + ndarray: >>> r = nvmath.linalg.advanced.matmul(a, b) Notes: - - This function is a convenience wrapper around :class:`Matmul` and and is specifically meant for *single* use. + - This function is a convenience wrapper around :class:`Matmul` and and is + specifically meant for *single* use. - Further examples can be found in the `nvmath/examples/linalg/advanced/matmul `_ directory. + Further examples can be found in the `nvmath/examples/linalg/advanced/matmul + `_ + directory. """ # Set algorithm limit to 1, but take a copy first if needed. 
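The ``algorithm`` argument described above can be used to bypass planning entirely. The sketch below assumes, as the documentation states, that algorithm objects can be pickled; it serializes the best autotuned algorithm from a stateful :class:`Matmul` and reuses it in a later function-form call:

>>> import pickle
>>> with nvmath.linalg.advanced.Matmul(a, b) as mm:
...     algorithms = mm.plan()
...     mm.autotune()
...     serialized = pickle.dumps(mm.algorithms[0])  # best algorithm after autotuning
>>> best = pickle.loads(serialized)
>>> r = nvmath.linalg.advanced.matmul(a, b, algorithm=best)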
@@ -2051,9 +2360,7 @@ def matmul( algorithms = [algorithm] # The type of algorithm should be algorithm.Algorithm and will be checked in plan() with Matmul(a, b, c=c, alpha=alpha, beta=beta, qualifiers=qualifiers, options=options, stream=stream) as mm: - mm.plan( - preferences=preferences, epilog=epilog, epilog_inputs=epilog_inputs, stream=stream, algorithms=algorithms - ) + mm.plan(preferences=preferences, epilog=epilog, epilog_inputs=epilog_inputs, stream=stream, algorithms=algorithms) r = mm.execute(stream=stream) diff --git a/nvmath/memory.py b/nvmath/memory.py index 231dd58..2b4f576 100644 --- a/nvmath/memory.py +++ b/nvmath/memory.py @@ -67,13 +67,15 @@ def memalloc(self, size): size: The size of the memory buffer in bytes. Returns: - An object that owns the allocated memory and is responsible for releasing it (to the OS or a pool). The object must - have an attribute named ``device_ptr``, ``device_pointer``, or ``ptr`` specifying the pointer to the allocated memory + An object that owns the allocated memory and is responsible for releasing it (to + the OS or a pool). The object must have an attribute named ``device_ptr``, + ``device_pointer``, or ``ptr`` specifying the pointer to the allocated memory buffer. See :class:`MemoryPointer` for an example interface. Note: - Objects of type :class:`numba.cuda.MemoryPointer` as well as :class:`cupy.cuda.MemoryPointer` meet the requirements - listed above for the device memory pointer object. + Objects of type :class:`numba.cuda.MemoryPointer` as well as + :class:`cupy.cuda.MemoryPointer` meet the requirements listed above for the + device memory pointer object. """ raise NotImplementedError @@ -105,7 +107,8 @@ def memalloc(self, size): def create_finalizer(): def finalizer(): - # Note: With UVA there is no need to switch context to the device the memory belongs to before calling free(). + # Note: With UVA there is no need to switch context to the device the memory + # belongs to before calling free(). 
cp.cuda.runtime.free(device_ptr) self.logger.debug(f"_RawCUDAMemoryManager (release memory): ptr = {device_ptr}") diff --git a/pyproject.toml b/pyproject.toml index ce0b7d4..a2c64e1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -65,10 +65,18 @@ sysctk12 = { file = ["requirements/pip/nvmath-python-sysctk12.txt"] } sysctk12-dx = { file = ["requirements/pip/nvmath-python-sysctk12-dx.txt"] } [tool.ruff] -line-length = 120 +line-length = 128 # Don't format autogenerated files exclude = ["nvmath/device/curand_kernel.py"] +[tool.ruff.format] +docstring-code-format = true +docstring-code-line-length = 88 + +[tool.ruff.lint.pycodestyle] +max-doc-length = 92 +max-line-length = 128 + [tool.ruff.lint] select = [ # pycodestyle Error @@ -96,15 +104,10 @@ ignore = [ "B028", "B904", "I001", - "SIM102", "SIM105", "SIM108", - "SIM114", - "SIM211", # Ignore module-import-not-at-top-of-file "E402", - # Ignore line-too-long - "E501", # Ignore multiple-statements-on-one-line-colon "E701", # Ignore multiple-statements-on-one-line-semicolon @@ -121,12 +124,6 @@ ignore = [ "F403", # Ignore undefined-local-with-import-star-usage "F405", - # Ignore redefined-while-unused - "F811", - # Ignore undefined-name - "F821", - # Ignore unused-variable - "F841", ] fixable = ["ALL"] diff --git a/requirements/README.md b/requirements/README.md index e209add..ff4ef5b 100644 --- a/requirements/README.md +++ b/requirements/README.md @@ -1,13 +1,14 @@ # nvmath-python requirements -Dependencies are organized with requirements.txt files which can be use to set up virtualenvs with all required -development tools to build docs, run tests, and build redistributable wheels. Different requirements are necessary for -installation with [pip](https://pip.pypa.io/en/stable/) vs [conda](https://docs.conda.io/en/latest/). +Dependencies are organized with requirements.txt files which can be use to set up +virtualenvs with all required development tools to build docs, run tests, and build +redistributable wheels. Different requirements are necessary for installation with +[pip](https://pip.pypa.io/en/stable/) vs [conda](https://docs.conda.io/en/latest/). ## Pip: Top-level package requirements files -Prefer using these `requirements/pip-.txt` files for development in pip managed virtualenvs. These include all -relevant requirements sets and package extras. +Prefer using these `requirements/pip-.txt` files for development in pip managed +virtualenvs. These include all relevant requirements sets and package extras. ### Pip: Supported configurations for wheels @@ -22,11 +23,13 @@ relevant requirements sets and package extras. ### Pip: Development usage -The requirements files provide dependencies only. The nvmath-python package itself must also be installed, typically in -editable mode for development. Extras are not required to be specified on the editable install assuming the right -requirements.txt has been installed in virtualenv. +The requirements files provide dependencies only. The nvmath-python package itself must +also be installed, typically in editable mode for development. Extras are not required to +be specified on the editable install assuming the right requirements.txt has been installed +in virtualenv. -*Note*: For testing wheel/RPATH support locally, currently it requires to build in the non-editable mode (no `-e` flag). +*Note*: For testing wheel/RPATH support locally, currently it requires to build in the +non-editable mode (no `-e` flag). 
#### Install with pip @@ -49,8 +52,8 @@ $ pipenv shell ### Pip: Fine-grained requirements -Requirements for specific functionality are broken out into subsets. These fine-grained requirements are included by -the top-level requirements sets. +Requirements for specific functionality are broken out into subsets. These fine-grained +requirements are included by the top-level requirements sets. | requirements.txt | Functionality | | ---------------- | ------- | diff --git a/tests/docstring_tests/test_docstrings.py b/tests/docstring_tests/test_docstrings.py new file mode 100644 index 0000000..19ccb2e --- /dev/null +++ b/tests/docstring_tests/test_docstrings.py @@ -0,0 +1,23 @@ +import contextlib +import os +import sphinx.cmd.build + + +@contextlib.contextmanager +def os_cd(path): + old = os.getcwd() + os.chdir(path) + try: + yield + finally: + os.chdir(old) + + +def test_docstrings(): + with os_cd("docs/sphinx"): + ret = sphinx.cmd.build.main(["-M", "doctest", ".", os.path.join("../..", "docs/_build/doctest")]) + assert ret == 0 + + +if __name__ == "__main__": + test_docstrings() diff --git a/tests/example_tests/fft_tests/test_fft_samples.py b/tests/example_tests/fft_tests/test_fft_samples.py index 5c17e89..ba36d1a 100644 --- a/tests/example_tests/fft_tests/test_fft_samples.py +++ b/tests/example_tests/fft_tests/test_fft_samples.py @@ -45,9 +45,7 @@ def _has_numba(): # The HAS_CUDA filtering may be extended for runs with no CUFFT at all skip_cufft_jit_callback = ( - not HAS_CUFFT - or not _has_numba() - or bindings._internal.cufft._inspect_function_pointer("__cufftXtSetJITCallback") == 0 + not HAS_CUFFT or not _has_numba() or bindings._internal.cufft._inspect_function_pointer("__cufftXtSetJITCallback") == 0 ) samples_path = os.path.join(os.path.dirname(__file__), "..", "..", "..", "examples", "fft") diff --git a/tests/nvmath_tests/device/cpp_conv.py b/tests/nvmath_tests/device/cpp_conv.py index 68a4509..5104865 100644 --- a/tests/nvmath_tests/device/cpp_conv.py +++ b/tests/nvmath_tests/device/cpp_conv.py @@ -122,9 +122,7 @@ def run(self, input, filter, reference, ncycles): assert batch % self._ffts_per_block == 0 num_blocks = batch // self._ffts_per_block - print( - f"FFTConvCpp Batch {batch}, ffts_per_block {self._ffts_per_block}, num_blocks {num_blocks}, ncycles {ncycles}" - ) + print(f"FFTConvCpp Batch {batch}, ffts_per_block {self._ffts_per_block}, num_blocks {num_blocks}, ncycles {ncycles}") assert num_blocks * self._ffts_per_block == batch # Create input diff --git a/tests/nvmath_tests/device/cpp_gemm_loop.py b/tests/nvmath_tests/device/cpp_gemm_loop.py index 5d858a2..e5bb6c4 100644 --- a/tests/nvmath_tests/device/cpp_gemm_loop.py +++ b/tests/nvmath_tests/device/cpp_gemm_loop.py @@ -3,7 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 from cuda import cuda -from .helpers import CHECK_CUDA, _TOLERANCE, l2error, free_array, convert_to_cuda_array, free_array, copy_to_cupy +from .helpers import CHECK_CUDA, _TOLERANCE, l2error, convert_to_cuda_array, free_array, copy_to_cupy import numpy as np from .helpers_cpp import run_and_time, compile_cpp_kernel import cupy diff --git a/tests/nvmath_tests/device/curand/distributions.py b/tests/nvmath_tests/device/curand/distributions.py index 3578fa2..3e919ea 100644 --- a/tests/nvmath_tests/device/curand/distributions.py +++ b/tests/nvmath_tests/device/curand/distributions.py @@ -40,7 +40,8 @@ def _get_curand_function(self, dtype_name: str, group_size: int) -> Callable: def curand(self, dtype_name: str, group_size: int) -> tuple[Callable, tuple]: """ - 
Returns curand distribution function for specific dtype and group size, together with its extra arguments. + Returns curand distribution function for specific dtype and group size, together + with its extra arguments. """ raise NotImplementedError diff --git a/tests/nvmath_tests/device/curand/test_random.py b/tests/nvmath_tests/device/curand/test_random.py index 8f865ce..52fa09b 100644 --- a/tests/nvmath_tests/device/curand/test_random.py +++ b/tests/nvmath_tests/device/curand/test_random.py @@ -13,9 +13,10 @@ """ This set of tests checks random device APIs. -Running the tests requires compiling a setup kernel for every generator, and random number generation kernel for all -combinations of generator, distribution, dtype and group size. As a result, due to lazy jit compilation, the tests will -be running slower at the beginning and then become much faster. +Running the tests requires compiling a setup kernel for every generator, and random number +generation kernel for all combinations of generator, distribution, dtype and group size. As +a result, due to lazy jit compilation, the tests will be running slower at the beginning and +then become much faster. """ @@ -84,7 +85,8 @@ def test(x): ) def test_seeds(distribution, dtype_name, generator, nsamples, threads, blocks, group_size): """ - Tests if seeding works, i.e. same seeds results in same sequences and different seeds result in different sequences. + Tests if seeding works, i.e. same seeds results in same sequences and different seeds + result in different sequences. """ def generate_with_seed(seed): @@ -124,7 +126,8 @@ def generate_with_seed(seed): ) def test_skipahead(generator, threads, blocks): """ - Tests if seeding works, i.e. same seeds results in same sequences and different seeds result in different sequences. + Tests if seeding works, i.e. same seeds results in same sequences and different seeds + result in different sequences. """ seed = 765 @@ -196,7 +199,8 @@ def gen_all(): ) def test_skipahead_sequence(generator, threads, blocks): """ - Tests if seeding works, i.e. same seeds results in same sequences and different seeds result in different sequences. + Tests if seeding works, i.e. same seeds results in same sequences and different seeds + result in different sequences. """ seed = 100 diff --git a/tests/nvmath_tests/device/curand/utils.py b/tests/nvmath_tests/device/curand/utils.py index 511cc3f..a0e2549 100644 --- a/tests/nvmath_tests/device/curand/utils.py +++ b/tests/nvmath_tests/device/curand/utils.py @@ -76,7 +76,8 @@ def generate_random_numbers( ): """ Runs numba kernel generating random numbers from the specified distribution and states. - Each thread generates `nsample` values. The result is a numpy array of shape (threads*blocks, nsamples). + Each thread generates `nsample` values. The result is a numpy array of shape + (threads*blocks, nsamples). """ assert nsamples % group_size == 0 nthreads = blocks * threads @@ -151,8 +152,10 @@ def prepare_states_and_generate( ): """ A wrapper for prepare_states and generate_random_numbers. - Creates states and runs numba kernel generating random numbers from the specified distribution. - Each thread generates `nsample` values. The result is a numpy array of shape (threads*blocks, nsamples). + + Creates states and runs numba kernel generating random numbers from the specified + distribution. Each thread generates `nsample` values. The result is a numpy array of + shape (threads*blocks, nsamples). 
""" states = prepare_states(generator=generator, seed=seed, threads=threads, blocks=blocks, offset=offset) return generate_random_numbers( diff --git a/tests/nvmath_tests/device/numba_gemm_batched.py b/tests/nvmath_tests/device/numba_gemm_batched.py index 4fb753e..fbff205 100644 --- a/tests/nvmath_tests/device/numba_gemm_batched.py +++ b/tests/nvmath_tests/device/numba_gemm_batched.py @@ -16,7 +16,6 @@ def __init__(self, size, precision, data_type, block_size, repeat): assert precision == np.float32 assert data_type == "real" - start = time.time() MM = matmul( size=size, data_type="real", @@ -26,8 +25,6 @@ def __init__(self, size, precision, data_type, block_size, repeat): execution="Block", compiler="numba", ) - stop = time.time() - t_numba_jit_s = stop - start (m, n, k) = size diff --git a/tests/nvmath_tests/device/test_cublasdx_generic.py b/tests/nvmath_tests/device/test_cublasdx_generic.py index 6f8699c..0b3124c 100644 --- a/tests/nvmath_tests/device/test_cublasdx_generic.py +++ b/tests/nvmath_tests/device/test_cublasdx_generic.py @@ -9,7 +9,6 @@ CodeType, ComputeCapability, matmul, - BlasOptions, TransposeMode, LeadingDimension, BlasOptions, @@ -64,9 +63,7 @@ def test_third_party_code(): assert MM.max_threads_per_block <= 1024 -@pytest.mark.parametrize( - "ta, tb", list(itertools.product(["non_transposed", "transposed", "conj_transposed"], repeat=2)) -) +@pytest.mark.parametrize("ta, tb", list(itertools.product(["non_transposed", "transposed", "conj_transposed"], repeat=2))) def test_transpose_mode(ta, tb): MM1 = matmul( size=(2, 2, 2), @@ -241,7 +238,7 @@ def test_negative(opt, value): else: opts[opt] = value with pytest.raises(Exception): - MM = matmul(**opts) + MM = matmul(**opts) # noqa: F841 @pytest.mark.parametrize("code_type", [SM70, SM72, SM75, SM80, SM86, SM89, SM90]) diff --git a/tests/nvmath_tests/device/test_cublasdx_numba_perf.py b/tests/nvmath_tests/device/test_cublasdx_numba_perf.py index 6c2374d..b163783 100644 --- a/tests/nvmath_tests/device/test_cublasdx_numba_perf.py +++ b/tests/nvmath_tests/device/test_cublasdx_numba_perf.py @@ -51,9 +51,9 @@ def fun(a, b): return out t_cupy = time_check_cupy(fun, reference, ncycles, a, b) - t_cpp = MatmulBatchedCpp( - size=size, precision=np.float32, data_type="real", sm=SM, block_size=32, repeat=repeat - ).run(a=a, b=b, reference=reference, ncycles=ncycles) + t_cpp = MatmulBatchedCpp(size=size, precision=np.float32, data_type="real", sm=SM, block_size=32, repeat=repeat).run( + a=a, b=b, reference=reference, ncycles=ncycles + ) t_numba = NumbaGemmBatched(size=size, precision=np.float32, data_type="real", block_size=32, repeat=repeat).run( a=a, b=b, reference=reference, ncycles=ncycles ) diff --git a/tests/nvmath_tests/device/test_cufftdx_generic.py b/tests/nvmath_tests/device/test_cufftdx_generic.py index 342e3db..4f32eb9 100644 --- a/tests/nvmath_tests/device/test_cufftdx_generic.py +++ b/tests/nvmath_tests/device/test_cufftdx_generic.py @@ -43,12 +43,10 @@ def test_third_party_code(): assert all([len(code.data) > 0 for code in FFT.codes]) -# 2 | 2, 2^2, ... | 2, 2^2, ... | 2, 2^2, ... +# 2 | 2, 2^2, ... | 2, 2^2, ... | 2, 2^2, ... 
# noqa: W505 @pytest.mark.parametrize("size, mincount", [(2, 1), (16, 4), (128, 4), (2048, 4)]) def test_knobs_c2c_ept_fpb(size, mincount): - FO = FFTOptions( - fft_type="c2c", size=size, precision=np.float32, direction="forward", code_type=SM80, execution="Block" - ) + FO = FFTOptions(fft_type="c2c", size=size, precision=np.float32, direction="forward", code_type=SM80, execution="Block") valid = FO.valid("elements_per_thread", "ffts_per_block") assert len(list(valid)) >= mincount for ept, fpb in valid: @@ -63,9 +61,7 @@ def test_knobs_c2c_ept_fpb(size, mincount): # 3, 3^2 | 11 |2, 2^2, 2^3, ... @pytest.mark.parametrize("size, mincount", [(9, 2), (121, 1), (2048, 4)]) def test_knobs_c2c_ept_only(size, mincount): - FO = FFTOptions( - fft_type="c2c", size=size, precision=np.float32, direction="forward", code_type=SM80, execution="Block" - ) + FO = FFTOptions(fft_type="c2c", size=size, precision=np.float32, direction="forward", code_type=SM80, execution="Block") valid = FO.valid("elements_per_thread") assert len(list(valid)) >= mincount for (ept,) in valid: @@ -78,9 +74,7 @@ def test_knobs_c2c_ept_only(size, mincount): @pytest.mark.parametrize("size, mincount", [(7, 1), (36, 1), (2048, 1)]) def test_knobs_c2c_fpb_only(size, mincount): - FO = FFTOptions( - fft_type="c2c", size=size, precision=np.float32, direction="forward", code_type=SM80, execution="Block" - ) + FO = FFTOptions(fft_type="c2c", size=size, precision=np.float32, direction="forward", code_type=SM80, execution="Block") valid = FO.valid("ffts_per_block") assert len(list(valid)) >= mincount for (fpb,) in valid: @@ -132,9 +126,7 @@ def test_knobs_r2c_c2r(fft_type, complex_layout, real_mode): def test_knobs_0(): - FO = FFTOptions( - fft_type="c2c", size=4, precision=np.float32, direction="forward", code_type=SM80, execution="Block" - ) + FO = FFTOptions(fft_type="c2c", size=4, precision=np.float32, direction="forward", code_type=SM80, execution="Block") val = FO.valid("elements_per_thread", "ffts_per_block") print(val) @@ -236,9 +228,7 @@ def test_partial_fft(): def test_valid_knobs_0(): - FO = FFTOptions( - fft_type="c2c", size=32, precision=np.float32, direction="forward", code_type=SM80, execution="Block" - ) + FO = FFTOptions(fft_type="c2c", size=32, precision=np.float32, direction="forward", code_type=SM80, execution="Block") valids = FO.valid("elements_per_thread", "ffts_per_block") count = 0 for ept, bpb in valids: @@ -261,9 +251,7 @@ def test_valid_knobs_0(): def test_valid_knobs_1(): - FO = FFTOptions( - fft_type="c2c", size=32, precision=np.float32, direction="forward", code_type=SM80, execution="Block" - ) + FO = FFTOptions(fft_type="c2c", size=32, precision=np.float32, direction="forward", code_type=SM80, execution="Block") valids = FO.valid("elements_per_thread", "ffts_per_block") count = 0 for ept, bpb in valids: @@ -320,14 +308,12 @@ def test_negative(opt, value): else: opts[opt] = value with pytest.raises(Exception): - FFT = fft(**opts) + FFT = fft(**opts) # noqa: F841 @pytest.mark.parametrize("code_type", [SM70, SM72, SM75, SM80, SM86, SM89, SM90]) def test_sm(code_type): - FFT = fft( - fft_type="c2c", size=256, precision=np.float32, direction="forward", code_type=code_type, execution="Block" - ) + FFT = fft(fft_type="c2c", size=256, precision=np.float32, direction="forward", code_type=code_type, execution="Block") assert all([isinstance(code.data, bytes) for code in FFT.codes]) assert all([len(code.data) > 0 for code in FFT.codes]) @@ -358,9 +344,7 @@ def test_value_type(precision, value_type): 
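# Illustrative sketch, not part of the patch: the knob-enumeration pattern the
# reformatted tests above exercise. FFTOptions.valid() lists the
# elements_per_thread / ffts_per_block combinations supported for a given
# problem definition; each pair can then be passed to nvmath.device.fft() to
# compile a concrete block FFT. The ("lto", (8, 0)) tuple stands in for the
# SM80 constant used in the tests (the tuple form is the one accepted in
# test_sm_tuple below), and size=128 is an arbitrary example value.
import numpy as np

from nvmath.device import FFTOptions

FO = FFTOptions(
    fft_type="c2c",
    size=128,
    precision=np.float32,
    direction="forward",
    code_type=("lto", (8, 0)),
    execution="Block",
)
for ept, fpb in FO.valid("elements_per_thread", "ffts_per_block"):
    print(f"elements_per_thread={ept}, ffts_per_block={fpb}")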
@pytest.mark.parametrize("code_type", [("lto", (7, 0)), ("lto", (8, 0))]) def test_sm_tuple(code_type): - FFT = fft( - fft_type="c2c", size=256, precision=np.float32, direction="forward", code_type=code_type, execution="Block" - ) + FFT = fft(fft_type="c2c", size=256, precision=np.float32, direction="forward", code_type=code_type, execution="Block") assert all([isinstance(code.data, bytes) for code in FFT.codes]) assert all([len(code.data) > 0 for code in FFT.codes]) assert all([code.code_type.kind == code_type[0] for code in FFT.codes]) diff --git a/tests/nvmath_tests/device/test_cufftdx_numba.py b/tests/nvmath_tests/device/test_cufftdx_numba.py index cb77169..5941e40 100644 --- a/tests/nvmath_tests/device/test_cufftdx_numba.py +++ b/tests/nvmath_tests/device/test_cufftdx_numba.py @@ -141,12 +141,8 @@ def convert_output(fft_type, precision, output_d): TEST_CASES.append(("r2c", 32, np.float64, "forward", "thread", 16, None)) # real_mode Normal -TEST_CASES.append( - ("r2c", 4, np.float16, "forward", "thread", None, {"complex_layout": "packed", "real_mode": "normal"}) -) -TEST_CASES.append( - ("r2c", 8, np.float16, "forward", "thread", None, {"complex_layout": "packed", "real_mode": "normal"}) -) +TEST_CASES.append(("r2c", 4, np.float16, "forward", "thread", None, {"complex_layout": "packed", "real_mode": "normal"})) +TEST_CASES.append(("r2c", 8, np.float16, "forward", "thread", None, {"complex_layout": "packed", "real_mode": "normal"})) TEST_CASES.append(("r2c", 16, np.float16, "forward", "smem", None, {"complex_layout": "packed", "real_mode": "normal"})) TEST_CASES.append(("r2c", 32, np.float16, "forward", "smem", None, {"complex_layout": "packed", "real_mode": "normal"})) @@ -155,18 +151,10 @@ def convert_output(fft_type, precision, output_d): TEST_CASES.append(("r2c", 17, np.float32, "forward", "smem", None, {"complex_layout": "full", "real_mode": "normal"})) TEST_CASES.append(("r2c", 33, np.float32, "forward", "smem", None, {"complex_layout": "full", "real_mode": "normal"})) -TEST_CASES.append( - ("c2r", 64, np.float16, "inverse", "thread", None, {"complex_layout": "packed", "real_mode": "normal"}) -) -TEST_CASES.append( - ("c2r", 128, np.float16, "inverse", "thread", None, {"complex_layout": "packed", "real_mode": "normal"}) -) -TEST_CASES.append( - ("c2r", 256, np.float16, "inverse", "smem", None, {"complex_layout": "packed", "real_mode": "normal"}) -) -TEST_CASES.append( - ("c2r", 512, np.float16, "inverse", "smem", None, {"complex_layout": "packed", "real_mode": "normal"}) -) +TEST_CASES.append(("c2r", 64, np.float16, "inverse", "thread", None, {"complex_layout": "packed", "real_mode": "normal"})) +TEST_CASES.append(("c2r", 128, np.float16, "inverse", "thread", None, {"complex_layout": "packed", "real_mode": "normal"})) +TEST_CASES.append(("c2r", 256, np.float16, "inverse", "smem", None, {"complex_layout": "packed", "real_mode": "normal"})) +TEST_CASES.append(("c2r", 512, np.float16, "inverse", "smem", None, {"complex_layout": "packed", "real_mode": "normal"})) TEST_CASES.append(("c2r", 9, np.float64, "inverse", "thread", None, {"complex_layout": "full", "real_mode": "normal"})) TEST_CASES.append(("c2r", 13, np.float64, "inverse", "thread", None, {"complex_layout": "full", "real_mode": "normal"})) @@ -178,37 +166,21 @@ def convert_output(fft_type, precision, output_d): TEST_CASES.append(("c2r", 8, np.float64, "inverse", "smem", None, {"complex_layout": "natural", "real_mode": "folded"})) TEST_CASES.append(("c2r", 16, np.float16, "inverse", "smem", None, {"complex_layout": "packed", 
"real_mode": "folded"})) TEST_CASES.append(("c2r", 32, np.float32, "inverse", "thread", None, {"complex_layout": "full", "real_mode": "folded"})) -TEST_CASES.append( - ("c2r", 64, np.float64, "inverse", "thread", None, {"complex_layout": "natural", "real_mode": "folded"}) -) -TEST_CASES.append( - ("c2r", 128, np.float16, "inverse", "smem", None, {"complex_layout": "packed", "real_mode": "folded"}) -) +TEST_CASES.append(("c2r", 64, np.float64, "inverse", "thread", None, {"complex_layout": "natural", "real_mode": "folded"})) +TEST_CASES.append(("c2r", 128, np.float16, "inverse", "smem", None, {"complex_layout": "packed", "real_mode": "folded"})) TEST_CASES.append(("c2r", 256, np.float32, "inverse", "smem", None, {"complex_layout": "full", "real_mode": "folded"})) -TEST_CASES.append( - ("c2r", 512, np.float64, "inverse", "thread", None, {"complex_layout": "natural", "real_mode": "folded"}) -) -TEST_CASES.append( - ("c2r", 1024, np.float16, "inverse", "thread", None, {"complex_layout": "full", "real_mode": "folded"}) -) +TEST_CASES.append(("c2r", 512, np.float64, "inverse", "thread", None, {"complex_layout": "natural", "real_mode": "folded"})) +TEST_CASES.append(("c2r", 1024, np.float16, "inverse", "thread", None, {"complex_layout": "full", "real_mode": "folded"})) TEST_CASES.append(("r2c", 4, np.float32, "forward", "thread", None, {"complex_layout": "full", "real_mode": "folded"})) TEST_CASES.append(("r2c", 8, np.float64, "forward", "smem", None, {"complex_layout": "natural", "real_mode": "folded"})) TEST_CASES.append(("r2c", 16, np.float16, "forward", "smem", None, {"complex_layout": "packed", "real_mode": "folded"})) TEST_CASES.append(("r2c", 32, np.float32, "forward", "thread", None, {"complex_layout": "full", "real_mode": "folded"})) -TEST_CASES.append( - ("r2c", 64, np.float64, "forward", "thread", None, {"complex_layout": "natural", "real_mode": "folded"}) -) -TEST_CASES.append( - ("r2c", 128, np.float16, "forward", "smem", None, {"complex_layout": "packed", "real_mode": "folded"}) -) +TEST_CASES.append(("r2c", 64, np.float64, "forward", "thread", None, {"complex_layout": "natural", "real_mode": "folded"})) +TEST_CASES.append(("r2c", 128, np.float16, "forward", "smem", None, {"complex_layout": "packed", "real_mode": "folded"})) TEST_CASES.append(("r2c", 256, np.float32, "forward", "smem", None, {"complex_layout": "full", "real_mode": "folded"})) -TEST_CASES.append( - ("r2c", 512, np.float64, "forward", "thread", None, {"complex_layout": "natural", "real_mode": "folded"}) -) -TEST_CASES.append( - ("r2c", 1024, np.float16, "forward", "thread", None, {"complex_layout": "full", "real_mode": "folded"}) -) +TEST_CASES.append(("r2c", 512, np.float64, "forward", "thread", None, {"complex_layout": "natural", "real_mode": "folded"})) +TEST_CASES.append(("r2c", 1024, np.float16, "forward", "thread", None, {"complex_layout": "full", "real_mode": "folded"})) # Supports: Block APIs, C2R/R2C/C2C, all precision, all real_fft_options diff --git a/tests/nvmath_tests/device/test_cufftdx_numba_perf.py b/tests/nvmath_tests/device/test_cufftdx_numba_perf.py index 3cd5b3e..b059e7f 100644 --- a/tests/nvmath_tests/device/test_cufftdx_numba_perf.py +++ b/tests/nvmath_tests/device/test_cufftdx_numba_perf.py @@ -78,7 +78,8 @@ def run_conv_perf(test_cases): assert batch >= min_batch print( - f"Numba vs cupy host APIs vs CUDA C++ (convolution), batch = {batch}, size = {size}, precision = {precision}, bpb = {ffts_per_block}, ept = {elements_per_thread}" + f"Numba vs cupy host APIs vs CUDA C++ (convolution), batch = 
{batch}, size = {size}, precision = {precision}, " + f"bpb = {ffts_per_block}, ept = {elements_per_thread}" ) # diff --git a/tests/nvmath_tests/fft/test_default_backend.py b/tests/nvmath_tests/fft/test_default_backend.py index e343528..6db05e6 100644 --- a/tests/nvmath_tests/fft/test_default_backend.py +++ b/tests/nvmath_tests/fft/test_default_backend.py @@ -67,6 +67,6 @@ def wrapper(*args, **kwargs): assert_array_type(out, framework, mem_backend, get_fft_dtype(dtype)) assert_norm_close(out, get_fft_ref(signal, axes)) ifft_fn = nvmath.fft.ifft if is_complex(dtype) else nvmath.fft.irfft - iout = ifft_fn(out, axes=axes, options={"last_axis_size": "odd"}) + iout = ifft_fn(out, axes=axes, options={"last_axis_parity": "odd"}) assert_array_type(iout, framework, mem_backend, dtype) assert_norm_close(iout, get_scaled(signal, shape[0])) diff --git a/tests/nvmath_tests/fft/test_fft_with_hypothesis.py b/tests/nvmath_tests/fft/test_fft_with_hypothesis.py new file mode 100644 index 0000000..5dc0e4f --- /dev/null +++ b/tests/nvmath_tests/fft/test_fft_with_hypothesis.py @@ -0,0 +1,228 @@ +import itertools + +import cupy as cp +import numpy as np +import scipy.fft + +from hypothesis import given, reproduce_failure, strategies as st +from hypothesis.extra.numpy import arrays, array_shapes + +import nvmath + +from nvmath_tests.helpers import nvmath_seed + +# FIMXE: Lower minimum side length to 1 after refactoring of array traits +shape_st = array_shapes(min_dims=1, max_dims=3, min_side=2, max_side=256) + +element_properties = dict( + allow_nan=False, + allow_infinity=False, + allow_subnormal=True, + min_magnitude=0.0, + max_magnitude=1.0, + min_value=-0.5, + max_value=+0.5, +) + +c32_array_st = arrays( + np.complex64, + shape=shape_st, + elements=element_properties, +) +c64_array_st = arrays( + np.complex128, + shape=shape_st, + elements=element_properties, +) +f32_array_st = arrays( + np.float32, + shape=shape_st, + elements=element_properties, +) +f64_array_st = arrays( + np.float64, + shape=shape_st, + elements=element_properties, +) + +options_st = st.fixed_dictionaries( + { + "result_layout": st.sampled_from(["natural", "optimized"]), + "last_axis_parity": st.sampled_from(["odd", "even"]), + # TODO more options + } +) + +execution_st = st.sampled_from( + [ + "cuda", + "cpu", + nvmath.fft.ExecutionCUDA(), + nvmath.fft.ExecutionCPU(), + ] +) + +dtype_dict = { + ("fft", "complex64"): "complex64", + ("fft", "complex128"): "complex128", + ("ifft", "complex64"): "complex64", + ("ifft", "complex128"): "complex128", + ("rfft", "float64"): "complex128", + ("rfft", "float32"): "complex64", + ("irfft", "complex128"): "float64", + ("irfft", "complex64"): "float32", +} + +axes_strategy = st.sampled_from( + list( + itertools.chain( + itertools.permutations(range(3)), + itertools.permutations((0, 1)), + # itertools.permutations((0,2)), # axes must be contiguous + itertools.permutations((1, 2)), + itertools.combinations(range(3), r=1), + [None], + ) + ) +) + + +def is_axes_valid(a: np.ndarray, axes: tuple[int] | None, is_r2c: bool) -> bool: + if axes is None: + return True + return all( + [ + # axes must be in the range [0...N) where N is the number of dimensions + all((n >= 0 and n < a.ndim) for n in axes), + # axes must contain either the first or last dimension + a.ndim - 1 in axes or 0 in axes, + # the least significant dimension must be listed last for R2C,C2R + (not is_r2c) or max(axes) == axes[-1], + # FIXME: R2C only supports stride of 1 for last dimension? 
+ (not is_r2c) or max(axes) == a.ndim - 1, + ] + ) + + +def verify_result(result, ref, orig, fft_type): + assert result.dtype.name == dtype_dict[(fft_type, orig.dtype.name)] + tol = 1e2 * np.finfo(orig.dtype).eps + if np.linalg.norm(ref) == 0.0: + assert np.linalg.norm(result - ref) < tol, f"error greater than tolerance for input shape {orig.shape}" + else: + assert ( + np.linalg.norm(result - ref) / np.linalg.norm(ref) < tol + ), f"error greater than tolerance for input shape {orig.shape}" + + +@nvmath_seed() +@given(a=st.one_of(c32_array_st, c64_array_st), axes=axes_strategy, options=options_st, execution=execution_st) +def test_fft(a, axes, options, execution): + if not is_axes_valid(a, axes, is_r2c=False): + return + try: + b = nvmath.fft.fft(a, axes=axes, options=options, execution=execution) + except cp.cuda.memory.OutOfMemoryError: + # requiring too much GPU memory (>1GB), do nothing + assert a.nbytes > 2**30, "suspicious OOM when requesting not too much GPU memory!" + return + except RuntimeError as e: + if "The FFT CPU execution is not available" in str(e): + assert ( + execution == "cpu" + or isinstance(execution, nvmath.fft.ExecutionCPU) + or (execution is None and isinstance(a, np.ndarray)) + ) + return + raise e + if execution == "cuda" or isinstance(execution, nvmath.fft.ExecutionCUDA): + c = cp.asnumpy(cp.fft.fftn(cp.asarray(a), axes=axes, norm="backward")) + else: + c = scipy.fft.fftn(a, axes=axes, norm="backward") + verify_result(b, c, a, "fft") + + +@nvmath_seed() +@given(a=st.one_of(c32_array_st, c64_array_st), axes=axes_strategy, options=options_st, execution=execution_st) +def test_ifft(a, axes, options, execution): + if not is_axes_valid(a, axes, is_r2c=False): + return + try: + b = nvmath.fft.ifft(a, axes=axes, options=options, execution=execution) + except cp.cuda.memory.OutOfMemoryError: + # requiring too much GPU memory (>1GB), do nothing + assert a.nbytes > 2**30, "suspicious OOM when requesting not too much GPU memory!" + return + except RuntimeError as e: + if "The FFT CPU execution is not available" in str(e): + assert ( + execution == "cpu" + or isinstance(execution, nvmath.fft.ExecutionCPU) + or (execution is None and isinstance(a, np.ndarray)) + ) + return + raise e + if execution == "cuda" or isinstance(execution, nvmath.fft.ExecutionCUDA): + c = cp.asnumpy(cp.fft.ifftn(cp.asarray(a), axes=axes, norm="forward")) + else: + c = scipy.fft.ifftn(a, axes=axes, norm="forward") + verify_result(b, c, a, "ifft") + + +@nvmath_seed() +@given(a=st.one_of(f32_array_st, f64_array_st), axes=axes_strategy, options=options_st, execution=execution_st) +def test_rfft(a, axes, options, execution): + if not is_axes_valid(a, axes, is_r2c=True): + return + try: + b = nvmath.fft.rfft(a, axes=axes, options=options, execution=execution) + except cp.cuda.memory.OutOfMemoryError: + # requiring too much GPU memory (>1GB), do nothing + assert a.nbytes > 2**30, "suspicious OOM when requesting not too much GPU memory!" 
+ return + except RuntimeError as e: + if "The FFT CPU execution is not available" in str(e): + assert ( + execution == "cpu" + or isinstance(execution, nvmath.fft.ExecutionCPU) + or (execution is None and isinstance(a, np.ndarray)) + ) + return + raise e + if execution == "cuda" or isinstance(execution, nvmath.fft.ExecutionCUDA): + c = cp.asnumpy(cp.fft.rfftn(cp.asarray(a), axes=axes, norm="backward")) + else: + c = scipy.fft.rfftn(a, axes=axes, norm="backward") + verify_result(b, c, a, "rfft") + + +@nvmath_seed() +@given(a=st.one_of(f32_array_st, f64_array_st), axes=axes_strategy, options=options_st, execution=execution_st) +def test_irfft(a, axes, options, execution): + if not is_axes_valid(a, axes, is_r2c=True): + return + # NOTE: Specifying output shape is the equivalent of `last_axis_parity` for scipy/numpy + fft_shape = tuple(a.shape[e] for e in (range(a.ndim) if axes is None else axes)) + options["last_axis_parity"] = "odd" if fft_shape[-1] % 2 else "even" + try: + b = nvmath.fft.rfft(a, axes=axes, options=options, execution=execution) # C2R needs complex-Hermitian input + c = nvmath.fft.irfft(b, axes=axes, options=options, execution=execution) + except cp.cuda.memory.OutOfMemoryError: + # requiring too much GPU memory (>1GB), do nothing + assert a.nbytes > 2**30, "suspicious OOM when requesting not too much GPU memory!" + return + except RuntimeError as e: + if "The FFT CPU execution is not available" in str(e): + assert ( + execution == "cpu" + or isinstance(execution, nvmath.fft.ExecutionCPU) + or (execution is None and isinstance(a, np.ndarray)) + ) + return + raise e + assert a.shape == c.shape, f"{a.shape} vs {c.shape}" + if execution == "cuda" or isinstance(execution, nvmath.fft.ExecutionCUDA): + c_ref = cp.asnumpy(cp.fft.irfftn(cp.asarray(b), s=fft_shape, axes=axes, norm="forward")) + else: + c_ref = scipy.fft.irfftn(b, s=fft_shape, axes=axes, norm="forward") + verify_result(c, c_ref, b, "irfft") diff --git a/tests/nvmath_tests/fft/test_lto_callbacks.py b/tests/nvmath_tests/fft/test_lto_callbacks.py index 51f2a90..93877e9 100644 --- a/tests/nvmath_tests/fft/test_lto_callbacks.py +++ b/tests/nvmath_tests/fft/test_lto_callbacks.py @@ -43,12 +43,16 @@ from .utils.support_matrix import ( lto_callback_supperted_types, supported_backends, + opt_fft_type_direction_support, + opt_fft_type_input_type_support, + inplace_opt_ftt_type_support, ) from .utils.input_fixtures import ( get_random_input_data, get_custom_stream, get_primes_up_to, init_assert_exec_backend_specified, + fx_last_operand_layout, ) from .utils.check_helpers import ( add_in_place, @@ -446,7 +450,7 @@ def iepilog_cb(data_out, offset, value, filter_data, unused): options={ "result_layout": result_layout.value, "inplace": inplace.value, - "last_axis_size": "odd" if last_extent % 2 == 1 else "even", + "last_axis_parity": "odd" if last_extent % 2 == 1 else "even", }, ) except (nvmath.bindings.cufft.cuFFTError, ValueError) as e: @@ -547,7 +551,7 @@ def epilog_cb(data_out, offset, value, filter_data, unused): **cb_kwargs, options={ "result_layout": result_layout.value, - "last_axis_size": "odd" if shape[axes[-1]] % 2 == 1 else "even", + "last_axis_parity": "odd" if shape[axes[-1]] % 2 == 1 else "even", }, ) except (nvmath.bindings.cufft.cuFFTError, ValueError) as e: @@ -830,30 +834,38 @@ def epilog_cb(data_out, offset, value, filter_data, unused): "exec_backend", "mem_backend", "allow_to_fail", - "shape", - "axes", + "base_shape", + "base_axes", "permutation", "inplace", + "fft_type", + "direction", "dtype", 
"result_layout", "callbacks", ), [ ( - rng.choice([f for f in Framework.enabled() if MemBackend.cuda in supported_backends.framework_mem[f]]), + rng.choice(avail_frameworks), ExecBackend.cufft, MemBackend.cuda, # cpu -> gpu may make the layout dense, no point to check it here AllowToFail(allow_to_fail), - repr(shape), - repr(axes), + repr(base_shape), + repr(base_axes), repr(permutation), inplace, + fft_type, + rng.choice(opt_fft_type_direction_support[fft_type]), dtype, - rng.choice(list(OptFftLayout)), - rng.choice(list(LtoCallback)), + OptFftLayout.natural if inplace else rng.choice(list(OptFftLayout)), + rng.choice( + list(LtoCallback), + ), ) + for avail_frameworks in [[f for f in Framework.enabled() if MemBackend.cuda in supported_backends.framework_mem[f]]] + if avail_frameworks # fmt: off - for allow_to_fail, shape, axes, permutation in [ + for allow_to_fail, base_shape, base_axes, permutation in [ (False, (128, 1), (0,), (0, 1)), # 1D batched, pow2 (False, (128, 1), (0,), (1, 0)), # 1D batched, pow2 (False, (1, 128), (1,), (1, 0)), # 1D batched, pow2 @@ -889,87 +901,163 @@ def epilog_cb(data_out, offset, value, filter_data, unused): ] # fmt: on for inplace in OptFftInplace - for dtype in [ - DType.complex128, # it is the "hardest" case, plus one more for coverage - rng.choice( - [ - dt - for dt in lto_callback_supperted_types - if dt != DType.complex128 and (not inplace or is_complex(dt)) - ] - ), - ] - if ExecBackend.cufft in supported_backends.exec + for fft_type in inplace_opt_ftt_type_support[inplace.value] + for dtype in opt_fft_type_input_type_support[fft_type] + if dtype in lto_callback_supperted_types ], ) def test_permuted_stride_operand( + fx_last_operand_layout, # noqa: F811 framework, exec_backend, mem_backend, allow_to_fail, - shape, - axes, + base_shape, + base_axes, permutation, inplace, + fft_type, + direction, dtype, result_layout, callbacks, ): free_framework_pools(framework, mem_backend) - shape = literal_eval(shape) - axes = literal_eval(axes) + base_shape = literal_eval(base_shape) + base_axes = literal_eval(base_axes) permutation = literal_eval(permutation) - assert len(shape) == len(permutation) + axes = tuple(permutation.index(a) for a in base_axes) + assert len(base_shape) == len(permutation) + + if fft_type != OptFftType.c2r: + signal_base = get_random_input_data(framework, base_shape, dtype, mem_backend, seed=105) + signal = get_permuted(signal_base, permutation) + signal_shape = tuple(base_shape[p] for p in permutation) + if fft_type == OptFftType.c2c: + output_shape = signal_shape + else: + output_shape = r2c_shape(signal_shape, axes) + else: + real_type = get_ifft_dtype(dtype, fft_type) + assert not is_complex(real_type) + signal_base = get_random_input_data(framework, base_shape, real_type, mem_backend, seed=105) + signal_base = copy_array(get_fft_ref(signal_base, axes=base_axes)) + signal = get_permuted(signal_base, permutation) + signal_shape = list(base_shape) + signal_shape[base_axes[-1]] = signal_shape[base_axes[-1]] // 2 + 1 + signal_shape = tuple(signal_shape[p] for p in permutation) + output_shape = tuple(base_shape[p] for p in permutation) - fft_shape = [shape[a] for a in axes] + signal_copy = copy_array(signal) if inplace else signal + assert signal.shape == signal_shape + last_axis_parity = "odd" if output_shape[axes[-1]] % 2 else "even" - signal_base = get_random_input_data(framework, shape, dtype, mem_backend, seed=105) - signal = get_permuted(signal_base, permutation) + check_layouts, *_ = fx_last_operand_layout - axes = 
tuple(permutation.index(a) for a in axes) + if fft_type != OptFftType.c2r: + prolog_filter = get_random_input_data(framework, signal_shape, dtype, MemBackend.cuda, seed=243) + else: + # assure the required symmetry in the input + prolog_filter = get_random_input_data(framework, output_shape, real_type, mem_backend, seed=243) + prolog_filter = copy_array(get_fft_ref(prolog_filter, axes=axes)) + assert get_dtype_from_array(prolog_filter) == dtype + assert prolog_filter.shape == signal_shape - permuted_fft_shape = [signal.shape[a] for a in axes] - assert fft_shape == permuted_fft_shape + if direction == Direction.forward: + epilog_dtype = get_fft_dtype(dtype) + else: + epilog_dtype = get_ifft_dtype(dtype, fft_type) + epilog_filter = get_random_input_data(framework, output_shape, epilog_dtype, MemBackend.cuda, seed=143) def prolog_cb(data, offset, filter_data, unused): - return data[offset] * 5 + return data[offset] * filter_data[offset] def epilog_cb(data_out, offset, value, filter_data, unused): - data_out[offset] = value * 8 + data_out[offset] = value * filter_data[offset] + 7 - scaling = 1 cb_kwargs = {} if callbacks.has_prolog(): prolog_ltoir = nvmath.fft.compile_prolog(prolog_cb, dtype.name, dtype.name) cb_kwargs["prolog"] = {"ltoir": prolog_ltoir} - scaling *= 5 if callbacks.has_epilog(): - epilog_dtype = get_fft_dtype(dtype) epilog_ltoir = nvmath.fft.compile_epilog(epilog_cb, epilog_dtype.name, epilog_dtype.name) cb_kwargs["epilog"] = {"ltoir": epilog_ltoir} - scaling *= 8 - ref = get_fft_ref(get_scaled(signal, scaling), axes=axes) + with nvmath.fft.FFT( + signal, + axes=axes, + execution=exec_backend.nvname, + options={ + "fft_type": fft_type.value, + "result_layout": result_layout.value, + "inplace": inplace.value, + "last_axis_parity": last_axis_parity, + }, + ) as fft: + if callbacks.has_prolog(): + signal_strides = get_array_element_strides(signal) + prolog_strides = get_array_element_strides(prolog_filter) + operand_shape, operand_strides = fft.get_input_layout() + assert operand_shape == signal.shape + # even for c2r internal copy should keep the strides here + assert operand_strides == signal_strides + assert prolog_filter.shape == signal.shape + if prolog_strides != operand_strides: + prolog_data = permute_copy_like(prolog_filter, operand_shape, operand_strides) + assert get_array_element_strides(prolog_data) == operand_strides + else: + assert all(s == 1 for s in base_shape[1:]) + prolog_data = prolog_filter + cb_kwargs["prolog"]["data"] = get_raw_ptr(prolog_data) - fft_fn = nvmath.fft.fft if is_complex(dtype) else nvmath.fft.rfft + if callbacks.has_epilog(): + epilog_strides = get_array_element_strides(epilog_filter) + res_shape, res_strides = fft.get_output_layout() + assert res_shape == epilog_filter.shape + if res_strides != epilog_strides: + epilog_data = permute_copy_like(epilog_filter, res_shape, res_strides) + assert get_array_element_strides(epilog_data) == res_strides + else: + epilog_data = epilog_filter + cb_kwargs["epilog"]["data"] = get_raw_ptr(epilog_data) - try: - out = fft_fn( - signal, - axes=axes, - execution=exec_backend.nvname, - **cb_kwargs, - options={"result_layout": result_layout.value, "inplace": inplace.value}, + try: + fft.plan(**cb_kwargs) + except (nvmath.bindings.cufft.cuFFTError, ValueError) as e: + if not allow_to_fail: + raise + problem_shape = signal_shape if fft_type != OptFftType.c2r else output_shape + allow_to_fail_compund_shape(e, problem_shape, axes=axes) + + out = fft.execute(direction=direction.value) + check_layouts( + exec_backend, 
+ mem_backend, + axes, + result_layout, + fft_type, + is_dense=True, + inplace=inplace.value, ) - except (nvmath.bindings.cufft.cuFFTError, ValueError) as e: - if allow_to_fail: - allow_to_fail_compund_shape(e, signal.shape, axes=axes) + fft_ref = signal_copy + if callbacks.has_prolog(): + fft_ref = as_type(fft_ref * prolog_filter, dtype) + if direction == Direction.forward: + fft_ref = get_fft_ref(fft_ref, axes) else: - raise - - assert_norm_close_check_constant(out, ref, axes=axes) + fft_ref = get_ifft_ref( + fft_ref, + axes, + is_c2c=fft_type == OptFftType.c2c, + last_axis_parity=last_axis_parity, + ) + if callbacks.has_epilog(): + fft_ref = as_type(fft_ref * epilog_filter + 7, epilog_dtype) + if inplace: + assert signal is out + assert_norm_close_check_constant(out, fft_ref, axes=axes) def _operand_filter_dtype_shape_fft_ifft_case( @@ -999,9 +1087,7 @@ def _operand_filter_dtype_shape_fft_ifft_case( # make sure the data we multiply in the forward epilog/ # inverse prolog have the required hermitian symmetry epilog_real_dtype = ( - epilog_filter_dtype - if not is_complex(epilog_filter_dtype) - else get_ifft_dtype(epilog_filter_dtype, OptFftType.c2r) + epilog_filter_dtype if not is_complex(epilog_filter_dtype) else get_ifft_dtype(epilog_filter_dtype, OptFftType.c2r) ) epilog_filter_base = get_random_input_data(framework, shape, epilog_real_dtype, mem_backend, seed=143) # copy array to make sure it is dense @@ -1130,11 +1216,13 @@ def ref(data, flt): "fft_type": "C2C" if is_complex(dtype) else "C2R", "result_layout": result_layout.value, "inplace": inplace.value, - "last_axis_size": last_axis_parity, + "last_axis_parity": last_axis_parity, }, ) as f: in_shape, in_strides = f.get_input_layout() out_shape, out_strides = f.get_output_layout() + if exec_backend.mem == mem_backend: + assert_eq(in_strides, get_array_element_strides(fft_out)) assert_eq(in_shape, epilog_filter_dev.shape) assert_eq(out_shape, prolog_filter_dev.shape) if in_strides == get_array_element_strides(epilog_filter_dev): @@ -2079,9 +2167,7 @@ def epilog_fn(data, offset, element, flt, unused): for framework in Framework.enabled() if ExecBackend.cufft in supported_backends.exec for callback in (LtoCallback.prolog, LtoCallback.epilog) - for actual_dtype in [ - dt for dt in lto_callback_supperted_types if callback == LtoCallback.prolog or is_complex(dt) - ] + for actual_dtype in [dt for dt in lto_callback_supperted_types if callback == LtoCallback.prolog or is_complex(dt)] ], ) def test_mismatched_operand_type(framework, exec_backend, mem_backend, callback, actual_dtype, declared_dtype): diff --git a/tests/nvmath_tests/fft/test_stateful.py b/tests/nvmath_tests/fft/test_stateful.py index 3d8aba0..9f57e88 100644 --- a/tests/nvmath_tests/fft/test_stateful.py +++ b/tests/nvmath_tests/fft/test_stateful.py @@ -7,6 +7,7 @@ import random import math import time +import functools from ast import literal_eval import pytest @@ -30,7 +31,6 @@ Framework, DType, OptFftLayout, - Direction, ShapeKind, OptFftBlocking, OptFftType, @@ -40,6 +40,8 @@ is_complex, is_half, get_fft_dtype, + size_of, + get_ifft_dtype, ) from .utils.support_matrix import ( framework_exec_type_support, @@ -47,23 +49,30 @@ type_shape_support, multi_gpu_only, opt_fft_type_input_type_support, + opt_fft_type_direction_support, ) from .utils.input_fixtures import ( + align_up, get_random_input_data, get_custom_stream, + get_overaligned_view, init_assert_exec_backend_specified, ) from .utils.check_helpers import ( get_fft_ref, + get_ifft_ref, get_scaled, + get_raw_ptr, 
record_event, use_stream, assert_norm_close, assert_array_type, + assert_array_equal, assert_eq, get_array_device_id, get_array_strides, is_decreasing, + is_pow_2, intercept_default_allocations, add_in_place, free_cupy_pool, @@ -173,17 +182,13 @@ def test_stateful_nd_default_allocator( fft_0 = f.execute() assert allocations[expected_key] == expected_allocations, f"{allocations}, {expected_key}" - assert all( - allocations[key] == 0 for key in allocations if key != expected_key - ), f"{allocations}, {expected_key}" + assert all(allocations[key] == 0 for key in allocations if key != expected_key), f"{allocations}, {expected_key}" f.reset_operand(signal_1) fft_1 = f.execute() assert allocations[expected_key] == expected_allocations, f"{allocations}, {expected_key}" - assert all( - allocations[key] == 0 for key in allocations if key != expected_key - ), f"{allocations}, {expected_key}" + assert all(allocations[key] == 0 for key in allocations if key != expected_key), f"{allocations}, {expected_key}" assert_array_type(fft_0, framework, mem_backend, get_fft_dtype(dtype)) assert_array_type(fft_1, framework, mem_backend, get_fft_dtype(dtype)) @@ -247,9 +252,7 @@ def test_stateful_nd_custom_allocator(monkeypatch, framework, exec_backend, mem_ ifft = f.execute(direction=Direction.inverse.value) assert allocations[expected_key] == expected_allocations, f"{allocations}, {expected_key}" - assert all( - allocations[key] == 0 for key in allocations if key != expected_key - ), f"{allocations}, {expected_key}" + assert all(allocations[key] == 0 for key in allocations if key != expected_key), f"{allocations}, {expected_key}" assert_array_type(fft, framework, mem_backend, dtype) assert_array_type(ifft, framework, mem_backend, dtype) @@ -300,9 +303,7 @@ def test_stateful_release_workspace(monkeypatch, framework, exec_backend, mem_ba fft_1 = f.execute(direction=Direction.forward.value, release_workspace=release_workspace) assert allocations[expected_key] == num_allocs_2 if release_workspace else 1, f"{allocations}, {expected_key}" - assert all( - allocations[key] == 0 for key in allocations if key != expected_key - ), f"{allocations}, {expected_key}" + assert all(allocations[key] == 0 for key in allocations if key != expected_key), f"{allocations}, {expected_key}" assert_array_type(fft_0, framework, mem_backend, get_fft_dtype(dtype)) assert_array_type(fft_1, framework, mem_backend, get_fft_dtype(dtype)) @@ -1193,3 +1194,242 @@ def wrapper(*args, **kwargs): plan_forward, plan_backward = rets[-1] assert plan_forward == 0 assert plan_backward != 0 + + +@pytest.mark.parametrize( + ( + "framework", + "exec_backend", + "mem_backend", + "fft_type", + "direction", + "dtype", + "shape", + "axes", + "inplace", + "layout", + "required_alignment", + ), + [ + ( + framework, + exec_backend, + exec_backend.mem, + fft_type, + direction, + dtype, + repr(shape), + repr(axes), + inplace, + OptFftLayout.natural if not inplace else rng.choice(list(OptFftLayout)), + size_of(get_fft_dtype(dtype)), + ) + for framework in Framework.enabled() + for exec_backend in supported_backends.exec + if exec_backend.mem in supported_backends.framework_mem[framework] + for fft_type in OptFftType + for direction in opt_fft_type_direction_support[fft_type] + for dtype in opt_fft_type_input_type_support[fft_type] + if dtype in framework_exec_type_support[framework][exec_backend] + for shape, axes in [ + ((1,), (0,)), + ((5413, 13), (0,)), + ((17, 1024), (1,)), + ((13, 99), (0, 1)), + ((32, 32, 11), (0, 1)), + ((11, 32, 32), (1, 2)), + ((2999, 6, 7), 
(0, 1, 2)),
+            ((8, 16, 8, 31), (0, 1, 2)),
+            ((31, 16, 16, 8), (1, 2, 3)),
+        ]
+        for inplace in [False, True]
+        if not inplace or fft_type == OptFftType.c2c
+    ],
+)
+def test_reset_operand_decreasing_alignment(
+    framework,
+    exec_backend,
+    mem_backend,
+    fft_type,
+    direction,
+    dtype,
+    inplace,
+    shape,
+    axes,
+    required_alignment,
+    layout,
+):
+    """
+    Check that the plan does not depend on the initial alignment of the data.
+    This matters especially for host execution libraries, which capture the
+    pointers to the data during planning.
+    """
+    shape = literal_eval(shape)
+    axes = literal_eval(axes)
+    fft_dim = len(shape)
+    if fft_type != OptFftType.c2r:
+        problem_shape = shape
+    else:
+        # assume last_axis_parity = odd
+        problem_shape = tuple(e if axes[-1] != i else 2 * (e - 1) + 1 for i, e in enumerate(shape))
+
+    if should_skip_3d_unsupported(exec_backend, problem_shape, axes):
+        pytest.skip("Older CTK does not support 3D FFT")
+
+    # we will create the plan with the tensor starting at
+    # the address aligned to exactly 512 bytes and then reset_operand
+    # to tensors starting at the 256, 128, 64...-byte alignments,
+    # stopping at the minimal required alignment, i.e. size_of(complex_dtype)
+    # to make sure we are aligned to 512 and no more, we start with
+    # tensor aligned to 1024 and set the 512 offset
+    overalignment_lg2 = 10
+    overalignment_bytes = 2**overalignment_lg2
+    dtype_size = size_of(dtype)
+    overalignment_elements = overalignment_bytes // dtype_size
+    complex_dtype = get_fft_dtype(dtype)
+    assert required_alignment == size_of(complex_dtype)
+    assert overalignment_bytes % required_alignment == 0
+
+    max_offset = overalignment_elements - 1
+    last_extent = shape[-1]
+    last_extent_sample_stride = dtype_size * (last_extent + max_offset)
+    last_extent_sample_stride = align_up(last_extent_sample_stride, overalignment_bytes) // dtype_size
+    assert last_extent_sample_stride >= last_extent + max_offset
+    assert (dtype_size * last_extent_sample_stride) % overalignment_bytes == 0
+
+    alignment_cases = tuple(
+        (
+            i,
+            alignment,
+            # Move to a new sample - to preserve input characteristics for inplace cases.
+            # Move by overaligned offset to preserve the overalignment.
+ # Then shift from overaligned position to get exactly pointer + # aligned to the ``alignment`` + offset := i * last_extent_sample_stride + (alignment // dtype_size), + offset + last_extent, + ) + for i, a_lg2 in enumerate(range(overalignment_lg2 - 1, -1, -1)) + if (alignment := 2**a_lg2) >= required_alignment + ) + assert alignment_cases[0][1] == overalignment_bytes // 2 + assert alignment_cases[-1][1] == required_alignment + + all_samples_last_extent = len(alignment_cases) * last_extent_sample_stride + all_view_shape = list(shape) + all_view_shape[-1] = all_samples_last_extent + all_view_shape = tuple(all_view_shape) + signal_base, signal_overaligned = get_overaligned_view( + overalignment_bytes, framework, all_view_shape, dtype, mem_backend, seed=177 + ) + base_ptr = get_raw_ptr(signal_base) + overaligned_ptr = get_raw_ptr(signal_overaligned) + assert overaligned_ptr % overalignment_bytes == 0, f"{base_ptr}, {overaligned_ptr}" + assert signal_overaligned.shape == all_view_shape + assert 0 <= overaligned_ptr - base_ptr <= max_offset * dtype_size + assert_array_type(signal_overaligned, framework, mem_backend, dtype) + overaligned_copy = signal_overaligned if not inplace else copy_array(signal_overaligned) + assert_array_equal(signal_overaligned, overaligned_copy) + + def decreasingly_aligned_signals(): + prev_end = 0 + for i, alignment, offset_start, offset_end in alignment_cases: + assert offset_start < offset_end + assert prev_end <= offset_start + prev_end = offset_end + slices = (slice(None),) * (fft_dim - 1) + (slice(offset_start, offset_end),) + sample = signal_overaligned[slices] + sample_copy = overaligned_copy[slices] + assert sample.shape == shape + assert sample_copy.shape == shape + + if fft_type == OptFftType.c2r: + real_dtype = get_ifft_dtype(dtype, fft_type=fft_type) + real_sample = get_random_input_data(framework, problem_shape, real_dtype, mem_backend, seed=444 + i) + complex_sample = get_fft_ref(real_sample, axes=axes) + assert complex_sample.shape == shape + sample[:] = complex_sample[:] + + sample_ptr = get_raw_ptr(sample) + assert sample_ptr % alignment == 0, f"{i}, {alignment}, {sample_ptr}, {overaligned_ptr}, {base_ptr}" + assert sample_ptr % (2 * alignment) == alignment, f"{i}, {alignment}, {sample_ptr}, {overaligned_ptr}, {base_ptr}" + assert_array_type(sample, framework, mem_backend, dtype) + try: + assert_array_equal(sample, sample_copy) + except AssertionError as e: + raise AssertionError(f"The copied sample is not equal for {alignment} (i={i})") from e + + yield i, alignment, sample, sample_copy + + samples = decreasingly_aligned_signals() + i, alignment, sample, sample_copy = next(samples) + assert i == 0 + assert 2 * alignment == overalignment_bytes + + if direction == Direction.forward: + ref_fn = functools.partial(get_fft_ref, axes=axes) + else: + ref_fn = functools.partial( + get_ifft_ref, + axes=axes, + is_c2c=fft_type == OptFftType.c2c, + last_axis_parity="odd", + ) + + try: + with nvmath.fft.FFT( + sample, + execution=exec_backend.nvname, + options={ + "fft_type": fft_type.value, + "inplace": inplace, + "result_layout": layout.value, + "last_axis_size": "odd", + }, + axes=axes, + ) as fft: + fft.plan() + res = fft.execute(direction=direction.value.lower()) + ref = ref_fn(sample_copy) + assert_norm_close(res, ref, exec_backend=exec_backend, axes=axes) + for i, alignment, sample, sample_copy in samples: + assert i > 0 and 2 * alignment < overalignment_bytes + fft.reset_operand(sample) + res = fft.execute(direction=direction.value.lower()) + ref = 
ref_fn(sample_copy) + try: + assert_norm_close(res, ref, exec_backend=exec_backend, axes=axes) + except AssertionError as e: + raise AssertionError( + f"The output and reference are not close for " f"tesnor aligned to {alignment} (i={i})" + ) from e + except ValueError as e: + if ( + is_half(dtype) + and exec_backend == ExecBackend.cufft + and max(axes) < len(shape) - 1 + and ( + ("The R2C FFT of half-precision tensor" in str(e) and not is_complex(dtype) and fft_type == OptFftType.r2c) + or ("The C2R FFT of half-precision tensor" in str(e) and is_complex(dtype) and fft_type == OptFftType.c2r) + ) + ) or ( + "incompatible with that of the original" in str(e) + and fft_type == OptFftType.c2r + and direction == Direction.inverse + and len(shape) > 1 + ): + # reset_operand will reject the operand as non-compatible + # because the operand layout is non-contiguous + # so the internal copy changes it + pass + else: + raise + except nvmath.bindings.cufft.cuFFTError as e: + if ( + ("CUFFT_NOT_SUPPORTED" in str(e) or "CUFFT_SETUP_FAILED" in str(e)) + and is_half(dtype) + and exec_backend == ExecBackend.cufft + and any(not is_pow_2(problem_shape[a]) for a in axes) + ): + pass + else: + raise diff --git a/tests/nvmath_tests/fft/test_stateless_1d.py b/tests/nvmath_tests/fft/test_stateless_1d.py index a67596c..bdbca3c 100644 --- a/tests/nvmath_tests/fft/test_stateless_1d.py +++ b/tests/nvmath_tests/fft/test_stateless_1d.py @@ -197,7 +197,7 @@ def test_ifft_explicit_fft_type(framework, fft_type, exec_backend, mem_backend, sample_fft, options={ "fft_type": fft_type.value, - "last_axis_size": "even" if shape % 2 == 0 else "odd", + "last_axis_parity": "even" if shape % 2 == 0 else "odd", }, execution=exec_backend.nvname, ) @@ -363,7 +363,7 @@ def test_fft_ifft_overlap( axes=(1,), options={ "result_layout": result_layout.value, - "last_axis_size": "odd" if window_size % 2 else "even", + "last_axis_parity": "odd" if window_size % 2 else "even", }, execution=exec_backend.nvname, ) @@ -735,10 +735,7 @@ def test_fft_ifft_unsupported_type(framework, dtype, exec_backend, mem_backend): if dtype in r2c_dtype and r2c_dtype[dtype] not in framework_dtype[framework]: err_cls = TypeError - rfft_match = ( - f"The result data type {r2c_dtype[dtype].name} is not " - f"supported by the operand package '{framework.name}'." - ) + rfft_match = f"The result data type {r2c_dtype[dtype].name} is not supported by the operand package '{framework.name}'." 
elif is_complex(dtype): err_cls = RuntimeError rfft_match = f"expects a real input, but got {dtype.name}" @@ -844,7 +841,7 @@ def test_irfft_unsupported_empty_output(framework, exec_backend, dtype, mem_back signal, axes=axes, execution=exec_backend.nvname, - options={"last_axis_size": "odd"}, + options={"last_axis_parity": "odd"}, ) with pytest.raises( @@ -1103,7 +1100,7 @@ def test_cpu_execution_wrong_options(framework, exec_backend, mem_backend, dtype ValueError, match="The 'device_id' is not a valid option when 'execution' is specified to be 'cpu'", ): - fn(sample, execution="cpu", options={"last_axis_size": "odd", "device_id": 1}) + fn(sample, execution="cpu", options={"last_axis_parity": "odd", "device_id": 1}) with pytest.raises( TypeError, diff --git a/tests/nvmath_tests/fft/test_stateless_nd.py b/tests/nvmath_tests/fft/test_stateless_nd.py index dab9a93..dc76440 100644 --- a/tests/nvmath_tests/fft/test_stateless_nd.py +++ b/tests/nvmath_tests/fft/test_stateless_nd.py @@ -18,15 +18,20 @@ DType, ShapeKind, OptFftLayout, + OptFftType, + Direction, ) from .utils.axes_utils import ( is_complex, is_half, get_fft_dtype, + get_ifft_dtype, + size_of, ) from .utils.support_matrix import ( type_shape_support, opt_fft_type_input_type_support, + opt_fft_type_direction_support, inplace_opt_ftt_type_support, framework_exec_type_support, supported_backends, @@ -34,12 +39,13 @@ from .utils.input_fixtures import ( get_random_input_data, init_assert_exec_backend_specified, + fx_last_operand_layout, ) from .utils.check_helpers import ( - assert_eq, is_decreasing, copy_array, get_fft_ref, + get_ifft_ref, get_scaled, check_layout_fallback, get_permuted_copy, @@ -50,11 +56,13 @@ as_strided, get_ifft_c2r_options, get_array_strides, + get_array_element_strides, assert_norm_close, assert_array_type, assert_eq, should_skip_3d_unsupported, assert_array_equal, + get_raw_ptr, ) @@ -99,6 +107,7 @@ ], ) def test_fft_ifft_1d( + fx_last_operand_layout, # noqa: F811 framework, exec_backend, mem_backend, @@ -112,10 +121,11 @@ def test_fft_ifft_1d( ShapeKind.pow2: 128, ShapeKind.pow2357: 2 * 3 * 5 * 7, ShapeKind.prime: 127, - ShapeKind.random: 414, + ShapeKind.random: 207, } shape = (shapes[shape_kind],) * array_dim signal = get_random_input_data(framework, shape, dtype, mem_backend, seed=55) + check_layouts, *_ = fx_last_operand_layout fft_fn = nvmath.fft.fft if is_complex(dtype) else nvmath.fft.rfft @@ -135,10 +145,16 @@ def test_fft_ifft_1d( execution=exec_backend.nvname, options={"result_layout": result_layout.value}, ) + check_layouts( + exec_backend, + mem_backend, + (axis,), + result_layout, + OptFftType.c2c if is_complex(dtype) else OptFftType.r2c, + is_dense=True, + inplace=False, + ) assert_array_type(fft, framework, mem_backend, get_fft_dtype(dtype)) - if result_layout == OptFftLayout.natural: - fft_strides = get_array_strides(fft) - assert is_decreasing(fft_strides), f"{fft_strides}" assert_norm_close( fft, get_fft_ref(signal, axes=[axis]), @@ -158,10 +174,16 @@ def test_fft_ifft_1d( execution=exec_backend.nvname, axes=[axis], ) + check_layouts( + exec_backend, + mem_backend, + (axis,), + result_layout, + OptFftType.c2c, + is_dense=True, + inplace=False, + ) assert_array_type(ifft, framework, mem_backend, dtype) - if result_layout == OptFftLayout.natural: - ifft_strides = get_array_strides(ifft) - assert is_decreasing(ifft_strides), f"{ifft_strides}" assert_norm_close( ifft, get_scaled(signal, shape[axis]), @@ -201,6 +223,7 @@ def test_fft_ifft_1d( ], ) def test_fft_ifft_2d( + fx_last_operand_layout, # 
noqa: F811 framework, exec_backend, mem_backend, @@ -219,6 +242,7 @@ def test_fft_ifft_2d( shape = shapes[shape_kind][:array_dim] axes = list(range(first_axis, first_axis + 2)) signal = get_random_input_data(framework, shape, dtype, mem_backend, seed=55) + check_layouts, *_ = fx_last_operand_layout fft_fn = nvmath.fft.fft if is_complex(dtype) else nvmath.fft.rfft @@ -238,10 +262,16 @@ def test_fft_ifft_2d( execution=exec_backend.nvname, options={"result_layout": result_layout.value}, ) + check_layouts( + exec_backend, + mem_backend, + axes, + result_layout, + OptFftType.c2c if is_complex(dtype) else OptFftType.r2c, + is_dense=True, + inplace=False, + ) assert_array_type(fft, framework, mem_backend, get_fft_dtype(dtype)) - if result_layout == OptFftLayout.natural: - fft_strides = get_array_strides(fft) - assert is_decreasing(fft_strides), f"{fft_strides}" assert_norm_close( fft, get_fft_ref(signal, axes=axes), @@ -255,10 +285,16 @@ def test_fft_ifft_2d( **get_ifft_c2r_options(dtype, shape[axes[-1]]), } ifft = nvmath.fft.ifft(fft, execution=exec_backend.nvname, options=options, axes=axes) + check_layouts( + exec_backend, + mem_backend, + axes, + result_layout, + OptFftType.c2c, + is_dense=True, + inplace=False, + ) assert_array_type(ifft, framework, mem_backend, dtype) - if result_layout == OptFftLayout.natural: - ifft_strides = get_array_strides(ifft) - assert is_decreasing(ifft_strides), f"{ifft_strides}" volume = math.prod(shape[axis] for axis in axes) assert_norm_close( ifft, @@ -301,6 +337,7 @@ def test_fft_ifft_2d( ], ) def test_fft_ifft_3d( + fx_last_operand_layout, # noqa: F811 framework, exec_backend, mem_backend, @@ -318,6 +355,7 @@ def test_fft_ifft_3d( } shape = shapes[shape_kind][:array_dim] axes = list(range(first_axis, first_axis + 3)) + check_layouts, *_ = fx_last_operand_layout if should_skip_3d_unsupported(exec_backend, shape, axes): pytest.skip("Pre 11.4.2 CTK does not support 3D batched FFT") @@ -331,10 +369,16 @@ def test_fft_ifft_3d( execution=exec_backend.nvname, options={"result_layout": result_layout.value}, ) + check_layouts( + exec_backend, + mem_backend, + axes, + result_layout, + OptFftType.c2c if is_complex(dtype) else OptFftType.r2c, + is_dense=True, + inplace=False, + ) assert_array_type(fft, framework, mem_backend, get_fft_dtype(dtype)) - if result_layout == OptFftLayout.natural: - fft_strides = get_array_strides(fft) - assert is_decreasing(fft_strides), f"{fft_strides}" assert_norm_close( fft, get_fft_ref(signal, axes=axes), @@ -349,10 +393,16 @@ def test_fft_ifft_3d( **get_ifft_c2r_options(dtype, shape[axes[-1]]), } ifft = nvmath.fft.ifft(fft, execution=exec_backend.nvname, options=options, axes=axes) + check_layouts( + exec_backend, + mem_backend, + axes, + result_layout, + OptFftType.c2c, + is_dense=True, + inplace=False, + ) assert_array_type(ifft, framework, mem_backend, dtype) - if result_layout == OptFftLayout.natural: - ifft_strides = get_array_strides(ifft) - assert is_decreasing(ifft_strides), f"{ifft_strides}" volume = math.prod(shape[axis] for axis in axes) assert_norm_close( ifft, @@ -401,9 +451,19 @@ def test_fft_ifft_3d( for result_layout in OptFftLayout ], ) -def test_irfft_preserves_input(framework, exec_backend, mem_backend, shape, axes, dtype, result_layout): +def test_irfft_preserves_input( + fx_last_operand_layout, # noqa: F811 + framework, + exec_backend, + mem_backend, + shape, + axes, + dtype, + result_layout, +): shape = literal_eval(shape) axes = literal_eval(axes) + check_layouts, *_ = fx_last_operand_layout if 
should_skip_3d_unsupported(exec_backend, shape, axes): pytest.skip("Pre 11.4.2 CTK does not support 3D batched FFT") @@ -420,6 +480,16 @@ def test_irfft_preserves_input(framework, exec_backend, mem_backend, shape, axes execution=exec_options, options={"result_layout": result_layout.value}, ) + check_layouts( + exec_backend, + mem_backend, + axes, + result_layout, + OptFftType.r2c, + is_dense=True, + inplace=False, + ) + assert_array_type(fft, framework, mem_backend, get_fft_dtype(dtype)) assert_norm_close( fft, @@ -429,14 +499,24 @@ def test_irfft_preserves_input(framework, exec_backend, mem_backend, shape, axes ) fft_copy = copy_array(fft) ifft = nvmath.fft.irfft( - fft_copy, + fft, axes=axes, execution=exec_options, options={ - "last_axis_size": "odd" if shape[axes[-1]] % 2 else "even", + "last_axis_parity": "odd" if shape[axes[-1]] % 2 else "even", "result_layout": result_layout.value, }, ) + check_layouts( + exec_backend, + mem_backend, + axes, + result_layout, + OptFftType.c2r, + is_dense=True, + inplace=False, + ) + assert_array_type(ifft, framework, mem_backend, dtype) volume = math.prod(shape[a] for a in axes) assert_norm_close( @@ -445,7 +525,7 @@ def test_irfft_preserves_input(framework, exec_backend, mem_backend, shape, axes axes=axes, exec_backend=exec_backend, ) - assert_array_equal(fft_copy, fft) + assert_array_equal(fft, fft_copy) @pytest.mark.parametrize( @@ -752,9 +832,7 @@ def test_permuted_axes_c2c_repeated_strides(framework, exec_backend, mem_backend for result_layout in OptFftLayout ], ) -def test_permuted_axes_c2c_repeated_strides_inplace( - framework, exec_backend, mem_backend, axes, shape, dtype, result_layout -): +def test_permuted_axes_c2c_repeated_strides_inplace(framework, exec_backend, mem_backend, axes, shape, dtype, result_layout): axes = literal_eval(axes) shape = literal_eval(shape) @@ -810,9 +888,7 @@ def test_permuted_axes_c2c_repeated_strides_inplace( mem_backend, repr(axes), batched, - dtype := rng.choice( - [dt for dt in framework_exec_type_support[framework][exec_backend] if not is_complex(dt)] - ), + dtype := rng.choice([dt for dt in framework_exec_type_support[framework][exec_backend] if not is_complex(dt)]), repr( rng.sample( ([17, 31, 101] if ShapeKind.prime in type_shape_support[exec_backend][dtype] else [16, 32, 64]), @@ -874,7 +950,7 @@ def test_permuted_axes_r2c_c2r(framework, exec_backend, mem_backend, axes, batch axes=axes, execution=exec_backend.nvname, options={ - "last_axis_size": "odd" if shape[axes[-1]] % 2 else "even", + "last_axis_parity": "odd" if shape[axes[-1]] % 2 else "even", "result_layout": result_layout.value, }, ) @@ -919,9 +995,7 @@ def test_permuted_axes_r2c_c2r(framework, exec_backend, mem_backend, axes, batch for result_layout in OptFftLayout ], ) -def test_permuted_axes_r2c_c2r_repeated_strides( - framework, exec_backend, mem_backend, axes, shape, dtype, result_layout -): +def test_permuted_axes_r2c_c2r_repeated_strides(framework, exec_backend, mem_backend, axes, shape, dtype, result_layout): axes = literal_eval(axes) shape = literal_eval(shape) @@ -958,7 +1032,7 @@ def test_permuted_axes_r2c_c2r_repeated_strides( axes=axes, execution=exec_backend.nvname, options={ - "last_axis_size": "odd" if shape[axes[-1]] % 2 else "even", + "last_axis_parity": "odd" if shape[axes[-1]] % 2 else "even", "result_layout": result_layout.value, }, ) @@ -1033,7 +1107,7 @@ def test_permuted_axes_r2c_c2r_repeated_strides_fallback( axes=axes, exec_backend=exec_backend, ) - last_axis_size = "odd" if shape[axes[-1]] % 2 else "even" + 
last_axis_parity = "odd" if shape[axes[-1]] % 2 else "even" ifft = check_layout_fallback( fft, axes, @@ -1042,7 +1116,7 @@ def test_permuted_axes_r2c_c2r_repeated_strides_fallback( execution=exec_backend.nvname, axes=axes, options={ - "last_axis_size": last_axis_size, + "last_axis_parity": last_axis_parity, "result_layout": result_layout.value, }, ), @@ -1242,7 +1316,7 @@ def test_ifft_repeated_strides( axes=axes, execution=exec_backend.nvname, options={ - "last_axis_size": "even" if shape[axes[-1]] % 2 == 0 else "odd", + "last_axis_parity": "even" if shape[axes[-1]] % 2 == 0 else "odd", "result_layout": result_layout.value, }, ) @@ -1336,79 +1410,230 @@ def test_irfft_half_strided_output(result_layout, framework, exec_backend, mem_b "framework", "exec_backend", "mem_backend", - "fft_dim", - "batched", + "fft_type", + "direction", "dtype", - "shape_kind", + "base_shape", + "view_shape_kind", + "view_shape", + "slices", + "axes", + "fft_dim", + "inplace", + "layout", ), [ ( framework, exec_backend, mem_backend, - fft_dim, - batched, + fft_type, + direction, dtype, - rng.choice(type_shape_support[exec_backend][dtype]), + repr(base_shape), + view_shape_kind, + repr(view_shape), + repr(slices), + repr(axes), + f"FftDim.{len(axes)}", + inplace, + layout, ) for framework in Framework.enabled() for exec_backend in supported_backends.exec for mem_backend in supported_backends.framework_mem[framework] - for fft_dim in [1, 2, 3] - for batched in ["no", "left", "right"] - for dtype in framework_exec_type_support[framework][exec_backend] - if is_complex(dtype) and not is_half(dtype) + for fft_type in OptFftType + for direction in opt_fft_type_direction_support[fft_type] + for dtype in opt_fft_type_input_type_support[fft_type] + if dtype in framework_exec_type_support[framework][exec_backend] + for base_shape, view_shape_kind, view_shape, slices, axes in [ + ((256,), ShapeKind.pow2, (16,), ((16,),), (0,)), + ((39, 2), ShapeKind.pow2, (16, 2), ((0, 16), None), (0,)), + ((39, 2), ShapeKind.pow2, (16, 2), ((16, 32), None), (0,)), + ((39, 7), ShapeKind.pow2, (16, 5), ((0, 16), (0, 5)), (0,)), + ((39, 2), ShapeKind.pow2, (8, 2), ((16, 32, 2), None), (0,)), + ((2, 39), ShapeKind.pow2, (2, 8), (None, (16, 32, 2)), (1,)), + ((5, 2048), ShapeKind.pow2, (5, 16), (None, (1, 17)), (1,)), + ((5, 2048), ShapeKind.pow2, (5, 16), (None, (2, 18)), (1,)), + ((5, 2048), ShapeKind.random, (5, 177), (None, (2, 179)), (1,)), + ((5, 2048), ShapeKind.pow2, (1, 16), ((None, None, 5), (1, 17)), (1,)), + ((5, 2048), ShapeKind.pow2, (1, 16), ((0, 1), (2, 18)), (1,)), + ((2048, 5), ShapeKind.pow2, (16, 1), ((1, 17), (0, 1)), (0,)), + ((2048, 5), ShapeKind.pow2, (16, 1), ((2, 18), (None, None, 5)), (0,)), + # using big base shape to catch errors around insufficient allocation + ((128, 1024), ShapeKind.pow2, (8, 16), ((-10, -2), (-32, -16)), (0, 1)), + ((1024, 4096, 7), ShapeKind.pow2, (256, 16, 7), ((2, 258), (6, 22), None), (0, 1)), + ((7, 1024, 4096), ShapeKind.pow2, (7, 32, 16), (None, (2, 34), (6, 22)), (1, 2)), + ((3, 17, 13), ShapeKind.pow2, (2, 8, 8), ((1, 3), (8,), (1, 9)), (1, 2)), + ((55, 55, 55), ShapeKind.random, (55, 19, 28), (None, (None, None, 3), (None, None, 2)), (0, 1, 2)), + ((13, 17, 13), ShapeKind.pow2, (8, 8, 8), ((1, 9), (8,), (1, 9)), (0, 1, 2)), + ((13, 17, 13, 11), ShapeKind.pow2, (8, 8, 8, 11), ((1, 9), (8,), (1, 9), None), (0, 1, 2)), + ((11, 13, 17, 13), ShapeKind.pow2, (11, 8, 8, 8), (None, (1, 9), (8,), (1, 9)), (1, 2, 3)), + ] + for inplace in [False, True] + if not inplace or fft_type == 
OptFftType.c2c + for layout in OptFftLayout + if not inplace or layout == OptFftLayout.natural ], ) -def test_sliced_tensor_complex(framework, exec_backend, mem_backend, fft_dim, batched, dtype, shape_kind): - extent = { - ShapeKind.pow2: (64 + 5, 64 + 5, 64 + 5), - ShapeKind.pow2357: (48 + 5, 48 + 5, 48 + 5), - ShapeKind.prime: (43 + 5, 43 + 5, 43 + 5), - ShapeKind.random: (22 + 5, 22 + 5, 22 + 5), - } - shape = extent[shape_kind][:fft_dim] +def test_sliced_tensor( + fx_last_operand_layout, # noqa: F811 + framework, + exec_backend, + mem_backend, + fft_type, + direction, + dtype, + base_shape, + view_shape_kind, + view_shape, + slices, + axes, + fft_dim, + inplace, + layout, +): + base_shape = literal_eval(base_shape) + view_shape = literal_eval(view_shape) + slices = literal_eval(slices) + axes = literal_eval(axes) + assert len(slices) == len(base_shape) == len(view_shape) + check_layouts, *_ = fx_last_operand_layout - slices = tuple(slice(3, -2) for _ in range(fft_dim)) - if batched == "left": - shape = (8,) + shape - axes = tuple(range(1, fft_dim + 1)) - slices = (slice(None),) + slices - elif batched == "right": - shape = shape + (16,) - axes = tuple(-i for i in range(2, fft_dim + 2)) - slices = slices + (slice(None),) + if fft_type == OptFftType.c2c: + if direction == Direction.forward: + fft_fn = nvmath.fft.fft + else: + assert direction == Direction.inverse + fft_fn = nvmath.fft.ifft + complex_dtype = dtype + elif fft_type == OptFftType.r2c: + fft_fn = nvmath.fft.rfft + complex_dtype = get_fft_dtype(dtype) else: - assert batched == "no" - axes = None + assert fft_type == OptFftType.c2r + fft_fn = nvmath.fft.irfft + complex_dtype = dtype - if should_skip_3d_unsupported(exec_backend, shape, axes): - pytest.skip("Pre 11.4.2 CTK does not support 3D batched FFT") + assert is_complex(complex_dtype) - signal = get_random_input_data(framework, shape, dtype, mem_backend, seed=105) - signal_sliced = signal[slices] - assert get_array_strides(signal) == get_array_strides(signal_sliced) - assert signal.dtype == signal_sliced.dtype + signal_base = get_random_input_data(framework, base_shape, dtype, mem_backend, seed=105) + slices = tuple(slice(*s) if s is not None else slice(s) for s in slices) + signal = signal_base[slices] + assert_array_type(signal, framework, mem_backend, dtype) + assert signal.shape == view_shape + last_axis_parity = "odd" - if is_complex(dtype): - fft = nvmath.fft.fft( - signal_sliced, - axes=axes, - execution=exec_backend.nvname, - ) + if fft_type != OptFftType.c2r: + # problem size as defined by cufft, i.e. 
shape of the input for r2c, c2c # and the shape of the output for c2r instance_shape = view_shape else: real_dtype = get_ifft_dtype(dtype, fft_type=fft_type) # assuming last_axis_parity == "odd" instance_shape = tuple(e if i != axes[-1] else 2 * (e - 1) + 1 for i, e in enumerate(view_shape)) real_sample = get_random_input_data(framework, instance_shape, real_dtype, mem_backend, seed=106) complex_sample = get_fft_ref(real_sample, axes=axes) assert_array_type(complex_sample, framework, mem_backend, dtype) assert complex_sample.shape == view_shape signal[:] = complex_sample[:] assert_array_type(signal, framework, mem_backend, dtype) assert signal.shape == view_shape + + signal_copy = signal if not inplace else copy_array(signal) + + if should_skip_3d_unsupported(exec_backend, view_shape, axes): + pytest.skip("Skipping 3D for older cufft") + + alignment_excpt_clss = (nvmath.bindings.cufft.cuFFTError,) + if nvmath.bindings.nvpl is not None: + alignment_excpt_clss += (nvmath.bindings.nvpl.fft.FFTWUnaligned,) + + try: + try: + out = fft_fn( + signal, + axes=axes, + options={ + "last_axis_parity": last_axis_parity, + "inplace": inplace, + "result_layout": layout.value, + }, + execution=exec_backend.nvname, + ) + check_layouts( + exec_backend, + mem_backend, + axes, + layout, + fft_type, + is_dense=False, + inplace=inplace, + ) + except nvmath.fft.UnsupportedLayoutError as e: + # with slices, we don't really require permutation, + # just the `step` may make the embedding impossible + assert e.permutation == tuple(range(len(view_shape))) + assert exec_backend == mem_backend.exec + cont_signal = copy_array(signal) + out = fft_fn( + cont_signal, + axes=axes, + options={ + "last_axis_parity": last_axis_parity, + "inplace": inplace, + "result_layout": layout.value, + }, + execution=exec_backend.nvname, + ) + except alignment_excpt_clss as e: + str_e = str(e) + if ( + exec_backend == ExecBackend.cufft + and is_half(dtype) + and ("CUFFT_NOT_SUPPORTED" in str_e or "CUFFT_SETUP_FAILED" in str_e) + ): + # only pow2 problem sizes are supported for half precision + assert any(math.gcd(instance_shape[a], 2**30) != instance_shape[a] for a in axes) + else: + alignment = size_of(complex_dtype) + if exec_backend == ExecBackend.cufft: + assert "CUFFT_INVALID_VALUE" in str_e, str_e + else: + assert "input tensor's underlying memory" in str_e, str_e + assert f"pointer must be aligned to at least {alignment} bytes" in str_e, str_e + assert fft_type in (OptFftType.c2r, OptFftType.r2c), f"{fft_type}" + assert get_raw_ptr(signal) % alignment != 0 + strides = get_array_element_strides(signal) + start_offset = sum(stride * (extent_slice.start or 0) for stride, extent_slice in zip(strides, slices, strict=True)) + assert start_offset % 2 == 1, f"{strides} {slices} {start_offset}" + except ValueError as e: + str_e = str(e) + if "The R2C FFT of half-precision tensor" in str_e: + assert exec_backend == ExecBackend.cufft and is_half(dtype) and fft_type == OptFftType.r2c + elif "The C2R FFT of half-precision tensor" in str_e: + assert exec_backend == ExecBackend.cufft and is_half(dtype) and fft_type == OptFftType.c2r + assert layout == OptFftLayout.natural + else: + raise + else: + if fft_type == OptFftType.c2r: + assert out.shape ==
instance_shape + + if direction == Direction.forward: + ref = get_fft_ref(signal_copy, axes=axes) + out_dtype = get_fft_dtype(dtype) + else: + ref = get_ifft_ref( + signal_copy, + axes=axes, + last_axis_parity="odd", + is_c2c=fft_type == OptFftType.c2c, + ) + out_dtype = get_ifft_dtype(dtype, fft_type) + + assert_array_type(out, framework, mem_backend, out_dtype) + assert_norm_close(out, ref, axes=axes, exec_backend=exec_backend, shape_kind=view_shape_kind) @pytest.mark.parametrize( @@ -1450,7 +1675,7 @@ def test_sliced_tensor_unaligned(framework, exec_backend, shape, slice_descs, dt excpt_clss = (nvmath.bindings.cufft.cuFFTError,) if nvmath.bindings.nvpl is not None: - excpt_clss += (nvmath.bindings.nvpl.fft.FFTWUnaliged,) + excpt_clss += (nvmath.bindings.nvpl.fft.FFTWUnaligned,) try: fft_fn = nvmath.fft.fft if is_complex(dtype) else nvmath.fft.rfft fft = fft_fn( @@ -1649,7 +1874,7 @@ def test_single_element( fft, axes=axes, execution=exec_backend.nvname, - options={"last_axis_size": "odd", "result_layout": result_layout.value}, + options={"last_axis_parity": "odd", "result_layout": result_layout.value}, ) assert_norm_close( ifft, @@ -1769,7 +1994,7 @@ def test_single_element_view( fft, axes=axes, execution=exec_backend.nvname, - options={"last_axis_size": "odd", "result_layout": result_layout.value}, + options={"last_axis_parity": "odd", "result_layout": result_layout.value}, ) assert_norm_close( ifft, @@ -1783,161 +2008,102 @@ def test_single_element_view( "framework", "exec_backend", "mem_backend", - "shape", + "base_shape", "axes", - "slices", + "steps", "dtype", - "result_layout", ), [ ( framework, exec_backend, mem_backend, - repr(shape), + repr(base_shape), repr(axes), - repr(slices), - dtype, - result_layout, + repr(step), + rng.choice( + [ + dtype + for dtype in framework_exec_type_support[framework][exec_backend] + if is_complex(dtype) and not is_half(dtype) + ] + ), ) for framework in Framework.enabled() for exec_backend in supported_backends.exec for mem_backend in supported_backends.framework_mem[framework] - for shape, axes, slices in [ - ( - ( - 39, - 2, - ), - (0,), - ((16,), (None,)), - ), - ( - ( - 39, - 7, - ), - (0,), - ( - (16,), - (5,), - ), - ), - ( - ( - 39, - 2, - ), - (0,), - ((16, 32), (None,)), - ), - ( - ( - 39, - 2, - ), - (0,), - ((16, 32, 2), (None,)), - ), - ( - ( - 39, - 7, - ), - (0,), - ( - (16, 32, 2), - (5,), - ), - ), - ( - ( - 17, - 17, - ), - (1,), - ( - (1, 17, 2), - (16,), - ), - ), - ((17, 13, 3), (0, 1), ((8,), (1, 9), (1, 3))), - ((3, 17, 13), (1, 2), ((1, 3), (8,), (1, 9))), - ((13, 17, 13), (0, 1, 2), ((1, 9), (8,), (1, 9))), + for base_shape, axes, step in [ + ((2, 55), (0,), (None, 2)), + ((2, 55), (1,), (None, 2)), + ((100, 101, 5), (0, 1), (None, 2, 3)), + ((100, 101, 5), (1, 2), (None, 2, 3)), + ((100, 101, 5), (0, 1, 2), (None, 2, 3)), ] - for dtype in framework_exec_type_support[framework][exec_backend] - if is_complex(dtype) - for result_layout in OptFftLayout ], ) -def test_inplace_view( +def test_inplace_sliced_non_overlapping( framework, exec_backend, mem_backend, - shape, + base_shape, axes, - slices, + steps, dtype, - result_layout, ): - shape = literal_eval(shape) + base_shape = literal_eval(base_shape) axes = literal_eval(axes) - + steps = literal_eval(steps) + assert len(base_shape) == len(steps) + signal_base = get_random_input_data(framework, base_shape, dtype, mem_backend, seed=105) + signal = signal_base[tuple(slice(None, None, step) for step in steps)] + signal_copy = copy_array(signal) + shape = tuple( + (e + impl_step 
- 1) // impl_step + for e, step in zip(base_shape, steps, strict=True) + for impl_step in [1 if step is None else step] + ) + assert signal.shape == shape if should_skip_3d_unsupported(exec_backend, shape, axes): pytest.skip("Pre 11.4.2 CTK does not support 3D batched FFT") - slices = tuple(slice(*args) for args in literal_eval(slices)) - signal = get_random_input_data(framework, shape, dtype, mem_backend, seed=105) - signal_copy = copy_array(signal) - signal_view = signal[slices] - signal_view_strides = get_array_strides(signal_view) try: fft = nvmath.fft.fft( - signal_view, - axes=axes, - execution=exec_backend.nvname, - options={"inplace": True, "result_layout": result_layout.value}, - ) - assert_eq(get_array_strides(signal_view), signal_view_strides) - assert_eq(get_array_strides(fft), signal_view_strides) - assert_array_type(signal_view, framework, mem_backend, dtype) - ref_view = get_fft_ref(signal_copy[slices], axes=axes) - assert_norm_close( - signal_view, - ref_view, - axes=axes, - exec_backend=exec_backend, - ) - signal_ref = copy_array(signal_copy) - signal_ref[slices] = ref_view - assert_norm_close( signal, - signal_ref, - axes=axes, - exec_backend=exec_backend, - ) - ifft = nvmath.fft.ifft( - signal_view, axes=axes, execution=exec_backend.nvname, - options={"inplace": True, "result_layout": result_layout.value}, - ) - assert_eq(get_array_strides(signal_view), signal_view_strides) - assert_eq(get_array_strides(ifft), signal_view_strides) - signal_copy[slices] = get_scaled( - signal_copy[slices], - math.prod(signal_view.shape[a] for a in range(len(shape)) if a in axes), + options={"inplace": True}, ) - assert_norm_close( - signal, - signal_copy, - axes=axes, - exec_backend=exec_backend, - ) - except RuntimeError as e: - assert "cannot be specified when copying to non-contiguous" in str(e) - assert_eq(mem_backend, MemBackend.cpu) - assert_eq(framework, Framework.numpy) + except nvmath.fft.UnsupportedLayoutError: + # The embedding check does usually block the step-sliced + # operands. 
For mem_backend != exec_backend though + # the copy is made so we have contiguous layout + # and we can make sure that the overlapping check is not + # too strict + assert mem_backend.exec == exec_backend + return + assert fft is signal + assert_norm_close( + fft, + get_fft_ref(signal_copy, axes=axes), + axes=axes, + exec_backend=exec_backend, + ) + ifft = nvmath.fft.ifft( + fft, + axes=axes, + execution=exec_backend.nvname, + options={ + "inplace": True, + "last_axis_parity": "odd" if shape[axes[-1]] else "even", + }, + ) + assert ifft is signal + assert_norm_close( + ifft, + get_scaled(signal_copy, math.prod(shape[a] for a in axes)), + axes=axes, + exec_backend=exec_backend, + ) @pytest.mark.parametrize( @@ -2028,23 +2194,18 @@ def test_inplace_overlapping( options={"inplace": True, "result_layout": result_layout.value}, ) else: - try: - nvmath.fft.fft( - signal_view, - axes=axes, - execution=exec_backend.nvname, - options={"inplace": True, "result_layout": result_layout.value}, - ) - assert_norm_close( - signal_view, - get_fft_ref(signal_view_copy, axes=axes), - axes=axes, - exec_backend=exec_backend, - ) - except RuntimeError as e: - assert "cannot be specified when copying to non-contiguous" in str(e) - assert_eq(mem_backend, MemBackend.cpu) - assert_eq(framework, Framework.numpy) + nvmath.fft.fft( + signal_view, + axes=axes, + execution=exec_backend.nvname, + options={"inplace": True, "result_layout": result_layout.value}, + ) + assert_norm_close( + signal_view, + get_fft_ref(signal_view_copy, axes=axes), + axes=axes, + exec_backend=exec_backend, + ) @pytest.mark.parametrize( @@ -2122,24 +2283,19 @@ def test_repeated_strides_strided( signal = signal[::stride].reshape(shape) signal_copy = copy_array(signal) fft_fn = nvmath.fft.fft if is_complex(dtype) else nvmath.fft.rfft - try: - fft = fft_fn( - signal, - axes=axes, - execution=exec_backend.nvname, - options={"result_layout": result_layout.value, "inplace": inplace}, - ) - assert_array_type(fft, framework, mem_backend, get_fft_dtype(dtype)) - assert_norm_close( - fft, - get_fft_ref(signal_copy, axes=axes), - axes=axes, - exec_backend=exec_backend, - ) - except RuntimeError as e: - assert "cannot be specified when copying to non-contiguous" in str(e) - assert_eq(mem_backend, MemBackend.cpu) - assert_eq(framework, Framework.numpy) + fft = fft_fn( + signal, + axes=axes, + execution=exec_backend.nvname, + options={"result_layout": result_layout.value, "inplace": inplace}, + ) + assert_array_type(fft, framework, mem_backend, get_fft_dtype(dtype)) + assert_norm_close( + fft, + get_fft_ref(signal_copy, axes=axes), + axes=axes, + exec_backend=exec_backend, + ) @pytest.mark.parametrize( diff --git a/tests/nvmath_tests/fft/utils/check_helpers.py b/tests/nvmath_tests/fft/utils/check_helpers.py index c8d6b4e..f6e7021 100644 --- a/tests/nvmath_tests/fft/utils/check_helpers.py +++ b/tests/nvmath_tests/fft/utils/check_helpers.py @@ -27,6 +27,7 @@ get_fft_dtype, get_ifft_dtype, is_complex, + is_half, is_array, size_of, get_dtype_from_array, @@ -35,6 +36,9 @@ get_array_backend, ) +_torch_has_cuda = bool(torch and torch.cuda.is_available() and torch.cuda.device_count() > 0) + + _cufft_version = None @@ -64,6 +68,10 @@ def slice_r2c(fft, axes=None): return fft[slices] +def is_pow_2(extent): + return extent > 0 and (extent & (extent - 1) == 0) + + def get_numpy_fft_ref( sample: np.ndarray, axes=None, @@ -108,18 +116,23 @@ def get_torch_fft_ref( dtype = get_dtype_from_array(sample) mem_backend = get_array_backend(sample) - # torch does not implement half 
precision fft - if dtype == DType.complex32: - in_sample = sample.type(torch.complex64) - elif dtype == DType.float16: - in_sample = sample.type(torch.float32) + device = sample.device + + # torch fft does not support f16/c32 cpu tensors + # the gpu support is limited only to pows of 2 + if is_half(dtype): + if _torch_has_cuda and mem_backend == MemBackend.cpu and all(is_pow_2(e) for e in sample.shape): + in_sample = sample.to("cuda") + else: + assert dtype in [DType.float16, DType.complex32] + in_sample = sample.type(torch.float32 if dtype == DType.float16 else torch.complex64) else: in_sample = sample # Workaround for bug in torch CPU fftn that leads to a crash # for larger 3D ffts with interleaved batch if ( - mem_backend == MemBackend.cpu + get_array_backend(in_sample) == MemBackend.cpu and axes is not None and len(axes) >= 3 and len(sample.shape) > 3 @@ -133,11 +146,16 @@ def get_torch_fft_ref( if not is_complex(dtype): ref = slice_r2c(ref, axes=axes) - if dtype in [DType.complex32, DType.float16]: - assert_eq(ref.dtype, torch.complex64) - ref = ref.type(torch.complex32) + if is_half(dtype): + if _torch_has_cuda and mem_backend == MemBackend.cpu and all(is_pow_2(e) for e in sample.shape): + ref = ref.to(device) + else: + assert_eq(ref.dtype, torch.complex64) + ref = ref.type(torch.complex32) assert_eq(ref.dtype, get_framework_dtype(Framework.torch, get_fft_dtype(dtype))) + assert_eq(mem_backend, get_array_backend(ref)) + assert_eq(ref.device, device) return ref @@ -182,8 +200,8 @@ def get_ifft_ref( assert isinstance(axes, tuple | list) assert len(axes) last_axis = axes[-1] - out_last_axis_size = (2 * shape[last_axis] - 2) if last_axis_parity == "even" else (2 * shape[last_axis] - 1) - out_shape = tuple(e if a != last_axis else out_last_axis_size for a, e in enumerate(shape)) + out_last_axis_parity = (2 * shape[last_axis] - 2) if last_axis_parity == "even" else (2 * shape[last_axis] - 1) + out_shape = tuple(e if a != last_axis else out_last_axis_parity for a, e in enumerate(shape)) fft_out_shape = out_shape if axes is None else tuple(out_shape[a] for a in axes) if get_framework_from_array(sample) == Framework.numpy: @@ -198,8 +216,19 @@ def get_ifft_ref( fn = cp.fft.ifftn if is_c2c else cp.fft.irfftn ret = fn(sample, s=fft_out_shape, axes=axes, norm=norm) elif get_framework_from_array(sample) == Framework.torch: + # torch does not implement half precision fft + # (or the support is partial) + if dtype == DType.complex32: + in_sample = sample.type(torch.complex64) + else: + in_sample = sample fn = torch.fft.ifftn if is_c2c else torch.fft.irfftn - ret = fn(sample, s=fft_out_shape, norm=norm, dim=axes) + ret = fn(in_sample, s=fft_out_shape, norm=norm, dim=axes) + if dtype == DType.complex32: + ret_dtype = get_dtype_from_array(ret) + expected_ret_dtype = DType.complex64 if is_c2c else DType.float32 + assert ret_dtype == expected_ret_dtype + ret = ret.type(get_framework_dtype(Framework.torch, out_dtype)) else: raise ValueError(f"Unknown framework {get_framework_from_array(sample)}") @@ -304,12 +333,12 @@ def assert_all_close(a, b, rtol, atol): raise ValueError(f"Unknown array type {a}") -def get_ifft_c2r_options(out_type, last_axis_size): +def get_ifft_c2r_options(out_type, last_axis_parity): if is_complex(out_type): return {} return { "fft_type": "C2R", - "last_axis_size": "odd" if last_axis_size % 2 else "even", + "last_axis_parity": "odd" if last_axis_parity % 2 else "even", } @@ -443,10 +472,11 @@ def copy_array(a): if get_framework_from_array(a) != Framework.torch: return a.copy() 
else: - if not a.is_contiguous(): - return a.contiguous() - else: - return a.clone() + # torch contiguous or copy does not always + # enforce truly contiguous stride + t = torch.empty(a.shape, dtype=a.dtype, device=a.device) + t.copy_(a, non_blocking=True) + return t def get_permuted(sample: np.ndarray | CP_NDARRAY | TORCH_TENSOR, permutation): @@ -708,6 +738,4 @@ def extent_comprises_only_small_factors(extent): def has_only_small_factors(shape, axes=None): - return all( - extent_comprises_only_small_factors(extent) for a, extent in enumerate(shape) if axes is None or a in axes - ) + return all(extent_comprises_only_small_factors(extent) for a, extent in enumerate(shape) if axes is None or a in axes) diff --git a/tests/nvmath_tests/fft/utils/input_fixtures.py b/tests/nvmath_tests/fft/utils/input_fixtures.py index d771e94..bbb90d2 100644 --- a/tests/nvmath_tests/fft/utils/input_fixtures.py +++ b/tests/nvmath_tests/fft/utils/input_fixtures.py @@ -15,8 +15,10 @@ except ImportError: torch = None -from .common_axes import MemBackend, Framework, DType, ShapeKind -from .axes_utils import get_framework_dtype, is_complex +import pytest + +from .common_axes import MemBackend, Framework, DType, ShapeKind, OptFftType, OptFftLayout +from .axes_utils import get_framework_dtype, is_complex, c2r_dtype, size_of, get_fft_dtype def get_random_input_data( @@ -43,10 +45,22 @@ def _create_array(): if not is_complex(dtype): a = rng.uniform(lo, hi, size=shape).astype(framework_dtype) else: - real = rng.uniform(lo, hi, size=shape) - imag = rng.uniform(lo, hi, size=shape) - a = (real + 1j * imag).astype(framework_dtype) - assert a.dtype == framework_dtype + if len(shape) >= 32: # we can't go over the max supported dimension + real = rng.uniform(lo, hi, size=shape) + imag = rng.uniform(lo, hi, size=shape) + a = (real + 1j * imag).astype(framework_dtype) + else: + # add and squeeze the extra dimension to get the scalar case right + real_size = shape + (2,) + real_framework_dtype = get_framework_dtype(framework, c2r_dtype[dtype]) + a = ( + rng.uniform(lo, hi, size=real_size) + .astype(real_framework_dtype) + .view(dtype=framework_dtype) + .reshape(shape) + ) + assert a.dtype == framework_dtype, f"{a.dtype} vs {framework_dtype}" + assert a.shape == shape, f"{a.shape} vs {shape}" return a if mem_backend == MemBackend.cuda and device_id is not None: @@ -70,7 +84,7 @@ def _create_array(): shift = torch.tensor(lo, dtype=framework_dtype) else: shift = torch.tensor(lo + 1j * lo, dtype=framework_dtype) - t = t * scale + shift + t = t.mul_(scale).add_(shift) assert t.dtype == framework_dtype return t else: @@ -150,3 +164,112 @@ def get_primes_up_to(up_to): while c <= up_to: is_prime[c] = False c += k + + +@pytest.fixture +def fx_last_operand_layout(monkeypatch): + import nvmath + from .check_helpers import get_array_element_strides, get_raw_ptr + + _actual_init = nvmath.fft.FFT.__init__ + _actual_exec = nvmath.fft.FFT.execute + layouts = {} + ptrs = {} + + def wrapped_init(self, initial_operand, *args, **kwargs): + nonlocal layouts, ptrs + layouts["initial_operand"] = (tuple(initial_operand.shape), get_array_element_strides(initial_operand)) + ptrs["initial_operand"] = get_raw_ptr(initial_operand) + ret = _actual_init(self, initial_operand, *args, **kwargs) + layouts["operand"] = (self.operand.shape, self.operand.strides) + ptrs["operand"] = get_raw_ptr(self.operand.tensor) + assert self.operand_layout.shape == self.operand.shape + assert self.operand_layout.strides == self.operand.strides + if self.operand_backup is not None: 
+ layouts["operand_backup"] = (self.operand_backup.shape, self.operand_backup.strides) + ptrs["operand_backup"] = get_raw_ptr(self.operand_backup.tensor) + return ret + + monkeypatch.setattr(nvmath.fft.FFT, "__init__", wrapped_init) + + def wrapped_exec(self, *args, **kwargs): + ret = _actual_exec(self, *args, **kwargs) + layouts["result"] = (tuple(ret.shape), get_array_element_strides(ret)) + ptrs["result"] = get_raw_ptr(ret) + return ret + + monkeypatch.setattr(nvmath.fft.FFT, "execute", wrapped_exec) + + def stride_order(shape, stride): + return tuple(i for _, _, i in sorted(zip(stride, shape, range(len(shape)), strict=True))) + + def check_layouts(exec_backend, mem_backend, axes, result_layout, fft_type, is_dense, inplace): + initial_shape, initial_strides = layouts["initial_operand"] + if mem_backend == exec_backend.mem: + assert "operand_backup" not in layouts + else: + assert ptrs["initial_operand"] == ptrs["operand_backup"] + assert layouts["operand_backup"][0] == initial_shape + assert layouts["operand_backup"][1] == initial_strides + + assert layouts["operand"][0] == initial_shape + if fft_type != OptFftType.c2r and mem_backend == exec_backend.mem: + assert ptrs["initial_operand"] == ptrs["operand"] + else: + assert ptrs["initial_operand"] != ptrs["operand"] + if mem_backend == exec_backend.mem and (fft_type != OptFftType.c2r or is_dense): + # nvmath should keep the strides for dense (possibly permuted) tensors + assert layouts["operand"][1] == initial_strides + + if inplace: + assert ptrs["result"] == ptrs["initial_operand"] + assert layouts["result"] == layouts["initial_operand"] + else: + assert ptrs["result"] != ptrs["initial_operand"] + # the frameworks gpu<->cpu copy does not necessarily keep the layout + if mem_backend == exec_backend.mem: + if result_layout == OptFftLayout.natural: + initial_order = stride_order(*layouts["initial_operand"]) + res_order = stride_order(*layouts["result"]) + assert initial_order == res_order + else: + assert result_layout == OptFftLayout.optimized + res_layout = layouts["result"] + res_order = stride_order(*res_layout) + least_strided = res_order[: len(axes)] + assert sorted(axes) == sorted( + least_strided + ), f"{sorted(axes)} vs {sorted(least_strided)}: result_layout={res_layout}" + + return check_layouts, layouts, ptrs + + +def align_up(num_bytes, alignment): + return ((num_bytes + alignment - 1) // alignment) * alignment + + +def get_overaligned_view(alignment, framework, shape, dtype, mem_backend, seed): + from .check_helpers import get_raw_ptr, assert_array_type + + dtype_size = size_of(dtype) + assert alignment % dtype_size == 0 + innermost_extent = shape[-1] + offset_upperbound = alignment // dtype_size + base_innermost = innermost_extent + offset_upperbound + base_shape = list(shape) + base_shape[-1] = base_innermost + base_shape = tuple(base_shape) + a = get_random_input_data(framework, base_shape, dtype, mem_backend, seed) + base_ptr = get_raw_ptr(a) + complex_dtype = get_fft_dtype(dtype) + required_alignment = size_of(complex_dtype) + assert required_alignment % dtype_size == 0 + assert base_ptr % required_alignment == 0 + overaligned_offset = (-(base_ptr % -alignment)) // dtype_size + assert 0 <= overaligned_offset < offset_upperbound + slices = (slice(None),) * (len(shape) - 1) + (slice(overaligned_offset, overaligned_offset + innermost_extent),) + aligned_view = a[slices] + view_ptr = get_raw_ptr(aligned_view) + assert view_ptr % alignment == 0 + assert_array_type(aligned_view, framework, mem_backend, dtype) + return a, 
aligned_view diff --git a/tests/nvmath_tests/fft/utils/support_matrix.py b/tests/nvmath_tests/fft/utils/support_matrix.py index 028cde6..73d356b 100644 --- a/tests/nvmath_tests/fft/utils/support_matrix.py +++ b/tests/nvmath_tests/fft/utils/support_matrix.py @@ -69,9 +69,7 @@ framework_exec_type_support = { framework: { - exec_backend: [ - dtype for dtype in framework_type_support[framework] if dtype in exec_backend_type_support[exec_backend] - ] + exec_backend: [dtype for dtype in framework_type_support[framework] if dtype in exec_backend_type_support[exec_backend]] for exec_backend in ExecBackend } for framework in Framework @@ -146,9 +144,7 @@ def mem(self) -> list[MemBackend]: @functools.cached_property def framework_mem(self) -> dict[Framework, list[MemBackend]]: - return { - framework: [b for b in self.mem if b in framework_backend_support[framework]] for framework in Framework - } + return {framework: [b for b in self.mem if b in framework_backend_support[framework]] for framework in Framework} def __call__(self): return self.backends diff --git a/tests/nvmath_tests/helpers.py b/tests/nvmath_tests/helpers.py index 94880f4..3ab23d1 100644 --- a/tests/nvmath_tests/helpers.py +++ b/tests/nvmath_tests/helpers.py @@ -38,7 +38,7 @@ def time_cupy(fun, ncycles, *args): start.record(None) for _ in range(ncycles): - out = fun(*args) + out = fun(*args) # noqa: F841 stop.record(None) stop.synchronize() diff --git a/tests/nvmath_tests/linalg/advanced/matmul/test_epilog.py b/tests/nvmath_tests/linalg/advanced/matmul/test_epilog.py index adc050e..ba9fe94 100644 --- a/tests/nvmath_tests/linalg/advanced/matmul/test_epilog.py +++ b/tests/nvmath_tests/linalg/advanced/matmul/test_epilog.py @@ -6,16 +6,20 @@ This set of tests verifies the correctness of the epilog handling. 
""" -from nvmath.linalg.advanced import matmul, MatmulEpilog as Epilog +from nvmath.linalg.advanced import matmul, Matmul, MatmulEpilog as Epilog from nvmath.bindings import cublasLt as cublaslt import pytest +import random from .utils import * +import numpy as np try: from cupy import tanh, sqrt, pi, cosh except ModuleNotFoundError: pytest.skip("cupy required for matmul tests", allow_module_level=True) +rng = np.random.default_rng(12345) + def relu(x): y = x.copy() @@ -88,6 +92,38 @@ def simulate_epilog(a, b, epilog, epilog_inputs): a = cupy.asarray(a) b = cupy.asarray(b) + + a_batched, b_batched = len(a.shape) > 2, len(b.shape) > 2 + if a_batched or b_batched: + # For batched input, simulate on each batch element separately and then combine + results = [] + aux_checkers = {} + batch_size = a.shape[0] if a_batched else b.shape[0] + for i in range(batch_size): + a_slice = a[i] if a_batched else a + b_slice = b[i] if b_batched else b + epilog_inputs_slice = {k: v[i] if len(v.shape) > 2 else v for k, v in epilog_inputs.items()} + r, ac = simulate_epilog(a_slice, b_slice, epilog, epilog_inputs_slice) + results.append(r) + if ac: + for k, v in ac.items(): + aux_checkers[k] = aux_checkers.get(k, []) + aux_checkers[k].append(v) + + def check_all_aux(key): + checkers = aux_checkers[key] + + def check(aux): + if epilog in (Epilog.BGRADA, Epilog.BGRADB, Epilog.DRELU_BGRAD, Epilog.DGELU_BGRAD): + # Aux outputs of BGRAD were promoted to 3-D + assert len(aux.shape) == 3 and aux.shape[-1] == 1 + aux = aux[:, :, 0] # Remove the extra dimension + return all(checker(aux[i]) for i, checker in enumerate(checkers)) + + return check + + return cupy.stack(results), {k: check_all_aux(k) for k in aux_checkers} + epilog_inputs = {k: cupy.asarray(v) for k, v in epilog_inputs.items()} def drelu(mask): @@ -113,7 +149,7 @@ def simulate_drelu(a, b, mask): def simulate_dgelu(a, b, x): return (a @ b) * dgelu(x[: a.shape[0], : b.shape[1]]) - x = matmul(a, b) + x = cupy.matmul(a, b) if epilog == Epilog.RELU: return relu(x), None elif epilog == Epilog.GELU: @@ -124,9 +160,7 @@ def simulate_dgelu(a, b, x): and y.shape == (round_up(np.ceil(x.shape[0] / 8), 16), x.shape[1]) } elif epilog == Epilog.GELU_AUX: - return gelu(x), { - "gelu_aux": lambda y: compare_tensors(y[: x.shape[0]], x) and y.shape[0] == round_up(x.shape[0], 8) - } + return gelu(x), {"gelu_aux": lambda y: compare_tensors(y[: x.shape[0]], x) and y.shape[0] == round_up(x.shape[0], 8)} elif epilog == Epilog.BGRADA: return x, {"bgrada": lambda y: compare_tensors(y, a.sum(axis=1))} elif epilog == Epilog.BGRADB: @@ -164,18 +198,59 @@ def simulate_dgelu(a, b, x): raise AssertionError() +def execute_matmul(a, b, *, epilog=None, epilog_inputs=None, stateful=True, autotune=False): + if not stateful: + assert not autotune, "autotune=True requires stateful=True" + return matmul(a, b, epilog=epilog, epilog_inputs=epilog_inputs) + with Matmul(a, b) as mm: + mm.plan(epilog=epilog, epilog_inputs=epilog_inputs) + if autotune: + mm.autotune() + return mm.execute() + + +def make_matrix(shape, framework, use_cuda, transposed=False): + if transposed: + m = sample_matrix(framework, "float32", (*shape[:-2], shape[-1], shape[-2]), use_cuda=use_cuda) + m = get_framework(m).swapaxes(m, -1, -2) + return m + else: + return sample_matrix(framework, "float32", shape, use_cuda=use_cuda) + + @pytest.mark.parametrize("epilog", (*simple_epilogs, *epilogs_with_bias)) -@pytest.mark.parametrize("bias_shape", (lambda m: (m,), lambda m: (m, 1))) +@pytest.mark.parametrize("bias_extra_dim", (False, 
True)) @pytest.mark.parametrize("framework", ("torch", "numpy/cupy")) @pytest.mark.parametrize("use_cuda", (True, False)) @pytest.mark.parametrize("n,m,k", ((40, 50, 60), (1, 1, 1), (8, 16, 32), (65, 43, 21), (1, 2, 3), (3, 2, 1), (2, 1, 3))) -def test_epilogs(epilog, bias_shape, framework, n, m, k, use_cuda): +@pytest.mark.parametrize( + "a_batch,b_batch", + ( + (None, 3), + (5, None), + (4, 4), + (None, None), + (1, 1), + ), +) +def test_epilogs(epilog, bias_extra_dim, framework, n, m, k, use_cuda, a_batch, b_batch): + autotune = rng.choice((True, False)) if epilog == Epilog.BGRADB and m == 1: pytest.skip("BGRADB doesn't support m=1") if epilog == Epilog.BGRADA and n == 1: - # TODO: This is a temporary fix. If A has a singleton dimension we change it to COL order - # (see get_matrix_layout_traits), and COL order is not supported by BGRADA. + # TODO: This is a temporary fix. If A has a singleton dimension we change it to COL + # order (see get_matrix_layout_traits), and COL order is not supported by BGRADA. pytest.skip("BGRADA doesn't support n=1") + if epilog == Epilog.BGRADA and framework == "numpy/cupy" and not use_cuda and (a_batch is not None or b_batch is not None): + # BGRADA requires COL layout of each matrix in the batch. + # Also, one of the matrix dimensions needs to have stride one. + # Transfer to the GPU (which uses cupy.asarray under the hood) + # won't preserve such layout. + pytest.skip("It's not possible to create batched COL layout with numpy") + if bias_extra_dim and epilog not in epilogs_with_bias: + pytest.skip("bias_extra_dim=False is irrelevant for epilog without bias") + + bias_shape = (lambda m: (m, 1)) if bias_extra_dim else (lambda m: (m,)) if epilog in ( Epilog.GELU, @@ -192,15 +267,19 @@ def test_epilogs(epilog, bias_shape, framework, n, m, k, use_cuda): if epilog in (Epilog.BGRADA, Epilog.BGRADB): skip_if_cublas_before(111103) - def make_matrix(shape, transposed): - if transposed: - return sample_matrix(framework, "float32", tuple(reversed(shape)), use_cuda=use_cuda).T - else: - return sample_matrix(framework, "float32", shape, use_cuda=use_cuda) - a, b = ( - make_matrix((n, k), transposed=(epilog == Epilog.BGRADA)), - make_matrix((k, m), transposed=(epilog == Epilog.BGRADA)), + make_matrix( + (n, k) if a_batch is None else (a_batch, n, k), + transposed=(epilog == Epilog.BGRADA), + framework=framework, + use_cuda=use_cuda, + ), + make_matrix( + (k, m) if b_batch is None else (b_batch, k, m), + transposed=(epilog == Epilog.BGRADA), + framework=framework, + use_cuda=use_cuda, + ), ) bias_value = sample_matrix(framework, "float32", bias_shape(n), use_cuda=use_cuda) inputs = ( @@ -211,19 +290,29 @@ def make_matrix(shape, transposed): else {} ) reference, aux_checkers = simulate_epilog(a, b, epilog, inputs) + + if ( + cublaslt.get_version() < 11703 + and epilog in (Epilog.RELU_AUX, Epilog.GELU_AUX, Epilog.RELU_AUX_BIAS, Epilog.GELU_AUX_BIAS) + and (a_batch is not None or b_batch is not None) + ): + with pytest.raises(ValueError, match="supports batching in cublaslt >= 11703"): + execute_matmul(a, b, epilog=epilog, epilog_inputs=inputs, stateful=autotune, autotune=autotune) + return + if aux_checkers: if k == 1 and epilog in [Epilog.BGRADA, Epilog.BGRADB] and cublaslt.get_version() < 120304: with pytest.raises(ValueError, match="not supported"): - matmul(a, b, epilog=epilog, epilog_inputs=inputs) + execute_matmul(a, b, epilog=epilog, epilog_inputs=inputs, stateful=autotune, autotune=autotune) return - result, aux = matmul(a, b, epilog=epilog, epilog_inputs=inputs) 
- assert_tensors_equal(result, result) + result, aux = execute_matmul(a, b, epilog=epilog, epilog_inputs=inputs, stateful=autotune, autotune=autotune) + assert_tensors_equal(result, reference) assert aux.keys() == aux_checkers.keys() for k in aux: res, checker = aux[k], aux_checkers[k] assert checker(to_numpy(res)) else: - result = matmul(a, b, epilog=epilog, epilog_inputs=inputs) + result = execute_matmul(a, b, epilog=epilog, epilog_inputs=inputs, stateful=autotune, autotune=autotune) assert_tensors_equal(result, reference) @@ -240,25 +329,44 @@ def make_matrix(shape, transposed): @pytest.mark.parametrize( "n,m,k", ((41, 33, 29), (2, 2, 2), (64, 32, 16), (65, 43, 21), (4, 1, 2), (1, 1, 1), (9, 2, 1), (1, 2, 3)) ) -def test_d_epilogs(d_epilog, epilog, n, m, k, use_cuda): +@pytest.mark.parametrize("framework", ("numpy/cupy", "torch")) +@pytest.mark.parametrize( + "a_batch,b_batch", + ( + (None, 2), + (2, None), + (5, 5), + (None, None), + (1, 1), + ), +) +def test_d_epilogs(d_epilog, epilog, n, m, k, framework, use_cuda, a_batch, b_batch): + autotune = rng.choice((True, False)) skip_if_cublas_before(111103, message="DRELU/DGELU not supported") - a = sample_matrix("torch", "float32", (k, n), use_cuda=use_cuda).T - b = sample_matrix("torch", "float32", (m, k), use_cuda=use_cuda).T - bias_value = torch.rand((a.shape[0], 1)) - 0.5 - bias_value = bias_value.cuda() if use_cuda else bias_value - ab, aux = matmul(a, b, epilog=epilog, epilog_inputs={"bias": bias_value}) + a_shape = (a_batch, n, k) if a_batch is not None else (n, k) + b_shape = (b_batch, k, m) if b_batch is not None else (k, m) + + if "numpy" in framework and not use_cuda: + # Transfer to the GPU (which uses cupy.asarray under the hood) + # won't preserve such layout. + pytest.skip("It's not possible to create COL-order matrix with numpy") + + a = make_matrix(a_shape, use_cuda=use_cuda, framework=framework, transposed=True) + b = make_matrix(b_shape, use_cuda=use_cuda, framework=framework, transposed=True) + bias_value = make_matrix((a.shape[-2], 1), use_cuda=use_cuda, framework=framework) + ab, aux = execute_matmul(a, b, epilog=epilog, epilog_inputs={"bias": bias_value}, stateful=autotune, autotune=autotune) reference, aux_checkers = simulate_epilog(a, b, epilog=d_epilog, epilog_inputs=aux) if k == 1: with pytest.raises(ValueError, match="not supported"): - result, aux = matmul(a, b, epilog=d_epilog, epilog_inputs=aux) + result, aux = execute_matmul(a, b, epilog=d_epilog, epilog_inputs=aux, stateful=autotune, autotune=autotune) return if not aux_checkers: - result = matmul(a, b, epilog=d_epilog, epilog_inputs=aux) + result = execute_matmul(a, b, epilog=d_epilog, epilog_inputs=aux, stateful=autotune, autotune=autotune) else: - result, aux = matmul(a, b, epilog=d_epilog, epilog_inputs=aux) + result, aux = execute_matmul(a, b, epilog=d_epilog, epilog_inputs=aux, stateful=autotune, autotune=autotune) assert aux.keys() == aux_checkers.keys() for k in aux: assert aux_checkers[k](to_numpy(aux[k])) diff --git a/tests/nvmath_tests/linalg/advanced/matmul/test_input.py b/tests/nvmath_tests/linalg/advanced/matmul/test_input.py index c66dc46..ac58736 100644 --- a/tests/nvmath_tests/linalg/advanced/matmul/test_input.py +++ b/tests/nvmath_tests/linalg/advanced/matmul/test_input.py @@ -140,6 +140,86 @@ def sample_batch(batch_shape): assert_tensors_equal(result, a @ b + c) +@pytest.mark.parametrize("c_desc", (None, "M", "M1", "MN")) +@pytest.mark.parametrize("b_desc", ("K", "KN")) +@pytest.mark.parametrize("a_desc", ("K", "MK")) 
+@pytest.mark.parametrize("a_t", (True, False)) +@pytest.mark.parametrize("b_t", (True, False)) +@pytest.mark.parametrize("c_t", (True, False)) +@pytest.mark.parametrize("framework", ("numpy/cupy", "torch")) +@pytest.mark.parametrize("M,N,K", ((2, 3, 5),)) +@pytest.mark.parametrize("use_cuda", (True, False)) +def test_shape_promotion(a_desc, b_desc, c_desc, a_t, b_t, c_t, M, N, K, framework, use_cuda): + """ + Test shape promotion rules for 1D inputs + """ + + if "M" not in a_desc: + M = 1 + if "N" not in b_desc: + N = 1 + + def unpack_shape(shape_desc): + if shape_desc is None: + return None + shape_map = { + "N": N, + "M": M, + "K": K, + "1": 1, + } + return tuple(shape_map[c] for c in shape_desc) + + a_shape, b_shape, c_shape = unpack_shape(a_desc), unpack_shape(b_desc), unpack_shape(c_desc) + + def make_matrix(shape, transposed): + if transposed: + return sample_matrix(framework, "float32", tuple(reversed(shape)), use_cuda=use_cuda).T + else: + return sample_matrix(framework, "float32", shape, use_cuda=use_cuda) + + a = make_matrix(a_shape, a_t) + b = make_matrix(b_shape, b_t) + if c_desc: + c = make_matrix(c_shape, c_t) + with_c = True + else: + c = None + with_c = False + + a_promoted, b_promoted, c_promoted = a, b, c + + if len(a_shape) == 1: + # If argument a is 1-D, it is promoted to a matrix by prefixing 1 to its dimensions. + a_promoted = a_promoted.reshape(1, a_shape[0]) + + if len(b_shape) == 1: + # If argument b is 1-D, it is promoted to a matrix by appending 1 to its dimensions. + b_promoted = b_promoted.reshape(b_shape[0], 1) + + if with_c and len(c_shape) == 1: + c_promoted = c_promoted.reshape(c_shape[0], 1) + + if with_c and c_promoted.shape[-1] == 1: + # If a vector is provided or N = 1, the columns of c are broadcast for the addition. 
+ c_promoted = get_framework(c_promoted).stack([c_promoted[..., 0]] * N, -1) + + alpha = 0.12 + beta = 0.34 if with_c else None + result = matmul(a, b, c=c, alpha=alpha, beta=beta) + reference = matmul(a_promoted, b_promoted, c=c_promoted, alpha=alpha, beta=beta) + + if len(a_shape) == 1: + assert reference.shape[-2] == 1 + reference = reference.reshape((*reference.shape[:-2], reference.shape[-1])) + + if len(b_shape) == 1: + assert reference.shape[-1] == 1 + reference = reference.reshape(reference.shape[:-1]) + + assert_tensors_equal(result, reference) + + @pytest.mark.parametrize( "slices", ( @@ -302,7 +382,7 @@ def test_dtype_mismatch(framework, a_dtype, b_dtype, c_dtype): a = sample_matrix(framework, a_dtype, (2, 2), True) b = sample_matrix(framework, b_dtype, (2, 2), True) c = sample_matrix(framework, c_dtype, (2, 2), True) - except NotImplementedError as e: + except NotImplementedError: pytest.skip("Unable to generate matrix of this dtype") with pytest.raises(ValueError, match=r"The dtype of operands .* must be the same"): matmul(a, b, c, beta=1) diff --git a/tests/nvmath_tests/linalg/advanced/matmul/test_matmul_with_hypothesis.py b/tests/nvmath_tests/linalg/advanced/matmul/test_matmul_with_hypothesis.py index f8a64dc..63adf01 100644 --- a/tests/nvmath_tests/linalg/advanced/matmul/test_matmul_with_hypothesis.py +++ b/tests/nvmath_tests/linalg/advanced/matmul/test_matmul_with_hypothesis.py @@ -1,6 +1,7 @@ import collections import logging import re +import typing from hypothesis import given, settings, reproduce_failure, assume from hypothesis.extra.numpy import arrays, from_dtype @@ -123,6 +124,20 @@ def drelu(x, bitmask): def verify_result(a, b, c, result_c, alpha, beta, epilog, epilog_inputs): possible_dtype = CUBLAS_COMPUTE_TYPE_TO_NAME[NAME_TO_DEFAULT_COMPUTE_TYPE[str(a.dtype)]] compute_dtype = possible_dtype[1] if np.iscomplexobj(a) else possible_dtype[0] + + added_singleton_dimensions: list[int] = [] + if a.ndim == 1: + a = a[None, ...] + added_singleton_dimensions.append(0) + if b.ndim == 1: + b = b[..., None] + added_singleton_dimensions.append(1) + if c is not None and c.ndim == 1: + # nvmath and numpy have different broadcasting for `c`. nvmath assumes that a 1D `c` + # has length `M`. numpy assumes 1D `c` has length `N` to be consistent with + # broadcasting behavior where singleton dimensions are always prepended. 
+ c = c[..., None] + ab = ( np.matmul(a, b, dtype=compute_dtype) if alpha is None @@ -160,9 +175,10 @@ def verify_result(a, b, c, result_c, alpha, beta, epilog, epilog_inputs): if epilog == MatmulEpilog.RELU_AUX: assert verify_bitmask(abc >= 0, result_c[1]["relu_aux"]) if epilog == MatmulEpilog.RELU_AUX_BIAS: - assert verify_bitmask( - (abc + cp.asnumpy(epilog_inputs["bias"].astype(compute_dtype))) >= 0, result_c[1]["relu_aux"] - ) + assert verify_bitmask((abc + cp.asnumpy(epilog_inputs["bias"].astype(compute_dtype))) >= 0, result_c[1]["relu_aux"]) + + if added_singleton_dimensions: + ref_c = np.squeeze(ref_c, axis=tuple(added_singleton_dimensions)) result_c_ = result_c[0] if isinstance(result_c, tuple) else result_c compare_result(ref_c.astype(a.dtype), result_c_) @@ -205,13 +221,13 @@ def verify_result(a, b, c, result_c, alpha, beta, epilog, epilog_inputs): @composite def matrix_multiply_arrays(draw): - m = draw(problem_size_mnk) - n = draw(problem_size_mnk) + m = draw(one_of(none(), problem_size_mnk)) + n = draw(one_of(none(), problem_size_mnk)) k = draw(problem_size_mnk) ab_type = draw(sampled_from(ab_type_values)) # Generate data in range [0, 5] to match sample_matrix() from utils # Only non-negative reals to avoid catastrophic cancellation - element_properties = dict( + element_properties: dict[str, typing.Any] = dict( allow_infinity=False, allow_nan=False, allow_subnormal=False, @@ -223,14 +239,29 @@ def matrix_multiply_arrays(draw): # NOTE: It is unfeasible for hypothesis to explore a parameter space where # all elements of the input arrays are unique, so most of the time, arrays # contain just a few unique values - a = draw(arrays(dtype=ab_type, shape=(m, k), elements=element_properties)) - b = draw(arrays(dtype=ab_type, shape=(k, n), elements=element_properties)) - c = draw(one_of(none(), arrays(dtype=ab_type, shape=(m, draw(sampled_from([1, n]))), elements=element_properties))) + a = draw(arrays(dtype=ab_type, shape=(k,) if m is None else (m, k), elements=element_properties)) + b = draw(arrays(dtype=ab_type, shape=(k,) if n is None else (k, n), elements=element_properties)) + m_for_c = 1 if m is None else m + c = draw( + one_of( + none(), + arrays( + dtype=ab_type, + shape=(m_for_c,) + if n is None + else ( + m_for_c, + draw(sampled_from([1, n])), + ), + elements=element_properties, + ), + ) + ) beta = None if c is None else draw(from_dtype(dtype=np.dtype(ab_type), **element_properties)) alpha = draw(one_of(none(), from_dtype(dtype=np.dtype(ab_type), **element_properties))) epilogs = draw(sampled_from(MatmulEpilog_valid_pairs_list)) bias = ( - draw(arrays(dtype=ab_type, shape=(m, 1), elements=element_properties)) + draw(arrays(dtype=ab_type, shape=(m_for_c, 1), elements=element_properties)) if epilogs[0] in MatmulEpilog_BIAS_list else None ) @@ -241,10 +272,8 @@ def matrix_multiply_arrays(draw): # FIXME: We should also test broadcasting of c. i.e. 
when the shape of c is
     # (m, 1), but currently we are avoiding a bug where broadcasting doesn't
     # work on V100 and double precision
-    assume(ab_type != np.float64 or c is None or c.shape != (m, 1))
-    return MatmulInputs(
-        a=a, b=b, c=c, m=m, n=n, k=k, ab_type=ab_type, bias=bias, beta=beta, alpha=alpha, epilogs=epilogs
-    )
+    assume(ab_type != np.float64 or c is None or c.shape != (m_for_c, 1))
+    return MatmulInputs(a=a, b=b, c=c, m=m, n=n, k=k, ab_type=ab_type, bias=bias, beta=beta, alpha=alpha, epilogs=epilogs)
 
 
 @nvmath_seed()
@@ -287,9 +316,8 @@ def test_matmul(input_arrays, order, options, preferences):
 
     epilog_inputs = None if epilog is None else {}
-    if epilog is not None:
-        if epilog in MatmulEpilog_BIAS_list:
-            epilog_inputs["bias"] = cp.asarray(bias, order=order)
+    if epilog is not None and epilog in MatmulEpilog_BIAS_list:
+        epilog_inputs["bias"] = cp.asarray(bias, order=order)
 
     result_c = matmul(
         d_a,
@@ -331,33 +359,23 @@ def test_matmul(input_arrays, order, options, preferences):
             raise e
     except ValueError as e:
         # FIXME: Check for CUDA toolkit version 11
-        if re.search("K=1 is not supported for (BGRAD(A|B)|D(R|G)ELU) epilog", str(e)):
-            pass
-        elif "requires cublaslt >=" in str(e):
+        if re.search("K=1 is not supported for (BGRAD(A|B)|D(R|G)ELU) epilog", str(e)) or "requires cublaslt >=" in str(e):
            pass
        else:
            raise e
 
 
 problem_size = integers(min_value=0, max_value=256)
-f32_strategy = arrays(
-    np.float32, shape=tuples(problem_size, problem_size), elements=floats(min_value=1, max_value=2, width=32)
-)
-f64_strategy = arrays(
-    np.float64, shape=tuples(problem_size, problem_size), elements=floats(min_value=1, max_value=2, width=32)
-)
+f32_strategy = arrays(np.float32, shape=tuples(problem_size, problem_size), elements=floats(min_value=1, max_value=2, width=32))
+f64_strategy = arrays(np.float64, shape=tuples(problem_size, problem_size), elements=floats(min_value=1, max_value=2, width=32))
 c32_strategy = arrays(
     np.complex64, shape=tuples(problem_size, problem_size), elements=floats(min_value=1, max_value=2, width=32)
 )
 c64_strategy = arrays(
     np.complex128, shape=tuples(problem_size, problem_size), elements=floats(min_value=1, max_value=2, width=32)
 )
-f64_strategy = arrays(
-    np.float64, shape=tuples(problem_size, problem_size), elements=floats(min_value=1, max_value=2, width=64)
-)
-f16_strategy = arrays(
-    np.float16, shape=tuples(problem_size, problem_size), elements=floats(min_value=1, max_value=2, width=16)
-)
+f64_strategy = arrays(np.float64, shape=tuples(problem_size, problem_size), elements=floats(min_value=1, max_value=2, width=64))
+f16_strategy = arrays(np.float16, shape=tuples(problem_size, problem_size), elements=floats(min_value=1, max_value=2, width=16))
 
 
 options_blocking_values_negative = [True, False, "auto", "none"]
@@ -411,7 +429,8 @@ def generate_alpha_beta(value_type, value):
     ),
 )
 def test_matmul_negative(a, b, c, alpha_value, beta_value, epilog, epilog_inputs, options, preferences):
-    """Call nvmath.linalg.advanced.matmul() with invalid inputs; catch expected exceptions."""
+    """Call nvmath.linalg.advanced.matmul() with invalid inputs; catch expected
+    exceptions."""
     try:
         if c is not None and ((a.dtype != c.dtype) or (a.shape[0] != c.shape[0]) or (c.shape[1] != b.shape[1])):
             return
@@ -450,8 +469,8 @@ def test_matmul_negative(a, b, c, alpha_value, beta_value, epilog, epilog_inputs
         elif f"The dtype of operands A {a.dtype} and B {b.dtype} must be the same." in str(e):
             assert a.dtype != b.dtype
         elif (
-            f"The 'K' extent must match for the operands: K={a.shape[1]} in operand A is not equal to K={b.shape[0]} in operand B."
-            in str(e)
+            f"The 'K' extent must match for the operands: K={a.shape[1]} in operand A is not equal to K={b.shape[0]} "
+            "in operand B." in str(e)
         ):
             assert a.shape[1] != b.shape[0]
         elif re.search(
@@ -472,9 +491,7 @@ def test_matmul_negative(a, b, c, alpha_value, beta_value, epilog, epilog_inputs
             assert a.shape[0] == 0 or a.shape[1] == 0 or b.shape[0] == 0 or b.shape[1] == 0
         elif "The extents must be strictly positive" in str(e):
             assert (
-                any(e <= 0 for e in a.shape)
-                or any(e <= 0 for e in b.shape)
-                or (c is not None and any(e <= 0 for e in c.shape))
+                any(e <= 0 for e in a.shape) or any(e <= 0 for e in b.shape) or (c is not None and any(e <= 0 for e in c.shape))
             )
         elif "requires cublaslt >=" in str(e):
             from nvmath.bindings import cublasLt
@@ -486,8 +503,8 @@ def test_matmul_negative(a, b, c, alpha_value, beta_value, epilog, epilog_inputs
         if "an integer is required" in str(e):
             pass  # ignore error in nvmath.bindings.cublasLt.matrix_layout_destroy
         elif (
-            "The Matrix multiplication plan preferences must be provided as an object of type MatmulPlanPreferences or as a dict with valid Matrix multiplication plan preferences."
-            in str(e)
+            "The Matrix multiplication plan preferences must be provided as an object of type MatmulPlanPreferences "
+            "or as a dict with valid Matrix multiplication plan preferences." in str(e)
        ):
            assert not isinstance(preferences, MatmulPlanPreferences)
        elif "The allocator must be an object of type that fulfils the BaseCUDAMemoryManager protocol" in str(e):
@@ -497,7 +514,10 @@ def test_matmul_negative(a, b, c, alpha_value, beta_value, epilog, epilog_inputs
         else:
             raise e
     except cuBLASLtError as e:
-        if "CUBLAS_STATUS_NOT_SUPPORTED" in str(e):
+        if "NOT_SUPPORTED" in str(e) or "CUBLAS_STATUS_INVALID_VALUE" in str(e):
+            # Catch both not_supported and invalid value because some features
+            # which are only unsupported on certain devices are raised as invalid
+            # value in older libraries
             pass
         else:
             raise e
diff --git a/tests/nvmath_tests/linalg/advanced/matmul/test_options.py b/tests/nvmath_tests/linalg/advanced/matmul/test_options.py
index 61b20b5..1411949 100644
--- a/tests/nvmath_tests/linalg/advanced/matmul/test_options.py
+++ b/tests/nvmath_tests/linalg/advanced/matmul/test_options.py
@@ -8,7 +8,6 @@ from .utils import *
 
 import pytest
 import logging
-import pytest
 
 try:
     import cupy_backends.cuda
@@ -26,45 +25,105 @@
 """
 
 
-def check_matmul_with_options(size, options, use_cuda=False, dtype="float32"):
-    a = b = sample_matrix("numpy/cupy", dtype, (size, size), use_cuda)
-    result = matmul(a, b, alpha=0.42, options=options)
-    assert_tensors_equal(result, 0.42 * (a @ b))
+def check_matmul_with_options(size, options, use_cuda=False, dtype="float32", atol=None):
+    a = b = sample_matrix("numpy/cupy" if dtype != "bfloat16" else "torch", dtype, (size, size), use_cuda)
+    is_complex = "_C_" in str(options.scale_type) or (options.compute_type is None and "complex" in dtype)
+    alpha = 0.42 + 0.24j if is_complex else 0.42
+    result = matmul(a, b, alpha=alpha, options=options)
+    assert_tensors_equal(result, alpha * (a @ b), atol=atol)
     return result
 
 
+ct = cublas.ComputeType
+st = nvmath.CudaDataType
+
+
 @pytest.mark.parametrize(
-    "dtype,compute_type",
+    "dtype,compute_type,scale_type",
     (
-        ("float32", cublas.ComputeType.COMPUTE_32F),
-        ("float64", cublas.ComputeType.COMPUTE_64F),
-        ("float16", cublas.ComputeType.COMPUTE_32F),
+        # None specified
+        ("bfloat16", None, None),
+        ("float16", None, None),
+        ("float32", None, None),
+        ("float64", None, None),
+        ("complex64", None, None),
+        ("complex128", None, None),
+        # Only compute type specified
+        ("float16", ct.COMPUTE_16F, None),
+        ("float16", ct.COMPUTE_16F_PEDANTIC, None),
+        ("float16", ct.COMPUTE_32F, None),
+        ("float32", ct.COMPUTE_32F, None),
+        ("bfloat16", ct.COMPUTE_32F_PEDANTIC, None),
+        ("complex64", ct.COMPUTE_32F, None),
+        ("float16", ct.COMPUTE_32F_PEDANTIC, None),
+        ("float32", ct.COMPUTE_32F_PEDANTIC, None),
+        ("bfloat16", ct.COMPUTE_32F_PEDANTIC, None),
+        ("complex64", ct.COMPUTE_32F_PEDANTIC, None),
+        ("float32", ct.COMPUTE_32F_FAST_16F, None),
+        ("float32", ct.COMPUTE_32F_FAST_16BF, None),
+        ("float32", ct.COMPUTE_32F_FAST_TF32, None),
+        ("float64", ct.COMPUTE_64F, None),
+        ("float64", ct.COMPUTE_64F_PEDANTIC, None),
+        ("complex128", ct.COMPUTE_64F, None),
+        ("complex128", ct.COMPUTE_64F_PEDANTIC, None),
+        # Only scale type specified
+        ("float16", None, st.CUDA_R_16F),
+        ("float16", None, st.CUDA_R_32F),
+        ("bfloat16", None, st.CUDA_R_32F),
+        ("float32", None, st.CUDA_R_32F),
+        ("complex64", None, st.CUDA_C_32F),
+        ("float32", None, st.CUDA_R_32F),
+        ("float64", None, st.CUDA_R_64F),
+        ("complex128", None, st.CUDA_C_64F),
+        # Both compute and scale type specified
+        ("float16", ct.COMPUTE_16F, st.CUDA_R_16F),
+        ("float16", ct.COMPUTE_16F_PEDANTIC, st.CUDA_R_16F),
+        ("float16", ct.COMPUTE_32F, st.CUDA_R_32F),
+        ("bfloat16", ct.COMPUTE_32F, st.CUDA_R_32F),
+        ("float32", ct.COMPUTE_32F, st.CUDA_R_32F),
+        ("complex64", ct.COMPUTE_32F, st.CUDA_C_32F),
+        ("float16", ct.COMPUTE_32F_PEDANTIC, st.CUDA_R_32F),
+        ("bfloat16", ct.COMPUTE_32F_PEDANTIC, st.CUDA_R_32F),
+        ("float32", ct.COMPUTE_32F_PEDANTIC, st.CUDA_R_32F),
+        ("complex64", ct.COMPUTE_32F_PEDANTIC, st.CUDA_C_32F),
+        ("float32", ct.COMPUTE_32F_FAST_16F, st.CUDA_R_32F),
+        ("float32", ct.COMPUTE_32F_FAST_16BF, st.CUDA_R_32F),
+        ("float32", ct.COMPUTE_32F_FAST_TF32, st.CUDA_R_32F),
+        ("float64", ct.COMPUTE_64F, st.CUDA_R_64F),
+        ("float64", ct.COMPUTE_64F_PEDANTIC, st.CUDA_R_64F),
+        ("complex128", ct.COMPUTE_64F, st.CUDA_C_64F),
+        ("complex128", ct.COMPUTE_64F_PEDANTIC, st.CUDA_C_64F),
     ),
 )
-def test_compute_type(dtype, compute_type):
+def test_compute_scale_type(dtype, compute_type, scale_type):
     check_matmul_with_options(
-        5,
-        MatmulOptions(compute_type=compute_type),
+        2,
+        MatmulOptions(compute_type=compute_type, scale_type=scale_type),
         dtype=dtype,
         use_cuda=True,
+        atol=0.1,
     )
 
 
 @pytest.mark.parametrize(
-    "dtype,scale_type",
+    "dtype,compute_type,scale_type",
     (
-        ("float32", nvmath.CudaDataType.CUDA_R_32F),
-        ("float64", nvmath.CudaDataType.CUDA_R_64F),
-        ("float16", nvmath.CudaDataType.CUDA_R_32F),
+        ("float16", ct.COMPUTE_32F, st.CUDA_R_16F),
+        ("float32", ct.COMPUTE_16F, st.CUDA_R_32F),
+        ("float64", ct.COMPUTE_64F, st.CUDA_R_32F),
+        ("complex64", ct.COMPUTE_32F_PEDANTIC, st.CUDA_R_32F),
+        ("float64", ct.COMPUTE_32F_FAST_16F, st.CUDA_R_32F),
+        ("float16", ct.COMPUTE_32F_FAST_16BF, st.CUDA_R_32F),
     ),
 )
-def test_scale_type(dtype, scale_type):
-    check_matmul_with_options(
-        5,
-        MatmulOptions(scale_type=scale_type),
-        dtype=dtype,
-        use_cuda=True,
-    )
+def test_unsupported_compute_scale_type(dtype, compute_type, scale_type):
+    with pytest.raises(Exception, match="not supported|INVALID_VALUE|NOT_SUPPORTED"):
+        check_matmul_with_options(
+            2,
+            MatmulOptions(compute_type=compute_type, scale_type=scale_type),
+            dtype=dtype,
+            use_cuda=True,
+        )
 
 
 @pytest.mark.parametrize(
@@ -144,7 +203,7 @@ def test_memory_limit_filtering():
     a = b = sample_matrix("numpy/cupy", "float32", (1000, 1000), True)
 
     def get_memory_requirements(algos):
-        return [int(alg.algorithm["workspace_size"]) for alg in algos]
+        return [alg.algorithm.workspace_size for alg in algos]
 
     all_memory = get_memory_requirements(Matmul(a, b).plan())
 
@@ -228,7 +287,8 @@ def test_invalid_allocator():
 
 def test_uninstantiated_allocator():
     """
-    Tests if reasonable error is produced when an allocator class is provided instead of an instance
+    Tests if reasonable error is produced when an allocator class is provided instead of an
+    instance
     """
     from nvmath.memory import _TorchCUDAMemoryManager
 
diff --git a/tests/nvmath_tests/linalg/advanced/matmul/test_planning.py b/tests/nvmath_tests/linalg/advanced/matmul/test_planning.py
index ecbf7ac..e0c07a3 100644
--- a/tests/nvmath_tests/linalg/advanced/matmul/test_planning.py
+++ b/tests/nvmath_tests/linalg/advanced/matmul/test_planning.py
@@ -29,14 +29,26 @@
 @pytest.mark.parametrize("iterations", (1, 5))
 @pytest.mark.parametrize("prune", (1, 5, 9))
 @pytest.mark.parametrize("use_cuda", (True, False))
-def test_autotuning(framework, dtype, n, m, k, max_waves_count, iterations, prune, use_cuda):
+def test_autotuning(
+    framework,
+    dtype,
+    n,
+    m,
+    k,
+    max_waves_count,
+    iterations,
+    prune,
+    use_cuda,
+):
     a = sample_matrix(framework, dtype, (n, k), use_cuda)
     b = sample_matrix(framework, dtype, (k, m), use_cuda)
     c = sample_matrix(framework, dtype, (n, m), use_cuda)
     mm = Matmul(a, b, beta=0.7, c=c)
     with allow_cublas_unsupported(
         allow_invalid_value=False,
-        message=f"Unsupported configuration: {framework}-{dtype}-{n}-{m}-{k}-{max_waves_count}-{iterations}-{prune}-{use_cuda}.",
+        message=(
+            f"Unsupported configuration: {framework}-{dtype}-{n}-{m}-{k}-{max_waves_count}-{iterations}-{prune}-{use_cuda}."
+        ),
     ):
         mm.plan(preferences=MatmulPlanPreferences(limit=9, max_waves_count=max_waves_count))
         num_algorithms = len(mm.algorithms)
diff --git a/tests/nvmath_tests/linalg/advanced/matmul/test_reset.py b/tests/nvmath_tests/linalg/advanced/matmul/test_reset.py
index 3a516b5..edd479a 100644
--- a/tests/nvmath_tests/linalg/advanced/matmul/test_reset.py
+++ b/tests/nvmath_tests/linalg/advanced/matmul/test_reset.py
@@ -173,7 +173,8 @@ def test_shape_mismatch(
     use_cuda,
 ):
     """
-    Checks if resetting operands to ones of different shapes results in appropriate error message
+    Checks if resetting operands to ones of different shapes results in appropriate error
+    message
     """
     m, n, k = 54, 32, 10
     a = sample_matrix(framework, dtype, (m, k), use_cuda)
@@ -238,7 +239,8 @@ def test_dtype_mismatch(
     use_cuda,
 ):
     """
-    Checks if resetting operands to ones with different dtypes results in appropriate error message
+    Checks if resetting operands to ones with different dtypes results in appropriate error
+    message
     """
 
     m, n, k = 19, 28, 37
@@ -263,9 +265,7 @@ def test_dtype_mismatch(
     new_b = sample_matrix(framework, bad_dtype if b_mismatch else dtype, (k, n), use_cuda)
     new_c = sample_matrix(framework, bad_dtype if c_mismatch else dtype, (m, n), use_cuda) if with_c else None
     new_epilog_inputs = (
-        {"bias": sample_matrix(framework, bad_dtype if bias_mismatch else dtype, (m, 1), use_cuda)}
-        if with_epilog
-        else None
+        {"bias": sample_matrix(framework, bad_dtype if bias_mismatch else dtype, (m, 1), use_cuda)} if with_epilog else None
     )
 
     if any((a_mismatch, b_mismatch, c_mismatch, bias_mismatch)):
@@ -309,7 +309,8 @@ def test_framework_mismatch(
     use_cuda,
 ):
     """
-    Checks if resetting operands to ones from different framework results in appropriate error message
+    Checks if resetting operands to ones from different framework results in appropriate
+    error message
     """
 
     m, n, k = 10, 11, 12
@@ -390,8 +391,8 @@ def test_conjugate_flag(b_conj_init, b_conj_reset):
 
     Only checks GPU tensors, because conj flag is reset on H2D copy.
 
-    Only checks B, because changing conj flag of A requires transposing it due to cublas requirements,
-    which causes stride mismatch.
+    Only checks B, because changing conj flag of A requires transposing it due to cublas
+    requirements, which causes stride mismatch.
     """
     m, k, n = 3, 4, 5
 
diff --git a/tests/nvmath_tests/linalg/advanced/matmul/utils.py b/tests/nvmath_tests/linalg/advanced/matmul/utils.py
index 57a4f76..ca7db5e 100644
--- a/tests/nvmath_tests/linalg/advanced/matmul/utils.py
+++ b/tests/nvmath_tests/linalg/advanced/matmul/utils.py
@@ -72,6 +72,17 @@ def to_numpy(tensor):
         raise AssertionError()
 
 
+def get_framework(tensor):
+    if torch is not None and isinstance(tensor, torch.Tensor):
+        return torch
+    elif isinstance(tensor, cupy.ndarray):
+        return cupy
+    elif isinstance(tensor, np.ndarray):
+        return np
+    else:
+        raise AssertionError()
+
+
 def get_tolerance(value):
     eps = np.finfo(to_numpy(value).dtype).eps
     if torch is not None and value.dtype == torch.bfloat16:
@@ -79,15 +90,18 @@ def get_tolerance(value):
     return eps**0.5
 
 
-def compare_tensors(result, reference):
-    return np.allclose(to_numpy(result), to_numpy(reference), atol=get_tolerance(result))
+def compare_tensors(result, reference, atol=None):
+    if atol is None:
+        atol = get_tolerance(result)
+    return np.allclose(to_numpy(result), to_numpy(reference), atol=atol)
 
 
-def assert_tensors_equal(result, reference):
+def assert_tensors_equal(result, reference, atol=None):
     """
     Checks if result is close to the provided numpy reference.
     """
-    ok = compare_tensors(result, reference)
+    assert result is not reference, "same object passed as `result` and `reference`!"
+    ok = compare_tensors(result, reference, atol=atol)
     if not ok:
         print("Result:\n", result)
         print("Reference:\n", reference)
@@ -127,7 +141,6 @@ def __enter__(self):
         pass
 
     def __exit__(self, exc_type, exc_value, exc_traceback):
-        if exc_type is nvmath.bindings.cublasLt.cuBLASLtError:
-            if re.search(self.regex, str(exc_value)):
-                return skip_if_cublas_before(self.unsupported_before, self.message)
+        if exc_type is nvmath.bindings.cublasLt.cuBLASLtError and re.search(self.regex, str(exc_value)):
+            return skip_if_cublas_before(self.unsupported_before, self.message)
         return False