diff --git a/.gitattributes b/.gitattributes
index 1255c68cb..892050179 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -21,3 +21,4 @@
*.mp4 filter=lfs diff=lfs merge=lfs -text
*.a filter=lfs diff=lfs merge=lfs -text
*.hdf5 filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md
index 540810c2f..359a414ff 100644
--- a/.github/ISSUE_TEMPLATE/bug_report.md
+++ b/.github/ISSUE_TEMPLATE/bug_report.md
@@ -7,6 +7,22 @@ assignees: ''
---
+[//]: # "SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved."
+[//]: # "SPDX-License-Identifier: Apache-2.0"
+[//]: # ""
+[//]: # "Licensed under the Apache License, Version 2.0 (the 'License');"
+[//]: # "you may not use this file except in compliance with the License."
+[//]: # "You may obtain a copy of the License at"
+[//]: # "http://www.apache.org/licenses/LICENSE-2.0"
+[//]: # ""
+[//]: # "Unless required by applicable law or agreed to in writing, software"
+[//]: # "distributed under the License is distributed on an 'AS IS' BASIS"
+[//]: # "WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied."
+[//]: # "See the License for the specific language governing permissions and"
+[//]: # "limitations under the License."
+
+
+
**Describe the bug**
A clear and concise description of the bug.
diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md
index d4b540a76..ffd08012b 100644
--- a/.github/ISSUE_TEMPLATE/feature_request.md
+++ b/.github/ISSUE_TEMPLATE/feature_request.md
@@ -7,6 +7,22 @@ assignees: ''
---
+[//]: # "SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved."
+[//]: # "SPDX-License-Identifier: Apache-2.0"
+[//]: # ""
+[//]: # "Licensed under the Apache License, Version 2.0 (the 'License');"
+[//]: # "you may not use this file except in compliance with the License."
+[//]: # "You may obtain a copy of the License at"
+[//]: # "http://www.apache.org/licenses/LICENSE-2.0"
+[//]: # ""
+[//]: # "Unless required by applicable law or agreed to in writing, software"
+[//]: # "distributed under the License is distributed on an 'AS IS' BASIS"
+[//]: # "WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied."
+[//]: # "See the License for the specific language governing permissions and"
+[//]: # "limitations under the License."
+
+
+
**Is your feature request related to a problem? Please describe.**
A clear and concise description of the problem. Ex. I wish I could use CV-CUDA to do [...]
diff --git a/.github/ISSUE_TEMPLATE/submit-question.md b/.github/ISSUE_TEMPLATE/submit-question.md
index 72b2b74c5..6900ea6b2 100644
--- a/.github/ISSUE_TEMPLATE/submit-question.md
+++ b/.github/ISSUE_TEMPLATE/submit-question.md
@@ -7,4 +7,20 @@ assignees: ''
---
+[//]: # "SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved."
+[//]: # "SPDX-License-Identifier: Apache-2.0"
+[//]: # ""
+[//]: # "Licensed under the Apache License, Version 2.0 (the 'License');"
+[//]: # "you may not use this file except in compliance with the License."
+[//]: # "You may obtain a copy of the License at"
+[//]: # "http://www.apache.org/licenses/LICENSE-2.0"
+[//]: # ""
+[//]: # "Unless required by applicable law or agreed to in writing, software"
+[//]: # "distributed under the License is distributed on an 'AS IS' BASIS"
+[//]: # "WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied."
+[//]: # "See the License for the specific language governing permissions and"
+[//]: # "limitations under the License."
+
+
+
**What is your question?**
diff --git a/.gitignore b/.gitignore
index 718aa64f1..4b0a6a14c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -18,6 +18,7 @@
/build/
/build-*/
/install/
+/cvcuda-installer*/
# Visual Studio Code
# ------------------
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index f0db22406..f5197e2ed 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -72,8 +72,8 @@ repos:
- id: copyright_check
name: 'check copyright message'
language: system
- types: ['file', 'text']
- exclude_types: ['markdown', 'xml', 'json', 'csv']
+ types: ['file', 'text', 'markdown']
+ exclude_types: ['xml', 'json', 'csv']
entry: ./lint/copyright_check.sh
exclude: 'models/.*'
- id: lfs_check
@@ -83,7 +83,7 @@ repos:
require_serial: true
- repo: https://github.com/alessandrojcm/commitlint-pre-commit-hook
- rev: v9.0.0
+ rev: v9.13.0
hooks:
- id: commitlint
stages: [commit-msg]
diff --git a/CMakeLists.txt b/CMakeLists.txt
index fccd9c7eb..0f98aedef 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -23,7 +23,7 @@ endif()
project(cvcuda
LANGUAGES C CXX
- VERSION 0.6.0
+ VERSION 0.7.0
DESCRIPTION "CUDA-accelerated Computer Vision algorithms"
)
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index d21011b97..37852a875 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -16,7 +16,7 @@
# Contributing to CV-CUDA
-**As of release v0.6.0-beta, CV-CUDA is not accepting outside contribution.**
+**As of release v0.7.0-beta, CV-CUDA is not accepting outside contribution.**
Contributions to CV-CUDA fall into the following categories:
@@ -28,7 +28,7 @@ Contributions to CV-CUDA fall into the following categories:
1. To propose a new feature, please file a new feature request
[issue](https://github.com/CVCUDA/CV-CUDA/issues/new/choose). Describe the
intended feature and discuss the design and implementation with the team and
- community. NOTE: Currently, as of release v0.6.0-beta, CV-CUDA is not accepting
+ community. NOTE: Currently, as of release v0.7.0-beta, CV-CUDA is not accepting
outside contribution.
1. To ask a general question, please submit a question
[issue](https://github.com/CVCUDA/CV-CUDA/issues/new/choose). If you need
diff --git a/DEVELOPER_GUIDE.md b/DEVELOPER_GUIDE.md
index a5f4bec53..83e42f22c 100644
--- a/DEVELOPER_GUIDE.md
+++ b/DEVELOPER_GUIDE.md
@@ -30,7 +30,7 @@ CV-CUDA includes:
- C, C++, and Python APIs
- Batching support, with variable shape images
- Zero-copy interfaces to PyTorch
-- Sample applications: object classification and image segmentation
+- Sample applications
## What Pre- and Post-Processing Operators Are Included?
@@ -40,7 +40,7 @@ CV-CUDA includes:
| Advanced Color Format Conversions | Performs color conversion from interleaved RGB/BGR <-> YUV/YVU and semi planar. Supported standards: BT.601. BT.709. BT.2020 |
| AverageBlur | Reduces image noise using an average filter |
| BilateralFilter | Reduces image noise while preserving strong edges |
-| Bounding Box | Draws an rectangular border using the X-Y coordinates and dimensions typically to define the location and size of an object in an image |
+| Bounding Box | Draws a rectangular border using the X-Y coordinates and dimensions typically to define the location and size of an object in an image |
| Box Blurring | Overlays a blurred rectangle using the X-Y coordinates and dimensions that define the location and size of an object in an image |
| Brightness_Contrast | Adjusts brightness and contrast of an image |
| CenterCrop | Crops an image at its center |
@@ -53,8 +53,6 @@ CV-CUDA includes:
| CvtColor | Converts an image from one color space to another |
| DataTypeConvert | Converts an image’s data type, with optional scaling |
| Erase | Erases image regions |
-| Find Contours | Extract closed contours from an input binary image |
-| FindHomography | Calculates a perspective transform from four pairs of the corresponding points |
| Flip | Flips a 2D image around its axis |
| GammaContrast | Adjusts image contrast |
| Gaussian | Applies a gaussian blur filter to the image |
@@ -70,9 +68,9 @@ CV-CUDA includes:
| MinArea Rect | Finds the minimum area rotated rectangle typically used to draw bounding rectangle with minimum area |
| MinMaxLoc | Finds the maximum and minimum values in a given array |
| Morphology | Performs morphological erode and dilate transformations |
-| Morphology (close) | Performs morphological operation that involves dilation followed by erosion on an image |
-| Morphology (open) | Performs morphological operation that involves erosion followed by dilation on an image |
-| Non-max Suppression | Enables selecting a single entity out of many overlapping ones typically used for selecting from multiple bounding boxes during object detection |
+| Morphology (close) | Performs a morphological operation that involves dilation followed by erosion on an image |
+| Morphology (open) | Performs a morphological operation that involves erosion followed by dilation on an image |
+| Non-Maximum Suppression | Enables selecting a single entity out of many overlapping ones typically used for selecting from multiple bounding boxes during object detection |
| Normalize | Normalizes an image pixel’s range |
| OSD (Polyline Line Text Rotated Rect Segmented Mask) | Displays an overlay on the image of different forms including polyline line text rotated rectangle segmented mask |
| PadStack | Stacks several images into a tensor with border extension |
@@ -83,20 +81,19 @@ CV-CUDA includes:
| Remap | Maps pixels in an image with one projection to another projection in a new image. |
| Resize | Changes the size and scale of an image |
| Rotate | Rotates a 2D array in multiples of 90 degrees |
-| SIFT | Identifies and matches features in images that are invariant to scale rotation and affine distortion. |
-| Stack | Concatenates two input tensors into a single output tensor |
+| SIFT | Identifies and describes features in images that are invariant to scale, rotation, and affine distortion. |
| Thresholding | Chooses a global threshold value that is the same for all pixels across the image. |
| WarpAffine | Applies an affine transformation to an image |
| WarpPerspective | Applies a perspective transformation to an image |
## Where Are the Release Notes?
-An awesome product requires excellent support. CV-CUDA release notes can be
+CV-CUDA release notes can be
found [here](https://github.com/CVCUDA/CV-CUDA/releases)
## Where Can I Get Help?
-File requests for enhancements and bug reports
+An awesome product requires excellent support. File requests for enhancements and bug reports
[here](https://github.com/CVCUDA/CV-CUDA/issues/new/choose).
We are providing limited, direct support to select enterprises using CV-CUDA.
@@ -208,5 +205,5 @@ companies with which they are associated.
Copyright
-© 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+© 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
diff --git a/README.md b/README.md
index 31ae4466b..4eaf478d2 100644
--- a/README.md
+++ b/README.md
@@ -15,9 +15,10 @@
# CV-CUDA
+
[![License](https://img.shields.io/badge/License-Apache_2.0-yellogreen.svg)](https://opensource.org/licenses/Apache-2.0)
-![Version](https://img.shields.io/badge/Version-v0.6.0--beta-blue)
+![Version](https://img.shields.io/badge/Version-v0.7.0--beta-blue)
![Platform](https://img.shields.io/badge/Platform-linux--64_%7C_win--64_wsl2%7C_aarch64-gray)
@@ -33,7 +34,7 @@ efficient pre- and post-processing pipelines. CV-CUDA originated as a
collaborative effort between [NVIDIA][NVIDIA Develop] and [ByteDance][ByteDance].
Refer to our [Developer Guide](DEVELOPER_GUIDE.md) for more information on the
-operators available as of release v0.6.0-beta.
+operators available.
## Getting Started
@@ -43,10 +44,10 @@ To get a local copy up and running follow these steps.
|CV-CUDA Build|Platform|CUDA Version|CUDA Compute Capability|Hardware Architectures|Nvidia Driver|Python Versions|Supported Compilers (build from source)|API compatibility with prebuilt binaries|OS/Linux distributions tested with prebuilt packages|
|-|-|-|-|-|-|-|-|-|-|
-|x86_64_cu11|x86_64|11.7 or later|SM7 and later|Volta, Turing, Amper, Hopper, Ada Lovelace|r520 or later*** |3.8, 3.9, 3.10, 3.11|gcc>=9*<br>gcc>=11**|gcc>=9|Ubuntu>= 20.04<br>WSL2/Ubuntu>=20.04|
-|x86_64_cu12|x86_64|12.2 or later|SM7 and later|Volta, Turing, Amper, Hopper, Ada Lovelace|r520 or later***|3.8, 3.9, 3.10, 3.11|gcc>=9*<br>gcc>=11**|gcc>=9|Ubuntu>= 20.04<br>WSL2/Ubuntu>=20.04|
-|aarch64_cu11 (JetPack 5.1)|aarch64|11.4|SM7 and later|Jetson AGX Orin|JetPack 5.1|3.8|gcc>=9*<br>gcc>=11**|gcc>=9|Jetson Linux 35.x|
-|aarch64_cu12 (JetPack 6.0)|aarch64|12.2|SM7 and later|Jetson AGX Orin|JetPack 6.0 DP|3.10|gcc>=9*<br>gcc>=11**|gcc>=9|Jetson Linux 36.2|
+|x86_64_cu11|x86_64|11.7 or later|SM7 and later|Volta, Turing, Ampere, Hopper, Ada Lovelace|r525 or later*** |3.8, 3.9, 3.10, 3.11|gcc>=9*<br>gcc>=11**|gcc>=9|Ubuntu>= 20.04<br>WSL2/Ubuntu>=20.04|
+|x86_64_cu12|x86_64|12.2 or later|SM7 and later|Volta, Turing, Ampere, Hopper, Ada Lovelace|r525 or later***|3.8, 3.9, 3.10, 3.11|gcc>=9*<br>gcc>=11**|gcc>=9|Ubuntu>= 20.04<br>WSL2/Ubuntu>=20.04|
+|aarch64_cu11|aarch64|11.4|SM7 and later|Jetson AGX Orin|JetPack 5.1|3.8|gcc>=9*<br>gcc>=11**|gcc>=9|Jetson Linux 35.x|
+|aarch64_cu12|aarch64|12.2|SM7 and later|Jetson AGX Orin, IGX Orin + Ampere RTX6000, IGX Orin + ADA RTX6000|JetPack 6.0 DP, r535 (IGX OS v0.6)|3.10|gcc>=9*<br>gcc>=11**|gcc>=9|Jetson Linux 36.2<br>IGX OS v0.6|
\* partial build, no test module (see Known Limitations)
\** full build, including test module
@@ -58,7 +59,7 @@ To get a local copy up and running follow these steps.
- The C++ test module cannot build with gcc<11 (requires specific C++-20 features). With gcc-9 or gcc-10, please build with option `-DBUILD_TESTS=0`
- [CV-CUDA Samples] require driver r535 or later to run and are only officially supported with CUDA 12.
- Only one CUDA version (CUDA 11.x or CUDA 12.x) of CV-CUDA packages (Debian packages, tarballs, Python Wheels) can be installed at a time. Please uninstall all packages from a given CUDA version before installing packages from a different version.
-- Test tarballs (cvcuda-tests-*.tar.xz) need to be unpacked at the root level to find existing tests.
+- Documentation built with older toolchains (doxygen, sphinx, breathe, exhale) may be incomplete. We recommend using Ubuntu 22.04 or later.
### Installation
@@ -66,211 +67,189 @@ For convenience, we provide pre-built packages for various combinations of CUDA
The following steps describe how to install CV-CUDA from such pre-built packages.
We support two main alternative pathways:
-- DEB or Tar archive installation (C++/CUDA Libraries, Headers, Python bindings)
- Standalone Python Wheels (containing C++/CUDA Libraries and Python bindings)
+- DEB or Tar archive installation (C++/CUDA Libraries, Headers, Python bindings)
Choose the installation method that meets your environment needs.
-#### Tar File Installation
+#### Python Wheel File Installation
-- Installation of C++/CUDA libraries (cvcuda-lib*) and development headers (cvcuda-dev*):
-```shell
-tar -xvf cvcuda-lib-0.6.0_beta-<cu_ver>-<arch>-linux.tar.xz
-tar -xvf cvcuda-dev-0.6.0_beta-<cu_ver>-<arch>-linux.tar.xz
-```
-- Installation of Python bindings (cvcuda-python*)
-```shell
-tar -xvf cvcuda-python<py_ver>-0.6.0_beta-<cu_ver>-<arch>-linux.tar.xz
-```
-with `<cu_ver>` the desired CUDA version,
-`<py_ver>` the desired Python version and
-`<arch>` the desired architecture
+Download the appropriate .whl file for your computer architecture, Python and CUDA version from the release assets of the current CV-CUDA release. Release information of all CV-CUDA releases can be found [here][CV-CUDA GitHub Releases]. Once downloaded, execute the `pip install` command to install the Python wheel. For example:
+ ```shell
+ pip install cvcuda_<cu_ver>-0.7.0b0-cp<py_ver>-cp<py_ver>-linux_<arch>.whl
+ ```
+
+where `<cu_ver>` is the desired CUDA version, `<py_ver>` is the desired Python version and `<arch>` is the desired architecture.
+
+Please note that the Python wheels are standalone: they include both the C++/CUDA libraries and the Python bindings.
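As a quick post-install check, the wheel can be imported directly from Python. A minimal sketch; the `__version__` attribute is an assumption here, not a documented guarantee:

```python
# Smoke test for a freshly installed CV-CUDA wheel.
import cvcuda

# Assumption: the module exposes a __version__ string (e.g. "0.7.0").
print(cvcuda.__version__)
```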
#### DEB File Installation
-- Installation of C++/CUDA libraries (cvcuda-lib*) and development headers (cvcuda-dev*):
+Install C++/CUDA libraries (cvcuda-lib*) and development headers (cvcuda-dev*) using `apt`:
```shell
-sudo apt-get install -y ./cvcuda-lib-0.6.0_beta-<cu_ver>-<arch>-linux.deb ./cvcuda-dev-0.6.0_beta-<cu_ver>-<arch>-linux.deb
+apt install -y ./cvcuda-lib-<x.x.x>-<cu_ver>-<arch>-linux.deb ./cvcuda-dev-<x.x.x>-<cu_ver>-<arch>-linux.deb
```
-- Installation of Python bindings (cvcuda-python*)
+
+Install Python bindings (cvcuda-python*) using `apt`:
```shell
-sudo apt-get install -y cvcuda-python<py_ver>-0.6.0_beta-<cu_ver>-<arch>-linux.deb
+apt install -y ./cvcuda-python<py_ver>-<x.x.x>-<cu_ver>-<arch>-linux.deb
```
-with `<cu_ver>` the desired CUDA version,
-`<py_ver>` the desired Python version and
-`<arch>` the desired architecture
-
-#### Python Wheel File Installation
-
-
-Download the appropriate .whl file for your computer architecture, Python and CUDA version from the release assets of current CV-CUDA release. Release information of all CV-CUDA releases can be accessed [here][CV-CUDA GitHub Releases]. Once downloaded, execute the `pip install` command to install the Python wheel. For example:
+where `<cu_ver>` is the desired CUDA version, `<py_ver>` is the desired Python version and `<arch>` is the desired architecture.
+#### Tar File Installation
+Install C++/CUDA libraries (cvcuda-lib*) and development headers (cvcuda-dev*):
```shell
-pip install cvcuda_<cu_ver>-0.6.0b0-cp<py_ver>-cp<py_ver>-linux_<arch>.whl
+tar -xvf cvcuda-lib-<x.x.x>-<cu_ver>-<arch>-linux.tar.xz
+tar -xvf cvcuda-dev-<x.x.x>-<cu_ver>-<arch>-linux.tar.xz
```
-with `<cu_ver>` the desired CUDA version,
-`<py_ver>` the desired Python version and
-`<arch>` the desired architecture
-
-Please note that the Python wheels provided are standalone, they include both the C++/CUDA libraries and the Python bindings.
+Install Python bindings (cvcuda-python*)
+```shell
+tar -xvf cvcuda-python<py_ver>-<x.x.x>-<cu_ver>-<arch>-linux.tar.xz
+```
+where `<cu_ver>` is the desired CUDA version, `<py_ver>` is the desired Python version and `<arch>` is the desired architecture.
### Build from Source
Follow these instructions to build CV-CUDA from source:
-1. Set up your local CV-CUDA repository
-
- a. Install prerequisites needed to setup up the repository.
-
- On Ubuntu >= 20.04, install the following packages:
- - git-lfs: to retrieve binary files from remote repository
-
- ```shell
- sudo apt-get install -y git git-lfs
- ```
-
- b. After cloning the repository (assuming it was cloned in `~/cvcuda`),
- it needs to be properly configured by running the `init_repo.sh` script only once.
-
- ```shell
- cd ~/cvcuda
- ./init_repo.sh
- ```
-
-2. Build CV-CUDA
-
- a. Install the dependencies required for building CV-CUDA
-
- On Ubuntu >= 20.04, install the following packages:
- - g++-11: compiler to be used
- - cmake (>= 3.20), ninja-build (optional): manage build rules
- - python3-dev: for python bindings
- - libssl-dev: needed by the testsuite (MD5 hashing utilities)
-
- ```shell
- sudo apt-get install -y g++-11 cmake ninja-build python3-dev libssl-dev
- ```
-
- For CUDA Toolkit, any version of the 11.x or 12.x series should work.
- CV-CUDA was tested with 11.7 and 12.2, thus those should be preferred.
-
- ```shell
- sudo apt-get install -y cuda-11-7
- # or
- sudo apt-get install -y cuda-12-2
- ```
-
- b. Build the project
+#### 1. Set up your local CV-CUDA repository
- ```shell
- ci/build.sh [release|debug] [output build tree path] [-DBUILD_TESTS=1|0] [-DPYTHON_VERSIONS='3.8;3.9;3.10;3.11'] [-DPUBLIC_API_COMPILERS='gcc-9;gcc-11;clang-11;clang-14']
- ```
+Install the dependencies needed to set up the repository:
+- git
+- git-lfs: to retrieve binary files from remote repository
- The default build type is 'release'.
-
- If output build tree path isn't specified, it will be `build-rel` for release
- builds, and `build-deb` for debug.
-
- The library is in `build-rel/lib` and executables (tests, etc...) are in `build-rel/bin`.
+On Ubuntu >= 20.04, install the following packages using `apt`:
+```shell
+apt install -y git git-lfs
+```
- The `-DBUILD_TESTS` option can be used to disable/enable building the tests (enabled by default, see Known Limitations).
+Clone the repository:
+```shell
+git clone https://github.com/CVCUDA/CV-CUDA.git
+```
- The `-DPYTHON_VERSIONS` option can be used to select Python versions to build bindings and Wheels for.
- By default, only the default system Python3 version will be selected.
+Assuming the repository was cloned in `~/cvcuda`, it needs to be properly configured by running the `init_repo.sh` script only once.
- The `-DPUBLIC_API_COMPILERS` option can be used to select the compilers used to check public API compatibility.
- By default, gcc-11, gcc-9, clang-11, and clang-14 is tried to be selected and checked.
+```shell
+cd ~/cvcuda
+./init_repo.sh
+```
-3. Build Documentation
+#### 2. Build CV-CUDA
- a. Install the dependencies required for building the documentation
+Install the dependencies required to build CV-CUDA:
+- g++-11: compiler to be used
+- cmake (>= 3.20), ninja-build (optional): manage build rules
+- python3-dev: for python bindings
+- libssl-dev: needed by the testsuite (MD5 hashing utilities)
+- CUDA toolkit
- On Ubuntu >= 20.04, install the following packages:
- - doxygen: parse header files for reference documentation
- - python3, python3-pip: to install some python packages needed
- - sphinx, breathe, exhale, recommonmark, graphiviz: to render the documentation
- - sphinx-rtd-theme: documenation theme used
+On Ubuntu >= 20.04, install the following packages using `apt`:
+```shell
+apt install -y g++-11 cmake ninja-build python3-dev libssl-dev
+```
- ```shell
- sudo apt-get install -y doxygen graphviz python3 python3-pip
- sudo python3 -m pip install sphinx==4.5.0 breathe exhale recommonmark graphviz sphinx-rtd-theme
- ```
+Any version of the 11.x or 12.x CUDA toolkit should work.
+CV-CUDA was tested with 11.7 and 12.2, so these versions are recommended.
- b. Build the documentation
- ```shell
- ci/build_docs.sh [build folder]
- ```
+```shell
+apt install -y cuda-11-7
+# or
+apt install -y cuda-12-2
+```
- Example:
- `ci/build_docs.sh build_docs`
+Build the project:
+```shell
+ci/build.sh [release|debug] [output build tree path] [-DBUILD_TESTS=1|0] [-DPYTHON_VERSIONS='3.8;3.9;3.10;3.11'] [-DPUBLIC_API_COMPILERS='gcc-9;gcc-11;clang-11;clang-14']
+```
-4. Build and run Samples
+- The default build type is 'release'.
+- If output build tree path isn't specified, it will be `build-rel` for release
+ builds, and `build-deb` for debug.
+- The library is in `build-rel/lib` and executables (tests, etc...) are in `build-rel/bin`.
+- The `-DBUILD_TESTS` option can be used to disable/enable building the tests (enabled by default, see Known Limitations).
+- The `-DPYTHON_VERSIONS` option can be used to select Python versions to build bindings and Wheels for. By default, only the default system Python3 version will be selected.
+- The `-DPUBLIC_API_COMPILERS` option can be used to select the compilers used to check public API compatibility. By default, the build tries to select and check gcc-11, gcc-9, clang-11, and clang-14.
- For instructions on how to build samples from source and run them, see the [Samples](samples/README.md) documentation.
+#### 3. Build Documentation
-5. Run Tests
+Known limitation: documentation built with older toolchains (doxygen, sphinx, breathe, exhale) may be incomplete. We recommend using Ubuntu 22.04 or later.
- a. Install the dependencies required for running the tests
+Install the dependencies required to build the documentation:
+- doxygen: parse header files for reference documentation
+- python3, python3-pip: to install some python packages needed
+- sphinx, breathe, exhale, recommonmark, graphviz: to render the documentation
+- sphinx-rtd-theme: documentation theme used
- On Ubuntu >= 20.04, install the following packages:
- - python3, python3-pip: to run python bindings tests
- - torch: dependencies needed by python bindings tests
+On Ubuntu, install the following packages using `apt` and `pip`:
+```shell
+apt install -y doxygen graphviz python3 python3-pip
+python3 -m pip install sphinx==4.5.0 breathe exhale recommonmark graphviz sphinx-rtd-theme
+```
- ```shell
- sudo apt-get install -y python3 python3-pip
- sudo python3 -m pip install pytest torch
- ```
+Build the documentation:
+```shell
+ci/build_docs.sh [build folder]
+```
+The default build folder is 'build'.
- b. Run the tests
+#### 4. Build and run Samples
- The tests are in `<buildtree>/bin`. You can run the script below to run all
- tests at once. Here's an example when build tree is created in `build-rel`
+For instructions on how to build samples from source and run them, see the [Samples](samples/README.md) documentation.
- ```shell
- build-rel/bin/run_tests.sh
- ```
+#### 5. Run Tests
-6. Package installers and Python Wheels
+Install the dependencies required for running the tests:
+- python3, python3-pip: to run python bindings tests
+- torch: dependencies needed by python bindings tests
- a. Package installers
+On Ubuntu >= 20.04, install the following packages using `apt` and `pip`:
+```shell
+apt install -y python3 python3-pip
+python3 -m pip install pytest torch
+```
- Installers can be generated using the following cpack command once you have successfully built the project
+The tests are in `<buildtree>/bin`. You can run the script below to run all tests at once. Here's an example when the build tree is created in `build-rel`:
+```shell
+build-rel/bin/run_tests.sh
+```
- ```shell
- cd build-rel
- cpack .
- ```
+#### 6. Package installers and Python Wheels
- This will generate in the build directory both Debian installers and tarballs
- (\*.tar.xz), needed for integration in other distros.
+Package installers
- For a fine-grained choice of what installers to generate, the full syntax is:
+Installers can be generated using the following cpack command once you have successfully built the project:
+```shell
+cd build-rel
+cpack .
+```
+This will generate both Debian installers and tarballs (\*.tar.xz) in the build directory; the tarballs are needed for integration in other distros.
- ```shell
- cpack . -G [DEB|TXZ]
- ```
+For a fine-grained choice of what installers to generate, the full syntax is:
- - DEB for Debian packages
- - TXZ for \*.tar.xz tarballs.
+```shell
+cpack . -G [DEB|TXZ]
+```
+- DEB for Debian packages
+- TXZ for \*.tar.xz tarballs.
- b. Python Wheels
+Python Wheels
- By default during the `release` build, Python bindings and wheels are created for the available CUDA version and the specified Python
- version(s). The wheels are stored in `build-rel/pythonX.Y/wheel` folder, where `build-rel` is the build directory
- used to build the release build and `X` and `Y` are Python major and minor versions. The built wheels can be installed using pip.
- For example, to install the Python wheel built for CUDA 12.x, Python 3.10 on Linux x86_64 systems:
+By default during the `release` build, Python bindings and wheels are created for the available CUDA version and the specified Python version(s). The wheels are stored in `build-rel/pythonX.Y/wheel` folder, where `build-rel` is the build directory used to build the release build and `X` and `Y` are Python major and minor versions.
- ```shell
- pip install cvcuda_cu12-0.6.0b0-cp310-cp310-linux_x86_64.whl
- ```
+The built wheels can be installed using pip.
+For example, to install the Python wheel built for CUDA 12.x, Python 3.10 on Linux x86_64 systems:
+```shell
+pip install cvcuda_cu12-<x.x.x>-cp310-cp310-linux_x86_64.whl
+```
## Contributing
CV-CUDA is an open source project. As part of the Open Source Community, we are
committed to the cycle of learning, improving, and updating that makes this
-community thrive. However, as of release v0.6.0-beta, CV-CUDA is not yet ready
+community thrive. However, as of release v0.7.0-beta, CV-CUDA is not yet ready
for external contributions.
To understand the process for contributing to CV-CUDA, see our
@@ -287,27 +266,27 @@ The `mkop.sh` script is a powerful tool for creating a scaffold for new operator
1. **Operator Stub Creation**: Generates no-op (no-operation) operator templates, which serve as a starting point for implementing new functionalities.
-1. **File Customization**: Modifies template files to include the new operator's name, ensuring consistent naming conventions across the codebase.
+2. **File Customization**: Modifies template files to include the new operator's name, ensuring consistent naming conventions across the codebase.
-1. **CMake Integration**: Adds the new operator files to the appropriate CMakeLists, facilitating seamless compilation and integration into the build system.
+3. **CMake Integration**: Adds the new operator files to the appropriate CMakeLists, facilitating seamless compilation and integration into the build system.
-1. **Python Bindings**: Creates Python wrapper stubs for the new operator, allowing it to be used within Python environments.
+4. **Python Bindings**: Creates Python wrapper stubs for the new operator, allowing it to be used within Python environments.
-1. **Test Setup**: Generates test files for both C++ and Python, enabling immediate development of unit tests for the new operator.
+5. **Test Setup**: Generates test files for both C++ and Python, enabling immediate development of unit tests for the new operator.
#### How to Use `mkop.sh`:
-Run the script with the desired operator name. The script assumes it's located in `/cvcuda/tools/mkop`.
+Run the script with the desired operator name. The script assumes it's located in `~/cvcuda/tools/mkop`.
- ```shell
- ./mkop.sh [Operator Name]
- ```
+```shell
+./mkop.sh [Operator Name]
+```
If the script is run from a different location, provide the path to the CV-CUDA root directory.
- ```shell
- ./mkop.sh [Operator Name] [CV-CUDA root]
- ```
+```shell
+./mkop.sh [Operator Name] [CV-CUDA root]
+```
**NOTE**: The first letter of the new operator name is capitalized where needed to match the rest of the file structures.
diff --git a/bench/BenchAdaptiveThreshold.cpp b/bench/BenchAdaptiveThreshold.cpp
index 658281fd4..10fe8570f 100644
--- a/bench/BenchAdaptiveThreshold.cpp
+++ b/bench/BenchAdaptiveThreshold.cpp
@@ -92,5 +92,5 @@ using AdaptiveThresholdTypes = nvbench::type_list;
NVBENCH_BENCH_TYPES(AdaptiveThreshold, NVBENCH_TYPE_AXES(AdaptiveThresholdTypes))
.set_type_axes_names({"InOutDataType"})
.add_string_axis("shape", {"1x1080x1920"})
- .add_int64_axis("varShape", {-1})
+ .add_int64_axis("varShape", {-1, 0})
.add_int64_axis("blockSize", {7});
diff --git a/bench/BenchAverageBlur.cpp b/bench/BenchAverageBlur.cpp
index fbfc9c4cd..0736ccd47 100644
--- a/bench/BenchAverageBlur.cpp
+++ b/bench/BenchAverageBlur.cpp
@@ -88,6 +88,6 @@ using AverageBlurTypes = nvbench::type_list;
NVBENCH_BENCH_TYPES(AverageBlur, NVBENCH_TYPE_AXES(AverageBlurTypes))
.set_type_axes_names({"InOutDataType"})
.add_string_axis("shape", {"1x1080x1920"})
- .add_int64_axis("varShape", {-1})
+ .add_int64_axis("varShape", {-1, 0})
.add_string_axis("kernelSize", {"7x7"})
.add_string_axis("border", {"REPLICATE"});
diff --git a/bench/BenchBilateralFilter.cpp b/bench/BenchBilateralFilter.cpp
index 73875d8ed..ff41b9494 100644
--- a/bench/BenchBilateralFilter.cpp
+++ b/bench/BenchBilateralFilter.cpp
@@ -90,7 +90,7 @@ using BilateralFilterTypes = nvbench::type_list;
NVBENCH_BENCH_TYPES(BilateralFilter, NVBENCH_TYPE_AXES(BilateralFilterTypes))
.set_type_axes_names({"InOutDataType"})
.add_string_axis("shape", {"1x1080x1920"})
- .add_int64_axis("varShape", {-1})
+ .add_int64_axis("varShape", {-1, 0})
.add_int64_axis("diameter", {-1})
.add_float64_axis("sigmaSpace", {1.2})
.add_string_axis("border", {"REFLECT"});
diff --git a/bench/BenchBrightnessContrast.cpp b/bench/BenchBrightnessContrast.cpp
index 8e741169a..ea79f5a13 100644
--- a/bench/BenchBrightnessContrast.cpp
+++ b/bench/BenchBrightnessContrast.cpp
@@ -88,4 +88,4 @@ using BrightnessContrastTypes = nvbench::type_list;
NVBENCH_BENCH_TYPES(BrightnessContrast, NVBENCH_TYPE_AXES(BrightnessContrastTypes))
.set_type_axes_names({"InOutDataType"})
.add_string_axis("shape", {"1x1080x1920"})
- .add_int64_axis("varShape", {-1});
+ .add_int64_axis("varShape", {-1, 0});
diff --git a/bench/BenchColorTwist.cpp b/bench/BenchColorTwist.cpp
index 67e90af8b..1ade029f4 100644
--- a/bench/BenchColorTwist.cpp
+++ b/bench/BenchColorTwist.cpp
@@ -82,4 +82,4 @@ using ColorTwistTypes = nvbench::type_list;
NVBENCH_BENCH_TYPES(ColorTwist, NVBENCH_TYPE_AXES(ColorTwistTypes))
.set_type_axes_names({"InOutDataType"})
.add_string_axis("shape", {"1x1080x1920"})
- .add_int64_axis("varShape", {-1});
+ .add_int64_axis("varShape", {-1, 0});
diff --git a/bench/BenchComposite.cpp b/bench/BenchComposite.cpp
index 2293ecab0..f29f26acf 100644
--- a/bench/BenchComposite.cpp
+++ b/bench/BenchComposite.cpp
@@ -88,4 +88,4 @@ using CompositeTypes = nvbench::type_list;
NVBENCH_BENCH_TYPES(Composite, NVBENCH_TYPE_AXES(CompositeTypes))
.set_type_axes_names({"InOutDataType"})
.add_string_axis("shape", {"1x1080x1920"})
- .add_int64_axis("varShape", {-1});
+ .add_int64_axis("varShape", {-1, 0});
diff --git a/bench/BenchCopyMakeBorder.cpp b/bench/BenchCopyMakeBorder.cpp
index 722c37d03..8d26487a7 100644
--- a/bench/BenchCopyMakeBorder.cpp
+++ b/bench/BenchCopyMakeBorder.cpp
@@ -92,5 +92,5 @@ using CopyMakeBorderTypes = nvbench::type_list;
NVBENCH_BENCH_TYPES(CopyMakeBorder, NVBENCH_TYPE_AXES(CopyMakeBorderTypes))
.set_type_axes_names({"InOutDataType"})
.add_string_axis("shape", {"1x1080x1920"})
- .add_int64_axis("varShape", {-1})
+ .add_int64_axis("varShape", {-1, 0})
.add_string_axis("border", {"REFLECT101"});
diff --git a/bench/BenchCvtColor.cpp b/bench/BenchCvtColor.cpp
index 05469e0f7..abe1951ea 100644
--- a/bench/BenchCvtColor.cpp
+++ b/bench/BenchCvtColor.cpp
@@ -80,4 +80,4 @@ using CvtColorTypes = nvbench::type_list;
NVBENCH_BENCH_TYPES(CvtColor, NVBENCH_TYPE_AXES(CvtColorTypes))
.set_type_axes_names({"InOutDataType"})
.add_string_axis("shape", {"1x1080x1920"})
- .add_int64_axis("varShape", {-1});
+ .add_int64_axis("varShape", {-1, 0});
diff --git a/bench/BenchErase.cpp b/bench/BenchErase.cpp
index 68419ad9d..2bb504d2b 100644
--- a/bench/BenchErase.cpp
+++ b/bench/BenchErase.cpp
@@ -91,5 +91,5 @@ using EraseTypes = nvbench::type_list;
NVBENCH_BENCH_TYPES(Erase, NVBENCH_TYPE_AXES(EraseTypes))
.set_type_axes_names({"InOutDataType"})
.add_string_axis("shape", {"1x1080x1920"})
- .add_int64_axis("varShape", {0})
+ .add_int64_axis("varShape", {-1, 0})
.add_int64_axis("numErase", {3});
diff --git a/bench/BenchFindContours.cpp b/bench/BenchFindContours.cpp
deleted file mode 100644
index 06deb9732..000000000
--- a/bench/BenchFindContours.cpp
+++ /dev/null
@@ -1,126 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "BenchUtils.hpp"
-
-#include <cvcuda/OpFindContours.hpp>
-#include <nvcv/Tensor.hpp>
-#include <nvcv/TensorDataAccess.hpp>
-
-#include <vector>
-
-using CPUImage = std::vector<uint8_t>;
-
-static void generateRectangle(CPUImage &image, nvcv::Size2D boundary, nvcv::Size2D anchor = {0, 0},
- nvcv::Size2D size = {5, 5}, double angle = 0.0, bool fill = true, uint8_t setValue = 1);
-
-static void generateRectangle(CPUImage &image, nvcv::Size2D boundary, nvcv::Size2D anchor, nvcv::Size2D size,
- double angle, bool fill, uint8_t setValue)
-{
- auto rad = angle * (M_PI / 180.0);
- auto cosAngle = std::cos(rad);
- auto sinAngle = std::sin(rad);
-
- auto transformed = anchor;
- for (auto y = 0; y < size.h; ++y)
- {
- for (auto x = 0; x < size.w; ++x)
- {
- transformed.w = anchor.w + (x * cosAngle - y * sinAngle);
- transformed.h = anchor.h + (x * sinAngle + y * cosAngle);
-
- if (fill || y == 0 || y == size.h - 1 || x == 0 || x == size.w - 1)
- {
- if (transformed.w >= 0 && transformed.w < boundary.w && transformed.h >= 0
- && transformed.h < boundary.h)
- {
- image[transformed.h * boundary.w + transformed.w] = setValue;
- }
- }
- }
- }
-}
-
-template<typename T>
-inline void FindContours(nvbench::state &state, nvbench::type_list<T>)
-try
-{
- srand(0U); // Use a fixed random seed
- long3 shape = benchutils::GetShape<3>(state.get_string("shape"));
- long varShape = state.get_int64("varShape");
- int numPoints = static_cast<int>(state.get_int64("numPoints"));
-
- // R/W bandwidth rationale:
- // Read image + connected components (S32)
- // Write points + contours (U32)
- state.add_global_memory_reads(shape.x * shape.y * shape.z * (sizeof(T) + sizeof(int)));
- state.add_global_memory_writes(shape.x * numPoints * sizeof(int) * 2 + shape.x * 4 * sizeof(int));
-
- cvcuda::FindContours op(nvcv::Size2D{(int)shape.z, (int)shape.y}, shape.x);
-
- // clang-format off
-
- nvcv::Tensor points({{shape.x, numPoints, 2}, "NCW"}, nvcv::TYPE_S32);
- nvcv::Tensor counts({{shape.x, 4}, "NW"}, nvcv::TYPE_S32);
-
- if (varShape < 0) // negative var shape means use Tensor
- {
- nvcv::Tensor src({{shape.x, shape.y, shape.z, 1}, "NHWC"}, benchutils::GetDataType<T>());
- auto inData = src.exportData();
- auto inAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(*inData);
-
- //Generate input
- CPUImage srcVec(shape.y * shape.z, 0);
- for (auto i = 0; i < 10; ++i)
- {
- int anchorX = rand() % shape.z;
- int anchorY = rand() % shape.y;
- int sizeX = rand() % (shape.z - anchorX);
- int sizeY = rand() % (shape.y - anchorY);
- generateRectangle(srcVec, {anchorX, anchorY}, {sizeX, sizeY});
- }
-
- for (auto i = 0; i < shape.x; ++i)
- {
- CUDA_CHECK_ERROR(cudaMemcpy2D(inAccess->sampleData(i), inAccess->rowStride(), srcVec.data(), shape.z, shape.z,
- shape.y, cudaMemcpyHostToDevice));
- }
-
- state.exec(nvbench::exec_tag::sync, [&op, &src, &points, &counts](nvbench::launch &launch)
- {
- op(launch.get_stream(), src, points, counts);
- });
- }
- else // zero and positive var shape means use ImageBatchVarShape
- {
- throw std::invalid_argument("ImageBatchVarShape not implemented for this operator");
- }
-}
-catch (const std::exception &err)
-{
- state.skip(err.what());
-}
-
-// clang-format on
-
-using FindContoursTypes = nvbench::type_list<uint8_t>;
-
-NVBENCH_BENCH_TYPES(FindContours, NVBENCH_TYPE_AXES(FindContoursTypes))
- .set_type_axes_names({"InOutDataType"})
- .add_string_axis("shape", {"1x1080x1920"})
- .add_int64_axis("varShape", {-1})
- .add_int64_axis("numPoints", {1024});
diff --git a/bench/BenchFlip.cpp b/bench/BenchFlip.cpp
index 620eac7f6..9c052f62a 100644
--- a/bench/BenchFlip.cpp
+++ b/bench/BenchFlip.cpp
@@ -95,5 +95,5 @@ using FlipTypes = nvbench::type_list;
NVBENCH_BENCH_TYPES(Flip, NVBENCH_TYPE_AXES(FlipTypes))
.set_type_axes_names({"InOutDataType"})
.add_string_axis("shape", {"1x1080x1920"})
- .add_int64_axis("varShape", {-1})
+ .add_int64_axis("varShape", {-1, 0})
.add_string_axis("flipType", {"BOTH"});
diff --git a/bench/BenchGaussian.cpp b/bench/BenchGaussian.cpp
index 8b4fc30d1..a1976581d 100644
--- a/bench/BenchGaussian.cpp
+++ b/bench/BenchGaussian.cpp
@@ -91,6 +91,6 @@ using GaussianTypes = nvbench::type_list;
NVBENCH_BENCH_TYPES(Gaussian, NVBENCH_TYPE_AXES(GaussianTypes))
.set_type_axes_names({"InOutDataType"})
.add_string_axis("shape", {"1x1080x1920"})
- .add_int64_axis("varShape", {-1})
+ .add_int64_axis("varShape", {-1, 0})
.add_float64_axis("sigma", {1.2})
.add_string_axis("border", {"REFLECT"});
diff --git a/bench/BenchGaussianNoise.cpp b/bench/BenchGaussianNoise.cpp
index 68633a90f..09dcd04e4 100644
--- a/bench/BenchGaussianNoise.cpp
+++ b/bench/BenchGaussianNoise.cpp
@@ -84,4 +84,4 @@ using GaussianNoiseTypes = nvbench::type_list;
NVBENCH_BENCH_TYPES(GaussianNoise, NVBENCH_TYPE_AXES(GaussianNoiseTypes))
.set_type_axes_names({"InOutDataType"})
.add_string_axis("shape", {"1x1080x1920"})
- .add_int64_axis("varShape", {-1});
+ .add_int64_axis("varShape", {-1, 0});
diff --git a/bench/BenchHQResize.cpp b/bench/BenchHQResize.cpp
index 9d80963ec..49ff41412 100644
--- a/bench/BenchHQResize.cpp
+++ b/bench/BenchHQResize.cpp
@@ -122,7 +122,7 @@ using HQResizeTypes = nvbench::type_list;
NVBENCH_BENCH_TYPES(HQResize, NVBENCH_TYPE_AXES(HQResizeTypes))
.set_type_axes_names({"InOutDataType"})
- .add_int64_axis("batch", {false})
+ .add_int64_axis("batch", {false, true})
.add_string_axis("shape", {"1x1080x1920"})
.add_string_axis("interpolation", {"CUBIC"})
.add_int64_axis("antialias", {false, true})
diff --git a/bench/BenchHistogramEq.cpp b/bench/BenchHistogramEq.cpp
index 54082d550..74bcb9d46 100644
--- a/bench/BenchHistogramEq.cpp
+++ b/bench/BenchHistogramEq.cpp
@@ -74,4 +74,4 @@ using HistogramEqTypes = nvbench::type_list;
NVBENCH_BENCH_TYPES(HistogramEq, NVBENCH_TYPE_AXES(HistogramEqTypes))
.set_type_axes_names({"InOutDataType"})
.add_string_axis("shape", {"1x1080x1920"})
- .add_int64_axis("varShape", {-1});
+ .add_int64_axis("varShape", {-1, 0});
diff --git a/bench/BenchInpaint.cpp b/bench/BenchInpaint.cpp
index 88a237b31..ed6dbd055 100644
--- a/bench/BenchInpaint.cpp
+++ b/bench/BenchInpaint.cpp
@@ -82,4 +82,4 @@ using InpaintTypes = nvbench::type_list;
NVBENCH_BENCH_TYPES(Inpaint, NVBENCH_TYPE_AXES(InpaintTypes))
.set_type_axes_names({"InOutDataType"})
.add_string_axis("shape", {"1x1080x1920"})
- .add_int64_axis("varShape", {-1});
+ .add_int64_axis("varShape", {-1, 0});
diff --git a/bench/BenchJointBilateralFilter.cpp b/bench/BenchJointBilateralFilter.cpp
index 45c325bd8..2aa748048 100644
--- a/bench/BenchJointBilateralFilter.cpp
+++ b/bench/BenchJointBilateralFilter.cpp
@@ -94,7 +94,7 @@ using JointBilateralFilterTypes = nvbench::type_list;
NVBENCH_BENCH_TYPES(JointBilateralFilter, NVBENCH_TYPE_AXES(JointBilateralFilterTypes))
.set_type_axes_names({"InOutDataType"})
.add_string_axis("shape", {"1x1080x1920"})
- .add_int64_axis("varShape", {-1})
+ .add_int64_axis("varShape", {-1, 0})
.add_int64_axis("diameter", {-1})
.add_float64_axis("sigmaSpace", {1.2})
.add_string_axis("border", {"REFLECT"});
diff --git a/bench/BenchLabel.cpp b/bench/BenchLabel.cpp
index 41005379d..5e1870f50 100644
--- a/bench/BenchLabel.cpp
+++ b/bench/BenchLabel.cpp
@@ -32,15 +32,16 @@ try
std::string runChoice = state.get_string("runChoice");
- // Use [BG][MIN][MAX][ISLAND][COUNT][STAT] in runChoice to run Label with:
- // background; minThreshold; maxThreshold; island removal; count; statistics
+ // Use [BG][MIN][MAX][ISLAND][COUNT][STAT][MASK] in runChoice to run Label with:
+ // background; minThreshold; maxThreshold; island removal; count; statistics; mask
- long3 staShape{srcShape.x, 10000, 6}; // using fixed 10K max. cap. and 2D problem
+ long3 staShape{srcShape.x, 10000, 7}; // using fixed 10K max. cap. and 2D problem
- NVCVConnectivityType conn = NVCV_CONNECTIVITY_4_2D;
- NVCVLabelType alab = NVCV_LABEL_FAST;
+ NVCVConnectivityType conn = NVCV_CONNECTIVITY_4_2D;
+ NVCVLabelType alab = NVCV_LABEL_FAST;
+ NVCVLabelMaskType mType = NVCV_REMOVE_ISLANDS_OUTSIDE_MASK_ONLY;
- nvcv::Tensor bgT, minT, maxT, countT, statsT, mszT;
+ nvcv::Tensor bgT, minT, maxT, countT, statsT, mszT, maskT;
cvcuda::Label op;
@@ -81,16 +82,20 @@ try
{
statsT = nvcv::Tensor({{staShape.x, staShape.y, staShape.z}, "NMA"}, benchutils::GetDataType());
}
+ if (runChoice.find("MASK") != std::string::npos)
+ {
+ maskT = nvcv::Tensor({{srcShape.x, srcShape.y, srcShape.z, 1}, "NHWC"}, nvcv::TYPE_U8);
+ }
nvcv::Tensor src({{srcShape.x, srcShape.y, srcShape.z, 1}, "NHWC"}, benchutils::GetDataType<T>());
nvcv::Tensor dst({{dstShape.x, dstShape.y, dstShape.z, 1}, "NHWC"}, benchutils::GetDataType());
benchutils::FillTensor<T>(src, benchutils::RandomValues<T>());
- state.exec(nvbench::exec_tag::sync,
- [&op, &src, &dst, &bgT, &minT, &maxT, &mszT, &countT, &statsT, &conn, &alab](nvbench::launch &launch)
+ state.exec(nvbench::exec_tag::sync, [&op, &src, &dst, &bgT, &minT, &maxT, &mszT, &countT, &statsT, &maskT, &conn,
+ &alab, &mType](nvbench::launch &launch)
{
- op(launch.get_stream(), src, dst, bgT, minT, maxT, mszT, countT, statsT, conn, alab);
+ op(launch.get_stream(), src, dst, bgT, minT, maxT, mszT, countT, statsT, maskT, conn, alab, mType);
});
}
catch (const std::exception &err)
diff --git a/bench/BenchLaplacian.cpp b/bench/BenchLaplacian.cpp
index e685198ef..7956d8c22 100644
--- a/bench/BenchLaplacian.cpp
+++ b/bench/BenchLaplacian.cpp
@@ -85,7 +85,7 @@ using LaplacianTypes = nvbench::type_list;
NVBENCH_BENCH_TYPES(Laplacian, NVBENCH_TYPE_AXES(LaplacianTypes))
.set_type_axes_names({"InOutDataType"})
.add_string_axis("shape", {"1x1080x1920"})
- .add_int64_axis("varShape", {-1})
+ .add_int64_axis("varShape", {-1, 0})
.add_int64_axis("ksize", {1})
.add_float64_axis("scale", {1.0})
.add_string_axis("border", {"REFLECT101"});
diff --git a/bench/BenchMedianBlur.cpp b/bench/BenchMedianBlur.cpp
index 45b2c1a6e..0520f5f26 100644
--- a/bench/BenchMedianBlur.cpp
+++ b/bench/BenchMedianBlur.cpp
@@ -82,5 +82,5 @@ using MedianBlurTypes = nvbench::type_list;
NVBENCH_BENCH_TYPES(MedianBlur, NVBENCH_TYPE_AXES(MedianBlurTypes))
.set_type_axes_names({"InOutDataType"})
.add_string_axis("shape", {"1x1080x1920"})
- .add_int64_axis("varShape", {-1})
+ .add_int64_axis("varShape", {-1, 0})
.add_string_axis("kernelSize", {"5x5"});
diff --git a/bench/BenchMinMaxLoc.cpp b/bench/BenchMinMaxLoc.cpp
index 582348fde..40e8385bf 100644
--- a/bench/BenchMinMaxLoc.cpp
+++ b/bench/BenchMinMaxLoc.cpp
@@ -88,5 +88,5 @@ using MinMaxLocTypes = nvbench::type_list;
NVBENCH_BENCH_TYPES(MinMaxLoc, NVBENCH_TYPE_AXES(MinMaxLocTypes))
.set_type_axes_names({"InOutDataType"})
.add_string_axis("shape", {"1x1080x1920"})
- .add_int64_axis("varShape", {-1})
+ .add_int64_axis("varShape", {-1, 0})
.add_int64_axis("maxLocations", {100000});
diff --git a/bench/BenchMorphology.cpp b/bench/BenchMorphology.cpp
index d3947e788..f357dbffb 100644
--- a/bench/BenchMorphology.cpp
+++ b/bench/BenchMorphology.cpp
@@ -128,7 +128,7 @@ using MorphologyTypes = nvbench::type_list;
NVBENCH_BENCH_TYPES(Morphology, NVBENCH_TYPE_AXES(MorphologyTypes))
.set_type_axes_names({"InOutDataType"})
.add_string_axis("shape", {"1x1080x1920"})
- .add_int64_axis("varShape", {-1})
+ .add_int64_axis("varShape", {-1, 0})
.add_int64_axis("iteration", {1})
.add_string_axis("kernelSize", {"3x3"})
.add_string_axis("morphType", {"ERODE", "DILATE", "OPEN", "CLOSE"})
diff --git a/bench/BenchNormalize.cpp b/bench/BenchNormalize.cpp
index 64eed3e33..9e7cc09e6 100644
--- a/bench/BenchNormalize.cpp
+++ b/bench/BenchNormalize.cpp
@@ -96,4 +96,4 @@ using NormalizeTypes = nvbench::type_list;
NVBENCH_BENCH_TYPES(Normalize, NVBENCH_TYPE_AXES(NormalizeTypes))
.set_type_axes_names({"InOutDataType"})
.add_string_axis("shape", {"1x1080x1920"})
- .add_int64_axis("varShape", {-1});
+ .add_int64_axis("varShape", {-1, 0});
diff --git a/bench/BenchPillowResize.cpp b/bench/BenchPillowResize.cpp
index 359480e25..1340a9f26 100644
--- a/bench/BenchPillowResize.cpp
+++ b/bench/BenchPillowResize.cpp
@@ -100,6 +100,6 @@ using PillowResizeTypes = nvbench::type_list;
NVBENCH_BENCH_TYPES(PillowResize, NVBENCH_TYPE_AXES(PillowResizeTypes))
.set_type_axes_names({"InOutDataType"})
.add_string_axis("shape", {"1x1080x1920"})
- .add_int64_axis("varShape", {-1})
+ .add_int64_axis("varShape", {-1, 0})
.add_string_axis("resizeType", {"CONTRACT"})
.add_string_axis("interpolation", {"CUBIC"});
diff --git a/bench/BenchRandomResizedCrop.cpp b/bench/BenchRandomResizedCrop.cpp
index b7f58c57f..661a5e42c 100644
--- a/bench/BenchRandomResizedCrop.cpp
+++ b/bench/BenchRandomResizedCrop.cpp
@@ -98,6 +98,6 @@ using RandomResizedCropTypes = nvbench::type_list;
NVBENCH_BENCH_TYPES(RandomResizedCrop, NVBENCH_TYPE_AXES(RandomResizedCropTypes))
.set_type_axes_names({"InOutDataType"})
.add_string_axis("shape", {"1x1080x1920"})
- .add_int64_axis("varShape", {-1})
+ .add_int64_axis("varShape", {-1, 0})
.add_string_axis("resizeType", {"EXPAND"})
.add_string_axis("interpolation", {"LINEAR"});
diff --git a/bench/BenchRemap.cpp b/bench/BenchRemap.cpp
index 7fc20600c..3f3825c82 100644
--- a/bench/BenchRemap.cpp
+++ b/bench/BenchRemap.cpp
@@ -116,5 +116,5 @@ using RemapTypes = nvbench::type_list;
NVBENCH_BENCH_TYPES(Remap, NVBENCH_TYPE_AXES(RemapTypes))
.set_type_axes_names({"InOutDataType"})
.add_string_axis("shape", {"1x1080x1920"})
- .add_int64_axis("varShape", {-1})
+ .add_int64_axis("varShape", {-1, 0})
.add_string_axis("mapType", {"DENSE"});
diff --git a/bench/BenchResize.cpp b/bench/BenchResize.cpp
index 7446a6f80..b8fb517a0 100644
--- a/bench/BenchResize.cpp
+++ b/bench/BenchResize.cpp
@@ -92,6 +92,6 @@ using ResizeTypes = nvbench::type_list;
NVBENCH_BENCH_TYPES(Resize, NVBENCH_TYPE_AXES(ResizeTypes))
.set_type_axes_names({"InOutDataType"})
.add_string_axis("shape", {"1x1080x1920"})
- .add_int64_axis("varShape", {-1})
+ .add_int64_axis("varShape", {-1, 0})
.add_string_axis("resizeType", {"EXPAND"})
.add_string_axis("interpolation", {"LINEAR"});
diff --git a/bench/BenchRotate.cpp b/bench/BenchRotate.cpp
index 4f4af05c7..bfd58527b 100644
--- a/bench/BenchRotate.cpp
+++ b/bench/BenchRotate.cpp
@@ -87,5 +87,5 @@ using RotateTypes = nvbench::type_list;
NVBENCH_BENCH_TYPES(Rotate, NVBENCH_TYPE_AXES(RotateTypes))
.set_type_axes_names({"InOutDataType"})
.add_string_axis("shape", {"1x1080x1920"})
- .add_int64_axis("varShape", {-1})
+ .add_int64_axis("varShape", {-1, 0})
.add_string_axis("interpolation", {"CUBIC"});
diff --git a/bench/BenchThreshold.cpp b/bench/BenchThreshold.cpp
index 648a83ac7..1c87a7995 100644
--- a/bench/BenchThreshold.cpp
+++ b/bench/BenchThreshold.cpp
@@ -82,4 +82,4 @@ using ThresholdTypes = nvbench::type_list;
NVBENCH_BENCH_TYPES(Threshold, NVBENCH_TYPE_AXES(ThresholdTypes))
.set_type_axes_names({"InOutDataType"})
.add_string_axis("shape", {"1x1080x1920"})
- .add_int64_axis("varShape", {-1});
+ .add_int64_axis("varShape", {-1, 0});
diff --git a/bench/BenchWarpAffine.cpp b/bench/BenchWarpAffine.cpp
index 459c3b32d..a028e28b9 100644
--- a/bench/BenchWarpAffine.cpp
+++ b/bench/BenchWarpAffine.cpp
@@ -89,7 +89,7 @@ using WarpAffineTypes = nvbench::type_list;
NVBENCH_BENCH_TYPES(WarpAffine, NVBENCH_TYPE_AXES(WarpAffineTypes))
.set_type_axes_names({"InOutDataType"})
.add_string_axis("shape", {"1x1080x1920"})
- .add_int64_axis("varShape", {-1})
+ .add_int64_axis("varShape", {-1, 0})
.add_string_axis("border", {"REFLECT"})
.add_string_axis("interpolation", {"CUBIC"})
.add_string_axis("inverseMap", {"Y"});
diff --git a/bench/BenchWarpPerspective.cpp b/bench/BenchWarpPerspective.cpp
index 874986129..f18108e87 100644
--- a/bench/BenchWarpPerspective.cpp
+++ b/bench/BenchWarpPerspective.cpp
@@ -89,7 +89,7 @@ using WarpPerspectiveTypes = nvbench::type_list;
NVBENCH_BENCH_TYPES(WarpPerspective, NVBENCH_TYPE_AXES(WarpPerspectiveTypes))
.set_type_axes_names({"InOutDataType"})
.add_string_axis("shape", {"1x1080x1920"})
- .add_int64_axis("varShape", {-1})
+ .add_int64_axis("varShape", {-1, 0})
.add_string_axis("border", {"REFLECT"})
.add_string_axis("interpolation", {"CUBIC"})
.add_string_axis("inverseMap", {"Y"});
diff --git a/bench/CMakeLists.txt b/bench/CMakeLists.txt
index e82bf3da4..3ca000274 100644
--- a/bench/CMakeLists.txt
+++ b/bench/CMakeLists.txt
@@ -53,7 +53,6 @@ set(bench_sources
BenchCropFlipNormalizeReformat.cpp
BenchCustomCrop.cpp
BenchErase.cpp
- BenchFindContours.cpp
BenchGammaContrast.cpp
BenchGaussianNoise.cpp
BenchHistogramEq.cpp
diff --git a/bench/python/all_ops/op_copymakeborder.py b/bench/python/all_ops/op_copymakeborder.py
index c0bca25b6..2f57475d0 100644
--- a/bench/python/all_ops/op_copymakeborder.py
+++ b/bench/python/all_ops/op_copymakeborder.py
@@ -24,7 +24,7 @@
class OpCopyMakeBorder(AbstractOpBase):
def setup(self, input):
self.border_mode = cvcuda.Border.CONSTANT
- self.border_values = [255, 0, 0] # Border values for 3 channel input.
+ self.border_values = [255, 0, 0] # Border values for 3 channel RGB input.
self.top = 30
self.left = 40
self.bottom = 50
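For context, a hedged sketch of how these settings are typically consumed by the operator; the keyword names and the `right` value below follow the pattern of cvcuda's Python bindings but are assumptions, not verified against this benchmark's `run` method:

```python
# Hypothetical usage of the copy-make-border settings above.
import cvcuda
import torch

img = torch.zeros((1, 720, 1280, 3), dtype=torch.uint8, device="cuda")
src = cvcuda.as_tensor(img, "NHWC")

out = cvcuda.copymakeborder(
    src,
    border_mode=cvcuda.Border.CONSTANT,
    border_value=[255, 0, 0],  # red border for a 3-channel RGB input
    top=30, left=40, bottom=50, right=60,  # keyword names/values are assumptions
)
```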
diff --git a/bench/python/all_ops/op_findcontours.py b/bench/python/all_ops/op_findcontours.py
deleted file mode 100644
index 7fe31cab0..000000000
--- a/bench/python/all_ops/op_findcontours.py
+++ /dev/null
@@ -1,109 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# NOTE: One must import PyCuda driver first, before CVCUDA or VPF otherwise
-# things may throw unexpected errors.
-import pycuda.driver as cuda # noqa: F401
-
-from bench_utils import AbstractOpBase
-import cvcuda
-import torch
-from torchvision.io import read_image
-import matplotlib.pyplot as plt
-import numpy as np
-import os
-import logging
-
-logger = logging.getLogger(__name__)
-
-
-class OpFindContours(AbstractOpBase):
- def setup(self, input):
- grayscale_input = read_image(
- os.path.join(self.assets_dir, "countour_lines.jpg")
- )
- grayscale_input = grayscale_input.moveaxis(
- 0, -1
- ).contiguous() # From CHW to HWC
- # Binarize the grayscale_input
- grayscale_input[grayscale_input <= 50] = 0
- grayscale_input[grayscale_input > 50] = 255
-
- grayscale_input = [grayscale_input.clone() for _ in range(input.shape[0])]
- grayscale_input = torch.stack(grayscale_input)
- grayscale_input = grayscale_input.cuda(self.device_id)
- self.grayscale_input = cvcuda.as_tensor(grayscale_input, "NHWC")
-
- def run(self, input):
- return cvcuda.find_contours(self.grayscale_input)
-
- def visualize(self):
- """
- Attempts to visualize the output produced by the operator as an image by writing it
- down to the disk. May raise exceptions if visualization is not successful.
- """
- output_dir = self._setup_clear_output_dir(filename_ends_with="_op_out.jpg")
- # Convert the inputs and outputs to numpy arrays first.
- # input shape: NHWC
- # out[0] = points_info shape: NxMx2 (M == max points, 2 for x and y coordinates)
- # out[1] = contours_info shape: NxC where
- # (C == max contours, number of non-zero elements are number of contours)
- input_npy = (
- torch.as_tensor(
- self.grayscale_input.cuda(), device="cuda:%d" % self.device_id
- )
- .cpu()
- .numpy()
- )
- points_npy = (
- torch.as_tensor(self.op_output[0].cuda(), device="cuda:%d" % self.device_id)
- .cpu()
- .numpy()
- )
- num_contours_npy = (
- torch.as_tensor(self.op_output[1].cuda(), device="cuda:%d" % self.device_id)
- .cpu()
- .numpy()
- )
-
- # Loop over all the images...
- for i, img in enumerate(input_npy):
-
- # Grab the information on the points and the contours of this image.
- points_info = points_npy[i]
- contours_info = num_contours_npy[i]
-
- # Keep only the non-zero entries from contours_info
- contours_info = contours_info[np.nonzero(contours_info)]
- # Use the num_points in contours_info to split the points_info
- # Since the values in num_points are not start-stop indices of the points
- # we need to use cumsum to fix it and use it inside the split function
- valid_points = np.split(points_info, contours_info.cumsum())
- # Last element in valid_points is the remainder of the points so need to drop it.
- all_contours = valid_points[:-1] # This list stores OpenCV style contours.
-
- plt.figure(figsize=(img.shape[1] / 100.0, img.shape[0] / 100.0))
- plt.gca().invert_yaxis()
-
- plt.plot(0, 0, color="white")
- plt.plot(img.shape[1], img.shape[0], color="white")
- for contour in all_contours:
- x, y = contour[:, 0], contour[:, 1]
- plt.plot(x, y, color="green", linewidth=2)
-
- # Save using PIL
- out_file_name = "img_%d_op_out.jpg" % i
- plt.savefig(os.path.join(output_dir, out_file_name))
- plt.close()
diff --git a/bench/python/all_ops/op_flip.py b/bench/python/all_ops/op_flip.py
index 962a12856..d93a1c148 100644
--- a/bench/python/all_ops/op_flip.py
+++ b/bench/python/all_ops/op_flip.py
@@ -21,9 +21,25 @@
import cvcuda
-class OpFlip(AbstractOpBase):
+class OpFlipX(AbstractOpBase):
def setup(self, input):
- self.flip_code = -1 # means flipping around both axes.
+ self.flip_code = 0 # means flipping around x axis.
+
+ def run(self, input):
+ return cvcuda.flip(input, flipCode=self.flip_code)
+
+
+class OpFlipY(AbstractOpBase):
+ def setup(self, input):
+ self.flip_code = 1 # means flipping around y axis.
+
+ def run(self, input):
+ return cvcuda.flip(input, flipCode=self.flip_code)
+
+
+class OpFlipXY(AbstractOpBase):
+ def setup(self, input):
+ self.flip_code = -1 # means flipping around x and y axis.
def run(self, input):
return cvcuda.flip(input, flipCode=self.flip_code)
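The three classes map directly onto the `flipCode` semantics of `cvcuda.flip`. A minimal standalone sketch (the torch interop mirrors how the benchmarks build their inputs; the shape is illustrative):

```python
# The three flip codes benchmarked above: 0 = x axis, 1 = y axis, -1 = both.
import cvcuda
import torch

img = torch.randint(0, 256, (1, 1080, 1920, 3), dtype=torch.uint8, device="cuda")
src = cvcuda.as_tensor(img, "NHWC")

flipped_x = cvcuda.flip(src, flipCode=0)    # around the x axis
flipped_y = cvcuda.flip(src, flipCode=1)    # around the y axis
flipped_xy = cvcuda.flip(src, flipCode=-1)  # around both axes
```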
diff --git a/bench/python/assets/brooklyn_bboxes.pt b/bench/python/assets/brooklyn_bboxes.pt
index 3261e4720..69bc4260c 100644
Binary files a/bench/python/assets/brooklyn_bboxes.pt and b/bench/python/assets/brooklyn_bboxes.pt differ
diff --git a/bench/python/assets/brooklyn_nms_masks.pt b/bench/python/assets/brooklyn_nms_masks.pt
index 2e13cef5e..0b97d7a0d 100644
Binary files a/bench/python/assets/brooklyn_nms_masks.pt and b/bench/python/assets/brooklyn_nms_masks.pt differ
diff --git a/bench/python/assets/brooklyn_scores.pt b/bench/python/assets/brooklyn_scores.pt
index 013cebc71..fdb4a3b29 100644
Binary files a/bench/python/assets/brooklyn_scores.pt and b/bench/python/assets/brooklyn_scores.pt differ
diff --git a/bench/python/run_bench.py b/bench/python/run_bench.py
index ae2c69b08..8c00b43c1 100644
--- a/bench/python/run_bench.py
+++ b/bench/python/run_bench.py
@@ -68,22 +68,25 @@ def run_bench(
logger = logging.getLogger("run_bench")
logger.info("Benchmarking started.")
+ # Set up various CUDA stuff.
+ cuda_device = cuda.Device(device_id)
+ cuda_ctx = cuda_device.retain_primary_context()
+ cuda_ctx.push()
+ # Use the default stream for cvcuda and torch
+ # Since we never created a stream, current will be the CUDA default stream
+ cvcuda_stream = cvcuda.Stream().current
+ torch_stream = torch.cuda.default_stream(device=cuda_device)
+
# Create an image batch decoder to supply us the input test data.
decoder = ImageBatchDecoder(
input_path,
batch_size,
device_id,
- cuda_ctx=None,
+ cuda_ctx,
+ cvcuda_stream,
cvcuda_perf=cvcuda_perf,
)
- # Set up various CUDA stuff.
- cuda_device = cuda.Device(device_id)
- cuda_ctx = cuda_device.retain_primary_context()
- cuda_ctx.push()
- cvcuda_stream = cvcuda.Stream()
- torch_stream = torch.cuda.ExternalStream(cvcuda_stream.handle)
-
# Get a list of (class names, class types) of all the ops that can be profiled.
ops_info_list = get_benchmark_eligible_ops_info()
logger.info("Found a total of %d operators for benchmarking." % len(ops_info_list))
diff --git a/ci/check_formatting.sh b/ci/check_formatting.sh
new file mode 100755
index 000000000..b91d518cb
--- /dev/null
+++ b/ci/check_formatting.sh
@@ -0,0 +1,42 @@
+#!/bin/bash -e
+
+# SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if [ $# = 0 ]; then
+ # No arguments? Lint all code.
+ echo "Linting all code in the repository =========================="
+ pre-commit run -a
+else
+ from=$1
+ if [ $# = 1 ]; then
+ to=HEAD
+ elif [ $# = 2 ]; then
+ to=$2
+ else
+ echo "Invalid arguments"
+ echo "Usage: $(basename "$0") [ref_from [ref_to]]"
+ exit 1
+ fi
+
+ echo "Linting files touched from commit $from to $to =============="
+ echo "Files to be linted:"
+ git diff --stat $from..$to
+ if ! pre-commit run --from-ref $from --to-ref $to ; then
+ echo "Formatting errors:"
+ git diff
+ false
+ fi
+fi
diff --git a/cmake/ConfigCUDA.cmake b/cmake/ConfigCUDA.cmake
index 24bc2453c..88a2707c5 100644
--- a/cmake/ConfigCUDA.cmake
+++ b/cmake/ConfigCUDA.cmake
@@ -38,9 +38,14 @@ if(NOT USE_CMAKE_CUDA_ARCHITECTURES)
if(ENABLE_TEGRA)
list(APPEND CMAKE_CUDA_ARCHITECTURES
72-real # Volta - gv11b/Tegra (Jetson AGX Xavier)
- 86-real # Ampere - Jetson IGX Orin
+ 86-real # Jetson IGX Orin with optional Ampere RTX A6000
87-real # Ampere - ga10b,ga10c/Tegra (Jetson AGX Orin)
)
+ if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "11.8")
+ list(APPEND CMAKE_CUDA_ARCHITECTURES
+ 89-real # Jetson IGX Orin with optional RTX 6000 Ada
+ )
+ endif()
else()
# All architectures we build sass for
list(APPEND CMAKE_CUDA_ARCHITECTURES
diff --git a/docker/config b/docker/config
index aa84ebf0d..56df16cb9 100644
--- a/docker/config
+++ b/docker/config
@@ -27,5 +27,5 @@ TAG_IMAGE_SAMPLES=6.1
TAG_IMAGE_TEST=5
VER_CUDA=11.7.1
-VER_UBUNTU=22.04
+VER_UBUNTU=20.04
VER_TRT=24.01
diff --git a/docs/sphinx/conf.py b/docs/sphinx/conf.py
index a32ec33c5..f7fac8e63 100644
--- a/docs/sphinx/conf.py
+++ b/docs/sphinx/conf.py
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -33,7 +33,7 @@
import sys
project = "CV-CUDA"
-copyright = "2022-2023, NVIDIA."
+copyright = "2022-2024, NVIDIA."
author = "NVIDIA"
version = "Beta"
release = version
diff --git a/docs/sphinx/content/cvcuda_oplist.csv b/docs/sphinx/content/cvcuda_oplist.csv
index bc4aecd54..85e45f080 100644
--- a/docs/sphinx/content/cvcuda_oplist.csv
+++ b/docs/sphinx/content/cvcuda_oplist.csv
@@ -16,15 +16,13 @@ CustomCrop,Crops an image with a given region-of-interest
CvtColor,Converts an image from one color space to another
DataTypeConvert,Converts an image’s data type with optional scaling
Erase,Erases image regions
-Find Contours,Extract closed contours from an input binary image
-FindHomography,Calculates a perspective transform from four pairs of the corresponding points
Flip,Flips a 2D image around its axis
GammaContrast,Adjusts image contrast
Gaussian,Applies a gaussian blur filter to the image
Gaussian Noise,Generates a statistical noise with a normal (Gaussian) distribution
Histogram,Provides a grayscale value distribution showing the frequency of occurrence of each gray value.
Histogram Equalizer,Allows effective spreading out the intensity range of the image typically used to improve contrast
-HqResize,Performs advanced resizing supporting 2D and 3D data, tensors, tensor batches, and varshape image batches (2D only). Supports nearest neighbor, linear, cubic, Gaussian and Lanczos interpolation, with optional antialiasing when down-sampling.
+HqResize, "Performs advanced resizing supporting 2D and 3D data, tensors, tensor batches, and varshape image batches (2D only). Supports nearest neighbor, linear, cubic, Gaussian and Lanczos interpolation, with optional antialiasing when down-sampling."
Inpainting,Performs inpainting by replacing a pixel by normalized weighted sum of all the known pixels in the neighborhood
Joint Bilateral Filter,Reduces image noise while preserving strong edges based on a guidance image
Label,Labels connected regions in an image using 4-way connectivity for foreground and 8-way for background pixels
@@ -35,11 +33,11 @@ MinMaxLoc,Finds the maximum and minimum values in a given array
Morphology,Performs morphological erode and dilate transformations
Morphology (close), Performs morphological operation that involves dilation followed by erosion on an image
Morphology (open), Performs morphological operation that involves erosion followed by dilation on an image
-Non-max Suppression,Enables selecting a single entity out of many overlapping ones typically used for selecting from multiple bounding boxes during object detection
+Non-Maximum Suppression,Enables selecting a single entity out of many overlapping ones typically used for selecting from multiple bounding boxes during object detection
Normalize,Normalizes an image pixel’s range
OSD (Polyline Line Text Rotated Rect Segmented Mask),Displays an overlay on the image of different forms including polyline line text rotated rectangle segmented mask
PadStack,Stacks several images into a tensor with border extension
-PairwiseMatcher,Matches features computed separately (e.g. via the SIFT operator) in two images, e.g. using the brute force method
+PairwiseMatcher,"Matches features computed separately (e.g. via the SIFT operator) in two images, e.g. using the brute force method"
PillowResize,Changes the size and scale of an image using python-pillow algorithm
RandomResizedCrop,Crops a random portion of an image and resizes it to a specified size.
Reformat,Converts a planar image into non-planar and vice versa
@@ -47,7 +45,6 @@ Remap,Maps pixels in an image with one projection to another projection in a new
Resize,Changes the size and scale of an image
Rotate,Rotates a 2D array in multiples of 90 degrees
SIFT,Identifies and matches features in images that are invariant to scale rotation and affine distortion.
-Stack,Concatenates two input tensors into a single output tensor
Thresholding,Chooses a global threshold value that is the same for all pixels across the image.
WarpAffine,Applies an affine transformation to an image
WarpPerspective,Applies a perspective transformation to an image
diff --git a/docs/sphinx/index.rst b/docs/sphinx/index.rst
index 890d44262..254a0bf63 100644
--- a/docs/sphinx/index.rst
+++ b/docs/sphinx/index.rst
@@ -1,5 +1,5 @@
..
- # SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ # SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -38,10 +38,9 @@ CV-CUDA includes:
CV-CUDA Pre- and Post-Processing Operators
------------------
-CV-CUDA offers more than 20 Computer Vision and Image Processing operators. Find the operator that is right for your workflow below.
+CV-CUDA offers a comprehensive collection of Computer Vision and Image Processing operators, listed below.
-
-.. csv-table::
+.. csv-table:: List of operators
:file: content/cvcuda_oplist.csv
:widths: 30, 70
:header-rows: 1
@@ -50,12 +49,13 @@ CV-CUDA offers more than 20 Computer Vision and Image Processing operators. Find
Where Are the Release Notes?
------------------
-An awesome product requires excellent support. CV-CUDA release notes can be found `here `_.
+CV-CUDA release notes can be found `here `_.
Where Can I Get Help?
------------------
+An awesome product requires excellent support.
File requests for enhancements and bug reports `here `_.
@@ -97,7 +97,7 @@ NVIDIA, the NVIDIA logo, NVIDIA CV-CUDA, and NVIDIA TensorRT are trademarks and/
Copyright
--------------------
-© 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+© 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
@@ -124,6 +124,7 @@ Copyright
:maxdepth: 1
:hidden:
+ Beta.5
Beta.4
Beta.3
Beta.2
diff --git a/docs/sphinx/installation.rst b/docs/sphinx/installation.rst
index 5e213d536..6c05a33d8 100644
--- a/docs/sphinx/installation.rst
+++ b/docs/sphinx/installation.rst
@@ -24,14 +24,15 @@ Pre-requisites
This section describes the recommended dependencies to install CV-CUDA.
-* Ubuntu >= 20.04
-* CUDA driver >= 11.7
+* Ubuntu >= 20.04 (22.04 recommended for building the documentation)
+* CUDA >= 11.7 (CUDA 12 required for samples)
+* NVIDIA driver r525 or later (r535 required for samples)
Setup
-----
The following steps describe how to install CV-CUDA. Choose the installation method that meets your environment needs.
-You can download the CV-CUDA tar, deb or wheel packages from `here `_
+You can download the CV-CUDA tar, deb or wheel packages from `the asset section `_
* Tar File Installation
@@ -73,11 +74,11 @@ You can download the CV-CUDA tar, deb or wheel packages from `here `_
+ Download the appropriate .whl file for your computer architecture, Python and CUDA version from `here `_
Execute the following command to install appropriate CV-CUDA Python wheel ::
- pip install cvcuda_-0.6.0b0-cp-cp-linux_.whl
+ pip install cvcuda_-0.7.0b0-cp-cp-linux_.whl
where is the desired CUDA version, the desired Python version and the desired architecture.
diff --git a/docs/sphinx/relnotes/v0.7.0-beta.rst b/docs/sphinx/relnotes/v0.7.0-beta.rst
new file mode 100644
index 000000000..5ad3ae437
--- /dev/null
+++ b/docs/sphinx/relnotes/v0.7.0-beta.rst
@@ -0,0 +1,69 @@
+..
+ # SPDX-FileCopyrightText: Copyright (c) 2023-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+.. _v0.7.0-beta:
+
+Beta.5
+======
+
+CV-CUDA 0.7.0 introduces performance and support enhancements, along with bug fixes and new features.
+
+Release Highlights
+------------------
+
+CV-CUDA v0.7.0 includes the following improvements:
+
+* **New Features**:
+
+ * Optimized Python bindings: near-zero overhead compared to C++ calls
+
+ * Added masking option to Label operator: conditional island removal
+
+ * Added IGX Orin support (with dGPU, Ampere or Ada RTX6000)
+
+ * Added support for signed 32-bit output data type in the Label operator
+
+* **Removed Operator**:
+
+ * Removed the Find Contours operator while its major limitations are being addressed
+
+* **Bug Fixes**:
+
+ * Fixed constraint on installation directory for Python tests: tar test packages can now be used from any directory
+
+
+Compatibility and Known Limitations
+-----------------------------------
+
+See main README on `CV-CUDA GitHub `_.
+
+License
+-------
+
+CV-CUDA is licensed under the `Apache 2.0 `_ license.
+
+Resources
+---------
+
+1. `CV-CUDA GitHub `_
+2. `CV-CUDA Increasing Throughput and Reducing Costs for AI-Based Computer Vision with CV-CUDA `_
+3. `NVIDIA Announces Microsoft, Tencent, Baidu Adopting CV-CUDA for Computer Vision AI `_
+4. `CV-CUDA helps Tencent Cloud audio and video PaaS platform achieve full-process GPU acceleration for video enhancement AI `_
+
+Acknowledgements
+----------------
+
+CV-CUDA is developed jointly by NVIDIA and the ByteDance Machine Learning team.
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index 7647d0491..65f61d879 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -13,7 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-cmake_minimum_required(VERSION 3.18)
+cmake_minimum_required(VERSION 3.20.1)
project(cvcuda_python CXX C)
diff --git a/python/mod_cvcuda/CMakeLists.txt b/python/mod_cvcuda/CMakeLists.txt
index 45ecc94e0..66b53d87f 100644
--- a/python/mod_cvcuda/CMakeLists.txt
+++ b/python/mod_cvcuda/CMakeLists.txt
@@ -29,7 +29,6 @@ nvcv_python_add_module(
OpLabel.cpp
LabelType.cpp
ConnectivityType.cpp
- OpFindContours.cpp
OpHistogramEq.cpp
OpOSD.cpp
OpAdvCvtColor.cpp
diff --git a/python/mod_cvcuda/Main.cpp b/python/mod_cvcuda/Main.cpp
index 130d01680..aff67174b 100644
--- a/python/mod_cvcuda/Main.cpp
+++ b/python/mod_cvcuda/Main.cpp
@@ -94,7 +94,6 @@ PYBIND11_MODULE(cvcuda, m)
// CV-CUDA Operators
ExportOpPairwiseMatcher(m);
ExportOpLabel(m);
- ExportOpFindContours(m);
ExportOpOSD(m);
ExportOpHistogramEq(m);
ExportOpAdvCvtColor(m);
diff --git a/python/mod_cvcuda/OpFindContours.cpp b/python/mod_cvcuda/OpFindContours.cpp
deleted file mode 100644
index 137bf645f..000000000
--- a/python/mod_cvcuda/OpFindContours.cpp
+++ /dev/null
@@ -1,134 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "Operators.hpp"
-
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-
-namespace cvcudapy {
-
-namespace {
-
-using TupleTensor2 = std::tuple;
-
-TupleTensor2 FindContoursInto(Tensor &points, Tensor &numPoints, Tensor &input, std::optional pstream)
-{
- if (!pstream)
- {
- pstream = Stream::Current();
- }
-
- nvcv::Size2D size{static_cast(input.shape()[2]), static_cast(input.shape()[1])};
- auto findContours = CreateOperator(size, static_cast(input.shape()[0]));
-
- ResourceGuard guard(*pstream);
- guard.add(LockMode::LOCK_MODE_READ, {input});
- guard.add(LockMode::LOCK_MODE_WRITE, {points});
- guard.add(LockMode::LOCK_MODE_WRITE, {numPoints});
- guard.add(LockMode::LOCK_MODE_READWRITE, {*findContours});
-
- findContours->submit(pstream->cudaHandle(), input, points, numPoints);
-
- return TupleTensor2(std::move(points), std::move(numPoints));
-}
-
-TupleTensor2 FindContours(Tensor &input, std::optional pstream)
-{
- auto pointShape = nvcv::TensorShape{
- {input.shape()[0], cvcuda::FindContours::MAX_TOTAL_POINTS, 2},
- nvcv::TENSOR_NHW
- };
- Tensor points = Tensor::Create(pointShape, nvcv::TYPE_S32);
-
- auto countShape = nvcv::TensorShape{
- {input.shape()[0], cvcuda::FindContours::MAX_NUM_CONTOURS},
- nvcv::TENSOR_NW
- };
- Tensor numPoints = Tensor::Create(countShape, nvcv::TYPE_S32);
-
- return FindContoursInto(points, numPoints, input, pstream);
-}
-
-} // namespace
-
-void ExportOpFindContours(py::module &m)
-{
- using namespace pybind11::literals;
- py::options options;
- options.disable_function_signatures();
-
- m.def("find_contours", &FindContours, "image"_a, "stream"_a = nullptr, R"pbdoc(
-
- cvcuda.find_contours(src : nvcv.Tensor, stream: Optional[nvcv.cuda.Stream] = None) -> nvcv.Tensor
- Executes the FindContours operation on the given cuda stream.
-
- See also:
- Refer to the CV-CUDA C API reference for the FindContours operator
- for more details and usage examples.
-
- Args:
- src (Tensor): Input tensor containing one or more images.
- stream (Stream, optional): CUDA Stream on which to perform the operation.
-
- Returns:
- Tuple[Tensor, Tensor]: A tuple of two tensors. The first is the contour points tensor with dimensions NxMx2 -
- where N is the batch size, M is the maximum number of points allowed. Each point of the contour is specified
- in (x, y) coordinates. The second tensor specifies the number of valid contours per image and the number of
- valid points in those contours. It has dimensions NxC where N is the batch size and C is the maximum number
- of contours found. The actual number of contours can be calculated by counting the number of non-zero elements
- in the C dimension and the actual number of points in each of those contours are the values stored in the C dimension.
-
- Caution:
- Restrictions to several arguments may apply. Check the C
- API references of the CV-CUDA operator.
- )pbdoc");
-
- m.def("find_contours_into", &FindContoursInto, "points"_a, "num_points"_a, "src"_a, "stream"_a = nullptr, R"pbdoc(
-
- cvcuda.find_contours_into(points : nvcv.Tensor, num_points : nvcv.Tensor, src : Tensor, stream: Optional[nvcv.cuda.Stream] = None)
- Executes the FindContours operation on the given cuda stream.
-
- See also:
- Refer to the CV-CUDA C API reference for the FindContours operator
- for more details and usage examples.
-
- Args:
- points (Tensor): Output tensor to store the coordinates of each contour point.
- num_points (Tensor): Output tensor to store the number of points in a contour.
- src (Tensor): Input tensor containing one or more images.
- stream (Stream, optional): CUDA Stream on which to perform the operation.
-
- Returns:
- None
-
- Caution:
- Restrictions to several arguments may apply. Check the C
- API references of the CV-CUDA operator.
- )pbdoc");
-}
-
-} // namespace cvcudapy
diff --git a/python/mod_cvcuda/OpLabel.cpp b/python/mod_cvcuda/OpLabel.cpp
index 1d45618d8..c93158acd 100644
--- a/python/mod_cvcuda/OpLabel.cpp
+++ b/python/mod_cvcuda/OpLabel.cpp
@@ -33,9 +33,9 @@ using TupleTensor3 = std::tuple, std::optional count, std::optional stats, Tensor &input,
- NVCVConnectivityType connectivity, NVCVLabelType assignLabels, std::optional bgLabel,
- std::optional minThresh, std::optional maxThresh, std::optional minSize,
- std::optional pstream)
+ NVCVConnectivityType connectivity, NVCVLabelType assignLabels, NVCVLabelMaskType maskType,
+ std::optional bgLabel, std::optional minThresh, std::optional maxThresh,
+ std::optional minSize, std::optional mask, std::optional pstream)
{
if (!pstream)
{
@@ -73,20 +73,26 @@ TupleTensor3 LabelInto(Tensor &output, std::optional count, std::optiona
{
guard.add(LockMode::LOCK_MODE_READ, {*minSize});
}
+ if (mask)
+ {
+ guard.add(LockMode::LOCK_MODE_READ, {*mask});
+ }
op->submit(pstream->cudaHandle(), input, output, (bgLabel ? *bgLabel : nvcv::Tensor{nullptr}),
(minThresh ? *minThresh : nvcv::Tensor{nullptr}), (maxThresh ? *maxThresh : nvcv::Tensor{nullptr}),
(minSize ? *minSize : nvcv::Tensor{nullptr}), (count ? *count : nvcv::Tensor{nullptr}),
- (stats ? *stats : nvcv::Tensor{nullptr}), connectivity, assignLabels);
+ (stats ? *stats : nvcv::Tensor{nullptr}), (mask ? *mask : nvcv::Tensor{nullptr}), connectivity,
+ assignLabels, maskType);
return TupleTensor3(std::move(output), count, stats);
}
-TupleTensor3 Label(Tensor &input, NVCVConnectivityType connectivity, NVCVLabelType assignLabels, bool count, bool stats,
- int maxLabels, std::optional bgLabel, std::optional minThresh,
- std::optional maxThresh, std::optional minSize, std::optional pstream)
+TupleTensor3 Label(Tensor &input, NVCVConnectivityType connectivity, NVCVLabelType assignLabels,
+ NVCVLabelMaskType maskType, bool count, bool stats, int maxLabels, std::optional bgLabel,
+ std::optional minThresh, std::optional maxThresh, std::optional minSize,
+ std::optional mask, std::optional pstream)
{
- constexpr nvcv::DataType outType = nvcv::TYPE_U32;
+ constexpr nvcv::DataType outType = nvcv::TYPE_S32;
auto inputData = input.exportData();
if (!inputData)
@@ -112,11 +118,11 @@ TupleTensor3 Label(Tensor &input, NVCVConnectivityType connectivity, NVCVLabelTy
int numStats = 1;
if (connectivity == NVCV_CONNECTIVITY_4_2D || connectivity == NVCV_CONNECTIVITY_8_2D)
{
- numStats = 6;
+ numStats = 7;
}
if (connectivity == NVCV_CONNECTIVITY_6_3D || connectivity == NVCV_CONNECTIVITY_26_3D)
{
- numStats = 8;
+ numStats = 9;
}
statsTensor = Tensor::Create(
@@ -127,8 +133,8 @@ TupleTensor3 Label(Tensor &input, NVCVConnectivityType connectivity, NVCVLabelTy
outType);
}
- return LabelInto(output, countTensor, statsTensor, input, connectivity, assignLabels, bgLabel, minThresh, maxThresh,
- minSize, pstream);
+ return LabelInto(output, countTensor, statsTensor, input, connectivity, assignLabels, maskType, bgLabel, minThresh,
+ maxThresh, minSize, mask, pstream);
}
} // namespace
@@ -137,9 +143,14 @@ void ExportOpLabel(py::module &m)
{
using namespace pybind11::literals;
+ py::enum_(m, "LabelMaskType", py::arithmetic())
+ .value("REMOVE_ISLANDS_OUTSIDE_MASK_ONLY", NVCV_REMOVE_ISLANDS_OUTSIDE_MASK_ONLY)
+ .export_values();
+
m.def("label", &Label, "src"_a, "connectivity"_a = NVCV_CONNECTIVITY_4_2D, "assign_labels"_a = NVCV_LABEL_FAST,
- py::kw_only(), "count"_a = false, "stats"_a = false, "max_labels"_a = 10000, "bg_label"_a = nullptr,
- "min_thresh"_a = nullptr, "max_thresh"_a = nullptr, "min_size"_a = nullptr, "stream"_a = nullptr, R"pbdoc(
+ "mask_type"_a = NVCV_REMOVE_ISLANDS_OUTSIDE_MASK_ONLY, py::kw_only(), "count"_a = false, "stats"_a = false,
+ "max_labels"_a = 10000, "bg_label"_a = nullptr, "min_thresh"_a = nullptr, "max_thresh"_a = nullptr,
+ "min_size"_a = nullptr, "mask"_a = nullptr, "stream"_a = nullptr, R"pbdoc(
Executes the Label operation on the given cuda stream.
@@ -152,6 +163,8 @@ void ExportOpLabel(py::module &m)
default is cvcuda.CONNECTIVITY_4_2D.
assign_labels (cvcuda.LABEL, optional): Choice on how labels are assigned,
default is cvcuda.LABEL.FAST.
+ mask_type (cvcuda.LabelMaskType, optional): Choice on how the mask is used,
+ default is cvcuda.REMOVE_ISLANDS_OUTSIDE_MASK_ONLY.
count (bool, optional): Use True to return the count of valid labeled regions.
stats (bool, optional): Use True to return the statistics of valid labeled regions.
max_labels (Number, optional): Maximum number of labels to compute statistics for, default is 10000.
@@ -161,6 +174,10 @@ void ExportOpLabel(py::module &m)
max_thresh (Tensor, optional): Maximum threshold tensor to mask input values above it to be 0, and others 1.
min_size (Tensor, optional): Minimum size tensor to remove islands, i.e. labeled regions with number of
elements less than the minimum size.
+ mask (Tensor, optional): Mask tensor, its behavior is controlled by \ref mask_type. One choice is to
+ control island removal in addition to \ref min_size, i.e. regions with at
+ least one element inside the mask (non-zero values) are not removed in case
+ mask_type is cvcuda.REMOVE_ISLANDS_OUTSIDE_MASK_ONLY.
stream (Stream, optional): CUDA Stream on which to perform the operation.
Returns:
@@ -172,8 +189,9 @@ void ExportOpLabel(py::module &m)
)pbdoc");
m.def("label_into", &LabelInto, "dst"_a, "count"_a = nullptr, "stats"_a = nullptr, "src"_a,
- "connectivity"_a = NVCV_CONNECTIVITY_4_2D, "assign_labels"_a = NVCV_LABEL_FAST, py::kw_only(),
- "bg_label"_a = nullptr, "min_thresh"_a = nullptr, "max_thresh"_a = nullptr, "min_size"_a = nullptr,
+ "connectivity"_a = NVCV_CONNECTIVITY_4_2D, "assign_labels"_a = NVCV_LABEL_FAST,
+ "mask_type"_a = NVCV_REMOVE_ISLANDS_OUTSIDE_MASK_ONLY, py::kw_only(), "bg_label"_a = nullptr,
+ "min_thresh"_a = nullptr, "max_thresh"_a = nullptr, "min_size"_a = nullptr, "mask"_a = nullptr,
"stream"_a = nullptr, R"pbdoc(
Executes the Label operation on the given cuda stream.
@@ -190,12 +208,18 @@ void ExportOpLabel(py::module &m)
default is cvcuda.CONNECTIVITY_4_2D.
assign_labels (cvcuda.LABEL, optional): Choice on how labels are assigned,
default is cvcuda.LABEL.FAST.
+ mask_type (cvcuda.LabelMaskType, optional): Choice on how the mask is used,
+ default is cvcuda.REMOVE_ISLANDS_OUTSIDE_MASK_ONLY.
bg_label (Tensor, optional): Background tensor to define input values to be considered background
labels and thus ignored.
min_thresh (Tensor, optional): Minimum threshold tensor to mask input values below it to be 0, and others 1.
max_thresh (Tensor, optional): Maximum threshold tensor to mask input values above it to be 0, and others 1.
min_size (Tensor, optional): Minimum size tensor to remove islands, i.e. labeled regions with number of
elements less than the minimum size.
+ mask (Tensor, optional): Mask tensor, its behavior is controlled by \ref mask_type. One choice is to
+ control island removal in addition to \ref min_size, i.e. regions with at
+ least one element inside the mask (non-zero values) are not removed in case
+ mask_type is cvcuda.REMOVE_ISLANDS_OUTSIDE_MASK_ONLY.
stream (Stream, optional): CUDA Stream on which to perform the operation.
Returns:
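
A short sketch of calling the extended binding with the new arguments, mirroring the docstring above; the tensors are placeholders, and the HW/"N" layouts follow the new label sample later in this diff.

```python
# Illustrative call of the extended cvcuda.label signature documented above.
# Input and mask are placeholder HW tensors; min_size uses the "N" layout.
import torch
import cvcuda

d_in = cvcuda.as_tensor(torch.zeros(480, 640, dtype=torch.uint8).cuda(), layout="HW")
d_mask = cvcuda.as_tensor(torch.ones(480, 640, dtype=torch.uint8).cuda(), layout="HW")
d_min_size = cvcuda.as_tensor(torch.tensor([100], dtype=torch.int32).cuda(), layout="N")

# With REMOVE_ISLANDS_OUTSIDE_MASK_ONLY, islands smaller than min_size are
# removed only if they have no element inside the (non-zero) mask.
d_out, d_count, d_stats = cvcuda.label(
    src=d_in,
    connectivity=cvcuda.CONNECTIVITY_4_2D,
    mask_type=cvcuda.REMOVE_ISLANDS_OUTSIDE_MASK_ONLY,
    count=True,
    stats=True,
    min_size=d_min_size,
    mask=d_mask,
)
```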
diff --git a/python/mod_cvcuda/OpResize.cpp b/python/mod_cvcuda/OpResize.cpp
index 7d42dcce7..a8e41fab7 100644
--- a/python/mod_cvcuda/OpResize.cpp
+++ b/python/mod_cvcuda/OpResize.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
@@ -173,7 +173,7 @@ void ExportOpResize(py::module &m)
stream (Stream, optional): CUDA Stream on which to perform the operation.
Returns:
- cvcuda.Tensor: The output tensor.
+ cvcuda.ImageBatchVarShape: The output image batch.
Caution:
Restrictions to several arguments may apply. Check the C
diff --git a/python/mod_cvcuda/Operators.hpp b/python/mod_cvcuda/Operators.hpp
index b48f11fbd..6197e43ba 100644
--- a/python/mod_cvcuda/Operators.hpp
+++ b/python/mod_cvcuda/Operators.hpp
@@ -49,7 +49,6 @@ using nvcvpy::TensorBatch;
namespace util = nvcvpy::util;
namespace py = ::pybind11;
-void ExportOpFindContours(py::module &m);
void ExportOpReformat(py::module &m);
void ExportOpResize(py::module &m);
void ExportOpCustomCrop(py::module &m);
diff --git a/python/mod_nvcv/CAPI.cpp b/python/mod_nvcv/CAPI.cpp
index e15f6eff8..b31fc27ec 100644
--- a/python/mod_nvcv/CAPI.cpp
+++ b/python/mod_nvcv/CAPI.cpp
@@ -105,14 +105,9 @@ LockMode ToLockMode(PyObject *_mode)
}
}
-extern "C" void ImplResource_SubmitSync(PyObject *res, PyObject *stream, PyObject *lockMode)
+extern "C" void ImplResource_SubmitSync(PyObject *res, PyObject *stream)
{
- ToSharedObj(res)->submitSync(*ToSharedObj(stream), ToLockMode(lockMode));
-}
-
-extern "C" void ImplResource_SubmitSignal(PyObject *res, PyObject *stream, PyObject *lockMode)
-{
- ToSharedObj(res)->submitSignal(*ToSharedObj(stream), ToLockMode(lockMode));
+ ToSharedObj(res)->submitSync(*ToSharedObj(stream));
}
extern "C" void ImplStream_HoldResources(PyObject *stream, PyObject *resourceList)
@@ -294,7 +289,6 @@ void ExportCAPI(py::module &m)
.ImageFormat_ToPython = &ImplImageFormat_ToPython,
.ImageFormat_FromPython = &ImplImageFormat_FromPython,
.Resource_SubmitSync = &ImplResource_SubmitSync,
- .Resource_SubmitSignal = &ImplResource_SubmitSignal,
.Stream_HoldResources = &ImplStream_HoldResources,
.Stream_GetCurrent = &ImplStream_GetCurrent,
.Stream_GetCudaHandle = &ImplStream_GetCudaHandle,
diff --git a/python/mod_nvcv/Resource.cpp b/python/mod_nvcv/Resource.cpp
index afe571569..a8d0fe67a 100644
--- a/python/mod_nvcv/Resource.cpp
+++ b/python/mod_nvcv/Resource.cpp
@@ -30,24 +30,21 @@ Resource::Resource()
m_id = idnext++;
- m_readEvent = m_writeEvent = nullptr;
+ m_event = nullptr;
try
{
- util::CheckThrow(cudaEventCreateWithFlags(&m_readEvent, cudaEventDisableTiming));
- util::CheckThrow(cudaEventCreateWithFlags(&m_writeEvent, cudaEventDisableTiming));
+ util::CheckThrow(cudaEventCreateWithFlags(&m_event, cudaEventDisableTiming));
}
catch (...)
{
- cudaEventDestroy(m_readEvent);
- cudaEventDestroy(m_writeEvent);
+ cudaEventDestroy(m_event);
throw;
}
}
Resource::~Resource()
{
- cudaEventDestroy(m_readEvent);
- cudaEventDestroy(m_writeEvent);
+ cudaEventDestroy(m_event);
}
uint64_t Resource::id() const
@@ -55,62 +52,29 @@ uint64_t Resource::id() const
return m_id;
}
-void Resource::submitSignal(Stream &stream, LockMode mode) const
+void Resource::submitSync(Stream &stream)
{
- doBeforeSubmitSignal(stream, mode);
-
- if (mode & LOCK_MODE_READ)
- {
- util::CheckThrow(cudaEventRecord(m_readEvent, stream.handle()));
- }
- if (mode & LOCK_MODE_WRITE)
+ // Check if we have a last stream; if not, set it to the current stream
+ if (!m_lastStream.has_value())
{
- util::CheckThrow(cudaEventRecord(m_writeEvent, stream.handle()));
+ m_lastStream.emplace(stream.shared_from_this()); // store a shared pointer to the stream
}
-}
-
-void Resource::submitSync(Stream &stream, LockMode mode) const
-{
- doBeforeSubmitSync(stream, mode);
- doSubmitSync(stream, mode);
-}
-
-void Resource::doSubmitSync(Stream &stream, LockMode mode) const
-{
- if (mode & LOCK_MODE_WRITE)
- {
- util::CheckThrow(cudaStreamWaitEvent(stream.handle(), m_writeEvent));
- util::CheckThrow(cudaStreamWaitEvent(stream.handle(), m_readEvent));
- }
- else if (mode & LOCK_MODE_READ)
+ // If we are on the same stream, we don't need to do anything:
+ // streams are sequential, so the previous work on this stream is already ordered before new work
+ if (m_lastStream.value()->handle() == stream.handle())
{
- util::CheckThrow(cudaStreamWaitEvent(stream.handle(), m_writeEvent));
+ return;
}
-}
-
-void Resource::sync(LockMode mode) const
-{
- py::gil_scoped_release release;
- doBeforeSync(mode);
+ // If we are on a different stream, we need to wait for that stream to finish:
+ // record an event on the old stream; the new stream will wait for it to complete
+ util::CheckThrow(cudaEventRecord(m_event, m_lastStream.value()->handle()));
+ util::CheckThrow(cudaStreamWaitEvent(stream.handle(), m_event));
- doSync(mode);
-}
-
-void Resource::doSync(LockMode mode) const
-{
- NVCV_ASSERT(PyGILState_Check() == 0);
-
- if (mode & LOCK_MODE_WRITE)
- {
- util::CheckThrow(cudaEventSynchronize(m_writeEvent));
- util::CheckThrow(cudaEventSynchronize(m_readEvent));
- }
- else if (mode & LOCK_MODE_READ)
- {
- util::CheckThrow(cudaEventSynchronize(m_writeEvent));
- }
+ // update the last stream since we changed streams
+ m_lastStream.reset();
+ m_lastStream.emplace(stream.shared_from_this());
}
std::shared_ptr Resource::shared_from_this()
@@ -127,8 +91,7 @@ void Resource::Export(py::module &m)
{
py::class_>(m, "Resource")
.def_property_readonly("id", &Resource::id, "Unique resource instance identifier")
- .def("submitSync", &Resource::submitSync)
- .def("submitSignal", &Resource::submitSignal);
+ .def("submitStreamSync", &Resource::submitSync, "Syncs object on new Stream");
}
} // namespace nvcvpy::priv
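
The simplified `submitSync()` above is a standard last-stream/event handoff. Below is a hedged PyTorch sketch of the same pattern; the class and method names are invented for illustration.

```python
# Hedged PyTorch sketch of the handoff implemented by Resource::submitSync():
# same stream -> no-op (stream order suffices); different stream -> record an
# event on the last stream and make the new stream wait on it.
import torch

class TrackedResource:
    def __init__(self):
        self._event = torch.cuda.Event()   # timing disabled by default
        self._last_stream = None           # no stream seen yet

    def submit_sync(self, stream: torch.cuda.Stream) -> None:
        if self._last_stream is None:
            self._last_stream = stream     # first use: just remember the stream
            return
        if self._last_stream.cuda_stream == stream.cuda_stream:
            return                         # same stream: work is already ordered
        self._event.record(self._last_stream)  # fence the old stream's work
        stream.wait_event(self._event)          # new stream waits for the fence
        self._last_stream = stream              # update the cached stream
```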
diff --git a/python/mod_nvcv/Resource.hpp b/python/mod_nvcv/Resource.hpp
index 21e7cc181..010c8b33c 100644
--- a/python/mod_nvcv/Resource.hpp
+++ b/python/mod_nvcv/Resource.hpp
@@ -19,6 +19,7 @@
#define NVCV_PYTHON_PRIV_RESOURCE_HPP
#include "Object.hpp"
+#include "Stream.hpp"
#include
#include
@@ -32,42 +33,64 @@ typedef struct CUevent_st *cudaEvent_t;
namespace nvcvpy::priv {
namespace py = pybind11;
-class Stream;
-
+/**
+ * @brief A class representing a CUDA resource.
+ *
+ * This class encapsulates a CUDA resource and provides methods for synchronization
+ * with CUDA streams.
+ */
class PYBIND11_EXPORT Resource : public virtual Object
{
public:
+ /**
+ * @brief Destructor.
+ */
~Resource();
+ /**
+ * @brief Export the Resource class to Python.
+ *
+ * @param m The Python module to export the class to.
+ */
static void Export(py::module &m);
+ /**
+ * @brief Get the unique identifier of the resource.
+ *
+ * @return uint64_t The unique identifier of the resource.
+ */
uint64_t id() const;
- void submitSync(Stream &stream, LockMode mode) const;
- void submitSignal(Stream &stream, LockMode mode) const;
-
- // Assumes GIL is locked (is in acquired state)
- void sync(LockMode mode) const;
-
- std::shared_ptr shared_from_this();
+ /**
+ * @brief Submit the resource for synchronization with a CUDA stream.
+ *
+ * This method synchronizes the resource with the specified CUDA stream.
+ *
+ * @param stream The CUDA stream to synchronize with.
+ */
+ void submitSync(Stream &stream);
+
+ /**
+ * @brief Get a shared pointer to this resource.
+ *
+ * @return std::shared_ptr A shared pointer to this resource.
+ */
+ std::shared_ptr shared_from_this();
+
+ /**
+ * @brief Get a shared pointer to this const resource.
+ *
+ * @return std::shared_ptr A shared pointer to this const resource.
+ */
std::shared_ptr shared_from_this() const;
protected:
Resource();
- void doSubmitSync(Stream &stream, LockMode mode) const;
-
- // Assumes GIL is not locked (is in released state)
- void doSync(LockMode mode) const;
-
private:
- // To be overriden by children if they have their own requirements
- virtual void doBeforeSync(LockMode mode) const {};
- virtual void doBeforeSubmitSync(Stream &stream, LockMode mode) const {};
- virtual void doBeforeSubmitSignal(Stream &stream, LockMode mode) const {};
-
- uint64_t m_id;
- cudaEvent_t m_readEvent, m_writeEvent;
+ uint64_t m_id; /**< The unique identifier of the resource. */
+ cudaEvent_t m_event; /**< The CUDA event used for synchronization. */
+ std::optional> m_lastStream; /**< Cache the last stream used for this resource. */
};
} // namespace nvcvpy::priv
diff --git a/python/mod_nvcv/Stream.cpp b/python/mod_nvcv/Stream.cpp
index 4c120499f..bd3c1f9dc 100644
--- a/python/mod_nvcv/Stream.cpp
+++ b/python/mod_nvcv/Stream.cpp
@@ -28,6 +28,11 @@
namespace nvcvpy::priv {
+// Static members initialization
+cudaStream_t Stream::m_auxStream = nullptr;
+std::atomic Stream::m_instanceCount = 0;
+std::mutex Stream::m_auxStreamMutex;
+
// Here we define the representation of external cuda streams.
// It defines pybind11's type casters from the python object
// to the corresponding ExternalStream.
@@ -193,7 +198,18 @@ std::shared_ptr Stream::Create()
Stream::Stream()
: m_owns(true)
{
- util::CheckThrow(cudaStreamCreate(&m_handle));
+ try
+ {
+ util::CheckThrow(cudaStreamCreateWithFlags(&m_handle, cudaStreamNonBlocking));
+ incrementInstanceCount();
+ GetAuxStream();
+ util::CheckThrow(cudaEventCreateWithFlags(&m_event, cudaEventDisableTiming));
+ }
+ catch (...)
+ {
+ destroy();
+ throw;
+ }
}
Stream::Stream(IExternalStream &extStream)
@@ -206,14 +222,72 @@ Stream::Stream(IExternalStream &extStream)
{
throw std::runtime_error("Invalid cuda stream");
}
+
+ try
+ {
+ incrementInstanceCount();
+ GetAuxStream(); // Make sure the singleton aux stream is created
+ util::CheckThrow(cudaEventCreateWithFlags(&m_event, cudaEventDisableTiming));
+ }
+ catch (...)
+ {
+ destroy();
+ throw;
+ }
+}
+
+void Stream::incrementInstanceCount()
+{
+ m_instanceCount.fetch_add(1, std::memory_order_relaxed);
+}
+
+int Stream::decrementInstanceCount()
+{
+ return m_instanceCount.fetch_sub(1, std::memory_order_acq_rel) - 1;
+}
+
+cudaStream_t &Stream::GetAuxStream()
+{
+ if (!m_auxStream)
+ {
+ std::lock_guard lock(m_auxStreamMutex);
+ if (!m_auxStream)
+ {
+ util::CheckThrow(cudaStreamCreateWithFlags(&m_auxStream, cudaStreamNonBlocking));
+ }
+ }
+ return m_auxStream;
}
Stream::~Stream()
+{
+ destroy();
+}
+
+void Stream::destroy()
{
if (m_owns)
{
- util::CheckLog(cudaStreamSynchronize(m_handle));
- util::CheckLog(cudaStreamDestroy(m_handle));
+ if (m_handle)
+ {
+ util::CheckLog(cudaStreamSynchronize(m_handle));
+ util::CheckLog(cudaStreamDestroy(m_handle));
+ m_handle = nullptr;
+ }
+ }
+ {
+ std::lock_guard lock(m_auxStreamMutex);
+ if (m_auxStream && decrementInstanceCount() == 0)
+ {
+ util::CheckThrow(cudaStreamSynchronize(m_auxStream));
+ util::CheckThrow(cudaStreamDestroy(m_auxStream));
+ m_auxStream = nullptr;
+ }
+ }
+ if (m_event)
+ {
+ util::CheckThrow(cudaEventDestroy(m_event));
+ m_event = nullptr;
}
}
@@ -240,7 +314,6 @@ intptr_t Stream::pyhandle() const
void Stream::sync()
{
py::gil_scoped_release release;
-
util::CheckThrow(cudaStreamSynchronize(m_handle));
}
@@ -283,8 +356,34 @@ void Stream::holdResources(LockResources usedResources)
delete pclosure;
};
- util::CheckThrow(cudaStreamAddCallback(m_handle, fn, closure.get(), 0));
-
+ // If we naively execute the callback in the main stream (m_handle), the GPU will wait until the callback
+ // is executed (on the host). For correctness, the GPU doesn't need to wait - it's the CPU that needs
+ // to wait for the work already scheduled to complete.
+ //
+ // Naive timeline:
+ //
+ // stream GPU_kernel1 | Callback | GPU_kernel2
+ // GPU activity xxxxxxxxxxx xxxxxxxxxxx
+ // CPU activity xxxxxxxx
+ //
+ // Optimized timeline
+ //
+ //
+ // event -----v
+ // stream GPU_kernel1 | GPU_kernel2
+ // aux_stream waitEvent >| Callback
+ //
+ // GPU activity xxxxxxxxxxx xxxxxxxxxxx
+ // CPU activity xxxxxxxx
+
+ util::CheckThrow(cudaEventRecord(m_event, m_handle)); // asynchronously record the event on the main stream
+ util::CheckThrow(
+ cudaStreamWaitEvent(GetAuxStream(), m_event)); // add async wait for the event in the aux stream
+ // The callback will be executed in the singleton aux stream, so there may be contention with callbacks and waitEvents from
+ // other streams. However, the callback only releases resources from the cache and should not be a performance bottleneck.
+ // This avoids opening a new aux stream for each stream object.
+ util::CheckThrow(
+ cudaStreamAddCallback(GetAuxStream(), fn, closure.get(), 0)); // add async callback in the aux stream
closure.release();
}
}
@@ -322,6 +421,8 @@ void Stream::Export(py::module &m)
ExportExternalStream(m);
ExportExternalStream(m);
+ fflush(stdout);
+
stream.def("__enter__", &Stream::activate, "Activate the CUDA stream as the current stream for this thread.")
.def("__exit__", &Stream::deactivate, "Deactivate the CUDA stream as the current stream for this thread.")
.def("sync", &Stream::sync, "Wait for all preceding CUDA calls in the current stream to complete.")
diff --git a/python/mod_nvcv/Stream.hpp b/python/mod_nvcv/Stream.hpp
index 81a3fc9fc..2dcceb726 100644
--- a/python/mod_nvcv/Stream.hpp
+++ b/python/mod_nvcv/Stream.hpp
@@ -24,8 +24,10 @@
#include
#include
+#include
#include
#include
+#include
#include
#include
@@ -51,7 +53,7 @@ class PYBIND11_EXPORT Stream : public CacheItem
static std::shared_ptr Create();
- ~Stream();
+ virtual ~Stream();
std::shared_ptr shared_from_this();
std::shared_ptr shared_from_this() const;
@@ -75,6 +77,8 @@ class PYBIND11_EXPORT Stream : public CacheItem
Stream(Stream &&) = delete;
Stream();
+ // Singleton access to the auxiliary CUDA stream
+
class Key final : public IKey
{
private:
@@ -88,9 +92,22 @@ class PYBIND11_EXPORT Stream : public CacheItem
return key;
}
- bool m_owns;
- cudaStream_t m_handle;
+ void destroy();
+
+ bool m_owns = false;
+ cudaStream_t m_handle = nullptr;
+ cudaEvent_t m_event = nullptr;
py::object m_wrappedObj;
+
+ // Singleton aux stream and its protection. This is a bit overkill
+ // for now, as Python is single-threaded, but it is good practice.
+ static std::mutex m_auxStreamMutex;
+ static std::atomic m_instanceCount;
+ static cudaStream_t m_auxStream;
+
+ static void incrementInstanceCount();
+ static int decrementInstanceCount();
+ static cudaStream_t &GetAuxStream();
};
} // namespace nvcvpy::priv
diff --git a/python/mod_nvcv/include/nvcv/python/CAPI.hpp b/python/mod_nvcv/include/nvcv/python/CAPI.hpp
index db5f200a0..664ed87b5 100644
--- a/python/mod_nvcv/include/nvcv/python/CAPI.hpp
+++ b/python/mod_nvcv/include/nvcv/python/CAPI.hpp
@@ -44,8 +44,7 @@ struct CAPI
PyObject *(*ImageFormat_ToPython)(NVCVImageFormat p);
NVCVImageFormat (*ImageFormat_FromPython)(PyObject *obj);
- void (*Resource_SubmitSync)(PyObject *res, PyObject *stream, PyObject *lockMode);
- void (*Resource_SubmitSignal)(PyObject *res, PyObject *stream, PyObject *lockMode);
+ void (*Resource_SubmitSync)(PyObject *res, PyObject *stream);
void (*Stream_HoldResources)(PyObject *stream, PyObject *resources);
PyObject *(*Stream_GetCurrent)();
diff --git a/python/mod_nvcv/include/nvcv/python/ResourceGuard.hpp b/python/mod_nvcv/include/nvcv/python/ResourceGuard.hpp
index 40967a84b..5ad2bae5d 100644
--- a/python/mod_nvcv/include/nvcv/python/ResourceGuard.hpp
+++ b/python/mod_nvcv/include/nvcv/python/ResourceGuard.hpp
@@ -62,43 +62,16 @@ class ResourceGuard
for (const std::reference_wrapper &r : resources)
{
py::object pyRes = r.get();
-
- capi().Resource_SubmitSync(pyRes.ptr(), m_pyStream.ptr(), pyLockMode.ptr());
+ capi().Resource_SubmitSync(pyRes.ptr(), m_pyStream.ptr());
m_resourcesPerLockMode.append(std::make_pair(pyLockMode, std::move(pyRes)));
}
+
return *this;
}
void commit()
{
capi().Stream_HoldResources(m_pyStream.ptr(), m_resourcesPerLockMode.ptr());
-
- py::list newList;
-
- auto it = m_resourcesPerLockMode.begin();
- try
- {
- // Try to signal the resources, stop on the first that fails, or
- // when all resources were signaled
- for (; it != m_resourcesPerLockMode.end(); ++it)
- {
- py::tuple t = it->cast();
-
- // resource, stream, lockmode
- capi().Resource_SubmitSignal(t[1].ptr(), m_pyStream.ptr(), t[0].ptr());
- }
- }
- catch (...)
- {
- // Add all resources that weren't signaled to the newList.
- for (; it != m_resourcesPerLockMode.end(); ++it)
- {
- newList.append(std::move(*it));
- }
- throw;
- }
-
- m_resourcesPerLockMode = std::move(newList);
}
private:
diff --git a/samples/CMakeLists.txt b/samples/CMakeLists.txt
index ca2ee0c29..806192fe1 100644
--- a/samples/CMakeLists.txt
+++ b/samples/CMakeLists.txt
@@ -13,7 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-cmake_minimum_required(VERSION 3.22)
+cmake_minimum_required(VERSION 3.20.1)
project(nvcv_samples LANGUAGES CXX)
find_package(CUDAToolkit REQUIRED)
diff --git a/samples/classification/python/main.py b/samples/classification/python/main.py
index f12c95f5a..cae6131e0 100644
--- a/samples/classification/python/main.py
+++ b/samples/classification/python/main.py
@@ -81,8 +81,10 @@ def run_sample(
cuda_device = cuda.Device(device_id)
cuda_ctx = cuda_device.retain_primary_context()
cuda_ctx.push()
- cvcuda_stream = cvcuda.Stream()
- torch_stream = torch.cuda.ExternalStream(cvcuda_stream.handle)
+ # Use the default stream for cvcuda and torch
+ # Since we never created a stream, current will be the CUDA default stream
+ cvcuda_stream = cvcuda.Stream().current
+ torch_stream = torch.cuda.default_stream(device=cuda_device)
# docs_tag: end_setup_gpu
# docs_tag: begin_setup_stages
@@ -96,6 +98,7 @@ def run_sample(
batch_size,
device_id,
cuda_ctx,
+ cvcuda_stream,
cvcuda_perf,
)
@@ -106,6 +109,7 @@ def run_sample(
batch_size,
device_id,
cuda_ctx,
+ cvcuda_stream,
cvcuda_perf,
)
diff --git a/samples/common/python/nvcodec_utils.py b/samples/common/python/nvcodec_utils.py
index 2a300d385..420e15fe2 100644
--- a/samples/common/python/nvcodec_utils.py
+++ b/samples/common/python/nvcodec_utils.py
@@ -68,6 +68,7 @@ def __init__(
batch_size,
device_id,
cuda_ctx,
+ cuda_stream,
cvcuda_perf,
):
# docs_tag: begin_init_videobatchdecoder_pyvideocodec
@@ -76,7 +77,7 @@ def __init__(
self.batch_size = batch_size
self.device_id = device_id
self.cuda_ctx = cuda_ctx
- self.cuda_stream = cvcuda.Stream().current
+ self.cuda_stream = cuda_stream
self.cvcuda_perf = cvcuda_perf
self.total_decoded = 0
self.batch_idx = 0
@@ -229,6 +230,7 @@ def __init__(
fps,
device_id,
cuda_ctx,
+ cuda_stream,
cvcuda_perf,
):
self.logger = logging.getLogger(__name__)
@@ -236,7 +238,7 @@ def __init__(
self.fps = fps
self.device_id = device_id
self.cuda_ctx = cuda_ctx
- self.cuda_stream = cvcuda.Stream().current
+ self.cuda_stream = cuda_stream
self.cvcuda_perf = cvcuda_perf
self.encoder = None
@@ -327,7 +329,7 @@ def start(self):
pass
def join(self):
- self.encoder.flush()
+ # self.encoder.flush()
self.logger.info("Wrote: %s" % self.output_file_name)
@@ -482,6 +484,7 @@ def __init__(
batch_size,
device_id,
cuda_ctx,
+ cuda_stream,
cvcuda_perf,
):
@@ -493,7 +496,7 @@ def __init__(
self.total_decoded = 0
self.batch_idx = 0
self.cuda_ctx = cuda_ctx
- self.cuda_stream = cvcuda.Stream().current
+ self.cuda_stream = cuda_stream
self.cvcuda_perf = cvcuda_perf
self.decoder = nvimgcodec.Decoder(device_id=device_id)
diff --git a/samples/label/python/label.py b/samples/label/python/label.py
new file mode 100644
index 000000000..0d8fcf219
--- /dev/null
+++ b/samples/label/python/label.py
@@ -0,0 +1,215 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+
+import torch
+import torchvision
+
+import cvcuda
+
+
+def parse_arguments():
+ """Parse this program script arguments."""
+
+ parser = argparse.ArgumentParser(prog="label", description="Labels an input image.")
+
+ parser.add_argument("input", type=str, help="Input image png file path.")
+ parser.add_argument(
+ "output",
+ nargs="?",
+ default="out.png",
+ type=str,
+ help="Output image png file path. Defaults to out.png.",
+ )
+ parser.add_argument(
+ "--max_labels",
+ default=1000,
+ type=int,
+ help="Maximum number of labels. Defaults to 1000.",
+ )
+ parser.add_argument(
+ "--min_threshold",
+ default=None,
+ type=int,
+ help="Minimum threshold to binarize input. Defaults to no minimum threshold.",
+ )
+ parser.add_argument(
+ "--max_threshold",
+ default=None,
+ type=int,
+ help="Maximum threshold to binarize input. Defaults to no maximum threshold.",
+ )
+ parser.add_argument(
+ "--min_size",
+ default=None,
+ type=int,
+ help="Minimum size to prevent a region to be removed. Defaults to no minimum size (no removals).",
+ )
+ parser.add_argument(
+ "--mask",
+ action=argparse.BooleanOptionalAction,
+ help="Apply mask to protect center islands (small regions). Defaults to no mask.",
+ )
+ parser.add_argument(
+ "--background_label",
+ default=0,
+ type=int,
+ help="Background label. Defaults to zero.",
+ )
+
+ return parser.parse_args()
+
+
+def color_labels(
+ h_labels_hw,
+ bgl,
+ bgc=torch.as_tensor([0, 0, 0], dtype=torch.uint8),
+ fgc=torch.as_tensor([255, 255, 255], dtype=torch.uint8),
+ cmap=None,
+):
+ """Convert labels to colors
+
+ Args:
+ h_labels_hw (Tensor): Tensor with labels
+ bgl (int): Background label
+ bgc (Tensor): Background color, this color is used for the background label
+ fgc (Tensor): Foreground color, this color is used when cmap is None
+ cmap (function): Colormap, e.g. matplotlib.colormaps["jet"]
+
+ Returns:
+ Tensor: Tensor with colors
+ """
+ # Create an empty Tensor with the same height and width as the labels Tensor and 3 channels (RGB)
+ h_out_hwc = torch.empty(
+ (h_labels_hw.shape[0], h_labels_hw.shape[1], 3), dtype=torch.uint8
+ )
+
+ # Set all values to be the background color
+ h_out_hwc[:, :] = bgc
+
+ # Get the unique set of labels except background label from the labels Tensor
+ h_uniq = torch.unique(h_labels_hw)
+ h_uniq = h_uniq[h_uniq != bgl]
+
+ # Set the label RGB color to be the foreground color
+ label_rgb = fgc
+
+ for i, label in enumerate(h_uniq):
+ if cmap is not None:
+ # If a color map was provided, use it to generate the label color
+ label_rgb = [int(c * 255) for c in cmap(i / h_uniq.shape[0])[:3]]
+ label_rgb = torch.as_tensor(label_rgb, dtype=torch.uint8)
+
+ h_out_hwc[h_labels_hw == label] = label_rgb
+
+ return h_out_hwc
+
+
+if __name__ == "__main__":
+
+ args = parse_arguments()
+
+ print(
+ f"I Reading input image: {args.input}\nI Writing output image: {args.output}\n"
+ f"I Minimum threshold: {args.min_threshold}\nI Maximum threshold: {args.max_threshold}\n"
+ f"I Minimum size: {args.min_size}\nI Apply mask: {args.mask}\n"
+ f"I Background label: {args.background_label}"
+ )
+
+ # Use torchvision to read an input image, convert it to gray and store it as a CHW Tensor
+ h_in_chw = torchvision.io.read_image(args.input, torchvision.io.ImageReadMode.GRAY)
+
+ # Convert the image read from Pytorch Tensor to CVCUDA Tensor with zero copy
+ d_in_chw = cvcuda.as_tensor(h_in_chw.cuda(), layout="CHW")
+
+ # Reshape CVCUDA Tensor from CHW to HW (Channel is 1) with zero copy
+ d_in_hw = d_in_chw.reshape(d_in_chw.shape[1:], "HW")
+
+ # Tensors are initialized first on the host (h_) and then copied to the device (d_), using Pytorch's .as_tensor()
+ # and .cuda() methods, and then converted to CVCUDA with zero copy, using CVCUDA's .as_tensor() method
+ h_bgl = torch.as_tensor([args.background_label], dtype=h_in_chw.dtype)
+ d_bgl = cvcuda.as_tensor(h_bgl.cuda(), layout="N")
+
+ # Tensors for min/max thresholds, min size, and mask are optional
+ d_min_thrs = None
+ d_max_thrs = None
+ d_min_size = None
+ d_mask_hw = None
+
+ if args.min_threshold:
+ h_min_thrs = torch.as_tensor([args.min_threshold], dtype=h_in_chw.dtype)
+ d_min_thrs = cvcuda.as_tensor(h_min_thrs.cuda(), layout="N")
+
+ if args.max_threshold:
+ h_max_thrs = torch.as_tensor([args.max_threshold], dtype=h_in_chw.dtype)
+ d_max_thrs = cvcuda.as_tensor(h_max_thrs.cuda(), layout="N")
+
+ if args.min_size:
+ h_min_size = torch.as_tensor([args.min_size], dtype=torch.int32)
+ d_min_size = cvcuda.as_tensor(h_min_size.cuda(), layout="N")
+
+ if args.mask:
+ # Below are slices between 10% and 90% (a center box) to be considered inside the mask
+ s_h_in_mask = slice(int(0.1 * h_in_chw.shape[1]), int(0.9 * h_in_chw.shape[1]))
+ s_w_in_mask = slice(int(0.1 * h_in_chw.shape[2]), int(0.9 * h_in_chw.shape[2]))
+
+ # The mask in host is first initialized with zeros
+ h_mask_hw = torch.zeros(h_in_chw.shape[1:], dtype=h_in_chw.dtype)
+
+ # Then the center of the mask defined by the slices is set to 1
+ h_mask_hw[s_h_in_mask, s_w_in_mask] = 1
+
+ # The Pytorch Tensor mask is copied to CUDA and converted to CVCUDA Tensor
+ d_mask_hw = cvcuda.as_tensor(h_mask_hw.cuda(), layout="HW")
+
+ # Call CVCUDA label operator using the arguments set above
+ d_out, d_count, d_stats = cvcuda.label(
+ src=d_in_hw,
+ connectivity=cvcuda.CONNECTIVITY_4_2D,
+ assign_labels=cvcuda.LABEL.SEQUENTIAL,
+ mask_type=cvcuda.REMOVE_ISLANDS_OUTSIDE_MASK_ONLY,
+ count=True,
+ stats=True,
+ max_labels=args.max_labels,
+ bg_label=d_bgl,
+ min_thresh=d_min_thrs,
+ max_thresh=d_max_thrs,
+ min_size=d_min_size,
+ mask=d_mask_hw,
+ )
+
+ # Convert CVCUDA output Tensors to Pytorch with zero copy, using CVCUDA's .cuda() method, then copy the
+ # Pytorch Tensor to the CPU, using Pytorch's .cpu() method
+ h_out = torch.as_tensor(d_out.cuda()).cpu()
+ h_count = torch.as_tensor(d_count.cuda()).cpu()
+ h_stats = torch.as_tensor(d_stats.cuda()).cpu()
+
+ print(f"I Number of labels found: {h_count[0]}")
+
+ # The stats Tensor (with statistics) has a region mark at index 6 that is set to 1 for removed regions
+ # and set to 2 for regions in the mask that cannot be removed
+ num_removed = sum([1 if h_stats[0, si, 6] == 1 else 0 for si in range(h_count[0])])
+ num_in_mask = sum([1 if h_stats[0, si, 6] == 2 else 0 for si in range(h_count[0])])
+
+ print(f"I Number of labeled regions removed: {num_removed}")
+ print(f"I Number of labeled regions in the mask: {num_in_mask}")
+ print(f"I Number of labeled regions kept: {h_count[0] - num_removed}")
+
+ # Color the labels using default behavior: white foreground and black background
+ h_out_rgb_hwc = color_labels(h_out, h_bgl[0])
+
+ # Use torchvision to write the output image from a CHW Tensor
+ torchvision.io.write_png(h_out_rgb_hwc.permute(2, 0, 1), args.output)
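
A small usage sketch for `color_labels()` with the matplotlib colormap its docstring suggests; the labels tensor is a toy placeholder, and `color_labels` is assumed in scope from the sample above.

```python
# Usage sketch for color_labels() defined above; the labels tensor is a toy
# 3x3 placeholder with background label 0 and two foreground regions.
import torch
import matplotlib

h_labels_hw = torch.tensor([[0, 1, 1],
                            [0, 2, 2],
                            [0, 0, 2]])
h_rgb_hwc = color_labels(
    h_labels_hw,
    bgl=0,                             # label 0 keeps the background color
    cmap=matplotlib.colormaps["jet"],  # distinct color per labeled region
)
print(h_rgb_hwc.shape)  # torch.Size([3, 3, 3])
```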
diff --git a/samples/label/python/main.py b/samples/label/python/main.py
index aeff0f85a..cb664f690 100644
--- a/samples/label/python/main.py
+++ b/samples/label/python/main.py
@@ -159,8 +159,10 @@ def run_sample(
cuda_device = cuda.Device(device_id)
cuda_ctx = cuda_device.retain_primary_context()
cuda_ctx.push()
- cvcuda_stream = cvcuda.Stream()
- torch_stream = torch.cuda.ExternalStream(cvcuda_stream.handle)
+ # Use the default stream for cvcuda and torch
+ # Since we never created a stream, current will be the CUDA default stream
+ cvcuda_stream = cvcuda.Stream().current
+ torch_stream = torch.cuda.default_stream(device=cuda_device)
# docs_tag: end_setup_gpu
# docs_tag: encoder_decoder setup
@@ -168,7 +170,7 @@ def run_sample(
if os.path.splitext(input_path)[1] == ".jpg" or os.path.isdir(input_path):
# Treat this as data modality of images
decoder = ImageBatchDecoder(
- input_path, batch_size, device_id, cuda_ctx, cvcuda_perf
+ input_path, batch_size, device_id, cuda_ctx, cvcuda_stream, cvcuda_perf
)
encoder = ImageBatchEncoder(
output_dir,
diff --git a/samples/object_detection/python/main.py b/samples/object_detection/python/main.py
index 935e121a1..0741ea2c8 100644
--- a/samples/object_detection/python/main.py
+++ b/samples/object_detection/python/main.py
@@ -85,8 +85,10 @@ def run_sample(
cuda_device = cuda.Device(device_id)
cuda_ctx = cuda_device.retain_primary_context()
cuda_ctx.push()
- cvcuda_stream = cvcuda.Stream()
- torch_stream = torch.cuda.ExternalStream(cvcuda_stream.handle)
+ # Use the default stream for cvcuda and torch
+ # Since we never created a stream, current will be the CUDA default stream
+ cvcuda_stream = cvcuda.Stream().current
+ torch_stream = torch.cuda.default_stream(device=cuda_device)
# docs_tag: end_setup_gpu
# docs_tag: begin_setup_stages
@@ -96,7 +98,7 @@ def run_sample(
if os.path.splitext(input_path)[1] == ".jpg" or os.path.isdir(input_path):
# Treat this as data modality of images
decoder = ImageBatchDecoder(
- input_path, batch_size, device_id, cuda_ctx, cvcuda_perf
+ input_path, batch_size, device_id, cuda_ctx, cvcuda_stream, cvcuda_perf
)
encoder = ImageBatchEncoder(
@@ -107,11 +109,11 @@ def run_sample(
else:
# Treat this as data modality of videos
decoder = VideoBatchDecoder(
- input_path, batch_size, device_id, cuda_ctx, cvcuda_perf
+ input_path, batch_size, device_id, cuda_ctx, cvcuda_stream, cvcuda_perf
)
encoder = VideoBatchEncoder(
- output_dir, decoder.fps, device_id, cuda_ctx, cvcuda_perf
+ output_dir, decoder.fps, device_id, cuda_ctx, cvcuda_stream, cvcuda_perf
)
# Define the post-processor
diff --git a/samples/scripts/benchmark.py b/samples/scripts/benchmark.py
index fe252d263..938f080c8 100644
--- a/samples/scripts/benchmark.py
+++ b/samples/scripts/benchmark.py
@@ -195,7 +195,7 @@ def parse_nvtx_gpu_proj_trace_json(json_path):
# Grab the necessary values from the JSON file.
range_id = row["RangeId"]
- if range_id == "None":
+ if not range_id or range_id == "None":
continue
flat_name = row["Name"]
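For context, the strengthened check above skips rows whose RangeId is missing, empty, or the literal string "None". A small standalone sketch of the same filtering (the row contents are made up):

```python
rows = [
    {"RangeId": "12", "Name": "batch_0"},
    {"RangeId": None, "Name": "orphan"},     # falsy value -> skipped
    {"RangeId": "None", "Name": "orphan2"},  # literal "None" -> skipped
]

kept = []
for row in rows:
    range_id = row.get("RangeId")
    if not range_id or range_id == "None":
        continue
    kept.append(row["Name"])

print(kept)  # ['batch_0']
```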
diff --git a/samples/scripts/run_samples.sh b/samples/scripts/run_samples.sh
index dea98a584..7a8fc3025 100755
--- a/samples/scripts/run_samples.sh
+++ b/samples/scripts/run_samples.sh
@@ -1,6 +1,6 @@
#!/bin/bash -e
-# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -36,6 +36,16 @@ echo "SEGMENTATION_OUT_DIR: $SEGMENTATION_OUT_DIR"
echo "DETECTION_OUT_DIR: $DETECTION_OUT_DIR"
echo "DISTANCE_LABEL_OUT_DIR: $DISTANCE_LABEL_OUT_DIR"
+create_output_dir() {
+ local base_dir=$1
+ local run_number=1
+ while [[ -d "$base_dir/$run_number" ]]; do
+ let run_number++
+ done
+ mkdir -p "$base_dir/$run_number"
+ echo "$base_dir/$run_number"
+}
+
# Crop and Resize Sample
# Batch size 2
LD_LIBRARY_PATH=$SAMPLES_DIR/lib $SAMPLES_DIR/build/cropandresize/cvcuda_sample_cropandresize -i $SAMPLES_DIR/assets/images/ -b 2
@@ -45,20 +55,27 @@ LD_LIBRARY_PATH=$SAMPLES_DIR/lib $SAMPLES_DIR/build/cropandresize/cvcuda_sample_
# Run the classification Python sample with default settings, without any command-line args.
rm -rf "$CLASSIFICATION_OUT_DIR"
mkdir "$CLASSIFICATION_OUT_DIR"
-python3 $SAMPLES_DIR/classification/python/main.py -o "$CLASSIFICATION_OUT_DIR"
+CLASSIFICATION_RUN_DIR=$(create_output_dir "$CLASSIFICATION_OUT_DIR")
+python3 $SAMPLES_DIR/classification/python/main.py -o "$CLASSIFICATION_RUN_DIR"
# Run it on a specific image with batch size 1 with PyTorch backend.
-python3 $SAMPLES_DIR/classification/python/main.py -i $SAMPLES_DIR/assets/images/tabby_tiger_cat.jpg -b 1 -bk pytorch -o "$CLASSIFICATION_OUT_DIR"
+CLASSIFICATION_RUN_DIR=$(create_output_dir "$CLASSIFICATION_OUT_DIR")
+python3 $SAMPLES_DIR/classification/python/main.py -i $SAMPLES_DIR/assets/images/tabby_tiger_cat.jpg -b 1 -bk pytorch -o "$CLASSIFICATION_RUN_DIR"
# Run it on a specific image with batch size 4 with PyTorch backend. Uses the same image multiple times
-python3 $SAMPLES_DIR/classification/python/main.py -i $SAMPLES_DIR/assets/images/tabby_tiger_cat.jpg -b 4 -bk pytorch -o "$CLASSIFICATION_OUT_DIR"
+CLASSIFICATION_RUN_DIR=$(create_output_dir "$CLASSIFICATION_OUT_DIR")
+python3 $SAMPLES_DIR/classification/python/main.py -i $SAMPLES_DIR/assets/images/tabby_tiger_cat.jpg -b 4 -bk pytorch -o "$CLASSIFICATION_RUN_DIR"
# Run it on a folder worth of images with batch size 2 with PyTorch backend.
-python3 $SAMPLES_DIR/classification/python/main.py -i $SAMPLES_DIR/assets/images/ -b 2 -bk pytorch -o "$CLASSIFICATION_OUT_DIR"
+CLASSIFICATION_RUN_DIR=$(create_output_dir "$CLASSIFICATION_OUT_DIR")
+python3 $SAMPLES_DIR/classification/python/main.py -i $SAMPLES_DIR/assets/images/ -b 2 -bk pytorch -o "$CLASSIFICATION_RUN_DIR"
# Run it on a specific image with batch size 1 with TensorRT backend with saving the output in a specific directory.
-
-python3 $SAMPLES_DIR/classification/python/main.py -i $SAMPLES_DIR/assets/images/tabby_tiger_cat.jpg -b 1 -bk tensorrt -o "$CLASSIFICATION_OUT_DIR"
+CLASSIFICATION_RUN_DIR=$(create_output_dir "$CLASSIFICATION_OUT_DIR")
+python3 $SAMPLES_DIR/classification/python/main.py -i $SAMPLES_DIR/assets/images/tabby_tiger_cat.jpg -b 1 -bk tensorrt -o "$CLASSIFICATION_RUN_DIR"
# Run it on a specific image with batch size 2 with TensorRT backend with saving the output in a specific directory.
-python3 $SAMPLES_DIR/classification/python/main.py -i $SAMPLES_DIR/assets/images/tabby_tiger_cat.jpg -b 2 -bk tensorrt -o "$CLASSIFICATION_OUT_DIR"
+CLASSIFICATION_RUN_DIR=$(create_output_dir "$CLASSIFICATION_OUT_DIR")
+python3 $SAMPLES_DIR/classification/python/main.py -i $SAMPLES_DIR/assets/images/tabby_tiger_cat.jpg -b 2 -bk tensorrt -o "$CLASSIFICATION_RUN_DIR"
# Run it on a video with batch size 1 with TensorRT backend with saving the output in a specific directory.
-python3 $SAMPLES_DIR/classification/python/main.py -i $SAMPLES_DIR/assets/videos/pexels-ilimdar-avgezer-7081456.mp4 -b 1 -bk tensorrt -o "$CLASSIFICATION_OUT_DIR"
+CLASSIFICATION_RUN_DIR=$(create_output_dir "$CLASSIFICATION_OUT_DIR")
+python3 $SAMPLES_DIR/classification/python/main.py -i $SAMPLES_DIR/assets/videos/pexels-ilimdar-avgezer-7081456.mp4 -b 1 -bk tensorrt -o "$CLASSIFICATION_RUN_DIR"
+
# Run the classification C++ sample. Since the Python sample was already run, we can reuse the TensorRT model
# and the labels file generated by it.
# Batch size 1
@@ -66,44 +83,56 @@ LD_LIBRARY_PATH=$SAMPLES_DIR/lib $SAMPLES_DIR/build/classification/cvcuda_sample
# Batch size 2
LD_LIBRARY_PATH=$SAMPLES_DIR/lib $SAMPLES_DIR/build/classification/cvcuda_sample_classification -e /tmp/classification/model.2.224.224.trtmodel -i $SAMPLES_DIR/assets/images/tabby_tiger_cat.jpg -l /tmp/classification/labels.txt -b 2
-
# Run the segmentation Python sample with default settings, without any command-line args.
rm -rf "$SEGMENTATION_OUT_DIR"
mkdir "$SEGMENTATION_OUT_DIR"
-python3 $SAMPLES_DIR/segmentation/python/main.py -o "$SEGMENTATION_OUT_DIR"
+SEGMENTATION_RUN_DIR=$(create_output_dir "$SEGMENTATION_OUT_DIR")
+python3 $SAMPLES_DIR/segmentation/python/main.py -o "$SEGMENTATION_RUN_DIR"
# Run the segmentation sample with default settings for PyTorch backend.
-python3 $SAMPLES_DIR/segmentation/python/main.py -bk pytorch -o "$SEGMENTATION_OUT_DIR"
+SEGMENTATION_RUN_DIR=$(create_output_dir "$SEGMENTATION_OUT_DIR")
+python3 $SAMPLES_DIR/segmentation/python/main.py -bk pytorch -o "$SEGMENTATION_RUN_DIR"
# Run it on a single image with high batch size for the background class writing to a specific directory with PyTorch backend
-python3 $SAMPLES_DIR/segmentation/python/main.py -i $SAMPLES_DIR/assets/images/tabby_tiger_cat.jpg -o "$SEGMENTATION_OUT_DIR" -b 5 -c __background__ -bk pytorch
+SEGMENTATION_RUN_DIR=$(create_output_dir "$SEGMENTATION_OUT_DIR")
+python3 $SAMPLES_DIR/segmentation/python/main.py -i $SAMPLES_DIR/assets/images/tabby_tiger_cat.jpg -o "$SEGMENTATION_RUN_DIR" -b 5 -c __background__ -bk pytorch
# Run it on a folder worth of images with the default tensorrt backend
-python3 $SAMPLES_DIR/segmentation/python/main.py -i $SAMPLES_DIR/assets/images/ -o "$SEGMENTATION_OUT_DIR" -b 4 -c __background__
+SEGMENTATION_RUN_DIR=$(create_output_dir "$SEGMENTATION_OUT_DIR")
+python3 $SAMPLES_DIR/segmentation/python/main.py -i $SAMPLES_DIR/assets/images/ -o "$SEGMENTATION_RUN_DIR" -b 4 -c __background__
# Run it on a folder worth of images with PyTorch
-python3 $SAMPLES_DIR/segmentation/python/main.py -i $SAMPLES_DIR/assets/images/ -o "$SEGMENTATION_OUT_DIR" -b 5 -c __background__ -bk pytorch
+SEGMENTATION_RUN_DIR=$(create_output_dir "$SEGMENTATION_OUT_DIR")
+python3 $SAMPLES_DIR/segmentation/python/main.py -i $SAMPLES_DIR/assets/images/ -o "$SEGMENTATION_RUN_DIR" -b 5 -c __background__ -bk pytorch
# Run on a single image with custom resized input given to the sample for the dog class
-python3 $SAMPLES_DIR/segmentation/python/main.py -i $SAMPLES_DIR/assets/images/Weimaraner.jpg -o "$SEGMENTATION_OUT_DIR" -b 1 -c dog -th 512 -tw 512
+SEGMENTATION_RUN_DIR=$(create_output_dir "$SEGMENTATION_OUT_DIR")
+python3 $SAMPLES_DIR/segmentation/python/main.py -i $SAMPLES_DIR/assets/images/Weimaraner.jpg -o "$SEGMENTATION_RUN_DIR" -b 1 -c dog -th 512 -tw 512
# Run it on a video for class background.
-python3 $SAMPLES_DIR/segmentation/python/main.py -i $SAMPLES_DIR/assets/videos/pexels-ilimdar-avgezer-7081456.mp4 -b 4 -c __background__ -o "$SEGMENTATION_OUT_DIR"
+SEGMENTATION_RUN_DIR=$(create_output_dir "$SEGMENTATION_OUT_DIR")
+python3 $SAMPLES_DIR/segmentation/python/main.py -i $SAMPLES_DIR/assets/videos/pexels-ilimdar-avgezer-7081456.mp4 -b 4 -c __background__ -o "$SEGMENTATION_RUN_DIR"
# Run it on a video for class background with the PyTorch backend.
-python3 $SAMPLES_DIR/segmentation/python/main.py -i $SAMPLES_DIR/assets/videos/pexels-ilimdar-avgezer-7081456.mp4 -b 4 -c __background__ -bk pytorch -o "$SEGMENTATION_OUT_DIR"
-
+SEGMENTATION_RUN_DIR=$(create_output_dir "$SEGMENTATION_OUT_DIR")
+python3 $SAMPLES_DIR/segmentation/python/main.py -i $SAMPLES_DIR/assets/videos/pexels-ilimdar-avgezer-7081456.mp4 -b 4 -c __background__ -bk pytorch -o "$SEGMENTATION_RUN_DIR"
# Run the object detection Python sample with default settings, without any command-line args.
rm -rf "$DETECTION_OUT_DIR"
mkdir "$DETECTION_OUT_DIR"
-python3 $SAMPLES_DIR/object_detection/python/main.py -o "$DETECTION_OUT_DIR"
+DETECTION_RUN_DIR=$(create_output_dir "$DETECTION_OUT_DIR")
+python3 $SAMPLES_DIR/object_detection/python/main.py -o "$DETECTION_RUN_DIR"
# Run it with batch size 1 on a single image
-python3 $SAMPLES_DIR/object_detection/python/main.py -i $SAMPLES_DIR/assets/images/peoplenet.jpg -b 1 -o "$DETECTION_OUT_DIR"
+DETECTION_RUN_DIR=$(create_output_dir "$DETECTION_OUT_DIR")
+python3 $SAMPLES_DIR/object_detection/python/main.py -i $SAMPLES_DIR/assets/images/peoplenet.jpg -b 1 -o "$DETECTION_RUN_DIR"
# Run it with batch size 4 on a video
-python3 $SAMPLES_DIR/object_detection/python/main.py -i $SAMPLES_DIR/assets/videos/pexels-chiel-slotman-4423925-1920x1080-25fps.mp4 -b 4 -o "$DETECTION_OUT_DIR"
+DETECTION_RUN_DIR=$(create_output_dir "$DETECTION_OUT_DIR")
+python3 $SAMPLES_DIR/object_detection/python/main.py -i $SAMPLES_DIR/assets/videos/pexels-chiel-slotman-4423925-1920x1080-25fps.mp4 -b 4 -o "$DETECTION_RUN_DIR"
# Run it with batch size 3 on a folder of images
-python3 $SAMPLES_DIR/object_detection/python/main.py -i $SAMPLES_DIR/assets/images/ -b 3 -o "$DETECTION_OUT_DIR"
+DETECTION_RUN_DIR=$(create_output_dir "$DETECTION_OUT_DIR")
+python3 $SAMPLES_DIR/object_detection/python/main.py -i $SAMPLES_DIR/assets/images/ -b 3 -o "$DETECTION_RUN_DIR"
# Run it with the TensorFlow backend
-python3 $SAMPLES_DIR/object_detection/python/main.py -i $SAMPLES_DIR/assets/videos/pexels-chiel-slotman-4423925-1920x1080-25fps.mp4 -b 4 -bk tensorflow -o "$DETECTION_OUT_DIR"
-
+DETECTION_RUN_DIR=$(create_output_dir "$DETECTION_OUT_DIR")
+python3 $SAMPLES_DIR/object_detection/python/main.py -i $SAMPLES_DIR/assets/videos/pexels-chiel-slotman-4423925-1920x1080-25fps.mp4 -b 4 -bk tensorflow -o "$DETECTION_RUN_DIR"
# Run the distance label Python sample with default settings, without any command-line args.
rm -rf "$DISTANCE_LABEL_OUT_DIR"
mkdir "$DISTANCE_LABEL_OUT_DIR"
-python3 $SAMPLES_DIR/label/python/main.py -o "$DISTANCE_LABEL_OUT_DIR"
+DISTANCE_LABEL_RUN_DIR=$(create_output_dir "$DISTANCE_LABEL_OUT_DIR")
+python3 $SAMPLES_DIR/label/python/main.py -o "$DISTANCE_LABEL_RUN_DIR"
# Run it with batch size 1 on a single image
-python3 $SAMPLES_DIR/label/python/main.py -i $SAMPLES_DIR/assets/images/peoplenet.jpg -b 1 -o "$DISTANCE_LABEL_OUT_DIR"
+DISTANCE_LABEL_RUN_DIR=$(create_output_dir "$DISTANCE_LABEL_OUT_DIR")
+python3 $SAMPLES_DIR/label/python/main.py -i $SAMPLES_DIR/assets/images/peoplenet.jpg -b 1 -o "$DISTANCE_LABEL_RUN_DIR"
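create_output_dir gives each invocation its own numbered subdirectory so repeated runs never clobber earlier outputs. A minimal Python sketch of the same scheme, for readers who want it outside of bash (paths are illustrative):

```python
import os

def create_output_dir(base_dir: str) -> str:
    """Return base_dir/<n> for the first run number <n> not yet on disk."""
    run_number = 1
    while os.path.isdir(os.path.join(base_dir, str(run_number))):
        run_number += 1
    run_dir = os.path.join(base_dir, str(run_number))
    os.makedirs(run_dir)
    return run_dir

# First call creates <base>/1, the next <base>/2, and so on.
run_dir = create_output_dir("/tmp/classification")
```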
diff --git a/samples/segmentation/python/main.py b/samples/segmentation/python/main.py
index 02c8a9820..6ee5411a5 100644
--- a/samples/segmentation/python/main.py
+++ b/samples/segmentation/python/main.py
@@ -85,8 +85,10 @@ def run_sample(
cuda_device = cuda.Device(device_id)
cuda_ctx = cuda_device.retain_primary_context()
cuda_ctx.push()
- cvcuda_stream = cvcuda.Stream()
- torch_stream = torch.cuda.ExternalStream(cvcuda_stream.handle)
+ # Use the default stream for cvcuda and torch
+ # Since we never create a stream, the current stream will be the CUDA default stream
+ cvcuda_stream = cvcuda.Stream.current
+ torch_stream = torch.cuda.default_stream(device=device_id)
# docs_tag: end_setup_gpu
# docs_tag: begin_setup_stages
@@ -100,6 +102,7 @@ def run_sample(
batch_size,
device_id,
cuda_ctx,
+ cvcuda_stream,
cvcuda_perf,
)
@@ -115,6 +118,7 @@ def run_sample(
batch_size,
device_id,
cuda_ctx,
+ cvcuda_stream,
cvcuda_perf,
)
@@ -123,6 +127,7 @@ def run_sample(
decoder.fps,
device_id,
cuda_ctx,
+ cvcuda_stream,
cvcuda_perf,
)
@@ -169,7 +174,7 @@ def run_sample(
batch_idx = 0
while True:
cvcuda_perf.push_range("batch", batch_idx=batch_idx)
-
+ # Make sure that cvcuda and torch are using the same stream
with cvcuda_stream, torch.cuda.stream(torch_stream):
# Stage 1: decode
batch = decoder()
diff --git a/samples/segmentation/python/triton_client.py b/samples/segmentation/python/triton_client.py
index 7802fec2d..d6eff764d 100644
--- a/samples/segmentation/python/triton_client.py
+++ b/samples/segmentation/python/triton_client.py
@@ -104,8 +104,10 @@ def run_sample(
cuda_device = cuda.Device(device_id)
cuda_ctx = cuda_device.retain_primary_context()
cuda_ctx.push()
- cvcuda_stream = cvcuda.Stream()
- torch_stream = torch.cuda.ExternalStream(cvcuda_stream.handle)
+ # Use the default stream for cvcuda and torch
+ # Since we never create a stream, the current stream will be the CUDA default stream
+ cvcuda_stream = cvcuda.Stream.current
+ torch_stream = torch.cuda.default_stream(device=device_id)
# docs_tag: end_stream_setup
# docs_tag: begin_setup_triton_client
@@ -128,6 +130,7 @@ def run_sample(
batch_size,
device_id,
cuda_ctx,
+ cvcuda_stream,
cvcuda_perf,
)
@@ -166,11 +169,12 @@ def run_sample(
batch_size,
device_id,
cuda_ctx,
+ cvcuda_stream,
cvcuda_perf,
)
encoder = VideoBatchEncoder(
- output_dir, decoder.fps, device_id, cuda_ctx, cvcuda_perf
+ output_dir, decoder.fps, device_id, cuda_ctx, cvcuda_stream, cvcuda_perf
)
# Fire up encoder/decoder
diff --git a/src/cvcuda/CMakeLists.txt b/src/cvcuda/CMakeLists.txt
index 4a21a4c56..202caf756 100644
--- a/src/cvcuda/CMakeLists.txt
+++ b/src/cvcuda/CMakeLists.txt
@@ -22,7 +22,6 @@ set(CV_CUDA_OP_FILES
OpOSD.cpp
OpHistogramEq.cpp
OpAdvCvtColor.cpp
- OpFindContours.cpp
OpSIFT.cpp
OpMinMaxLoc.cpp
OpHistogram.cpp
@@ -69,6 +68,7 @@ set(CV_CUDA_OP_FILES
OpLabel.cpp
OpPairwiseMatcher.cpp
OpFindHomography.cpp
+ OpStack.cpp
)
# filter only the ones that match the pattern (case insensitive); should be set at the global level
@@ -91,7 +91,6 @@ else()
endif()
add_library(cvcuda SHARED
- OpStack.cpp
${CV_CUDA_LIB_FILES}
)
diff --git a/src/cvcuda/OpFindContours.cpp b/src/cvcuda/OpFindContours.cpp
deleted file mode 100644
index 8c5080908..000000000
--- a/src/cvcuda/OpFindContours.cpp
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "priv/OpFindContours.hpp"
-
-#include "priv/SymbolVersioning.hpp"
-
-#include
-#include
-#include
-#include
-
-namespace priv = cvcuda::priv;
-
-CVCUDA_DEFINE_API(0, 4, NVCVStatus, cvcudaFindContoursCreate,
- (NVCVOperatorHandle * handle, int32_t maxWidth, int32_t maxHeight, int32_t maxBatchSize))
-{
- return nvcv::ProtectCall(
- [&]
- {
- if (handle == nullptr)
- {
- throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT,
- "Pointer to NVCVOperator handle must not be NULL");
- }
-
- *handle = reinterpret_cast<NVCVOperatorHandle>(
- new priv::FindContours(nvcv::Size2D{maxWidth, maxHeight}, maxBatchSize));
- });
-}
-
-CVCUDA_DEFINE_API(0, 4, NVCVStatus, cvcudaFindContoursSubmit,
- (NVCVOperatorHandle handle, cudaStream_t stream, NVCVTensorHandle in, NVCVTensorHandle points,
- NVCVTensorHandle counts))
-{
- return nvcv::ProtectCall(
- [&]
- {
- nvcv::TensorWrapHandle point(points), input(in), count(counts);
- priv::ToDynamicRef<priv::FindContours>(handle)(stream, input, point, count);
- });
-}
diff --git a/src/cvcuda/OpLabel.cpp b/src/cvcuda/OpLabel.cpp
index 351cce2b4..807c99e6a 100644
--- a/src/cvcuda/OpLabel.cpp
+++ b/src/cvcuda/OpLabel.cpp
@@ -38,11 +38,11 @@ CVCUDA_DEFINE_API(0, 5, NVCVStatus, cvcudaLabelCreate, (NVCVOperatorHandle * han
});
}
-CVCUDA_DEFINE_API(0, 5, NVCVStatus, cvcudaLabelSubmit,
+CVCUDA_DEFINE_API(0, 7, NVCVStatus, cvcudaLabelSubmit,
(NVCVOperatorHandle handle, cudaStream_t stream, NVCVTensorHandle in, NVCVTensorHandle out,
NVCVTensorHandle bgLabel, NVCVTensorHandle minThresh, NVCVTensorHandle maxThresh,
- NVCVTensorHandle minSize, NVCVTensorHandle count, NVCVTensorHandle stats,
- NVCVConnectivityType connectivity, NVCVLabelType assignLabels))
+ NVCVTensorHandle minSize, NVCVTensorHandle count, NVCVTensorHandle stats, NVCVTensorHandle mask,
+ NVCVConnectivityType connectivity, NVCVLabelType assignLabels, NVCVLabelMaskType maskType))
{
return nvcv::ProtectCall(
[&]
@@ -50,6 +50,7 @@ CVCUDA_DEFINE_API(0, 5, NVCVStatus, cvcudaLabelSubmit,
cvcuda::priv::ToDynamicRef<cvcuda::priv::Label>(handle)(
stream, nvcv::TensorWrapHandle{in}, nvcv::TensorWrapHandle{out}, nvcv::TensorWrapHandle{bgLabel},
nvcv::TensorWrapHandle{minThresh}, nvcv::TensorWrapHandle{maxThresh}, nvcv::TensorWrapHandle{minSize},
- nvcv::TensorWrapHandle{count}, nvcv::TensorWrapHandle{stats}, connectivity, assignLabels);
+ nvcv::TensorWrapHandle{count}, nvcv::TensorWrapHandle{stats}, nvcv::TensorWrapHandle{mask},
+ connectivity, assignLabels, maskType);
});
}
diff --git a/src/cvcuda/include/cvcuda/OpFindContours.h b/src/cvcuda/include/cvcuda/OpFindContours.h
deleted file mode 100644
index 78ea04e40..000000000
--- a/src/cvcuda/include/cvcuda/OpFindContours.h
+++ /dev/null
@@ -1,124 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * @file OpFindContours.h
- *
- * @brief Defines types and functions to handle the resize operation.
- * @defgroup NVCV_C_ALGORITHM_FIND_CONTOURS Find Contours
- * @{
- */
-
-#ifndef CVCUDA_FIND_CONTOURS_H
-#define CVCUDA_FIND_CONTOURS_H
-
-#include "Operator.h"
-#include "Types.h"
-#include "detail/Export.h"
-
-#include
-#include
-#include
-#include
-
-#ifdef __cplusplus
-extern "C"
-{
-#endif
-
-/** Constructs and an instance of the resize operator.
- *
- * @param [out] handle Where the image instance handle will be written to.
- * + Must not be NULL.
- *
- * @retval #NVCV_ERROR_INVALID_ARGUMENT Handle is null.
- * @retval #NVCV_ERROR_OUT_OF_MEMORY Not enough memory to create the operator.
- * @retval #NVCV_SUCCESS Operation executed successfully.
- */
-CVCUDA_PUBLIC NVCVStatus cvcudaFindContoursCreate(NVCVOperatorHandle *handle, int32_t maxWidth, int32_t maxHeight,
- int32_t maxBatchSize);
-
-/**
- * Limitations:
- *
- * Input:
- * Data Layout: [kNHWC, kHWC]
- * Channels: [1]
- *
- * Data Type | Allowed
- * -------------- | -------------
- * 8bit Unsigned | Yes
- * 8bit Signed | No
- * 16bit Unsigned | No
- * 16bit Signed | No
- * 32bit Unsigned | No
- * 32bit Signed | No
- * 32bit Float | No
- * 64bit Float | No
- *
- * Output:
- * Data Layout: [kNHWC, kHWC]
- * Channels: [1, 3, 4]
- *
- * Data Type | Allowed
- * -------------- | -------------
- * 8bit Unsigned | Yes
- * 8bit Signed | No
- * 16bit Unsigned | Yes
- * 16bit Signed | No
- * 32bit Unsigned | No
- * 32bit Signed | Yes
- * 32bit Float | Yes
- * 64bit Float | No
- *
- * Input/Output dependency
- *
- * Property | Input == Output
- * -------------- | -------------
- * Data Layout | Yes
- * Data Type | Yes
- * Number | Yes
- * Channels | Yes
- * Width | Yes
- * Height | Yes
- *
- * @param [in] handle Handle to the operator.
- * + Must not be NULL.
- * @param [in] stream Handle to a valid CUDA stream.
- * @param [in] in GPU pointer to input data. Represents an 8-bit, unsigned,
- * single-channel image. Non-zero pixels are treated as 1's, and zero
- * pixels remain as 0's, which makes the image binary.
- * @param [out] points GPU pointer to output data. It contains the detected
- * contours for the input image. The data is structured as: [x_c0_p0,
- * y_c0_p0, ..., x_ci_pj, y_ci_pj, ...], where "ci" denotes a contour's
- * index in the output array and "pj" is a point's index within a
- * contour.
- * @param [out] numPoints Holds the number of contour points for each image.
- * Specifically, numPoints[i] gives the number of contours for the i-th
- * image, while numPoints[i][j] gives the number of points in the j-th
- * contour of i-th image.
- */
-/** @{ */
-CVCUDA_PUBLIC NVCVStatus cvcudaFindContoursSubmit(NVCVOperatorHandle handle, cudaStream_t stream, NVCVTensorHandle in,
- NVCVTensorHandle points, NVCVTensorHandle numPoints);
-/** @} */
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* CVCUDA_FIND_CONTOURS_H */
diff --git a/src/cvcuda/include/cvcuda/OpFindContours.hpp b/src/cvcuda/include/cvcuda/OpFindContours.hpp
deleted file mode 100644
index 29f84ffe3..000000000
--- a/src/cvcuda/include/cvcuda/OpFindContours.hpp
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * @file OpFindContours.hpp
- *
- * @brief Defines the public C++ Class for the resize operation.
- * @defgroup NVCV_CPP_ALGORITHM_FIND_CONTOURS Find Contours
- * @{
- */
-
-#ifndef CVCUDA_FIND_CONTOURS_HPP
-#define CVCUDA_FIND_CONTOURS_HPP
-
-#include "IOperator.hpp"
-#include "OpFindContours.h"
-
-#include
-#include
-#include
-#include
-#include
-
-namespace cvcuda {
-
-class FindContours final : public IOperator
-{
-public:
- static constexpr int32_t MAX_NUM_CONTOURS = 256;
- static constexpr int32_t MAX_CONTOUR_POINTS = 4 * 1024;
- static constexpr int32_t MAX_TOTAL_POINTS = MAX_NUM_CONTOURS * MAX_CONTOUR_POINTS;
-
- explicit FindContours() = delete;
- explicit FindContours(nvcv::Size2D maxSize, int32_t maxBatchSize);
-
- ~FindContours();
-
- void operator()(cudaStream_t stream, nvcv::Tensor &in, nvcv::Tensor &points, nvcv::Tensor &numPoints);
-
- virtual NVCVOperatorHandle handle() const noexcept override;
-
-private:
- NVCVOperatorHandle m_handle;
-};
-
-inline FindContours::FindContours(nvcv::Size2D maxSize, int32_t maxBatchSize)
-{
- nvcv::detail::CheckThrow(cvcudaFindContoursCreate(&m_handle, maxSize.w, maxSize.h, maxBatchSize));
- assert(m_handle);
-}
-
-inline FindContours::~FindContours()
-{
- nvcvOperatorDestroy(m_handle);
- m_handle = nullptr;
-}
-
-inline void FindContours::operator()(cudaStream_t stream, nvcv::Tensor &in, nvcv::Tensor &points,
- nvcv::Tensor &numPoints)
-{
- nvcv::detail::CheckThrow(
- cvcudaFindContoursSubmit(m_handle, stream, in.handle(), points.handle(), numPoints.handle()));
-}
-
-inline NVCVOperatorHandle FindContours::handle() const noexcept
-{
- return m_handle;
-}
-
-} // namespace cvcuda
-
-#endif // CVCUDA_FIND_CONTOURS_HPP
diff --git a/src/cvcuda/include/cvcuda/OpFindHomography.h b/src/cvcuda/include/cvcuda/OpFindHomography.h
index 6d5c5dcc5..b1806254d 100644
--- a/src/cvcuda/include/cvcuda/OpFindHomography.h
+++ b/src/cvcuda/include/cvcuda/OpFindHomography.h
@@ -105,15 +105,15 @@ CVCUDA_PUBLIC NVCVStatus cvcudaFindHomographyCreate(NVCVOperatorHandle *handle,
* from 0 to batch-1, j ranges from 4 to number of coordinates per image, and the data type being
* float2 for (x=x, y=y)
* + Number of coordinates must be >= 4
- * + Must have data type 2F32
- * + Must have rank 2
+ * + Must have data type 2F32 or F32
+ * + Must have rank 2 or 3
*
* @param [in] dstPts Input tensor, dstPts[i, j] is the set of coordinates for the destination image where i ranges
* from 0 to batch-1, j ranges from 4 to number of coordinates per image, and the data type being
* float2 for (x=x, y=y)
* + Number of coordinates must be >= 4
- * + Must have data type 2F32
- * + Must have rank 2
+ * + Must have data type 2F32 or F32
+ * + Must have rank 2 or 3
*
* @param [out] out Output tensor, models[i, j, k] is the output model tensor which maps the src points to dst points
* in image i, where i ranges from 0 to batch-1, j ranges from 0 to 2 and k ranges from 0 to 2, and
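The relaxed constraints above mean the point tensors may be rank 2 with packed 2F32 elements or rank 3 with an F32 trailing dimension of 2. A hedged sketch of the accepted shapes using PyTorch tensors (this only illustrates the layouts the validation accepts; actual submission goes through cvcudaFindHomographySubmit or the corresponding Python binding):

```python
import torch

batch, num_points = 2, 8  # at least 4 correspondences per sample

# Rank-3 layout: F32 with the trailing dimension holding (x, y).
src = torch.rand((batch, num_points, 2), dtype=torch.float32, device="cuda")
dst = src * 1.5 + 0.25  # a similarity transform, so a homography exists

# The rank-2 layout packs each (x, y) pair into a single 2F32 element;
# it is the same buffer viewed as (batch, num_points) of float2 values.

# The model output is always rank 3: one 3x3 F32 matrix per sample.
models = torch.empty((batch, 3, 3), dtype=torch.float32, device="cuda")
```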
diff --git a/src/cvcuda/include/cvcuda/OpLabel.h b/src/cvcuda/include/cvcuda/OpLabel.h
index 77f620a52..06a3a7ac8 100644
--- a/src/cvcuda/include/cvcuda/OpLabel.h
+++ b/src/cvcuda/include/cvcuda/OpLabel.h
@@ -101,7 +101,7 @@ CVCUDA_PUBLIC NVCVStatus cvcudaLabelCreate(NVCVOperatorHandle *handle);
* 16bit Unsigned | No
* 16bit Signed | No
* 32bit Unsigned | Yes
- * 32bit Signed | No
+ * 32bit Signed | Yes
* 32bit Float | No
* 64bit Float | No
*
@@ -116,6 +116,8 @@ CVCUDA_PUBLIC NVCVStatus cvcudaLabelCreate(NVCVOperatorHandle *handle);
* Height | Yes
* Depth | Yes
*
+ * @note The number of elements (pixels or voxels) in input and output tensors must be at most \f$ 2^{31} - 1 \f$.
+ *
* @param [in] handle Handle to the operator.
* + Must not be NULL.
* @param [in] stream Handle to a valid CUDA stream.
@@ -177,7 +179,7 @@ CVCUDA_PUBLIC NVCVStatus cvcudaLabelCreate(NVCVOperatorHandle *handle);
* done before this post-filter step, also known as island-removal step.
* + It must have the same number of samples as input and output tensors.
* + It must have one element per sample, i.e. number of channels must be 1 in a [NC] tensor.
- * + It must have U32 data type.
+ * + It must have S32 or U32 data type.
* + It may be NULL to not apply minimum size regions removal as a post-filter.
* + If not NULL, the \ref bgLabel and \ref stats tensors must not be NULL as well.
*
@@ -189,32 +191,50 @@ CVCUDA_PUBLIC NVCVStatus cvcudaLabelCreate(NVCVOperatorHandle *handle);
* of \ref stats tensor, and regions potentially removed by \ref minSize tensor.
* + It must have the same number of samples as input and output tensors.
* + It must have one element per sample, i.e. number of channels must be 1 in a [NC] tensor.
- * + It must have U32 data type.
+ * + It must have S32 or U32 data type.
* + It may be NULL to disregard counting the number of different labels found.
*
* @param [out] stats Statistics tensor. The expected layout is [NMA], meaning rank-3 tensor with first dimension
* as the number of samples N, matching input and output tensors, second dimension M as maximum
* number of different labels statistics to be computed, and a third dimension A as the amount
- * of statistics to be computed per label (fixed as 6 for 2D or 8 for 3D). If present, this
+ * of statistics to be computed per label (fixed as 7 for 2D or 9 for 3D). If present, this
* tensor is used by the operator to store information per connected-component label. The
* background label is ignored and thus its statistics are not computed.
* + It must have the same number of samples as input and output tensors.
* + It must have a number of statistics M per sample N equal to the maximum allowed number of
* label statistics that can be computed by the Label operator per sample image (or volume).
* The actual number of labels found is stored in \ref count (see above).
- * + For 2D labeling, it must have in the last dimension A=6 elements to store at: (0) the
+ * + For 2D labeling, it must have in the last dimension A=7 elements to store at: (0) the
* original label number; (1) leftmost position; (2) topmost position; (3) width size; (4)
- * height size; (5) count of pixels (i.e. size of the labeled region). And for 3D labeling,
- * it must have in the last dimension A=8 elements to store at: (0) the original label number;
- * (1) leftmost position; (2) topmost position; (3) shallowmost position; (4) width size; (5)
- * height size; (6) depth size; (7) count of voxels (i.e. size of the labeled region).
- * + It must have U32 data type.
+ * height size; (5) count of pixels (i.e. size of the labeled region); (6) region marks (0
+ * means no marks, 1 means region was removed, 2 means region inside the \ref mask will not be
+ * removed). And for 3D labeling, it must have in the last dimension A=9 elements to store
+ * at: (0) the original label number; (1) leftmost position; (2) topmost position; (3)
+ * shallowmost position; (4) width size; (5) height size; (6) depth size; (7) count of voxels
+ * (i.e. size of the labeled region); (8) region marks (0 means no marks, 1 means region was
+ * removed, 2 means region inside the \ref mask will not be removed).
+ * + It must have S32 or U32 data type.
* + It may be NULL to disregard computing statistics information on different labels found.
* + It must not be NULL if \ref assignLabel is NVCV_LABEL_SEQUENTIAL, the index of each label
* statistics is used as the new sequential label replacing the original label in the output,
* the sequential labels are up to the maximum capacity M
* + If not NULL, the \ref count tensor must not be NULL as well.
*
+ * @param [in] mask Mask tensor. The expected layout is [HWC] or [NHWC] for 2D masking or [DHWC] or [NDHWC] for 3D
+ * masking, with either explicit C dimension or missing C with channels embedded in the data type.
+ * The N dimension is the number of samples, if missing it is considered to be N=1, in case N=1
+ * and \ref in and \ref out tensors have N>1 the same mask is to be applied to all images (2D) or
+ * volumes (3D). A value of zero in the mask is considered to be outside the mask and non-zero is
+ * inside. The mask behavior is controlled by \ref maskType.
+ * + If the number of samples N is present in the layout, it must be either 1 or equal to N in the
+ * \ref in and \ref out tensors.
+ * + It must have the same height H and width W as \ref in and \ref out tensors.
+ * + It must have the same depth D as \ref in and \ref out tensors in case of 3D.
+ * + If channel C is present in the layout, it must be 1.
+ * + It must have S8 or U8 data type.
+ * + If not NULL and maskType is NVCV_REMOVE_ISLANDS_OUTSIDE_MASK_ONLY, the \ref minSize tensor
+ * must not be NULL as well.
+ *
* @param [in] connectivity Specify connectivity of elements for the operator, see \ref NVCVConnectivityType.
* + It must conform with \ref in and \ref out tensors, i.e. 3D labeling requires [DHWC]
* or [NDHWC] tensor layouts and 2D labeling requires [HWC] or [NHWC], where the C
@@ -224,6 +244,10 @@ CVCUDA_PUBLIC NVCVStatus cvcudaLabelCreate(NVCVOperatorHandle *handle);
* NVCV_LABEL_FAST to do fast labeling, i.e. assign non-consecutive label numbers fast.
* Use NVCV_LABEL_SEQUENTIAL to have consecutive label numbers instead.
*
+ * @param [in] maskType Specify how the mask tensor affects this operator, see \ref NVCVLabelMaskType. Use
+ * NVCV_REMOVE_ISLANDS_OUTSIDE_MASK_ONLY to only remove islands, i.e. regions with fewer than
+ * \ref minSize elements, that are outside the mask (defined by zeros in the mask).
+ *
* @retval #NVCV_ERROR_INVALID_ARGUMENT Some parameter is outside valid range.
* @retval #NVCV_ERROR_INTERNAL Internal error in the operator, invalid types passed in.
* @retval #NVCV_SUCCESS Operation executed successfully.
@@ -231,8 +255,9 @@ CVCUDA_PUBLIC NVCVStatus cvcudaLabelCreate(NVCVOperatorHandle *handle);
CVCUDA_PUBLIC NVCVStatus cvcudaLabelSubmit(NVCVOperatorHandle handle, cudaStream_t stream, NVCVTensorHandle in,
NVCVTensorHandle out, NVCVTensorHandle bgLabel, NVCVTensorHandle minThresh,
NVCVTensorHandle maxThresh, NVCVTensorHandle minSize, NVCVTensorHandle count,
- NVCVTensorHandle stats, NVCVConnectivityType connectivity,
- NVCVLabelType assignLabels);
+ NVCVTensorHandle stats, NVCVTensorHandle mask,
+ NVCVConnectivityType connectivity, NVCVLabelType assignLabels,
+ NVCVLabelMaskType maskType);
#ifdef __cplusplus
}
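To tie the new mask plumbing together, here is a hedged Python sketch of a masked label call in the style of the label sample earlier in this diff. The keyword names (count, stats, bg_label, min_size, mask, mask_type) and enum spellings follow that sample's conventions and are assumptions if your bindings differ:

```python
import torch
import cvcuda

# U8 input: zero is background, non-zero pixels form regions.
img = torch.zeros((1, 64, 64, 1), dtype=torch.uint8, device="cuda")
img[0, 8:16, 8:16, 0] = 255    # small island (64 pixels)
img[0, 30:60, 30:60, 0] = 255  # large region (900 pixels)

# Non-zero mask values protect the small island from island removal.
mask = torch.zeros((1, 64, 64, 1), dtype=torch.uint8, device="cuda")
mask[0, 4:20, 4:20, 0] = 1

bg_label = torch.zeros((1,), dtype=torch.int32, device="cuda")
min_size = torch.full((1,), 200, dtype=torch.int32, device="cuda")

out, count, stats = cvcuda.label(
    cvcuda.as_tensor(img, "NHWC"),
    connectivity=cvcuda.CONNECTIVITY_4_2D,
    assign_labels=cvcuda.LABEL.SEQUENTIAL,
    count=True,
    stats=True,
    bg_label=cvcuda.as_tensor(bg_label, "N"),
    min_size=cvcuda.as_tensor(min_size, "N"),
    mask=cvcuda.as_tensor(mask, "NHWC"),
    mask_type=cvcuda.REMOVE_ISLANDS_OUTSIDE_MASK_ONLY,
)
# stats[..., 6] holds the region mark: 1 = removed, 2 = kept because of the mask.
```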
diff --git a/src/cvcuda/include/cvcuda/OpLabel.hpp b/src/cvcuda/include/cvcuda/OpLabel.hpp
index 54ebd54e2..1b6997d9a 100644
--- a/src/cvcuda/include/cvcuda/OpLabel.hpp
+++ b/src/cvcuda/include/cvcuda/OpLabel.hpp
@@ -45,8 +45,8 @@ class Label final : public IOperator
void operator()(cudaStream_t stream, const nvcv::Tensor &in, const nvcv::Tensor &out, const nvcv::Tensor &bgLabel,
const nvcv::Tensor &minThresh, const nvcv::Tensor &maxThresh, const nvcv::Tensor &minSize,
- const nvcv::Tensor &count, const nvcv::Tensor &stats, NVCVConnectivityType connectivity,
- NVCVLabelType assignLabels) const;
+ const nvcv::Tensor &count, const nvcv::Tensor &stats, const nvcv::Tensor &mask,
+ NVCVConnectivityType connectivity, NVCVLabelType assignLabels, NVCVLabelMaskType maskType) const;
virtual NVCVOperatorHandle handle() const noexcept override;
@@ -69,11 +69,12 @@ inline Label::~Label()
inline void Label::operator()(cudaStream_t stream, const nvcv::Tensor &in, const nvcv::Tensor &out,
const nvcv::Tensor &bgLabel, const nvcv::Tensor &minThresh, const nvcv::Tensor &maxThresh,
const nvcv::Tensor &minSize, const nvcv::Tensor &count, const nvcv::Tensor &stats,
- NVCVConnectivityType connectivity, NVCVLabelType assignLabels) const
+ const nvcv::Tensor &mask, NVCVConnectivityType connectivity, NVCVLabelType assignLabels,
+ NVCVLabelMaskType maskType) const
{
nvcv::detail::CheckThrow(cvcudaLabelSubmit(m_handle, stream, in.handle(), out.handle(), bgLabel.handle(),
minThresh.handle(), maxThresh.handle(), minSize.handle(), count.handle(),
- stats.handle(), connectivity, assignLabels));
+ stats.handle(), mask.handle(), connectivity, assignLabels, maskType));
}
inline NVCVOperatorHandle Label::handle() const noexcept
diff --git a/src/cvcuda/include/cvcuda/OpSIFT.h b/src/cvcuda/include/cvcuda/OpSIFT.h
index 45fa7308e..39e5142f2 100644
--- a/src/cvcuda/include/cvcuda/OpSIFT.h
+++ b/src/cvcuda/include/cvcuda/OpSIFT.h
@@ -146,8 +146,7 @@ CVCUDA_PUBLIC NVCVStatus cvcudaSIFTCreate(NVCVOperatorHandle *handle, int3 maxSh
* + It must have S32 data type to store number of features found.
* + It must have one element per sample, i.e. number of channels must be 1 in a [NC] tensor.
*
- * @param [in] numOctaveLayers Number of layers in each octave. Since the minimum number of layers is 3, the
- * actual number is 3 + numOctaveLayers. One suggestion, given by the original
+ * @param [in] numOctaveLayers Number of layers in each octave. One suggestion, given by the original
* algorithm description, is to use numOctaveLayers = 3. The number of octaves is
* computed from the input image resolution WxH as \f$ log(min(W, H))/log(2) - 2 \f$.
* + It must be positive.
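The octave count mentioned above depends only on the input resolution, not on numOctaveLayers. A small sketch of that formula (plain arithmetic, no CV-CUDA calls):

```python
import math

def num_octaves(width: int, height: int) -> int:
    # Per the doc above: log(min(W, H)) / log(2) - 2.
    return int(math.log(min(width, height)) / math.log(2) - 2)

print(num_octaves(640, 480))  # 6
```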
diff --git a/src/cvcuda/include/cvcuda/Types.h b/src/cvcuda/include/cvcuda/Types.h
index 37eb2e0cf..8dc5131f9 100644
--- a/src/cvcuda/include/cvcuda/Types.h
+++ b/src/cvcuda/include/cvcuda/Types.h
@@ -402,6 +402,12 @@ typedef enum
NVCV_LABEL_SEQUENTIAL, //!< Assigns consecutive numbers to labels.
} NVCVLabelType;
+// @brief Defines how the mask affects the label operation
+typedef enum
+{
+ NVCV_REMOVE_ISLANDS_OUTSIDE_MASK_ONLY, //!< Prevent removing islands inside the mask
+} NVCVLabelMaskType;
+
// @brief Defines pair-wise matcher algorithms of choice
typedef enum
{
diff --git a/src/cvcuda/priv/CMakeLists.txt b/src/cvcuda/priv/CMakeLists.txt
index 6b28a39f7..fa0e8c390 100644
--- a/src/cvcuda/priv/CMakeLists.txt
+++ b/src/cvcuda/priv/CMakeLists.txt
@@ -18,7 +18,6 @@ add_subdirectory(legacy)
set(CV_CUDA_PRIV_FILES IOperator.cpp)
set(CV_CUDA_PRIV_OP_FILES
- OpFindContours.cpp
OpOSD.cpp
OpHistogramEq.cpp
OpAdvCvtColor.cu
diff --git a/src/cvcuda/priv/OpBrightnessContrast.cu b/src/cvcuda/priv/OpBrightnessContrast.cu
index f97f67b49..2e55c3dfd 100644
--- a/src/cvcuda/priv/OpBrightnessContrast.cu
+++ b/src/cvcuda/priv/OpBrightnessContrast.cu
@@ -72,8 +72,7 @@ struct BatchArgsWrap
};
template<typename BT>
-inline __host__ __device__ BT GetArg(const cuda::Tensor1DWrap<BT> &tensorArg, int argLen, int sampleIdx,
- BT defaultVal)
+inline __device__ BT GetArg(const cuda::Tensor1DWrap<BT> &tensorArg, int argLen, int sampleIdx, BT defaultVal)
{
if (argLen == 0)
{
@@ -90,7 +89,7 @@ inline __host__ __device__ BT GetArg(const cuda::Tensor1DWrap &tensorA
}
template<typename BT>
-inline __host__ __device__ SampleArgs GetBrightnessContrastArg(const BatchArgsWrap &args, int sampleIdx)
+inline __device__ SampleArgs GetBrightnessContrastArg(const BatchArgsWrap &args, int sampleIdx)
{
return {GetArg(args.brightness, args.brightnessLen, sampleIdx, BT{1}),
GetArg(args.contrast, args.contrastLen, sampleIdx, BT{1}),
diff --git a/src/cvcuda/priv/OpFindContours.cpp b/src/cvcuda/priv/OpFindContours.cpp
deleted file mode 100644
index 51d253e1f..000000000
--- a/src/cvcuda/priv/OpFindContours.cpp
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "OpFindContours.hpp"
-
-#include "legacy/CvCudaLegacy.h"
-#include "legacy/CvCudaLegacyHelpers.hpp"
-
-#include
-#include
-
-namespace cvcuda::priv {
-
-namespace legacy = nvcv::legacy::cuda_op;
-
-FindContours::FindContours(nvcv::Size2D maxSize, int maxBatchSize)
-{
- legacy::DataShape maxIn, maxOut;
- // maxIn/maxOut not used by op.
- maxIn.N = maxBatchSize;
- maxIn.C = 1;
- maxIn.H = maxSize.h;
- maxIn.W = maxSize.w;
-
- m_legacyOp = std::make_unique<legacy::FindContours>(maxIn, maxOut);
-}
-
-void FindContours::operator()(cudaStream_t stream, const nvcv::Tensor &in, const nvcv::Tensor &points,
- const nvcv::Tensor &numPoints) const
-{
- auto inData = in.exportData();
- if (inData == nullptr)
- {
- throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT,
- "Input must be cuda-accessible, pitch-linear tensor");
- }
-
- auto pointCoords = points.exportData();
- if (pointCoords == nullptr)
- {
- throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT,
- "Output must be cuda-accessible, pitch-linear tensor");
- }
-
- auto pointCounts = numPoints.exportData();
- if (pointCounts == nullptr)
- {
- throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT,
- "Output must be cuda-accessible, pitch-linear tensor");
- }
-
- NVCV_CHECK_THROW(m_legacyOp->infer(*inData, *pointCoords, *pointCounts, stream));
-}
-
-} // namespace cvcuda::priv
diff --git a/src/cvcuda/priv/OpFindContours.hpp b/src/cvcuda/priv/OpFindContours.hpp
deleted file mode 100644
index ec4f21134..000000000
--- a/src/cvcuda/priv/OpFindContours.hpp
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * @file OpFindContours.hpp
- *
- * @brief Defines the private C++ Class for the find contours operation.
- */
-
-#ifndef CVCUDA_PRIV_FIND_CONTOURS_HPP
-#define CVCUDA_PRIV_FIND_CONTOURS_HPP
-
-#include "IOperator.hpp"
-#include "legacy/CvCudaLegacy.h"
-
-#include
-#include
-
-#include
-
-namespace cvcuda::priv {
-
-namespace legacy = nvcv::legacy::cuda_op;
-
-class FindContours final : public IOperator
-{
-public:
- explicit FindContours() = delete;
-
- explicit FindContours(nvcv::Size2D maxSize, int maxBatchSize);
-
- void operator()(cudaStream_t stream, const nvcv::Tensor &in, const nvcv::Tensor &points,
- const nvcv::Tensor &numPoints) const;
-
-private:
- std::unique_ptr<legacy::FindContours> m_legacyOp;
-};
-
-} // namespace cvcuda::priv
-
-#endif // CVCUDA_PRIV_FIND_CONTOURS_HPP
diff --git a/src/cvcuda/priv/OpFindHomography.cu b/src/cvcuda/priv/OpFindHomography.cu
index d3e712cbe..7e8beef82 100644
--- a/src/cvcuda/priv/OpFindHomography.cu
+++ b/src/cvcuda/priv/OpFindHomography.cu
@@ -295,12 +295,12 @@ __device__ void calculate_residual_and_jacobian_device(float2 *src, float2 *dst,
}
}
-__host__ __device__ inline float myfabs(float val)
+__device__ inline float myfabs(float val)
{
return fabsf(val);
}
-inline __host__ __device__ float2 myfabs2(float2 val)
+inline __device__ float2 myfabs2(float2 val)
{
float2 ret;
ret.x = fabsf(val.x);
@@ -308,14 +308,14 @@ inline __host__ __device__ float2 myfabs2(float2 val)
return ret;
}
-__host__ __device__ inline int getNumPoints(cuda::Tensor2DWrap<float2> src, int numPoints, int batch)
+__device__ inline int getNumPoints(cuda::Tensor2DWrap<float2> src, int numPoints, int batch)
{
return numPoints;
}
struct MeanOp
{
- __host__ __device__ float2 eval(float2 val, int numPoints, int batch)
+ __device__ float2 eval(float2 val, int numPoints, int batch)
{
return val / numPoints;
}
@@ -323,7 +323,7 @@ struct MeanOp
struct SquareOp
{
- __host__ __device__ float eval(float val, int batch)
+ __device__ float eval(float val, int batch)
{
return val * val;
}
@@ -336,11 +336,11 @@ private:
public:
// Constructor that takes a float* pointer as a parameter
- __host__ __device__ AbsShiftOp(float2 *data)
+ __host__ AbsShiftOp(float2 *data)
: _data(data){};
// Method to update the float value pointed to by the pointer
- __host__ __device__ float2 eval(float2 newVal, int numPoints, int batch)
+ __device__ float2 eval(float2 newVal, int numPoints, int batch)
{
_data += batch;
return myfabs2(newVal - _data[0]);
@@ -353,7 +353,7 @@ private:
float2 *cm, *cM, *sm, *sM;
public:
- __host__ __device__ LtLOp(float2 *srcMean, float2 *dstMean, float2 *srcShiftSum, float2 *dstShiftSum)
+ __host__ LtLOp(float2 *srcMean, float2 *dstMean, float2 *srcShiftSum, float2 *dstShiftSum)
{
cM = srcMean;
sM = srcShiftSum;
@@ -361,7 +361,7 @@ public:
sm = dstShiftSum;
}
- __host__ __device__ float eval(float2 *src, float2 *dst, int batch, int numPoints, int tid, int j, int k)
+ __device__ float eval(float2 *src, float2 *dst, int batch, int numPoints, int tid, int j, int k)
{
cm += batch;
cM += batch;
@@ -1410,10 +1410,59 @@ void FindHomographyWrapper(SrcDstWrapper srcWrap, SrcDstWrapper dstWrap, ModelTy
calc_buffer, modelWrap, numPoints, batchSize);
}
-template<typename SrcDstType>
-void RunFindHomography(const SrcDstType &src, const SrcDstType &dst, const nvcv::TensorDataStridedCuda &models,
- const BufferOffsets *bufferOffset, const cuSolver *cusolverData, cudaStream_t stream)
+inline void RunFindHomography(const nvcv::TensorDataStridedCuda &src, const nvcv::TensorDataStridedCuda &dst,
+ const nvcv::TensorDataStridedCuda &models, const BufferOffsets *bufferOffset,
+ const cuSolver *cusolverData, cudaStream_t stream)
{
+ // validation of input data
+ if ((src.rank() != 2 && src.rank() != 3) || (dst.rank() != 2 && dst.rank() != 3))
+ {
+ throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT,
+ "source and destination points must have rank 2 or 3");
+ }
+
+ if (!(src.shape(0) == dst.shape(0) && src.shape(0) == models.shape(0)))
+ {
+ throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT,
+ "source, destination and model must have same batch size");
+ }
+
+ if (src.shape(1) != dst.shape(1))
+ {
+ throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT,
+ "source and destination array length must be same length to return a valid model");
+ }
+
+ if (src.shape(1) < 4 || dst.shape(1) < 4)
+ {
+ throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT,
+ "source and destination array length must be >=4 to return a valid model");
+ }
+
+ if (!(models.rank() == 3 && models.shape(1) == 3 && models.shape(2) == 3 && models.dtype() == nvcv::TYPE_F32))
+ {
+ throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT,
+ "model tensor must be 2D with shape 3x3 and data type F32");
+ }
+
+ if (!((src.rank() == 2 && src.dtype() == nvcv::TYPE_2F32)
+ || (src.rank() == 3 && src.dtype() == nvcv::TYPE_F32 && src.shape(2) == 2)))
+ {
+ throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT,
+ "source tensor must have data type 2F32 or F32 with last shape 2");
+ }
+ if (!((dst.rank() == 2 && dst.dtype() == nvcv::TYPE_2F32)
+ || (dst.rank() == 3 && dst.dtype() == nvcv::TYPE_F32 && dst.shape(2) == 2)))
+ {
+ throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT,
+ "destination tensor must have data type 2F32 or F32 with last shape 2");
+ }
+ if (!(src.stride(1) == sizeof(float2) && dst.stride(1) == sizeof(float2)))
+ {
+ throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT,
+ "source and destination tensors must have last dimensions packed");
+ }
+
+ using SrcDstWrapper = cuda::Tensor2DWrap<float2>;
SrcDstWrapper srcWrap(src);
SrcDstWrapper dstWrap(dst);
@@ -1498,42 +1547,6 @@ void FindHomography::operator()(cudaStream_t stream, const nvcv::Tensor &srcPoin
"Input must be cuda-accessible, pitch-linear tensor");
}
- // validation of input data
- if (!((srcData->rank() == dstData->rank()) && (srcData->rank() == 2)))
- {
- throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "source and destination points must have rank 2");
- }
-
- if (!(srcData->shape(0) == dstData->shape(0) && srcData->shape(0) == modelData->shape(0)))
- {
- throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT,
- "source, destination and model must have same batch size");
- }
-
- if (srcData->shape(1) != dstData->shape(1))
- {
- throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT,
- "source and destination array length must be same length to return a valid model");
- }
-
- if (srcData->shape(1) < 4 || dstData->shape(1) < 4)
- {
- throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT,
- "source and destination array length must be >=4 to return a valid model");
- }
-
- if (!(modelData->rank() == 3 && modelData->shape(1) == 3 && modelData->shape(2) == 3))
- {
- throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "model tensor must be 2D with shape 3x3");
- }
-
- if (!(srcData->dtype() == nvcv::TYPE_2F32 && dstData->dtype() == nvcv::TYPE_2F32
- && modelData->dtype() == nvcv::TYPE_F32))
- {
- throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT,
- "source, destination and model tensors must have data type F32");
- }
-
RunFindHomography(*srcData, *dstData, *modelData, &bufferOffset, &cusolverData, stream);
}
@@ -1569,45 +1582,6 @@ void FindHomography::operator()(cudaStream_t stream, const nvcv::TensorBatch &sr
"model must be cuda-accessible, pitch-linear tensor");
}
- // validation of input data
- if (!((srcData->shape(0) == dstData->shape(0)) && (srcData->shape(0) == modelData->shape(0))
- && (srcData->shape(0) == 1)))
- {
- throw nvcv::Exception(
- nvcv::Status::ERROR_INVALID_ARGUMENT,
- "Invdividual samples (src, dst and model) in the batch must be tensors with batch size 1");
- }
-
- if (!((srcData->rank() == dstData->rank()) && (srcData->rank() == 2)))
- {
- throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT,
- "source and destination tensors must have rank 2");
- }
-
- if (srcData->shape(1) != dstData->shape(1))
- {
- throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT,
- "source and destination array length must be same length to return a valid model");
- }
-
- if (srcData->shape(1) < 4 || dstData->shape(1) < 4)
- {
- throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT,
- "source and destination array length must be >=4 to return a valid model");
- }
-
- if (!(modelData->rank() == 3 && modelData->shape(1) == 3 && modelData->shape(2) == 3))
- {
- throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "model tensor must be 2D with shape 3x3");
- }
-
- if (!(srcData->dtype() == nvcv::TYPE_2F32 && dstData->dtype() == nvcv::TYPE_2F32
- && modelData->dtype() == nvcv::TYPE_F32))
- {
- throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT,
- "source, destination and model tensors must have data type F32");
- }
-
RunFindHomography(*srcData, *dstData, *modelData, &bufferOffset, &cusolverData, stream);
}
}
diff --git a/src/cvcuda/priv/OpLabel.cu b/src/cvcuda/priv/OpLabel.cu
index 8a1c51182..b552e8656 100644
--- a/src/cvcuda/priv/OpLabel.cu
+++ b/src/cvcuda/priv/OpLabel.cu
@@ -68,6 +68,10 @@ namespace util = nvcv::util;
namespace {
+constexpr int REGION_NOT_MARKED = 0;
+constexpr int REGION_REMOVED = 1;
+constexpr int REGION_INSIDE_MASK = 2;
+
// CUDA kernels ----------------------------------------------------------------
template