diff --git a/.gitattributes b/.gitattributes
index 1255c68cb..892050179 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -21,3 +21,4 @@
*.mp4 filter=lfs diff=lfs merge=lfs -text
*.a filter=lfs diff=lfs merge=lfs -text
*.hdf5 filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md
index 540810c2f..359a414ff 100644
--- a/.github/ISSUE_TEMPLATE/bug_report.md
+++ b/.github/ISSUE_TEMPLATE/bug_report.md
@@ -7,6 +7,22 @@ assignees: ''
---
+[//]: # "SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved."
+[//]: # "SPDX-License-Identifier: Apache-2.0"
+[//]: # ""
+[//]: # "Licensed under the Apache License, Version 2.0 (the 'License');"
+[//]: # "you may not use this file except in compliance with the License."
+[//]: # "You may obtain a copy of the License at"
+[//]: # "http://www.apache.org/licenses/LICENSE-2.0"
+[//]: # ""
+[//]: # "Unless required by applicable law or agreed to in writing, software"
+[//]: # "distributed under the License is distributed on an 'AS IS' BASIS"
+[//]: # "WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied."
+[//]: # "See the License for the specific language governing permissions and"
+[//]: # "limitations under the License."
+
+
+
**Describe the bug**
A clear and concise description of the bug.
diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md
index d4b540a76..ffd08012b 100644
--- a/.github/ISSUE_TEMPLATE/feature_request.md
+++ b/.github/ISSUE_TEMPLATE/feature_request.md
@@ -7,6 +7,22 @@ assignees: ''
---
+[//]: # "SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved."
+[//]: # "SPDX-License-Identifier: Apache-2.0"
+[//]: # ""
+[//]: # "Licensed under the Apache License, Version 2.0 (the 'License');"
+[//]: # "you may not use this file except in compliance with the License."
+[//]: # "You may obtain a copy of the License at"
+[//]: # "http://www.apache.org/licenses/LICENSE-2.0"
+[//]: # ""
+[//]: # "Unless required by applicable law or agreed to in writing, software"
+[//]: # "distributed under the License is distributed on an 'AS IS' BASIS"
+[//]: # "WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied."
+[//]: # "See the License for the specific language governing permissions and"
+[//]: # "limitations under the License."
+
+
+
**Is your feature request related to a problem? Please describe.**
A clear and concise description of the problem. Ex. I wish I could use CV-CUDA to do [...]
diff --git a/.github/ISSUE_TEMPLATE/submit-question.md b/.github/ISSUE_TEMPLATE/submit-question.md
index 72b2b74c5..6900ea6b2 100644
--- a/.github/ISSUE_TEMPLATE/submit-question.md
+++ b/.github/ISSUE_TEMPLATE/submit-question.md
@@ -7,4 +7,20 @@ assignees: ''
---
+[//]: # "SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved."
+[//]: # "SPDX-License-Identifier: Apache-2.0"
+[//]: # ""
+[//]: # "Licensed under the Apache License, Version 2.0 (the 'License');"
+[//]: # "you may not use this file except in compliance with the License."
+[//]: # "You may obtain a copy of the License at"
+[//]: # "http://www.apache.org/licenses/LICENSE-2.0"
+[//]: # ""
+[//]: # "Unless required by applicable law or agreed to in writing, software"
+[//]: # "distributed under the License is distributed on an 'AS IS' BASIS"
+[//]: # "WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied."
+[//]: # "See the License for the specific language governing permissions and"
+[//]: # "limitations under the License."
+
+
+
**What is your question?**
diff --git a/.gitignore b/.gitignore
index 718aa64f1..4b0a6a14c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -18,6 +18,7 @@
/build/
/build-*/
/install/
+/cvcuda-installer*/
# Visual Studio Code
# ------------------
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index f0db22406..f5197e2ed 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -72,8 +72,8 @@ repos:
- id: copyright_check
name: 'check copyright message'
language: system
- types: ['file', 'text']
- exclude_types: ['markdown', 'xml', 'json', 'csv']
+ types: ['file', 'text', 'markdown']
+ exclude_types: ['xml', 'json', 'csv']
entry: ./lint/copyright_check.sh
exclude: 'models/.*'
- id: lfs_check
@@ -83,7 +83,7 @@ repos:
require_serial: true
- repo: https://github.com/alessandrojcm/commitlint-pre-commit-hook
- rev: v9.0.0
+ rev: v9.13.0
hooks:
- id: commitlint
stages: [commit-msg]
diff --git a/CMakeLists.txt b/CMakeLists.txt
index fccd9c7eb..0f98aedef 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -23,7 +23,7 @@ endif()
project(cvcuda
LANGUAGES C CXX
- VERSION 0.6.0
+ VERSION 0.7.0
DESCRIPTION "CUDA-accelerated Computer Vision algorithms"
)
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index d21011b97..37852a875 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -16,7 +16,7 @@
# Contributing to CV-CUDA
-**As of release v0.6.0-beta, CV-CUDA is not accepting outside contribution.**
+**As of release v0.7.0-beta, CV-CUDA is not accepting outside contribution.**
Contributions to CV-CUDA fall into the following categories:
@@ -28,7 +28,7 @@ Contributions to CV-CUDA fall into the following categories:
1. To propose a new feature, please file a new feature request
[issue](https://github.com/CVCUDA/CV-CUDA/issues/new/choose). Describe the
intended feature and discuss the design and implementation with the team and
- community. NOTE: Currently, as of release v0.6.0-beta, CV-CUDA is not accepting
+ community. NOTE: Currently, as of release v0.7.0-beta, CV-CUDA is not accepting
outside contribution.
1. To ask a general question, please submit a question
[issue](https://github.com/CVCUDA/CV-CUDA/issues/new/choose). If you need
diff --git a/DEVELOPER_GUIDE.md b/DEVELOPER_GUIDE.md
index a5f4bec53..83e42f22c 100644
--- a/DEVELOPER_GUIDE.md
+++ b/DEVELOPER_GUIDE.md
@@ -30,7 +30,7 @@ CV-CUDA includes:
- C, C++, and Python APIs
- Batching support, with variable shape images
- Zero-copy interfaces to PyTorch
-- Sample applications: object classification and image segmentation
+- Sample applications
## What Pre- and Post-Processing Operators Are Included?
@@ -40,7 +40,7 @@ CV-CUDA includes:
| Advanced Color Format Conversions | Performs color conversion from interleaved RGB/BGR <-> YUV/YVU and semi planar. Supported standards: BT.601. BT.709. BT.2020 |
| AverageBlur | Reduces image noise using an average filter |
| BilateralFilter | Reduces image noise while preserving strong edges |
-| Bounding Box | Draws an rectangular border using the X-Y coordinates and dimensions typically to define the location and size of an object in an image |
+| Bounding Box | Draws a rectangular border using the X-Y coordinates and dimensions typically to define the location and size of an object in an image |
| Box Blurring | Overlays a blurred rectangle using the X-Y coordinates and dimensions that define the location and size of an object in an image |
| Brightness_Contrast | Adjusts brightness and contrast of an image |
| CenterCrop | Crops an image at its center |
@@ -53,8 +53,6 @@ CV-CUDA includes:
| CvtColor | Converts an image from one color space to another |
| DataTypeConvert | Converts an image’s data type, with optional scaling |
| Erase | Erases image regions |
-| Find Contours | Extract closed contours from an input binary image |
-| FindHomography | Calculates a perspective transform from four pairs of the corresponding points |
| Flip | Flips a 2D image around its axis |
| GammaContrast | Adjusts image contrast |
| Gaussian | Applies a gaussian blur filter to the image |
@@ -70,9 +68,9 @@ CV-CUDA includes:
| MinArea Rect | Finds the minimum area rotated rectangle typically used to draw bounding rectangle with minimum area |
| MinMaxLoc | Finds the maximum and minimum values in a given array |
| Morphology | Performs morphological erode and dilate transformations |
-| Morphology (close) | Performs morphological operation that involves dilation followed by erosion on an image |
-| Morphology (open) | Performs morphological operation that involves erosion followed by dilation on an image |
-| Non-max Suppression | Enables selecting a single entity out of many overlapping ones typically used for selecting from multiple bounding boxes during object detection |
+| Morphology (close) | Performs a morphological operation that involves dilation followed by erosion on an image |
+| Morphology (open) | Performs a morphological operation that involves erosion followed by dilation on an image |
+| Non-Maximum Suppression | Enables selecting a single entity out of many overlapping ones typically used for selecting from multiple bounding boxes during object detection |
| Normalize | Normalizes an image pixel’s range |
| OSD (Polyline Line Text Rotated Rect Segmented Mask) | Displays an overlay on the image of different forms including polyline line text rotated rectangle segmented mask |
| PadStack | Stacks several images into a tensor with border extension |
@@ -83,20 +81,19 @@ CV-CUDA includes:
| Remap | Maps pixels in an image with one projection to another projection in a new image. |
| Resize | Changes the size and scale of an image |
| Rotate | Rotates a 2D array in multiples of 90 degrees |
-| SIFT | Identifies and matches features in images that are invariant to scale rotation and affine distortion. |
-| Stack | Concatenates two input tensors into a single output tensor |
+| SIFT | Identifies and describes features in images that are invariant to scale, rotation, and affine distortion. |
| Thresholding | Chooses a global threshold value that is the same for all pixels across the image. |
| WarpAffine | Applies an affine transformation to an image |
| WarpPerspective | Applies a perspective transformation to an image |
## Where Are the Release Notes?
-An awesome product requires excellent support. CV-CUDA release notes can be
+CV-CUDA release notes can be
found [here](https://github.com/CVCUDA/CV-CUDA/releases)
## Where Can I Get Help?
-File requests for enhancements and bug reports
+An awesome product requires excellent support. File requests for enhancements and bug reports
[here](https://github.com/CVCUDA/CV-CUDA/issues/new/choose).
We are providing limited, direct support to select enterprises using CV-CUDA.
@@ -208,5 +205,5 @@ companies with which they are associated.
Copyright
-© 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+© 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
diff --git a/README.md b/README.md
index 31ae4466b..4eaf478d2 100644
--- a/README.md
+++ b/README.md
@@ -15,9 +15,10 @@
# CV-CUDA
+
[![License](https://img.shields.io/badge/License-Apache_2.0-yellogreen.svg)](https://opensource.org/licenses/Apache-2.0)
-![Version](https://img.shields.io/badge/Version-v0.6.0--beta-blue)
+![Version](https://img.shields.io/badge/Version-v0.7.0--beta-blue)
![Platform](https://img.shields.io/badge/Platform-linux--64_%7C_win--64_wsl2%7C_aarch64-gray)
@@ -33,7 +34,7 @@ efficient pre- and post-processing pipelines. CV-CUDA originated as a
collaborative effort between [NVIDIA][NVIDIA Develop] and [ByteDance][ByteDance].
Refer to our [Developer Guide](DEVELOPER_GUIDE.md) for more information on the
-operators available as of release v0.6.0-beta.
+operators available.
## Getting Started
@@ -43,10 +44,10 @@ To get a local copy up and running follow these steps.
|CV-CUDA Build|Platform|CUDA Version|CUDA Compute Capability|Hardware Architectures|Nvidia Driver|Python Versions|Supported Compilers (build from source)|API compatibility with prebuilt binaries|OS/Linux distributions tested with prebuilt packages|
|-|-|-|-|-|-|-|-|-|-|
-|x86_64_cu11|x86_64|11.7 or later|SM7 and later|Volta, Turing, Amper, Hopper, Ada Lovelace|r520 or later*** |3.8, 3.9, 3.10, 3.11|gcc>=9*<br>gcc>=11**|gcc>=9|Ubuntu>= 20.04<br>WSL2/Ubuntu>=20.04|
-|x86_64_cu12|x86_64|12.2 or later|SM7 and later|Volta, Turing, Amper, Hopper, Ada Lovelace|r520 or later***|3.8, 3.9, 3.10, 3.11|gcc>=9*<br>gcc>=11**|gcc>=9|Ubuntu>= 20.04<br>WSL2/Ubuntu>=20.04|
-|aarch64_cu11 (JetPack 5.1)|aarch64|11.4|SM7 and later|Jetson AGX Orin|JetPack 5.1|3.8|gcc>=9*<br>gcc>=11**|gcc>=9|Jetson Linux 35.x|
-|aarch64_cu12 (JetPack 6.0)|aarch64|12.2|SM7 and later|Jetson AGX Orin|JetPack 6.0 DP|3.10|gcc>=9*<br>gcc>=11**|gcc>=9|Jetson Linux 36.2|
+|x86_64_cu11|x86_64|11.7 or later|SM7 and later|Volta, Turing, Ampere, Hopper, Ada Lovelace|r525 or later*** |3.8, 3.9, 3.10, 3.11|gcc>=9*<br>gcc>=11**|gcc>=9|Ubuntu>= 20.04<br>WSL2/Ubuntu>=20.04|
+|x86_64_cu12|x86_64|12.2 or later|SM7 and later|Volta, Turing, Ampere, Hopper, Ada Lovelace|r525 or later***|3.8, 3.9, 3.10, 3.11|gcc>=9*<br>gcc>=11**|gcc>=9|Ubuntu>= 20.04<br>WSL2/Ubuntu>=20.04|
+|aarch64_cu11|aarch64|11.4|SM7 and later|Jetson AGX Orin|JetPack 5.1|3.8|gcc>=9*<br>gcc>=11**|gcc>=9|Jetson Linux 35.x|
+|aarch64_cu12|aarch64|12.2|SM7 and later|Jetson AGX Orin, IGX Orin + Ampere RTX6000, IGX Orin + ADA RTX6000|JetPack 6.0 DP, r535 (IGX OS v0.6)|3.10|gcc>=9*<br>gcc>=11**|gcc>=9|Jetson Linux 36.2<br>IGX OS v0.6|
\* partial build, no test module (see Known Limitations)
\** full build, including test module
@@ -58,7 +59,7 @@ To get a local copy up and running follow these steps.
- The C++ test module cannot build with gcc<11 (requires specific C++-20 features). With gcc-9 or gcc-10, please build with option `-DBUILD_TESTS=0`
- [CV-CUDA Samples] require driver r535 or later to run and are only officially supported with CUDA 12.
- Only one CUDA version (CUDA 11.x or CUDA 12.x) of CV-CUDA packages (Debian packages, tarballs, Python Wheels) can be installed at a time. Please uninstall all packages from a given CUDA version before installing packages from a different version.
-- Test tarballs (cvcuda-tests-*.tar.xz) need to be unpacked at the root level to find existing tests.
+- Documentation built with older toolchains (doxygen, sphinx, breathe, exhale) may be incomplete. We recommend using Ubuntu 22.04 or later.
### Installation
@@ -66,211 +67,189 @@ For convenience, we provide pre-built packages for various combinations of CUDA
The following steps describe how to install CV-CUDA from such pre-built packages.
We support two main alternative pathways:
-- DEB or Tar archive installation (C++/CUDA Libraries, Headers, Python bindings)
- Standalone Python Wheels (containing C++/CUDA Libraries and Python bindings)
+- DEB or Tar archive installation (C++/CUDA Libraries, Headers, Python bindings)
Choose the installation method that meets your environment needs.
-#### Tar File Installation
+#### Python Wheel File Installation
-- Installation of C++/CUDA libraries (cvcuda-lib*) and development headers (cvcuda-dev*):
-```shell
-tar -xvf cvcuda-lib-0.6.0_beta-<cu_ver>-<arch>-linux.tar.xz
-tar -xvf cvcuda-dev-0.6.0_beta-<cu_ver>-<arch>-linux.tar.xz
-```
-- Installation of Python bindings (cvcuda-python*)
-```shell
-tar -xvf cvcuda-python<py_ver>-0.6.0_beta-<cu_ver>-<arch>-linux.tar.xz
-```
-with `<cu_ver>` the desired CUDA version,
-`<py_ver>` the desired Python version and
-`<arch>` the desired architecture
+Download the appropriate .whl file for your computer architecture, Python and CUDA version from the release assets of the current CV-CUDA release. Release information of all CV-CUDA releases can be found [here][CV-CUDA GitHub Releases]. Once downloaded, execute the `pip install` command to install the Python wheel. For example:
+ ```shell
+ pip install cvcuda_<cu_ver>-0.7.0b0-cp<py_ver>-cp<py_ver>-linux_<arch>.whl
+ ```
+
+where `<cu_ver>` is the desired CUDA version, `<py_ver>` is the desired Python version and `<arch>` is the desired architecture.
+
+Please note that the Python wheels are standalone: they include both the C++/CUDA libraries and the Python bindings.
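As a quick post-install check, the wheel can be imported directly from Python. A minimal sketch; the `__version__` attribute is an assumption here, not a documented guarantee:

```python
# Smoke test for a freshly installed CV-CUDA wheel.
import cvcuda

# Assumption: the module exposes a __version__ string (e.g. "0.7.0").
print(cvcuda.__version__)
```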
#### DEB File Installation
-- Installation of C++/CUDA libraries (cvcuda-lib*) and development headers (cvcuda-dev*):
+Install C++/CUDA libraries (cvcuda-lib*) and development headers (cvcuda-dev*) using `apt`:
```shell
-sudo apt-get install -y ./cvcuda-lib-0.6.0_beta-<cu_ver>-<arch>-linux.deb ./cvcuda-dev-0.6.0_beta-<cu_ver>-<arch>-linux.deb
+apt install -y ./cvcuda-lib-<x.x.x>-<cu_ver>-<arch>-linux.deb ./cvcuda-dev-<x.x.x>-<cu_ver>-<arch>-linux.deb
```
-- Installation of Python bindings (cvcuda-python*)
+
+Install Python bindings (cvcuda-python*) using `apt`:
```shell
-sudo apt-get install -y cvcuda-python<py_ver>-0.6.0_beta-<cu_ver>-<arch>-linux.deb
+apt install -y ./cvcuda-python<py_ver>-<x.x.x>-<cu_ver>-<arch>-linux.deb
```
-with `<cu_ver>` the desired CUDA version,
-`<py_ver>` the desired Python version and
-`<arch>` the desired architecture
-
-#### Python Wheel File Installation
-
-
-Download the appropriate .whl file for your computer architecture, Python and CUDA version from the release assets of current CV-CUDA release. Release information of all CV-CUDA releases can be accessed [here][CV-CUDA GitHub Releases]. Once downloaded, execute the `pip install` command to install the Python wheel. For example:
+where `<cu_ver>` is the desired CUDA version, `<py_ver>` is the desired Python version and `<arch>` is the desired architecture.
+#### Tar File Installation
+Install C++/CUDA libraries (cvcuda-lib*) and development headers (cvcuda-dev*):
```shell
-pip install cvcuda_<cu_ver>-0.6.0b0-cp<py_ver>-cp<py_ver>-linux_<arch>.whl
+tar -xvf cvcuda-lib-<x.x.x>-<cu_ver>-<arch>-linux.tar.xz
+tar -xvf cvcuda-dev-<x.x.x>-<cu_ver>-<arch>-linux.tar.xz
```
-with `<cu_ver>` the desired CUDA version,
-`<py_ver>` the desired Python version and
-`<arch>` the desired architecture
-
-Please note that the Python wheels provided are standalone, they include both the C++/CUDA libraries and the Python bindings.
+Install Python bindings (cvcuda-python*)
+```shell
+tar -xvf cvcuda-python<py_ver>-<x.x.x>-<cu_ver>-<arch>-linux.tar.xz
+```
+where `<cu_ver>` is the desired CUDA version, `<py_ver>` is the desired Python version and `<arch>` is the desired architecture.
### Build from Source
Follow these instructions to build CV-CUDA from source:
-1. Set up your local CV-CUDA repository
-
- a. Install prerequisites needed to setup up the repository.
-
- On Ubuntu >= 20.04, install the following packages:
- - git-lfs: to retrieve binary files from remote repository
-
- ```shell
- sudo apt-get install -y git git-lfs
- ```
-
- b. After cloning the repository (assuming it was cloned in `~/cvcuda`),
- it needs to be properly configured by running the `init_repo.sh` script only once.
-
- ```shell
- cd ~/cvcuda
- ./init_repo.sh
- ```
-
-2. Build CV-CUDA
-
- a. Install the dependencies required for building CV-CUDA
-
- On Ubuntu >= 20.04, install the following packages:
- - g++-11: compiler to be used
- - cmake (>= 3.20), ninja-build (optional): manage build rules
- - python3-dev: for python bindings
- - libssl-dev: needed by the testsuite (MD5 hashing utilities)
-
- ```shell
- sudo apt-get install -y g++-11 cmake ninja-build python3-dev libssl-dev
- ```
-
- For CUDA Toolkit, any version of the 11.x or 12.x series should work.
- CV-CUDA was tested with 11.7 and 12.2, thus those should be preferred.
-
- ```shell
- sudo apt-get install -y cuda-11-7
- # or
- sudo apt-get install -y cuda-12-2
- ```
-
- b. Build the project
+#### 1. Set up your local CV-CUDA repository
- ```shell
- ci/build.sh [release|debug] [output build tree path] [-DBUILD_TESTS=1|0] [-DPYTHON_VERSIONS='3.8;3.9;3.10;3.11'] [-DPUBLIC_API_COMPILERS='gcc-9;gcc-11;clang-11;clang-14']
- ```
+Install the dependencies needed to set up the repository:
+- git
+- git-lfs: to retrieve binary files from remote repository
- The default build type is 'release'.
-
- If output build tree path isn't specified, it will be `build-rel` for release
- builds, and `build-deb` for debug.
-
- The library is in `build-rel/lib` and executables (tests, etc...) are in `build-rel/bin`.
+On Ubuntu >= 20.04, install the following packages using `apt`:
+```shell
+apt install -y git git-lfs
+```
- The `-DBUILD_TESTS` option can be used to disable/enable building the tests (enabled by default, see Known Limitations).
+Clone the repository:
+```shell
+git clone https://github.com/CVCUDA/CV-CUDA.git
+```
- The `-DPYTHON_VERSIONS` option can be used to select Python versions to build bindings and Wheels for.
- By default, only the default system Python3 version will be selected.
+Assuming the repository was cloned in `~/cvcuda`, it needs to be properly configured by running the `init_repo.sh` script only once.
- The `-DPUBLIC_API_COMPILERS` option can be used to select the compilers used to check public API compatibility.
- By default, gcc-11, gcc-9, clang-11, and clang-14 is tried to be selected and checked.
+```shell
+cd ~/cvcuda
+./init_repo.sh
+```
-3. Build Documentation
+#### 2. Build CV-CUDA
- a. Install the dependencies required for building the documentation
+Install the dependencies required to build CV-CUDA:
+- g++-11: compiler to be used
+- cmake (>= 3.20), ninja-build (optional): manage build rules
+- python3-dev: for python bindings
+- libssl-dev: needed by the testsuite (MD5 hashing utilities)
+- CUDA toolkit
- On Ubuntu >= 20.04, install the following packages:
- - doxygen: parse header files for reference documentation
- - python3, python3-pip: to install some python packages needed
- - sphinx, breathe, exhale, recommonmark, graphiviz: to render the documentation
- - sphinx-rtd-theme: documenation theme used
+On Ubuntu >= 20.04, install the following packages using `apt`:
+```shell
+apt install -y g++-11 cmake ninja-build python3-dev libssl-dev
+```
- ```shell
- sudo apt-get install -y doxygen graphviz python3 python3-pip
- sudo python3 -m pip install sphinx==4.5.0 breathe exhale recommonmark graphviz sphinx-rtd-theme
- ```
+Any version of the 11.x or 12.x CUDA toolkit should work.
+CV-CUDA was tested with 11.7 and 12.2, so these versions are recommended.
- b. Build the documentation
- ```shell
- ci/build_docs.sh [build folder]
- ```
+```shell
+apt install -y cuda-11-7
+# or
+apt install -y cuda-12-2
+```
- Example:
- `ci/build_docs.sh build_docs`
+Build the project:
+```shell
+ci/build.sh [release|debug] [output build tree path] [-DBUILD_TESTS=1|0] [-DPYTHON_VERSIONS='3.8;3.9;3.10;3.11'] [-DPUBLIC_API_COMPILERS='gcc-9;gcc-11;clang-11;clang-14']
+```
-4. Build and run Samples
+- The default build type is 'release'.
+- If output build tree path isn't specified, it will be `build-rel` for release
+ builds, and `build-deb` for debug.
+- The library is in `build-rel/lib` and executables (tests, etc...) are in `build-rel/bin`.
+- The `-DBUILD_TESTS` option can be used to disable/enable building the tests (enabled by default, see Known Limitations).
+- The `-DPYTHON_VERSIONS` option can be used to select Python versions to build bindings and Wheels for. By default, only the default system Python3 version will be selected.
+- The `-DPUBLIC_API_COMPILERS` option can be used to select the compilers used to check public API compatibility. By default, the build tries to select and check gcc-11, gcc-9, clang-11, and clang-14.
- For instructions on how to build samples from source and run them, see the [Samples](samples/README.md) documentation.
+#### 3. Build Documentation
-5. Run Tests
+Known limitation: documentation built with older toolchains (doxygen, sphinx, breathe, exhale) may be incomplete. We recommend using Ubuntu 22.04 or later.
- a. Install the dependencies required for running the tests
+Install the dependencies required to build the documentation:
+- doxygen: parse header files for reference documentation
+- python3, python3-pip: to install some python packages needed
+- sphinx, breathe, exhale, recommonmark, graphviz: to render the documentation
+- sphinx-rtd-theme: documentation theme used
- On Ubuntu >= 20.04, install the following packages:
- - python3, python3-pip: to run python bindings tests
- - torch: dependencies needed by python bindings tests
+On Ubuntu, install the following packages using `apt` and `pip`:
+```shell
+apt install -y doxygen graphviz python3 python3-pip
+python3 -m pip install sphinx==4.5.0 breathe exhale recommonmark graphviz sphinx-rtd-theme
+```
- ```shell
- sudo apt-get install -y python3 python3-pip
- sudo python3 -m pip install pytest torch
- ```
+Build the documentation:
+```shell
+ci/build_docs.sh [build folder]
+```
+The default build folder is 'build'.
- b. Run the tests
+#### 4. Build and run Samples
- The tests are in `<buildtree>/bin`. You can run the script below to run all
- tests at once. Here's an example when build tree is created in `build-rel`
+For instructions on how to build samples from source and run them, see the [Samples](samples/README.md) documentation.
- ```shell
- build-rel/bin/run_tests.sh
- ```
+#### 5. Run Tests
-6. Package installers and Python Wheels
+Install the dependencies required for running the tests:
+- python3, python3-pip: to run python bindings tests
+- torch: dependencies needed by python bindings tests
- a. Package installers
+On Ubuntu >= 20.04, install the following packages using `apt` and `pip`:
+```shell
+apt install -y python3 python3-pip
+python3 -m pip install pytest torch
+```
- Installers can be generated using the following cpack command once you have successfully built the project
+The tests are in `<buildtree>/bin`. You can run the script below to run all tests at once. Here's an example when the build tree is created in `build-rel`:
+```shell
+build-rel/bin/run_tests.sh
+```
- ```shell
- cd build-rel
- cpack .
- ```
+#### 6. Package installers and Python Wheels
- This will generate in the build directory both Debian installers and tarballs
- (\*.tar.xz), needed for integration in other distros.
+Package installers
- For a fine-grained choice of what installers to generate, the full syntax is:
+Installers can be generated using the following cpack command once you have successfully built the project:
+```shell
+cd build-rel
+cpack .
+```
+This will generate both Debian installers and tarballs (\*.tar.xz) in the build directory; the tarballs are needed for integration in other distros.
- ```shell
- cpack . -G [DEB|TXZ]
- ```
+For a fine-grained choice of what installers to generate, the full syntax is:
- - DEB for Debian packages
- - TXZ for \*.tar.xz tarballs.
+```shell
+cpack . -G [DEB|TXZ]
+```
+- DEB for Debian packages
+- TXZ for \*.tar.xz tarballs.
- b. Python Wheels
+Python Wheels
- By default during the `release` build, Python bindings and wheels are created for the available CUDA version and the specified Python
- version(s). The wheels are stored in `build-rel/pythonX.Y/wheel` folder, where `build-rel` is the build directory
- used to build the release build and `X` and `Y` are Python major and minor versions. The built wheels can be installed using pip.
- For example, to install the Python wheel built for CUDA 12.x, Python 3.10 on Linux x86_64 systems:
+By default during the `release` build, Python bindings and wheels are created for the available CUDA version and the specified Python version(s). The wheels are stored in `build-rel/pythonX.Y/wheel` folder, where `build-rel` is the build directory used to build the release build and `X` and `Y` are Python major and minor versions.
- ```shell
- pip install cvcuda_cu12-0.6.0b0-cp310-cp310-linux_x86_64.whl
- ```
+The built wheels can be installed using pip.
+For example, to install the Python wheel built for CUDA 12.x, Python 3.10 on Linux x86_64 systems:
+```shell
+pip install cvcuda_cu12-<x.x.x>-cp310-cp310-linux_x86_64.whl
+```
## Contributing
CV-CUDA is an open source project. As part of the Open Source Community, we are
committed to the cycle of learning, improving, and updating that makes this
-community thrive. However, as of release v0.6.0-beta, CV-CUDA is not yet ready
+community thrive. However, as of release v0.7.0-beta, CV-CUDA is not yet ready
for external contributions.
To understand the process for contributing to CV-CUDA, see our
@@ -287,27 +266,27 @@ The `mkop.sh` script is a powerful tool for creating a scaffold for new operator
1. **Operator Stub Creation**: Generates no-op (no-operation) operator templates, which serve as a starting point for implementing new functionalities.
-1. **File Customization**: Modifies template files to include the new operator's name, ensuring consistent naming conventions across the codebase.
+2. **File Customization**: Modifies template files to include the new operator's name, ensuring consistent naming conventions across the codebase.
-1. **CMake Integration**: Adds the new operator files to the appropriate CMakeLists, facilitating seamless compilation and integration into the build system.
+3. **CMake Integration**: Adds the new operator files to the appropriate CMakeLists, facilitating seamless compilation and integration into the build system.
-1. **Python Bindings**: Creates Python wrapper stubs for the new operator, allowing it to be used within Python environments.
+4. **Python Bindings**: Creates Python wrapper stubs for the new operator, allowing it to be used within Python environments.
-1. **Test Setup**: Generates test files for both C++ and Python, enabling immediate development of unit tests for the new operator.
+5. **Test Setup**: Generates test files for both C++ and Python, enabling immediate development of unit tests for the new operator.
#### How to Use `mkop.sh`:
-Run the script with the desired operator name. The script assumes it's located in `/cvcuda/tools/mkop`.
+Run the script with the desired operator name. The script assumes it's located in `~/cvcuda/tools/mkop`.
- ```shell
- ./mkop.sh [Operator Name]
- ```
+```shell
+./mkop.sh [Operator Name]
+```
If the script is run from a different location, provide the path to the CV-CUDA root directory.
- ```shell
- ./mkop.sh [Operator Name] [CV-CUDA root]
- ```
+```shell
+./mkop.sh [Operator Name] [CV-CUDA root]
+```
**NOTE**: The first letter of the new operator name is capitalized where needed to match the rest of the file structures.
diff --git a/bench/BenchAdaptiveThreshold.cpp b/bench/BenchAdaptiveThreshold.cpp
index 658281fd4..10fe8570f 100644
--- a/bench/BenchAdaptiveThreshold.cpp
+++ b/bench/BenchAdaptiveThreshold.cpp
@@ -92,5 +92,5 @@ using AdaptiveThresholdTypes = nvbench::type_list;
NVBENCH_BENCH_TYPES(AdaptiveThreshold, NVBENCH_TYPE_AXES(AdaptiveThresholdTypes))
.set_type_axes_names({"InOutDataType"})
.add_string_axis("shape", {"1x1080x1920"})
- .add_int64_axis("varShape", {-1})
+ .add_int64_axis("varShape", {-1, 0})
.add_int64_axis("blockSize", {7});
diff --git a/bench/BenchAverageBlur.cpp b/bench/BenchAverageBlur.cpp
index fbfc9c4cd..0736ccd47 100644
--- a/bench/BenchAverageBlur.cpp
+++ b/bench/BenchAverageBlur.cpp
@@ -88,6 +88,6 @@ using AverageBlurTypes = nvbench::type_list;
NVBENCH_BENCH_TYPES(AverageBlur, NVBENCH_TYPE_AXES(AverageBlurTypes))
.set_type_axes_names({"InOutDataType"})
.add_string_axis("shape", {"1x1080x1920"})
- .add_int64_axis("varShape", {-1})
+ .add_int64_axis("varShape", {-1, 0})
.add_string_axis("kernelSize", {"7x7"})
.add_string_axis("border", {"REPLICATE"});
diff --git a/bench/BenchBilateralFilter.cpp b/bench/BenchBilateralFilter.cpp
index 73875d8ed..ff41b9494 100644
--- a/bench/BenchBilateralFilter.cpp
+++ b/bench/BenchBilateralFilter.cpp
@@ -90,7 +90,7 @@ using BilateralFilterTypes = nvbench::type_list;
NVBENCH_BENCH_TYPES(BilateralFilter, NVBENCH_TYPE_AXES(BilateralFilterTypes))
.set_type_axes_names({"InOutDataType"})
.add_string_axis("shape", {"1x1080x1920"})
- .add_int64_axis("varShape", {-1})
+ .add_int64_axis("varShape", {-1, 0})
.add_int64_axis("diameter", {-1})
.add_float64_axis("sigmaSpace", {1.2})
.add_string_axis("border", {"REFLECT"});
diff --git a/bench/BenchBrightnessContrast.cpp b/bench/BenchBrightnessContrast.cpp
index 8e741169a..ea79f5a13 100644
--- a/bench/BenchBrightnessContrast.cpp
+++ b/bench/BenchBrightnessContrast.cpp
@@ -88,4 +88,4 @@ using BrightnessContrastTypes = nvbench::type_list;
NVBENCH_BENCH_TYPES(BrightnessContrast, NVBENCH_TYPE_AXES(BrightnessContrastTypes))
.set_type_axes_names({"InOutDataType"})
.add_string_axis("shape", {"1x1080x1920"})
- .add_int64_axis("varShape", {-1});
+ .add_int64_axis("varShape", {-1, 0});
diff --git a/bench/BenchColorTwist.cpp b/bench/BenchColorTwist.cpp
index 67e90af8b..1ade029f4 100644
--- a/bench/BenchColorTwist.cpp
+++ b/bench/BenchColorTwist.cpp
@@ -82,4 +82,4 @@ using ColorTwistTypes = nvbench::type_list;
NVBENCH_BENCH_TYPES(ColorTwist, NVBENCH_TYPE_AXES(ColorTwistTypes))
.set_type_axes_names({"InOutDataType"})
.add_string_axis("shape", {"1x1080x1920"})
- .add_int64_axis("varShape", {-1});
+ .add_int64_axis("varShape", {-1, 0});
diff --git a/bench/BenchComposite.cpp b/bench/BenchComposite.cpp
index 2293ecab0..f29f26acf 100644
--- a/bench/BenchComposite.cpp
+++ b/bench/BenchComposite.cpp
@@ -88,4 +88,4 @@ using CompositeTypes = nvbench::type_list;
NVBENCH_BENCH_TYPES(Composite, NVBENCH_TYPE_AXES(CompositeTypes))
.set_type_axes_names({"InOutDataType"})
.add_string_axis("shape", {"1x1080x1920"})
- .add_int64_axis("varShape", {-1});
+ .add_int64_axis("varShape", {-1, 0});
diff --git a/bench/BenchCopyMakeBorder.cpp b/bench/BenchCopyMakeBorder.cpp
index 722c37d03..8d26487a7 100644
--- a/bench/BenchCopyMakeBorder.cpp
+++ b/bench/BenchCopyMakeBorder.cpp
@@ -92,5 +92,5 @@ using CopyMakeBorderTypes = nvbench::type_list;
NVBENCH_BENCH_TYPES(CopyMakeBorder, NVBENCH_TYPE_AXES(CopyMakeBorderTypes))
.set_type_axes_names({"InOutDataType"})
.add_string_axis("shape", {"1x1080x1920"})
- .add_int64_axis("varShape", {-1})
+ .add_int64_axis("varShape", {-1, 0})
.add_string_axis("border", {"REFLECT101"});
diff --git a/bench/BenchCvtColor.cpp b/bench/BenchCvtColor.cpp
index 05469e0f7..abe1951ea 100644
--- a/bench/BenchCvtColor.cpp
+++ b/bench/BenchCvtColor.cpp
@@ -80,4 +80,4 @@ using CvtColorTypes = nvbench::type_list;
NVBENCH_BENCH_TYPES(CvtColor, NVBENCH_TYPE_AXES(CvtColorTypes))
.set_type_axes_names({"InOutDataType"})
.add_string_axis("shape", {"1x1080x1920"})
- .add_int64_axis("varShape", {-1});
+ .add_int64_axis("varShape", {-1, 0});
diff --git a/bench/BenchErase.cpp b/bench/BenchErase.cpp
index 68419ad9d..2bb504d2b 100644
--- a/bench/BenchErase.cpp
+++ b/bench/BenchErase.cpp
@@ -91,5 +91,5 @@ using EraseTypes = nvbench::type_list;
NVBENCH_BENCH_TYPES(Erase, NVBENCH_TYPE_AXES(EraseTypes))
.set_type_axes_names({"InOutDataType"})
.add_string_axis("shape", {"1x1080x1920"})
- .add_int64_axis("varShape", {0})
+ .add_int64_axis("varShape", {-1, 0})
.add_int64_axis("numErase", {3});
diff --git a/bench/BenchFindContours.cpp b/bench/BenchFindContours.cpp
deleted file mode 100644
index 06deb9732..000000000
--- a/bench/BenchFindContours.cpp
+++ /dev/null
@@ -1,126 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "BenchUtils.hpp"
-
-#include <cvcuda/OpFindContours.hpp>
-#include <nvcv/Tensor.hpp>
-#include <nvcv/TensorDataAccess.hpp>
-
-#include <vector>
-
-using CPUImage = std::vector<uint8_t>;
-
-static void generateRectangle(CPUImage &image, nvcv::Size2D boundary, nvcv::Size2D anchor = {0, 0},
- nvcv::Size2D size = {5, 5}, double angle = 0.0, bool fill = true, uint8_t setValue = 1);
-
-static void generateRectangle(CPUImage &image, nvcv::Size2D boundary, nvcv::Size2D anchor, nvcv::Size2D size,
- double angle, bool fill, uint8_t setValue)
-{
- auto rad = angle * (M_PI / 180.0);
- auto cosAngle = std::cos(rad);
- auto sinAngle = std::sin(rad);
-
- auto transformed = anchor;
- for (auto y = 0; y < size.h; ++y)
- {
- for (auto x = 0; x < size.w; ++x)
- {
- transformed.w = anchor.w + (x * cosAngle - y * sinAngle);
- transformed.h = anchor.h + (x * sinAngle + y * cosAngle);
-
- if (fill || y == 0 || y == size.h - 1 || x == 0 || x == size.w - 1)
- {
- if (transformed.w >= 0 && transformed.w < boundary.w && transformed.h >= 0
- && transformed.h < boundary.h)
- {
- image[transformed.h * boundary.w + transformed.w] = setValue;
- }
- }
- }
- }
-}
-
-template<typename T>
-inline void FindContours(nvbench::state &state, nvbench::type_list<T>)
-try
-{
- srand(0U); // Use a fixed random seed
- long3 shape = benchutils::GetShape<3>(state.get_string("shape"));
- long varShape = state.get_int64("varShape");
- int numPoints = static_cast<int>(state.get_int64("numPoints"));
-
- // R/W bandwidth rationale:
- // Read image + connected components (S32)
- // Write points + contours (U32)
- state.add_global_memory_reads(shape.x * shape.y * shape.z * (sizeof(T) + sizeof(int)));
- state.add_global_memory_writes(shape.x * numPoints * sizeof(int) * 2 + shape.x * 4 * sizeof(int));
-
- cvcuda::FindContours op(nvcv::Size2D{(int)shape.z, (int)shape.y}, shape.x);
-
- // clang-format off
-
- nvcv::Tensor points({{shape.x, numPoints, 2}, "NCW"}, nvcv::TYPE_S32);
- nvcv::Tensor counts({{shape.x, 4}, "NW"}, nvcv::TYPE_S32);
-
- if (varShape < 0) // negative var shape means use Tensor
- {
- nvcv::Tensor src({{shape.x, shape.y, shape.z, 1}, "NHWC"}, benchutils::GetDataType<T>());
- auto inData = src.exportData();
- auto inAccess = nvcv::TensorDataAccessStridedImagePlanar::Create(*inData);
-
- //Generate input
- CPUImage srcVec(shape.y * shape.z, 0);
- for (auto i = 0; i < 10; ++i)
- {
- int anchorX = rand() % shape.z;
- int anchorY = rand() % shape.y;
- int sizeX = rand() % (shape.z - anchorX);
- int sizeY = rand() % (shape.y - anchorY);
- generateRectangle(srcVec, {anchorX, anchorY}, {sizeX, sizeY});
- }
-
- for (auto i = 0; i < shape.x; ++i)
- {
- CUDA_CHECK_ERROR(cudaMemcpy2D(inAccess->sampleData(i), inAccess->rowStride(), srcVec.data(), shape.z, shape.z,
- shape.y, cudaMemcpyHostToDevice));
- }
-
- state.exec(nvbench::exec_tag::sync, [&op, &src, &points, &counts](nvbench::launch &launch)
- {
- op(launch.get_stream(), src, points, counts);
- });
- }
- else // zero and positive var shape means use ImageBatchVarShape
- {
- throw std::invalid_argument("ImageBatchVarShape not implemented for this operator");
- }
-}
-catch (const std::exception &err)
-{
- state.skip(err.what());
-}
-
-// clang-format on
-
-using FindContoursTypes = nvbench::type_list<uint8_t>;
-
-NVBENCH_BENCH_TYPES(FindContours, NVBENCH_TYPE_AXES(FindContoursTypes))
- .set_type_axes_names({"InOutDataType"})
- .add_string_axis("shape", {"1x1080x1920"})
- .add_int64_axis("varShape", {-1})
- .add_int64_axis("numPoints", {1024});
diff --git a/bench/BenchFlip.cpp b/bench/BenchFlip.cpp
index 620eac7f6..9c052f62a 100644
--- a/bench/BenchFlip.cpp
+++ b/bench/BenchFlip.cpp
@@ -95,5 +95,5 @@ using FlipTypes = nvbench::type_list;
NVBENCH_BENCH_TYPES(Flip, NVBENCH_TYPE_AXES(FlipTypes))
.set_type_axes_names({"InOutDataType"})
.add_string_axis("shape", {"1x1080x1920"})
- .add_int64_axis("varShape", {-1})
+ .add_int64_axis("varShape", {-1, 0})
.add_string_axis("flipType", {"BOTH"});
diff --git a/bench/BenchGaussian.cpp b/bench/BenchGaussian.cpp
index 8b4fc30d1..a1976581d 100644
--- a/bench/BenchGaussian.cpp
+++ b/bench/BenchGaussian.cpp
@@ -91,6 +91,6 @@ using GaussianTypes = nvbench::type_list;
NVBENCH_BENCH_TYPES(Gaussian, NVBENCH_TYPE_AXES(GaussianTypes))
.set_type_axes_names({"InOutDataType"})
.add_string_axis("shape", {"1x1080x1920"})
- .add_int64_axis("varShape", {-1})
+ .add_int64_axis("varShape", {-1, 0})
.add_float64_axis("sigma", {1.2})
.add_string_axis("border", {"REFLECT"});
diff --git a/bench/BenchGaussianNoise.cpp b/bench/BenchGaussianNoise.cpp
index 68633a90f..09dcd04e4 100644
--- a/bench/BenchGaussianNoise.cpp
+++ b/bench/BenchGaussianNoise.cpp
@@ -84,4 +84,4 @@ using GaussianNoiseTypes = nvbench::type_list;
NVBENCH_BENCH_TYPES(GaussianNoise, NVBENCH_TYPE_AXES(GaussianNoiseTypes))
.set_type_axes_names({"InOutDataType"})
.add_string_axis("shape", {"1x1080x1920"})
- .add_int64_axis("varShape", {-1});
+ .add_int64_axis("varShape", {-1, 0});
diff --git a/bench/BenchHQResize.cpp b/bench/BenchHQResize.cpp
index 9d80963ec..49ff41412 100644
--- a/bench/BenchHQResize.cpp
+++ b/bench/BenchHQResize.cpp
@@ -122,7 +122,7 @@ using HQResizeTypes = nvbench::type_list;
NVBENCH_BENCH_TYPES(HQResize, NVBENCH_TYPE_AXES(HQResizeTypes))
.set_type_axes_names({"InOutDataType"})
- .add_int64_axis("batch", {false})
+ .add_int64_axis("batch", {false, true})
.add_string_axis("shape", {"1x1080x1920"})
.add_string_axis("interpolation", {"CUBIC"})
.add_int64_axis("antialias", {false, true})
diff --git a/bench/BenchHistogramEq.cpp b/bench/BenchHistogramEq.cpp
index 54082d550..74bcb9d46 100644
--- a/bench/BenchHistogramEq.cpp
+++ b/bench/BenchHistogramEq.cpp
@@ -74,4 +74,4 @@ using HistogramEqTypes = nvbench::type_list;
NVBENCH_BENCH_TYPES(HistogramEq, NVBENCH_TYPE_AXES(HistogramEqTypes))
.set_type_axes_names({"InOutDataType"})
.add_string_axis("shape", {"1x1080x1920"})
- .add_int64_axis("varShape", {-1});
+ .add_int64_axis("varShape", {-1, 0});
diff --git a/bench/BenchInpaint.cpp b/bench/BenchInpaint.cpp
index 88a237b31..ed6dbd055 100644
--- a/bench/BenchInpaint.cpp
+++ b/bench/BenchInpaint.cpp
@@ -82,4 +82,4 @@ using InpaintTypes = nvbench::type_list;
NVBENCH_BENCH_TYPES(Inpaint, NVBENCH_TYPE_AXES(InpaintTypes))
.set_type_axes_names({"InOutDataType"})
.add_string_axis("shape", {"1x1080x1920"})
- .add_int64_axis("varShape", {-1});
+ .add_int64_axis("varShape", {-1, 0});
diff --git a/bench/BenchJointBilateralFilter.cpp b/bench/BenchJointBilateralFilter.cpp
index 45c325bd8..2aa748048 100644
--- a/bench/BenchJointBilateralFilter.cpp
+++ b/bench/BenchJointBilateralFilter.cpp
@@ -94,7 +94,7 @@ using JointBilateralFilterTypes = nvbench::type_list;
NVBENCH_BENCH_TYPES(JointBilateralFilter, NVBENCH_TYPE_AXES(JointBilateralFilterTypes))
.set_type_axes_names({"InOutDataType"})
.add_string_axis("shape", {"1x1080x1920"})
- .add_int64_axis("varShape", {-1})
+ .add_int64_axis("varShape", {-1, 0})
.add_int64_axis("diameter", {-1})
.add_float64_axis("sigmaSpace", {1.2})
.add_string_axis("border", {"REFLECT"});
diff --git a/bench/BenchLabel.cpp b/bench/BenchLabel.cpp
index 41005379d..5e1870f50 100644
--- a/bench/BenchLabel.cpp
+++ b/bench/BenchLabel.cpp
@@ -32,15 +32,16 @@ try
std::string runChoice = state.get_string("runChoice");
- // Use [BG][MIN][MAX][ISLAND][COUNT][STAT] in runChoice to run Label with:
- // background; minThreshold; maxThreshold; island removal; count; statistics
+ // Use [BG][MIN][MAX][ISLAND][COUNT][STAT][MASK] in runChoice to run Label with:
+ // background; minThreshold; maxThreshold; island removal; count; statistics; mask
- long3 staShape{srcShape.x, 10000, 6}; // using fixed 10K max. cap. and 2D problem
+ long3 staShape{srcShape.x, 10000, 7}; // using fixed 10K max. cap. and 2D problem
- NVCVConnectivityType conn = NVCV_CONNECTIVITY_4_2D;
- NVCVLabelType alab = NVCV_LABEL_FAST;
+ NVCVConnectivityType conn = NVCV_CONNECTIVITY_4_2D;
+ NVCVLabelType alab = NVCV_LABEL_FAST;
+ NVCVLabelMaskType mType = NVCV_REMOVE_ISLANDS_OUTSIDE_MASK_ONLY;
- nvcv::Tensor bgT, minT, maxT, countT, statsT, mszT;
+ nvcv::Tensor bgT, minT, maxT, countT, statsT, mszT, maskT;
cvcuda::Label op;
@@ -81,16 +82,20 @@ try
{
statsT = nvcv::Tensor({{staShape.x, staShape.y, staShape.z}, "NMA"}, benchutils::GetDataType());
}
+ if (runChoice.find("MASK") != std::string::npos)
+ {
+ maskT = nvcv::Tensor({{srcShape.x, srcShape.y, srcShape.z, 1}, "NHWC"}, nvcv::TYPE_U8);
+ }
nvcv::Tensor src({{srcShape.x, srcShape.y, srcShape.z, 1}, "NHWC"}, benchutils::GetDataType<T>());
nvcv::Tensor dst({{dstShape.x, dstShape.y, dstShape.z, 1}, "NHWC"}, benchutils::GetDataType());
benchutils::FillTensor<T>(src, benchutils::RandomValues<T>());
- state.exec(nvbench::exec_tag::sync,
- [&op, &src, &dst, &bgT, &minT, &maxT, &mszT, &countT, &statsT, &conn, &alab](nvbench::launch &launch)
+ state.exec(nvbench::exec_tag::sync, [&op, &src, &dst, &bgT, &minT, &maxT, &mszT, &countT, &statsT, &maskT, &conn,
+ &alab, &mType](nvbench::launch &launch)
{
- op(launch.get_stream(), src, dst, bgT, minT, maxT, mszT, countT, statsT, conn, alab);
+ op(launch.get_stream(), src, dst, bgT, minT, maxT, mszT, countT, statsT, maskT, conn, alab, mType);
});
}
catch (const std::exception &err)
diff --git a/bench/BenchLaplacian.cpp b/bench/BenchLaplacian.cpp
index e685198ef..7956d8c22 100644
--- a/bench/BenchLaplacian.cpp
+++ b/bench/BenchLaplacian.cpp
@@ -85,7 +85,7 @@ using LaplacianTypes = nvbench::type_list;
NVBENCH_BENCH_TYPES(Laplacian, NVBENCH_TYPE_AXES(LaplacianTypes))
.set_type_axes_names({"InOutDataType"})
.add_string_axis("shape", {"1x1080x1920"})
- .add_int64_axis("varShape", {-1})
+ .add_int64_axis("varShape", {-1, 0})
.add_int64_axis("ksize", {1})
.add_float64_axis("scale", {1.0})
.add_string_axis("border", {"REFLECT101"});
diff --git a/bench/BenchMedianBlur.cpp b/bench/BenchMedianBlur.cpp
index 45b2c1a6e..0520f5f26 100644
--- a/bench/BenchMedianBlur.cpp
+++ b/bench/BenchMedianBlur.cpp
@@ -82,5 +82,5 @@ using MedianBlurTypes = nvbench::type_list;
NVBENCH_BENCH_TYPES(MedianBlur, NVBENCH_TYPE_AXES(MedianBlurTypes))
.set_type_axes_names({"InOutDataType"})
.add_string_axis("shape", {"1x1080x1920"})
- .add_int64_axis("varShape", {-1})
+ .add_int64_axis("varShape", {-1, 0})
.add_string_axis("kernelSize", {"5x5"});
diff --git a/bench/BenchMinMaxLoc.cpp b/bench/BenchMinMaxLoc.cpp
index 582348fde..40e8385bf 100644
--- a/bench/BenchMinMaxLoc.cpp
+++ b/bench/BenchMinMaxLoc.cpp
@@ -88,5 +88,5 @@ using MinMaxLocTypes = nvbench::type_list;
NVBENCH_BENCH_TYPES(MinMaxLoc, NVBENCH_TYPE_AXES(MinMaxLocTypes))
.set_type_axes_names({"InOutDataType"})
.add_string_axis("shape", {"1x1080x1920"})
- .add_int64_axis("varShape", {-1})
+ .add_int64_axis("varShape", {-1, 0})
.add_int64_axis("maxLocations", {100000});
diff --git a/bench/BenchMorphology.cpp b/bench/BenchMorphology.cpp
index d3947e788..f357dbffb 100644
--- a/bench/BenchMorphology.cpp
+++ b/bench/BenchMorphology.cpp
@@ -128,7 +128,7 @@ using MorphologyTypes = nvbench::type_list;
NVBENCH_BENCH_TYPES(Morphology, NVBENCH_TYPE_AXES(MorphologyTypes))
.set_type_axes_names({"InOutDataType"})
.add_string_axis("shape", {"1x1080x1920"})
- .add_int64_axis("varShape", {-1})
+ .add_int64_axis("varShape", {-1, 0})
.add_int64_axis("iteration", {1})
.add_string_axis("kernelSize", {"3x3"})
.add_string_axis("morphType", {"ERODE", "DILATE", "OPEN", "CLOSE"})
diff --git a/bench/BenchNormalize.cpp b/bench/BenchNormalize.cpp
index 64eed3e33..9e7cc09e6 100644
--- a/bench/BenchNormalize.cpp
+++ b/bench/BenchNormalize.cpp
@@ -96,4 +96,4 @@ using NormalizeTypes = nvbench::type_list;
NVBENCH_BENCH_TYPES(Normalize, NVBENCH_TYPE_AXES(NormalizeTypes))
.set_type_axes_names({"InOutDataType"})
.add_string_axis("shape", {"1x1080x1920"})
- .add_int64_axis("varShape", {-1});
+ .add_int64_axis("varShape", {-1, 0});
diff --git a/bench/BenchPillowResize.cpp b/bench/BenchPillowResize.cpp
index 359480e25..1340a9f26 100644
--- a/bench/BenchPillowResize.cpp
+++ b/bench/BenchPillowResize.cpp
@@ -100,6 +100,6 @@ using PillowResizeTypes = nvbench::type_list;
NVBENCH_BENCH_TYPES(PillowResize, NVBENCH_TYPE_AXES(PillowResizeTypes))
.set_type_axes_names({"InOutDataType"})
.add_string_axis("shape", {"1x1080x1920"})
- .add_int64_axis("varShape", {-1})
+ .add_int64_axis("varShape", {-1, 0})
.add_string_axis("resizeType", {"CONTRACT"})
.add_string_axis("interpolation", {"CUBIC"});
diff --git a/bench/BenchRandomResizedCrop.cpp b/bench/BenchRandomResizedCrop.cpp
index b7f58c57f..661a5e42c 100644
--- a/bench/BenchRandomResizedCrop.cpp
+++ b/bench/BenchRandomResizedCrop.cpp
@@ -98,6 +98,6 @@ using RandomResizedCropTypes = nvbench::type_list;
NVBENCH_BENCH_TYPES(RandomResizedCrop, NVBENCH_TYPE_AXES(RandomResizedCropTypes))
.set_type_axes_names({"InOutDataType"})
.add_string_axis("shape", {"1x1080x1920"})
- .add_int64_axis("varShape", {-1})
+ .add_int64_axis("varShape", {-1, 0})
.add_string_axis("resizeType", {"EXPAND"})
.add_string_axis("interpolation", {"LINEAR"});
diff --git a/bench/BenchRemap.cpp b/bench/BenchRemap.cpp
index 7fc20600c..3f3825c82 100644
--- a/bench/BenchRemap.cpp
+++ b/bench/BenchRemap.cpp
@@ -116,5 +116,5 @@ using RemapTypes = nvbench::type_list;
NVBENCH_BENCH_TYPES(Remap, NVBENCH_TYPE_AXES(RemapTypes))
.set_type_axes_names({"InOutDataType"})
.add_string_axis("shape", {"1x1080x1920"})
- .add_int64_axis("varShape", {-1})
+ .add_int64_axis("varShape", {-1, 0})
.add_string_axis("mapType", {"DENSE"});
diff --git a/bench/BenchResize.cpp b/bench/BenchResize.cpp
index 7446a6f80..b8fb517a0 100644
--- a/bench/BenchResize.cpp
+++ b/bench/BenchResize.cpp
@@ -92,6 +92,6 @@ using ResizeTypes = nvbench::type_list;
NVBENCH_BENCH_TYPES(Resize, NVBENCH_TYPE_AXES(ResizeTypes))
.set_type_axes_names({"InOutDataType"})
.add_string_axis("shape", {"1x1080x1920"})
- .add_int64_axis("varShape", {-1})
+ .add_int64_axis("varShape", {-1, 0})
.add_string_axis("resizeType", {"EXPAND"})
.add_string_axis("interpolation", {"LINEAR"});
diff --git a/bench/BenchRotate.cpp b/bench/BenchRotate.cpp
index 4f4af05c7..bfd58527b 100644
--- a/bench/BenchRotate.cpp
+++ b/bench/BenchRotate.cpp
@@ -87,5 +87,5 @@ using RotateTypes = nvbench::type_list;
NVBENCH_BENCH_TYPES(Rotate, NVBENCH_TYPE_AXES(RotateTypes))
.set_type_axes_names({"InOutDataType"})
.add_string_axis("shape", {"1x1080x1920"})
- .add_int64_axis("varShape", {-1})
+ .add_int64_axis("varShape", {-1, 0})
.add_string_axis("interpolation", {"CUBIC"});
diff --git a/bench/BenchThreshold.cpp b/bench/BenchThreshold.cpp
index 648a83ac7..1c87a7995 100644
--- a/bench/BenchThreshold.cpp
+++ b/bench/BenchThreshold.cpp
@@ -82,4 +82,4 @@ using ThresholdTypes = nvbench::type_list;
NVBENCH_BENCH_TYPES(Threshold, NVBENCH_TYPE_AXES(ThresholdTypes))
.set_type_axes_names({"InOutDataType"})
.add_string_axis("shape", {"1x1080x1920"})
- .add_int64_axis("varShape", {-1});
+ .add_int64_axis("varShape", {-1, 0});
diff --git a/bench/BenchWarpAffine.cpp b/bench/BenchWarpAffine.cpp
index 459c3b32d..a028e28b9 100644
--- a/bench/BenchWarpAffine.cpp
+++ b/bench/BenchWarpAffine.cpp
@@ -89,7 +89,7 @@ using WarpAffineTypes = nvbench::type_list;
NVBENCH_BENCH_TYPES(WarpAffine, NVBENCH_TYPE_AXES(WarpAffineTypes))
.set_type_axes_names({"InOutDataType"})
.add_string_axis("shape", {"1x1080x1920"})
- .add_int64_axis("varShape", {-1})
+ .add_int64_axis("varShape", {-1, 0})
.add_string_axis("border", {"REFLECT"})
.add_string_axis("interpolation", {"CUBIC"})
.add_string_axis("inverseMap", {"Y"});
diff --git a/bench/BenchWarpPerspective.cpp b/bench/BenchWarpPerspective.cpp
index 874986129..f18108e87 100644
--- a/bench/BenchWarpPerspective.cpp
+++ b/bench/BenchWarpPerspective.cpp
@@ -89,7 +89,7 @@ using WarpPerspectiveTypes = nvbench::type_list;
NVBENCH_BENCH_TYPES(WarpPerspective, NVBENCH_TYPE_AXES(WarpPerspectiveTypes))
.set_type_axes_names({"InOutDataType"})
.add_string_axis("shape", {"1x1080x1920"})
- .add_int64_axis("varShape", {-1})
+ .add_int64_axis("varShape", {-1, 0})
.add_string_axis("border", {"REFLECT"})
.add_string_axis("interpolation", {"CUBIC"})
.add_string_axis("inverseMap", {"Y"});
diff --git a/bench/CMakeLists.txt b/bench/CMakeLists.txt
index e82bf3da4..3ca000274 100644
--- a/bench/CMakeLists.txt
+++ b/bench/CMakeLists.txt
@@ -53,7 +53,6 @@ set(bench_sources
BenchCropFlipNormalizeReformat.cpp
BenchCustomCrop.cpp
BenchErase.cpp
- BenchFindContours.cpp
BenchGammaContrast.cpp
BenchGaussianNoise.cpp
BenchHistogramEq.cpp
diff --git a/bench/python/all_ops/op_copymakeborder.py b/bench/python/all_ops/op_copymakeborder.py
index c0bca25b6..2f57475d0 100644
--- a/bench/python/all_ops/op_copymakeborder.py
+++ b/bench/python/all_ops/op_copymakeborder.py
@@ -24,7 +24,7 @@
class OpCopyMakeBorder(AbstractOpBase):
def setup(self, input):
self.border_mode = cvcuda.Border.CONSTANT
- self.border_values = [255, 0, 0] # Border values for 3 channel input.
+ self.border_values = [255, 0, 0] # Border values for 3 channel RGB input.
self.top = 30
self.left = 40
self.bottom = 50
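For context, a hedged sketch of how these settings are typically consumed by the operator; the keyword names and the `right` value below follow the pattern of cvcuda's Python bindings but are assumptions, not verified against this benchmark's `run` method:

```python
# Hypothetical usage of the copy-make-border settings above.
import cvcuda
import torch

img = torch.zeros((1, 720, 1280, 3), dtype=torch.uint8, device="cuda")
src = cvcuda.as_tensor(img, "NHWC")

out = cvcuda.copymakeborder(
    src,
    border_mode=cvcuda.Border.CONSTANT,
    border_value=[255, 0, 0],  # red border for a 3-channel RGB input
    top=30, left=40, bottom=50, right=60,  # keyword names/values are assumptions
)
```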
diff --git a/bench/python/all_ops/op_findcontours.py b/bench/python/all_ops/op_findcontours.py
deleted file mode 100644
index 7fe31cab0..000000000
--- a/bench/python/all_ops/op_findcontours.py
+++ /dev/null
@@ -1,109 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# NOTE: One must import PyCuda driver first, before CVCUDA or VPF otherwise
-# things may throw unexpected errors.
-import pycuda.driver as cuda # noqa: F401
-
-from bench_utils import AbstractOpBase
-import cvcuda
-import torch
-from torchvision.io import read_image
-import matplotlib.pyplot as plt
-import numpy as np
-import os
-import logging
-
-logger = logging.getLogger(__name__)
-
-
-class OpFindContours(AbstractOpBase):
- def setup(self, input):
- grayscale_input = read_image(
- os.path.join(self.assets_dir, "countour_lines.jpg")
- )
- grayscale_input = grayscale_input.moveaxis(
- 0, -1
- ).contiguous() # From CHW to HWC
- # Binarize the grayscale_input
- grayscale_input[grayscale_input <= 50] = 0
- grayscale_input[grayscale_input > 50] = 255
-
- grayscale_input = [grayscale_input.clone() for _ in range(input.shape[0])]
- grayscale_input = torch.stack(grayscale_input)
- grayscale_input = grayscale_input.cuda(self.device_id)
- self.grayscale_input = cvcuda.as_tensor(grayscale_input, "NHWC")
-
- def run(self, input):
- return cvcuda.find_contours(self.grayscale_input)
-
- def visualize(self):
- """
- Attempts to visualize the output produced by the operator as an image by writing it
- down to the disk. May raise exceptions if visualization is not successful.
- """
- output_dir = self._setup_clear_output_dir(filename_ends_with="_op_out.jpg")
- # Convert the inputs and outputs to numpy arrays first.
- # input shape: NHWC
- # out[0] = points_info shape: NxMx2 (M == max points, 2 for x and y coordinates)
- # out[1] = contours_info shape: NxC where
- # (C == max contours, number of non-zero elements are number of contours)
- input_npy = (
- torch.as_tensor(
- self.grayscale_input.cuda(), device="cuda:%d" % self.device_id
- )
- .cpu()
- .numpy()
- )
- points_npy = (
- torch.as_tensor(self.op_output[0].cuda(), device="cuda:%d" % self.device_id)
- .cpu()
- .numpy()
- )
- num_contours_npy = (
- torch.as_tensor(self.op_output[1].cuda(), device="cuda:%d" % self.device_id)
- .cpu()
- .numpy()
- )
-
- # Loop over all the images...
- for i, img in enumerate(input_npy):
-
- # Grab the information on the points and the contours of this image.
- points_info = points_npy[i]
- contours_info = num_contours_npy[i]
-
- # Keep only the non-zero entries from contours_info
- contours_info = contours_info[np.nonzero(contours_info)]
- # Use the num_points in contours_info to split the points_info
- # Since the values in num_points are not start-stop indices of the points
- # we need to use cumsum to fix it and use it inside the split function
- valid_points = np.split(points_info, contours_info.cumsum())
- # Last element in valid_points is the remainder of the points so need to drop it.
- all_contours = valid_points[:-1] # This list stores OpenCV style contours.
-
- plt.figure(figsize=(img.shape[1] / 100.0, img.shape[0] / 100.0))
- plt.gca().invert_yaxis()
-
- plt.plot(0, 0, color="white")
- plt.plot(img.shape[1], img.shape[0], color="white")
- for contour in all_contours:
- x, y = contour[:, 0], contour[:, 1]
- plt.plot(x, y, color="green", linewidth=2)
-
- # Save using PIL
- out_file_name = "img_%d_op_out.jpg" % i
- plt.savefig(os.path.join(output_dir, out_file_name))
- plt.close()
diff --git a/bench/python/all_ops/op_flip.py b/bench/python/all_ops/op_flip.py
index 962a12856..d93a1c148 100644
--- a/bench/python/all_ops/op_flip.py
+++ b/bench/python/all_ops/op_flip.py
@@ -21,9 +21,25 @@
import cvcuda
-class OpFlip(AbstractOpBase):
+class OpFlipX(AbstractOpBase):
def setup(self, input):
- self.flip_code = -1 # means flipping around both axes.
+ self.flip_code = 0 # means flipping around x axis.
+
+ def run(self, input):
+ return cvcuda.flip(input, flipCode=self.flip_code)
+
+
+class OpFlipY(AbstractOpBase):
+ def setup(self, input):
+ self.flip_code = 1 # means flipping around y axis.
+
+ def run(self, input):
+ return cvcuda.flip(input, flipCode=self.flip_code)
+
+
+class OpFlipXY(AbstractOpBase):
+ def setup(self, input):
+ self.flip_code = -1 # means flipping around x and y axis.
def run(self, input):
return cvcuda.flip(input, flipCode=self.flip_code)
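The three classes map directly onto the `flipCode` semantics of `cvcuda.flip`. A minimal standalone sketch (the torch interop mirrors how the benchmarks build their inputs; the shape is illustrative):

```python
# The three flip codes benchmarked above: 0 = x axis, 1 = y axis, -1 = both.
import cvcuda
import torch

img = torch.randint(0, 256, (1, 1080, 1920, 3), dtype=torch.uint8, device="cuda")
src = cvcuda.as_tensor(img, "NHWC")

flipped_x = cvcuda.flip(src, flipCode=0)    # around the x axis
flipped_y = cvcuda.flip(src, flipCode=1)    # around the y axis
flipped_xy = cvcuda.flip(src, flipCode=-1)  # around both axes
```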
diff --git a/bench/python/assets/brooklyn_bboxes.pt b/bench/python/assets/brooklyn_bboxes.pt
index 3261e4720..69bc4260c 100644
Binary files a/bench/python/assets/brooklyn_bboxes.pt and b/bench/python/assets/brooklyn_bboxes.pt differ
diff --git a/bench/python/assets/brooklyn_nms_masks.pt b/bench/python/assets/brooklyn_nms_masks.pt
index 2e13cef5e..0b97d7a0d 100644
Binary files a/bench/python/assets/brooklyn_nms_masks.pt and b/bench/python/assets/brooklyn_nms_masks.pt differ
diff --git a/bench/python/assets/brooklyn_scores.pt b/bench/python/assets/brooklyn_scores.pt
index 013cebc71..fdb4a3b29 100644
Binary files a/bench/python/assets/brooklyn_scores.pt and b/bench/python/assets/brooklyn_scores.pt differ
diff --git a/bench/python/run_bench.py b/bench/python/run_bench.py
index ae2c69b08..8c00b43c1 100644
--- a/bench/python/run_bench.py
+++ b/bench/python/run_bench.py
@@ -68,22 +68,25 @@ def run_bench(
logger = logging.getLogger("run_bench")
logger.info("Benchmarking started.")
+ # Set up various CUDA stuff.
+ cuda_device = cuda.Device(device_id)
+ cuda_ctx = cuda_device.retain_primary_context()
+ cuda_ctx.push()
+ # Use the default stream for cvcuda and torch
+ # Since we never created a stream, current will be the CUDA default stream
+ cvcuda_stream = cvcuda.Stream().current
+ torch_stream = torch.cuda.default_stream(device=cuda_device)
+
# Create an image batch decoder to supply us the input test data.
decoder = ImageBatchDecoder(
input_path,
batch_size,
device_id,
- cuda_ctx=None,
+ cuda_ctx,
+ cvcuda_stream,
cvcuda_perf=cvcuda_perf,
)
- # Set up various CUDA stuff.
- cuda_device = cuda.Device(device_id)
- cuda_ctx = cuda_device.retain_primary_context()
- cuda_ctx.push()
- cvcuda_stream = cvcuda.Stream()
- torch_stream = torch.cuda.ExternalStream(cvcuda_stream.handle)
-
# Get a list of (class names, class types) of all the ops that can be profiled.
ops_info_list = get_benchmark_eligible_ops_info()
logger.info("Found a total of %d operators for benchmarking." % len(ops_info_list))
diff --git a/ci/check_formatting.sh b/ci/check_formatting.sh
new file mode 100755
index 000000000..b91d518cb
--- /dev/null
+++ b/ci/check_formatting.sh
@@ -0,0 +1,42 @@
+#!/bin/bash -e
+
+# SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if [ $# = 0 ]; then
+ # No arguments? Lint all code.
+ echo "Linting all code in the repository =========================="
+ pre-commit run -a
+else
+ from=$1
+ if [ $# = 1 ]; then
+ to=HEAD
+ elif [ $# = 2 ]; then
+ to=$2
+ else
+ echo "Invalid arguments"
+ echo "Usage: $(basename "$0") [ref_from [ref_to]]"
+ exit 1
+ fi
+
+ echo "Linting files touched from commit $from to $to =============="
+ echo "Files to be linted:"
+ git diff --stat $from..$to
+ if ! pre-commit run --from-ref $from --to-ref $to ; then
+ echo "Formatting errors:"
+ git diff
+ false
+ fi
+fi
diff --git a/cmake/ConfigCUDA.cmake b/cmake/ConfigCUDA.cmake
index 24bc2453c..88a2707c5 100644
--- a/cmake/ConfigCUDA.cmake
+++ b/cmake/ConfigCUDA.cmake
@@ -38,9 +38,14 @@ if(NOT USE_CMAKE_CUDA_ARCHITECTURES)
if(ENABLE_TEGRA)
list(APPEND CMAKE_CUDA_ARCHITECTURES
72-real # Volta - gv11b/Tegra (Jetson AGX Xavier)
- 86-real # Ampere - Jetson IGX Orin
+ 86-real # Jetson IGX Orin with optional Ampere RTX A6000
87-real # Ampere - ga10b,ga10c/Tegra (Jetson AGX Orin)
)
+ if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "11.8")
+ list(APPEND CMAKE_CUDA_ARCHITECTURES
+ 89-real # Jetson IGX Orin with optional RTX 6000 Ada
+ )
+ endif()
else()
# All architectures we build sass for
list(APPEND CMAKE_CUDA_ARCHITECTURES
diff --git a/docker/config b/docker/config
index aa84ebf0d..56df16cb9 100644
--- a/docker/config
+++ b/docker/config
@@ -27,5 +27,5 @@ TAG_IMAGE_SAMPLES=6.1
TAG_IMAGE_TEST=5
VER_CUDA=11.7.1
-VER_UBUNTU=22.04
+VER_UBUNTU=20.04
VER_TRT=24.01
diff --git a/docs/sphinx/conf.py b/docs/sphinx/conf.py
index a32ec33c5..f7fac8e63 100644
--- a/docs/sphinx/conf.py
+++ b/docs/sphinx/conf.py
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -33,7 +33,7 @@
import sys
project = "CV-CUDA"
-copyright = "2022-2023, NVIDIA."
+copyright = "2022-2024, NVIDIA."
author = "NVIDIA"
version = "Beta"
release = version
diff --git a/docs/sphinx/content/cvcuda_oplist.csv b/docs/sphinx/content/cvcuda_oplist.csv
index bc4aecd54..85e45f080 100644
--- a/docs/sphinx/content/cvcuda_oplist.csv
+++ b/docs/sphinx/content/cvcuda_oplist.csv
@@ -16,15 +16,13 @@ CustomCrop,Crops an image with a given region-of-interest
CvtColor,Converts an image from one color space to another
DataTypeConvert,Converts an image’s data type with optional scaling
Erase,Erases image regions
-Find Contours,Extract closed contours from an input binary image
-FindHomography,Calculates a perspective transform from four pairs of the corresponding points
Flip,Flips a 2D image around its axis
GammaContrast,Adjusts image contrast
Gaussian,Applies a gaussian blur filter to the image
Gaussian Noise,Generates a statistical noise with a normal (Gaussian) distribution
Histogram,Provides a grayscale value distribution showing the frequency of occurrence of each gray value.
Histogram Equalizer,Allows effective spreading out the intensity range of the image typically used to improve contrast
-HqResize,Performs advanced resizing supporting 2D and 3D data, tensors, tensor batches, and varshape image batches (2D only). Supports nearest neighbor, linear, cubic, Gaussian and Lanczos interpolation, with optional antialiasing when down-sampling.
+HqResize, "Performs advanced resizing supporting 2D and 3D data, tensors, tensor batches, and varshape image batches (2D only). Supports nearest neighbor, linear, cubic, Gaussian and Lanczos interpolation, with optional antialiasing when down-sampling."
Inpainting,Performs inpainting by replacing a pixel by normalized weighted sum of all the known pixels in the neighborhood
Joint Bilateral Filter,Reduces image noise while preserving strong edges based on a guidance image
Label,Labels connected regions in an image using 4-way connectivity for foreground and 8-way for background pixels
@@ -35,11 +33,11 @@ MinMaxLoc,Finds the maximum and minimum values in a given array
Morphology,Performs morphological erode and dilate transformations
Morphology (close), Performs morphological operation that involves dilation followed by erosion on an image
Morphology (open), Performs morphological operation that involves erosion followed by dilation on an image
-Non-max Suppression,Enables selecting a single entity out of many overlapping ones typically used for selecting from multiple bounding boxes during object detection
+Non-Maximum Suppression,Enables selecting a single entity out of many overlapping ones typically used for selecting from multiple bounding boxes during object detection
Normalize,Normalizes an image pixel’s range
OSD (Polyline Line Text Rotated Rect Segmented Mask),Displays an overlay on the image of different forms including polyline line text rotated rectangle segmented mask
PadStack,Stacks several images into a tensor with border extension
-PairwiseMatcher,Matches features computed separately (e.g. via the SIFT operator) in two images, e.g. using the brute force method
+PairwiseMatcher,"Matches features computed separately (e.g. via the SIFT operator) in two images, e.g. using the brute force method"
PillowResize,Changes the size and scale of an image using python-pillow algorithm
RandomResizedCrop,Crops a random portion of an image and resizes it to a specified size.
Reformat,Converts a planar image into non-planar and vice versa
@@ -47,7 +45,6 @@ Remap,Maps pixels in an image with one projection to another projection in a new
Resize,Changes the size and scale of an image
Rotate,Rotates a 2D array in multiples of 90 degrees
SIFT,Identifies and matches features in images that are invariant to scale rotation and affine distortion.
-Stack,Concatenates two input tensors into a single output tensor
Thresholding,Chooses a global threshold value that is the same for all pixels across the image.
WarpAffine,Applies an affine transformation to an image
WarpPerspective,Applies a perspective transformation to an image
diff --git a/docs/sphinx/index.rst b/docs/sphinx/index.rst
index 890d44262..254a0bf63 100644
--- a/docs/sphinx/index.rst
+++ b/docs/sphinx/index.rst
@@ -1,5 +1,5 @@
..
- # SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ # SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -38,10 +38,9 @@ CV-CUDA includes:
CV-CUDA Pre- and Post-Processing Operators
------------------
-CV-CUDA offers more than 20 Computer Vision and Image Processing operators. Find the operator that is right for your workflow below.
+CV-CUDA offers a comprehensive collection of Computer Vision and Image Processing operators, listed below.
-
-.. csv-table::
+.. csv-table:: List of operators
:file: content/cvcuda_oplist.csv
:widths: 30, 70
:header-rows: 1
@@ -50,12 +49,13 @@ CV-CUDA offers more than 20 Computer Vision and Image Processing operators. Find
Where Are the Release Notes?
------------------
-An awesome product requires excellent support. CV-CUDA release notes can be found `here `_.
+CV-CUDA release notes can be found `here `_.
Where Can I Get Help?
------------------
+An awesome product requires excellent support.
File requests for enhancements and bug reports `here `_.
@@ -97,7 +97,7 @@ NVIDIA, the NVIDIA logo, NVIDIA CV-CUDA, and NVIDIA TensorRT are trademarks and/
Copyright
--------------------
-© 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+© 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
@@ -124,6 +124,7 @@ Copyright
:maxdepth: 1
:hidden:
+ Beta.5
Beta.4
Beta.3
Beta.2
diff --git a/docs/sphinx/installation.rst b/docs/sphinx/installation.rst
index 5e213d536..6c05a33d8 100644
--- a/docs/sphinx/installation.rst
+++ b/docs/sphinx/installation.rst
@@ -24,14 +24,15 @@ Pre-requisites
This section describes the recommended dependencies to install CV-CUDA.
-* Ubuntu >= 20.04
-* CUDA driver >= 11.7
+* Ubuntu >= 20.04 (22.04 recommended for building the documentation)
+* CUDA >= 11.7 (CUDA 12 required for samples)
+* NVIDIA driver r525 or later (r535 required for samples)
Setup
-----
The following steps describe how to install CV-CUDA. Choose the installation method that meets your environment needs.
-You can download the CV-CUDA tar, deb or wheel packages from `here `_
+You can download the CV-CUDA tar, deb or wheel packages from `the asset section `_
* Tar File Installation
@@ -73,11 +74,11 @@ You can download the CV-CUDA tar, deb or wheel packages from `here `_
+ Download the appropriate .whl file for your computer architecture, Python and CUDA version from `here `_
Execute the following command to install appropriate CV-CUDA Python wheel ::
- pip install cvcuda_-0.6.0b0-cp-cp-linux_.whl
+ pip install cvcuda_-0.7.0b0-cp-cp-linux_.whl
where is the desired CUDA version, the desired Python version and the desired architecture.
diff --git a/docs/sphinx/relnotes/v0.7.0-beta.rst b/docs/sphinx/relnotes/v0.7.0-beta.rst
new file mode 100644
index 000000000..5ad3ae437
--- /dev/null
+++ b/docs/sphinx/relnotes/v0.7.0-beta.rst
@@ -0,0 +1,69 @@
+..
+ # SPDX-FileCopyrightText: Copyright (c) 2023-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+.. _v0.7.0-beta:
+
+Beta.5
+======
+
+CV-CUDA 0.7.0 introduces performance and support enhancements, along with bug fixes and new features.
+
+Release Highlights
+------------------
+
+CV-CUDA v0.7.0 includes the following improvements:
+
+* **New Features**:
+
+ * Optimized Python bindings: near-zero overhead compared to C++ calls
+
+ * Added masking option to Label operator: conditional island removal
+
+ * Added IGX Orin support (with dGPU, Ampere or Ada RTX6000)
+
+ * Added support for signed 32-bit output data type in the Label operator
+
+* **Removed Operator**:
+
+ * Removed the Find Contours operator while its major limitations are being addressed
+
+* **Bug Fixes**:
+
+ * Fixed constraint on installation directory for Python tests: tar test packages can now be used from any directory
+
+
+Compatibility and Known Limitations
+-----------------------------------
+
+See main README on `CV-CUDA GitHub `_.
+
+License
+-------
+
+CV-CUDA is licensed under the `Apache 2.0 `_ license.
+
+Resources
+---------
+
+1. `CV-CUDA GitHub `_
+2. `CV-CUDA Increasing Throughput and Reducing Costs for AI-Based Computer Vision with CV-CUDA `_
+3. `NVIDIA Announces Microsoft, Tencent, Baidu Adopting CV-CUDA for Computer Vision AI `_
+4. `CV-CUDA helps Tencent Cloud audio and video PaaS platform achieve full-process GPU acceleration for video enhancement AI `_
+
+Acknowledgements
+----------------
+
+CV-CUDA is developed jointly by NVIDIA and the ByteDance Machine Learning team.
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index 7647d0491..65f61d879 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -13,7 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-cmake_minimum_required(VERSION 3.18)
+cmake_minimum_required(VERSION 3.20.1)
project(cvcuda_python CXX C)
diff --git a/python/mod_cvcuda/CMakeLists.txt b/python/mod_cvcuda/CMakeLists.txt
index 45ecc94e0..66b53d87f 100644
--- a/python/mod_cvcuda/CMakeLists.txt
+++ b/python/mod_cvcuda/CMakeLists.txt
@@ -29,7 +29,6 @@ nvcv_python_add_module(
OpLabel.cpp
LabelType.cpp
ConnectivityType.cpp
- OpFindContours.cpp
OpHistogramEq.cpp
OpOSD.cpp
OpAdvCvtColor.cpp
diff --git a/python/mod_cvcuda/Main.cpp b/python/mod_cvcuda/Main.cpp
index 130d01680..aff67174b 100644
--- a/python/mod_cvcuda/Main.cpp
+++ b/python/mod_cvcuda/Main.cpp
@@ -94,7 +94,6 @@ PYBIND11_MODULE(cvcuda, m)
// CV-CUDA Operators
ExportOpPairwiseMatcher(m);
ExportOpLabel(m);
- ExportOpFindContours(m);
ExportOpOSD(m);
ExportOpHistogramEq(m);
ExportOpAdvCvtColor(m);
diff --git a/python/mod_cvcuda/OpFindContours.cpp b/python/mod_cvcuda/OpFindContours.cpp
deleted file mode 100644
index 137bf645f..000000000
--- a/python/mod_cvcuda/OpFindContours.cpp
+++ /dev/null
@@ -1,134 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "Operators.hpp"
-
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-
-namespace cvcudapy {
-
-namespace {
-
-using TupleTensor2 = std::tuple;
-
-TupleTensor2 FindContoursInto(Tensor &points, Tensor &numPoints, Tensor &input, std::optional pstream)
-{
- if (!pstream)
- {
- pstream = Stream::Current();
- }
-
- nvcv::Size2D size{static_cast(input.shape()[2]), static_cast(input.shape()[1])};
- auto findContours = CreateOperator(size, static_cast(input.shape()[0]));
-
- ResourceGuard guard(*pstream);
- guard.add(LockMode::LOCK_MODE_READ, {input});
- guard.add(LockMode::LOCK_MODE_WRITE, {points});
- guard.add(LockMode::LOCK_MODE_WRITE, {numPoints});
- guard.add(LockMode::LOCK_MODE_READWRITE, {*findContours});
-
- findContours->submit(pstream->cudaHandle(), input, points, numPoints);
-
- return TupleTensor2(std::move(points), std::move(numPoints));
-}
-
-TupleTensor2 FindContours(Tensor &input, std::optional pstream)
-{
- auto pointShape = nvcv::TensorShape{
- {input.shape()[0], cvcuda::FindContours::MAX_TOTAL_POINTS, 2},
- nvcv::TENSOR_NHW
- };
- Tensor points = Tensor::Create(pointShape, nvcv::TYPE_S32);
-
- auto countShape = nvcv::TensorShape{
- {input.shape()[0], cvcuda::FindContours::MAX_NUM_CONTOURS},
- nvcv::TENSOR_NW
- };
- Tensor numPoints = Tensor::Create(countShape, nvcv::TYPE_S32);
-
- return FindContoursInto(points, numPoints, input, pstream);
-}
-
-} // namespace
-
-void ExportOpFindContours(py::module &m)
-{
- using namespace pybind11::literals;
- py::options options;
- options.disable_function_signatures();
-
- m.def("find_contours", &FindContours, "image"_a, "stream"_a = nullptr, R"pbdoc(
-
- cvcuda.find_contours(src : nvcv.Tensor, stream: Optional[nvcv.cuda.Stream] = None) -> nvcv.Tensor
- Executes the FindContours operation on the given cuda stream.
-
- See also:
- Refer to the CV-CUDA C API reference for the FindContours operator
- for more details and usage examples.
-
- Args:
- src (Tensor): Input tensor containing one or more images.
- stream (Stream, optional): CUDA Stream on which to perform the operation.
-
- Returns:
- Tuple[Tensor, Tensor]: A tuple of two tensors. The first is the contour points tensor with dimensions NxMx2 -
- where N is the batch size, M is the maximum number of points allowed. Each point of the contour is specified
- in (x, y) coordinates. The second tensor specifies the number of valid contours per image and the number of
- valid points in those contours. It has dimensions NxC where N is the batch size and C is the maximum number
- of contours found. The actual number of contours can be calculated by counting the number of non-zero elements
- in the C dimension and the actual number of points in each of those contours are the values stored in the C dimension.
-
- Caution:
- Restrictions to several arguments may apply. Check the C
- API references of the CV-CUDA operator.
- )pbdoc");
-
- m.def("find_contours_into", &FindContoursInto, "points"_a, "num_points"_a, "src"_a, "stream"_a = nullptr, R"pbdoc(
-
- cvcuda.find_contours_into(points : nvcv.Tensor, num_points : nvcv.Tensor, src : Tensor, stream: Optional[nvcv.cuda.Stream] = None)
- Executes the FindContours operation on the given cuda stream.
-
- See also:
- Refer to the CV-CUDA C API reference for the FindContours operator
- for more details and usage examples.
-
- Args:
- points (Tensor): Output tensor to store the coordinates of each contour point.
- num_points (Tensor): Output tensor to store the number of points in a contour.
- src (Tensor): Input tensor containing one or more images.
- stream (Stream, optional): CUDA Stream on which to perform the operation.
-
- Returns:
- None
-
- Caution:
- Restrictions to several arguments may apply. Check the C
- API references of the CV-CUDA operator.
- )pbdoc");
-}
-
-} // namespace cvcudapy
diff --git a/python/mod_cvcuda/OpLabel.cpp b/python/mod_cvcuda/OpLabel.cpp
index 1d45618d8..c93158acd 100644
--- a/python/mod_cvcuda/OpLabel.cpp
+++ b/python/mod_cvcuda/OpLabel.cpp
@@ -33,9 +33,9 @@ using TupleTensor3 = std::tuple, std::optional count, std::optional stats, Tensor &input,
- NVCVConnectivityType connectivity, NVCVLabelType assignLabels, std::optional bgLabel,
- std::optional minThresh, std::optional maxThresh, std::optional minSize,
- std::optional pstream)
+ NVCVConnectivityType connectivity, NVCVLabelType assignLabels, NVCVLabelMaskType maskType,
+ std::optional bgLabel, std::optional minThresh, std::optional maxThresh,
+ std::optional minSize, std::optional mask, std::optional pstream)
{
if (!pstream)
{
@@ -73,20 +73,26 @@ TupleTensor3 LabelInto(Tensor &output, std::optional count, std::optiona
{
guard.add(LockMode::LOCK_MODE_READ, {*minSize});
}
+ if (mask)
+ {
+ guard.add(LockMode::LOCK_MODE_READ, {*mask});
+ }
op->submit(pstream->cudaHandle(), input, output, (bgLabel ? *bgLabel : nvcv::Tensor{nullptr}),
(minThresh ? *minThresh : nvcv::Tensor{nullptr}), (maxThresh ? *maxThresh : nvcv::Tensor{nullptr}),
(minSize ? *minSize : nvcv::Tensor{nullptr}), (count ? *count : nvcv::Tensor{nullptr}),
- (stats ? *stats : nvcv::Tensor{nullptr}), connectivity, assignLabels);
+ (stats ? *stats : nvcv::Tensor{nullptr}), (mask ? *mask : nvcv::Tensor{nullptr}), connectivity,
+ assignLabels, maskType);
return TupleTensor3(std::move(output), count, stats);
}
-TupleTensor3 Label(Tensor &input, NVCVConnectivityType connectivity, NVCVLabelType assignLabels, bool count, bool stats,
- int maxLabels, std::optional bgLabel, std::optional minThresh,
- std::optional maxThresh, std::optional minSize, std::optional pstream)
+TupleTensor3 Label(Tensor &input, NVCVConnectivityType connectivity, NVCVLabelType assignLabels,
+ NVCVLabelMaskType maskType, bool count, bool stats, int maxLabels, std::optional bgLabel,
+ std::optional minThresh, std::optional maxThresh, std::optional minSize,
+ std::optional mask, std::optional pstream)
{
- constexpr nvcv::DataType outType = nvcv::TYPE_U32;
+ constexpr nvcv::DataType outType = nvcv::TYPE_S32;
auto inputData = input.exportData();
if (!inputData)
@@ -112,11 +118,11 @@ TupleTensor3 Label(Tensor &input, NVCVConnectivityType connectivity, NVCVLabelTy
int numStats = 1;
if (connectivity == NVCV_CONNECTIVITY_4_2D || connectivity == NVCV_CONNECTIVITY_8_2D)
{
- numStats = 6;
+ numStats = 7;
}
if (connectivity == NVCV_CONNECTIVITY_6_3D || connectivity == NVCV_CONNECTIVITY_26_3D)
{
- numStats = 8;
+ numStats = 9;
}
statsTensor = Tensor::Create(
@@ -127,8 +133,8 @@ TupleTensor3 Label(Tensor &input, NVCVConnectivityType connectivity, NVCVLabelTy
outType);
}
- return LabelInto(output, countTensor, statsTensor, input, connectivity, assignLabels, bgLabel, minThresh, maxThresh,
- minSize, pstream);
+ return LabelInto(output, countTensor, statsTensor, input, connectivity, assignLabels, maskType, bgLabel, minThresh,
+ maxThresh, minSize, mask, pstream);
}
} // namespace
@@ -137,9 +143,14 @@ void ExportOpLabel(py::module &m)
{
using namespace pybind11::literals;
+ py::enum_(m, "LabelMaskType", py::arithmetic())
+ .value("REMOVE_ISLANDS_OUTSIDE_MASK_ONLY", NVCV_REMOVE_ISLANDS_OUTSIDE_MASK_ONLY)
+ .export_values();
+
m.def("label", &Label, "src"_a, "connectivity"_a = NVCV_CONNECTIVITY_4_2D, "assign_labels"_a = NVCV_LABEL_FAST,
- py::kw_only(), "count"_a = false, "stats"_a = false, "max_labels"_a = 10000, "bg_label"_a = nullptr,
- "min_thresh"_a = nullptr, "max_thresh"_a = nullptr, "min_size"_a = nullptr, "stream"_a = nullptr, R"pbdoc(
+ "mask_type"_a = NVCV_REMOVE_ISLANDS_OUTSIDE_MASK_ONLY, py::kw_only(), "count"_a = false, "stats"_a = false,
+ "max_labels"_a = 10000, "bg_label"_a = nullptr, "min_thresh"_a = nullptr, "max_thresh"_a = nullptr,
+ "min_size"_a = nullptr, "mask"_a = nullptr, "stream"_a = nullptr, R"pbdoc(
Executes the Label operation on the given cuda stream.
@@ -152,6 +163,8 @@ void ExportOpLabel(py::module &m)
default is cvcuda.CONNECTIVITY_4_2D.
assign_labels (cvcuda.LABEL, optional): Choice on how labels are assigned,
default is cvcuda.LABEL.FAST.
+ mask_type (cvcuda.LabelMaskType, optional): Choice on how the mask is used,
+ default is cvcuda.REMOVE_ISLANDS_OUTSIDE_MASK_ONLY.
count (bool, optional): Use True to return the count of valid labeled regions.
stats (bool, optional): Use True to return the statistics of valid labeled regions.
max_labels (Number, optional): Maximum number of labels to compute statistics for, default is 10000.
@@ -161,6 +174,10 @@ void ExportOpLabel(py::module &m)
max_thresh (Tensor, optional): Maximum threshold tensor to mask input values above it to be 0, and others 1.
min_size (Tensor, optional): Minimum size tensor to remove islands, i.e. labeled regions with number of
elements less than the minimum size.
+ mask (Tensor, optional): Mask tensor, its behavior is controlled by \ref mask_type. One choice is to
+ control island removal in addition to \ref min_size, i.e. regions with at
+ least one element inside the mask (non-zero values) are not removed in case
+ mask_type is cvcuda.REMOVE_ISLANDS_OUTSIDE_MASK_ONLY.
stream (Stream, optional): CUDA Stream on which to perform the operation.
Returns:
@@ -172,8 +189,9 @@ void ExportOpLabel(py::module &m)
)pbdoc");
m.def("label_into", &LabelInto, "dst"_a, "count"_a = nullptr, "stats"_a = nullptr, "src"_a,
- "connectivity"_a = NVCV_CONNECTIVITY_4_2D, "assign_labels"_a = NVCV_LABEL_FAST, py::kw_only(),
- "bg_label"_a = nullptr, "min_thresh"_a = nullptr, "max_thresh"_a = nullptr, "min_size"_a = nullptr,
+ "connectivity"_a = NVCV_CONNECTIVITY_4_2D, "assign_labels"_a = NVCV_LABEL_FAST,
+ "mask_type"_a = NVCV_REMOVE_ISLANDS_OUTSIDE_MASK_ONLY, py::kw_only(), "bg_label"_a = nullptr,
+ "min_thresh"_a = nullptr, "max_thresh"_a = nullptr, "min_size"_a = nullptr, "mask"_a = nullptr,
"stream"_a = nullptr, R"pbdoc(
Executes the Label operation on the given cuda stream.
@@ -190,12 +208,18 @@ void ExportOpLabel(py::module &m)
default is cvcuda.CONNECTIVITY_4_2D.
assign_labels (cvcuda.LABEL, optional): Choice on how labels are assigned,
default is cvcuda.LABEL.FAST.
+ mask_type (cvcuda.LabelMaskType, optional): Choice on how the mask is used,
+ default is cvcuda.REMOVE_ISLANDS_OUTSIDE_MASK_ONLY.
bg_label (Tensor, optional): Background tensor to define input values to be considered background
labels and thus ignored.
min_thresh (Tensor, optional): Minimum threshold tensor to mask input values below it to be 0, and others 1.
max_thresh (Tensor, optional): Maximum threshold tensor to mask input values above it to be 0, and others 1.
min_size (Tensor, optional): Minimum size tensor to remove islands, i.e. labeled regions with number of
elements less than the minimum size.
+ mask (Tensor, optional): Mask tensor, its behavior is controlled by \ref mask_type. One choice is to
+ control island removal in addition to \ref min_size, i.e. regions with at
+ least one element inside the mask (non-zero values) are not removed in case
+ mask_type is cvcuda.REMOVE_ISLANDS_OUTSIDE_MASK_ONLY.
stream (Stream, optional): CUDA Stream on which to perform the operation.
Returns:
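
A short sketch of calling the extended binding with the new arguments, mirroring the docstring above; the tensors are placeholders, and the HW/"N" layouts follow the new label sample later in this diff.

```python
# Illustrative call of the extended cvcuda.label signature documented above.
# Input and mask are placeholder HW tensors; min_size uses the "N" layout.
import torch
import cvcuda

d_in = cvcuda.as_tensor(torch.zeros(480, 640, dtype=torch.uint8).cuda(), layout="HW")
d_mask = cvcuda.as_tensor(torch.ones(480, 640, dtype=torch.uint8).cuda(), layout="HW")
d_min_size = cvcuda.as_tensor(torch.tensor([100], dtype=torch.int32).cuda(), layout="N")

# With REMOVE_ISLANDS_OUTSIDE_MASK_ONLY, islands smaller than min_size are
# removed only if they have no element inside the (non-zero) mask.
d_out, d_count, d_stats = cvcuda.label(
    src=d_in,
    connectivity=cvcuda.CONNECTIVITY_4_2D,
    mask_type=cvcuda.REMOVE_ISLANDS_OUTSIDE_MASK_ONLY,
    count=True,
    stats=True,
    min_size=d_min_size,
    mask=d_mask,
)
```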
diff --git a/python/mod_cvcuda/OpResize.cpp b/python/mod_cvcuda/OpResize.cpp
index 7d42dcce7..a8e41fab7 100644
--- a/python/mod_cvcuda/OpResize.cpp
+++ b/python/mod_cvcuda/OpResize.cpp
@@ -1,5 +1,5 @@
/*
- * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
@@ -173,7 +173,7 @@ void ExportOpResize(py::module &m)
stream (Stream, optional): CUDA Stream on which to perform the operation.
Returns:
- cvcuda.Tensor: The output tensor.
+ cvcuda.ImageBatchVarShape: The output image batch.
Caution:
Restrictions to several arguments may apply. Check the C
diff --git a/python/mod_cvcuda/Operators.hpp b/python/mod_cvcuda/Operators.hpp
index b48f11fbd..6197e43ba 100644
--- a/python/mod_cvcuda/Operators.hpp
+++ b/python/mod_cvcuda/Operators.hpp
@@ -49,7 +49,6 @@ using nvcvpy::TensorBatch;
namespace util = nvcvpy::util;
namespace py = ::pybind11;
-void ExportOpFindContours(py::module &m);
void ExportOpReformat(py::module &m);
void ExportOpResize(py::module &m);
void ExportOpCustomCrop(py::module &m);
diff --git a/python/mod_nvcv/CAPI.cpp b/python/mod_nvcv/CAPI.cpp
index e15f6eff8..b31fc27ec 100644
--- a/python/mod_nvcv/CAPI.cpp
+++ b/python/mod_nvcv/CAPI.cpp
@@ -105,14 +105,9 @@ LockMode ToLockMode(PyObject *_mode)
}
}
-extern "C" void ImplResource_SubmitSync(PyObject *res, PyObject *stream, PyObject *lockMode)
+extern "C" void ImplResource_SubmitSync(PyObject *res, PyObject *stream)
{
- ToSharedObj(res)->submitSync(*ToSharedObj(stream), ToLockMode(lockMode));
-}
-
-extern "C" void ImplResource_SubmitSignal(PyObject *res, PyObject *stream, PyObject *lockMode)
-{
- ToSharedObj(res)->submitSignal(*ToSharedObj(stream), ToLockMode(lockMode));
+ ToSharedObj(res)->submitSync(*ToSharedObj(stream));
}
extern "C" void ImplStream_HoldResources(PyObject *stream, PyObject *resourceList)
@@ -294,7 +289,6 @@ void ExportCAPI(py::module &m)
.ImageFormat_ToPython = &ImplImageFormat_ToPython,
.ImageFormat_FromPython = &ImplImageFormat_FromPython,
.Resource_SubmitSync = &ImplResource_SubmitSync,
- .Resource_SubmitSignal = &ImplResource_SubmitSignal,
.Stream_HoldResources = &ImplStream_HoldResources,
.Stream_GetCurrent = &ImplStream_GetCurrent,
.Stream_GetCudaHandle = &ImplStream_GetCudaHandle,
diff --git a/python/mod_nvcv/Resource.cpp b/python/mod_nvcv/Resource.cpp
index afe571569..a8d0fe67a 100644
--- a/python/mod_nvcv/Resource.cpp
+++ b/python/mod_nvcv/Resource.cpp
@@ -30,24 +30,21 @@ Resource::Resource()
m_id = idnext++;
- m_readEvent = m_writeEvent = nullptr;
+ m_event = nullptr;
try
{
- util::CheckThrow(cudaEventCreateWithFlags(&m_readEvent, cudaEventDisableTiming));
- util::CheckThrow(cudaEventCreateWithFlags(&m_writeEvent, cudaEventDisableTiming));
+ util::CheckThrow(cudaEventCreateWithFlags(&m_event, cudaEventDisableTiming));
}
catch (...)
{
- cudaEventDestroy(m_readEvent);
- cudaEventDestroy(m_writeEvent);
+ cudaEventDestroy(m_event);
throw;
}
}
Resource::~Resource()
{
- cudaEventDestroy(m_readEvent);
- cudaEventDestroy(m_writeEvent);
+ cudaEventDestroy(m_event);
}
uint64_t Resource::id() const
@@ -55,62 +52,29 @@ uint64_t Resource::id() const
return m_id;
}
-void Resource::submitSignal(Stream &stream, LockMode mode) const
+void Resource::submitSync(Stream &stream)
{
- doBeforeSubmitSignal(stream, mode);
-
- if (mode & LOCK_MODE_READ)
- {
- util::CheckThrow(cudaEventRecord(m_readEvent, stream.handle()));
- }
- if (mode & LOCK_MODE_WRITE)
+ // Check if we have a last stream; if not, set it to the current stream
+ if (!m_lastStream.has_value())
{
- util::CheckThrow(cudaEventRecord(m_writeEvent, stream.handle()));
+ m_lastStream.emplace(stream.shared_from_this()); // store a shared pointer to the stream
}
-}
-
-void Resource::submitSync(Stream &stream, LockMode mode) const
-{
- doBeforeSubmitSync(stream, mode);
- doSubmitSync(stream, mode);
-}
-
-void Resource::doSubmitSync(Stream &stream, LockMode mode) const
-{
- if (mode & LOCK_MODE_WRITE)
- {
- util::CheckThrow(cudaStreamWaitEvent(stream.handle(), m_writeEvent));
- util::CheckThrow(cudaStreamWaitEvent(stream.handle(), m_readEvent));
- }
- else if (mode & LOCK_MODE_READ)
+ // If we are on the same stream, we don't need to do anything:
+ // streams are sequential, so the previous work on this stream is already ordered before new work
+ if (m_lastStream.value()->handle() == stream.handle())
{
- util::CheckThrow(cudaStreamWaitEvent(stream.handle(), m_writeEvent));
+ return;
}
-}
-
-void Resource::sync(LockMode mode) const
-{
- py::gil_scoped_release release;
- doBeforeSync(mode);
+ // If we are on a different stream, we need to wait for that stream to finish:
+ // record an event on the old stream; the new stream will wait for it to complete
+ util::CheckThrow(cudaEventRecord(m_event, m_lastStream.value()->handle()));
+ util::CheckThrow(cudaStreamWaitEvent(stream.handle(), m_event));
- doSync(mode);
-}
-
-void Resource::doSync(LockMode mode) const
-{
- NVCV_ASSERT(PyGILState_Check() == 0);
-
- if (mode & LOCK_MODE_WRITE)
- {
- util::CheckThrow(cudaEventSynchronize(m_writeEvent));
- util::CheckThrow(cudaEventSynchronize(m_readEvent));
- }
- else if (mode & LOCK_MODE_READ)
- {
- util::CheckThrow(cudaEventSynchronize(m_writeEvent));
- }
+ // update the last stream since we changed streams
+ m_lastStream.reset();
+ m_lastStream.emplace(stream.shared_from_this());
}
std::shared_ptr Resource::shared_from_this()
@@ -127,8 +91,7 @@ void Resource::Export(py::module &m)
{
py::class_>(m, "Resource")
.def_property_readonly("id", &Resource::id, "Unique resource instance identifier")
- .def("submitSync", &Resource::submitSync)
- .def("submitSignal", &Resource::submitSignal);
+ .def("submitStreamSync", &Resource::submitSync, "Syncs object on new Stream");
}
} // namespace nvcvpy::priv
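
The simplified `submitSync()` above is a standard last-stream/event handoff. Below is a hedged PyTorch sketch of the same pattern; the class and method names are invented for illustration.

```python
# Hedged PyTorch sketch of the handoff implemented by Resource::submitSync():
# same stream -> no-op (stream order suffices); different stream -> record an
# event on the last stream and make the new stream wait on it.
import torch

class TrackedResource:
    def __init__(self):
        self._event = torch.cuda.Event()   # timing disabled by default
        self._last_stream = None           # no stream seen yet

    def submit_sync(self, stream: torch.cuda.Stream) -> None:
        if self._last_stream is None:
            self._last_stream = stream     # first use: just remember the stream
            return
        if self._last_stream.cuda_stream == stream.cuda_stream:
            return                         # same stream: work is already ordered
        self._event.record(self._last_stream)  # fence the old stream's work
        stream.wait_event(self._event)          # new stream waits for the fence
        self._last_stream = stream              # update the cached stream
```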
diff --git a/python/mod_nvcv/Resource.hpp b/python/mod_nvcv/Resource.hpp
index 21e7cc181..010c8b33c 100644
--- a/python/mod_nvcv/Resource.hpp
+++ b/python/mod_nvcv/Resource.hpp
@@ -19,6 +19,7 @@
#define NVCV_PYTHON_PRIV_RESOURCE_HPP
#include "Object.hpp"
+#include "Stream.hpp"
#include
#include
@@ -32,42 +33,64 @@ typedef struct CUevent_st *cudaEvent_t;
namespace nvcvpy::priv {
namespace py = pybind11;
-class Stream;
-
+/**
+ * @brief A class representing a CUDA resource.
+ *
+ * This class encapsulates a CUDA resource and provides methods for synchronization
+ * with CUDA streams.
+ */
class PYBIND11_EXPORT Resource : public virtual Object
{
public:
+ /**
+ * @brief Destructor.
+ */
~Resource();
+ /**
+ * @brief Export the Resource class to Python.
+ *
+ * @param m The Python module to export the class to.
+ */
static void Export(py::module &m);
+ /**
+ * @brief Get the unique identifier of the resource.
+ *
+ * @return uint64_t The unique identifier of the resource.
+ */
uint64_t id() const;
- void submitSync(Stream &stream, LockMode mode) const;
- void submitSignal(Stream &stream, LockMode mode) const;
-
- // Assumes GIL is locked (is in acquired state)
- void sync(LockMode mode) const;
-
- std::shared_ptr shared_from_this();
+ /**
+ * @brief Submit the resource for synchronization with a CUDA stream.
+ *
+ * This method synchronizes the resource with the specified CUDA stream.
+ *
+ * @param stream The CUDA stream to synchronize with.
+ */
+ void submitSync(Stream &stream);
+
+ /**
+ * @brief Get a shared pointer to this resource.
+ *
+ * @return std::shared_ptr A shared pointer to this resource.
+ */
+ std::shared_ptr shared_from_this();
+
+ /**
+ * @brief Get a shared pointer to this const resource.
+ *
+ * @return std::shared_ptr A shared pointer to this const resource.
+ */
std::shared_ptr shared_from_this() const;
protected:
Resource();
- void doSubmitSync(Stream &stream, LockMode mode) const;
-
- // Assumes GIL is not locked (is in released state)
- void doSync(LockMode mode) const;
-
private:
- // To be overriden by children if they have their own requirements
- virtual void doBeforeSync(LockMode mode) const {};
- virtual void doBeforeSubmitSync(Stream &stream, LockMode mode) const {};
- virtual void doBeforeSubmitSignal(Stream &stream, LockMode mode) const {};
-
- uint64_t m_id;
- cudaEvent_t m_readEvent, m_writeEvent;
+ uint64_t m_id; /**< The unique identifier of the resource. */
+ cudaEvent_t m_event; /**< The CUDA event used for synchronization. */
+ std::optional> m_lastStream; /**< Cache the last stream used for this resource. */
};
} // namespace nvcvpy::priv
diff --git a/python/mod_nvcv/Stream.cpp b/python/mod_nvcv/Stream.cpp
index 4c120499f..bd3c1f9dc 100644
--- a/python/mod_nvcv/Stream.cpp
+++ b/python/mod_nvcv/Stream.cpp
@@ -28,6 +28,11 @@
namespace nvcvpy::priv {
+// Static members initialization
+cudaStream_t Stream::m_auxStream = nullptr;
+std::atomic Stream::m_instanceCount = 0;
+std::mutex Stream::m_auxStreamMutex;
+
// Here we define the representation of external cuda streams.
// It defines pybind11's type casters from the python object
// to the corresponding ExternalStream.
@@ -193,7 +198,18 @@ std::shared_ptr Stream::Create()
Stream::Stream()
: m_owns(true)
{
- util::CheckThrow(cudaStreamCreate(&m_handle));
+ try
+ {
+ util::CheckThrow(cudaStreamCreateWithFlags(&m_handle, cudaStreamNonBlocking));
+ incrementInstanceCount();
+ GetAuxStream();
+ util::CheckThrow(cudaEventCreateWithFlags(&m_event, cudaEventDisableTiming));
+ }
+ catch (...)
+ {
+ destroy();
+ throw;
+ }
}
Stream::Stream(IExternalStream &extStream)
@@ -206,14 +222,72 @@ Stream::Stream(IExternalStream &extStream)
{
throw std::runtime_error("Invalid cuda stream");
}
+
+ try
+ {
+ incrementInstanceCount();
+ GetAuxStream(); // Make sure the singleton aux stream is created
+ util::CheckThrow(cudaEventCreateWithFlags(&m_event, cudaEventDisableTiming));
+ }
+ catch (...)
+ {
+ destroy();
+ throw;
+ }
+}
+
+void Stream::incrementInstanceCount()
+{
+ m_instanceCount.fetch_add(1, std::memory_order_relaxed);
+}
+
+int Stream::decrementInstanceCount()
+{
+ return m_instanceCount.fetch_sub(1, std::memory_order_acq_rel) - 1;
+}
+
+cudaStream_t &Stream::GetAuxStream()
+{
+ if (!m_auxStream)
+ {
+ std::lock_guard lock(m_auxStreamMutex);
+ if (!m_auxStream)
+ {
+ util::CheckThrow(cudaStreamCreateWithFlags(&m_auxStream, cudaStreamNonBlocking));
+ }
+ }
+ return m_auxStream;
}
Stream::~Stream()
+{
+ destroy();
+}
+
+void Stream::destroy()
{
if (m_owns)
{
- util::CheckLog(cudaStreamSynchronize(m_handle));
- util::CheckLog(cudaStreamDestroy(m_handle));
+ if (m_handle)
+ {
+ util::CheckLog(cudaStreamSynchronize(m_handle));
+ util::CheckLog(cudaStreamDestroy(m_handle));
+ m_handle = nullptr;
+ }
+ }
+ {
+ std::lock_guard lock(m_auxStreamMutex);
+ if (m_auxStream && decrementInstanceCount() == 0)
+ {
+ util::CheckThrow(cudaStreamSynchronize(m_auxStream));
+ util::CheckThrow(cudaStreamDestroy(m_auxStream));
+ m_auxStream = nullptr;
+ }
+ }
+ if (m_event)
+ {
+ util::CheckThrow(cudaEventDestroy(m_event));
+ m_event = nullptr;
}
}
@@ -240,7 +314,6 @@ intptr_t Stream::pyhandle() const
void Stream::sync()
{
py::gil_scoped_release release;
-
util::CheckThrow(cudaStreamSynchronize(m_handle));
}
@@ -283,8 +356,34 @@ void Stream::holdResources(LockResources usedResources)
delete pclosure;
};
- util::CheckThrow(cudaStreamAddCallback(m_handle, fn, closure.get(), 0));
-
+ // If we naively execute the callback in the main stream (m_handle), the GPU will wait until the callback
+ // is executed (on the host). For correctness, the GPU doesn't need to wait - it's the CPU that needs
+ // to wait for the work already scheduled to complete.
+ //
+ // Naive timeline:
+ //
+ // stream GPU_kernel1 | Callback | GPU_kernel2
+ // GPU activity xxxxxxxxxxx xxxxxxxxxxx
+ // CPU activity xxxxxxxx
+ //
+ // Optimized timeline
+ //
+ //
+ // event -----v
+ // stream GPU_kernel1 | GPU_kernel2
+ // aux_stream waitEvent >| Callback
+ //
+ // GPU activity xxxxxxxxxxx xxxxxxxxxxx
+ // CPU activity xxxxxxxx
+
+ util::CheckThrow(cudaEventRecord(m_event, m_handle)); // asynchronously record the event on the main stream
+ util::CheckThrow(
+ cudaStreamWaitEvent(GetAuxStream(), m_event)); // add async wait for the event in the aux stream
+ // The callback will be executed in the singleton aux stream, so there may be contention with callbacks and waitEvents from
+ // other streams. However, the callback only releases resources from the cache and should not be a performance bottleneck.
+ // This avoids opening a new aux stream for each stream object.
+ util::CheckThrow(
+ cudaStreamAddCallback(GetAuxStream(), fn, closure.get(), 0)); // add async callback in the aux stream
closure.release();
}
}
@@ -322,6 +421,8 @@ void Stream::Export(py::module &m)
ExportExternalStream(m);
ExportExternalStream(m);
+ fflush(stdout);
+
stream.def("__enter__", &Stream::activate, "Activate the CUDA stream as the current stream for this thread.")
.def("__exit__", &Stream::deactivate, "Deactivate the CUDA stream as the current stream for this thread.")
.def("sync", &Stream::sync, "Wait for all preceding CUDA calls in the current stream to complete.")
diff --git a/python/mod_nvcv/Stream.hpp b/python/mod_nvcv/Stream.hpp
index 81a3fc9fc..2dcceb726 100644
--- a/python/mod_nvcv/Stream.hpp
+++ b/python/mod_nvcv/Stream.hpp
@@ -24,8 +24,10 @@
#include
#include
+#include
#include
#include
+#include
#include
#include
@@ -51,7 +53,7 @@ class PYBIND11_EXPORT Stream : public CacheItem
static std::shared_ptr Create();
- ~Stream();
+ virtual ~Stream();
std::shared_ptr shared_from_this();
std::shared_ptr shared_from_this() const;
@@ -75,6 +77,8 @@ class PYBIND11_EXPORT Stream : public CacheItem
Stream(Stream &&) = delete;
Stream();
+ // Singleton access to the auxiliary CUDA stream
+
class Key final : public IKey
{
private:
@@ -88,9 +92,22 @@ class PYBIND11_EXPORT Stream : public CacheItem
return key;
}
- bool m_owns;
- cudaStream_t m_handle;
+ void destroy();
+
+ bool m_owns = false;
+ cudaStream_t m_handle = nullptr;
+ cudaEvent_t m_event = nullptr;
py::object m_wrappedObj;
+
+ // Singleton aux stream and its protection. This is a bit overkill
+ // for now, as Python is single-threaded, but it is good practice.
+ static std::mutex m_auxStreamMutex;
+ static std::atomic m_instanceCount;
+ static cudaStream_t m_auxStream;
+
+ static void incrementInstanceCount();
+ static int decrementInstanceCount();
+ static cudaStream_t &GetAuxStream();
};
} // namespace nvcvpy::priv
diff --git a/python/mod_nvcv/include/nvcv/python/CAPI.hpp b/python/mod_nvcv/include/nvcv/python/CAPI.hpp
index db5f200a0..664ed87b5 100644
--- a/python/mod_nvcv/include/nvcv/python/CAPI.hpp
+++ b/python/mod_nvcv/include/nvcv/python/CAPI.hpp
@@ -44,8 +44,7 @@ struct CAPI
PyObject *(*ImageFormat_ToPython)(NVCVImageFormat p);
NVCVImageFormat (*ImageFormat_FromPython)(PyObject *obj);
- void (*Resource_SubmitSync)(PyObject *res, PyObject *stream, PyObject *lockMode);
- void (*Resource_SubmitSignal)(PyObject *res, PyObject *stream, PyObject *lockMode);
+ void (*Resource_SubmitSync)(PyObject *res, PyObject *stream);
void (*Stream_HoldResources)(PyObject *stream, PyObject *resources);
PyObject *(*Stream_GetCurrent)();
diff --git a/python/mod_nvcv/include/nvcv/python/ResourceGuard.hpp b/python/mod_nvcv/include/nvcv/python/ResourceGuard.hpp
index 40967a84b..5ad2bae5d 100644
--- a/python/mod_nvcv/include/nvcv/python/ResourceGuard.hpp
+++ b/python/mod_nvcv/include/nvcv/python/ResourceGuard.hpp
@@ -62,43 +62,16 @@ class ResourceGuard
for (const std::reference_wrapper &r : resources)
{
py::object pyRes = r.get();
-
- capi().Resource_SubmitSync(pyRes.ptr(), m_pyStream.ptr(), pyLockMode.ptr());
+ capi().Resource_SubmitSync(pyRes.ptr(), m_pyStream.ptr());
m_resourcesPerLockMode.append(std::make_pair(pyLockMode, std::move(pyRes)));
}
+
return *this;
}
void commit()
{
capi().Stream_HoldResources(m_pyStream.ptr(), m_resourcesPerLockMode.ptr());
-
- py::list newList;
-
- auto it = m_resourcesPerLockMode.begin();
- try
- {
- // Try to signal the resources, stop on the first that fails, or
- // when all resources were signaled
- for (; it != m_resourcesPerLockMode.end(); ++it)
- {
- py::tuple t = it->cast();
-
- // resource, stream, lockmode
- capi().Resource_SubmitSignal(t[1].ptr(), m_pyStream.ptr(), t[0].ptr());
- }
- }
- catch (...)
- {
- // Add all resources that weren't signaled to the newList.
- for (; it != m_resourcesPerLockMode.end(); ++it)
- {
- newList.append(std::move(*it));
- }
- throw;
- }
-
- m_resourcesPerLockMode = std::move(newList);
}
private:
diff --git a/samples/CMakeLists.txt b/samples/CMakeLists.txt
index ca2ee0c29..806192fe1 100644
--- a/samples/CMakeLists.txt
+++ b/samples/CMakeLists.txt
@@ -13,7 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-cmake_minimum_required(VERSION 3.22)
+cmake_minimum_required(VERSION 3.20.1)
project(nvcv_samples LANGUAGES CXX)
find_package(CUDAToolkit REQUIRED)
diff --git a/samples/classification/python/main.py b/samples/classification/python/main.py
index f12c95f5a..cae6131e0 100644
--- a/samples/classification/python/main.py
+++ b/samples/classification/python/main.py
@@ -81,8 +81,10 @@ def run_sample(
cuda_device = cuda.Device(device_id)
cuda_ctx = cuda_device.retain_primary_context()
cuda_ctx.push()
- cvcuda_stream = cvcuda.Stream()
- torch_stream = torch.cuda.ExternalStream(cvcuda_stream.handle)
+ # Use the default stream for cvcuda and torch
+ # Since we never created a stream, current will be the CUDA default stream
+ cvcuda_stream = cvcuda.Stream().current
+ torch_stream = torch.cuda.default_stream(device=cuda_device)
# docs_tag: end_setup_gpu
# docs_tag: begin_setup_stages
@@ -96,6 +98,7 @@ def run_sample(
batch_size,
device_id,
cuda_ctx,
+ cvcuda_stream,
cvcuda_perf,
)
@@ -106,6 +109,7 @@ def run_sample(
batch_size,
device_id,
cuda_ctx,
+ cvcuda_stream,
cvcuda_perf,
)
diff --git a/samples/common/python/nvcodec_utils.py b/samples/common/python/nvcodec_utils.py
index 2a300d385..420e15fe2 100644
--- a/samples/common/python/nvcodec_utils.py
+++ b/samples/common/python/nvcodec_utils.py
@@ -68,6 +68,7 @@ def __init__(
batch_size,
device_id,
cuda_ctx,
+ cuda_stream,
cvcuda_perf,
):
# docs_tag: begin_init_videobatchdecoder_pyvideocodec
@@ -76,7 +77,7 @@ def __init__(
self.batch_size = batch_size
self.device_id = device_id
self.cuda_ctx = cuda_ctx
- self.cuda_stream = cvcuda.Stream().current
+ self.cuda_stream = cuda_stream
self.cvcuda_perf = cvcuda_perf
self.total_decoded = 0
self.batch_idx = 0
@@ -229,6 +230,7 @@ def __init__(
fps,
device_id,
cuda_ctx,
+ cuda_stream,
cvcuda_perf,
):
self.logger = logging.getLogger(__name__)
@@ -236,7 +238,7 @@ def __init__(
self.fps = fps
self.device_id = device_id
self.cuda_ctx = cuda_ctx
- self.cuda_stream = cvcuda.Stream().current
+ self.cuda_stream = cuda_stream
self.cvcuda_perf = cvcuda_perf
self.encoder = None
@@ -327,7 +329,7 @@ def start(self):
pass
def join(self):
- self.encoder.flush()
+ # self.encoder.flush()
self.logger.info("Wrote: %s" % self.output_file_name)
@@ -482,6 +484,7 @@ def __init__(
batch_size,
device_id,
cuda_ctx,
+ cuda_stream,
cvcuda_perf,
):
@@ -493,7 +496,7 @@ def __init__(
self.total_decoded = 0
self.batch_idx = 0
self.cuda_ctx = cuda_ctx
- self.cuda_stream = cvcuda.Stream().current
+ self.cuda_stream = cuda_stream
self.cvcuda_perf = cvcuda_perf
self.decoder = nvimgcodec.Decoder(device_id=device_id)
diff --git a/samples/label/python/label.py b/samples/label/python/label.py
new file mode 100644
index 000000000..0d8fcf219
--- /dev/null
+++ b/samples/label/python/label.py
@@ -0,0 +1,215 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+
+import torch
+import torchvision
+
+import cvcuda
+
+
+def parse_arguments():
+ """Parse this program script arguments."""
+
+ parser = argparse.ArgumentParser(prog="label", description="Labels an input image.")
+
+ parser.add_argument("input", type=str, help="Input image png file path.")
+ parser.add_argument(
+ "output",
+ nargs="?",
+ default="out.png",
+ type=str,
+ help="Output image png file path. Defaults to out.png.",
+ )
+ parser.add_argument(
+ "--max_labels",
+ default=1000,
+ type=int,
+ help="Maximum number of labels. Defaults to 1000.",
+ )
+ parser.add_argument(
+ "--min_threshold",
+ default=None,
+ type=int,
+ help="Minimum threshold to binarize input. Defaults to no minimum threshold.",
+ )
+ parser.add_argument(
+ "--max_threshold",
+ default=None,
+ type=int,
+ help="Maximum threshold to binarize input. Defaults to no maximum threshold.",
+ )
+ parser.add_argument(
+ "--min_size",
+ default=None,
+ type=int,
+ help="Minimum size to prevent a region to be removed. Defaults to no minimum size (no removals).",
+ )
+ parser.add_argument(
+ "--mask",
+ action=argparse.BooleanOptionalAction,
+ help="Apply mask to protect center islands (small regions). Defaults to no mask.",
+ )
+ parser.add_argument(
+ "--background_label",
+ default=0,
+ type=int,
+ help="Background label. Defaults to zero.",
+ )
+
+ return parser.parse_args()
+
+
+def color_labels(
+ h_labels_hw,
+ bgl,
+ bgc=torch.as_tensor([0, 0, 0], dtype=torch.uint8),
+ fgc=torch.as_tensor([255, 255, 255], dtype=torch.uint8),
+ cmap=None,
+):
+ """Convert labels to colors
+
+ Args:
+ h_labels_hw (Tensor): Tensor with labels
+ bgl (int): Background label
+ bgc (Tensor): Background color, this color is used for the background label
+ fgc (Tensor): Foreground color, this color is used when cmap is None
+ cmap (function): Colormap, e.g. matplotlib.colormaps["jet"]
+
+ Returns:
+ Tensor: Tensor with colors
+ """
+ # Create an empty Tensor with the same height and width as the labels Tensor and 3 channels (RGB)
+ h_out_hwc = torch.empty(
+ (h_labels_hw.shape[0], h_labels_hw.shape[1], 3), dtype=torch.uint8
+ )
+
+ # Set all values to be the background color
+ h_out_hwc[:, :] = bgc
+
+ # Get the unique set of labels except background label from the labels Tensor
+ h_uniq = torch.unique(h_labels_hw)
+ h_uniq = h_uniq[h_uniq != bgl]
+
+ # Set the label RGB color to be the foreground color
+ label_rgb = fgc
+
+ for i, label in enumerate(h_uniq):
+ if cmap is not None:
+ # If a color map was provided, use it to generate the label color
+ label_rgb = [int(c * 255) for c in cmap(i / h_uniq.shape[0])[:3]]
+ label_rgb = torch.as_tensor(label_rgb, dtype=torch.uint8)
+
+ h_out_hwc[h_labels_hw == label] = label_rgb
+
+ return h_out_hwc
+
+
+if __name__ == "__main__":
+
+ args = parse_arguments()
+
+ print(
+ f"I Reading input image: {args.input}\nI Writing output image: {args.output}\n"
+ f"I Minimum threshold: {args.min_threshold}\nI Maximum threshold: {args.max_threshold}\n"
+ f"I Minimum size: {args.min_size}\nI Apply mask: {args.mask}\n"
+ f"I Background label: {args.background_label}"
+ )
+
+ # Use torchvision to read an input image, convert it to gray and store it as a CHW Tensor
+ h_in_chw = torchvision.io.read_image(args.input, torchvision.io.ImageReadMode.GRAY)
+
+ # Convert the image read from Pytorch Tensor to CVCUDA Tensor with zero copy
+ d_in_chw = cvcuda.as_tensor(h_in_chw.cuda(), layout="CHW")
+
+ # Reshape CVCUDA Tensor from CHW to HW (Channel is 1) with zero copy
+ d_in_hw = d_in_chw.reshape(d_in_chw.shape[1:], "HW")
+
+ # Tensors are initialized first on the host (h_) and then copied to the device (d_), using Pytorch's .as_tensor()
+ # and .cuda() methods, and then converted to CVCUDA with zero copy, using CVCUDA's .as_tensor() method
+ h_bgl = torch.as_tensor([args.background_label], dtype=h_in_chw.dtype)
+ d_bgl = cvcuda.as_tensor(h_bgl.cuda(), layout="N")
+
+ # Tensors for min/max thresholds, min size, and mask are optional
+ d_min_thrs = None
+ d_max_thrs = None
+ d_min_size = None
+ d_mask_hw = None
+
+ if args.min_threshold:
+ h_min_thrs = torch.as_tensor([args.min_threshold], dtype=h_in_chw.dtype)
+ d_min_thrs = cvcuda.as_tensor(h_min_thrs.cuda(), layout="N")
+
+ if args.max_threshold:
+ h_max_thrs = torch.as_tensor([args.max_threshold], dtype=h_in_chw.dtype)
+ d_max_thrs = cvcuda.as_tensor(h_max_thrs.cuda(), layout="N")
+
+ if args.min_size:
+ h_min_size = torch.as_tensor([args.min_size], dtype=torch.int32)
+ d_min_size = cvcuda.as_tensor(h_min_size.cuda(), layout="N")
+
+ if args.mask:
+ # Below are slices between 10% and 90% (a center box) to be considered inside the mask
+ s_h_in_mask = slice(int(0.1 * h_in_chw.shape[1]), int(0.9 * h_in_chw.shape[1]))
+ s_w_in_mask = slice(int(0.1 * h_in_chw.shape[2]), int(0.9 * h_in_chw.shape[2]))
+
+ # The mask in host is first initialized with zeros
+ h_mask_hw = torch.zeros(h_in_chw.shape[1:], dtype=h_in_chw.dtype)
+
+ # Then the center of the mask defined by the slices is set to 1
+ h_mask_hw[s_h_in_mask, s_w_in_mask] = 1
+
+ # The Pytorch Tensor mask is copied to CUDA and converted to CVCUDA Tensor
+ d_mask_hw = cvcuda.as_tensor(h_mask_hw.cuda(), layout="HW")
+
+ # Call CVCUDA label operator using the arguments set above
+ d_out, d_count, d_stats = cvcuda.label(
+ src=d_in_hw,
+ connectivity=cvcuda.CONNECTIVITY_4_2D,
+ assign_labels=cvcuda.LABEL.SEQUENTIAL,
+ mask_type=cvcuda.REMOVE_ISLANDS_OUTSIDE_MASK_ONLY,
+ count=True,
+ stats=True,
+ max_labels=args.max_labels,
+ bg_label=d_bgl,
+ min_thresh=d_min_thrs,
+ max_thresh=d_max_thrs,
+ min_size=d_min_size,
+ mask=d_mask_hw,
+ )
+
+ # Convert CVCUDA output Tensors to Pytorch with zero copy, using CVCUDA's .cuda() method, then copy the
+ # Pytorch Tensor to the CPU, using Pytorch's .cpu() method
+ h_out = torch.as_tensor(d_out.cuda()).cpu()
+ h_count = torch.as_tensor(d_count.cuda()).cpu()
+ h_stats = torch.as_tensor(d_stats.cuda()).cpu()
+
+ print(f"I Number of labels found: {h_count[0]}")
+
+ # The stats Tensor (with statistics) has a region mark at index 6 that is set to 1 for removed regions
+ # and set to 2 for regions in the mask that cannot be removed
+ num_removed = sum([1 if h_stats[0, si, 6] == 1 else 0 for si in range(h_count[0])])
+ num_in_mask = sum([1 if h_stats[0, si, 6] == 2 else 0 for si in range(h_count[0])])
+
+ print(f"I Number of labeled regions removed: {num_removed}")
+ print(f"I Number of labeled regions in the mask: {num_in_mask}")
+ print(f"I Number of labeled regions kept: {h_count[0] - num_removed}")
+
+ # Color the labels using default behavior: white foreground and black background
+ h_out_rgb_hwc = color_labels(h_out, h_bgl[0])
+
+ # Use torchvision to write the output image from a CHW Tensor
+ torchvision.io.write_png(h_out_rgb_hwc.permute(2, 0, 1), args.output)
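
A small usage sketch for `color_labels()` with the matplotlib colormap its docstring suggests; the labels tensor is a toy placeholder, and `color_labels` is assumed in scope from the sample above.

```python
# Usage sketch for color_labels() defined above; the labels tensor is a toy
# 3x3 placeholder with background label 0 and two foreground regions.
import torch
import matplotlib

h_labels_hw = torch.tensor([[0, 1, 1],
                            [0, 2, 2],
                            [0, 0, 2]])
h_rgb_hwc = color_labels(
    h_labels_hw,
    bgl=0,                             # label 0 keeps the background color
    cmap=matplotlib.colormaps["jet"],  # distinct color per labeled region
)
print(h_rgb_hwc.shape)  # torch.Size([3, 3, 3])
```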
diff --git a/samples/label/python/main.py b/samples/label/python/main.py
index aeff0f85a..cb664f690 100644
--- a/samples/label/python/main.py
+++ b/samples/label/python/main.py
@@ -159,8 +159,10 @@ def run_sample(
cuda_device = cuda.Device(device_id)
cuda_ctx = cuda_device.retain_primary_context()
cuda_ctx.push()
- cvcuda_stream = cvcuda.Stream()
- torch_stream = torch.cuda.ExternalStream(cvcuda_stream.handle)
+ # Use the default stream for cvcuda and torch
+ # Since we never created a stream, current will be the CUDA default stream
+ cvcuda_stream = cvcuda.Stream().current
+ torch_stream = torch.cuda.default_stream(device=cuda_device)
# docs_tag: end_setup_gpu
# docs_tag: encoder_decoder setup
@@ -168,7 +170,7 @@ def run_sample(
if os.path.splitext(input_path)[1] == ".jpg" or os.path.isdir(input_path):
# Treat this as data modality of images
decoder = ImageBatchDecoder(
- input_path, batch_size, device_id, cuda_ctx, cvcuda_perf
+ input_path, batch_size, device_id, cuda_ctx, cvcuda_stream, cvcuda_perf
)
encoder = ImageBatchEncoder(
output_dir,
diff --git a/samples/object_detection/python/main.py b/samples/object_detection/python/main.py
index 935e121a1..0741ea2c8 100644
--- a/samples/object_detection/python/main.py
+++ b/samples/object_detection/python/main.py
@@ -85,8 +85,10 @@ def run_sample(
cuda_device = cuda.Device(device_id)
cuda_ctx = cuda_device.retain_primary_context()
cuda_ctx.push()
- cvcuda_stream = cvcuda.Stream()
- torch_stream = torch.cuda.ExternalStream(cvcuda_stream.handle)
+ # Use the default stream for cvcuda and torch
+ # Since we never created a stream, current will be the CUDA default stream
+ cvcuda_stream = cvcuda.Stream().current
+ torch_stream = torch.cuda.default_stream(device=cuda_device)
# docs_tag: end_setup_gpu
# docs_tag: begin_setup_stages
@@ -96,7 +98,7 @@ def run_sample(
if os.path.splitext(input_path)[1] == ".jpg" or os.path.isdir(input_path):
# Treat this as data modality of images
decoder = ImageBatchDecoder(
- input_path, batch_size, device_id, cuda_ctx, cvcuda_perf
+ input_path, batch_size, device_id, cuda_ctx, cvcuda_stream, cvcuda_perf
)
encoder = ImageBatchEncoder(
@@ -107,11 +109,11 @@ def run_sample(
else:
# Treat this as data modality of videos
decoder = VideoBatchDecoder(
- input_path, batch_size, device_id, cuda_ctx, cvcuda_perf
+ input_path, batch_size, device_id, cuda_ctx, cvcuda_stream, cvcuda_perf
)
encoder = VideoBatchEncoder(
- output_dir, decoder.fps, device_id, cuda_ctx, cvcuda_perf
+ output_dir, decoder.fps, device_id, cuda_ctx, cvcuda_stream, cvcuda_perf
)
# Define the post-processor
diff --git a/samples/scripts/benchmark.py b/samples/scripts/benchmark.py
index fe252d263..938f080c8 100644
--- a/samples/scripts/benchmark.py
+++ b/samples/scripts/benchmark.py
@@ -195,7 +195,7 @@ def parse_nvtx_gpu_proj_trace_json(json_path):
# Grab the necessary values from the JSON file.
range_id = row["RangeId"]
- if range_id == "None":
+ if not range_id or range_id == "None":
continue
flat_name = row["Name"]
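For context, the strengthened check above skips rows whose RangeId is missing, empty, or the literal string "None". A small standalone sketch of the same filtering (the row contents are made up):

```python
rows = [
    {"RangeId": "12", "Name": "batch_0"},
    {"RangeId": None, "Name": "orphan"},     # falsy value -> skipped
    {"RangeId": "None", "Name": "orphan2"},  # literal "None" -> skipped
]

kept = []
for row in rows:
    range_id = row.get("RangeId")
    if not range_id or range_id == "None":
        continue
    kept.append(row["Name"])

print(kept)  # ['batch_0']
```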
diff --git a/samples/scripts/run_samples.sh b/samples/scripts/run_samples.sh
index dea98a584..7a8fc3025 100755
--- a/samples/scripts/run_samples.sh
+++ b/samples/scripts/run_samples.sh
@@ -1,6 +1,6 @@
#!/bin/bash -e
-# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -36,6 +36,16 @@ echo "SEGMENTATION_OUT_DIR: $SEGMENTATION_OUT_DIR"
echo "DETECTION_OUT_DIR: $DETECTION_OUT_DIR"
echo "DISTANCE_LABEL_OUT_DIR: $DISTANCE_LABEL_OUT_DIR"
+create_output_dir() {
+ local base_dir=$1
+ local run_number=1
+ while [[ -d "$base_dir/$run_number" ]]; do
+ let run_number++
+ done
+ mkdir -p "$base_dir/$run_number"
+ echo "$base_dir/$run_number"
+}
+
# Crop and Resize Sample
# Batch size 2
LD_LIBRARY_PATH=$SAMPLES_DIR/lib $SAMPLES_DIR/build/cropandresize/cvcuda_sample_cropandresize -i $SAMPLES_DIR/assets/images/ -b 2
@@ -45,20 +55,27 @@ LD_LIBRARY_PATH=$SAMPLES_DIR/lib $SAMPLES_DIR/build/cropandresize/cvcuda_sample_
# Run the classification Python sample with default settings, without any command-line args.
rm -rf "$CLASSIFICATION_OUT_DIR"
mkdir "$CLASSIFICATION_OUT_DIR"
-python3 $SAMPLES_DIR/classification/python/main.py -o "$CLASSIFICATION_OUT_DIR"
+CLASSIFICATION_RUN_DIR=$(create_output_dir "$CLASSIFICATION_OUT_DIR")
+python3 $SAMPLES_DIR/classification/python/main.py -o "$CLASSIFICATION_RUN_DIR"
# Run it on a specific image with batch size 1 with PyTorch backend.
-python3 $SAMPLES_DIR/classification/python/main.py -i $SAMPLES_DIR/assets/images/tabby_tiger_cat.jpg -b 1 -bk pytorch -o "$CLASSIFICATION_OUT_DIR"
+CLASSIFICATION_RUN_DIR=$(create_output_dir "$CLASSIFICATION_OUT_DIR")
+python3 $SAMPLES_DIR/classification/python/main.py -i $SAMPLES_DIR/assets/images/tabby_tiger_cat.jpg -b 1 -bk pytorch -o "$CLASSIFICATION_RUN_DIR"
# Run it on a specific image with batch size 4 with PyTorch backend. Uses the same image multiple times
-python3 $SAMPLES_DIR/classification/python/main.py -i $SAMPLES_DIR/assets/images/tabby_tiger_cat.jpg -b 4 -bk pytorch -o "$CLASSIFICATION_OUT_DIR"
+CLASSIFICATION_RUN_DIR=$(create_output_dir "$CLASSIFICATION_OUT_DIR")
+python3 $SAMPLES_DIR/classification/python/main.py -i $SAMPLES_DIR/assets/images/tabby_tiger_cat.jpg -b 4 -bk pytorch -o "$CLASSIFICATION_RUN_DIR"
# Run it on a folder worth of images with batch size 2 with PyTorch backend.
-python3 $SAMPLES_DIR/classification/python/main.py -i $SAMPLES_DIR/assets/images/ -b 2 -bk pytorch -o "$CLASSIFICATION_OUT_DIR"
+CLASSIFICATION_RUN_DIR=$(create_output_dir "$CLASSIFICATION_OUT_DIR")
+python3 $SAMPLES_DIR/classification/python/main.py -i $SAMPLES_DIR/assets/images/ -b 2 -bk pytorch -o "$CLASSIFICATION_RUN_DIR"
# Run it on a specific image with batch size 1 with TensorRT backend with saving the output in a specific directory.
-
-python3 $SAMPLES_DIR/classification/python/main.py -i $SAMPLES_DIR/assets/images/tabby_tiger_cat.jpg -b 1 -bk tensorrt -o "$CLASSIFICATION_OUT_DIR"
+CLASSIFICATION_RUN_DIR=$(create_output_dir "$CLASSIFICATION_OUT_DIR")
+python3 $SAMPLES_DIR/classification/python/main.py -i $SAMPLES_DIR/assets/images/tabby_tiger_cat.jpg -b 1 -bk tensorrt -o "$CLASSIFICATION_RUN_DIR"
# Run it on a specific image with batch size 2 with TensorRT backend with saving the output in a specific directory.
-python3 $SAMPLES_DIR/classification/python/main.py -i $SAMPLES_DIR/assets/images/tabby_tiger_cat.jpg -b 2 -bk tensorrt -o "$CLASSIFICATION_OUT_DIR"
+CLASSIFICATION_RUN_DIR=$(create_output_dir "$CLASSIFICATION_OUT_DIR")
+python3 $SAMPLES_DIR/classification/python/main.py -i $SAMPLES_DIR/assets/images/tabby_tiger_cat.jpg -b 2 -bk tensorrt -o "$CLASSIFICATION_RUN_DIR"
# Run it on a video with batch size 1 with TensorRT backend with saving the output in a specific directory.
-python3 $SAMPLES_DIR/classification/python/main.py -i $SAMPLES_DIR/assets/videos/pexels-ilimdar-avgezer-7081456.mp4 -b 1 -bk tensorrt -o "$CLASSIFICATION_OUT_DIR"
+CLASSIFICATION_RUN_DIR=$(create_output_dir "$CLASSIFICATION_OUT_DIR")
+python3 $SAMPLES_DIR/classification/python/main.py -i $SAMPLES_DIR/assets/videos/pexels-ilimdar-avgezer-7081456.mp4 -b 1 -bk tensorrt -o "$CLASSIFICATION_RUN_DIR"
+
# Run the classification C++ sample. Since the Python sample was already run, we can reuse the TensorRT model
# and the labels file generated by it.
# Batch size 1
@@ -66,44 +83,56 @@ LD_LIBRARY_PATH=$SAMPLES_DIR/lib $SAMPLES_DIR/build/classification/cvcuda_sample
# Batch size 2
LD_LIBRARY_PATH=$SAMPLES_DIR/lib $SAMPLES_DIR/build/classification/cvcuda_sample_classification -e /tmp/classification/model.2.224.224.trtmodel -i $SAMPLES_DIR/assets/images/tabby_tiger_cat.jpg -l /tmp/classification/labels.txt -b 2
-
# Run the segmentation Python sample with default settings, without any command-line args.
rm -rf "$SEGMENTATION_OUT_DIR"
mkdir "$SEGMENTATION_OUT_DIR"
-python3 $SAMPLES_DIR/segmentation/python/main.py -o "$SEGMENTATION_OUT_DIR"
+SEGMENTATION_RUN_DIR=$(create_output_dir "$SEGMENTATION_OUT_DIR")
+python3 $SAMPLES_DIR/segmentation/python/main.py -o "$SEGMENTATION_RUN_DIR"
# Run the segmentation sample with default settings for PyTorch backend.
-python3 $SAMPLES_DIR/segmentation/python/main.py -bk pytorch -o "$SEGMENTATION_OUT_DIR"
+SEGMENTATION_RUN_DIR=$(create_output_dir "$SEGMENTATION_OUT_DIR")
+python3 $SAMPLES_DIR/segmentation/python/main.py -bk pytorch -o "$SEGMENTATION_RUN_DIR"
# Run it on a single image with high batch size for the background class writing to a specific directory with PyTorch backend
-python3 $SAMPLES_DIR/segmentation/python/main.py -i $SAMPLES_DIR/assets/images/tabby_tiger_cat.jpg -o "$SEGMENTATION_OUT_DIR" -b 5 -c __background__ -bk pytorch
+SEGMENTATION_RUN_DIR=$(create_output_dir "$SEGMENTATION_OUT_DIR")
+python3 $SAMPLES_DIR/segmentation/python/main.py -i $SAMPLES_DIR/assets/images/tabby_tiger_cat.jpg -o "$SEGMENTATION_RUN_DIR" -b 5 -c __background__ -bk pytorch
# Run it on a folder worth of images with the default tensorrt backend
-python3 $SAMPLES_DIR/segmentation/python/main.py -i $SAMPLES_DIR/assets/images/ -o "$SEGMENTATION_OUT_DIR" -b 4 -c __background__
+SEGMENTATION_RUN_DIR=$(create_output_dir "$SEGMENTATION_OUT_DIR")
+python3 $SAMPLES_DIR/segmentation/python/main.py -i $SAMPLES_DIR/assets/images/ -o "$SEGMENTATION_RUN_DIR" -b 4 -c __background__
# Run it on a folder worth of images with PyTorch
-python3 $SAMPLES_DIR/segmentation/python/main.py -i $SAMPLES_DIR/assets/images/ -o "$SEGMENTATION_OUT_DIR" -b 5 -c __background__ -bk pytorch
+SEGMENTATION_RUN_DIR=$(create_output_dir "$SEGMENTATION_OUT_DIR")
+python3 $SAMPLES_DIR/segmentation/python/main.py -i $SAMPLES_DIR/assets/images/ -o "$SEGMENTATION_RUN_DIR" -b 5 -c __background__ -bk pytorch
# Run on a single image with custom resized input given to the sample for the dog class
-python3 $SAMPLES_DIR/segmentation/python/main.py -i $SAMPLES_DIR/assets/images/Weimaraner.jpg -o "$SEGMENTATION_OUT_DIR" -b 1 -c dog -th 512 -tw 512
+SEGMENTATION_RUN_DIR=$(create_output_dir "$SEGMENTATION_OUT_DIR")
+python3 $SAMPLES_DIR/segmentation/python/main.py -i $SAMPLES_DIR/assets/images/Weimaraner.jpg -o "$SEGMENTATION_RUN_DIR" -b 1 -c dog -th 512 -tw 512
# Run it on a video for class background.
-python3 $SAMPLES_DIR/segmentation/python/main.py -i $SAMPLES_DIR/assets/videos/pexels-ilimdar-avgezer-7081456.mp4 -b 4 -c __background__ -o "$SEGMENTATION_OUT_DIR"
+SEGMENTATION_RUN_DIR=$(create_output_dir "$SEGMENTATION_OUT_DIR")
+python3 $SAMPLES_DIR/segmentation/python/main.py -i $SAMPLES_DIR/assets/videos/pexels-ilimdar-avgezer-7081456.mp4 -b 4 -c __background__ -o "$SEGMENTATION_RUN_DIR"
# Run it on a video for class background with the PyTorch backend.
-python3 $SAMPLES_DIR/segmentation/python/main.py -i $SAMPLES_DIR/assets/videos/pexels-ilimdar-avgezer-7081456.mp4 -b 4 -c __background__ -bk pytorch -o "$SEGMENTATION_OUT_DIR"
-
+SEGMENTATION_RUN_DIR=$(create_output_dir "$SEGMENTATION_OUT_DIR")
+python3 $SAMPLES_DIR/segmentation/python/main.py -i $SAMPLES_DIR/assets/videos/pexels-ilimdar-avgezer-7081456.mp4 -b 4 -c __background__ -bk pytorch -o "$SEGMENTATION_RUN_DIR"
# Run the object detection Python sample with default settings, without any command-line args.
rm -rf "$DETECTION_OUT_DIR"
mkdir "$DETECTION_OUT_DIR"
-python3 $SAMPLES_DIR/object_detection/python/main.py -o "$DETECTION_OUT_DIR"
+DETECTION_RUN_DIR=$(create_output_dir "$DETECTION_OUT_DIR")
+python3 $SAMPLES_DIR/object_detection/python/main.py -o "$DETECTION_RUN_DIR"
# Run it with batch size 1 on a single image
-python3 $SAMPLES_DIR/object_detection/python/main.py -i $SAMPLES_DIR/assets/images/peoplenet.jpg -b 1 -o "$DETECTION_OUT_DIR"
+DETECTION_RUN_DIR=$(create_output_dir "$DETECTION_OUT_DIR")
+python3 $SAMPLES_DIR/object_detection/python/main.py -i $SAMPLES_DIR/assets/images/peoplenet.jpg -b 1 -o "$DETECTION_RUN_DIR"
# Run it with batch size 4 on a video
-python3 $SAMPLES_DIR/object_detection/python/main.py -i $SAMPLES_DIR/assets/videos/pexels-chiel-slotman-4423925-1920x1080-25fps.mp4 -b 4 -o "$DETECTION_OUT_DIR"
+DETECTION_RUN_DIR=$(create_output_dir "$DETECTION_OUT_DIR")
+python3 $SAMPLES_DIR/object_detection/python/main.py -i $SAMPLES_DIR/assets/videos/pexels-chiel-slotman-4423925-1920x1080-25fps.mp4 -b 4 -o "$DETECTION_RUN_DIR"
# Run it with batch size 3 on a folder of images
-python3 $SAMPLES_DIR/object_detection/python/main.py -i $SAMPLES_DIR/assets/images/ -b 3 -o "$DETECTION_OUT_DIR"
+DETECTION_RUN_DIR=$(create_output_dir "$DETECTION_OUT_DIR")
+python3 $SAMPLES_DIR/object_detection/python/main.py -i $SAMPLES_DIR/assets/images/ -b 3 -o "$DETECTION_RUN_DIR"
# Run it with the TensorFlow backend
-python3 $SAMPLES_DIR/object_detection/python/main.py -i $SAMPLES_DIR/assets/videos/pexels-chiel-slotman-4423925-1920x1080-25fps.mp4 -b 4 -bk tensorflow -o "$DETECTION_OUT_DIR"
-
+DETECTION_RUN_DIR=$(create_output_dir "$DETECTION_OUT_DIR")
+python3 $SAMPLES_DIR/object_detection/python/main.py -i $SAMPLES_DIR/assets/videos/pexels-chiel-slotman-4423925-1920x1080-25fps.mp4 -b 4 -bk tensorflow -o "$DETECTION_RUN_DIR"
# Run the distance label Python sample with default settings, without any command-line args.
rm -rf "$DISTANCE_LABEL_OUT_DIR"
mkdir "$DISTANCE_LABEL_OUT_DIR"
-python3 $SAMPLES_DIR/label/python/main.py -o "$DISTANCE_LABEL_OUT_DIR"
+DISTANCE_LABEL_RUN_DIR=$(create_output_dir "$DISTANCE_LABEL_OUT_DIR")
+python3 $SAMPLES_DIR/label/python/main.py -o "$DISTANCE_LABEL_RUN_DIR"
# Run it with batch size 1 on a single image
-python3 $SAMPLES_DIR/label/python/main.py -i $SAMPLES_DIR/assets/images/peoplenet.jpg -b 1 -o "$DISTANCE_LABEL_OUT_DIR"
+DISTANCE_LABEL_RUN_DIR=$(create_output_dir "$DISTANCE_LABEL_OUT_DIR")
+python3 $SAMPLES_DIR/label/python/main.py -i $SAMPLES_DIR/assets/images/peoplenet.jpg -b 1 -o "$DISTANCE_LABEL_RUN_DIR"
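create_output_dir gives each invocation its own numbered subdirectory so repeated runs never clobber earlier outputs. A minimal Python sketch of the same scheme, for readers who want it outside of bash (paths are illustrative):

```python
import os

def create_output_dir(base_dir: str) -> str:
    """Return base_dir/<n> for the first run number <n> not yet on disk."""
    run_number = 1
    while os.path.isdir(os.path.join(base_dir, str(run_number))):
        run_number += 1
    run_dir = os.path.join(base_dir, str(run_number))
    os.makedirs(run_dir)
    return run_dir

# First call creates <base>/1, the next <base>/2, and so on.
run_dir = create_output_dir("/tmp/classification")
```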
diff --git a/samples/segmentation/python/main.py b/samples/segmentation/python/main.py
index 02c8a9820..6ee5411a5 100644
--- a/samples/segmentation/python/main.py
+++ b/samples/segmentation/python/main.py
@@ -85,8 +85,10 @@ def run_sample(
cuda_device = cuda.Device(device_id)
cuda_ctx = cuda_device.retain_primary_context()
cuda_ctx.push()
- cvcuda_stream = cvcuda.Stream()
- torch_stream = torch.cuda.ExternalStream(cvcuda_stream.handle)
+ # Use the default stream for cvcuda and torch
+ # Since we never create a stream, the current stream will be the CUDA default stream
+ cvcuda_stream = cvcuda.Stream.current
+ torch_stream = torch.cuda.default_stream(device=device_id)
# docs_tag: end_setup_gpu
# docs_tag: begin_setup_stages
@@ -100,6 +102,7 @@ def run_sample(
batch_size,
device_id,
cuda_ctx,
+ cvcuda_stream,
cvcuda_perf,
)
@@ -115,6 +118,7 @@ def run_sample(
batch_size,
device_id,
cuda_ctx,
+ cvcuda_stream,
cvcuda_perf,
)
@@ -123,6 +127,7 @@ def run_sample(
decoder.fps,
device_id,
cuda_ctx,
+ cvcuda_stream,
cvcuda_perf,
)
@@ -169,7 +174,7 @@ def run_sample(
batch_idx = 0
while True:
cvcuda_perf.push_range("batch", batch_idx=batch_idx)
-
+ # Make sure that cvcuda and torch are using the same stream
with cvcuda_stream, torch.cuda.stream(torch_stream):
# Stage 1: decode
batch = decoder()
diff --git a/samples/segmentation/python/triton_client.py b/samples/segmentation/python/triton_client.py
index 7802fec2d..d6eff764d 100644
--- a/samples/segmentation/python/triton_client.py
+++ b/samples/segmentation/python/triton_client.py
@@ -104,8 +104,10 @@ def run_sample(
cuda_device = cuda.Device(device_id)
cuda_ctx = cuda_device.retain_primary_context()
cuda_ctx.push()
- cvcuda_stream = cvcuda.Stream()
- torch_stream = torch.cuda.ExternalStream(cvcuda_stream.handle)
+ # Use the default stream for cvcuda and torch
+ # Since we never create a stream, the current stream will be the CUDA default stream
+ cvcuda_stream = cvcuda.Stream.current
+ torch_stream = torch.cuda.default_stream(device=device_id)
# docs_tag: end_stream_setup
# docs_tag: begin_setup_triton_client
@@ -128,6 +130,7 @@ def run_sample(
batch_size,
device_id,
cuda_ctx,
+ cvcuda_stream,
cvcuda_perf,
)
@@ -166,11 +169,12 @@ def run_sample(
batch_size,
device_id,
cuda_ctx,
+ cvcuda_stream,
cvcuda_perf,
)
encoder = VideoBatchEncoder(
- output_dir, decoder.fps, device_id, cuda_ctx, cvcuda_perf
+ output_dir, decoder.fps, device_id, cuda_ctx, cvcuda_stream, cvcuda_perf
)
# Fire up encoder/decoder
diff --git a/src/cvcuda/CMakeLists.txt b/src/cvcuda/CMakeLists.txt
index 4a21a4c56..202caf756 100644
--- a/src/cvcuda/CMakeLists.txt
+++ b/src/cvcuda/CMakeLists.txt
@@ -22,7 +22,6 @@ set(CV_CUDA_OP_FILES
OpOSD.cpp
OpHistogramEq.cpp
OpAdvCvtColor.cpp
- OpFindContours.cpp
OpSIFT.cpp
OpMinMaxLoc.cpp
OpHistogram.cpp
@@ -69,6 +68,7 @@ set(CV_CUDA_OP_FILES
OpLabel.cpp
OpPairwiseMatcher.cpp
OpFindHomography.cpp
+ OpStack.cpp
)
# filter only the ones that match the pattern (case insensitive); should be set at the global level
@@ -91,7 +91,6 @@ else()
endif()
add_library(cvcuda SHARED
- OpStack.cpp
${CV_CUDA_LIB_FILES}
)
diff --git a/src/cvcuda/OpFindContours.cpp b/src/cvcuda/OpFindContours.cpp
deleted file mode 100644
index 8c5080908..000000000
--- a/src/cvcuda/OpFindContours.cpp
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "priv/OpFindContours.hpp"
-
-#include "priv/SymbolVersioning.hpp"
-
-#include
-#include
-#include
-#include
-
-namespace priv = cvcuda::priv;
-
-CVCUDA_DEFINE_API(0, 4, NVCVStatus, cvcudaFindContoursCreate,
- (NVCVOperatorHandle * handle, int32_t maxWidth, int32_t maxHeight, int32_t maxBatchSize))
-{
- return nvcv::ProtectCall(
- [&]
- {
- if (handle == nullptr)
- {
- throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT,
- "Pointer to NVCVOperator handle must not be NULL");
- }
-
- *handle = reinterpret_cast<NVCVOperatorHandle>(
- new priv::FindContours(nvcv::Size2D{maxWidth, maxHeight}, maxBatchSize));
- });
-}
-
-CVCUDA_DEFINE_API(0, 4, NVCVStatus, cvcudaFindContoursSubmit,
- (NVCVOperatorHandle handle, cudaStream_t stream, NVCVTensorHandle in, NVCVTensorHandle points,
- NVCVTensorHandle counts))
-{
- return nvcv::ProtectCall(
- [&]
- {
- nvcv::TensorWrapHandle point(points), input(in), count(counts);
- priv::ToDynamicRef<priv::FindContours>(handle)(stream, input, point, count);
- });
-}
diff --git a/src/cvcuda/OpLabel.cpp b/src/cvcuda/OpLabel.cpp
index 351cce2b4..807c99e6a 100644
--- a/src/cvcuda/OpLabel.cpp
+++ b/src/cvcuda/OpLabel.cpp
@@ -38,11 +38,11 @@ CVCUDA_DEFINE_API(0, 5, NVCVStatus, cvcudaLabelCreate, (NVCVOperatorHandle * han
});
}
-CVCUDA_DEFINE_API(0, 5, NVCVStatus, cvcudaLabelSubmit,
+CVCUDA_DEFINE_API(0, 7, NVCVStatus, cvcudaLabelSubmit,
(NVCVOperatorHandle handle, cudaStream_t stream, NVCVTensorHandle in, NVCVTensorHandle out,
NVCVTensorHandle bgLabel, NVCVTensorHandle minThresh, NVCVTensorHandle maxThresh,
- NVCVTensorHandle minSize, NVCVTensorHandle count, NVCVTensorHandle stats,
- NVCVConnectivityType connectivity, NVCVLabelType assignLabels))
+ NVCVTensorHandle minSize, NVCVTensorHandle count, NVCVTensorHandle stats, NVCVTensorHandle mask,
+ NVCVConnectivityType connectivity, NVCVLabelType assignLabels, NVCVLabelMaskType maskType))
{
return nvcv::ProtectCall(
[&]
@@ -50,6 +50,7 @@ CVCUDA_DEFINE_API(0, 5, NVCVStatus, cvcudaLabelSubmit,
cvcuda::priv::ToDynamicRef<cvcuda::priv::Label>(handle)(
stream, nvcv::TensorWrapHandle{in}, nvcv::TensorWrapHandle{out}, nvcv::TensorWrapHandle{bgLabel},
nvcv::TensorWrapHandle{minThresh}, nvcv::TensorWrapHandle{maxThresh}, nvcv::TensorWrapHandle{minSize},
- nvcv::TensorWrapHandle{count}, nvcv::TensorWrapHandle{stats}, connectivity, assignLabels);
+ nvcv::TensorWrapHandle{count}, nvcv::TensorWrapHandle{stats}, nvcv::TensorWrapHandle{mask},
+ connectivity, assignLabels, maskType);
});
}
diff --git a/src/cvcuda/include/cvcuda/OpFindContours.h b/src/cvcuda/include/cvcuda/OpFindContours.h
deleted file mode 100644
index 78ea04e40..000000000
--- a/src/cvcuda/include/cvcuda/OpFindContours.h
+++ /dev/null
@@ -1,124 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * @file OpFindContours.h
- *
- * @brief Defines types and functions to handle the resize operation.
- * @defgroup NVCV_C_ALGORITHM_FIND_CONTOURS Find Contours
- * @{
- */
-
-#ifndef CVCUDA_FIND_CONTOURS_H
-#define CVCUDA_FIND_CONTOURS_H
-
-#include "Operator.h"
-#include "Types.h"
-#include "detail/Export.h"
-
-#include
-#include
-#include
-#include
-
-#ifdef __cplusplus
-extern "C"
-{
-#endif
-
-/** Constructs and an instance of the resize operator.
- *
- * @param [out] handle Where the image instance handle will be written to.
- * + Must not be NULL.
- *
- * @retval #NVCV_ERROR_INVALID_ARGUMENT Handle is null.
- * @retval #NVCV_ERROR_OUT_OF_MEMORY Not enough memory to create the operator.
- * @retval #NVCV_SUCCESS Operation executed successfully.
- */
-CVCUDA_PUBLIC NVCVStatus cvcudaFindContoursCreate(NVCVOperatorHandle *handle, int32_t maxWidth, int32_t maxHeight,
- int32_t maxBatchSize);
-
-/**
- * Limitations:
- *
- * Input:
- * Data Layout: [kNHWC, kHWC]
- * Channels: [1]
- *
- * Data Type | Allowed
- * -------------- | -------------
- * 8bit Unsigned | Yes
- * 8bit Signed | No
- * 16bit Unsigned | No
- * 16bit Signed | No
- * 32bit Unsigned | No
- * 32bit Signed | No
- * 32bit Float | No
- * 64bit Float | No
- *
- * Output:
- * Data Layout: [kNHWC, kHWC]
- * Channels: [1, 3, 4]
- *
- * Data Type | Allowed
- * -------------- | -------------
- * 8bit Unsigned | Yes
- * 8bit Signed | No
- * 16bit Unsigned | Yes
- * 16bit Signed | No
- * 32bit Unsigned | No
- * 32bit Signed | Yes
- * 32bit Float | Yes
- * 64bit Float | No
- *
- * Input/Output dependency
- *
- * Property | Input == Output
- * -------------- | -------------
- * Data Layout | Yes
- * Data Type | Yes
- * Number | Yes
- * Channels | Yes
- * Width | Yes
- * Height | Yes
- *
- * @param [in] handle Handle to the operator.
- * + Must not be NULL.
- * @param [in] stream Handle to a valid CUDA stream.
- * @param [in] in GPU pointer to input data. Represents an 8-bit, unsigned,
- * single-channel image. Non-zero pixels are treated as 1's, and zero
- * pixels remain as 0's, which makes the image binary.
- * @param [out] points GPU pointer to output data. It contains the detected
- * contours for the input image. The data is structured as: [x_c0_p0,
- * y_c0_p0, ..., x_ci_pj, y_ci_pj, ...], where "ci" denotes a contour's
- * index in the output array and "pj" is a point's index within a
- * contour.
- * @param [out] numPoints Holds the number of contour points for each image.
- * Specifically, numPoints[i] gives the number of contours for the i-th
- * image, while numPoints[i][j] gives the number of points in the j-th
- * contour of i-th image.
- */
-/** @{ */
-CVCUDA_PUBLIC NVCVStatus cvcudaFindContoursSubmit(NVCVOperatorHandle handle, cudaStream_t stream, NVCVTensorHandle in,
- NVCVTensorHandle points, NVCVTensorHandle numPoints);
-/** @} */
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* CVCUDA_FIND_CONTOURS_H */
diff --git a/src/cvcuda/include/cvcuda/OpFindContours.hpp b/src/cvcuda/include/cvcuda/OpFindContours.hpp
deleted file mode 100644
index 29f84ffe3..000000000
--- a/src/cvcuda/include/cvcuda/OpFindContours.hpp
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * @file OpFindContours.hpp
- *
- * @brief Defines the public C++ Class for the resize operation.
- * @defgroup NVCV_CPP_ALGORITHM_FIND_CONTOURS Find Contours
- * @{
- */
-
-#ifndef CVCUDA_FIND_CONTOURS_HPP
-#define CVCUDA_FIND_CONTOURS_HPP
-
-#include "IOperator.hpp"
-#include "OpFindContours.h"
-
-#include
-#include
-#include
-#include
-#include
-
-namespace cvcuda {
-
-class FindContours final : public IOperator
-{
-public:
- static constexpr int32_t MAX_NUM_CONTOURS = 256;
- static constexpr int32_t MAX_CONTOUR_POINTS = 4 * 1024;
- static constexpr int32_t MAX_TOTAL_POINTS = MAX_NUM_CONTOURS * MAX_CONTOUR_POINTS;
-
- explicit FindContours() = delete;
- explicit FindContours(nvcv::Size2D maxSize, int32_t maxBatchSize);
-
- ~FindContours();
-
- void operator()(cudaStream_t stream, nvcv::Tensor &in, nvcv::Tensor &points, nvcv::Tensor &numPoints);
-
- virtual NVCVOperatorHandle handle() const noexcept override;
-
-private:
- NVCVOperatorHandle m_handle;
-};
-
-inline FindContours::FindContours(nvcv::Size2D maxSize, int32_t maxBatchSize)
-{
- nvcv::detail::CheckThrow(cvcudaFindContoursCreate(&m_handle, maxSize.w, maxSize.h, maxBatchSize));
- assert(m_handle);
-}
-
-inline FindContours::~FindContours()
-{
- nvcvOperatorDestroy(m_handle);
- m_handle = nullptr;
-}
-
-inline void FindContours::operator()(cudaStream_t stream, nvcv::Tensor &in, nvcv::Tensor &points,
- nvcv::Tensor &numPoints)
-{
- nvcv::detail::CheckThrow(
- cvcudaFindContoursSubmit(m_handle, stream, in.handle(), points.handle(), numPoints.handle()));
-}
-
-inline NVCVOperatorHandle FindContours::handle() const noexcept
-{
- return m_handle;
-}
-
-} // namespace cvcuda
-
-#endif // CVCUDA_FIND_CONTOURS_HPP
diff --git a/src/cvcuda/include/cvcuda/OpFindHomography.h b/src/cvcuda/include/cvcuda/OpFindHomography.h
index 6d5c5dcc5..b1806254d 100644
--- a/src/cvcuda/include/cvcuda/OpFindHomography.h
+++ b/src/cvcuda/include/cvcuda/OpFindHomography.h
@@ -105,15 +105,15 @@ CVCUDA_PUBLIC NVCVStatus cvcudaFindHomographyCreate(NVCVOperatorHandle *handle,
* from 0 to batch-1, j ranges from 4 to number of coordinates per image, and the data type being
* float2 for (x=x, y=y)
* + Number of coordinates must be >= 4
- * + Must have data type 2F32
- * + Must have rank 2
+ * + Must have data type 2F32 or F32
+ * + Must have rank 2 or 3
*
* @param [in] dstPts Input tensor, dstPts[i, j] is the set of coordinates for the destination image where i ranges
* from 0 to batch-1, j ranges from 4 to number of coordinates per image, and the data type being
* float2 for (x=x, y=y)
* + Number of coordinates must be >= 4
- * + Must have data type 2F32
- * + Must have rank 2
+ * + Must have data type 2F32 or F32
+ * + Must have rank 2 or 3
*
* @param [out] out Output tensor, models[i, j, k] is the output model tensor which maps the src points to dst points
* in image i, where i ranges from 0 to batch-1, j ranges from 0 to 2 and k ranges from 0 to 2, and
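The relaxed constraints above mean the point tensors may be rank 2 with packed 2F32 elements or rank 3 with an F32 trailing dimension of 2. A hedged sketch of the accepted shapes using PyTorch tensors (this only illustrates the layouts the validation accepts; actual submission goes through cvcudaFindHomographySubmit or the corresponding Python binding):

```python
import torch

batch, num_points = 2, 8  # at least 4 correspondences per sample

# Rank-3 layout: F32 with the trailing dimension holding (x, y).
src = torch.rand((batch, num_points, 2), dtype=torch.float32, device="cuda")
dst = src * 1.5 + 0.25  # a similarity transform, so a homography exists

# The rank-2 layout packs each (x, y) pair into a single 2F32 element;
# it is the same buffer viewed as (batch, num_points) of float2 values.

# The model output is always rank 3: one 3x3 F32 matrix per sample.
models = torch.empty((batch, 3, 3), dtype=torch.float32, device="cuda")
```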
diff --git a/src/cvcuda/include/cvcuda/OpLabel.h b/src/cvcuda/include/cvcuda/OpLabel.h
index 77f620a52..06a3a7ac8 100644
--- a/src/cvcuda/include/cvcuda/OpLabel.h
+++ b/src/cvcuda/include/cvcuda/OpLabel.h
@@ -101,7 +101,7 @@ CVCUDA_PUBLIC NVCVStatus cvcudaLabelCreate(NVCVOperatorHandle *handle);
* 16bit Unsigned | No
* 16bit Signed | No
* 32bit Unsigned | Yes
- * 32bit Signed | No
+ * 32bit Signed | Yes
* 32bit Float | No
* 64bit Float | No
*
@@ -116,6 +116,8 @@ CVCUDA_PUBLIC NVCVStatus cvcudaLabelCreate(NVCVOperatorHandle *handle);
* Height | Yes
* Depth | Yes
*
+ * @note The number of elements (pixels or voxels) in input and output tensors must be at most \f$ 2^{31} - 1 \f$.
+ *
* @param [in] handle Handle to the operator.
* + Must not be NULL.
* @param [in] stream Handle to a valid CUDA stream.
@@ -177,7 +179,7 @@ CVCUDA_PUBLIC NVCVStatus cvcudaLabelCreate(NVCVOperatorHandle *handle);
* done before this post-filter step, also known as island-removal step.
* + It must have the same number of samples as input and output tensors.
* + It must have one element per sample, i.e. number of channels must be 1 in a [NC] tensor.
- * + It must have U32 data type.
+ * + It must have S32 or U32 data type.
* + It may be NULL to not apply minimum size regions removal as a post-filter.
* + If not NULL, the \ref bgLabel and \ref stats tensors must not be NULL as well.
*
@@ -189,32 +191,50 @@ CVCUDA_PUBLIC NVCVStatus cvcudaLabelCreate(NVCVOperatorHandle *handle);
* of \ref stats tensor, and regions potentially removed by \ref minSize tensor.
* + It must have the same number of samples as input and output tensors.
* + It must have one element per sample, i.e. number of channels must be 1 in a [NC] tensor.
- * + It must have U32 data type.
+ * + It must have S32 or U32 data type.
* + It may be NULL to disregard counting the number of different labels found.
*
* @param [out] stats Statistics tensor. The expected layout is [NMA], meaning rank-3 tensor with first dimension
* as the number of samples N, matching input and output tensors, second dimension M as maximum
* number of different labels statistics to be computed, and a third dimension A as the amount
- * of statistics to be computed per label (fixed as 6 for 2D or 8 for 3D). If present, this
+ * of statistics to be computed per label (fixed as 7 for 2D or 9 for 3D). If present, this
* tensor is used by the operator to store information per connected-component label. The
* background label is ignored and thus its statistics are not computed.
* + It must have the same number of samples as input and output tensors.
* + It must have a number of statistics M per sample N equal to the maximum allowed number of
* label statistics that can be computed by the Label operator per sample image (or volume).
* The actual number of labels found is stored in \ref count (see above).
- * + For 2D labeling, it must have in the last dimension A=6 elements to store at: (0) the
+ * + For 2D labeling, it must have in the last dimension A=7 elements to store at: (0) the
* original label number; (1) leftmost position; (2) topmost position; (3) width size; (4)
- * height size; (5) count of pixels (i.e. size of the labeled region). And for 3D labeling,
- * it must have in the last dimension A=8 elements to store at: (0) the original label number;
- * (1) leftmost position; (2) topmost position; (3) shallowmost position; (4) width size; (5)
- * height size; (6) depth size; (7) count of voxels (i.e. size of the labeled region).
- * + It must have U32 data type.
+ * height size; (5) count of pixels (i.e. size of the labeled region); (6) region marks (0
+ * means no marks, 1 means region was removed, 2 means region inside the \ref mask will not be
+ * removed). And for 3D labeling, it must have in the last dimension A=9 elements to store
+ * at: (0) the original label number; (1) leftmost position; (2) topmost position; (3)
+ * shallowmost position; (4) width size; (5) height size; (6) depth size; (7) count of voxels
+ * (i.e. size of the labeled region); (8) region marks (0 means no marks, 1 means region was
+ * removed, 2 means region inside the \ref mask will not be removed).
+ * + It must have S32 or U32 data type.
* + It may be NULL to disregard computing statistics information on different labels found.
* + It must not be NULL if \ref assignLabel is NVCV_LABEL_SEQUENTIAL, the index of each label
* statistics is used as the new sequential label replacing the original label in the output,
* the sequential labels are up to the maximum capacity M
* + If not NULL, the \ref count tensor must not be NULL as well.
*
+ * @param [in] mask Mask tensor. The expected layout is [HWC] or [NHWC] for 2D masking or [DHWC] or [NDHWC] for 3D
+ * masking, with either explicit C dimension or missing C with channels embedded in the data type.
+ * The N dimension is the number of samples, if missing it is considered to be N=1, in case N=1
+ * and \ref in and \ref out tensors have N>1 the same mask is to be applied to all images (2D) or
+ * volumes (3D). A value of zero in the mask is considered to be outside the mask and non-zero is
+ * inside. The mask behavior is controlled by \ref maskType.
+ * + If the number of samples N is present in the layout, it must be either 1 or equal to N in the
+ * \ref in and \ref out tensors.
+ * + It must have the same height H and width W as \ref in and \ref out tensors.
+ * + It must have the same depth D as \ref in and \ref out tensors in case of 3D.
+ * + If channel C is present in the layout, it must be 1.
+ * + It must have S8 or U8 data type.
+ * + If not NULL and maskType is NVCV_REMOVE_ISLANDS_OUTSIDE_MASK_ONLY, the \ref minSize tensor
+ * must not be NULL as well.
+ *
* @param [in] connectivity Specify connectivity of elements for the operator, see \ref NVCVConnectivityType.
* + It must conform with \ref in and \ref out tensors, i.e. 3D labeling requires [DHWC]
* or [NDHWC] tensor layouts and 2D labeling requires [HWC] or [NHWC], where the C
@@ -224,6 +244,10 @@ CVCUDA_PUBLIC NVCVStatus cvcudaLabelCreate(NVCVOperatorHandle *handle);
* NVCV_LABEL_FAST to do fast labeling, i.e. assign non-consecutive label numbers fast.
* Use NVCV_LABEL_SEQUENTIAL to have consecutive label numbers instead.
*
+ * @param [in] maskType Specify how the mask tensor affects this operator, see \ref NVCVLabelMaskType. Use
+ * NVCV_REMOVE_ISLANDS_OUTSIDE_MASK_ONLY to only remove islands, i.e. regions with fewer than
+ * \ref minSize elements, that are outside the mask (defined by zeros in the mask).
+ *
* @retval #NVCV_ERROR_INVALID_ARGUMENT Some parameter is outside valid range.
* @retval #NVCV_ERROR_INTERNAL Internal error in the operator, invalid types passed in.
* @retval #NVCV_SUCCESS Operation executed successfully.
@@ -231,8 +255,9 @@ CVCUDA_PUBLIC NVCVStatus cvcudaLabelCreate(NVCVOperatorHandle *handle);
CVCUDA_PUBLIC NVCVStatus cvcudaLabelSubmit(NVCVOperatorHandle handle, cudaStream_t stream, NVCVTensorHandle in,
NVCVTensorHandle out, NVCVTensorHandle bgLabel, NVCVTensorHandle minThresh,
NVCVTensorHandle maxThresh, NVCVTensorHandle minSize, NVCVTensorHandle count,
- NVCVTensorHandle stats, NVCVConnectivityType connectivity,
- NVCVLabelType assignLabels);
+ NVCVTensorHandle stats, NVCVTensorHandle mask,
+ NVCVConnectivityType connectivity, NVCVLabelType assignLabels,
+ NVCVLabelMaskType maskType);
#ifdef __cplusplus
}
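To tie the new mask plumbing together, here is a hedged Python sketch of a masked label call in the style of the label sample earlier in this diff. The keyword names (count, stats, bg_label, min_size, mask, mask_type) and enum spellings follow that sample's conventions and are assumptions if your bindings differ:

```python
import torch
import cvcuda

# U8 input: zero is background, non-zero pixels form regions.
img = torch.zeros((1, 64, 64, 1), dtype=torch.uint8, device="cuda")
img[0, 8:16, 8:16, 0] = 255    # small island (64 pixels)
img[0, 30:60, 30:60, 0] = 255  # large region (900 pixels)

# Non-zero mask values protect the small island from island removal.
mask = torch.zeros((1, 64, 64, 1), dtype=torch.uint8, device="cuda")
mask[0, 4:20, 4:20, 0] = 1

bg_label = torch.zeros((1,), dtype=torch.int32, device="cuda")
min_size = torch.full((1,), 200, dtype=torch.int32, device="cuda")

out, count, stats = cvcuda.label(
    cvcuda.as_tensor(img, "NHWC"),
    connectivity=cvcuda.CONNECTIVITY_4_2D,
    assign_labels=cvcuda.LABEL.SEQUENTIAL,
    count=True,
    stats=True,
    bg_label=cvcuda.as_tensor(bg_label, "N"),
    min_size=cvcuda.as_tensor(min_size, "N"),
    mask=cvcuda.as_tensor(mask, "NHWC"),
    mask_type=cvcuda.REMOVE_ISLANDS_OUTSIDE_MASK_ONLY,
)
# stats[..., 6] holds the region mark: 1 = removed, 2 = kept because of the mask.
```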
diff --git a/src/cvcuda/include/cvcuda/OpLabel.hpp b/src/cvcuda/include/cvcuda/OpLabel.hpp
index 54ebd54e2..1b6997d9a 100644
--- a/src/cvcuda/include/cvcuda/OpLabel.hpp
+++ b/src/cvcuda/include/cvcuda/OpLabel.hpp
@@ -45,8 +45,8 @@ class Label final : public IOperator
void operator()(cudaStream_t stream, const nvcv::Tensor &in, const nvcv::Tensor &out, const nvcv::Tensor &bgLabel,
const nvcv::Tensor &minThresh, const nvcv::Tensor &maxThresh, const nvcv::Tensor &minSize,
- const nvcv::Tensor &count, const nvcv::Tensor &stats, NVCVConnectivityType connectivity,
- NVCVLabelType assignLabels) const;
+ const nvcv::Tensor &count, const nvcv::Tensor &stats, const nvcv::Tensor &mask,
+ NVCVConnectivityType connectivity, NVCVLabelType assignLabels, NVCVLabelMaskType maskType) const;
virtual NVCVOperatorHandle handle() const noexcept override;
@@ -69,11 +69,12 @@ inline Label::~Label()
inline void Label::operator()(cudaStream_t stream, const nvcv::Tensor &in, const nvcv::Tensor &out,
const nvcv::Tensor &bgLabel, const nvcv::Tensor &minThresh, const nvcv::Tensor &maxThresh,
const nvcv::Tensor &minSize, const nvcv::Tensor &count, const nvcv::Tensor &stats,
- NVCVConnectivityType connectivity, NVCVLabelType assignLabels) const
+ const nvcv::Tensor &mask, NVCVConnectivityType connectivity, NVCVLabelType assignLabels,
+ NVCVLabelMaskType maskType) const
{
nvcv::detail::CheckThrow(cvcudaLabelSubmit(m_handle, stream, in.handle(), out.handle(), bgLabel.handle(),
minThresh.handle(), maxThresh.handle(), minSize.handle(), count.handle(),
- stats.handle(), connectivity, assignLabels));
+ stats.handle(), mask.handle(), connectivity, assignLabels, maskType));
}
inline NVCVOperatorHandle Label::handle() const noexcept
diff --git a/src/cvcuda/include/cvcuda/OpSIFT.h b/src/cvcuda/include/cvcuda/OpSIFT.h
index 45fa7308e..39e5142f2 100644
--- a/src/cvcuda/include/cvcuda/OpSIFT.h
+++ b/src/cvcuda/include/cvcuda/OpSIFT.h
@@ -146,8 +146,7 @@ CVCUDA_PUBLIC NVCVStatus cvcudaSIFTCreate(NVCVOperatorHandle *handle, int3 maxSh
* + It must have S32 data type to store number of features found.
* + It must have one element per sample, i.e. number of channels must be 1 in a [NC] tensor.
*
- * @param [in] numOctaveLayers Number of layers in each octave. Since the minimum number of layers is 3, the
- * actual number is 3 + numOctaveLayers. One suggestion, given by the original
+ * @param [in] numOctaveLayers Number of layers in each octave. One suggestion, given by the original
* algorithm description, is to use numOctaveLayers = 3. The number of octaves is
* computed from the input image resolution WxH as \f$ log(min(W, H))/log(2) - 2 \f$.
* + It must be positive.
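The octave count mentioned above depends only on the input resolution, not on numOctaveLayers. A small sketch of that formula (plain arithmetic, no CV-CUDA calls):

```python
import math

def num_octaves(width: int, height: int) -> int:
    # Per the doc above: log(min(W, H)) / log(2) - 2.
    return int(math.log(min(width, height)) / math.log(2) - 2)

print(num_octaves(640, 480))  # 6
```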
diff --git a/src/cvcuda/include/cvcuda/Types.h b/src/cvcuda/include/cvcuda/Types.h
index 37eb2e0cf..8dc5131f9 100644
--- a/src/cvcuda/include/cvcuda/Types.h
+++ b/src/cvcuda/include/cvcuda/Types.h
@@ -402,6 +402,12 @@ typedef enum
NVCV_LABEL_SEQUENTIAL, //!< Assigns consecutive numbers to labels.
} NVCVLabelType;
+// @brief Defines how the mask affects the label operation
+typedef enum
+{
+ NVCV_REMOVE_ISLANDS_OUTSIDE_MASK_ONLY, //!< Prevent removing islands inside the mask
+} NVCVLabelMaskType;
+
// @brief Defines pair-wise matcher algorithms of choice
typedef enum
{
diff --git a/src/cvcuda/priv/CMakeLists.txt b/src/cvcuda/priv/CMakeLists.txt
index 6b28a39f7..fa0e8c390 100644
--- a/src/cvcuda/priv/CMakeLists.txt
+++ b/src/cvcuda/priv/CMakeLists.txt
@@ -18,7 +18,6 @@ add_subdirectory(legacy)
set(CV_CUDA_PRIV_FILES IOperator.cpp)
set(CV_CUDA_PRIV_OP_FILES
- OpFindContours.cpp
OpOSD.cpp
OpHistogramEq.cpp
OpAdvCvtColor.cu
diff --git a/src/cvcuda/priv/OpBrightnessContrast.cu b/src/cvcuda/priv/OpBrightnessContrast.cu
index f97f67b49..2e55c3dfd 100644
--- a/src/cvcuda/priv/OpBrightnessContrast.cu
+++ b/src/cvcuda/priv/OpBrightnessContrast.cu
@@ -72,8 +72,7 @@ struct BatchArgsWrap
};
template<typename BT>
-inline __host__ __device__ BT GetArg(const cuda::Tensor1DWrap<BT> &tensorArg, int argLen, int sampleIdx,
- BT defaultVal)
+inline __device__ BT GetArg(const cuda::Tensor1DWrap<BT> &tensorArg, int argLen, int sampleIdx, BT defaultVal)
{
if (argLen == 0)
{
@@ -90,7 +89,7 @@ inline __host__ __device__ BT GetArg(const cuda::Tensor1DWrap &tensorA
}
template<typename BT>
-inline __host__ __device__ SampleArgs GetBrightnessContrastArg(const BatchArgsWrap &args, int sampleIdx)
+inline __device__ SampleArgs GetBrightnessContrastArg(const BatchArgsWrap &args, int sampleIdx)
{
return {GetArg(args.brightness, args.brightnessLen, sampleIdx, BT{1}),
GetArg(args.contrast, args.contrastLen, sampleIdx, BT{1}),
diff --git a/src/cvcuda/priv/OpFindContours.cpp b/src/cvcuda/priv/OpFindContours.cpp
deleted file mode 100644
index 51d253e1f..000000000
--- a/src/cvcuda/priv/OpFindContours.cpp
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "OpFindContours.hpp"
-
-#include "legacy/CvCudaLegacy.h"
-#include "legacy/CvCudaLegacyHelpers.hpp"
-
-#include
-#include
-
-namespace cvcuda::priv {
-
-namespace legacy = nvcv::legacy::cuda_op;
-
-FindContours::FindContours(nvcv::Size2D maxSize, int maxBatchSize)
-{
- legacy::DataShape maxIn, maxOut;
- // maxIn/maxOut not used by op.
- maxIn.N = maxBatchSize;
- maxIn.C = 1;
- maxIn.H = maxSize.h;
- maxIn.W = maxSize.w;
-
- m_legacyOp = std::make_unique<legacy::FindContours>(maxIn, maxOut);
-}
-
-void FindContours::operator()(cudaStream_t stream, const nvcv::Tensor &in, const nvcv::Tensor &points,
- const nvcv::Tensor &numPoints) const
-{
- auto inData = in.exportData();
- if (inData == nullptr)
- {
- throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT,
- "Input must be cuda-accessible, pitch-linear tensor");
- }
-
- auto pointCoords = points.exportData();
- if (pointCoords == nullptr)
- {
- throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT,
- "Output must be cuda-accessible, pitch-linear tensor");
- }
-
- auto pointCounts = numPoints.exportData();
- if (pointCounts == nullptr)
- {
- throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT,
- "Output must be cuda-accessible, pitch-linear tensor");
- }
-
- NVCV_CHECK_THROW(m_legacyOp->infer(*inData, *pointCoords, *pointCounts, stream));
-}
-
-} // namespace cvcuda::priv
diff --git a/src/cvcuda/priv/OpFindContours.hpp b/src/cvcuda/priv/OpFindContours.hpp
deleted file mode 100644
index ec4f21134..000000000
--- a/src/cvcuda/priv/OpFindContours.hpp
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * @file OpFindContours.hpp
- *
- * @brief Defines the private C++ Class for the find contours operation.
- */
-
-#ifndef CVCUDA_PRIV_FIND_CONTOURS_HPP
-#define CVCUDA_PRIV_FIND_CONTOURS_HPP
-
-#include "IOperator.hpp"
-#include "legacy/CvCudaLegacy.h"
-
-#include
-#include
-
-#include
-
-namespace cvcuda::priv {
-
-namespace legacy = nvcv::legacy::cuda_op;
-
-class FindContours final : public IOperator
-{
-public:
- explicit FindContours() = delete;
-
- explicit FindContours(nvcv::Size2D maxSize, int maxBatchSize);
-
- void operator()(cudaStream_t stream, const nvcv::Tensor &in, const nvcv::Tensor &points,
- const nvcv::Tensor &numPoints) const;
-
-private:
- std::unique_ptr<legacy::FindContours> m_legacyOp;
-};
-
-} // namespace cvcuda::priv
-
-#endif // CVCUDA_PRIV_FIND_CONTOURS_HPP
diff --git a/src/cvcuda/priv/OpFindHomography.cu b/src/cvcuda/priv/OpFindHomography.cu
index d3e712cbe..7e8beef82 100644
--- a/src/cvcuda/priv/OpFindHomography.cu
+++ b/src/cvcuda/priv/OpFindHomography.cu
@@ -295,12 +295,12 @@ __device__ void calculate_residual_and_jacobian_device(float2 *src, float2 *dst,
}
}
-__host__ __device__ inline float myfabs(float val)
+__device__ inline float myfabs(float val)
{
return fabsf(val);
}
-inline __host__ __device__ float2 myfabs2(float2 val)
+inline __device__ float2 myfabs2(float2 val)
{
float2 ret;
ret.x = fabsf(val.x);
@@ -308,14 +308,14 @@ inline __host__ __device__ float2 myfabs2(float2 val)
return ret;
}
-__host__ __device__ inline int getNumPoints(cuda::Tensor2DWrap<float2> src, int numPoints, int batch)
+__device__ inline int getNumPoints(cuda::Tensor2DWrap<float2> src, int numPoints, int batch)
{
return numPoints;
}
struct MeanOp
{
- __host__ __device__ float2 eval(float2 val, int numPoints, int batch)
+ __device__ float2 eval(float2 val, int numPoints, int batch)
{
return val / numPoints;
}
@@ -323,7 +323,7 @@ struct MeanOp
struct SquareOp
{
- __host__ __device__ float eval(float val, int batch)
+ __device__ float eval(float val, int batch)
{
return val * val;
}
@@ -336,11 +336,11 @@ private:
public:
// Constructor that takes a float* pointer as a parameter
- __host__ __device__ AbsShiftOp(float2 *data)
+ __host__ AbsShiftOp(float2 *data)
: _data(data){};
// Method to update the float value pointed to by the pointer
- __host__ __device__ float2 eval(float2 newVal, int numPoints, int batch)
+ __device__ float2 eval(float2 newVal, int numPoints, int batch)
{
_data += batch;
return myfabs2(newVal - _data[0]);
@@ -353,7 +353,7 @@ private:
float2 *cm, *cM, *sm, *sM;
public:
- __host__ __device__ LtLOp(float2 *srcMean, float2 *dstMean, float2 *srcShiftSum, float2 *dstShiftSum)
+ __host__ LtLOp(float2 *srcMean, float2 *dstMean, float2 *srcShiftSum, float2 *dstShiftSum)
{
cM = srcMean;
sM = srcShiftSum;
@@ -361,7 +361,7 @@ public:
sm = dstShiftSum;
}
- __host__ __device__ float eval(float2 *src, float2 *dst, int batch, int numPoints, int tid, int j, int k)
+ __device__ float eval(float2 *src, float2 *dst, int batch, int numPoints, int tid, int j, int k)
{
cm += batch;
cM += batch;
@@ -1410,10 +1410,59 @@ void FindHomographyWrapper(SrcDstWrapper srcWrap, SrcDstWrapper dstWrap, ModelTy
calc_buffer, modelWrap, numPoints, batchSize);
}
-template<typename SrcDstType>
-void RunFindHomography(const SrcDstType &src, const SrcDstType &dst, const nvcv::TensorDataStridedCuda &models,
- const BufferOffsets *bufferOffset, const cuSolver *cusolverData, cudaStream_t stream)
+inline void RunFindHomography(const nvcv::TensorDataStridedCuda &src, const nvcv::TensorDataStridedCuda &dst,
+ const nvcv::TensorDataStridedCuda &models, const BufferOffsets *bufferOffset,
+ const cuSolver *cusolverData, cudaStream_t stream)
{
+ // validation of input data
+ if ((src.rank() != 2 && src.rank() != 3) || (dst.rank() != 2 && dst.rank() != 3))
+ {
+ throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT,
+ "source and destination points must have rank 2 or 3");
+ }
+
+ if (!(src.shape(0) == dst.shape(0) && src.shape(0) == models.shape(0)))
+ {
+ throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT,
+ "source, destination and model must have same batch size");
+ }
+
+ if (src.shape(1) != dst.shape(1))
+ {
+ throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT,
+ "source and destination array length must be same length to return a valid model");
+ }
+
+ if (src.shape(1) < 4 || dst.shape(1) < 4)
+ {
+ throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT,
+ "source and destination array length must be >=4 to return a valid model");
+ }
+
+ if (!(models.rank() == 3 && models.shape(1) == 3 && models.shape(2) == 3 && models.dtype() == nvcv::TYPE_F32))
+ {
+ throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT,
+ "model tensor must be 2D with shape 3x3 and data type F32");
+ }
+
+ if (!((src.rank() == 2 && src.dtype() == nvcv::TYPE_2F32)
+ || (src.rank() == 3 && src.dtype() == nvcv::TYPE_F32 && src.shape(2) == 2)))
+ {
+ throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT,
+ "source tensor must have data type 2F32 or F32 with last shape 2");
+ }
+ if (!((dst.rank() == 2 && dst.dtype() == nvcv::TYPE_2F32)
+ || (dst.rank() == 3 && dst.dtype() == nvcv::TYPE_F32 && dst.shape(2) == 2)))
+ {
+ throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT,
+ "destination tensor must have data type 2F32 or F32 with last shape 2");
+ }
+ if (!(src.stride(1) == sizeof(float2) && dst.stride(1) == sizeof(float2)))
+ {
+ throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT,
+ "source and destination tensors must have last dimensions packed");
+ }
+
+ using SrcDstWrapper = cuda::Tensor2DWrap<float2>;
SrcDstWrapper srcWrap(src);
SrcDstWrapper dstWrap(dst);
@@ -1498,42 +1547,6 @@ void FindHomography::operator()(cudaStream_t stream, const nvcv::Tensor &srcPoin
"Input must be cuda-accessible, pitch-linear tensor");
}
- // validation of input data
- if (!((srcData->rank() == dstData->rank()) && (srcData->rank() == 2)))
- {
- throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "source and destination points must have rank 2");
- }
-
- if (!(srcData->shape(0) == dstData->shape(0) && srcData->shape(0) == modelData->shape(0)))
- {
- throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT,
- "source, destination and model must have same batch size");
- }
-
- if (srcData->shape(1) != dstData->shape(1))
- {
- throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT,
- "source and destination array length must be same length to return a valid model");
- }
-
- if (srcData->shape(1) < 4 || dstData->shape(1) < 4)
- {
- throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT,
- "source and destination array length must be >=4 to return a valid model");
- }
-
- if (!(modelData->rank() == 3 && modelData->shape(1) == 3 && modelData->shape(2) == 3))
- {
- throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "model tensor must be 2D with shape 3x3");
- }
-
- if (!(srcData->dtype() == nvcv::TYPE_2F32 && dstData->dtype() == nvcv::TYPE_2F32
- && modelData->dtype() == nvcv::TYPE_F32))
- {
- throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT,
- "source, destination and model tensors must have data type F32");
- }
-
RunFindHomography(*srcData, *dstData, *modelData, &bufferOffset, &cusolverData, stream);
}
@@ -1569,45 +1582,6 @@ void FindHomography::operator()(cudaStream_t stream, const nvcv::TensorBatch &sr
"model must be cuda-accessible, pitch-linear tensor");
}
- // validation of input data
- if (!((srcData->shape(0) == dstData->shape(0)) && (srcData->shape(0) == modelData->shape(0))
- && (srcData->shape(0) == 1)))
- {
- throw nvcv::Exception(
- nvcv::Status::ERROR_INVALID_ARGUMENT,
- "Invdividual samples (src, dst and model) in the batch must be tensors with batch size 1");
- }
-
- if (!((srcData->rank() == dstData->rank()) && (srcData->rank() == 2)))
- {
- throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT,
- "source and destination tensors must have rank 2");
- }
-
- if (srcData->shape(1) != dstData->shape(1))
- {
- throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT,
- "source and destination array length must be same length to return a valid model");
- }
-
- if (srcData->shape(1) < 4 || dstData->shape(1) < 4)
- {
- throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT,
- "source and destination array length must be >=4 to return a valid model");
- }
-
- if (!(modelData->rank() == 3 && modelData->shape(1) == 3 && modelData->shape(2) == 3))
- {
- throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT, "model tensor must be 2D with shape 3x3");
- }
-
- if (!(srcData->dtype() == nvcv::TYPE_2F32 && dstData->dtype() == nvcv::TYPE_2F32
- && modelData->dtype() == nvcv::TYPE_F32))
- {
- throw nvcv::Exception(nvcv::Status::ERROR_INVALID_ARGUMENT,
- "source, destination and model tensors must have data type F32");
- }
-
RunFindHomography(*srcData, *dstData, *modelData, &bufferOffset, &cusolverData, stream);
}
}
diff --git a/src/cvcuda/priv/OpLabel.cu b/src/cvcuda/priv/OpLabel.cu
index 8a1c51182..b552e8656 100644
--- a/src/cvcuda/priv/OpLabel.cu
+++ b/src/cvcuda/priv/OpLabel.cu
@@ -68,6 +68,10 @@ namespace util = nvcv::util;
namespace {
+constexpr int REGION_NOT_MARKED = 0;
+constexpr int REGION_REMOVED = 1;
+constexpr int REGION_INSIDE_MASK = 2;
+
// CUDA kernels ----------------------------------------------------------------
template